kurtosis, skewness, start with ML

2025-08-24 17:42:11 +02:00 · 2018-07-08 22:14:55 +02:00
parent c768ba0d9c
commit 14b990d769
18 changed files with 401 additions and 15 deletions
--- a/R/freq.R
+++ b/R/freq.R
@@ -46,8 +46,8 @@
 #'
 #' For dates and times of any class, these additional values will be calculated with \code{na.rm = TRUE} and shown into the header:
 #' \itemize{
-#'   \item{Oldest, using \code{\link[base]{min}}}
-#'   \item{Newest, using \code{\link[base]{max}}, with difference between newest and oldest}
+#'   \item{Oldest, using \code{\link{min}}}
+#'   \item{Newest, using \code{\link{max}}, with difference between newest and oldest}
 #'   \item{Median, using \code{\link[stats]{median}}, with percentage since oldest}
 #' }
 #'
@@ -522,3 +522,42 @@ as.data.frame.frequency_tbl <- function(x, ...) {
  attr(x, 'opt') <- NULL
  as.data.frame.data.frame(x, ...)
 }
+
+#' @noRd
+#' @exportMethod hist.frequency_tbl
+#' @export
+#' @importFrom dplyr %>% pull
+#' @importFrom graphics hist
+hist.frequency_tbl <- function(x, ...) {
+
+  opt <- attr(x, 'opt')
+
+  if (!is.null(opt$vars)) {
+    title <- opt$vars
+  } else {
+    title <- ""
+  }
+
+  items <- x %>% pull(item)
+  counts <- x %>% pull(count)
+  vect <- rep(items, counts)
+  hist(vect, main = paste("Histogram of", title), xlab = title, ...)
+}
+
+#' @noRd
+#' @exportMethod plot.frequency_tbl
+#' @export
+#' @importFrom dplyr %>% pull
+plot.frequency_tbl <- function(x, y, ...) {
+  opt <- attr(x, 'opt')
+
+  if (!is.null(opt$vars)) {
+    title <- opt$vars
+  } else {
+    title <- ""
+  }
+
+  items <- x %>% pull(item)
+  counts <- x %>% pull(count)
+  plot(x = items, y = counts, ylab = "Count", xlab = title, ...)
+}
--- a/R/globals.R
+++ b/R/globals.R
@@ -35,12 +35,14 @@ globalVariables(c('abname',
                  'key_ab',
                  'key_ab_lag',
                  'key_ab_other',
+                  'labs',
                  'median',
                  'mic',
                  'microorganisms',
                  'mocode',
                  'molis',
                  'n',
+                  'na.omit',
                  'other_pat_or_mo',
                  'patient_id',
                  'quantile',
--- a/R/kurtosis.R
+++ b/R/kurtosis.R
@@ -0,0 +1,40 @@
+#' Kurtosis of the sample
+#'
+#' @description Kurtosis is a measure of the "tailedness" of the probability distribution of a real-valued random variable.
+#'
+#' @param x a vector of values, a \code{matrix} or a \code{data frame}
+#' @param na.rm a logical value indicating whether \code{NA} values should be stripped before the computation proceeds.
+#' @exportMethod kurtosis
+#' @seealso \code{\link{skewness}}
+#' @rdname kurtosis
+#' @export
+kurtosis <- function(x, na.rm = FALSE) {
+  UseMethod("kurtosis")
+}
+
+#' @exportMethod kurtosis.default
+#' @rdname kurtosis
+#' @export
+kurtosis.default <- function (x, na.rm = FALSE) {
+  x <- as.vector(x)
+  if (na.rm == TRUE) {
+    x <- x[!is.na(x)]
+  }
+  n <- length(x)
+  n * base::sum((x - base::mean(x, na.rm = na.rm))^4, na.rm = na.rm) /
+    (base::sum((x - base::mean(x, na.rm = na.rm))^2, na.rm = na.rm)^2)
+}
+
+#' @exportMethod kurtosis.matrix
+#' @rdname kurtosis
+#' @export
+kurtosis.matrix <- function (x, na.rm = FALSE) {
+  base::apply(x, 2, kurtosis.default, na.rm = na.rm)
+}
+
+#' @exportMethod kurtosis.data.frame
+#' @rdname kurtosis
+#' @export
+kurtosis.data.frame <- function (x, na.rm = FALSE) {
+  base::sapply(x, kurtosis.default, na.rm = na.rm)
+}
--- a/R/like.R
+++ b/R/like.R
@@ -18,7 +18,7 @@

 #' Pattern Matching
 #'
-#' Convenient wrapper around \code{\link[base]{grepl}} to match a pattern: \code{a \%like\% b}. It always returns a \code{logical} vector and is always case-insensitive. Also, \code{pattern} (\code{b}) can be as long as \code{x} (\code{a}) to compare items of each index in both vectors.
+#' Convenient wrapper around \code{\link[base]{grep}} to match a pattern: \code{a \%like\% b}. It always returns a \code{logical} vector and is always case-insensitive. Also, \code{pattern} (\code{b}) can be as long as \code{x} (\code{a}) to compare items of each index in both vectors.
 #' @inheritParams base::grepl
 #' @return A \code{logical} vector
 #' @name like
--- a/R/skewness.R
+++ b/R/skewness.R
@@ -0,0 +1,40 @@
+#' Skewness of the sample
+#'
+#' @description Skewness is a measure of the asymmetry of the probability distribution of a real-valued random variable about its mean.
+#'
+#' When negative: the left tail is longer; the mass of the distribution is concentrated on the right of the figure. When positive: the right tail is longer; the mass of the distribution is concentrated on the left of the figure.
+#' @param x a vector of values, a \code{matrix} or a \code{data frame}
+#' @param na.rm a logical value indicating whether \code{NA} values should be stripped before the computation proceeds.
+#' @exportMethod skewness
+#' @seealso \code{\link{kurtosis}}
+#' @rdname skewness
+#' @export
+skewness <- function(x, na.rm = FALSE) {
+  UseMethod("skewness")
+}
+
+#' @exportMethod skewness.default
+#' @rdname skewness
+#' @export
+skewness.default <- function (x, na.rm = FALSE) {
+  x <- as.vector(x)
+  if (na.rm == TRUE) {
+    x <- x[!is.na(x)]
+  }
+  n <- length(x)
+  (base::sum((x - base::mean(x))^3) / n) / (base::sum((x - base::mean(x))^2) / n)^(3/2)
+}
+
+#' @exportMethod skewness.matrix
+#' @rdname skewness
+#' @export
+skewness.matrix <- function (x, na.rm = FALSE) {
+  base::apply(x, 2, skewness.default, na.rm = na.rm)
+}
+
+#' @exportMethod skewness.data.frame
+#' @rdname skewness
+#' @export
+skewness.data.frame <- function (x, na.rm = FALSE) {
+  base::sapply(x, skewness.default, na.rm = na.rm)
+}
--- a/R/trends.R
+++ b/R/trends.R
@@ -0,0 +1,123 @@
+#' Detect trends using Machine Learning
+#'
+#' Test text
+#' @param data a \code{data.frame}
+#' @param threshold_unique do not analyse more unique \code{threshold_unique} items per variable
+#' @param na.rm a logical value indicating whether \code{NA} values should be stripped before the computation proceeds.
+#' @param info print relevant combinations to console
+#' @return A \code{list} with class \code{"trends"}
+#' @importFrom stats na.omit
+#' @importFrom broom tidy
+# @export
+trends <- function(data, threshold_unique = 30, na.rm = TRUE, info = TRUE) {
+
+  cols <- colnames(data)
+  relevant <- list()
+  count <- 0
+  for (x in 1:length(cols)) {
+    for (y in 1:length(cols)) {
+      if (x == y) {
+        next
+      }
+      if (n_distinct(data[, x]) > threshold_unique | n_distinct(data[, y]) > threshold_unique) {
+        next
+      }
+      count <- count + 1
+      df <- data %>%
+        group_by_at(c(cols[x], cols[y])) %>%
+        summarise(n = n())
+      n <- df %>% pull(n)
+      # linear regression model
+      lin <- stats::lm(1:length(n) ~ n, na.action = ifelse(na.rm == TRUE, na.omit, NULL))
+
+      res <- list(
+        df = df,
+        x = cols[x],
+        y = cols[y],
+        m = base::mean(n, na.rm = na.rm),
+        sd = stats::sd(n, na.rm = na.rm),
+        cv = cv(n, na.rm = na.rm),
+        cqv = cqv(n, na.rm = na.rm),
+        kurtosis = kurtosis(n, na.rm = na.rm),
+        skewness = skewness(n, na.rm = na.rm),
+        lin.p = broom::tidy(lin)[2, 'p.value']
+        #binom.p <- broom::tidy(binom)[2, 'p.value']
+      )
+
+      include <- TRUE
+      # ML part
+      if (res$cv > 0.25) {
+        res$reason <- "cv > 0.25"
+      } else if (res$cqv > 0.75) {
+        res$reason <- "cqv > 0.75"
+      } else {
+        include <- FALSE
+      }
+
+      if (include == TRUE) {
+        relevant <- c(relevant, list(res))
+        if (info == TRUE) {
+          # minus one because the whole data will be added later
+          cat(paste0("[", length(relevant), "]"), "Relevant:", cols[x], "vs.", cols[y], "\n")
+        }
+      }
+
+    }
+  }
+
+  cat("Total of", count, "combinations analysed;", length(relevant), "seem relevant.\n")
+  class(relevant) <- 'trends'
+  relevant <- c(relevant, list(data = data))
+  relevant
+
+}
+
+# @exportMethod print.trends
+# @export
+#' @noRd
+print.trends <- function(x, ...) {
+  cat(length(x) - 1, "relevant trends, out of", length(x$data)^2, "\n")
+}
+
+# @exportMethod plot.trends
+# @export
+#' @noRd
+# plot.trends <- function(x, n = NULL, ...) {
+#   if (is.null(n)) {
+#     oask <- devAskNewPage(TRUE)
+#     on.exit(devAskNewPage(oask))
+#     n <- c(1:(length(x) - 1))
+#   } else {
+#     if (n > length(x) - 1) {
+#       stop('trend unavailable, max is ', length(x) - 1, call. = FALSE)
+#     }
+#     oask <- NULL
+#   }
+#   for (i in n) {
+#     data <- x[[i]]$df
+#     if (as.character(i) %like% '1$') {
+#       suffix <- "st"
+#     } else if (as.character(i) %like% '2$') {
+#       suffix <- "nd"
+#     } else if (as.character(i) %like% '3$') {
+#       suffix <- "rd"
+#     } else {
+#       suffix <- "th"
+#     }
+#     if (!is.null(oask)) {
+#       cat(paste("Coming up:", colnames(data)[1], "vs.", colnames(data)[2]), "\n")
+#     }
+#     print(
+#       ggplot(
+#         data,
+#         aes_string(x = colnames(data)[1],
+#                    y = colnames(data)[3],
+#                    group = colnames(data)[2],
+#                    fill = colnames(data)[2])) +
+#         geom_col(position = "dodge") +
+#         theme_minimal() +
+#         labs(title = paste(colnames(data)[1], "vs.", colnames(data)[2]),
+#              subtitle = paste0(i, suffix, " trend"))
+#     )
+#   }
+# }