mirror of
https://github.com/msberends/AMR.git
synced 2025-07-08 10:31:53 +02:00
kurtosis, skewness, start with ML
This commit is contained in:
43
R/freq.R
43
R/freq.R
@ -46,8 +46,8 @@
|
||||
#'
|
||||
#' For dates and times of any class, these additional values will be calculated with \code{na.rm = TRUE} and shown into the header:
|
||||
#' \itemize{
|
||||
#' \item{Oldest, using \code{\link[base]{min}}}
|
||||
#' \item{Newest, using \code{\link[base]{max}}, with difference between newest and oldest}
|
||||
#' \item{Oldest, using \code{\link{min}}}
|
||||
#' \item{Newest, using \code{\link{max}}, with difference between newest and oldest}
|
||||
#' \item{Median, using \code{\link[stats]{median}}, with percentage since oldest}
|
||||
#' }
|
||||
#'
|
||||
@ -522,3 +522,42 @@ as.data.frame.frequency_tbl <- function(x, ...) {
|
||||
attr(x, 'opt') <- NULL
|
||||
as.data.frame.data.frame(x, ...)
|
||||
}
|
||||
|
||||
#' @noRd
|
||||
#' @exportMethod hist.frequency_tbl
|
||||
#' @export
|
||||
#' @importFrom dplyr %>% pull
|
||||
#' @importFrom graphics hist
|
||||
hist.frequency_tbl <- function(x, ...) {
|
||||
|
||||
opt <- attr(x, 'opt')
|
||||
|
||||
if (!is.null(opt$vars)) {
|
||||
title <- opt$vars
|
||||
} else {
|
||||
title <- ""
|
||||
}
|
||||
|
||||
items <- x %>% pull(item)
|
||||
counts <- x %>% pull(count)
|
||||
vect <- rep(items, counts)
|
||||
hist(vect, main = paste("Histogram of", title), xlab = title, ...)
|
||||
}
|
||||
|
||||
#' @noRd
|
||||
#' @exportMethod plot.frequency_tbl
|
||||
#' @export
|
||||
#' @importFrom dplyr %>% pull
|
||||
plot.frequency_tbl <- function(x, y, ...) {
|
||||
opt <- attr(x, 'opt')
|
||||
|
||||
if (!is.null(opt$vars)) {
|
||||
title <- opt$vars
|
||||
} else {
|
||||
title <- ""
|
||||
}
|
||||
|
||||
items <- x %>% pull(item)
|
||||
counts <- x %>% pull(count)
|
||||
plot(x = items, y = counts, ylab = "Count", xlab = title, ...)
|
||||
}
|
||||
|
@ -35,12 +35,14 @@ globalVariables(c('abname',
|
||||
'key_ab',
|
||||
'key_ab_lag',
|
||||
'key_ab_other',
|
||||
'labs',
|
||||
'median',
|
||||
'mic',
|
||||
'microorganisms',
|
||||
'mocode',
|
||||
'molis',
|
||||
'n',
|
||||
'na.omit',
|
||||
'other_pat_or_mo',
|
||||
'patient_id',
|
||||
'quantile',
|
||||
|
40
R/kurtosis.R
Normal file
40
R/kurtosis.R
Normal file
@ -0,0 +1,40 @@
|
||||
#' Kurtosis of the sample
|
||||
#'
|
||||
#' @description Kurtosis is a measure of the "tailedness" of the probability distribution of a real-valued random variable.
|
||||
#'
|
||||
#' @param x a vector of values, a \code{matrix} or a \code{data frame}
|
||||
#' @param na.rm a logical value indicating whether \code{NA} values should be stripped before the computation proceeds.
|
||||
#' @exportMethod kurtosis
|
||||
#' @seealso \code{\link{skewness}}
|
||||
#' @rdname kurtosis
|
||||
#' @export
|
||||
kurtosis <- function(x, na.rm = FALSE) {
|
||||
UseMethod("kurtosis")
|
||||
}
|
||||
|
||||
#' @exportMethod kurtosis.default
|
||||
#' @rdname kurtosis
|
||||
#' @export
|
||||
kurtosis.default <- function (x, na.rm = FALSE) {
|
||||
x <- as.vector(x)
|
||||
if (na.rm == TRUE) {
|
||||
x <- x[!is.na(x)]
|
||||
}
|
||||
n <- length(x)
|
||||
n * base::sum((x - base::mean(x, na.rm = na.rm))^4, na.rm = na.rm) /
|
||||
(base::sum((x - base::mean(x, na.rm = na.rm))^2, na.rm = na.rm)^2)
|
||||
}
|
||||
|
||||
#' @exportMethod kurtosis.matrix
|
||||
#' @rdname kurtosis
|
||||
#' @export
|
||||
kurtosis.matrix <- function (x, na.rm = FALSE) {
|
||||
base::apply(x, 2, kurtosis.default, na.rm = na.rm)
|
||||
}
|
||||
|
||||
#' @exportMethod kurtosis.data.frame
|
||||
#' @rdname kurtosis
|
||||
#' @export
|
||||
kurtosis.data.frame <- function (x, na.rm = FALSE) {
|
||||
base::sapply(x, kurtosis.default, na.rm = na.rm)
|
||||
}
|
2
R/like.R
2
R/like.R
@ -18,7 +18,7 @@
|
||||
|
||||
#' Pattern Matching
|
||||
#'
|
||||
#' Convenient wrapper around \code{\link[base]{grepl}} to match a pattern: \code{a \%like\% b}. It always returns a \code{logical} vector and is always case-insensitive. Also, \code{pattern} (\code{b}) can be as long as \code{x} (\code{a}) to compare items of each index in both vectors.
|
||||
#' Convenient wrapper around \code{\link[base]{grep}} to match a pattern: \code{a \%like\% b}. It always returns a \code{logical} vector and is always case-insensitive. Also, \code{pattern} (\code{b}) can be as long as \code{x} (\code{a}) to compare items of each index in both vectors.
|
||||
#' @inheritParams base::grepl
|
||||
#' @return A \code{logical} vector
|
||||
#' @name like
|
||||
|
40
R/skewness.R
Normal file
40
R/skewness.R
Normal file
@ -0,0 +1,40 @@
|
||||
#' Skewness of the sample
|
||||
#'
|
||||
#' @description Skewness is a measure of the asymmetry of the probability distribution of a real-valued random variable about its mean.
|
||||
#'
|
||||
#' When negative: the left tail is longer; the mass of the distribution is concentrated on the right of the figure. When positive: the right tail is longer; the mass of the distribution is concentrated on the left of the figure.
|
||||
#' @param x a vector of values, a \code{matrix} or a \code{data frame}
|
||||
#' @param na.rm a logical value indicating whether \code{NA} values should be stripped before the computation proceeds.
|
||||
#' @exportMethod skewness
|
||||
#' @seealso \code{\link{kurtosis}}
|
||||
#' @rdname skewness
|
||||
#' @export
|
||||
skewness <- function(x, na.rm = FALSE) {
|
||||
UseMethod("skewness")
|
||||
}
|
||||
|
||||
#' @exportMethod skewness.default
|
||||
#' @rdname skewness
|
||||
#' @export
|
||||
skewness.default <- function (x, na.rm = FALSE) {
|
||||
x <- as.vector(x)
|
||||
if (na.rm == TRUE) {
|
||||
x <- x[!is.na(x)]
|
||||
}
|
||||
n <- length(x)
|
||||
(base::sum((x - base::mean(x))^3) / n) / (base::sum((x - base::mean(x))^2) / n)^(3/2)
|
||||
}
|
||||
|
||||
#' @exportMethod skewness.matrix
|
||||
#' @rdname skewness
|
||||
#' @export
|
||||
skewness.matrix <- function (x, na.rm = FALSE) {
|
||||
base::apply(x, 2, skewness.default, na.rm = na.rm)
|
||||
}
|
||||
|
||||
#' @exportMethod skewness.data.frame
|
||||
#' @rdname skewness
|
||||
#' @export
|
||||
skewness.data.frame <- function (x, na.rm = FALSE) {
|
||||
base::sapply(x, skewness.default, na.rm = na.rm)
|
||||
}
|
123
R/trends.R
Normal file
123
R/trends.R
Normal file
@ -0,0 +1,123 @@
|
||||
#' Detect trends using Machine Learning
|
||||
#'
|
||||
#' Test text
|
||||
#' @param data a \code{data.frame}
|
||||
#' @param threshold_unique do not analyse more unique \code{threshold_unique} items per variable
|
||||
#' @param na.rm a logical value indicating whether \code{NA} values should be stripped before the computation proceeds.
|
||||
#' @param info print relevant combinations to console
|
||||
#' @return A \code{list} with class \code{"trends"}
|
||||
#' @importFrom stats na.omit
|
||||
#' @importFrom broom tidy
|
||||
# @export
|
||||
trends <- function(data, threshold_unique = 30, na.rm = TRUE, info = TRUE) {
|
||||
|
||||
cols <- colnames(data)
|
||||
relevant <- list()
|
||||
count <- 0
|
||||
for (x in 1:length(cols)) {
|
||||
for (y in 1:length(cols)) {
|
||||
if (x == y) {
|
||||
next
|
||||
}
|
||||
if (n_distinct(data[, x]) > threshold_unique | n_distinct(data[, y]) > threshold_unique) {
|
||||
next
|
||||
}
|
||||
count <- count + 1
|
||||
df <- data %>%
|
||||
group_by_at(c(cols[x], cols[y])) %>%
|
||||
summarise(n = n())
|
||||
n <- df %>% pull(n)
|
||||
# linear regression model
|
||||
lin <- stats::lm(1:length(n) ~ n, na.action = ifelse(na.rm == TRUE, na.omit, NULL))
|
||||
|
||||
res <- list(
|
||||
df = df,
|
||||
x = cols[x],
|
||||
y = cols[y],
|
||||
m = base::mean(n, na.rm = na.rm),
|
||||
sd = stats::sd(n, na.rm = na.rm),
|
||||
cv = cv(n, na.rm = na.rm),
|
||||
cqv = cqv(n, na.rm = na.rm),
|
||||
kurtosis = kurtosis(n, na.rm = na.rm),
|
||||
skewness = skewness(n, na.rm = na.rm),
|
||||
lin.p = broom::tidy(lin)[2, 'p.value']
|
||||
#binom.p <- broom::tidy(binom)[2, 'p.value']
|
||||
)
|
||||
|
||||
include <- TRUE
|
||||
# ML part
|
||||
if (res$cv > 0.25) {
|
||||
res$reason <- "cv > 0.25"
|
||||
} else if (res$cqv > 0.75) {
|
||||
res$reason <- "cqv > 0.75"
|
||||
} else {
|
||||
include <- FALSE
|
||||
}
|
||||
|
||||
if (include == TRUE) {
|
||||
relevant <- c(relevant, list(res))
|
||||
if (info == TRUE) {
|
||||
# minus one because the whole data will be added later
|
||||
cat(paste0("[", length(relevant), "]"), "Relevant:", cols[x], "vs.", cols[y], "\n")
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
cat("Total of", count, "combinations analysed;", length(relevant), "seem relevant.\n")
|
||||
class(relevant) <- 'trends'
|
||||
relevant <- c(relevant, list(data = data))
|
||||
relevant
|
||||
|
||||
}
|
||||
|
||||
# @exportMethod print.trends
|
||||
# @export
|
||||
#' @noRd
|
||||
print.trends <- function(x, ...) {
|
||||
cat(length(x) - 1, "relevant trends, out of", length(x$data)^2, "\n")
|
||||
}
|
||||
|
||||
# @exportMethod plot.trends
|
||||
# @export
|
||||
#' @noRd
|
||||
# plot.trends <- function(x, n = NULL, ...) {
|
||||
# if (is.null(n)) {
|
||||
# oask <- devAskNewPage(TRUE)
|
||||
# on.exit(devAskNewPage(oask))
|
||||
# n <- c(1:(length(x) - 1))
|
||||
# } else {
|
||||
# if (n > length(x) - 1) {
|
||||
# stop('trend unavailable, max is ', length(x) - 1, call. = FALSE)
|
||||
# }
|
||||
# oask <- NULL
|
||||
# }
|
||||
# for (i in n) {
|
||||
# data <- x[[i]]$df
|
||||
# if (as.character(i) %like% '1$') {
|
||||
# suffix <- "st"
|
||||
# } else if (as.character(i) %like% '2$') {
|
||||
# suffix <- "nd"
|
||||
# } else if (as.character(i) %like% '3$') {
|
||||
# suffix <- "rd"
|
||||
# } else {
|
||||
# suffix <- "th"
|
||||
# }
|
||||
# if (!is.null(oask)) {
|
||||
# cat(paste("Coming up:", colnames(data)[1], "vs.", colnames(data)[2]), "\n")
|
||||
# }
|
||||
# print(
|
||||
# ggplot(
|
||||
# data,
|
||||
# aes_string(x = colnames(data)[1],
|
||||
# y = colnames(data)[3],
|
||||
# group = colnames(data)[2],
|
||||
# fill = colnames(data)[2])) +
|
||||
# geom_col(position = "dodge") +
|
||||
# theme_minimal() +
|
||||
# labs(title = paste(colnames(data)[1], "vs.", colnames(data)[2]),
|
||||
# subtitle = paste0(i, suffix, " trend"))
|
||||
# )
|
||||
# }
|
||||
# }
|
Reference in New Issue
Block a user