mirror of
https://github.com/msberends/AMR.git
synced 2024-12-27 10:06:12 +01:00
124 lines
3.5 KiB
R
124 lines
3.5 KiB
R
|
#' Detect trends using Machine Learning
|
||
|
#'
|
||
|
#' Test text
|
||
|
#' @param data a \code{data.frame}
|
||
|
#' @param threshold_unique do not analyse more unique \code{threshold_unique} items per variable
|
||
|
#' @param na.rm a logical value indicating whether \code{NA} values should be stripped before the computation proceeds.
|
||
|
#' @param info print relevant combinations to console
|
||
|
#' @return A \code{list} with class \code{"trends"}
|
||
|
#' @importFrom stats na.omit
|
||
|
#' @importFrom broom tidy
|
||
|
# @export
|
||
|
trends <- function(data, threshold_unique = 30, na.rm = TRUE, info = TRUE) {
|
||
|
|
||
|
cols <- colnames(data)
|
||
|
relevant <- list()
|
||
|
count <- 0
|
||
|
for (x in 1:length(cols)) {
|
||
|
for (y in 1:length(cols)) {
|
||
|
if (x == y) {
|
||
|
next
|
||
|
}
|
||
|
if (n_distinct(data[, x]) > threshold_unique | n_distinct(data[, y]) > threshold_unique) {
|
||
|
next
|
||
|
}
|
||
|
count <- count + 1
|
||
|
df <- data %>%
|
||
|
group_by_at(c(cols[x], cols[y])) %>%
|
||
|
summarise(n = n())
|
||
|
n <- df %>% pull(n)
|
||
|
# linear regression model
|
||
|
lin <- stats::lm(1:length(n) ~ n, na.action = ifelse(na.rm == TRUE, na.omit, NULL))
|
||
|
|
||
|
res <- list(
|
||
|
df = df,
|
||
|
x = cols[x],
|
||
|
y = cols[y],
|
||
|
m = base::mean(n, na.rm = na.rm),
|
||
|
sd = stats::sd(n, na.rm = na.rm),
|
||
|
cv = cv(n, na.rm = na.rm),
|
||
|
cqv = cqv(n, na.rm = na.rm),
|
||
|
kurtosis = kurtosis(n, na.rm = na.rm),
|
||
|
skewness = skewness(n, na.rm = na.rm),
|
||
|
lin.p = broom::tidy(lin)[2, 'p.value']
|
||
|
#binom.p <- broom::tidy(binom)[2, 'p.value']
|
||
|
)
|
||
|
|
||
|
include <- TRUE
|
||
|
# ML part
|
||
|
if (res$cv > 0.25) {
|
||
|
res$reason <- "cv > 0.25"
|
||
|
} else if (res$cqv > 0.75) {
|
||
|
res$reason <- "cqv > 0.75"
|
||
|
} else {
|
||
|
include <- FALSE
|
||
|
}
|
||
|
|
||
|
if (include == TRUE) {
|
||
|
relevant <- c(relevant, list(res))
|
||
|
if (info == TRUE) {
|
||
|
# minus one because the whole data will be added later
|
||
|
cat(paste0("[", length(relevant), "]"), "Relevant:", cols[x], "vs.", cols[y], "\n")
|
||
|
}
|
||
|
}
|
||
|
|
||
|
}
|
||
|
}
|
||
|
|
||
|
cat("Total of", count, "combinations analysed;", length(relevant), "seem relevant.\n")
|
||
|
class(relevant) <- 'trends'
|
||
|
relevant <- c(relevant, list(data = data))
|
||
|
relevant
|
||
|
|
||
|
}
|
||
|
|
||
|
# @exportMethod print.trends
|
||
|
# @export
|
||
|
#' @noRd
|
||
|
print.trends <- function(x, ...) {
|
||
|
cat(length(x) - 1, "relevant trends, out of", length(x$data)^2, "\n")
|
||
|
}
|
||
|
|
||
|
# @exportMethod plot.trends
|
||
|
# @export
|
||
|
#' @noRd
|
||
|
# plot.trends <- function(x, n = NULL, ...) {
|
||
|
# if (is.null(n)) {
|
||
|
# oask <- devAskNewPage(TRUE)
|
||
|
# on.exit(devAskNewPage(oask))
|
||
|
# n <- c(1:(length(x) - 1))
|
||
|
# } else {
|
||
|
# if (n > length(x) - 1) {
|
||
|
# stop('trend unavailable, max is ', length(x) - 1, call. = FALSE)
|
||
|
# }
|
||
|
# oask <- NULL
|
||
|
# }
|
||
|
# for (i in n) {
|
||
|
# data <- x[[i]]$df
|
||
|
# if (as.character(i) %like% '1$') {
|
||
|
# suffix <- "st"
|
||
|
# } else if (as.character(i) %like% '2$') {
|
||
|
# suffix <- "nd"
|
||
|
# } else if (as.character(i) %like% '3$') {
|
||
|
# suffix <- "rd"
|
||
|
# } else {
|
||
|
# suffix <- "th"
|
||
|
# }
|
||
|
# if (!is.null(oask)) {
|
||
|
# cat(paste("Coming up:", colnames(data)[1], "vs.", colnames(data)[2]), "\n")
|
||
|
# }
|
||
|
# print(
|
||
|
# ggplot(
|
||
|
# data,
|
||
|
# aes_string(x = colnames(data)[1],
|
||
|
# y = colnames(data)[3],
|
||
|
# group = colnames(data)[2],
|
||
|
# fill = colnames(data)[2])) +
|
||
|
# geom_col(position = "dodge") +
|
||
|
# theme_minimal() +
|
||
|
# labs(title = paste(colnames(data)[1], "vs.", colnames(data)[2]),
|
||
|
# subtitle = paste0(i, suffix, " trend"))
|
||
|
# )
|
||
|
# }
|
||
|
# }
|