diff --git a/DESCRIPTION b/DESCRIPTION index b1347770..beb819a6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 0.7.1.9014 -Date: 2019-07-16 +Version: 0.7.1.9015 +Date: 2019-07-29 Title: Antimicrobial Resistance Analysis Authors@R: c( person( @@ -45,6 +45,7 @@ Depends: R (>= 3.1.0) Imports: backports, + clean, crayon (>= 1.3.0), data.table (>= 1.9.0), dplyr (>= 0.7.0), @@ -59,7 +60,6 @@ Suggests: covr (>= 3.0.1), curl, readxl, - rmarkdown, rstudioapi, rvest (>= 0.3.2), testthat (>= 1.0.2), diff --git a/NAMESPACE b/NAMESPACE index e233f55e..42b15afb 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,6 @@ # Generated by roxygen2: do not edit by hand S3method(as.data.frame,ab) -S3method(as.data.frame,freq) S3method(as.data.frame,mo) S3method(as.double,mic) S3method(as.integer,mic) @@ -10,28 +9,21 @@ S3method(as.rsi,data.frame) S3method(as.rsi,default) S3method(as.rsi,disk) S3method(as.rsi,mic) -S3method(as.vector,freq) -S3method(as_tibble,freq) S3method(barplot,mic) S3method(barplot,rsi) -S3method(boxplot,freq) -S3method(diff,freq) S3method(droplevels,mic) S3method(droplevels,rsi) -S3method(format,freq) -S3method(hist,freq) +S3method(freq,mo) +S3method(freq,rsi) S3method(kurtosis,data.frame) S3method(kurtosis,default) S3method(kurtosis,matrix) -S3method(plot,freq) S3method(plot,mic) S3method(plot,resistance_predict) S3method(plot,rsi) S3method(print,ab) S3method(print,catalogue_of_life_version) S3method(print,disk) -S3method(print,freq) -S3method(print,frequency_tbl) S3method(print,mic) S3method(print,mo) S3method(print,mo_renamed) @@ -39,7 +31,6 @@ S3method(print,mo_uncertainties) S3method(print,rsi) S3method(pull,ab) S3method(pull,mo) -S3method(select,freq) S3method(skewness,data.frame) S3method(skewness,default) S3method(skewness,matrix) @@ -98,8 +89,6 @@ export(filter_glycopeptides) export(filter_macrolides) export(filter_tetracyclines) export(first_isolate) -export(freq) -export(frequency_tbl) export(full_join_microorganisms) export(g.test) export(geom_rsi) @@ -108,7 +97,6 @@ export(get_mo_source) export(ggplot_rsi) export(ggplot_rsi_predict) export(guess_ab_col) -export(header) export(inner_join_microorganisms) export(is.ab) export(is.disk) @@ -171,35 +159,26 @@ export(semi_join_microorganisms) export(set_mo_source) export(skewness) export(theme_rsi) -export(top_freq) exportMethods(as.data.frame.ab) -exportMethods(as.data.frame.freq) exportMethods(as.data.frame.mo) exportMethods(as.double.mic) exportMethods(as.integer.mic) exportMethods(as.numeric.mic) -exportMethods(as.vector.freq) -exportMethods(as_tibble.freq) exportMethods(barplot.mic) exportMethods(barplot.rsi) -exportMethods(boxplot.freq) -exportMethods(diff.freq) exportMethods(droplevels.mic) exportMethods(droplevels.rsi) -exportMethods(format.freq) -exportMethods(hist.freq) +exportMethods(freq.mo) +exportMethods(freq.rsi) exportMethods(kurtosis) exportMethods(kurtosis.data.frame) exportMethods(kurtosis.default) exportMethods(kurtosis.matrix) -exportMethods(plot.freq) exportMethods(plot.mic) exportMethods(plot.rsi) exportMethods(print.ab) exportMethods(print.catalogue_of_life_version) exportMethods(print.disk) -exportMethods(print.freq) -exportMethods(print.frequency_tbl) exportMethods(print.mic) exportMethods(print.mo) exportMethods(print.mo_renamed) @@ -209,7 +188,6 @@ exportMethods(pull.ab) exportMethods(pull.mo) exportMethods(scale_type.ab) exportMethods(scale_type.mo) -exportMethods(select.freq) exportMethods(skewness) exportMethods(skewness.data.frame) exportMethods(skewness.default) @@ -217,6 +195,7 @@ exportMethods(skewness.matrix) exportMethods(summary.mic) exportMethods(summary.mo) exportMethods(summary.rsi) +importFrom(clean,freq) importFrom(crayon,bgGreen) importFrom(crayon,bgRed) importFrom(crayon,bgYellow) @@ -232,7 +211,6 @@ importFrom(crayon,strip_style) importFrom(crayon,underline) importFrom(crayon,white) importFrom(crayon,yellow) -importFrom(data.table,address) importFrom(data.table,as.data.table) importFrom(data.table,data.table) importFrom(data.table,setkey) @@ -241,18 +219,14 @@ importFrom(dplyr,all_vars) importFrom(dplyr,any_vars) importFrom(dplyr,arrange) importFrom(dplyr,arrange_at) -importFrom(dplyr,as_tibble) importFrom(dplyr,between) -importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) -importFrom(dplyr,desc) importFrom(dplyr,distinct) importFrom(dplyr,everything) importFrom(dplyr,filter) importFrom(dplyr,filter_all) importFrom(dplyr,filter_at) -importFrom(dplyr,full_join) importFrom(dplyr,funs) importFrom(dplyr,group_by) importFrom(dplyr,group_by_at) @@ -275,35 +249,23 @@ importFrom(dplyr,slice) importFrom(dplyr,summarise) importFrom(dplyr,summarise_if) importFrom(dplyr,tibble) -importFrom(dplyr,top_n) importFrom(dplyr,transmute) -importFrom(dplyr,ungroup) importFrom(dplyr,vars) -importFrom(grDevices,boxplot.stats) importFrom(graphics,arrows) importFrom(graphics,axis) importFrom(graphics,barplot) -importFrom(graphics,boxplot) -importFrom(graphics,hist) importFrom(graphics,par) importFrom(graphics,plot) importFrom(graphics,points) importFrom(graphics,text) -importFrom(hms,is.hms) -importFrom(knitr,kable) importFrom(microbenchmark,microbenchmark) importFrom(rlang,as_label) importFrom(rlang,enquos) -importFrom(rlang,eval_tidy) importFrom(scales,percent) importFrom(stats,complete.cases) -importFrom(stats,fivenum) importFrom(stats,glm) importFrom(stats,lm) -importFrom(stats,mad) importFrom(stats,pchisq) importFrom(stats,predict) -importFrom(stats,sd) importFrom(utils,browseURL) -importFrom(utils,browseVignettes) importFrom(utils,installed.packages) diff --git a/NEWS.md b/NEWS.md index dc924db4..4dee93cc 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ -# AMR 0.7.1.9014 +# AMR 0.7.1.9015 + +### Breaking +* Function `freq()` has moved to a new package, [`clean`](https://github.com/msberends/clean) ([CRAN link](https://cran.r-project.org/package=clean)). Creating frequency tables is actually not the scope of this package (never was) and this function has matured a lot over the last two years. We decided to create a new package for data cleaning and checking and it perfectly fits the `freq()` function. The [`clean`](https://github.com/msberends/clean) package is available on CRAN and will be installed automatically when updating the `AMR` package, that now imports it. In a later stage, the `skewness()` and `kurtosis()` functions will be moved to the `clean` package too. ### New * Additional way to calculate co-resistance, i.e. when using multiple antibiotics as input for `portion_*` functions or `count_*` functions. This can be used to determine the empiric susceptibily of a combination therapy. A new parameter `only_all_tested` (**which defaults to `FALSE`**) replaces the old `also_single_tested` and can be used to select one of the two methods to count isolates and calculate portions. The difference can be seen in this example table (which is also on the `portion` and `count` help pages), where the %SI is being determined: diff --git a/R/freq.R b/R/freq.R index 83cadbe9..38ff3880 100755 --- a/R/freq.R +++ b/R/freq.R @@ -19,1254 +19,36 @@ # Visit our website for more info: https://msberends.gitlab.io/AMR. # # ==================================================================== # -#' Frequency table -#' -#' Create a frequency table of a vector with items or a \code{data.frame}. Supports quasiquotation and markdown for reports. Best practice is: \code{data \%>\% freq(var)}.\cr -#' \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names. -#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} (may contain a grouping variable) or \code{\link{table}} -#' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples. Also supports quasiquotion. -#' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except when using grouping variables. -#' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows. -#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the amount of \code{NA}s. -#' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)} -#' @param markdown a logical value indicating whether the frequency table should be printed in markdown format. This will print all rows (except when \code{nmax} is defined) and is default behaviour in non-interactive R sessions (like when knitting RMarkdown files). -#' @param digits how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")}) -#' @param quote a logical value indicating whether or not strings should be printed with surrounding quotes. Default is to print them only around characters that are actually numeric values. -#' @param header a logical value indicating whether an informative header should be printed -#' @param title text to show above frequency table, at default to tries to coerce from the variables passed to \code{x} -#' @param na a character string that should be used to show empty (\code{NA}) values (only useful when \code{na.rm = FALSE}) -#' @param droplevels a logical value indicating whether in factors empty levels should be dropped -#' @param sep a character string to separate the terms when selecting multiple columns -#' @inheritParams base::format -#' @param f a frequency table -#' @param n number of top \emph{n} items to return, use -n for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties. -#' @param property property in header to return this value directly -#' @details Frequency tables (or frequency distributions) are summaries of the distribution of values in a sample. With the `freq` function, you can create univariate frequency tables. Multiple variables will be pasted into one variable, so it forces a univariate distribution. This package also has a vignette available to explain the use of this function further, run \code{browseVignettes("AMR")} to read it. -#' -#' For numeric values of any class, these additional values will all be calculated with \code{na.rm = TRUE} and shown into the header: -#' \itemize{ -#' \item{Mean, using \code{\link[base]{mean}}} -#' \item{Standard Deviation, using \code{\link[stats]{sd}}} -#' \item{Coefficient of Variation (CV), the standard deviation divided by the mean} -#' \item{Mean Absolute Deviation (MAD), using \code{\link[stats]{mad}}} -#' \item{Tukey Five-Number Summaries (minimum, Q1, median, Q3, maximum), using \code{\link[stats]{fivenum}}} -#' \item{Interquartile Range (IQR) calculated as \code{Q3 - Q1} using the Tukey Five-Number Summaries, i.e. \strong{not} using the \code{\link[stats]{quantile}} function} -#' \item{Coefficient of Quartile Variation (CQV, sometimes called coefficient of dispersion), calculated as \code{(Q3 - Q1) / (Q3 + Q1)} using the Tukey Five-Number Summaries} -#' \item{Outliers (total count and unique count), using \code{\link[grDevices]{boxplot.stats}}} -#' } -#' -#' For dates and times of any class, these additional values will be calculated with \code{na.rm = TRUE} and shown into the header: -#' \itemize{ -#' \item{Oldest, using \code{\link{min}}} -#' \item{Newest, using \code{\link{max}}, with difference between newest and oldest} -#' \item{Median, using \code{\link[stats]{median}}, with percentage since oldest} -#' } -#' -#' In factors, all factor levels that are not existing in the input data will be dropped. -#' -#' The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will include more than \code{n} rows if there are ties. -#' @importFrom stats fivenum sd mad -#' @importFrom grDevices boxplot.stats -#' @importFrom dplyr %>% arrange arrange_at bind_cols desc filter_at funs group_by mutate mutate_at n n_distinct pull select summarise tibble ungroup vars all_vars -#' @importFrom utils browseVignettes -#' @importFrom hms is.hms -#' @importFrom crayon red green silver -#' @importFrom rlang enquos eval_tidy as_label -#' @keywords summary summarise frequency freq -#' @rdname freq -#' @name freq -#' @return A \code{data.frame} (with an additional class \code{"freq"}) with five columns: \code{item}, \code{count}, \code{percent}, \code{cum_count} and \code{cum_percent}. -#' @export -#' @inheritSection AMR Read more on our website! -#' @examples -#' library(dplyr) -#' -#' # this all gives the same result: -#' freq(septic_patients$hospital_id) -#' freq(septic_patients[, "hospital_id"]) -#' septic_patients$hospital_id %>% freq() -#' septic_patients[, "hospital_id"] %>% freq() -#' septic_patients %>% freq("hospital_id") -#' septic_patients %>% freq(hospital_id) #<- easiest to remember (tidyverse) -#' -#' -#' # you could also use `select` or `pull` to get your variables -#' septic_patients %>% -#' filter(hospital_id == "A") %>% -#' select(mo) %>% -#' freq() -#' -#' -#' # multiple selected variables will be pasted together -#' septic_patients %>% -#' left_join_microorganisms %>% -#' freq(genus, species) -#' -#' # functions as quasiquotation are also supported -#' septic_patients %>% -#' freq(mo_genus(mo), mo_species(mo)) -#' -#' -#' # group a variable and analyse another -#' septic_patients %>% -#' group_by(hospital_id) %>% -#' freq(gender) -#' -#' -#' # get top 10 bugs of hospital A as a vector -#' septic_patients %>% -#' filter(hospital_id == "A") %>% -#' freq(mo) %>% -#' top_freq(10) -#' -#' -#' # save frequency table to an object -#' years <- septic_patients %>% -#' mutate(year = format(date, "%Y")) %>% -#' freq(year) -#' -#' -#' # show only the top 5 -#' years %>% print(nmax = 5) -#' -#' -#' # save to an object with formatted percentages -#' years <- format(years) -#' -#' -#' # print a histogram of numeric values -#' septic_patients %>% -#' freq(age) %>% -#' hist() -#' -#' # or a boxplot of numeric values -#' septic_patients %>% -#' freq(age) %>% -#' boxplot() -#' -#' # or even a boxplot per group -#' septic_patients %>% -#' group_by(hospital_id) %>% -#' freq(age) %>% -#' boxplot() -#' -#' # or print all points to a regular plot -#' septic_patients %>% -#' freq(age) %>% -#' plot() -#' -#' -#' # transform to a data.frame or tibble -#' septic_patients %>% -#' freq(age) %>% -#' as.data.frame() -#' -#' -#' # or transform (back) to a vector -#' septic_patients %>% -#' freq(age) %>% -#' as.vector() -#' -#' identical(septic_patients %>% -#' freq(age) %>% -#' as.vector() %>% -#' sort(), -#' sort(septic_patients$age)) # TRUE -#' -#' -#' # it also supports `table` objects -#' table(septic_patients$gender, -#' septic_patients$age) %>% -#' freq(sep = " **sep** ") -#' -#' -#' # only get selected columns -#' septic_patients %>% -#' freq(hospital_id) %>% -#' select(item, percent) -#' -#' septic_patients %>% -#' freq(hospital_id) %>% -#' select(-count, -cum_count) -#' -#' -#' # check differences between frequency tables -#' diff(freq(septic_patients$TMP), -#' freq(septic_patients$SXT)) -freq <- function(x, - ..., - sort.count = TRUE, - nmax = getOption("max.print.freq"), - na.rm = TRUE, - row.names = TRUE, - markdown = !interactive(), - digits = 2, - quote = NULL, - header = TRUE, - title = NULL, - na = "", - droplevels = TRUE, - sep = " ", - decimal.mark = getOption("OutDec"), - big.mark = ifelse(decimal.mark != ",", ",", ".")) { - - mult.columns <- 0 - x.group = character(0) - df <- NULL - x.name <- NULL - cols <- NULL - cols.names <- NULL - if (any(class(x) == "list")) { - cols <- names(x) - cols.names <- cols - x <- as.data.frame(x, stringsAsFactors = FALSE) - x.name <- "a list" - } else if (any(class(x) == "matrix")) { - x <- as.data.frame(x, stringsAsFactors = FALSE) - x.name <- "a matrix" - cols <- colnames(x) - quote <- FALSE - cols.names <- cols - if (all(cols %like% "V[0-9]")) { - cols <- NULL - } - } - - if (any(class(x) == "data.frame")) { - - if (is.null(x.name)) { - x.name <- deparse(substitute(x)) - } - if (x.name %like% "(%>%)") { - x.name <- x.name %>% strsplit("%>%", fixed = TRUE) %>% unlist() %>% .[1] %>% trimws() - } - if (x.name == ".") { - # passed on with pipe - x.name <- get_data_source_name(x) - if (!is.null(x.name)) { - x.name <- paste0("`", x.name, "`") - } else { - x.name <- "a data.frame" - } - } else if (!x.name %in% c("a list", "a matrix")) { - x.name <- paste0("`", x.name, "`") - } - x.name.dims <- x %>% - dim() %>% - format(decimal.mark = decimal.mark, big.mark = big.mark) %>% - trimws() %>% - paste(collapse = " x ") - x.name <- paste0(x.name, " (", x.name.dims, ")") - - x.group <- group_vars(x) - if (length(x.group) > 1) { - x.group <- x.group[1L] - warning("freq supports one grouping variable, only `", x.group, "` will be kept.", call. = FALSE) - } - - user_exprs <- enquos(...) - - if (length(user_exprs) > 0) { - new_list <- list(0) - for (i in 1:length(user_exprs)) { - new_list[[i]] <- tryCatch(eval_tidy(user_exprs[[i]], data = x), - error = function(e) stop(e$message, call. = FALSE)) - if (length(new_list[[i]]) == 1) { - if (is.character(new_list[[i]]) & new_list[[i]] %in% colnames(x)) { - # support septic_patients %>% freq("hospital_id") - new_list[[i]] <- x %>% pull(new_list[[i]]) - } - } - cols <- c(cols, as_label(user_exprs[[i]])) - } - - if (length(new_list) == 1 & length(x.group) == 0) { - # is now character - x <- new_list[[1]] - df <- NULL - } else { - # create data frame - df <- as.data.frame(new_list, col.names = cols, stringsAsFactors = FALSE) - cols.names <- colnames(df) - } - } else { - # complete data frame - df <- x - } - - if (identical(x.group, cols.names)) { - # ... %>% group_by(var = calculation(..)) %>% freq(var) - x.group <- NULL - } - - # support grouping variables - if (length(x.group) > 0) { - x.group_cols <- c(x.group, cols.names) - x <- bind_cols(x, df) - # if (droplevels == TRUE) { - # x <- x %>% mutate_at(vars(x.group_cols), droplevels) - # } - suppressWarnings( - df <- x %>% - group_by_at(vars(x.group_cols)) %>% - summarise(count = n()) - ) - if (na.rm == TRUE) { - df <- df %>% filter_at(vars(x.group_cols), all_vars(!is.na(.))) - } - if (!missing(sort.count)) { - if (sort.count == TRUE) { - df <- df %>% arrange_at(c(x.group_cols, "count"), desc) - } - } - df <- df %>% - mutate(cum_count = cumsum(count)) - - df.topleft <- df[1, 1] - df <- df %>% - ungroup() %>% - # do not repeat group labels - mutate_at(vars(x.group), ~(ifelse(lag(.) == ., "", .))) - df[1, 1] <- df.topleft - colnames(df)[1:2] <- c("group", "item") - - if (!is.null(levels(df$item)) & droplevels == TRUE) { - # is factor - df <- df %>% filter(count != 0) - } - } else { - if (!is.null(df)) { - # no groups, multiple values like: septic_patients %>% freq(mo, mo_genus(mo)) - x <- df - df <- NULL - } - } - if (length(cols) > 0 & is.data.frame(x)) { - x <- x[, cols.names] - } - - } else if (any(class(x) == "table")) { - x <- as.data.frame(x, stringsAsFactors = FALSE) - # now this DF contains 3 columns: the 2 vars and a Freq column - # paste the first 2 cols and repeat them Freq times: - x <- rep(x = do.call(paste, c(x[colnames(x)[1:2]], sep = sep)), - times = x$Freq) - x.name <- "a `table` object" - cols <- NULL - # mult.columns <- 2 - } else { - x.name <- deparse(substitute(x)) - if (all(x.name %like% "[$]") & length(x.name) == 1) { - cols <- unlist(strsplit(x.name, "$", fixed = TRUE))[2] - x.name <- unlist(strsplit(x.name, "$", fixed = TRUE))[1] - # try to find the object to determine dimensions - - x.obj <- tryCatch(get(x.name), error = function(e) NULL) - x.name <- paste0("`", x.name , "`") - if (!is.null(dim(x.obj))) { - x.name <- paste0(x.name, - " (", - x.obj %>% - dim() %>% - format(decimal.mark = decimal.mark, big.mark = big.mark) %>% - trimws() %>% - paste(collapse = " x "), - ")") - } - } else { - x.name <- NULL - cols <- NULL - } - } - - if (!is.null(ncol(x))) { - if (ncol(x) == 1 & any(class(x) == "data.frame")) { - x <- x %>% pull(1) - } else if (ncol(x) < 10) { - mult.columns <- ncol(x) - # paste old columns together - x <- do.call(paste, c(x[colnames(x)], sep = sep)) - } else { - stop("A maximum of 9 columns can be analysed at the same time.", call. = FALSE) - } - } - - if (mult.columns > 1) { - NAs <- x[is.na(x) | x == trimws(strrep("NA ", mult.columns))] - } else { - NAs <- x[is.na(x)] - } - - if (mult.columns > 0) { - header_list <- list(columns = mult.columns) - } else { - header_list <- list(class = class(x), - mode = mode(x)) - } - header_list$length <- length(x) - - if (na.rm == TRUE) { - x_class <- class(x) - x <- x[!x %in% NAs] - class(x) <- x_class - } - - markdown_line <- "" - if (markdown == TRUE) { - markdown_line <- " " - } - x_align <- "l" - - if (!is.null(levels(x))) { - header_list$levels <- levels(x) - header_list$ordered <- is.ordered(x) - # drop levels of non-existing factor values, - # since dplyr >= 0.8.0 does not do this anymore in group_by - if (droplevels == TRUE) { - x <- droplevels(x) - } - } - - header_list$na_length <- length(NAs) - header_list$unique <- n_distinct(x) - - if (NROW(x) > 0 & any(class(x) == "character")) { - header_list$shortest <- x %>% base::nchar() %>% base::min(na.rm = TRUE) - header_list$longest <- x %>% base::nchar() %>% base::max(na.rm = TRUE) - } - - if (NROW(x) > 0 & any(class(x) == "mo")) { - x_mo <- as.mo(x) # do it once for all three - header_list$families <- x_mo %>% mo_family() %>% n_distinct() - header_list$genera <- x_mo %>% mo_genus() %>% n_distinct() - header_list$species <- x_mo %>% mo_species() %>% n_distinct() - } - - if (NROW(x) > 0 & any(class(x) == "difftime") & !is.hms(x)) { - header_list$units <- attributes(x)$units - x <- as.double(x) - # after this, the numeric header_txt continues - } - - if (NROW(x) > 0 & any(class(x) %in% c("double", "integer", "numeric", "raw", "single"))) { - # right align number - x_align <- "r" - header_list$mean <- base::mean(x, na.rm = TRUE) - header_list$sd <- stats::sd(x, na.rm = TRUE) - header_list$cv <- cv(x, na.rm = TRUE) - header_list$mad <- stats::mad(x, na.rm = TRUE) - Tukey_five <- stats::fivenum(x, na.rm = TRUE) - header_list$fivenum <- Tukey_five - header_list$IQR <- Tukey_five[4] - Tukey_five[2] - header_list$cqv <- cqv(x, na.rm = TRUE) - header_list$outliers_total <- length(boxplot.stats(x)$out) - header_list$outliers_unique <- n_distinct(boxplot.stats(x)$out) - } - - if (any(class(x) == "rsi")) { - header_list$count_SI <- max(0, sum(x %in% c("S", "I"), na.rm = TRUE), na.rm = TRUE) - header_list$count_R <- max(0, sum(x == "R", na.rm = TRUE), na.rm = TRUE) - } - - formatdates <- "%e %B %Y" # = d mmmm yyyy - if (is.hms(x)) { - x <- x %>% as.POSIXlt() - formatdates <- "%H:%M:%S" - } - if (NROW(x) > 0 & any(class(x) %in% c("Date", "POSIXct", "POSIXlt"))) { - if (formatdates == "%H:%M:%S") { - # hms - header_list$earliest <- min(x, na.rm = TRUE) - header_list$latest <- max(x, na.rm = TRUE) - - } else { - # other date formats - header_list$oldest <- min(x, na.rm = TRUE) - header_list$newest <- max(x, na.rm = TRUE) - } - header_list$median <- median(x, na.rm = TRUE) - header_list$date_format <- formatdates - } - if (any(class(x) == "POSIXlt")) { - x <- x %>% format(formatdates) - } - - nmax.set <- !missing(nmax) - if (!nmax.set & is.null(nmax) & is.null(base::getOption("max.print.freq", default = NULL))) { - # default for max print setting - nmax <- 15 - } else if (is.null(nmax)) { - nmax <- length(x) - } - - if (nmax %in% c(0, Inf, NA, NULL)) { - nmax <- length(x) - } - - column_names <- c("Item", "Count", "Percent", "Cum. Count", "Cum. Percent") - column_names_df <- c("item", "count", "percent", "cum_count", "cum_percent") - column_align <- c(x_align, "r", "r", "r", "r") - - if (is.null(df)) { - - suppressWarnings( # suppress since dplyr 0.8.0, which idiotly warns about included NAs :( - # create table with counts and percentages - df <- tibble(item = x) %>% - group_by(item) %>% - summarise(count = n()) - ) - - # sort according to setting - if (sort.count == TRUE) { - df <- df %>% arrange(desc(count), item) - } else { - df <- df %>% arrange(item) - } - } else { - column_names <- c("Group", column_names) - column_names_df <-c("group", column_names_df) - column_align <- c("l", column_align) - } - - if (df$item %>% paste(collapse = ",") %like% "\033") { - # remove escape char - # see https://en.wikipedia.org/wiki/Escape_character#ASCII_escape_character - df <- df %>% mutate(item = item %>% gsub("\033", " ", ., fixed = TRUE)) - } - - if (is.null(quote)) { - if (!is.numeric(df$item) & all(df$item %like% "^[0-9]+$", na.rm = TRUE)) { - quote <- TRUE - } else { - quote <- FALSE - } - } - - if (quote == TRUE) { - df$item <- paste0('"', df$item, '"') - if (length(x.group) != 0) { - df$group <- paste0('"', df$group, '"') - } - } - - df <- as.data.frame(df, stringsAsFactors = FALSE) - - df$percent <- df$count / base::sum(df$count, na.rm = TRUE) - if (length(x.group) == 0) { - df$cum_count <- base::cumsum(df$count) - } - df$cum_percent <- df$cum_count / base::sum(df$count, na.rm = TRUE) - if (length(x.group) != 0) { - # sort columns - df <- df[, column_names_df] - } - - if (markdown == TRUE) { - tbl_format <- "markdown" - } else { - tbl_format <- "pandoc" - } - - if (!is.null(title)) { - title <- trimws(gsub("^Frequency table of", "", title[1L], ignore.case = TRUE)) - } - - # if (nmax.set == FALSE) { - # nmax <- nrow(df) - # } - - structure(.Data = df, - class = unique(c("freq", class(df))), - header = header_list, - opt = list(title = title, - data = x.name, - vars = cols, - group_var = x.group, - header = header, - row_names = row.names, - column_names = column_names, - column_align = column_align, - decimal.mark = decimal.mark, - big.mark = big.mark, - tbl_format = tbl_format, - na = na, - digits = digits, - nmax = nmax, - nmax.set = nmax.set)) -} - -#' @rdname freq -#' @export -frequency_tbl <- freq - -is.freq <- function(f) { - any(c("freq", "frequency_tbl") %in% class(f)) -} - -#' @importFrom crayon silver green red -#' @importFrom dplyr %>% -format_header <- function(x, markdown = FALSE, decimal.mark = ".", big.mark = ",", digits = 2) { - newline <-"\n" - if (markdown == TRUE) { - newline <- " \n" - # no colours in markdown - silver <- function(x) x - green <- function(x) x - red <- function(x) x - } - - header <- header(x) - x_class <- header$class - has_length <- header$length > 0 - - # FORMATTING - # rsi - if (has_length == TRUE & any(x_class == "rsi")) { - if (!is.null(attributes(x)$opt$vars)) { - ab <- tryCatch(as.ab(attributes(x)$opt$vars), error = function(e) NA) - if (!is.na(ab) & isTRUE(length(ab) > 0)) { - header$drug <- paste0(ab_name(ab[1L]), " (", ab[1L], ", ", ab_atc(ab[1L]), ")") - header$group <- ab_group(ab[1L]) - } - } - header$`%SI` <- percent(header$count_SI / (header$count_SI + header$count_R), - force_zero = TRUE, round = digits, decimal.mark = decimal.mark) - } - header <- header[!names(header) %in% c("count_SI", "count_R")] - # dates - if (!is.null(header$date_format)) { - if (header$date_format == "%H:%M:%S") { - header$median <- paste0(format(header$median, header$date_format), - " (", - (as.double(difftime(header$median, header$earliest, units = "auto")) / - as.double(difftime(header$latest, header$earliest, units = "auto"))) %>% - percent(round = digits, decimal.mark = decimal.mark), ")") - header$latest <- paste0(format(header$latest, header$date_format), - " (+", - difftime(header$latest, header$earliest, units = "mins") %>% - as.double() %>% - format(digits = digits, decimal.mark = decimal.mark, big.mark = big.mark), - " min.)") - header$earliest <- format(header$earliest, header$date_format) - - header$median <- trimws(header$median) - header$latest <- trimws(header$latest) - header$earliest <- trimws(header$earliest) - } else { - header$median <- paste0(format(header$median, header$date_format), - " (", - (as.double(difftime(header$median, header$oldest, units = "auto")) / - as.double(difftime(header$newest, header$oldest, units = "auto"))) %>% - percent(round = digits, decimal.mark = decimal.mark), ")") - header$newest <- paste0(format(header$newest, header$date_format), - " (+", - difftime(header$newest, header$oldest, units = "auto") %>% - as.double() %>% - format(digits = digits, decimal.mark = decimal.mark, big.mark = big.mark), - ")") - header$oldest <- format(header$oldest, header$date_format) - - header$median <- trimws(header$median) - header$newest <- trimws(header$newest) - header$oldest <- trimws(header$oldest) - } - header <- header[names(header) != "date_format"] - } - - # class and mode - if (is.null(header$columns)) { - if (!header$mode %in% header$class) { - header$class <- header$class %>% rev() %>% paste(collapse = " > ") %>% paste0(silver(paste0(" (", header$mode, ")"))) - } else { - header$class <- header$class %>% rev() %>% paste(collapse = " > ") - } - header <- header[names(header) != "mode"] - } - # levels - if (!is.null(header$levels)) { - if (header$ordered == TRUE) { - levels_text <- paste0(header$levels, collapse = " < ") - } else { - levels_text <- paste0(header$levels, collapse = ", ") - } - if (nchar(levels_text) > 70) { - # levels text wider than half the console - levels_text <- paste0(substr(levels_text, 1, 70 - 3), "...") - if (nchar(gsub("[^`]", "", levels_text)) %% 2 == 1) { - # odd number of backticks, should be even - levels_text <- paste0(levels_text, "`") - } - } - header$levels <- paste0(length(header$levels), ": ", levels_text) - header <- header[names(header) != "ordered"] - } - # length and NAs - if (has_length == TRUE) { - na_txt <- paste0(header$na_length %>% format(decimal.mark = decimal.mark, big.mark = big.mark), " = ", - (header$na_length / header$length) %>% percent(force_zero = TRUE, round = digits, decimal.mark = decimal.mark) %>% - sub("NaN", "0", ., fixed = TRUE)) - if (!na_txt %like% "^0 =") { - na_txt <- red(na_txt) - } else { - na_txt <- green(na_txt) - } - na_txt <- paste0("(of which NA: ", na_txt, ")") - } else { - na_txt <- "" - } - header$length <- paste(format(header$length, decimal.mark = decimal.mark, big.mark = big.mark), - na_txt) - header <- header[names(header) != "na_length"] - - # format all numeric values - header <- lapply(header, function(x) { - if (is.numeric(x)) { - if (any(x < 1000, na.rm = TRUE)) { - format(round2(x, digits = digits), decimal.mark = decimal.mark, big.mark = big.mark) - } else { - format(x, digits = digits, decimal.mark = decimal.mark, big.mark = big.mark) - } - } else { - x - } - }) - - # numeric values - if (has_length == TRUE & !is.null(header$sd)) { - # any(x_class %in% c("double", "integer", "numeric", "raw", "single"))) { - header$sd <- paste0(header$sd, " (CV: ", header$cv, ", MAD: ", header$mad, ")") - header$fivenum <- paste0(paste(trimws(header$fivenum), collapse = " | "), " (IQR: ", header$IQR, ", CQV: ", header$cqv, ")") - header$outliers_total <- paste0(header$outliers_total, " (unique count: ", header$outliers_unique, ")") - header <- header[!names(header) %in% c("cv", "mad", "IQR", "cqv", "outliers_unique")] - } - - # header names - header_names <- paste0(names(header), ": ") - header_names <- gsub("sd", "SD", header_names) - header_names <- gsub("fivenum", "Five-Num", header_names) - header_names <- gsub("outliers_total", "Outliers", header_names) - # capitalise first character - header_names <- gsub("^(.)", "\\U\\1", header_names, perl = TRUE) - # make all header captions equal size - header_names <- gsub("\\s", " ", format(header_names, - width = max(nchar(header_names), - na.rm = TRUE))) - header <- paste0(header_names, header) - header <- paste(header, collapse = newline) - # add newline after 'Unique' - gsub("(.*Unique.*\\n)(.*?)", paste0("\\1", newline, "\\2"), header) -} - -#' @rdname freq -#' @export -#' @importFrom dplyr top_n pull -top_freq <- function(f, n) { - if (!is.freq(f)) { - stop("`top_freq` can only be applied to frequency tables", call. = FALSE) - } - if (!is.numeric(n) | length(n) != 1L) { - stop("For `top_freq`, 'n' must be a number of length 1", call. = FALSE) - } - top <- f %>% top_n(n, count) - vect <- top %>% pull(item) - names(vect) <- top %>% pull(count) - if (length(vect) > abs(n)) { - message("top_freq: selecting ", length(vect), " items instead of ", abs(n), ", because of ties") - } - vect -} - -#' @rdname freq -#' @export -header <- function(f, property = NULL) { - if (!is.freq(f)) { - stop("`header` can only be applied to frequency tables", call. = FALSE) - } - if (is.null(property)) { - attributes(f)$header - } else { - a <- attributes(f)$header - if (any(property %in% names(f))) { - a[names(a) %in% property] - } - } -} - -#' @noRd -#' @exportMethod diff.freq -#' @importFrom dplyr %>% full_join mutate -#' @export -diff.freq <- function(x, y, ...) { - # check classes - if (!is.freq(x) | !is.freq(y)) { - stop("Both x and y must be a frequency table.") - } - - cat("Differences between frequency tables") - if (identical(x, y)) { - cat("\n\nNo differences found.\n") - return(invisible()) - } - - x.attr <- attributes(x)$opt - - # only keep item and count - x <- x[, 1:2] - y <- y[, 1:2] - - x <- x %>% - full_join(y, - by = colnames(x)[1], - suffix = c(".x", ".y")) %>% - mutate( - diff = case_when( - is.na(count.y) ~ -count.x, - is.na(count.x) ~ count.y, - TRUE ~ count.y - count.x)) %>% - mutate( - diff.percent = percent( - diff / count.x, - force_zero = TRUE)) %>% - mutate(diff = ifelse(diff %like% "^-", - diff, - paste0("+", diff)), - diff.percent = ifelse(diff.percent %like% "^-", - diff.percent, - paste0("+", diff.percent))) - - print( - knitr::kable(x, - format = x.attr$tbl_format, - col.names = c("Item", "Count #1", "Count #2", "Difference", "Diff. percent"), - align = paste0(x.attr$column_align[1], "rrrr"), - padding = 1) - ) -} - -#' @rdname freq -#' @exportMethod print.freq -#' @importFrom knitr kable +freq_def <- clean:::freq.default +#' @exportMethod freq.mo #' @importFrom dplyr n_distinct -#' @importFrom crayon bold silver +#' @importFrom clean freq #' @export -print.freq <- function(x, - nmax = getOption("max.print.freq", default = 15), - markdown = !interactive(), - header = TRUE, - decimal.mark = getOption("OutDec"), - big.mark = ifelse(decimal.mark != ",", ",", "."), - ...) { - - opt <- attr(x, "opt") - if (is.null(opt)) { - # selection of frequency table, return original class - class(x) <- class(x)[!class(x) %in% c("freq", "frequency_tbl")] - print(x) - return(invisible()) - } - - opt$header_txt <- header(x) - if (is.null(opt$nmax)) { - opt$nmax <- 0 - } - if (is.null(opt$tbl_format)) { - opt$tbl_format <- "pandoc" - } - - dots <- list(...) - if ("markdown" %in% names(dots)) { - if (dots$markdown == TRUE) { - opt$tbl_format <- "markdown" - } else { - opt$tbl_format <- "pandoc" - } - } - if (!missing(markdown)) { - if (markdown == TRUE) { - opt$tbl_format <- "markdown" - } else { - opt$tbl_format <- "pandoc" - } - } - - if (length(opt$vars) == 0) { - opt$vars <- NULL - } - - if (is.null(opt$title)) { - if (isTRUE(opt$data %like% "^a data.frame") & isTRUE(opt$tbl_format == "markdown")) { - opt$data <- gsub("data.frame", "`data.frame`", opt$data, fixed = TRUE) - } - if (!is.null(opt$data) & !is.null(opt$vars)) { - title <- paste0("`", paste0(opt$vars, collapse = "` and `"), "` from ", opt$data) - } else if (!is.null(opt$data) & is.null(opt$vars)) { - title <- opt$data - } else if (is.null(opt$data) & !is.null(opt$vars)) { - title <- paste0("`", paste0(opt$vars, collapse = "` and `"), "`") - } else { - title <- "" - } - if (title != "" & length(opt$group_var) != 0) { - group_var <- paste0("(grouped by `", opt$group_var, "`)") - if (opt$tbl_format == "pandoc") { - group_var <- silver(group_var) - } - title <- paste(title, group_var) - } - title <- trimws(title) - if (title == "") { - title <- "Frequency table" - } else { - title <- paste("Frequency table of", trimws(title)) - } - } else { - title <- opt$title - } - - if (!missing(nmax) | is.null(opt$nmax)) { - opt$nmax <- nmax - opt$nmax.set <- TRUE - } - if (isTRUE(opt$nmax %in% c(0, Inf, NA, NULL))) { - opt$nmax <- NROW(x) - opt$nmax.set <- FALSE - } else if (isTRUE(opt$nmax >= NROW(x))) { - opt$nmax.set <- FALSE - } - - if (!missing(decimal.mark) | is.null(opt$decimal.mark)) { - opt$decimal.mark <- decimal.mark - } - if (!missing(big.mark) | is.null(opt$big.mark)) { - opt$big.mark <- big.mark - } - if (!missing(header)) { - opt$header <- header - } - - # bold title - if (isTRUE(opt$tbl_format == "pandoc")) { - title <- bold(title) - } else if (isTRUE(opt$tbl_format == "markdown")) { - title <- paste0("\n\n**", title, "** ") # two space for newline - } - - cat(title, "\n\n") - - if (NROW(x) == 0 | isTRUE(all(is.na(x$item)))) { - cat("No observations") - if (isTRUE(all(is.na(x$item) | identical(x$item, "") | identical(x$item, "(NA)")))) { - cat(" - all values are missing ()") - } - cat(".\n") - if (opt$tbl_format == "markdown") { - cat("\n") - } - return(invisible()) - } - - if (isTRUE(opt$header == TRUE)) { - if (!is.null(opt$header_txt)) { - if (is.null(opt$digits)) { - opt$digits <- 2 - } - cat(format_header(x, digits = opt$digits, markdown = (opt$tbl_format == "markdown"), - decimal.mark = decimal.mark, big.mark = big.mark)) - } - } - - # save old NA setting for kable - opt.old <- options()$knitr.kable.NA - if (is.null(opt$na)) { - opt$na <- "" - } - if (isTRUE(opt$tbl_format == "markdown")) { - # no HTML tags - opt$na <- gsub("<", "(", opt$na, fixed = TRUE) - opt$na <- gsub(">", ")", opt$na, fixed = TRUE) - } - options(knitr.kable.NA = opt$na) - - x.rows <- nrow(x) - x.unprinted <- base::sum(x[(opt$nmax + 1):nrow(x), "count"], na.rm = TRUE) - x.printed <- base::sum(x$count) - x.unprinted - - if (nrow(x) > opt$nmax & isTRUE(opt$tbl_format != "markdown")) { - - if (opt$nmax.set == TRUE) { - nmax <- opt$nmax - } else { - nmax <- getOption("max.print.freq", default = 15) - } - - x <- x[1:nmax,] - - if (opt$nmax.set == TRUE) { - footer <- paste("[ reached `nmax = ", opt$nmax, "`", sep = "") - } else { - footer <- '[ reached getOption("max.print.freq")' - } - footer <- paste(footer, - " -- omitted ", - format(x.rows - opt$nmax, big.mark = opt$big.mark, decimal.mark = opt$decimal.mark), - " entries, n = ", - format(x.unprinted, big.mark = opt$big.mark, decimal.mark = opt$decimal.mark), - " (", - (x.unprinted / (x.unprinted + x.printed)) %>% percent(force_zero = TRUE, decimal.mark = opt$decimal.mark), - ") ]\n", sep = "") - if (opt$tbl_format == "pandoc") { - footer <- silver(footer) # only silver in regular printing - } - } else if (opt$tbl_format == "markdown") { - if (opt$nmax.set == TRUE) { - x <- x[1:opt$nmax,] - footer <- paste("\n(omitted ", - format(x.rows - opt$nmax, big.mark = opt$big.mark, decimal.mark = opt$decimal.mark), - " entries, n = ", - format(x.unprinted, big.mark = opt$big.mark, decimal.mark = opt$decimal.mark), - " [", - (x.unprinted / (x.unprinted + x.printed)) %>% percent(force_zero = TRUE, decimal.mark = opt$decimal.mark), - "])\n", sep = "") - } else { - footer <- NULL - } - } else { - footer <- NULL - } - - if ("item" %in% colnames(x)) { - if (any(class(x$item) %in% c("double", "integer", "numeric", "raw", "single"))) { - x$item <- format(x$item, decimal.mark = opt$decimal.mark, big.mark = opt$big.mark) - } - } else { - opt$column_names <- opt$column_names[!opt$column_names == "Item"] - } - - all_unique <- FALSE - if ("count" %in% colnames(x)) { - if (all(x$count == 1)) { - all_unique <- TRUE - } - x$count <- format(x$count, decimal.mark = opt$decimal.mark, big.mark = opt$big.mark) - } else { - opt$column_names <- opt$column_names[!opt$column_names == "Count"] - } - if ("percent" %in% colnames(x)) { - x$percent <- percent(x$percent, force_zero = TRUE, decimal.mark = opt$decimal.mark) - } else { - opt$column_names <- opt$column_names[!opt$column_names == "Percent"] - } - if ("cum_count" %in% colnames(x)) { - x$cum_count <- format(x$cum_count, decimal.mark = opt$decimal.mark, big.mark = opt$big.mark) - } else { - opt$column_names <- opt$column_names[!opt$column_names == "Cum. Count"] - } - if ("cum_percent" %in% colnames(x)) { - x$cum_percent <- percent(x$cum_percent, force_zero = TRUE, decimal.mark = opt$decimal.mark) - } else { - opt$column_names <- opt$column_names[!opt$column_names == "Cum. Percent"] - } - - if (opt$tbl_format == "markdown") { - cat("\n") - } - - if (is.null(opt$row_names)) { - opt$row_names <- TRUE - } - if (is.null(opt$column_names)) { - opt$column_names <- colnames(x) - } - - print( - knitr::kable(x, - format = opt$tbl_format, - row.names = opt$row_names, - col.names = opt$column_names, - align = opt$column_align, - padding = 1) - ) - - if (!is.null(footer)) { - cat(footer) - } - - if (opt$tbl_format == "markdown") { - cat("\n\n") - } else { - cat("\n") - } - - if (all_unique == TRUE) { - message("NOTE: All observations are unique.") - } - - # reset old kable setting - options(knitr.kable.NA = opt.old) - return(invisible()) - -} - #' @noRd -#' @exportMethod print.frequency_tbl -#' @export -print.frequency_tbl <- print.freq +freq.mo <- function(x, ...) { + # replace with freq.default() if next `clean` version is published on CRAN + freq_def(x = x, ..., + .add_header = list(families = n_distinct(mo_family(x, language = NULL)), + genera = n_distinct(mo_genus(x, language = NULL)), + species = n_distinct(paste(mo_genus(x, language = NULL), + mo_species(x, language = NULL))))) +} +#' @exportMethod freq.rsi +#' @importFrom clean freq +#' @export #' @noRd -#' @exportMethod as.data.frame.freq -#' @export -as.data.frame.freq <- function(x, ...) { - attr(x, "package") <- NULL - attr(x, "opt") <- NULL - as.data.frame.data.frame(x, ...) -} - -#' @exportMethod select.freq -#' @export -#' @importFrom dplyr select -#' @noRd -select.freq <- function(.data, ...) { - select(as.data.frame(.data), ...) -} - -#' @noRd -#' @exportMethod as_tibble.freq -#' @export -#' @importFrom dplyr as_tibble -as_tibble.freq <- function(x, validate = TRUE, ..., rownames = NA) { - attr(x, "package") <- NULL - attr(x, "opt") <- NULL - as_tibble(x = as.data.frame(x), validate = validate, ..., rownames = rownames) -} - -#' @noRd -#' @exportMethod hist.freq -#' @export -#' @importFrom graphics hist -hist.freq <- function(x, breaks = "Sturges", main = NULL, xlab = NULL, ...) { - opt <- attr(x, "opt") - if (!class(x$item) %in% c("numeric", "double", "integer", "Date")) { - stop("`x` must be numeric or Date.", call. = FALSE) - } - if (!is.null(opt$vars)) { - title <- opt$vars - } else if (!is.null(opt$data)) { - title <- opt$data +freq.rsi <- function(x, ...) { + x_name <- deparse(substitute(x)) + x_name <- gsub(".*[$]", "", x_name) + ab <- suppressMessages(suppressWarnings(AMR::as.ab(x_name))) + if (!is.na(ab)) { + freq_def(x = x, ..., + .add_header = list(Drug = paste0(ab_name(ab), " (", ab, ", ", ab_atc(ab), ")"), + group = ab_group(ab), + `%SI` = portion_SI(x, minimum = 0, as_percent = TRUE))) } else { - title <- "frequency table" - } - if (class(x$item) == "Date") { - x <- as.Date(as.vector(x), origin = "1970-01-01") - } else { - x <- as.vector(x) - } - if (is.null(main)) { - main <- paste("Histogram of", title) - } - if (is.null(xlab)) { - xlab <- title - } - hist(x, main = main, xlab = xlab, breaks = breaks, ...) -} - -#' @noRd -#' @exportMethod boxplot.freq -#' @export -#' @importFrom graphics boxplot -boxplot.freq <- function(x, main = NULL, xlab = NULL, ...) { - opt <- attr(x, "opt") - x.bak <- x - if (!class(x$item) %in% c("numeric", "double", "integer", "Date")) { - stop("`x` must be numeric or Date.", call. = FALSE) - } - if (!is.null(opt$vars)) { - title <- opt$vars - } else if (!is.null(opt$data)) { - title <- opt$data - } else { - title <- "frequency table" - } - if (class(x$item) == "Date") { - x <- as.Date(as.vector(x), origin = "1970-01-01") - } else { - x <- as.vector(x) - } - if (is.null(main)) { - main <- paste("Boxplot of", title) - } - if (is.null(xlab)) { - xlab <- title - } - if (!is.null(opt$group_var) & isTRUE(length(opt$group_var) > 0)) { - # support for grouped frequency table - x.new <- data.frame(group = character(0), item = character(0)) - for (i in 1:nrow(x.bak)) { - if (x.bak[i, "group"] == "") { - x.bak[i, "group"] <- x.bak[i - 1, "group"] - } - for (j in 1:x.bak[i, "count"]) { - x.new <- rbind(x.new, - data.frame(group = x.bak[i, "group"], - item = x.bak[i, "item"])) - } - } - boxplot(item ~ group, data = x.bak, main = main, ylab = xlab, xlab = opt$group_var, ...) - } else { - boxplot(x, main = main, xlab = xlab, ...) - } -} - -#' @noRd -#' @exportMethod plot.freq -#' @export -plot.freq <- function(x, y, ...) { - opt <- attr(x, "opt") - if (!is.null(opt$vars)) { - title <- opt$vars - } else { - title <- "" - } - plot(x = x$item, y = x$count, ylab = "Count", xlab = title, ...) -} - -#' @noRd -#' @exportMethod as.vector.freq -#' @export -as.vector.freq <- function(x, mode = "any") { - as.vector(rep(x$item, x$count), mode = mode) -} - -#' @noRd -#' @exportMethod format.freq -#' @export -format.freq <- function(x, digits = 1, ...) { - opt <- attr(x, "opt") - if (opt$nmax.set == TRUE) { - nmax <- opt$nmax - } else { - nmax <- getOption("max.print.freq", default = 15) - } - - x <- x[1:nmax,] - x$percent <- percent(x$percent, round = digits, force_zero = TRUE) - x$cum_percent <- percent(x$cum_percent, round = digits, force_zero = TRUE) - base::format.data.frame(x, ...) -} - -#' @importFrom data.table address -get_data_source_name <- function(x, else_txt = NULL) { - obj_addr <- address(x) - # try global environment - addrs <- unlist(lapply(ls(".GlobalEnv"), function(x) address(get(x)))) - res <- ls(".GlobalEnv")[addrs == obj_addr] - if (length(res) == 0) { - # check AMR package - some users might use our data sets for testing - addrs <- unlist(lapply(ls("package:AMR"), function(x) address(get(x)))) - res <- ls("package:AMR")[addrs == obj_addr] - } - if (length(res) == 0) { - else_txt - } else { - res + freq_def(x = x, ..., + .add_header = list(`%SI` = portion_SI(x, minimum = 0, as_percent = TRUE))) } } diff --git a/_pkgdown.yml b/_pkgdown.yml index 380e5f53..b230db94 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -62,12 +62,6 @@ navbar: - text: "Get properties of an antibiotic" icon: "fa-capsules" href: "reference/ab_property.html" # reference instead of article - - text: "Create frequency tables" - icon: "fa-sort-amount-down" - href: "articles/freq.html" -# - text: "Use the G-test" -# icon: "fa-clipboard-check" -# href: "reference/g.test.html" # reference instead of article - text: "Other: benchmarks" icon: "fa-shipping-fast" href: "articles/benchmarks.html" @@ -130,13 +124,12 @@ reference: - title: "Analysing your data" desc: > Functions for conducting AMR analysis, like counting isolates, calculating - resistance or susceptibility, creating frequency tables or make plots. + resistance or susceptibility, or make plots. contents: - "`availability`" - "`count`" - "`portion`" - "`filter_ab_class`" - - "`freq`" - "`g.test`" - "`ggplot_rsi`" - "`kurtosis`" diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index bcc8c1b6..9aba1cdb 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9014 + 0.7.1.9015 @@ -156,13 +156,6 @@ Get properties of an antibiotic -
  • - - - - Create frequency tables - -
  • diff --git a/docs/articles/AMR.html b/docs/articles/AMR.html index f1537c11..bed45232 100644 --- a/docs/articles/AMR.html +++ b/docs/articles/AMR.html @@ -40,7 +40,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -118,13 +118,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    How to conduct AMR analysis

    Matthijs S. Berends

    -

    10 July 2019

    +

    29 July 2019

    @@ -201,7 +194,7 @@ -

    Note: values on this page will change with every website update since they are based on randomly created values and the page was written in R Markdown. However, the methodology remains unchanged. This page was generated on 10 July 2019.

    +

    Note: values on this page will change with every website update since they are based on randomly created values and the page was written in R Markdown. However, the methodology remains unchanged. This page was generated on 29 July 2019.

    Introduction

    @@ -217,21 +210,21 @@ -2019-07-10 +2019-07-29 abcd Escherichia coli S S -2019-07-10 +2019-07-29 abcd Escherichia coli S R -2019-07-10 +2019-07-29 efgh Escherichia coli R @@ -327,42 +320,9 @@ -2011-07-20 -X1 -Hospital C -Staphylococcus aureus -R -I -S -S -F - - -2017-12-18 -Q8 -Hospital B -Streptococcus pneumoniae -S -S -S -S -F - - -2015-12-08 -X5 -Hospital B -Staphylococcus aureus -S -S -R -R -F - - -2011-12-08 -X3 -Hospital B +2017-01-19 +U10 +Hospital A Klebsiella pneumoniae S S @@ -370,27 +330,60 @@ S F + +2011-09-03 +T7 +Hospital B +Escherichia coli +I +S +S +R +F + -2011-01-02 -E1 +2011-12-04 +U5 Hospital A -Streptococcus pneumoniae +Staphylococcus aureus R S +R +S +F + + +2014-06-26 +O1 +Hospital A +Staphylococcus aureus +S +I +S +S +F + + +2012-08-04 +G2 +Hospital B +Streptococcus pneumoniae +R +R S S M -2012-10-22 -K1 -Hospital C +2015-05-10 +W8 +Hospital B Streptococcus pneumoniae S S S S -M +F @@ -400,9 +393,11 @@

    Cleaning the data

    -

    Use the frequency table function freq() to look specifically for unique values in any variable. For example, for the gender variable:

    -
    data %>% freq(gender) # this would be the same: freq(data$gender)
    -
    # Frequency table of `gender` from `data` (20,000 x 9) 
    +

    We also created a package dedicated to data cleaning and checking, called the clean package. It gets automatically installed with the AMR package, so we only have to load it:

    +
    library(clean)
    +

    Use the frequency table function freq() from this clean package to look specifically for unique values in any variable. For example, for the gender variable:

    +
    data %>% freq(gender) # this would be the same: freq(data$gender)
    +
    # Frequency table 
     # 
     # Class:   factor (numeric)
     # Length:  20,000 (of which NA: 0 = 0.00%)
    @@ -411,82 +406,82 @@
     # 
     #      Item     Count   Percent   Cum. Count   Cum. Percent
     # ---  -----  -------  --------  -----------  -------------
    -# 1    M       10,454     52.3%       10,454          52.3%
    -# 2    F        9,546     47.7%       20,000         100.0%
    +# 1 M 10,424 52.1% 10,424 52.1% +# 2 F 9,576 47.9% 20,000 100.0%

    So, we can draw at least two conclusions immediately. From a data scientists perspective, the data looks clean: only values M and F. From a researchers perspective: there are slightly more men. Nothing we didn’t already know.

    The data is already quite clean, but we still need to transform some variables. The bacteria column now consists of text, and we want to add more variables based on microbial IDs later on. So, we will transform this column to valid IDs. The mutate() function of the dplyr package makes this really easy:

    -
    data <- data %>%
    -  mutate(bacteria = as.mo(bacteria))
    -

    We also want to transform the antibiotics, because in real life data we don’t know if they are really clean. The as.rsi() function ensures reliability and reproducibility in these kind of variables. The mutate_at() will run the as.rsi() function on defined variables:

    data <- data %>%
    -  mutate_at(vars(AMX:GEN), as.rsi)
    + mutate(bacteria = as.mo(bacteria))
    +

    We also want to transform the antibiotics, because in real life data we don’t know if they are really clean. The as.rsi() function ensures reliability and reproducibility in these kind of variables. The mutate_at() will run the as.rsi() function on defined variables:

    +
    data <- data %>%
    +  mutate_at(vars(AMX:GEN), as.rsi)

    Finally, we will apply EUCAST rules on our antimicrobial results. In Europe, most medical microbiological laboratories already apply these rules. Our package features their latest insights on intrinsic resistance and exceptional phenotypes. Moreover, the eucast_rules() function can also apply additional rules, like forcing ampicillin = R when amoxicillin/clavulanic acid = R.

    Because the amoxicillin (column AMX) and amoxicillin/clavulanic acid (column AMC) in our data were generated randomly, some rows will undoubtedly contain AMX = S and AMC = R, which is technically impossible. The eucast_rules() fixes this:

    -
    data <- eucast_rules(data, col_mo = "bacteria")
    -# 
    -# Rules by the European Committee on Antimicrobial Susceptibility Testing (EUCAST)
    -# http://eucast.org/
    -# 
    -# EUCAST Clinical Breakpoints (v9.0, 2019)
    -# Aerococcus sanguinicola (no new changes)
    -# Aerococcus urinae (no new changes)
    -# Anaerobic Gram-negatives (no new changes)
    -# Anaerobic Gram-positives (no new changes)
    -# Campylobacter coli (no new changes)
    -# Campylobacter jejuni (no new changes)
    -# Enterobacteriales (Order) (no new changes)
    -# Enterococcus (no new changes)
    -# Haemophilus influenzae (no new changes)
    -# Kingella kingae (no new changes)
    -# Moraxella catarrhalis (no new changes)
    -# Pasteurella multocida (no new changes)
    -# Staphylococcus (no new changes)
    -# Streptococcus groups A, B, C, G (no new changes)
    -# Streptococcus pneumoniae (1,481 new changes)
    -# Viridans group streptococci (no new changes)
    -# 
    -# EUCAST Expert Rules, Intrinsic Resistance and Exceptional Phenotypes (v3.1, 2016)
    -# Table 01: Intrinsic resistance in Enterobacteriaceae (1,328 new changes)
    -# Table 02: Intrinsic resistance in non-fermentative Gram-negative bacteria (no new changes)
    -# Table 03: Intrinsic resistance in other Gram-negative bacteria (no new changes)
    -# Table 04: Intrinsic resistance in Gram-positive bacteria (2,778 new changes)
    -# Table 08: Interpretive rules for B-lactam agents and Gram-positive cocci (no new changes)
    -# Table 09: Interpretive rules for B-lactam agents and Gram-negative rods (no new changes)
    -# Table 11: Interpretive rules for macrolides, lincosamides, and streptogramins (no new changes)
    -# Table 12: Interpretive rules for aminoglycosides (no new changes)
    -# Table 13: Interpretive rules for quinolones (no new changes)
    -# 
    -# Other rules
    -# Non-EUCAST: amoxicillin/clav acid = S where ampicillin = S (2,245 new changes)
    -# Non-EUCAST: ampicillin = R where amoxicillin/clav acid = R (118 new changes)
    -# Non-EUCAST: piperacillin = R where piperacillin/tazobactam = R (no new changes)
    -# Non-EUCAST: piperacillin/tazobactam = S where piperacillin = S (no new changes)
    -# Non-EUCAST: trimethoprim = R where trimethoprim/sulfa = R (no new changes)
    -# Non-EUCAST: trimethoprim/sulfa = S where trimethoprim = S (no new changes)
    -# 
    -# --------------------------------------------------------------------------
    -# EUCAST rules affected 6,581 out of 20,000 rows, making a total of 7,950 edits
    -# => added 0 test results
    -# 
    -# => changed 7,950 test results
    -#    - 120 test results changed from S to I
    -#    - 4,812 test results changed from S to R
    -#    - 1,089 test results changed from I to S
    -#    - 317 test results changed from I to R
    -#    - 1,588 test results changed from R to S
    -#    - 24 test results changed from R to I
    -# --------------------------------------------------------------------------
    -# 
    -# Use verbose = TRUE (on your original data) to get a data.frame with all specified edits instead.
    +
    data <- eucast_rules(data, col_mo = "bacteria")
    +# 
    +# Rules by the European Committee on Antimicrobial Susceptibility Testing (EUCAST)
    +# http://eucast.org/
    +# 
    +# EUCAST Clinical Breakpoints (v9.0, 2019)
    +# Aerococcus sanguinicola (no new changes)
    +# Aerococcus urinae (no new changes)
    +# Anaerobic Gram-negatives (no new changes)
    +# Anaerobic Gram-positives (no new changes)
    +# Campylobacter coli (no new changes)
    +# Campylobacter jejuni (no new changes)
    +# Enterobacteriales (Order) (no new changes)
    +# Enterococcus (no new changes)
    +# Haemophilus influenzae (no new changes)
    +# Kingella kingae (no new changes)
    +# Moraxella catarrhalis (no new changes)
    +# Pasteurella multocida (no new changes)
    +# Staphylococcus (no new changes)
    +# Streptococcus groups A, B, C, G (no new changes)
    +# Streptococcus pneumoniae (1,485 new changes)
    +# Viridans group streptococci (no new changes)
    +# 
    +# EUCAST Expert Rules, Intrinsic Resistance and Exceptional Phenotypes (v3.1, 2016)
    +# Table 01: Intrinsic resistance in Enterobacteriaceae (1,312 new changes)
    +# Table 02: Intrinsic resistance in non-fermentative Gram-negative bacteria (no new changes)
    +# Table 03: Intrinsic resistance in other Gram-negative bacteria (no new changes)
    +# Table 04: Intrinsic resistance in Gram-positive bacteria (2,746 new changes)
    +# Table 08: Interpretive rules for B-lactam agents and Gram-positive cocci (no new changes)
    +# Table 09: Interpretive rules for B-lactam agents and Gram-negative rods (no new changes)
    +# Table 11: Interpretive rules for macrolides, lincosamides, and streptogramins (no new changes)
    +# Table 12: Interpretive rules for aminoglycosides (no new changes)
    +# Table 13: Interpretive rules for quinolones (no new changes)
    +# 
    +# Other rules
    +# Non-EUCAST: amoxicillin/clav acid = S where ampicillin = S (2,284 new changes)
    +# Non-EUCAST: ampicillin = R where amoxicillin/clav acid = R (116 new changes)
    +# Non-EUCAST: piperacillin = R where piperacillin/tazobactam = R (no new changes)
    +# Non-EUCAST: piperacillin/tazobactam = S where piperacillin = S (no new changes)
    +# Non-EUCAST: trimethoprim = R where trimethoprim/sulfa = R (no new changes)
    +# Non-EUCAST: trimethoprim/sulfa = S where trimethoprim = S (no new changes)
    +# 
    +# --------------------------------------------------------------------------
    +# EUCAST rules affected 6,581 out of 20,000 rows, making a total of 7,943 edits
    +# => added 0 test results
    +# 
    +# => changed 7,943 test results
    +#    - 100 test results changed from S to I
    +#    - 4,767 test results changed from S to R
    +#    - 1,140 test results changed from I to S
    +#    - 315 test results changed from I to R
    +#    - 1,602 test results changed from R to S
    +#    - 19 test results changed from R to I
    +# --------------------------------------------------------------------------
    +# 
    +# Use verbose = TRUE (on your original data) to get a data.frame with all specified edits instead.

    Adding new variables

    Now that we have the microbial ID, we can add some taxonomic properties:

    -
    data <- data %>% 
    -  mutate(gramstain = mo_gramstain(bacteria),
    -         genus = mo_genus(bacteria),
    -         species = mo_species(bacteria))
    +
    data <- data %>% 
    +  mutate(gramstain = mo_gramstain(bacteria),
    +         genus = mo_genus(bacteria),
    +         species = mo_species(bacteria))

    First isolates

    @@ -497,23 +492,23 @@

    (…) When preparing a cumulative antibiogram to guide clinical decisions about empirical antimicrobial therapy of initial infections, only the first isolate of a given species per patient, per analysis period (eg, one year) should be included, irrespective of body site, antimicrobial susceptibility profile, or other phenotypical characteristics (eg, biotype). The first isolate is easily identified, and cumulative antimicrobial susceptibility test data prepared using the first isolate are generally comparable to cumulative antimicrobial susceptibility test data calculated by other methods, providing duplicate isolates are excluded.
    M39-A4 Analysis and Presentation of Cumulative Antimicrobial Susceptibility Test Data, 4th Edition. CLSI, 2014. Chapter 6.4

    This AMR package includes this methodology with the first_isolate() function. It adopts the episode of a year (can be changed by user) and it starts counting days after every selected isolate. This new variable can easily be added to our data:

    - -

    So only 28.4% is suitable for resistance analysis! We can now filter on it with the filter() function, also from the dplyr package:

    - -

    For future use, the above two syntaxes can be shortened with the filter_first_isolate() function:

    + +

    So only 28.5% is suitable for resistance analysis! We can now filter on it with the filter() function, also from the dplyr package:

    + filter(first == TRUE)
    +

    For future use, the above two syntaxes can be shortened with the filter_first_isolate() function:

    +

    First weighted isolates

    -

    We made a slight twist to the CLSI algorithm, to take into account the antimicrobial susceptibility profile. Have a look at all isolates of patient B6, sorted on date:

    +

    We made a slight twist to the CLSI algorithm, to take into account the antimicrobial susceptibility profile. Have a look at all isolates of patient S2, sorted on date:

    @@ -529,21 +524,21 @@ - - + + - - + + - - + + - + @@ -551,21 +546,21 @@ - - + + + + + - - - - - + + - + @@ -573,8 +568,8 @@ - - + + @@ -584,10 +579,10 @@ - - + + - + @@ -595,8 +590,8 @@ - - + + @@ -606,32 +601,32 @@ - - + + - - + + - - + + - + - - + + - + @@ -639,18 +634,18 @@
    isolate
    12010-02-26B62010-03-05S2 B_ESCHR_COLR SRSS S TRUE
    22010-05-20B62010-03-09S2 B_ESCHR_COLRS S S S
    32010-05-28B62010-08-25S2 B_ESCHR_COLSSR RSSS FALSE
    42010-06-27B62010-09-20S2 B_ESCHR_COLRS S S S
    52010-09-06B62010-10-11S2 B_ESCHR_COL R S
    62010-10-16B62011-02-11S2 B_ESCHR_COLRS S S S
    72010-10-20B62011-02-24S2 B_ESCHR_COL S S
    82010-11-01B62011-03-12S2 B_ESCHR_COL S SS RFALSESTRUE
    92010-11-14B62011-04-21S2 B_ESCHR_COL S SSR S FALSE
    102011-01-25B62011-05-04S2 B_ESCHR_COLRS S S S
    -

    Only 1 isolates are marked as ‘first’ according to CLSI guideline. But when reviewing the antibiogram, it is obvious that some isolates are absolutely different strains and should be included too. This is why we weigh isolates, based on their antibiogram. The key_antibiotics() function adds a vector with 18 key antibiotics: 6 broad spectrum ones, 6 small spectrum for Gram negatives and 6 small spectrum for Gram positives. These can be defined by the user.

    +

    Only 2 isolates are marked as ‘first’ according to CLSI guideline. But when reviewing the antibiogram, it is obvious that some isolates are absolutely different strains and should be included too. This is why we weigh isolates, based on their antibiogram. The key_antibiotics() function adds a vector with 18 key antibiotics: 6 broad spectrum ones, 6 small spectrum for Gram negatives and 6 small spectrum for Gram positives. These can be defined by the user.

    If a column exists with a name like ‘key(…)ab’ the first_isolate() function will automatically use it and determine the first weighted isolates. Mind the NOTEs in below output:

    - + @@ -667,80 +662,44 @@ - - + + - - + + - - + + - + - + - - + + + + + - - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + @@ -749,36 +708,72 @@ + + + + + + + + + + + + - - - + + + - + + + + + + + + + + + + + + + + + + + + + + + + + - - + + - + - + - - + + - + @@ -787,16 +782,16 @@
    isolate
    12010-02-26B62010-03-05S2 B_ESCHR_COLR SRSS S TRUE TRUE
    22010-05-20B62010-03-09S2 B_ESCHR_COLRS S S S FALSETRUEFALSE
    32010-05-28B62010-08-25S2 B_ESCHR_COLSSR RSSSFALSE FALSETRUE
    42010-06-27B6B_ESCHR_COLRSSSFALSEFALSE
    52010-09-06B6B_ESCHR_COLRSSSFALSEFALSE
    62010-10-16B6B_ESCHR_COLRSSSFALSEFALSE
    72010-10-20B62010-09-20S2 B_ESCHR_COL S SFALSE TRUE
    52010-10-11S2B_ESCHR_COLRSSSFALSETRUE
    82010-11-01B662011-02-11S2 B_ESCHR_COL S S SRS FALSE TRUE
    72011-02-24S2B_ESCHR_COLSSSSFALSEFALSE
    82011-03-12S2B_ESCHR_COLSSRSTRUETRUE
    92010-11-14B62011-04-21S2 B_ESCHR_COL S SSR S FALSETRUEFALSE
    102011-01-25B62011-05-04S2 B_ESCHR_COLRS S S S
    -

    Instead of 1, now 6 isolates are flagged. In total, 75% of all isolates are marked ‘first weighted’ - 46.6% more than when using the CLSI guideline. In real life, this novel algorithm will yield 5-10% more isolates than the classic CLSI guideline.

    +

    Instead of 2, now 7 isolates are flagged. In total, 75.7% of all isolates are marked ‘first weighted’ - 47.2% more than when using the CLSI guideline. In real life, this novel algorithm will yield 5-10% more isolates than the classic CLSI guideline.

    As with filter_first_isolate(), there’s a shortcut for this new algorithm too:

    - -

    So we end up with 15,003 isolates for analysis.

    + +

    So we end up with 15,136 isolates for analysis.

    We can remove unneeded columns:

    - +

    Now our data looks like:

    -
    head(data_1st)
    +
    head(data_1st)
    @@ -817,30 +812,30 @@ - - + + - - + + - - - + + + - - - + + + - + - + @@ -849,25 +844,25 @@ - - - - - + + + + + - - - + + + - - - + + + @@ -880,35 +875,35 @@ - - - - - + + + + + + - - - - + + + - - - - - - - + + + + + + + + - - - + + @@ -923,14 +918,14 @@

    Dispersion of species

    -

    To just get an idea how the species are distributed, create a frequency table with our freq() function. We created the genus and species column earlier based on the microbial ID. With paste(), we can concatenate them together.

    -

    The freq() function can be used like the base R language was intended:

    -
    freq(paste(data_1st$genus, data_1st$species))
    +

    To just get an idea how the species are distributed, create a frequency table with our freq() function. We created the genus and species column earlier based on the microbial ID. With paste(), we can concatenate them together.

    +

    The freq() function can be used like the base R language was intended:

    +
    freq(paste(data_1st$genus, data_1st$species))

    Or can be used like the dplyr way, which is easier readable:

    -
    data_1st %>% freq(genus, species)
    -

    Frequency table of genus and species from data_1st (15,003 x 13)

    -

    Columns: 2
    -Length: 15,003 (of which NA: 0 = 0.00%)
    +

    data_1st %>% freq(genus, species)
    +

    Frequency table

    +

    Class: character
    +Length: 15,136 (of which NA: 0 = 0.00%)
    Unique: 4

    Shortest: 16
    Longest: 24

    @@ -947,33 +942,33 @@ Longest: 24

    - - - - + + + + - - - - + + + + - - - - + + + + - - - + + + @@ -983,12 +978,12 @@ Longest: 24

    Resistance percentages

    The functions portion_S(), portion_SI(), portion_I(), portion_IR() and portion_R() can be used to determine the portion of a specific antimicrobial outcome. As per the EUCAST guideline of 2019, we calculate resistance as the portion of R (portion_R()) and susceptibility as the portion of S and I (portion_SI()). These functions can be used on their own:

    - +

    Or can be used in conjuction with group_by() and summarise(), both from the dplyr package:

    -
    data_1st %>% 
    -  group_by(hospital) %>% 
    -  summarise(amoxicillin = portion_R(AMX))
    +
    data_1st %>% 
    +  group_by(hospital) %>% 
    +  summarise(amoxicillin = portion_R(AMX))
    22017-12-18Q82011-09-03T7 Hospital BB_STRPT_PNESB_ESCHR_COLI S S R FGram-positiveStreptococcuspneumoniaeGram-negativeEscherichiacoli TRUE
    32015-12-08X5Hospital B2011-12-04U5Hospital A B_STPHY_AURSR S RRS F Gram-positive Staphylococcus
    42011-12-08X3Hospital BB_KLBSL_PNER2014-06-26O1Hospital AB_STPHY_AURS S S S FGram-negativeKlebsiellapneumoniaeGram-positiveStaphylococcusaureus TRUE
    52011-01-02E1Hospital A2012-08-04G2Hospital B B_STRPT_PNE R RTRUE
    62012-10-22K1Hospital CB_STRPT_PNE72012-11-12G6Hospital BB_ESCHR_COLS S S SR MGram-positiveStreptococcuspneumoniaeGram-negativeEscherichiacoli TRUE
    72016-07-14C5Hospital DB_STRPT_PNERR82017-07-19G4Hospital CB_STPHY_AURSSS SR M Gram-positiveStreptococcuspneumoniaeStaphylococcusaureus TRUE
    1 Escherichia coli7,25148.3%7,25148.3%7,45849.3%7,45849.3%
    2 Staphylococcus aureus3,83425.6%11,08573.9%3,73924.7%11,19774.0%
    3 Streptococcus pneumoniae2,32815.5%13,41389.4%2,36215.6%13,55989.6%
    4 Klebsiella pneumoniae1,59010.6%15,0031,57710.4%15,136 100.0%
    @@ -997,27 +992,27 @@ Longest: 24

    - + - + - + - +
    hospital
    Hospital A0.46987150.4701912
    Hospital B0.47079170.4536024
    Hospital C0.45487530.4589011
    Hospital D0.48474030.4714194

    Of course it would be very convenient to know the number of isolates responsible for the percentages. For that purpose the n_rsi() can be used, which works exactly like n_distinct() from the dplyr package. It counts all isolates available for every group (i.e. values S, I or R):

    -
    data_1st %>% 
    -  group_by(hospital) %>% 
    -  summarise(amoxicillin = portion_R(AMX),
    -            available = n_rsi(AMX))
    +
    data_1st %>% 
    +  group_by(hospital) %>% 
    +  summarise(amoxicillin = portion_R(AMX),
    +            available = n_rsi(AMX))
    @@ -1027,32 +1022,32 @@ Longest: 24

    - - + + - - + + - - + + - - + +
    hospital
    Hospital A0.469871545140.47019124445
    Hospital B0.470791752040.45360245302
    Hospital C0.454875322050.45890112275
    Hospital D0.484740330800.47141943114

    These functions can also be used to get the portion of multiple antibiotics, to calculate empiric susceptibility of combination therapies very easily:

    - + @@ -1063,94 +1058,94 @@ Longest: 24

    - - - + + + - - - + + + - - - + + + - + - +
    genus
    Escherichia0.92332090.89394570.99420770.92491280.89246450.9945025
    Klebsiella0.83144650.91509430.98742140.83386180.89283450.9822448
    Staphylococcus0.92383930.91601460.99217530.91521800.91842740.9946510
    Streptococcus0.61211340.6147333 0.00000000.61211340.6147333

    To make a transition to the next part, let’s see how this difference could be plotted:

    -
    data_1st %>% 
    -  group_by(genus) %>% 
    -  summarise("1. Amoxi/clav" = portion_SI(AMC),
    -            "2. Gentamicin" = portion_SI(GEN),
    -            "3. Amoxi/clav + genta" = portion_SI(AMC, GEN)) %>% 
    -  tidyr::gather("antibiotic", "S", -genus) %>%
    -  ggplot(aes(x = genus,
    -             y = S,
    -             fill = antibiotic)) +
    -  geom_col(position = "dodge2")
    +
    data_1st %>% 
    +  group_by(genus) %>% 
    +  summarise("1. Amoxi/clav" = portion_SI(AMC),
    +            "2. Gentamicin" = portion_SI(GEN),
    +            "3. Amoxi/clav + genta" = portion_SI(AMC, GEN)) %>% 
    +  tidyr::gather("antibiotic", "S", -genus) %>%
    +  ggplot(aes(x = genus,
    +             y = S,
    +             fill = antibiotic)) +
    +  geom_col(position = "dodge2")

    Plots

    To show results in plots, most R users would nowadays use the ggplot2 package. This package lets you create plots in layers. You can read more about it on their website. A quick example would look like these syntaxes:

    -
    ggplot(data = a_data_set,
    -       mapping = aes(x = year,
    -                     y = value)) +
    -  geom_col() +
    -  labs(title = "A title",
    -       subtitle = "A subtitle",
    -       x = "My X axis",
    -       y = "My Y axis")
    -
    -# or as short as:
    -ggplot(a_data_set) +
    -  geom_bar(aes(year))
    +
    ggplot(data = a_data_set,
    +       mapping = aes(x = year,
    +                     y = value)) +
    +  geom_col() +
    +  labs(title = "A title",
    +       subtitle = "A subtitle",
    +       x = "My X axis",
    +       y = "My Y axis")
    +
    +# or as short as:
    +ggplot(a_data_set) +
    +  geom_bar(aes(year))

    The AMR package contains functions to extend this ggplot2 package, for example geom_rsi(). It automatically transforms data with count_df() or portion_df() and show results in stacked bars. Its simplest and shortest example:

    -
    ggplot(data_1st) +
    -  geom_rsi(translate_ab = FALSE)
    +
    ggplot(data_1st) +
    +  geom_rsi(translate_ab = FALSE)

    Omit the translate_ab = FALSE to have the antibiotic codes (AMX, AMC, CIP, GEN) translated to official WHO names (amoxicillin, amoxicillin/clavulanic acid, ciprofloxacin, gentamicin).

    If we group on e.g. the genus column and add some additional functions from our package, we can create this:

    - +

    To simplify this, we also created the ggplot_rsi() function, which combines almost all above functions:

    - +

    @@ -1158,33 +1153,33 @@ Longest: 24

    Independence test

    The next example uses the included septic_patients, which is an anonymised data set containing 2,000 microbial blood culture isolates with their full antibiograms found in septic patients in 4 different hospitals in the Netherlands, between 2001 and 2017. It is true, genuine data. This data.frame can be used to practice AMR analysis.

    We will compare the resistance to fosfomycin (column FOS) in hospital A and D. The input for the fisher.test() can be retrieved with a transformation like this:

    -
    check_FOS <- septic_patients %>%
    -  filter(hospital_id %in% c("A", "D")) %>% # filter on only hospitals A and D
    -  select(hospital_id, FOS) %>%             # select the hospitals and fosfomycin
    -  group_by(hospital_id) %>%                # group on the hospitals
    -  count_df(combine_SI = TRUE) %>%          # count all isolates per group (hospital_id)
    -  tidyr::spread(hospital_id, value) %>%    # transform output so A and D are columns
    -  select(A, D) %>%                         # and select these only
    -  as.matrix()                              # transform to good old matrix for fisher.test()
    -
    -check_FOS
    -#       A  D
    -# [1,] 25 77
    -# [2,] 24 33
    +
    check_FOS <- septic_patients %>%
    +  filter(hospital_id %in% c("A", "D")) %>% # filter on only hospitals A and D
    +  select(hospital_id, FOS) %>%             # select the hospitals and fosfomycin
    +  group_by(hospital_id) %>%                # group on the hospitals
    +  count_df(combine_SI = TRUE) %>%          # count all isolates per group (hospital_id)
    +  tidyr::spread(hospital_id, value) %>%    # transform output so A and D are columns
    +  select(A, D) %>%                         # and select these only
    +  as.matrix()                              # transform to good old matrix for fisher.test()
    +
    +check_FOS
    +#       A  D
    +# [1,] 25 77
    +# [2,] 24 33

    We can apply the test now with:

    - +

    As can be seen, the p value is 0.031, which means that the fosfomycin resistances found in hospital A and D are really different.

    diff --git a/docs/articles/AMR_files/figure-html/plot 1-1.png b/docs/articles/AMR_files/figure-html/plot 1-1.png index 4b81035a..7e48e6a0 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 1-1.png and b/docs/articles/AMR_files/figure-html/plot 1-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 3-1.png b/docs/articles/AMR_files/figure-html/plot 3-1.png index e7b3a110..e398d1b1 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 3-1.png and b/docs/articles/AMR_files/figure-html/plot 3-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 4-1.png b/docs/articles/AMR_files/figure-html/plot 4-1.png index f4358c5d..c5771f88 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 4-1.png and b/docs/articles/AMR_files/figure-html/plot 4-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 5-1.png b/docs/articles/AMR_files/figure-html/plot 5-1.png index 59f73e6e..76aa3dc7 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 5-1.png and b/docs/articles/AMR_files/figure-html/plot 5-1.png differ diff --git a/docs/articles/EUCAST.html b/docs/articles/EUCAST.html index 087c2f29..a9d2dded 100644 --- a/docs/articles/EUCAST.html +++ b/docs/articles/EUCAST.html @@ -40,7 +40,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -118,13 +118,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    How to apply EUCAST rules

    Matthijs S. Berends

    -

    10 July 2019

    +

    29 July 2019

    diff --git a/docs/articles/MDR.html b/docs/articles/MDR.html index aaebb3d6..968a23dc 100644 --- a/docs/articles/MDR.html +++ b/docs/articles/MDR.html @@ -40,7 +40,7 @@
    AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -118,13 +118,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    How to determine multi-drug resistance (MDR)

    Matthijs S. Berends

    -

    10 July 2019

    +

    29 July 2019

    @@ -235,18 +228,18 @@

    The data set looks like this now:

    We can now add the interpretation of MDR-TB to our data set:

    -

    And review the result with a frequency table:

    -
    freq(my_TB_data$mdr)
    -

    Frequency table of mdr from my_TB_data (5,000 x 8)

    +

    We also created a package dedicated to data cleaning and checking, called the clean package. It gets automatically installed with the AMR package, so we only have to load it:

    +
    library(clean)
    +

    It contains the freq() function, to create a frequency table:

    +
    freq(my_TB_data$mdr)
    +

    Frequency table

    Class: factor > ordered (numeric)
    Length: 5,000 (of which NA: 0 = 0.00%)
    Levels: 5: Negative < Mono-resistance < Poly-resistance < Multidrug resistance…
    @@ -277,41 +272,41 @@ Unique: 5

    1 Mono-resistance -3,273 -65.5% -3,273 -65.5% +3225 +64.5% +3225 +64.5% 2 Negative -687 -13.7% -3,960 -79.2% +644 +12.9% +3869 +77.4% 3 Multidrug resistance -569 -11.4% -4,529 -90.6% +626 +12.5% +4495 +89.9% 4 Poly-resistance -277 -5.5% -4,806 -96.1% +288 +5.8% +4783 +95.7% 5 Extensive drug resistance -194 -3.9% -5,000 +217 +4.3% +5000 100.0% diff --git a/docs/articles/SPSS.html b/docs/articles/SPSS.html index 67304bb9..3364be69 100644 --- a/docs/articles/SPSS.html +++ b/docs/articles/SPSS.html @@ -40,7 +40,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -118,13 +118,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    How to import data from SPSS / SAS / Stata

    Matthijs S. Berends

    -

    10 July 2019

    +

    29 July 2019

    diff --git a/docs/articles/WHONET.html b/docs/articles/WHONET.html index ac5cd4e0..64a1fb84 100644 --- a/docs/articles/WHONET.html +++ b/docs/articles/WHONET.html @@ -40,7 +40,7 @@
    AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -118,13 +118,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    How to work with WHONET data

    Matthijs S. Berends

    -

    10 July 2019

    +

    29 July 2019

    @@ -228,16 +221,20 @@
    mutate(mo = as.mo(Organism)) %>% # transform everything from "AMP_ND10" to "CIP_EE" to the new `rsi` class mutate_at(vars(AMP_ND10:CIP_EE), as.rsi) -

    No errors or warnings, so all values are transformed succesfully. Let’s check it though, with a couple of frequency tables:

    - -

    Frequency table of mo from data (500 x 54)

    +

    No errors or warnings, so all values are transformed succesfully.

    +

    We created a package dedicated to data cleaning and checking, called the clean package. It gets automatically installed with the AMR package, so we only have to load it:

    +
    library(clean)
    +

    It contains the freq() function, to create frequency tables.

    +

    So let’s check our data, with a couple of frequency tables:

    + +

    Frequency table

    Class: mo (character)
    Length: 500 (of which NA: 0 = 0.00%)
    Unique: 39

    Families: 10
    Genera: 17
    -Species: 38

    +Species: 39

    @@ -331,18 +328,16 @@ Species: 38

    (omitted 29 entries, n = 57 [11.4%])

    - -

    Frequency table of AMC_ND2 from data (500 x 54)

    + +

    Frequency table

    Class: factor > ordered > rsi (numeric)
    -Length: 500 (of which NA: 19 = 3.80%)
    +Length: 481 (of which NA: 19 = 3.95%)
    Levels: 3: S < I < R
    Unique: 3

    -

    Drug: Amoxicillin/clavulanic acid (AMC, J01CR02)
    -Group: Beta-lactams/penicillins
    -%SI: 78.59%

    +

    %SI: 78.6%

    diff --git a/docs/articles/benchmarks.html b/docs/articles/benchmarks.html index 84caae51..0c5b69c9 100644 --- a/docs/articles/benchmarks.html +++ b/docs/articles/benchmarks.html @@ -40,7 +40,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -118,13 +118,6 @@ Get properties of an antibiotic -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    Benchmarks

    Matthijs S. Berends

    -

    10 July 2019

    +

    29 July 2019

    @@ -217,14 +210,14 @@
    times = 10) print(S.aureus, unit = "ms", signif = 2) # Unit: milliseconds -# expr min lq mean median uq max neval -# as.mo("sau") 8.5 8.7 12.0 8.9 9.4 26 10 -# as.mo("stau") 31.0 32.0 42.0 33.0 34.0 120 10 -# as.mo("staaur") 8.6 8.7 11.0 9.1 9.2 26 10 -# as.mo("STAAUR") 8.7 9.1 9.3 9.2 9.4 11 10 -# as.mo("S. aureus") 23.0 23.0 30.0 24.0 40.0 46 10 -# as.mo("S. aureus") 22.0 23.0 27.0 24.0 25.0 41 10 -# as.mo("Staphylococcus aureus") 3.9 4.0 5.7 4.1 4.4 20 10 +# expr min lq mean median uq max neval +# as.mo("sau") 8.3 8.4 15.0 8.9 9.5 57.0 10 +# as.mo("stau") 30.0 31.0 42.0 32.0 48.0 110.0 10 +# as.mo("staaur") 8.3 8.4 10.0 8.5 8.9 25.0 10 +# as.mo("STAAUR") 8.1 8.4 10.0 8.4 9.1 24.0 10 +# as.mo("S. aureus") 23.0 23.0 29.0 24.0 38.0 45.0 10 +# as.mo("S. aureus") 22.0 22.0 24.0 23.0 24.0 38.0 10 +# as.mo("Staphylococcus aureus") 3.8 3.9 4.1 4.1 4.2 4.6 10

    In the table above, all measurements are shown in milliseconds (thousands of seconds). A value of 5 milliseconds means it can determine 200 input values per second. It case of 100 milliseconds, this is only 10 input values per second. The second input is the only one that has to be looked up thoroughly. All the others are known codes (the first one is a WHONET code) or common laboratory codes, or common full organism names like the last one. Full organism names are always preferred.

    To achieve this speed, the as.mo function also takes into account the prevalence of human pathogenic microorganisms. The downside is of course that less prevalent microorganisms will be determined less fast. See this example for the ID of Thermus islandicus (B_THERMS_ISL), a bug probably never found before in humans:

    -

    That takes 10.2 times as much time on average. A value of 100 milliseconds means it can only determine ~10 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance. Full names (like Thermus islandicus) are almost fast - these are the most probable input from most data sets.

    +# as.mo("theisl") 250 260 260 260 270 290 10 +# as.mo("THEISL") 250 260 270 260 270 320 10 +# as.mo("T. islandicus") 120 120 130 130 140 150 10 +# as.mo("T. islandicus") 120 130 140 140 140 140 10 +# as.mo("Thermus islandicus") 45 46 55 56 62 67 10 +

    That takes 8.8 times as much time on average. A value of 100 milliseconds means it can only determine ~10 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance. Full names (like Thermus islandicus) are almost fast - these are the most probable input from most data sets.

    In the figure below, we compare Escherichia coli (which is very common) with Prevotella brevis (which is moderately common) and with Thermus islandicus (which is very uncommon):

    par(mar = c(5, 16, 4, 2)) # set more space for left margin text (16)
     
    @@ -287,8 +280,8 @@
     print(run_it, unit = "ms", signif = 3)
     # Unit: milliseconds
     #            expr min  lq mean median  uq max neval
    -#  mo_fullname(x) 611 628  643    635 650 714    10
    -

    So transforming 500,000 values (!!) of 50 unique values only takes 0.63 seconds (634 ms). You only lose time on your unique input values.

    +# mo_fullname(x) 586 611 623 619 638 671 10 +

    So transforming 500,000 values (!!) of 50 unique values only takes 0.62 seconds (618 ms). You only lose time on your unique input values.

    -

    So going from mo_fullname("Staphylococcus aureus") to "Staphylococcus aureus" takes 0.0009 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:

    +# A 6.350 6.600 7.050 6.870 7.35 8.37 10 +# B 21.300 21.500 25.300 22.200 22.70 48.20 10 +# C 0.624 0.753 0.804 0.783 0.87 1.01 10 +

    So going from mo_fullname("Staphylococcus aureus") to "Staphylococcus aureus" takes 0.0008 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:

    +# A 0.436 0.454 0.460 0.460 0.462 0.491 10 +# B 0.472 0.480 0.496 0.488 0.513 0.542 10 +# C 0.657 0.672 0.757 0.750 0.797 0.952 10 +# D 0.478 0.495 0.500 0.499 0.503 0.540 10 +# E 0.436 0.446 0.456 0.448 0.455 0.507 10 +# F 0.437 0.447 0.455 0.454 0.460 0.478 10 +# G 0.428 0.441 0.449 0.447 0.455 0.477 10 +# H 0.438 0.445 0.456 0.451 0.472 0.477 10

    Of course, when running mo_phylum("Firmicutes") the function has zero knowledge about the actual microorganism, namely S. aureus. But since the result would be "Firmicutes" too, there is no point in calculating the result. And because this package ‘knows’ all phyla of all known bacteria (according to the Catalogue of Life), it can just return the initial value immediately.

    +# en 17.21 17.67 18.20 18.40 18.62 18.81 10 +# de 18.76 19.01 21.59 19.45 19.82 41.96 10 +# nl 24.15 24.57 29.11 25.67 26.49 45.25 10 +# es 18.31 19.02 19.33 19.37 19.93 20.09 10 +# it 18.96 19.27 23.48 19.58 20.91 41.27 10 +# fr 18.33 18.80 19.46 19.27 19.97 21.10 10 +# pt 18.89 19.50 20.54 19.70 20.36 27.83 10

    Currently supported are German, Dutch, Spanish, Italian, French and Portuguese.

    diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png index ae65b719..95aa1508 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/docs/articles/index.html b/docs/articles/index.html index d641e7b2..b6c61875 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9014 + 0.7.1.9015 @@ -156,13 +156,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -242,7 +235,6 @@
  • How to import data from SPSS / SAS / Stata
  • How to work with WHONET data
  • Benchmarks
  • -
  • How to create frequency tables
  • How to predict antimicrobial resistance
  • diff --git a/docs/articles/resistance_predict.html b/docs/articles/resistance_predict.html index 707b0872..a2a5bee3 100644 --- a/docs/articles/resistance_predict.html +++ b/docs/articles/resistance_predict.html @@ -40,7 +40,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -118,13 +118,6 @@ Get properties of an antibiotic -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    How to predict antimicrobial resistance

    Matthijs S. Berends

    -

    10 July 2019

    +

    29 July 2019

    diff --git a/docs/authors.html b/docs/authors.html index 9efd958e..5150acd4 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -78,7 +78,7 @@
    AMR (for R) - 0.7.1.9014 + 0.7.1.9015 @@ -156,13 +156,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/index.html b/docs/index.html index 0fd4ae62..9fc450e4 100644 --- a/docs/index.html +++ b/docs/index.html @@ -42,7 +42,7 @@ AMR (for R) - 0.7.1.9014 + 0.7.1.9015 @@ -120,13 +120,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -213,7 +206,6 @@
  • Plotting antimicrobial resistance (tutorial)
  • Determining first isolates to be used for AMR analysis (manual)
  • Applying EUCAST expert rules (manual)
  • -
  • Descriptive statistics: frequency tables, kurtosis and skewness (tutorial)
  • This package is ready-to-use for a professional environment by specialists in the following fields:

    Medical Microbiology

    @@ -340,8 +332,6 @@
  • Calculate the resistance (and even co-resistance) of microbial isolates with the portion_R(), portion_IR(), portion_I(), portion_SI() and portion_S() functions. Similarly, the number of isolates can be determined with the count_R(), count_IR(), count_I(), count_SI() and count_S() functions. All these functions can be used with the dplyr package (e.g. in conjunction with summarise())
  • Plot AMR results with geom_rsi(), a function made for the ggplot2 package
  • Predict antimicrobial resistance for the nextcoming years using logistic regression models with the resistance_predict() function
  • -
  • Conduct descriptive statistics to enhance base R: calculate kurtosis(), skewness() and create frequency tables with freq() -
  • diff --git a/docs/news/index.html b/docs/news/index.html index f7115623..5efeb137 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9014 + 0.7.1.9015 @@ -156,13 +156,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -232,10 +225,17 @@ -
    +

    -AMR 0.7.1.9014 Unreleased +AMR 0.7.1.9015 Unreleased

    +
    +

    +Breaking

    +
      +
    • Function freq() has moved to a new package, clean (CRAN link). Creating frequency tables is actually not the scope of this package (never was) and this function has matured a lot over the last two years. We decided to create a new package for data cleaning and checking and it perfectly fits the freq() function. The clean package is available on CRAN and will be installed automatically when updating the AMR package, that now imports it. In a later stage, the skewness() and kurtosis() functions will be moved to the clean package too.
    • +
    +

    New

    @@ -421,7 +421,7 @@ Please age() function gained a new parameter exact to determine ages with decimals
  • Removed deprecated functions guess_mo(), guess_atc(), EUCAST_rules(), interpretive_reading(), rsi()
  • -
  • Frequency tables (freq()): +
  • Frequency tables (freq()): @@ -445,7 +445,7 @@ Please age_groups(), to let groups of fives and tens end with 100+ instead of 120+
  • -
  • Fix for freq() for when all values are NA +
  • Fix for freq() for when all values are NA
  • Fix for first_isolate() for when dates are missing
  • Improved speed of guess_ab_col() @@ -667,7 +667,7 @@ Using as.mo(..., allow_uncertain = 3)
  • -
  • Frequency tables (freq() function): +
  • Frequency tables (freq() function):
  • @@ -440,12 +433,6 @@ - - - - diff --git a/docs/reference/join.html b/docs/reference/join.html index 0f87e79f..8f972cd6 100644 --- a/docs/reference/join.html +++ b/docs/reference/join.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/key_antibiotics.html b/docs/reference/key_antibiotics.html index a48aeb53..22c913b5 100644 --- a/docs/reference/key_antibiotics.html +++ b/docs/reference/key_antibiotics.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/kurtosis.html b/docs/reference/kurtosis.html index 771735a5..977c8b5f 100644 --- a/docs/reference/kurtosis.html +++ b/docs/reference/kurtosis.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/like.html b/docs/reference/like.html index 9e4e20a3..189415b0 100644 --- a/docs/reference/like.html +++ b/docs/reference/like.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -309,7 +302,7 @@ septic_patients %>% left_join_microorganisms() %>% filter(genus %like% '^ent') %>% - freq(genus, species) + freq(genus, species) # } @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/microorganisms.codes.html b/docs/reference/microorganisms.codes.html index 0b4f0e17..541e64c0 100644 --- a/docs/reference/microorganisms.codes.html +++ b/docs/reference/microorganisms.codes.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9009 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/microorganisms.html b/docs/reference/microorganisms.html index fe0359ce..1f15e96c 100644 --- a/docs/reference/microorganisms.html +++ b/docs/reference/microorganisms.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9009 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/microorganisms.old.html b/docs/reference/microorganisms.old.html index 03147f1a..6695e187 100644 --- a/docs/reference/microorganisms.old.html +++ b/docs/reference/microorganisms.old.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9009 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/mo_property.html b/docs/reference/mo_property.html index 3c5a2819..ff83d0d8 100644 --- a/docs/reference/mo_property.html +++ b/docs/reference/mo_property.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9009 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/mo_source.html b/docs/reference/mo_source.html index cce98d60..50d67bc8 100644 --- a/docs/reference/mo_source.html +++ b/docs/reference/mo_source.html @@ -81,7 +81,7 @@ This is the fastest way to have your organisation (or analysis) specific codes p AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -159,13 +159,6 @@ This is the fastest way to have your organisation (or analysis) specific codes p Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/p.symbol.html b/docs/reference/p.symbol.html index 722d4578..35774dfc 100644 --- a/docs/reference/p.symbol.html +++ b/docs/reference/p.symbol.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/portion.html b/docs/reference/portion.html index 0ead9559..a3433e1e 100644 --- a/docs/reference/portion.html +++ b/docs/reference/portion.html @@ -81,7 +81,7 @@ portion_R and portion_IR can be used to calculate resistance, portion_S and port AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -159,13 +159,6 @@ portion_R and portion_IR can be used to calculate resistance, portion_S and port Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/read.4D.html b/docs/reference/read.4D.html index b49e4369..76d0263d 100644 --- a/docs/reference/read.4D.html +++ b/docs/reference/read.4D.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/resistance_predict.html b/docs/reference/resistance_predict.html index 2ca6a2d4..ee1abfe4 100644 --- a/docs/reference/resistance_predict.html +++ b/docs/reference/resistance_predict.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/rsi_translation.html b/docs/reference/rsi_translation.html index 69f648aa..ed09bc3a 100644 --- a/docs/reference/rsi_translation.html +++ b/docs/reference/rsi_translation.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/septic_patients.html b/docs/reference/septic_patients.html index 85f4b704..3442d8a4 100644 --- a/docs/reference/septic_patients.html +++ b/docs/reference/septic_patients.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/skewness.html b/docs/reference/skewness.html index 9a765c78..120de23a 100644 --- a/docs/reference/skewness.html +++ b/docs/reference/skewness.html @@ -81,7 +81,7 @@ When negative: the left tail is longer; the mass of the distribution is concentr AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -159,13 +159,6 @@ When negative: the left tail is longer; the mass of the distribution is concentr Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/translate.html b/docs/reference/translate.html index d372d531..8de2ba51 100644 --- a/docs/reference/translate.html +++ b/docs/reference/translate.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9012 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 67b4672e..c327e49f 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -69,9 +69,6 @@ https://msberends.gitlab.io/AMR/reference/first_isolate.html - - https://msberends.gitlab.io/AMR/reference/freq.html - https://msberends.gitlab.io/AMR/reference/g.test.html @@ -153,9 +150,6 @@ https://msberends.gitlab.io/AMR/articles/benchmarks.html - - https://msberends.gitlab.io/AMR/articles/freq.html - https://msberends.gitlab.io/AMR/articles/resistance_predict.html diff --git a/index.md b/index.md index 8957bd87..08a2ea8e 100644 --- a/index.md +++ b/index.md @@ -26,7 +26,6 @@ This package can be used for: * Plotting antimicrobial resistance ([tutorial](./articles/AMR.html)) * Determining first isolates to be used for AMR analysis ([manual](./reference/first_isolate.html)) * Applying EUCAST expert rules ([manual](./reference/eucast_rules.html)) - * Descriptive statistics: frequency tables, kurtosis and skewness ([tutorial](./articles/freq.html)) This package is ready-to-use for a professional environment by specialists in the following fields: @@ -154,7 +153,6 @@ The `AMR` package basically does four important things: * Calculate the resistance (and even co-resistance) of microbial isolates with the `portion_R()`, `portion_IR()`, `portion_I()`, `portion_SI()` and `portion_S()` functions. Similarly, the *number* of isolates can be determined with the `count_R()`, `count_IR()`, `count_I()`, `count_SI()` and `count_S()` functions. All these functions can be used with the `dplyr` package (e.g. in conjunction with `summarise()`) * Plot AMR results with `geom_rsi()`, a function made for the `ggplot2` package * Predict antimicrobial resistance for the nextcoming years using logistic regression models with the `resistance_predict()` function - * Conduct descriptive statistics to enhance base R: calculate `kurtosis()`, `skewness()` and create frequency tables with `freq()` 4. It **teaches the user** how to use all the above actions. diff --git a/man/freq.Rd b/man/freq.Rd deleted file mode 100755 index cafaf4ae..00000000 --- a/man/freq.Rd +++ /dev/null @@ -1,234 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/freq.R -\name{freq} -\alias{freq} -\alias{frequency_tbl} -\alias{top_freq} -\alias{header} -\alias{print.freq} -\title{Frequency table} -\usage{ -freq(x, ..., sort.count = TRUE, nmax = getOption("max.print.freq"), - na.rm = TRUE, row.names = TRUE, markdown = !interactive(), - digits = 2, quote = NULL, header = TRUE, title = NULL, - na = "", droplevels = TRUE, sep = " ", - decimal.mark = getOption("OutDec"), big.mark = ifelse(decimal.mark != - ",", ",", ".")) - -frequency_tbl(x, ..., sort.count = TRUE, - nmax = getOption("max.print.freq"), na.rm = TRUE, row.names = TRUE, - markdown = !interactive(), digits = 2, quote = NULL, - header = TRUE, title = NULL, na = "", droplevels = TRUE, - sep = " ", decimal.mark = getOption("OutDec"), - big.mark = ifelse(decimal.mark != ",", ",", ".")) - -top_freq(f, n) - -header(f, property = NULL) - -\method{print}{freq}(x, nmax = getOption("max.print.freq", default = 15), - markdown = !interactive(), header = TRUE, - decimal.mark = getOption("OutDec"), big.mark = ifelse(decimal.mark != - ",", ",", "."), ...) -} -\arguments{ -\item{x}{vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} (may contain a grouping variable) or \code{\link{table}}} - -\item{...}{up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples. Also supports quasiquotion.} - -\item{sort.count}{sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except when using grouping variables.} - -\item{nmax}{number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.} - -\item{na.rm}{a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the amount of \code{NA}s.} - -\item{row.names}{a logical value indicating whether row indices should be printed as \code{1:nrow(x)}} - -\item{markdown}{a logical value indicating whether the frequency table should be printed in markdown format. This will print all rows (except when \code{nmax} is defined) and is default behaviour in non-interactive R sessions (like when knitting RMarkdown files).} - -\item{digits}{how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")})} - -\item{quote}{a logical value indicating whether or not strings should be printed with surrounding quotes. Default is to print them only around characters that are actually numeric values.} - -\item{header}{a logical value indicating whether an informative header should be printed} - -\item{title}{text to show above frequency table, at default to tries to coerce from the variables passed to \code{x}} - -\item{na}{a character string that should be used to show empty (\code{NA}) values (only useful when \code{na.rm = FALSE})} - -\item{droplevels}{a logical value indicating whether in factors empty levels should be dropped} - -\item{sep}{a character string to separate the terms when selecting multiple columns} - -\item{decimal.mark}{% - used for prettying (longish) numerical and complex sequences. - Passed to \code{\link{prettyNum}}: that help page explains the details.} - -\item{big.mark}{% - used for prettying (longish) numerical and complex sequences. - Passed to \code{\link{prettyNum}}: that help page explains the details.} - -\item{f}{a frequency table} - -\item{n}{number of top \emph{n} items to return, use -n for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties.} - -\item{property}{property in header to return this value directly} -} -\value{ -A \code{data.frame} (with an additional class \code{"freq"}) with five columns: \code{item}, \code{count}, \code{percent}, \code{cum_count} and \code{cum_percent}. -} -\description{ -Create a frequency table of a vector with items or a \code{data.frame}. Supports quasiquotation and markdown for reports. Best practice is: \code{data \%>\% freq(var)}.\cr -\code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names. -} -\details{ -Frequency tables (or frequency distributions) are summaries of the distribution of values in a sample. With the `freq` function, you can create univariate frequency tables. Multiple variables will be pasted into one variable, so it forces a univariate distribution. This package also has a vignette available to explain the use of this function further, run \code{browseVignettes("AMR")} to read it. - -For numeric values of any class, these additional values will all be calculated with \code{na.rm = TRUE} and shown into the header: -\itemize{ - \item{Mean, using \code{\link[base]{mean}}} - \item{Standard Deviation, using \code{\link[stats]{sd}}} - \item{Coefficient of Variation (CV), the standard deviation divided by the mean} - \item{Mean Absolute Deviation (MAD), using \code{\link[stats]{mad}}} - \item{Tukey Five-Number Summaries (minimum, Q1, median, Q3, maximum), using \code{\link[stats]{fivenum}}} - \item{Interquartile Range (IQR) calculated as \code{Q3 - Q1} using the Tukey Five-Number Summaries, i.e. \strong{not} using the \code{\link[stats]{quantile}} function} - \item{Coefficient of Quartile Variation (CQV, sometimes called coefficient of dispersion), calculated as \code{(Q3 - Q1) / (Q3 + Q1)} using the Tukey Five-Number Summaries} - \item{Outliers (total count and unique count), using \code{\link[grDevices]{boxplot.stats}}} -} - -For dates and times of any class, these additional values will be calculated with \code{na.rm = TRUE} and shown into the header: -\itemize{ - \item{Oldest, using \code{\link{min}}} - \item{Newest, using \code{\link{max}}, with difference between newest and oldest} - \item{Median, using \code{\link[stats]{median}}, with percentage since oldest} -} - -In factors, all factor levels that are not existing in the input data will be dropped. - -The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will include more than \code{n} rows if there are ties. -} -\section{Read more on our website!}{ - -On our website \url{https://msberends.gitlab.io/AMR} you can find \href{https://msberends.gitlab.io/AMR/articles/AMR.html}{a tutorial} about how to conduct AMR analysis, the \href{https://msberends.gitlab.io/AMR/reference}{complete documentation of all functions} (which reads a lot easier than here in R) and \href{https://msberends.gitlab.io/AMR/articles/WHONET.html}{an example analysis using WHONET data}. -} - -\examples{ -library(dplyr) - -# this all gives the same result: -freq(septic_patients$hospital_id) -freq(septic_patients[, "hospital_id"]) -septic_patients$hospital_id \%>\% freq() -septic_patients[, "hospital_id"] \%>\% freq() -septic_patients \%>\% freq("hospital_id") -septic_patients \%>\% freq(hospital_id) #<- easiest to remember (tidyverse) - - -# you could also use `select` or `pull` to get your variables -septic_patients \%>\% - filter(hospital_id == "A") \%>\% - select(mo) \%>\% - freq() - - -# multiple selected variables will be pasted together -septic_patients \%>\% - left_join_microorganisms \%>\% - freq(genus, species) - -# functions as quasiquotation are also supported -septic_patients \%>\% - freq(mo_genus(mo), mo_species(mo)) - - -# group a variable and analyse another -septic_patients \%>\% - group_by(hospital_id) \%>\% - freq(gender) - - -# get top 10 bugs of hospital A as a vector -septic_patients \%>\% - filter(hospital_id == "A") \%>\% - freq(mo) \%>\% - top_freq(10) - - -# save frequency table to an object -years <- septic_patients \%>\% - mutate(year = format(date, "\%Y")) \%>\% - freq(year) - - -# show only the top 5 -years \%>\% print(nmax = 5) - - -# save to an object with formatted percentages -years <- format(years) - - -# print a histogram of numeric values -septic_patients \%>\% - freq(age) \%>\% - hist() - -# or a boxplot of numeric values -septic_patients \%>\% - freq(age) \%>\% - boxplot() - -# or even a boxplot per group -septic_patients \%>\% - group_by(hospital_id) \%>\% - freq(age) \%>\% - boxplot() - -# or print all points to a regular plot -septic_patients \%>\% - freq(age) \%>\% - plot() - - -# transform to a data.frame or tibble -septic_patients \%>\% - freq(age) \%>\% - as.data.frame() - - -# or transform (back) to a vector -septic_patients \%>\% - freq(age) \%>\% - as.vector() - -identical(septic_patients \%>\% - freq(age) \%>\% - as.vector() \%>\% - sort(), - sort(septic_patients$age)) # TRUE - - -# it also supports `table` objects -table(septic_patients$gender, - septic_patients$age) \%>\% - freq(sep = " **sep** ") - - -# only get selected columns -septic_patients \%>\% - freq(hospital_id) \%>\% - select(item, percent) - -septic_patients \%>\% - freq(hospital_id) \%>\% - select(-count, -cum_count) - - -# check differences between frequency tables -diff(freq(septic_patients$TMP), - freq(septic_patients$SXT)) -} -\keyword{freq} -\keyword{frequency} -\keyword{summarise} -\keyword{summary} diff --git a/tests/testthat/test-freq.R b/tests/testthat/test-freq.R index 8e95ad03..02ebd32e 100755 --- a/tests/testthat/test-freq.R +++ b/tests/testthat/test-freq.R @@ -22,168 +22,10 @@ context("freq.R") test_that("frequency table works", { - library(dplyr) - - expect_equal(nrow(freq(c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5))), 5) - expect_equal(nrow(frequency_tbl(c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5))), 5) - - # date column of septic_patients should contain 1140 unique dates - expect_equal(nrow(freq(septic_patients$date)), 1140) - expect_equal(nrow(freq(septic_patients$date)), - length(unique(septic_patients$date))) - - expect_output(print(septic_patients %>% freq(age))) - expect_output(print(septic_patients %>% freq(age, nmax = 5))) - expect_output(print(septic_patients %>% freq(age, nmax = Inf, markdown = FALSE))) - expect_output(print(freq(septic_patients$age, nmax = Inf))) - expect_output(print(freq(septic_patients$age, nmax = NA))) - expect_output(print(freq(septic_patients$age, nmax = NULL))) - expect_output(print(freq(septic_patients$age, sort.count = FALSE))) - expect_output(print(freq(septic_patients$age, markdown = TRUE))) - expect_output(print(freq(septic_patients$age, markdown = TRUE), markdown = FALSE)) - expect_output(print(freq(septic_patients$age, markdown = TRUE), markdown = TRUE)) - expect_output(print(freq(septic_patients$age[0]))) - expect_output(print(freq(septic_patients$age, quote = TRUE))) - expect_output(print(freq(septic_patients$age, markdown = TRUE, title = "TITLE"))) - - # character - expect_output(print(freq(microorganisms$genus))) - expect_output(print(structure(freq(microorganisms$genus), - # check printing of old class: - class = c("frequency_tbl", "data.frame")))) + library(clean) # mo expect_output(print(freq(septic_patients$mo))) # rsi expect_output(print(freq(septic_patients$AMX))) - # integer - expect_output(print(freq(septic_patients$age))) - # date - expect_output(print(freq(septic_patients$date))) - # factor - expect_output(print(freq(septic_patients$hospital_id))) - # table - expect_output(print(freq(table(septic_patients$gender, septic_patients$age)))) - # rsi - expect_output(print(freq(septic_patients$AMC))) - # hms - expect_output(print(freq(hms::as.hms(sample(c(0:86399), 50))))) - # matrix - expect_output(print(freq(as.matrix(septic_patients$age)))) - expect_output(print(freq(as.matrix(septic_patients[, c("age", "gender")])))) - # list - expect_output(print(freq(list(age = septic_patients$age)))) - expect_output(print(freq(list(age = septic_patients$age, gender = septic_patients$gender)))) - # difftime - expect_output(print( - freq(difftime(Sys.time(), - Sys.time() - runif(5, min = 0, max = 60 * 60 * 24), - units = "hours")))) - - expect_output(print(freq(septic_patients$age)[,1:3])) - - library(dplyr) - expect_output(septic_patients %>% select(1:2) %>% freq() %>% print()) - expect_output(septic_patients %>% select(1:3) %>% freq() %>% print()) - expect_output(septic_patients %>% select(1:4) %>% freq() %>% print()) - expect_output(septic_patients %>% select(1:5) %>% freq() %>% print()) - expect_output(septic_patients %>% select(1:6) %>% freq() %>% print()) - expect_output(septic_patients %>% select(1:7) %>% freq() %>% print()) - expect_output(septic_patients %>% select(1:8) %>% freq() %>% print()) - expect_output(septic_patients %>% select(1:9) %>% freq() %>% print()) - expect_output(print(freq(septic_patients$age), nmax = 20)) - - # grouping variable - expect_output(print(septic_patients %>% group_by(gender) %>% freq(hospital_id))) - expect_output(print(septic_patients %>% group_by(gender) %>% freq(AMX, quote = TRUE))) - expect_output(print(septic_patients %>% group_by(gender) %>% freq(AMX, markdown = TRUE))) - - # quasiquotation - expect_output(print(septic_patients %>% freq(mo_genus(mo)))) - expect_output(print(septic_patients %>% freq(mo, mo_genus(mo)))) - expect_output(print(septic_patients %>% group_by(gender) %>% freq(mo_genus(mo)))) - expect_output(print(septic_patients %>% group_by(gender) %>% freq(mo, mo_genus(mo)))) - - # top 5 - expect_equal( - septic_patients %>% - freq(mo) %>% - top_freq(5) %>% - length(), - 5) - # there are more than 5 lowest values - expect_gt( - septic_patients %>% - freq(mo) %>% - top_freq(-5) %>% - length(), - 5) - # n has length > 1 - expect_error( - septic_patients %>% - freq(mo) %>% - top_freq(n = c(1, 2)) - ) - # input must be freq tbl - expect_error(septic_patients %>% top_freq(1)) - - # charts from plot, hist and boxplot, should not raise errors - plot(freq(septic_patients, age)) - hist(freq(septic_patients, age)) - boxplot(freq(septic_patients, age)) - boxplot(freq(dplyr::group_by(septic_patients, gender), age)) - - # check vector - expect_identical(septic_patients %>% - freq(age) %>% - as.vector() %>% - sort(), - septic_patients %>% - pull(age) %>% - sort()) - - # check format - expect_identical(septic_patients %>% - freq(age) %>% - format() %>% - apply(2, class) %>% - unname(), - rep("character", 5)) - - # check tibble - expect_identical(septic_patients %>% - freq(age) %>% - as_tibble() %>% - class() %>% - .[1], - "tbl_df") - - expect_error(septic_patients %>% freq(nonexisting)) - expect_error(septic_patients %>% select(1:10) %>% freq()) - expect_error(septic_patients %>% freq(peni, oxac, clox, AMX, AMC, - ampi, pita, czol, cfep, cfur)) - - # (un)select columns - expect_equal(septic_patients %>% freq(hospital_id) %>% select(item) %>% ncol(), - 1) - expect_equal(septic_patients %>% freq(hospital_id) %>% select(-item) %>% ncol(), - 4) - - # run diff - expect_output(print( - diff(freq(septic_patients$AMC), - freq(septic_patients$AMX)) - )) - expect_output(print( - diff(freq(septic_patients$age), - freq(septic_patients$age)) # "No differences found." - )) - expect_error(print( - diff(freq(septic_patients$AMX), - "Just a string") # not a freq tbl - )) - - # directly on group - expect_output(print(septic_patients %>% group_by(ageplusone = as.character(age + 1)) %>% freq(ageplusone))) - }) diff --git a/vignettes/AMR.Rmd b/vignettes/AMR.Rmd index 97fbef2e..4dce8bf3 100755 --- a/vignettes/AMR.Rmd +++ b/vignettes/AMR.Rmd @@ -144,7 +144,14 @@ knitr::kable(head(data), align = "c") Now, let's start the cleaning and the analysis! # Cleaning the data -Use the frequency table function `freq()` to look specifically for unique values in any variable. For example, for the `gender` variable: + +We also created a package dedicated to data cleaning and checking, called the `clean` package. It gets automatically installed with the `AMR` package, so we only have to load it: + +```{r lib clean, message = FALSE} +library(clean) +``` + +Use the frequency table function `freq()` from this `clean` package to look specifically for unique values in any variable. For example, for the `gender` variable: ```{r freq gender 1, eval = FALSE} data %>% freq(gender) # this would be the same: freq(data$gender) diff --git a/vignettes/MDR.Rmd b/vignettes/MDR.Rmd index 356389b6..601e2988 100644 --- a/vignettes/MDR.Rmd +++ b/vignettes/MDR.Rmd @@ -72,7 +72,13 @@ We can now add the interpretation of MDR-TB to our data set: my_TB_data$mdr <- mdr_tb(my_TB_data) ``` -And review the result with a frequency table: +We also created a package dedicated to data cleaning and checking, called the `clean` package. It gets automatically installed with the `AMR` package, so we only have to load it: + +```{r lib clean, message = FALSE} +library(clean) +``` + +It contains the `freq()` function, to create a frequency table: ```{r, results = 'asis'} freq(my_TB_data$mdr) diff --git a/vignettes/WHONET.Rmd b/vignettes/WHONET.Rmd index 667bdc5d..6dda5603 100644 --- a/vignettes/WHONET.Rmd +++ b/vignettes/WHONET.Rmd @@ -60,7 +60,17 @@ data <- WHONET %>% mutate_at(vars(AMP_ND10:CIP_EE), as.rsi) ``` -No errors or warnings, so all values are transformed succesfully. Let's check it though, with a couple of frequency tables: +No errors or warnings, so all values are transformed succesfully. + +We created a package dedicated to data cleaning and checking, called the `clean` package. It gets automatically installed with the `AMR` package, so we only have to load it: + +```{r lib clean, message = FALSE} +library(clean) +``` + +It contains the `freq()` function, to create frequency tables. + +So let's check our data, with a couple of frequency tables: ```{r, results = 'asis'} # our newly created `mo` variable diff --git a/vignettes/freq.Rmd b/vignettes/freq.Rmd deleted file mode 100644 index e450dc2d..00000000 --- a/vignettes/freq.Rmd +++ /dev/null @@ -1,182 +0,0 @@ ---- -title: "How to create frequency tables" -author: "Matthijs S. Berends" -date: '`r format(Sys.Date(), "%d %B %Y")`' -output: - rmarkdown::html_vignette: - toc: true - toc_depth: 3 -vignette: > - %\VignetteIndexEntry{How to create frequency tables} - %\VignetteEncoding{UTF-8} - %\VignetteEngine{knitr::rmarkdown} -editor_options: - chunk_output_type: console ---- - -```{r setup, include = FALSE, results = 'asis'} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#", - results = 'asis', - fig.width = 7.5, - fig.height = 4.5 -) -library(dplyr) -library(AMR) -``` - -## Introduction - -Frequency tables (or frequency distributions) are summaries of the distribution of values in a sample. With the `freq()` function, you can create univariate frequency tables. Multiple variables will be pasted into one variable, so it forces a univariate distribution. We take the `septic_patients` dataset (included in this AMR package) as example. - -## Frequencies of one variable - -To only show and quickly review the content of one variable, you can just select this variable in various ways. Let's say we want to get the frequencies of the `gender` variable of the `septic_patients` dataset: -```{r, echo = TRUE} -# Any of these will work: -# freq(septic_patients$gender) -# freq(septic_patients[, "gender"]) - -# Using tidyverse: -# septic_patients$gender %>% freq() -# septic_patients[, "gender"] %>% freq() -# septic_patients %>% freq("gender") - -# Probably the fastest and easiest: -septic_patients %>% freq(gender) -``` -This immediately shows the class of the variable, its length and availability (i.e. the amount of `NA`), the amount of unique values and (most importantly) that among septic patients men are more prevalent than women. - -## Frequencies of more than one variable - -Multiple variables will be pasted into one variable to review individual cases, keeping a univariate frequency table. - -For illustration, we could add some more variables to the `septic_patients` dataset to learn about bacterial properties: -```{r, echo = TRUE, results = 'hide'} -my_patients <- septic_patients %>% left_join_microorganisms() -``` -Now all variables of the `microorganisms` dataset have been joined to the `septic_patients` dataset. The `microorganisms` dataset consists of the following variables: -```{r, echo = TRUE, results = 'markup'} -colnames(microorganisms) -``` - -If we compare the dimensions between the old and new dataset, we can see that these `r ncol(my_patients) - ncol(septic_patients)` variables were added: -```{r, echo = TRUE, results = 'markup'} -dim(septic_patients) -dim(my_patients) -``` - -So now the `genus` and `species` variables are available. A frequency table of these combined variables can be created like this: -```{r, echo = TRUE} -my_patients %>% - freq(genus, species, nmax = 15) -``` - -## Frequencies of numeric values - -Frequency tables can be created of any input. - -In case of numeric values (like integers, doubles, etc.) additional descriptive statistics will be calculated and shown into the header: - -```{r, echo = TRUE} -# # get age distribution of unique patients -septic_patients %>% - distinct(patient_id, .keep_all = TRUE) %>% - freq(age, nmax = 5, header = TRUE) -``` - -So the following properties are determined, where `NA` values are always ignored: - -* **Mean** - -* **Standard deviation** - -* **Coefficient of variation** (CV), the standard deviation divided by the mean - -* **Mean absolute deviation** (MAD), the median of the absolute deviations from the median - a more robust statistic than the standard deviation - -* **Five numbers of Tukey**, namely: the minimum, Q1, median, Q3 and maximum - -* **Interquartile range** (IQR), the distance between Q1 and Q3 - -* **Coefficient of quartile variation** (CQV, sometimes called *coefficient of dispersion*), calculated as (Q3 - Q1) / (Q3 + Q1) using `quantile()` with `type = 6` as quantile algorithm to comply with SPSS standards - -* **Outliers** (total count and unique count) - -So for example, the above frequency table quickly shows the median age of patients being `r my_patients %>% distinct(patient_id, .keep_all = TRUE) %>% pull(age) %>% median(na.rm = TRUE)`. - -## Frequencies of factors - -To sort frequencies of factors on their levels instead of item count, use the `sort.count` parameter. - -`sort.count` is `TRUE` by default. Compare this default behaviour... - -```{r, echo = TRUE} -septic_patients %>% - freq(hospital_id) -``` - -... to this, where items are now sorted on factor levels: - -```{r, echo = TRUE} -septic_patients %>% - freq(hospital_id, sort.count = FALSE) -``` - -All classes will be printed into the header. Variables with the new `rsi` class of this AMR package are actually ordered factors and have three classes (look at `Class` in the header): - -```{r, echo = TRUE} -septic_patients %>% - freq(AMX, header = TRUE) -``` - -## Frequencies of dates - -Frequencies of dates will show the oldest and newest date in the data, and the amount of days between them: - -```{r, echo = TRUE} -septic_patients %>% - freq(date, nmax = 5, header = TRUE) -``` - -## Assigning a frequency table to an object - -A frequency table is actually a regular `data.frame`, with the exception that it contains an additional class. - -```{r, echo = TRUE} -my_df <- septic_patients %>% freq(age) -class(my_df) -``` - -Because of this additional class, a frequency table prints like the examples above. But the object itself contains the complete table without a row limitation: - -```{r, echo = TRUE} -dim(my_df) -``` - -## Additional parameters - -### Parameter `na.rm` -With the `na.rm` parameter you can remove `NA` values from the frequency table (defaults to `TRUE`, but the number of `NA` values will always be shown into the header): - -```{r, echo = TRUE} -septic_patients %>% - freq(AMX, na.rm = FALSE) -``` - -### Parameter `row.names` -A frequency table shows row indices. To remove them, use `row.names = FALSE`: - -```{r, echo = TRUE} -septic_patients %>% - freq(hospital_id, row.names = FALSE) -``` - -### Parameter `markdown` -The `markdown` parameter is `TRUE` at default in non-interactive sessions, like in reports created with R Markdown. This will always print all rows, unless `nmax` is set. Without markdown (like in regular R), a frequency table would print like: - -```{r, echo = TRUE, results = 'markup'} -septic_patients %>% - freq(hospital_id, markdown = FALSE) -```
  • Analysing your data

    -

    Functions for conducting AMR analysis, like counting isolates, calculating resistance or susceptibility, creating frequency tables or make plots.

    +

    Functions for conducting AMR analysis, like counting isolates, calculating resistance or susceptibility, or make plots.

    Filter isolates on result in antibiotic class

    -

    freq() frequency_tbl() top_freq() header() print(<freq>)

    -

    Frequency table

    g.test()