From 84e08f516a91aee85280e1b7eafdccdea25a7a27 Mon Sep 17 00:00:00 2001 From: "Matthijs S. Berends" Date: Tue, 6 Nov 2018 16:41:59 +0100 Subject: [PATCH] grouping var for freq --- NAMESPACE | 2 +- NEWS.md | 6 ++ R/freq.R | 133 ++++++++++++++++++++++-------- man/freq.Rd | 11 ++- tests/testthat/test-freq.R | 3 + tests/testthat/test-mo_property.R | 2 +- 6 files changed, 116 insertions(+), 41 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 25d69444..fcc31c14 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -212,6 +212,7 @@ importFrom(dplyr,summarise) importFrom(dplyr,summarise_if) importFrom(dplyr,tibble) importFrom(dplyr,top_n) +importFrom(dplyr,ungroup) importFrom(dplyr,vars) importFrom(grDevices,boxplot.stats) importFrom(graphics,axis) @@ -235,5 +236,4 @@ importFrom(stats,predict) importFrom(stats,sd) importFrom(utils,View) importFrom(utils,browseVignettes) -importFrom(utils,installed.packages) importFrom(xml2,read_html) diff --git a/NEWS.md b/NEWS.md index ec6860a9..aaf0f3b9 100755 --- a/NEWS.md +++ b/NEWS.md @@ -25,6 +25,12 @@ * Using `portion_*` functions now throws a warning when total available isolate is below parameter `minimum` * Functions `as.mo`, `as.rsi`, `as.mic`, `as.atc` and `freq` will not set package name as attribute anymore * Frequency tables - `freq()`: + * Support for grouping variables, test with: + ```r + septic_patients %>% + group_by(hospital_id) %>% + freq(gender) + ``` * Check for `hms::is.hms` * Now prints in markdown at default in non-interactive sessions * No longer adds the factor level column and sorts factors on count again diff --git a/R/freq.R b/R/freq.R index 90ed50d5..ded0d3b8 100755 --- a/R/freq.R +++ b/R/freq.R @@ -19,9 +19,9 @@ #' Frequency table #' #' Create a frequency table of a vector with items or a data frame. Supports quasiquotation and markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names. -#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}} +#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} (may contain a grouping variable) or \code{\link{table}} #' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples -#' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors. +#' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except when using grouping variables. #' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows. #' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the amount of \code{NA}s. #' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)} @@ -59,8 +59,8 @@ #' The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will include more than \code{n} rows if there are ties. #' @importFrom stats fivenum sd mad #' @importFrom grDevices boxplot.stats -#' @importFrom dplyr %>% select pull n_distinct group_by arrange desc mutate summarise n_distinct tibble -#' @importFrom utils browseVignettes installed.packages +#' @importFrom dplyr %>% arrange arrange_at desc funs group_by mutate mutate_at n_distinct pull select summarise tibble ungroup vars +#' @importFrom utils browseVignettes #' @importFrom hms is.hms #' @importFrom crayon red green silver #' @keywords summary summarise frequency freq @@ -77,7 +77,7 @@ #' septic_patients$hospital_id %>% freq() #' septic_patients[, "hospital_id"] %>% freq() #' septic_patients %>% freq("hospital_id") -#' septic_patients %>% freq(hospital_id) #<- easiest to remember when you're used to tidyverse +#' septic_patients %>% freq(hospital_id) #<- easiest to remember (tidyverse) #' #' # you could also use `select` or `pull` to get your variables #' septic_patients %>% @@ -91,6 +91,11 @@ #' filter(hospital_id == "A") %>% #' freq(genus, species) #' +#' # group a variable and analyse another +#' septic_patients %>% +#' group_by(hospital_id) %>% +#' freq(gender) +#' #' # get top 10 bugs of hospital A as a vector #' septic_patients %>% #' filter(hospital_id == "A") %>% @@ -157,6 +162,8 @@ frequency_tbl <- function(x, sep = " ") { mult.columns <- 0 + x.group = character(0) + df <- NULL x.name <- NULL cols <- NULL @@ -174,6 +181,12 @@ frequency_tbl <- function(x, } if (any(class(x) == 'data.frame')) { + x.group <- group_vars(x) + if (length(x.group) > 1) { + x.group <- x.group[1L] + warning("freq supports one grouping variable, only `", x.group, "` will be kept.", call. = FALSE) + } + if (is.null(x.name)) { x.name <- deparse(substitute(x)) } @@ -188,6 +201,27 @@ frequency_tbl <- function(x, if (!all(cols %in% colnames(x))) { stop("one or more columns not found: `", paste(cols, collapse = "`, `"), '`', call. = FALSE) } + if (length(x.group) > 0) { + x.group_cols <- c(x.group, cols) + df <- x %>% + group_by_at(vars(x.group_cols)) %>% + summarise(count = n()) + if (!missing(sort.count)) { + if (sort.count == TRUE) { + df <- df %>% arrange_at(c(x.group, "count"), desc) + } + } + df <- df %>% + mutate(cum_count = cumsum(count)) + + df.topleft <- df[1, 1] + df <- df %>% + ungroup() %>% + # do not repeat group labels + mutate_at(vars(x.group), funs(ifelse(lag(.) == ., "", .))) + df[1, 1] <- df.topleft + colnames(df)[1:2] <- c("group", "item") + } if (length(cols) > 0) { x <- x[, cols] } @@ -233,10 +267,9 @@ frequency_tbl <- function(x, class(x) <- x_class } - if (sort.count == FALSE & 'factor' %in% class(x)) { - # warning("Sorting a factor sorts on factor level, not necessarily alphabetically.", call. = FALSE) - } - + # if (sort.count == FALSE & 'factor' %in% class(x)) { + # warning("Sorting a factor sorts on factor level, not necessarily alphabetically.", call. = FALSE) + # } header_txt <- character(0) markdown_line <- '' @@ -352,40 +385,52 @@ frequency_tbl <- function(x, nmax <- length(x) } - # create table with counts and percentages column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent') column_names_df <- c('item', 'count', 'percent', 'cum_count', 'cum_percent') - - - df <- tibble(item = x) %>% - group_by(item) %>% - summarise(count = n()) column_align <- c(x_align, 'r', 'r', 'r', 'r') + if (is.null(df)) { + # create table with counts and percentages + df <- tibble(item = x) %>% + group_by(item) %>% + summarise(count = n()) + + # sort according to setting + if (sort.count == TRUE) { + df <- df %>% arrange(desc(count), item) + } else { + df <- df %>% arrange(item) + } + } else { + column_names <- c("Group", column_names) + column_names_df <-c("group", column_names_df) + column_align <- c("l", column_align) + } + if (df$item %>% paste(collapse = ',') %like% '\033') { # remove escape char # see https://en.wikipedia.org/wiki/Escape_character#ASCII_escape_character df <- df %>% mutate(item = item %>% gsub('\033', ' ', ., fixed = TRUE)) } - # sort according to setting - if (sort.count == TRUE) { - df <- df %>% arrange(desc(count), item) - } else { - df <- df %>% arrange(item) - } - if (quote == TRUE) { df$item <- paste0('"', df$item, '"') + if (length(x.group) != 0) { + df$group <- paste0('"', df$group, '"') + } } df <- as.data.frame(df, stringsAsFactors = FALSE) df$percent <- df$count / base::sum(df$count, na.rm = TRUE) - df$cum_count <- base::cumsum(df$count) + if (length(x.group) == 0) { + df$cum_count <- base::cumsum(df$count) + } df$cum_percent <- df$cum_count / base::sum(df$count, na.rm = TRUE) - - colnames(df) <- column_names_df + if (length(x.group) != 0) { + # sort columns + df <- df[, column_names_df] + } if (markdown == TRUE) { tbl_format <- 'markdown' @@ -394,14 +439,15 @@ frequency_tbl <- function(x, } if (!is.null(title)) { - x.name <- trimws(gsub("^Frequency table of", "", title[1L], ignore.case = TRUE)) - cols <- NULL + title <- trimws(gsub("^Frequency table of", "", title[1L], ignore.case = TRUE)) } structure(.Data = df, class = c('frequency_tbl', class(df)), - opt = list(data = x.name, + opt = list(title = title, + data = x.name, vars = cols, + group_var = x.group, header = header, header_txt = header_txt, row_names = row.names, @@ -502,14 +548,25 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default = opt$vars <- NULL } - if (!is.null(opt$data) & !is.null(opt$vars)) { - title <- paste0("of `", paste0(opt$vars, collapse = "` and `"), "` from ", opt$data) - } else if (!is.null(opt$data) & is.null(opt$vars)) { - title <- paste("of", opt$data) - } else if (is.null(opt$data) & !is.null(opt$vars)) { - title <- paste0("of `", paste0(opt$vars, collapse = "` and `"), "`") + if (is.null(opt$title)) { + if (!is.null(opt$data) & !is.null(opt$vars)) { + title <- paste0("`", paste0(opt$vars, collapse = "` and `"), "` from ", opt$data) + } else if (!is.null(opt$data) & is.null(opt$vars)) { + title <- opt$data + } else if (is.null(opt$data) & !is.null(opt$vars)) { + title <- paste0("`", paste0(opt$vars, collapse = "` and `"), "`") + } else { + title <- "" + } + if (title != "" & length(opt$group_var) != 0) { + group_var <- paste0("(grouped by `", opt$group_var, "`)") + if (opt$tbl_format == "pandoc") { + group_var <- silver(group_var) + } + title <- paste(title, group_var) + } } else { - title <- "" + title <- opt$title } if (!missing(nmax)) { @@ -525,7 +582,11 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default = } } - title <- paste("Frequency table", trimws(title)) + if (trimws(title) == "") { + title <- "Frequency table" + } else { + title <- paste("Frequency table of", trimws(title)) + } # bold title if (opt$tbl_format == "pandoc") { diff --git a/man/freq.Rd b/man/freq.Rd index 5a752a02..80f75795 100755 --- a/man/freq.Rd +++ b/man/freq.Rd @@ -23,11 +23,11 @@ top_freq(f, n) default = 15), ...) } \arguments{ -\item{x}{vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}} +\item{x}{vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} (may contain a grouping variable) or \code{\link{table}}} \item{...}{up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples} -\item{sort.count}{sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.} +\item{sort.count}{sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except when using grouping variables.} \item{nmax}{number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.} @@ -93,7 +93,7 @@ freq(septic_patients[, "hospital_id"]) septic_patients$hospital_id \%>\% freq() septic_patients[, "hospital_id"] \%>\% freq() septic_patients \%>\% freq("hospital_id") -septic_patients \%>\% freq(hospital_id) #<- easiest to remember when you're used to tidyverse +septic_patients \%>\% freq(hospital_id) #<- easiest to remember (tidyverse) # you could also use `select` or `pull` to get your variables septic_patients \%>\% @@ -107,6 +107,11 @@ septic_patients \%>\% filter(hospital_id == "A") \%>\% freq(genus, species) +# group a variable and analyse another +septic_patients \%>\% + group_by(hospital_id) \%>\% + freq(gender) + # get top 10 bugs of hospital A as a vector septic_patients \%>\% filter(hospital_id == "A") \%>\% diff --git a/tests/testthat/test-freq.R b/tests/testthat/test-freq.R index 88af5bec..c8959002 100755 --- a/tests/testthat/test-freq.R +++ b/tests/testthat/test-freq.R @@ -61,6 +61,9 @@ test_that("frequency table works", { expect_output(septic_patients %>% select(1:9) %>% freq() %>% print()) expect_output(print(freq(septic_patients$age), nmax = 20)) + # grouping variable + expect_output(print(septic_patients %>% group_by(gender) %>% freq(hospital_id))) + # top 5 expect_equal( septic_patients %>% diff --git a/tests/testthat/test-mo_property.R b/tests/testthat/test-mo_property.R index c7755fb9..aa1300a3 100644 --- a/tests/testthat/test-mo_property.R +++ b/tests/testthat/test-mo_property.R @@ -56,7 +56,7 @@ test_that("mo_property works", { # check vector with random values library(dplyr) df_sample <- AMR::microorganisms %>% sample_n(100) - expect_identical(df_sample %>% pull(mo) %>% mo_fullname(), + expect_identical(df_sample %>% pull(mo) %>% mo_fullname(language = "en"), df_sample %>% pull(fullname)) })