From 299c5bea43176b63ffdbf914bdf10c2a09214528 Mon Sep 17 00:00:00 2001 From: "Matthijs S. Berends" Date: Tue, 23 Oct 2018 09:42:26 +0200 Subject: [PATCH] na freq, removed factors --- DESCRIPTION | 2 +- NEWS.md | 2 + R/freq.R | 126 ++++++++++++++++++++++++---------------------------- README.md | 61 ++++++------------------- 4 files changed, 76 insertions(+), 115 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4ac31f72..cd2545ef 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR Version: 0.4.0.9006 -Date: 2018-10-22 +Date: 2018-10-23 Title: Antimicrobial Resistance Analysis Authors@R: c( person( diff --git a/NEWS.md b/NEWS.md index 40b7f01a..bf1c4ef7 100755 --- a/NEWS.md +++ b/NEWS.md @@ -22,7 +22,9 @@ * Check for `hms::is.hms` in frequency tables (`freq()`) * New parameter `header` for frequency tables to turn them off (default when `markdown = TRUE`) * Freq now prints in markdown at default in non-interactive sessions +* Freq no longer add the factor level column and sorts factors on count again * Removed diacritics from all authors (columns `microorganisms$ref` and `microorganisms.old$ref`) to comply with CRAN policy to only allow ASCII characters +* Freq gained `na` parameter, to choose with character to print for empty values * Fix for `mo_property` not working properly * Fix for `EUCAST_rules` where some Streptococci would become ceftazidime R in EUCAST rule 4.5 * Support for class `difftime` in frequency tables diff --git a/R/freq.R b/R/freq.R index 35da5298..894b6dd6 100755 --- a/R/freq.R +++ b/R/freq.R @@ -23,12 +23,13 @@ #' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples #' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors. #' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows. -#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header will always print the amount of \code{NA}s. +#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the amount of \code{NA}s. #' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)} #' @param markdown a logical value indicating whether the frequency table should be printed in markdown format. This will print all rows and is default behaviour in non-interactive R sessions (like when knitting RMarkdown files). #' @param digits how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")}) #' @param quote a logical value indicating whether or not strings should be printed with surrounding quotes #' @param header a logical value indicating whether an informative header should be printed +#' @param na a character string to should be used to show empty (\code{NA}) values (only useful when \code{na.rm = FALSE}) #' @param sep a character string to separate the terms when selecting multiple columns #' @param f a frequency table #' @param n number of top \emph{n} items to return, use -n for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties. @@ -150,6 +151,7 @@ frequency_tbl <- function(x, digits = 2, quote = FALSE, header = !markdown, + na = "", sep = " ") { mult.columns <- 0 @@ -197,7 +199,7 @@ frequency_tbl <- function(x, # now this DF contains 3 columns: the 2 vars and a Freq column # paste the first 2 cols and repeat them Freq times: x <- rep(x = do.call(paste, c(x[colnames(x)[1:2]], sep = sep)), - times = x$Freq) + times = x$Freq) x.name <- "a `table` object" cols <- NULL #mult.columns <- 2 @@ -229,9 +231,8 @@ frequency_tbl <- function(x, class(x) <- x_class } - if (missing(sort.count) & 'factor' %in% class(x)) { - # sort on factor level at default when x is a factor and sort.count is not set - sort.count <- FALSE + if (sort.count == FALSE & 'factor' %in% class(x)) { + # warning("Sorting a factor sorts on factor level, not necessarily alphabetically.", call. = FALSE) } header_txt <- character(0) @@ -260,11 +261,11 @@ frequency_tbl <- function(x, } header_txt <- header_txt %>% paste0(markdown_line, '\nLength: ', (NAs %>% length() + x %>% length()) %>% format(), - ' (of which NA: ', NAs %>% length() %>% format() %>% NAs_to_red(), - ' = ', (NAs %>% length() / (NAs %>% length() + x %>% length())) %>% - percent(force_zero = TRUE, round = digits) %>% - sub('NaN', '0', ., fixed = TRUE) %>% - NAs_to_red(), ')') + ' (of which NA: ', NAs %>% length() %>% format() %>% NAs_to_red(), + ' = ', (NAs %>% length() / (NAs %>% length() + x %>% length())) %>% + percent(force_zero = TRUE, round = digits) %>% + sub('NaN', '0', ., fixed = TRUE) %>% + NAs_to_red(), ')') header_txt <- header_txt %>% paste0(markdown_line, '\nUnique: ', x %>% n_distinct() %>% format()) if (NROW(x) > 0 & any(class(x) == "character")) { @@ -287,11 +288,11 @@ frequency_tbl <- function(x, header_txt <- header_txt %>% paste0('\n') header_txt <- header_txt %>% paste(markdown_line, '\nMean: ', x %>% base::mean(na.rm = TRUE) %>% format(digits = digits)) header_txt <- header_txt %>% paste0(markdown_line, '\nStd. dev.: ', x %>% stats::sd(na.rm = TRUE) %>% format(digits = digits), - ' (CV: ', x %>% cv(na.rm = TRUE) %>% format(digits = digits), - ', MAD: ', x %>% stats::mad(na.rm = TRUE) %>% format(digits = digits), ')') + ' (CV: ', x %>% cv(na.rm = TRUE) %>% format(digits = digits), + ', MAD: ', x %>% stats::mad(na.rm = TRUE) %>% format(digits = digits), ')') header_txt <- header_txt %>% paste0(markdown_line, '\nFive-Num: ', Tukey_five %>% format(digits = digits) %>% trimws() %>% paste(collapse = ' | '), - ' (IQR: ', (Tukey_five[4] - Tukey_five[2]) %>% format(digits = digits), - ', CQV: ', x %>% cqv(na.rm = TRUE) %>% format(digits = digits), ')') + ' (IQR: ', (Tukey_five[4] - Tukey_five[2]) %>% format(digits = digits), + ', CQV: ', x %>% cqv(na.rm = TRUE) %>% format(digits = digits), ')') outlier_length <- length(boxplot.stats(x)$out) header_txt <- header_txt %>% paste0(markdown_line, '\nOutliers: ', outlier_length) if (outlier_length > 0) { @@ -300,14 +301,14 @@ frequency_tbl <- function(x, } if (NROW(x) > 0 & any(class(x) == "rsi")) { header_txt <- header_txt %>% paste0('\n') - cnt_S <- sum(x == "S") - cnt_I <- sum(x == "I") - cnt_R <- sum(x == "R") + cnt_S <- sum(x == "S", na.rm = TRUE) + cnt_I <- sum(x == "I", na.rm = TRUE) + cnt_R <- sum(x == "R", na.rm = TRUE) header_txt <- header_txt %>% paste(markdown_line, '\n%IR: ', - ((cnt_I + cnt_R) / sum(!is.na(x))) %>% percent(force_zero = TRUE, round = digits)) + ((cnt_I + cnt_R) / sum(!is.na(x), na.rm = TRUE)) %>% percent(force_zero = TRUE, round = digits)) header_txt <- header_txt %>% paste0(markdown_line, '\nRatio SIR: 1.0 : ', - (cnt_I / cnt_S) %>% format(digits = 1, nsmall = 1), " : ", - (cnt_R / cnt_S) %>% format(digits = 1, nsmall = 1)) + (cnt_I / cnt_S) %>% format(digits = 1, nsmall = 1), " : ", + (cnt_R / cnt_S) %>% format(digits = 1, nsmall = 1)) } formatdates <- "%e %B %Y" # = d mmmm yyyy @@ -327,15 +328,15 @@ frequency_tbl <- function(x, # hms header_txt <- header_txt %>% paste0(markdown_line, '\nEarliest: ', mindate %>% format(formatdates) %>% trimws()) header_txt <- header_txt %>% paste0(markdown_line, '\nLatest: ', maxdate %>% format(formatdates) %>% trimws(), - ' (+', difftime(maxdate, mindate, units = 'mins') %>% as.double() %>% format(digits = digits), ' min.)') + ' (+', difftime(maxdate, mindate, units = 'mins') %>% as.double() %>% format(digits = digits), ' min.)') } else { # other date formats header_txt <- header_txt %>% paste0(markdown_line, '\nOldest: ', mindate %>% format(formatdates) %>% trimws()) header_txt <- header_txt %>% paste0(markdown_line, '\nNewest: ', maxdate %>% format(formatdates) %>% trimws(), - ' (+', difftime(maxdate, mindate, units = 'auto') %>% as.double() %>% format(digits = digits), ')') + ' (+', difftime(maxdate, mindate, units = 'auto') %>% as.double() %>% format(digits = digits), ')') } header_txt <- header_txt %>% paste0(markdown_line, '\nMedian: ', mediandate %>% format(formatdates) %>% trimws(), - ' (~', percent(median_days / maxdate_days, round = 0), ')') + ' (~', percent(median_days / maxdate_days, round = 0), ')') } if (any(class(x) == 'POSIXlt')) { x <- x %>% format(formatdates) @@ -354,23 +355,14 @@ frequency_tbl <- function(x, } # create table with counts and percentages - column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent', '(Factor Level)') - column_names_df <- c('item', 'count', 'percent', 'cum_count', 'cum_percent', 'factor_level') + column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent') + column_names_df <- c('item', 'count', 'percent', 'cum_count', 'cum_percent') - if (any(class(x) == 'factor')) { - df <- tibble(item = x, - fctlvl = x %>% as.integer()) %>% - group_by(item, fctlvl) - column_align <- c('l', 'r', 'r', 'r', 'r', 'r') - } else { - df <- tibble(item = x) %>% - group_by(item) - # strip factor lvl from col names - column_names <- column_names[1:length(column_names) - 1] - column_names_df <- column_names_df[1:length(column_names_df) - 1] - column_align <- c(x_align, 'r', 'r', 'r', 'r') - } - df <- df %>% summarise(count = n()) + + df <- tibble(item = x) %>% + group_by(item) %>% + summarise(count = n()) + column_align <- c(x_align, 'r', 'r', 'r', 'r') if (df$item %>% paste(collapse = ',') %like% '\033') { # remove escape char @@ -382,11 +374,7 @@ frequency_tbl <- function(x, if (sort.count == TRUE) { df <- df %>% arrange(desc(count), item) } else { - if (any(class(x) == 'factor')) { - df <- df %>% arrange(fctlvl, item) - } else { - df <- df %>% arrange(item) - } + df <- df %>% arrange(item) } if (quote == TRUE) { @@ -399,34 +387,27 @@ frequency_tbl <- function(x, df$cum_count <- base::cumsum(df$count) df$cum_percent <- df$cum_count / base::sum(df$count, na.rm = TRUE) - if (any(class(x) == 'factor')) { - # put factor last - df <- df %>% select(item, count, percent, cum_count, cum_percent, fctlvl) - } - colnames(df) <- column_names_df - class(df) <- c('frequency_tbl', class(df)) - attr(df, 'package') <- 'AMR' - if (markdown == TRUE) { tbl_format <- 'markdown' } else { tbl_format <- 'pandoc' } - attr(df, 'opt') <- list(data = x.name, - vars = cols, - header = header, - header_txt = header_txt, - row_names = row.names, - column_names = column_names, - column_align = column_align, - tbl_format = tbl_format, - nmax = nmax, - nmax.set = nmax.set) - - df + structure(.Data = df, + class = c('frequency_tbl', class(df)), + opt = list(data = x.name, + vars = cols, + header = header, + header_txt = header_txt, + row_names = row.names, + column_names = column_names, + column_align = column_align, + tbl_format = tbl_format, + na = na, + nmax = nmax, + nmax.set = nmax.set)) } #' @rdname freq @@ -547,7 +528,7 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default = if (opt$tbl_format == "pandoc") { title <- bold(title) } else if (opt$tbl_format == "markdown") { - title <- paste0("**", title, "**") + title <- paste0("**", title, "**") } if (opt$header == TRUE) { @@ -571,7 +552,10 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default = # save old NA setting for kable opt.old <- options()$knitr.kable.NA - options(knitr.kable.NA = "") + if (is.null(opt$na)) { + opt$na <- "" + } + options(knitr.kable.NA = opt$na) if (nrow(x) > opt$nmax & opt$tbl_format != "markdown") { @@ -615,6 +599,10 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default = x$cum_count <- format(x$cum_count) x$cum_percent <- percent(x$cum_percent, force_zero = TRUE) + if (opt$tbl_format == "markdown") { + cat("\n\n") + } + print( knitr::kable(x, format = opt$tbl_format, @@ -628,7 +616,11 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default = cat(footer) } - cat('\n') + if (opt$tbl_format == "markdown") { + cat("\n\n") + } else { + cat('\n') + } # reset old kable setting options(knitr.kable.NA = opt.old) diff --git a/README.md b/README.md index 12010865..544d8bb8 100755 --- a/README.md +++ b/README.md @@ -361,69 +361,36 @@ freq(mydata$myvariable) mydata %>% freq(myvariable) ``` -Factors sort on item by default: +Frequency are of course sorted by count at default: ```r septic_patients %>% freq(hospital_id) -# Frequency table of `hospital_id` # Class: factor (numeric) # Length: 2000 (of which NA: 0 = 0.00%) # Unique: 4 # -# Item Count Percent Cum. Count Cum. Percent (Factor Level) -# --- ----- ------ -------- ----------- ------------- --------------- -# 1 A 321 16.1% 321 16.1% 1 -# 2 B 663 33.1% 984 49.2% 2 -# 3 C 254 12.7% 1238 61.9% 3 -# 4 D 762 38.1% 2000 100.0% 4 +# Item Count Percent Cum. Count Cum. Percent +# --- ----- ------ -------- ----------- ------------- +# 1 D 762 38.1% 762 38.1% +# 2 B 663 33.1% 1425 71.2% +# 3 A 321 16.1% 1746 87.3% +# 4 C 254 12.7% 2000 100.0% ``` This can be changed with the `sort.count` parameter: ```r -septic_patients %>% freq(hospital_id, sort.count = TRUE) -# Frequency table of `hospital_id` +septic_patients %>% freq(hospital_id, sort.count = FALSE) # Class: factor (numeric) # Length: 2000 (of which NA: 0 = 0.00%) # Unique: 4 # -# Item Count Percent Cum. Count Cum. Percent (Factor Level) -# --- ----- ------ -------- ----------- ------------- --------------- -# 1 D 762 38.1% 762 38.1% 4 -# 2 B 663 33.1% 1425 71.2% 2 -# 3 A 321 16.1% 1746 87.3% 1 -# 4 C 254 12.7% 2000 100.0% 3 +# Item Count Percent Cum. Count Cum. Percent +# --- ----- ------ -------- ----------- ------------- +# 1 A 321 16.1% 321 16.1% +# 2 B 663 33.1% 984 49.2% +# 3 C 254 12.7% 1238 61.9% +# 4 D 762 38.1% 2000 100.0% ``` -All other types, like numbers, characters and dates, sort on count by default: -```r -septic_patients %>% freq(date) -# Frequency table of `date` -# Class: Date -# Length: 2000 (of which NA: 0 = 0.0%) -# Unique: 1151 -# -# Oldest: 2 January 2002 -# Newest: 28 December 2017 (+5839) -# Median: 31 July 2009 (~47%) -# -# Item Count Percent Cum. Count Cum. Percent -# --- ----------- ------ -------- ----------- ------------- -# 1 2016-05-21 10 0.5% 10 0.5% -# 2 2004-11-15 8 0.4% 18 0.9% -# 3 2013-07-29 8 0.4% 26 1.3% -# 4 2017-06-12 8 0.4% 34 1.7% -# 5 2015-11-19 7 0.4% 41 2.1% -# 6 2005-12-22 6 0.3% 47 2.4% -# 7 2015-10-12 6 0.3% 53 2.6% -# 8 2002-02-27 5 0.2% 58 2.9% -# 9 2003-10-20 5 0.2% 63 3.1% -# 10 2004-02-02 5 0.2% 68 3.4% -# 11 2004-02-18 5 0.2% 73 3.6% -# 12 2004-06-22 5 0.2% 78 3.9% -# 13 2004-12-01 5 0.2% 83 4.2% -# 14 2005-08-16 5 0.2% 88 4.4% -# 15 2005-09-01 5 0.2% 93 4.7% -# [ reached getOption("max.print.freq") -- omitted 1136 entries, n = 1907 (95.3%) ] -``` For numeric values, some extra descriptive statistics will be calculated: ```r freq(runif(n = 10, min = 1, max = 5))