na freq, removed factors

2025-07-23 10:23:18 +02:00 · 2018-10-23 09:42:26 +02:00
parent 9c56257da1
commit 299c5bea43
4 changed files with 76 additions and 115 deletions
--- a/2
+++ b/2
@ -1,6 +1,6 @@
 Package: AMR
 Version: 0.4.0.9006
-Date: 2018-10-22
+Date: 2018-10-23
 Title: Antimicrobial Resistance Analysis
 Authors@R: c(
    person(
--- a/NEWS.md
+++ b/NEWS.md
@ -22,7 +22,9 @@
 * Check for `hms::is.hms` in frequency tables (`freq()`)
 * New parameter `header` for frequency tables to turn them off (default when `markdown = TRUE`)
 * Freq now prints in markdown at default in non-interactive sessions
+* Freq no longer add the factor level column and sorts factors on count again
 * Removed diacritics from all authors (columns `microorganisms$ref` and `microorganisms.old$ref`) to comply with CRAN policy to only allow ASCII characters
+* Freq gained `na` parameter, to choose with character to print for empty values
 * Fix for `mo_property` not working properly
 * Fix for `EUCAST_rules` where some Streptococci would become ceftazidime R in EUCAST rule 4.5
 * Support for class `difftime` in frequency tables
--- a/R/freq.R
+++ b/R/freq.R
@ -23,12 +23,13 @@
 #' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples
 #' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.
 #' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.
-#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header will always print the amount of \code{NA}s.
+#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the amount of \code{NA}s.
 #' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)}
 #' @param markdown a logical value indicating whether the frequency table should be printed in markdown format. This will print all rows and is default behaviour in non-interactive R sessions (like when knitting RMarkdown files).
 #' @param digits how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")})
 #' @param quote a logical value indicating whether or not strings should be printed with surrounding quotes
 #' @param header a logical value indicating whether an informative header should be printed
+#' @param na a character string to should be used to show empty (\code{NA}) values (only useful when \code{na.rm = FALSE})
 #' @param sep a character string to separate the terms when selecting multiple columns
 #' @param f a frequency table
 #' @param n number of top \emph{n} items to return, use -n for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties.
@ -150,6 +151,7 @@ frequency_tbl <- function(x,
                          digits = 2,
                          quote = FALSE,
                          header = !markdown,
+                          na = "<NA>",
                          sep = " ") {

  mult.columns <- 0
@ -197,7 +199,7 @@ frequency_tbl <- function(x,
    # now this DF contains 3 columns: the 2 vars and a Freq column
    # paste the first 2 cols and repeat them Freq times:
    x <- rep(x = do.call(paste, c(x[colnames(x)[1:2]], sep = sep)),
-              times = x$Freq)
+             times = x$Freq)
    x.name <- "a `table` object"
    cols <- NULL
    #mult.columns <- 2
@ -229,9 +231,8 @@ frequency_tbl <- function(x,
    class(x) <- x_class
  }

-  if (missing(sort.count) & 'factor' %in% class(x)) {
-    # sort on factor level at default when x is a factor and sort.count is not set
-    sort.count <- FALSE
+  if (sort.count == FALSE & 'factor' %in% class(x)) {
+    # warning("Sorting a factor sorts on factor level, not necessarily alphabetically.", call. = FALSE)
  }

  header_txt <- character(0)
@ -260,11 +261,11 @@ frequency_tbl <- function(x,
  }

  header_txt <- header_txt %>% paste0(markdown_line, '\nLength:    ', (NAs %>% length() + x %>% length()) %>% format(),
-                              ' (of which NA: ', NAs %>% length() %>% format() %>% NAs_to_red(),
-                              ' = ', (NAs %>% length() / (NAs %>% length() + x %>% length())) %>%
-                                percent(force_zero = TRUE, round = digits) %>%
-                                sub('NaN', '0', ., fixed = TRUE) %>%
-                                NAs_to_red(), ')')
+                                      ' (of which NA: ', NAs %>% length() %>% format() %>% NAs_to_red(),
+                                      ' = ', (NAs %>% length() / (NAs %>% length() + x %>% length())) %>%
+                                        percent(force_zero = TRUE, round = digits) %>%
+                                        sub('NaN', '0', ., fixed = TRUE) %>%
+                                        NAs_to_red(), ')')
  header_txt <- header_txt %>% paste0(markdown_line, '\nUnique:    ', x %>% n_distinct() %>% format())

  if (NROW(x) > 0 & any(class(x) == "character")) {
@ -287,11 +288,11 @@ frequency_tbl <- function(x,
    header_txt <- header_txt %>% paste0('\n')
    header_txt <- header_txt %>% paste(markdown_line, '\nMean:     ', x %>% base::mean(na.rm = TRUE) %>% format(digits = digits))
    header_txt <- header_txt %>% paste0(markdown_line, '\nStd. dev.: ', x %>% stats::sd(na.rm = TRUE) %>% format(digits = digits),
-                                ' (CV: ', x %>% cv(na.rm = TRUE) %>% format(digits = digits),
-                                ', MAD: ', x %>% stats::mad(na.rm = TRUE) %>% format(digits = digits), ')')
+                                        ' (CV: ', x %>% cv(na.rm = TRUE) %>% format(digits = digits),
+                                        ', MAD: ', x %>% stats::mad(na.rm = TRUE) %>% format(digits = digits), ')')
    header_txt <- header_txt %>% paste0(markdown_line, '\nFive-Num:  ', Tukey_five %>% format(digits = digits) %>% trimws() %>% paste(collapse = ' | '),
-                                ' (IQR: ', (Tukey_five[4] - Tukey_five[2]) %>% format(digits = digits),
-                                ', CQV: ', x %>% cqv(na.rm = TRUE) %>% format(digits = digits), ')')
+                                        ' (IQR: ', (Tukey_five[4] - Tukey_five[2]) %>% format(digits = digits),
+                                        ', CQV: ', x %>% cqv(na.rm = TRUE) %>% format(digits = digits), ')')
    outlier_length <- length(boxplot.stats(x)$out)
    header_txt <- header_txt %>% paste0(markdown_line, '\nOutliers:  ', outlier_length)
    if (outlier_length > 0) {
@ -300,14 +301,14 @@ frequency_tbl <- function(x,
  }
  if (NROW(x) > 0 & any(class(x) == "rsi")) {
    header_txt <- header_txt %>% paste0('\n')
-    cnt_S <- sum(x == "S")
-    cnt_I <- sum(x == "I")
-    cnt_R <- sum(x == "R")
+    cnt_S <- sum(x == "S", na.rm = TRUE)
+    cnt_I <- sum(x == "I", na.rm = TRUE)
+    cnt_R <- sum(x == "R", na.rm = TRUE)
    header_txt <- header_txt %>% paste(markdown_line, '\n%IR:      ',
-                               ((cnt_I + cnt_R) / sum(!is.na(x))) %>% percent(force_zero = TRUE, round = digits))
+                                       ((cnt_I + cnt_R) / sum(!is.na(x), na.rm = TRUE)) %>% percent(force_zero = TRUE, round = digits))
    header_txt <- header_txt %>% paste0(markdown_line, '\nRatio SIR: 1.0 : ',
-                                (cnt_I / cnt_S) %>% format(digits = 1, nsmall = 1), " : ",
-                                (cnt_R / cnt_S) %>% format(digits = 1, nsmall = 1))
+                                        (cnt_I / cnt_S) %>% format(digits = 1, nsmall = 1), " : ",
+                                        (cnt_R / cnt_S) %>% format(digits = 1, nsmall = 1))
  }

  formatdates <- "%e %B %Y" # = d mmmm yyyy
@ -327,15 +328,15 @@ frequency_tbl <- function(x,
      # hms
      header_txt <- header_txt %>% paste0(markdown_line, '\nEarliest:  ', mindate %>% format(formatdates) %>% trimws())
      header_txt <- header_txt %>% paste0(markdown_line, '\nLatest:    ', maxdate %>% format(formatdates) %>% trimws(),
-                                  ' (+', difftime(maxdate, mindate, units = 'mins') %>% as.double() %>% format(digits = digits), ' min.)')
+                                          ' (+', difftime(maxdate, mindate, units = 'mins') %>% as.double() %>% format(digits = digits), ' min.)')
    } else {
      # other date formats
      header_txt <- header_txt %>% paste0(markdown_line, '\nOldest:    ', mindate %>% format(formatdates) %>% trimws())
      header_txt <- header_txt %>% paste0(markdown_line, '\nNewest:    ', maxdate %>% format(formatdates) %>% trimws(),
-                                  ' (+', difftime(maxdate, mindate, units = 'auto') %>% as.double() %>% format(digits = digits), ')')
+                                          ' (+', difftime(maxdate, mindate, units = 'auto') %>% as.double() %>% format(digits = digits), ')')
    }
    header_txt <- header_txt %>% paste0(markdown_line, '\nMedian:    ', mediandate %>% format(formatdates) %>% trimws(),
-                                ' (~', percent(median_days / maxdate_days, round = 0), ')')
+                                        ' (~', percent(median_days / maxdate_days, round = 0), ')')
  }
  if (any(class(x) == 'POSIXlt')) {
    x <- x %>% format(formatdates)
@ -354,23 +355,14 @@ frequency_tbl <- function(x,
  }

  # create table with counts and percentages
-  column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent', '(Factor Level)')
-  column_names_df <- c('item', 'count', 'percent', 'cum_count', 'cum_percent', 'factor_level')
+  column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent')
+  column_names_df <- c('item', 'count', 'percent', 'cum_count', 'cum_percent')

-  if (any(class(x) == 'factor')) {
-    df <- tibble(item = x,
-                         fctlvl = x %>% as.integer()) %>%
-      group_by(item, fctlvl)
-    column_align <- c('l', 'r', 'r', 'r', 'r', 'r')
-  } else {
-    df <- tibble(item = x) %>%
-      group_by(item)
-    # strip factor lvl from col names
-    column_names <- column_names[1:length(column_names) - 1]
-    column_names_df <- column_names_df[1:length(column_names_df) - 1]
-    column_align <- c(x_align, 'r', 'r', 'r', 'r')
-  }
-  df <- df %>% summarise(count = n())
+
+  df <- tibble(item = x) %>%
+    group_by(item) %>%
+    summarise(count = n())
+  column_align <- c(x_align, 'r', 'r', 'r', 'r')

  if (df$item %>% paste(collapse = ',') %like% '\033') {
    # remove escape char
@ -382,11 +374,7 @@ frequency_tbl <- function(x,
  if (sort.count == TRUE) {
    df <- df %>% arrange(desc(count), item)
  } else {
-    if (any(class(x) == 'factor')) {
-      df <- df %>% arrange(fctlvl, item)
-    } else {
-      df <- df %>% arrange(item)
-    }
+    df <- df %>% arrange(item)
  }

  if (quote == TRUE) {
@ -399,34 +387,27 @@ frequency_tbl <- function(x,
  df$cum_count <- base::cumsum(df$count)
  df$cum_percent <- df$cum_count / base::sum(df$count, na.rm = TRUE)

-  if (any(class(x) == 'factor')) {
-    # put factor last
-    df <- df %>% select(item, count, percent, cum_count, cum_percent, fctlvl)
-  }
-
  colnames(df) <- column_names_df

-  class(df) <- c('frequency_tbl', class(df))
-  attr(df, 'package') <- 'AMR'
-
  if (markdown == TRUE) {
    tbl_format <- 'markdown'
  } else {
    tbl_format <- 'pandoc'
  }

-  attr(df, 'opt') <- list(data = x.name,
-                          vars = cols,
-                          header = header,
-                          header_txt = header_txt,
-                          row_names = row.names,
-                          column_names = column_names,
-                          column_align = column_align,
-                          tbl_format = tbl_format,
-                          nmax = nmax,
-                          nmax.set = nmax.set)
-
-  df
+  structure(.Data = df,
+            class = c('frequency_tbl', class(df)),
+            opt = list(data = x.name,
+                       vars = cols,
+                       header = header,
+                       header_txt = header_txt,
+                       row_names = row.names,
+                       column_names = column_names,
+                       column_align = column_align,
+                       tbl_format = tbl_format,
+                       na = na,
+                       nmax = nmax,
+                       nmax.set = nmax.set))
 }

 #' @rdname freq
@ -547,7 +528,7 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
  if (opt$tbl_format == "pandoc") {
    title <- bold(title)
  } else if (opt$tbl_format == "markdown") {
-   title <- paste0("**", title, "**")
+    title <- paste0("**", title, "**")
  }

  if (opt$header == TRUE) {
@ -571,7 +552,10 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =

  # save old NA setting for kable
  opt.old <- options()$knitr.kable.NA
-  options(knitr.kable.NA = "<NA>")
+  if (is.null(opt$na)) {
+    opt$na <- "<NA>"
+  }
+  options(knitr.kable.NA = opt$na)

  if (nrow(x) > opt$nmax & opt$tbl_format != "markdown") {

@ -615,6 +599,10 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
  x$cum_count <- format(x$cum_count)
  x$cum_percent <- percent(x$cum_percent, force_zero = TRUE)

+  if (opt$tbl_format == "markdown") {
+    cat("\n\n")
+  }
+
  print(
    knitr::kable(x,
                 format = opt$tbl_format,
@ -628,7 +616,11 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
    cat(footer)
  }

-  cat('\n')
+  if (opt$tbl_format == "markdown") {
+    cat("\n\n")
+  } else {
+    cat('\n')
+  }

  # reset old kable setting
  options(knitr.kable.NA = opt.old)
--- a/README.md
+++ b/README.md
@ -361,69 +361,36 @@ freq(mydata$myvariable)
 mydata %>% freq(myvariable)
 ```

-Factors sort on item by default:
+Frequency are of course sorted by count at default:
 ```r
 septic_patients %>% freq(hospital_id)
-# Frequency table of `hospital_id` 
 # Class:     factor (numeric)
 # Length:    2000 (of which NA: 0 = 0.00%)
 # Unique:    4
 # 
-#      Item    Count   Percent   Cum. Count   Cum. Percent   (Factor Level)
-# ---  -----  ------  --------  -----------  -------------  ---------------
-# 1    A         321     16.1%          321          16.1%                1
-# 2    B         663     33.1%          984          49.2%                2
-# 3    C         254     12.7%         1238          61.9%                3
-# 4    D         762     38.1%         2000         100.0%                4
+#      Item    Count   Percent   Cum. Count   Cum. Percent
+# ---  -----  ------  --------  -----------  -------------
+# 1    D         762     38.1%          762          38.1%
+# 2    B         663     33.1%         1425          71.2%
+# 3    A         321     16.1%         1746          87.3%
+# 4    C         254     12.7%         2000         100.0%
 ```

 This can be changed with the `sort.count` parameter:
 ```r
-septic_patients %>% freq(hospital_id, sort.count = TRUE)
-# Frequency table of `hospital_id` 
+septic_patients %>% freq(hospital_id, sort.count = FALSE)
 # Class:     factor (numeric)
 # Length:    2000 (of which NA: 0 = 0.00%)
 # Unique:    4
 # 
-#      Item    Count   Percent   Cum. Count   Cum. Percent   (Factor Level)
-# ---  -----  ------  --------  -----------  -------------  ---------------
-# 1    D         762     38.1%          762          38.1%                4
-# 2    B         663     33.1%         1425          71.2%                2
-# 3    A         321     16.1%         1746          87.3%                1
-# 4    C         254     12.7%         2000         100.0%                3
+#      Item    Count   Percent   Cum. Count   Cum. Percent
+# ---  -----  ------  --------  -----------  -------------
+# 1    A         321     16.1%          321          16.1%
+# 2    B         663     33.1%          984          49.2%
+# 3    C         254     12.7%         1238          61.9%
+# 4    D         762     38.1%         2000         100.0%
 ```

-All other types, like numbers, characters and dates, sort on count by default:
-```r
-septic_patients %>% freq(date)
-# Frequency table of `date` 
-# Class:     Date
-# Length:    2000 (of which NA: 0 = 0.0%)
-# Unique:    1151
-# 
-# Oldest:    2 January 2002
-# Newest:    28 December 2017 (+5839)
-# Median:    31 July 2009 (~47%)
-# 
-#      Item          Count   Percent   Cum. Count   Cum. Percent
-# ---  -----------  ------  --------  -----------  -------------
-# 1    2016-05-21       10      0.5%           10           0.5%
-# 2    2004-11-15        8      0.4%           18           0.9%
-# 3    2013-07-29        8      0.4%           26           1.3%
-# 4    2017-06-12        8      0.4%           34           1.7%
-# 5    2015-11-19        7      0.4%           41           2.1%
-# 6    2005-12-22        6      0.3%           47           2.4%
-# 7    2015-10-12        6      0.3%           53           2.6%
-# 8    2002-02-27        5      0.2%           58           2.9%
-# 9    2003-10-20        5      0.2%           63           3.1%
-# 10   2004-02-02        5      0.2%           68           3.4%
-# 11   2004-02-18        5      0.2%           73           3.6%
-# 12   2004-06-22        5      0.2%           78           3.9%
-# 13   2004-12-01        5      0.2%           83           4.2%
-# 14   2005-08-16        5      0.2%           88           4.4%
-# 15   2005-09-01        5      0.2%           93           4.7%
-# [ reached getOption("max.print.freq") -- omitted 1136 entries, n = 1907 (95.3%) ]
-```
 For numeric values, some extra descriptive statistics will be calculated:
 ```r
 freq(runif(n = 10, min = 1, max = 5))