freq: support for table

2025-06-08 11:13:58 +02:00 · 2018-07-09 14:02:58 +02:00 · 2018-07-09 14:02:58 +02:00 · fc30d3fb13
commit fc30d3fb13
parent 18c91786bf
9 changed files with 199 additions and 93 deletions
--- a/.Rbuildignore
+++ b/.Rbuildignore
@ -1,3 +1,4 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 .travis.yml
 .zenodo.json
--- a/3
+++ b/3
@ -43,7 +43,8 @@ Suggests:
    testthat (>= 1.0.2),
    covr (>= 3.0.1),
    rmarkdown,
-    rstudioapi
+    rstudioapi,
    tidyr
 VignetteBuilder: knitr
 URL: https://github.com/msberends/AMR
 BugReports: https://github.com/msberends/AMR/issues
--- a/5
+++ b/5
@ -4,6 +4,8 @@ S3method(as.data.frame,frequency_tbl)
 S3method(as.double,mic)
 S3method(as.integer,mic)
 S3method(as.numeric,mic)
 S3method(as.vector,frequency_tbl)
 S3method(as_tibble,frequency_tbl)
 S3method(barplot,mic)
 S3method(barplot,rsi)
 S3method(hist,frequency_tbl)
@ -69,6 +71,8 @@ exportMethods(as.data.frame.frequency_tbl)
 exportMethods(as.double.mic)
 exportMethods(as.integer.mic)
 exportMethods(as.numeric.mic)
 exportMethods(as.vector.frequency_tbl)
 exportMethods(as_tibble.frequency_tbl)
 exportMethods(barplot.mic)
 exportMethods(barplot.rsi)
 exportMethods(hist.frequency_tbl)
@ -147,6 +151,7 @@ importFrom(stats,sd)
 importFrom(tibble,tibble)
 importFrom(utils,View)
 importFrom(utils,browseVignettes)
 importFrom(utils,installed.packages)
 importFrom(utils,object.size)
 importFrom(utils,packageDescription)
 importFrom(utils,read.delim)
--- a/NEWS.md
+++ b/NEWS.md
@ -4,7 +4,9 @@
 * For convenience, descriptive statistical functions `kurtosis` and `skewness` that are lacking in base R - they are generic functions and have support for vectors, data.frames and matrices
 * New for frequency tables (function `freq`):
  * A vignette to explain its usage
  * Support for `table` to use as input: `freq(table(x, y))`
  * Support for existing functions `hist` and `plot` to use a frequency table as input: `hist(freq(df$age))`
  * Support for `as.vector`, `as.data.frame` and `as_tibble`
  * Support for quasiquotation: `freq(mydata, mycolumn)` is the same as `mydata %>% freq(mycolumn)`
  * Function `top_freq` to return the top/bottom *n* items as vector
  * Header of frequency tables now also shows Mean Absolute Deviation (MAD) and Interquartile Range (IQR)
--- a/R/freq.R
+++ b/R/freq.R
@ -19,8 +19,8 @@
 #' Frequency table
 #'
 #' Create a frequency table of a vector with items or a data frame. Supports quasiquotation and markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names.
-#' @param x vector with items, or a \code{data.frame}
+#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}
-#' @param ... up to nine different columns of \code{x} to calculate frequencies from, see Examples
+#' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples
 #' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.
 #' @param nmax number of rows to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.
 #' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header will always print the amount of \code{NA}s.
@ -56,7 +56,7 @@
 #' @importFrom stats fivenum sd mad
 #' @importFrom grDevices boxplot.stats
 #' @importFrom dplyr %>% select pull n_distinct group_by arrange desc mutate summarise n_distinct
-#' @importFrom utils browseVignettes
+#' @importFrom utils browseVignettes installed.packages
 #' @importFrom tibble tibble
 #' @keywords summary summarise frequency freq
 #' @rdname freq
@ -72,20 +72,15 @@
 #' septic_patients$hospital_id %>% freq()
 #' septic_patients[, "hospital_id"] %>% freq()
 #' septic_patients %>% freq("hospital_id")
-#' septic_patients %>% freq(hospital_id)  # <- easiest to remember when used to tidyverse
+#' septic_patients %>% freq(hospital_id)  #<- easiest to remember when you're used to tidyverse
 #'
-#' # you could use `select`...
+#' # you could also use `select` or `pull` to get your variables
 #' septic_patients %>%
 #'   filter(hospital_id == "A") %>%
 #'   select(bactid) %>%
 #'   freq()
 #'
-#' # ... or you use `freq` to select it immediately
+#' # multiple selected variables will be pasted together
 #' septic_patients %>%
 #'   filter(hospital_id == "A") %>%
 #'   freq(bactid)
 #'
 #' # select multiple columns; they will be pasted together
 #' septic_patients %>%
 #'   left_join_microorganisms %>%
 #'   filter(hospital_id == "A") %>%
@ -102,13 +97,40 @@
 #'   mutate(year = format(date, "%Y")) %>%
 #'   freq(year)
 #'
-#' # print only top 5
+#' # show only the top 5
 #' years %>% print(nmax = 5)
 #'
-#' # transform to plain data.frame
+#' # print a histogram of numeric values
 #' septic_patients %>%
 #'   freq(age) %>%
 #'   hist() # prettier: ggplot(septic_patients, aes(age)) + geom_histogram()
 #'
 #' # or print all points to a regular plot
 #' septic_patients %>%
 #'   freq(age) %>%
 #'   plot()
 #'
 #' # transform to a data.frame or tibble
 #' septic_patients %>%
 #'   freq(age) %>%
 #'   as.data.frame()
 #'
 #' # or transform (back) to a vector
 #' septic_patients %>%
 #'   freq(age) %>%
 #'   as.vector()
 #'
 #' identical(septic_patients %>%
 #'             freq(age) %>%
 #'             as.vector() %>%
 #'             sort(),
 #'           sort(septic_patients$age)
 #' ) # TRUE
 #'
 #' # also supports table:
 #' table(septic_patients$sex,
 #'       septic_patients$age) %>%
 #'   freq()
 frequency_tbl <- function(x,
                          ...,
                          sort.count = TRUE,
@ -138,6 +160,24 @@ frequency_tbl <- function(x,
    } else {
      cols <- NULL
    }
  } else if (any(class(x) == 'table')) {
    if (!"tidyr" %in% rownames(installed.packages())) {
      stop('transformation from `table` to frequency table requires the tidyr package.', call. = FALSE)
    }
    values <- x %>%
      as.data.frame(stringsAsFactors = FALSE) %>%
      # delete last variable: these are frequencies
      select(-ncol(.)) %>%
      # paste all other columns:
      tidyr::unite(sep = sep) %>%
      .[, 1]
    counts <- x %>%
      as.data.frame(stringsAsFactors = FALSE) %>%
      # get last variable: these are frequencies
      pull(ncol(.))
    x <- rep(values, counts)
    x.name <- NULL
    cols <- NULL
  } else {
    x.name <- NULL
    cols <- NULL
@ -523,41 +563,47 @@ as.data.frame.frequency_tbl <- function(x, ...) {
  as.data.frame.data.frame(x, ...)
 }
 #' @noRd
 #' @exportMethod as_tibble.frequency_tbl
 #' @export
 #' @importFrom dplyr as_tibble
 as_tibble.frequency_tbl <- function(x, validate = TRUE, ..., rownames = NA) {
  # S3 method: convert a frequency table to a tibble.
  # First drop the 'package', 'package.version' and 'opt' attributes from x.
  attr(x, 'package') <- NULL
  attr(x, 'package.version') <- NULL
  attr(x, 'opt') <- NULL
  # Go through as.data.frame() and hand that result to as_tibble(),
  # forwarding `validate`, `...` and `rownames` unchanged.
  as_tibble(x = as.data.frame(x), validate = validate, ..., rownames = rownames)
 }
 #' @noRd
 #' @exportMethod hist.frequency_tbl
 #' @export
 #' @importFrom dplyr %>% pull
 #' @importFrom graphics hist
 hist.frequency_tbl <- function(x, ...) {
  opt <- attr(x, 'opt')
  if (!is.null(opt$vars)) {
    title <- opt$vars
  } else {
    title <- ""
  }
-
+  hist(as.vector(x), main = paste("Histogram of", title), xlab = title, ...)
  items <- x %>% pull(item)
  counts <- x %>% pull(count)
  vect <- rep(items, counts)
  hist(vect, main = paste("Histogram of", title), xlab = title, ...)
 }
 #' @noRd
 #' @exportMethod plot.frequency_tbl
 #' @export
 #' @importFrom dplyr %>% pull
 plot.frequency_tbl <- function(x, y, ...) {
  opt <- attr(x, 'opt')
  if (!is.null(opt$vars)) {
    title <- opt$vars
  } else {
    title <- ""
  }
-
+  plot(x = x$item, y = x$count, ylab = "Count", xlab = title, ...)
-  items <- x %>% pull(item)
+}
-  counts <- x %>% pull(count)
+
-  plot(x = items, y = counts, ylab = "Count", xlab = title, ...)
+#' @noRd
 #' @exportMethod as.vector.frequency_tbl
 #' @export
 as.vector.frequency_tbl <- function(x, mode = "any") {
  # S3 method: expand the frequency table back into a flat vector by
  # repeating each value of x$item x$count times, then coerce with `mode`.
  as.vector(rep(x$item, x$count), mode = mode)
 }
--- a/R/globals.R
+++ b/R/globals.R
@ -22,6 +22,7 @@ globalVariables(c('abname',
                  'bactid',
                  'cnt',
                  'count',
                  'counts',
                  'cum_count',
                  'cum_percent',
                  'date_lab',
@ -50,6 +51,7 @@ globalVariables(c('abname',
                  'septic_patients',
                  'species',
                  'umcg',
                  'values',
                  'View',
                  'y',
                  '.'))
--- a/README.md
+++ b/README.md
@ -47,9 +47,12 @@ With the `MDRO` function (abbreviation of Multi Drug Resistant Organisms), you c
 This package is available on CRAN and also here on GitHub.
 ### From CRAN (recommended)
 Latest released version on CRAN:
 [![CRAN_Badge](https://img.shields.io/cran/v/AMR.svg?label=CRAN&colorB=3679BC)](http://cran.r-project.org/package=AMR)
-Downloads via RStudio CRAN server (downloads by all other CRAN mirrors not measured):
+Downloads via RStudio CRAN server (downloads by all other CRAN mirrors **not** measured, including the official https://cran.r-project.org):
 [![CRAN_Downloads](https://cranlogs.r-pkg.org/badges/grand-total/AMR)](http://cran.r-project.org/package=AMR)
 [![CRAN_Downloads](https://cranlogs.r-pkg.org/badges/AMR)](https://cranlogs.r-pkg.org/downloads/daily/last-month/AMR)
@ -122,80 +125,91 @@ after
 ```
 ### Frequency tables
-Base R lacks a simple function to create frequency tables. We created such a function that works with almost all data types: `freq` (or `frequency_tbl`).
+Base R lacks a simple function to create frequency tables. We created such a function that works with almost all data types: `freq` (or `frequency_tbl`). It can be used in two ways:
 ```r
-## Factors sort on item by default:
+# Like base R:
 freq(mydata$myvariable)
-freq(septic_patients$hospital_id)
+# And like tidyverse:
 mydata %>% freq(myvariable)
 ```
 Factors sort on item by default:
 ```r
 septic_patients %>% freq(hospital_id)
 # Frequency table of `hospital_id`
 # Class:     factor
 # Length:    2000 (of which NA: 0 = 0.0%)
 # Unique:    5
 # 
-# Item    Count   Percent   Cum. Count   Cum. Percent   (Factor Level)
+#      Item    Count   Percent   Cum. Count   Cum. Percent   (Factor Level)
-# -----  ------  --------  -----------  -------------  ---------------
+# ---  -----  ------  --------  -----------  -------------  ---------------
-# A         233     11.7%          233          11.7%                1
+# 1    A         233     11.7%          233          11.7%                1
-# B         583     29.1%          816          40.8%                2
+# 2    B         583     29.1%          816          40.8%                2
-# C         221     11.1%         1037          51.8%                3
+# 3    C         221     11.1%         1037          51.8%                3
-# D         650     32.5%         1687          84.4%                4
+# 4    D         650     32.5%         1687          84.4%                4
-# E         313     15.7%         2000         100.0%                5
+# 5    E         313     15.7%         2000         100.0%                5
 ```
-
+This can be changed with the `sort.count` parameter:
-## This can be changed with the `sort.count` parameter:
+```r
-
+septic_patients %>% freq(hospital_id, sort.count = TRUE)
-freq(septic_patients$hospital_id, sort.count = TRUE)
+# Frequency table of `hospital_id`
 # Class:     factor
 # Length:    2000 (of which NA: 0 = 0.0%)
 # Unique:    5
 # 
-# Item    Count   Percent   Cum. Count   Cum. Percent   (Factor Level)
+#      Item    Count   Percent   Cum. Count   Cum. Percent   (Factor Level)
-# -----  ------  --------  -----------  -------------  ---------------
+# ---  -----  ------  --------  -----------  -------------  ---------------
-# D         650     32.5%          650          32.5%                4
+# 1    D         650     32.5%          650          32.5%                4
-# B         583     29.1%         1233          61.7%                2
+# 2    B         583     29.1%         1233          61.7%                2
-# E         313     15.7%         1546          77.3%                5
+# 3    E         313     15.7%         1546          77.3%                5
-# A         233     11.7%         1779          88.9%                1
+# 4    A         233     11.7%         1779          88.9%                1
-# C         221     11.1%         2000         100.0%                3
+# 5    C         221     11.1%         2000         100.0%                3
 ```
-
+All other types, like numbers, characters and dates, sort on count by default:
-## Other types, like numbers or dates, sort on count by default:
+```r
-
+septic_patients %>% freq(date)
-> freq(septic_patients$date)
+# Frequency table of `date` 
 # Class:     Date
 # Length:    2000 (of which NA: 0 = 0.0%)
 # Unique:    1662
 # 
 # Oldest:    2 January 2001
 # Newest:    18 October 2017 (+6133)
 # Median:    6 December 2009 (~53%)
 # 
-# Item          Count   Percent   Cum. Count   Cum. Percent
+#      Item          Count   Percent   Cum. Count   Cum. Percent
-# -----------  ------  --------  -----------  -------------
+# ---  -----------  ------  --------  -----------  -------------
-# 2008-12-24        5      0.2%            5           0.2%
+# 1    2008-12-24        5      0.2%            5           0.2%
-# 2010-12-10        4      0.2%            9           0.4%
+# 2    2010-12-10        4      0.2%            9           0.4%
-# 2011-03-03        4      0.2%           13           0.6%
+# 3    2011-03-03        4      0.2%           13           0.6%
-# 2013-06-24        4      0.2%           17           0.8%
+# 4    2013-06-24        4      0.2%           17           0.8%
-# 2017-09-01        4      0.2%           21           1.1%
+# 5    2017-09-01        4      0.2%           21           1.1%
-# 2002-09-02        3      0.2%           24           1.2%
+# 6    2002-09-02        3      0.2%           24           1.2%
-# 2003-10-14        3      0.2%           27           1.4%
+# 7    2003-10-14        3      0.2%           27           1.4%
-# 2004-06-25        3      0.2%           30           1.5%
+# 8    2004-06-25        3      0.2%           30           1.5%
-# 2004-06-27        3      0.2%           33           1.7%
+# 9    2004-06-27        3      0.2%           33           1.7%
-# 2004-10-29        3      0.2%           36           1.8%
+# 10   2004-10-29        3      0.2%           36           1.8%
-# 2005-09-27        3      0.2%           39           2.0%
+# 11   2005-09-27        3      0.2%           39           2.0%
-# 2006-08-01        3      0.2%           42           2.1%
+# 12   2006-08-01        3      0.2%           42           2.1%
-# 2006-10-10        3      0.2%           45           2.2%
+# 13   2006-10-10        3      0.2%           45           2.2%
-# 2007-11-16        3      0.2%           48           2.4%
+# 14   2007-11-16        3      0.2%           48           2.4%
-# 2008-03-09        3      0.2%           51           2.5%
+# 15   2008-03-09        3      0.2%           51           2.5%
-# ... and 1647 more (n = 1949; 97.5%). Use `nmax` to show more rows.
+# [ reached getOption("max.print.freq") -- omitted 1647 entries, n = 1949 (97.5%) ]
-
+```
-
+For numeric values, some extra descriptive statistics will be calculated:
-## For numeric values, some extra descriptive statistics will be calculated:
+```r
-
+freq(runif(n = 10, min = 1, max = 5))
-> freq(runif(n = 10, min = 1, max = 5))
+# Frequency table 
 # Class:     numeric
 # Length:    10 (of which NA: 0 = 0.0%)
 # Unique:    10
 #   
-# Mean:      3
+# Mean:      2.9
-# Std. dev.: 0.93 (CV: 0.31)
+# Std. dev.: 1.3 (CV: 0.43, MAD: 1.5)
-# Five-Num:  1.1  |  2.3  |  3.1  |  3.8  |  4.0 (CQV: 0.25)
+# Five-Num:  1.5 | 1.7 | 2.6 | 4.0 | 4.7 (IQR: 2.3, CQV: 0.4)
 # Outliers:  0
 # 
 #      Item   Count   Percent   Cum. Count   Cum. Percent
--- a/man/freq.Rd
+++ b/man/freq.Rd
@ -21,9 +21,9 @@ top_freq(f, n)
  15), ...)
 }
 \arguments{
-\item{x}{vector with items, or a \code{data.frame}}
+\item{x}{vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}}
-\item{...}{up to nine different columns of \code{x} to calculate frequencies from, see Examples}
+\item{...}{up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples}
 \item{sort.count}{sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.}
@ -83,20 +83,15 @@ freq(septic_patients[, "hospital_id"])
 septic_patients$hospital_id \%>\% freq()
 septic_patients[, "hospital_id"] \%>\% freq()
 septic_patients \%>\% freq("hospital_id")
-septic_patients \%>\% freq(hospital_id)  # <- easiest to remember when used to tidyverse
+septic_patients \%>\% freq(hospital_id)  #<- easiest to remember when you're used to tidyverse
-# you could use `select`...
+# you could also use `select` or `pull` to get your variables
 septic_patients \%>\%
  filter(hospital_id == "A") \%>\%
  select(bactid) \%>\%
  freq()
-# ... or you use `freq` to select it immediately
+# multiple selected variables will be pasted together
 septic_patients \%>\%
  filter(hospital_id == "A") \%>\%
  freq(bactid)
 # select multiple columns; they will be pasted together
 septic_patients \%>\%
  left_join_microorganisms \%>\%
  filter(hospital_id == "A") \%>\%
@ -113,13 +108,40 @@ years <- septic_patients \%>\%
  mutate(year = format(date, "\%Y")) \%>\%
  freq(year)
-# print only top 5
+# show only the top 5
 years \%>\% print(nmax = 5)
-# transform to plain data.frame
+# print a histogram of numeric values
 septic_patients \%>\%
  freq(age) \%>\%
  hist() # prettier: ggplot(septic_patients, aes(age)) + geom_histogram()
 # or print all points to a regular plot
 septic_patients \%>\%
  freq(age) \%>\%
  plot()
 # transform to a data.frame or tibble
 septic_patients \%>\%
  freq(age) \%>\%
  as.data.frame()
 # or transform (back) to a vector
 septic_patients \%>\%
  freq(age) \%>\%
  as.vector()
 identical(septic_patients \%>\%
            freq(age) \%>\%
            as.vector() \%>\%
            sort(),
          sort(septic_patients$age)
 ) # TRUE
 # also supports table:
 table(septic_patients$sex,
      septic_patients$age) \%>\%
  freq()
 }
 \keyword{freq}
 \keyword{frequency}
--- a/tests/testthat/test-freq.R
+++ b/tests/testthat/test-freq.R
@ -9,12 +9,16 @@ test_that("frequency table works", {
  expect_equal(nrow(freq(septic_patients$date)),
               length(unique(septic_patients$date)))
-  # int
+  # character
  expect_output(print(freq(septic_patients$bactid)))
  # integer
  expect_output(print(freq(septic_patients$age)))
  # date
  expect_output(print(freq(septic_patients$date)))
  # factor
  expect_output(print(freq(septic_patients$hospital_id)))
  # table
  expect_output(print(freq(table(septic_patients$sex, septic_patients$age))))
  library(dplyr)
  expect_output(septic_patients %>% select(1:2) %>% freq() %>% print())
@ -53,5 +57,14 @@ test_that("frequency table works", {
  plot(freq(septic_patients, age))
  hist(freq(septic_patients, age))
  # check vector
  expect_identical(septic_patients %>%
                     freq(age) %>%
                     as.vector() %>%
                     sort(),
                   septic_patients %>%
                     pull(age) %>%
                     sort())
 })