freq: support for table

2026-07-19 01:50:55 +02:00 · 2018-07-09 14:02:58 +02:00
parent 18c91786bf
commit fc30d3fb13
9 changed files with 199 additions and 93 deletions
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,3 +1,4 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
 .travis.yml
+.zenodo.json
--- a/3
+++ b/3
@@ -43,7 +43,8 @@ Suggests:
    testthat (>= 1.0.2),
    covr (>= 3.0.1),
    rmarkdown,
-    rstudioapi
+    rstudioapi,
+    tidyr
 VignetteBuilder: knitr
 URL: https://github.com/msberends/AMR
 BugReports: https://github.com/msberends/AMR/issues
--- a/5
+++ b/5
@@ -4,6 +4,8 @@ S3method(as.data.frame,frequency_tbl)
 S3method(as.double,mic)
 S3method(as.integer,mic)
 S3method(as.numeric,mic)
+S3method(as.vector,frequency_tbl)
+S3method(as_tibble,frequency_tbl)
 S3method(barplot,mic)
 S3method(barplot,rsi)
 S3method(hist,frequency_tbl)
@@ -69,6 +71,8 @@ exportMethods(as.data.frame.frequency_tbl)
 exportMethods(as.double.mic)
 exportMethods(as.integer.mic)
 exportMethods(as.numeric.mic)
+exportMethods(as.vector.frequency_tbl)
+exportMethods(as_tibble.frequency_tbl)
 exportMethods(barplot.mic)
 exportMethods(barplot.rsi)
 exportMethods(hist.frequency_tbl)
@@ -147,6 +151,7 @@ importFrom(stats,sd)
 importFrom(tibble,tibble)
 importFrom(utils,View)
 importFrom(utils,browseVignettes)
+importFrom(utils,installed.packages)
 importFrom(utils,object.size)
 importFrom(utils,packageDescription)
 importFrom(utils,read.delim)
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,7 +4,9 @@
 * For convience, descriptive statistical functions `kurtosis` and `skewness` that are lacking in base R - they are generic functions and have support for vectors, data.frames and matrices
 * New for frequency tables (function `freq`):
  * A vignette to explain its usage
+  * Support for `table` to use as input: `freq(table(x, y))`
  * Support for existing functions `hist` and `plot` to use a frequency table as input: `hist(freq(df$age))`
+  * Support for `as.vector`, `as.data.frame` and `as_tibble`
  * Support for quasiquotation: `freq(mydata, mycolumn)` is the same as `mydata %>% freq(mycolumn)`
  * Function `top_freq` function to return the top/below *n* items as vector
  * Header of frequency tables now also show Mean Absolute Deviaton (MAD) and Interquartile Range (IQR)
--- a/R/freq.R
+++ b/R/freq.R
@@ -19,8 +19,8 @@
 #' Frequency table
 #'
 #' Create a frequency table of a vector with items or a data frame. Supports quasiquotation and markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names.
-#' @param x vector with items, or a \code{data.frame}
-#' @param ... up to nine different columns of \code{x} to calculate frequencies from, see Examples
+#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}
+#' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples
 #' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.
 #' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.
 #' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header will always print the amount of \code{NA}s.
@@ -56,7 +56,7 @@
 #' @importFrom stats fivenum sd mad
 #' @importFrom grDevices boxplot.stats
 #' @importFrom dplyr %>% select pull n_distinct group_by arrange desc mutate summarise n_distinct
-#' @importFrom utils browseVignettes
+#' @importFrom utils browseVignettes installed.packages
 #' @importFrom tibble tibble
 #' @keywords summary summarise frequency freq
 #' @rdname freq
@@ -72,20 +72,15 @@
 #' septic_patients$hospital_id %>% freq()
 #' septic_patients[, "hospital_id"] %>% freq()
 #' septic_patients %>% freq("hospital_id")
-#' septic_patients %>% freq(hospital_id)  # <- easiest to remember when used to tidyverse
+#' septic_patients %>% freq(hospital_id)  #<- easiest to remember when you're used to tidyverse
 #'
-#' # you could use `select`...
+#' # you could also use `select` or `pull` to get your variables
 #' septic_patients %>%
 #'   filter(hospital_id == "A") %>%
 #'   select(bactid) %>%
 #'   freq()
 #'
-#' # ... or you use `freq` to select it immediately
-#' septic_patients %>%
-#'   filter(hospital_id == "A") %>%
-#'   freq(bactid)
-#'
-#' # select multiple columns; they will be pasted together
+#' # multiple selected variables will be pasted together
 #' septic_patients %>%
 #'   left_join_microorganisms %>%
 #'   filter(hospital_id == "A") %>%
@@ -102,13 +97,40 @@
 #'   mutate(year = format(date, "%Y")) %>%
 #'   freq(year)
 #'
-#' # print only top 5
+#' # show only the top 5
 #' years %>% print(nmax = 5)
 #'
-#' # transform to plain data.frame
+#' # print a histogram of numeric values
+#' septic_patients %>%
+#'   freq(age) %>%
+#'   hist() # prettier: ggplot(septic_patients, aes(age)) + geom_histogram()
+#'
+#' # or print all points to a regular plot
+#' septic_patients %>%
+#'   freq(age) %>%
+#'   plot()
+#'
+#' # transform to a data.frame or tibble
 #' septic_patients %>%
 #'   freq(age) %>%
 #'   as.data.frame()
+#'
+#' # or transform (back) to a vector
+#' septic_patients %>%
+#'   freq(age) %>%
+#'   as.vector()
+#'
+#' identical(septic_patients %>%
+#'             freq(age) %>%
+#'             as.vector() %>%
+#'             sort(),
+#'           sort(septic_patients$age)
+#' ) # TRUE
+#'
+#' # also supports table:
+#' table(septic_patients$sex,
+#'       septic_patients$age) %>%
+#'   freq()
 frequency_tbl <- function(x,
                          ...,
                          sort.count = TRUE,
@@ -138,6 +160,24 @@ frequency_tbl <- function(x,
    } else {
      cols <- NULL
    }
+  } else if (any(class(x) == 'table')) {
+    if (!"tidyr" %in% rownames(installed.packages())) {
+      stop('transformation from `table` to frequency table requires the tidyr package.', call. = FALSE)
+    }
+    values <- x %>%
+      as.data.frame(stringsAsFactors = FALSE) %>%
+      # delete last variable: these are frequencies
+      select(-ncol(.)) %>%
+      # paste all other columns:
+      tidyr::unite(sep = sep) %>%
+      .[, 1]
+    counts <- x %>%
+      as.data.frame(stringsAsFactors = FALSE) %>%
+      # get last variable: these are frequencies
+      pull(ncol(.))
+    x <- rep(values, counts)
+    x.name <- NULL
+    cols <- NULL
  } else {
    x.name <- NULL
    cols <- NULL
@@ -523,41 +563,47 @@ as.data.frame.frequency_tbl <- function(x, ...) {
  as.data.frame.data.frame(x, ...)
 }

+#' @noRd
+#' @exportMethod as_tibble.frequency_tbl
+#' @export
+#' @importFrom dplyr as_tibble
+as_tibble.frequency_tbl <- function(x, validate = TRUE, ..., rownames = NA) {
+  attr(x, 'package') <- NULL
+  attr(x, 'package.version') <- NULL
+  attr(x, 'opt') <- NULL
+  as_tibble(x = as.data.frame(x), validate = validate, ..., rownames = rownames)
+}
+
 #' @noRd
 #' @exportMethod hist.frequency_tbl
 #' @export
-#' @importFrom dplyr %>% pull
 #' @importFrom graphics hist
 hist.frequency_tbl <- function(x, ...) {
-
  opt <- attr(x, 'opt')
-
  if (!is.null(opt$vars)) {
    title <- opt$vars
  } else {
    title <- ""
  }
-
-  items <- x %>% pull(item)
-  counts <- x %>% pull(count)
-  vect <- rep(items, counts)
-  hist(vect, main = paste("Histogram of", title), xlab = title, ...)
+  hist(as.vector(x), main = paste("Histogram of", title), xlab = title, ...)
 }

 #' @noRd
 #' @exportMethod plot.frequency_tbl
 #' @export
-#' @importFrom dplyr %>% pull
 plot.frequency_tbl <- function(x, y, ...) {
  opt <- attr(x, 'opt')
-
  if (!is.null(opt$vars)) {
    title <- opt$vars
  } else {
    title <- ""
  }
-
-  items <- x %>% pull(item)
-  counts <- x %>% pull(count)
-  plot(x = items, y = counts, ylab = "Count", xlab = title, ...)
+  plot(x = x$item, y = x$count, ylab = "Count", xlab = title, ...)
+}
+
+#' @noRd
+#' @exportMethod as.vector.frequency_tbl
+#' @export
+as.vector.frequency_tbl <- function(x, mode = "any") {
+  as.vector(rep(x$item, x$count), mode = mode)
 }
--- a/R/globals.R
+++ b/R/globals.R
@@ -22,6 +22,7 @@ globalVariables(c('abname',
                  'bactid',
                  'cnt',
                  'count',
+                  'counts',
                  'cum_count',
                  'cum_percent',
                  'date_lab',
@@ -50,6 +51,7 @@ globalVariables(c('abname',
                  'septic_patients',
                  'species',
                  'umcg',
+                  'values',
                  'View',
                  'y',
                  '.'))
--- a/README.md
+++ b/README.md
@@ -47,9 +47,12 @@ With the `MDRO` function (abbreviation of Multi Drug Resistant Organisms), you c
 This package is available on CRAN and also here on GitHub.

 ### From CRAN (recommended)
+Latest released version on CRAN:
+
 [![CRAN_Badge](https://img.shields.io/cran/v/AMR.svg?label=CRAN&colorB=3679BC)](http://cran.r-project.org/package=AMR)

-Downloads via RStudio CRAN server (downloads by all other CRAN mirrors not measured):
+Downloads via RStudio CRAN server (downloads by all other CRAN mirrors **not** measured, including the official https://cran.r-project.org):
+
 [![CRAN_Downloads](https://cranlogs.r-pkg.org/badges/grand-total/AMR)](http://cran.r-project.org/package=AMR)
 [![CRAN_Downloads](https://cranlogs.r-pkg.org/badges/AMR)](https://cranlogs.r-pkg.org/downloads/daily/last-month/AMR)
  
@@ -122,80 +125,91 @@ after
 ```

 ### Frequency tables
-Base R lacks a simple function to create frequency tables. We created such a function that works with almost all data types: `freq` (or `frequency_tbl`).
+Base R lacks a simple function to create frequency tables. We created such a function that works with almost all data types: `freq` (or `frequency_tbl`). It can be used in two ways:
 ```r
-## Factors sort on item by default:
+# Like base R:
+freq(mydata$myvariable)

-freq(septic_patients$hospital_id)
+# And like tidyverse:
+mydata %>% freq(myvariable)
+```
+
+Factors sort on item by default:
+```r
+septic_patients %>% freq(hospital_id)
+# Frequency table of `hospital_id`
 # Class:     factor
 # Length:    2000 (of which NA: 0 = 0.0%)
 # Unique:    5
 # 
 #      Item    Count   Percent   Cum. Count   Cum. Percent   (Factor Level)
-# -----  ------  --------  -----------  -------------  ---------------
-# A         233     11.7%          233          11.7%                1
-# B         583     29.1%          816          40.8%                2
-# C         221     11.1%         1037          51.8%                3
-# D         650     32.5%         1687          84.4%                4
-# E         313     15.7%         2000         100.0%                5
+# ---  -----  ------  --------  -----------  -------------  ---------------
+# 1    A         233     11.7%          233          11.7%                1
+# 2    B         583     29.1%          816          40.8%                2
+# 3    C         221     11.1%         1037          51.8%                3
+# 4    D         650     32.5%         1687          84.4%                4
+# 5    E         313     15.7%         2000         100.0%                5
+```

-
-## This can be changed with the `sort.count` parameter:
-
-freq(septic_patients$hospital_id, sort.count = TRUE)
+This can be changed with the `sort.count` parameter:
+```r
+septic_patients %>% freq(hospital_id, sort.count = TRUE)
+# Frequency table of `hospital_id`
 # Class:     factor
 # Length:    2000 (of which NA: 0 = 0.0%)
 # Unique:    5
 # 
 #      Item    Count   Percent   Cum. Count   Cum. Percent   (Factor Level)
-# -----  ------  --------  -----------  -------------  ---------------
-# D         650     32.5%          650          32.5%                4
-# B         583     29.1%         1233          61.7%                2
-# E         313     15.7%         1546          77.3%                5
-# A         233     11.7%         1779          88.9%                1
-# C         221     11.1%         2000         100.0%                3
+# ---  -----  ------  --------  -----------  -------------  ---------------
+# 1    D         650     32.5%          650          32.5%                4
+# 2    B         583     29.1%         1233          61.7%                2
+# 3    E         313     15.7%         1546          77.3%                5
+# 4    A         233     11.7%         1779          88.9%                1
+# 5    C         221     11.1%         2000         100.0%                3
+```

-
-## Other types, like numbers or dates, sort on count by default:
-
-> freq(septic_patients$date)
+All other types, like numbers, characters and dates, sort on count by default:
+```r
+septic_patients %>% freq(date)
+# Frequency table of `date` 
 # Class:     Date
 # Length:    2000 (of which NA: 0 = 0.0%)
 # Unique:    1662
 # 
 # Oldest:    2 January 2001
 # Newest:    18 October 2017 (+6133)
+# Median:    6 December 2009 (~53%)
 # 
 #      Item          Count   Percent   Cum. Count   Cum. Percent
-# -----------  ------  --------  -----------  -------------
-# 2008-12-24        5      0.2%            5           0.2%
-# 2010-12-10        4      0.2%            9           0.4%
-# 2011-03-03        4      0.2%           13           0.6%
-# 2013-06-24        4      0.2%           17           0.8%
-# 2017-09-01        4      0.2%           21           1.1%
-# 2002-09-02        3      0.2%           24           1.2%
-# 2003-10-14        3      0.2%           27           1.4%
-# 2004-06-25        3      0.2%           30           1.5%
-# 2004-06-27        3      0.2%           33           1.7%
-# 2004-10-29        3      0.2%           36           1.8%
-# 2005-09-27        3      0.2%           39           2.0%
-# 2006-08-01        3      0.2%           42           2.1%
-# 2006-10-10        3      0.2%           45           2.2%
-# 2007-11-16        3      0.2%           48           2.4%
-# 2008-03-09        3      0.2%           51           2.5%
-# ... and 1647 more (n = 1949; 97.5%). Use `nmax` to show more rows.
-
-
-## For numeric values, some extra descriptive statistics will be calculated:
-
-> freq(runif(n = 10, min = 1, max = 5))
+# ---  -----------  ------  --------  -----------  -------------
+# 1    2008-12-24        5      0.2%            5           0.2%
+# 2    2010-12-10        4      0.2%            9           0.4%
+# 3    2011-03-03        4      0.2%           13           0.6%
+# 4    2013-06-24        4      0.2%           17           0.8%
+# 5    2017-09-01        4      0.2%           21           1.1%
+# 6    2002-09-02        3      0.2%           24           1.2%
+# 7    2003-10-14        3      0.2%           27           1.4%
+# 8    2004-06-25        3      0.2%           30           1.5%
+# 9    2004-06-27        3      0.2%           33           1.7%
+# 10   2004-10-29        3      0.2%           36           1.8%
+# 11   2005-09-27        3      0.2%           39           2.0%
+# 12   2006-08-01        3      0.2%           42           2.1%
+# 13   2006-10-10        3      0.2%           45           2.2%
+# 14   2007-11-16        3      0.2%           48           2.4%
+# 15   2008-03-09        3      0.2%           51           2.5%
+# [ reached getOption("max.print.freq") -- omitted 1647 entries, n = 1949 (97.5%) ]
+```
+For numeric values, some extra descriptive statistics will be calculated:
+```r
+freq(runif(n = 10, min = 1, max = 5))
+# Frequency table 
 # Class:     numeric
 # Length:    10 (of which NA: 0 = 0.0%)
 # Unique:    10
 #   
-# Mean:      3
-# Std. dev.: 0.93 (CV: 0.31)
-# Five-Num:  1.1  |  2.3  |  3.1  |  3.8  |  4.0 (CQV: 0.25)
+# Mean:      2.9
+# Std. dev.: 1.3 (CV: 0.43, MAD: 1.5)
+# Five-Num:  1.5 | 1.7 | 2.6 | 4.0 | 4.7 (IQR: 2.3, CQV: 0.4)
 # Outliers:  0
 # 
 #      Item   Count   Percent   Cum. Count   Cum. Percent
--- a/man/freq.Rd
+++ b/man/freq.Rd
@@ -21,9 +21,9 @@ top_freq(f, n)
  15), ...)
 }
 \arguments{
-\item{x}{vector with items, or a \code{data.frame}}
+\item{x}{vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}}

-\item{...}{up to nine different columns of \code{x} to calculate frequencies from, see Examples}
+\item{...}{up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples}

 \item{sort.count}{sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.}

@@ -83,20 +83,15 @@ freq(septic_patients[, "hospital_id"])
 septic_patients$hospital_id \%>\% freq()
 septic_patients[, "hospital_id"] \%>\% freq()
 septic_patients \%>\% freq("hospital_id")
-septic_patients \%>\% freq(hospital_id)  # <- easiest to remember when used to tidyverse
+septic_patients \%>\% freq(hospital_id)  #<- easiest to remember when you're used to tidyverse

-# you could use `select`...
+# you could also use `select` or `pull` to get your variables
 septic_patients \%>\%
  filter(hospital_id == "A") \%>\%
  select(bactid) \%>\%
  freq()

-# ... or you use `freq` to select it immediately
-septic_patients \%>\%
-  filter(hospital_id == "A") \%>\%
-  freq(bactid)
-
-# select multiple columns; they will be pasted together
+# multiple selected variables will be pasted together
 septic_patients \%>\%
  left_join_microorganisms \%>\%
  filter(hospital_id == "A") \%>\%
@@ -113,13 +108,40 @@ years <- septic_patients \%>\%
  mutate(year = format(date, "\%Y")) \%>\%
  freq(year)

-# print only top 5
+# show only the top 5
 years \%>\% print(nmax = 5)

-# transform to plain data.frame
+# print a histogram of numeric values
+septic_patients \%>\%
+  freq(age) \%>\%
+  hist() # prettier: ggplot(septic_patients, aes(age)) + geom_histogram()
+
+# or print all points to a regular plot
+septic_patients \%>\%
+  freq(age) \%>\%
+  plot()
+
+# transform to a data.frame or tibble
 septic_patients \%>\%
  freq(age) \%>\%
  as.data.frame()
+
+# or transform (back) to a vector
+septic_patients \%>\%
+  freq(age) \%>\%
+  as.vector()
+
+identical(septic_patients \%>\%
+            freq(age) \%>\%
+            as.vector() \%>\%
+            sort(),
+          sort(septic_patients$age)
+) # TRUE
+
+# also supports table:
+table(septic_patients$sex,
+      septic_patients$age) \%>\%
+  freq()
 }
 \keyword{freq}
 \keyword{frequency}
--- a/tests/testthat/test-freq.R
+++ b/tests/testthat/test-freq.R
@@ -9,12 +9,16 @@ test_that("frequency table works", {
  expect_equal(nrow(freq(septic_patients$date)),
               length(unique(septic_patients$date)))

-  # int
+  # character
+  expect_output(print(freq(septic_patients$bactid)))
+  # integer
  expect_output(print(freq(septic_patients$age)))
  # date
  expect_output(print(freq(septic_patients$date)))
  # factor
  expect_output(print(freq(septic_patients$hospital_id)))
+  # table
+  expect_output(print(freq(table(septic_patients$sex, septic_patients$age))))

  library(dplyr)
  expect_output(septic_patients %>% select(1:2) %>% freq() %>% print())
@@ -53,5 +57,14 @@ test_that("frequency table works", {
  plot(freq(septic_patients, age))
  hist(freq(septic_patients, age))

+  # check vector
+  expect_identical(septic_patients %>%
+                     freq(age) %>%
+                     as.vector() %>%
+                     sort(),
+                   septic_patients %>%
+                     pull(age) %>%
+                     sort())
+
 })