From fc30d3fb13e7cf2580e131feb637ac6928ec108f Mon Sep 17 00:00:00 2001 From: "Matthijs S. Berends" Date: Mon, 9 Jul 2018 14:02:58 +0200 Subject: [PATCH] freq: support for table --- .Rbuildignore | 1 + DESCRIPTION | 3 +- NAMESPACE | 5 ++ NEWS.md | 2 + R/freq.R | 100 ++++++++++++++++++++++--------- R/globals.R | 2 + README.md | 118 +++++++++++++++++++++---------------- man/freq.Rd | 46 +++++++++++---- tests/testthat/test-freq.R | 15 ++++- 9 files changed, 199 insertions(+), 93 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index b9c851b0..dbd2c8e7 100755 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,4 @@ ^.*\.Rproj$ ^\.Rproj\.user$ .travis.yml +.zenodo.json diff --git a/DESCRIPTION b/DESCRIPTION index e986ca3b..0c6779df 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,7 +43,8 @@ Suggests: testthat (>= 1.0.2), covr (>= 3.0.1), rmarkdown, - rstudioapi + rstudioapi, + tidyr VignetteBuilder: knitr URL: https://github.com/msberends/AMR BugReports: https://github.com/msberends/AMR/issues diff --git a/NAMESPACE b/NAMESPACE index 70c9dd55..ff05410e 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,8 @@ S3method(as.data.frame,frequency_tbl) S3method(as.double,mic) S3method(as.integer,mic) S3method(as.numeric,mic) +S3method(as.vector,frequency_tbl) +S3method(as_tibble,frequency_tbl) S3method(barplot,mic) S3method(barplot,rsi) S3method(hist,frequency_tbl) @@ -69,6 +71,8 @@ exportMethods(as.data.frame.frequency_tbl) exportMethods(as.double.mic) exportMethods(as.integer.mic) exportMethods(as.numeric.mic) +exportMethods(as.vector.frequency_tbl) +exportMethods(as_tibble.frequency_tbl) exportMethods(barplot.mic) exportMethods(barplot.rsi) exportMethods(hist.frequency_tbl) @@ -147,6 +151,7 @@ importFrom(stats,sd) importFrom(tibble,tibble) importFrom(utils,View) importFrom(utils,browseVignettes) +importFrom(utils,installed.packages) importFrom(utils,object.size) importFrom(utils,packageDescription) importFrom(utils,read.delim) diff --git a/NEWS.md b/NEWS.md index 43102c01..0513a02d 100755 --- a/NEWS.md +++ b/NEWS.md @@ -4,7 +4,9 @@ * For convience, descriptive statistical functions `kurtosis` and `skewness` that are lacking in base R - they are generic functions and have support for vectors, data.frames and matrices * New for frequency tables (function `freq`): * A vignette to explain its usage + * Support for `table` to use as input: `freq(table(x, y))` * Support for existing functions `hist` and `plot` to use a frequency table as input: `hist(freq(df$age))` + * Support for `as.vector`, `as.data.frame` and `as_tibble` * Support for quasiquotation: `freq(mydata, mycolumn)` is the same as `mydata %>% freq(mycolumn)` * Function `top_freq` function to return the top/below *n* items as vector * Header of frequency tables now also show Mean Absolute Deviaton (MAD) and Interquartile Range (IQR) diff --git a/R/freq.R b/R/freq.R index af5bc3b5..0d26482d 100755 --- a/R/freq.R +++ b/R/freq.R @@ -19,8 +19,8 @@ #' Frequency table #' #' Create a frequency table of a vector with items or a data frame. Supports quasiquotation and markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names. -#' @param x vector with items, or a \code{data.frame} -#' @param ... up to nine different columns of \code{x} to calculate frequencies from, see Examples +#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}} +#' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples #' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors. #' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows. #' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header will always print the amount of \code{NA}s. @@ -56,7 +56,7 @@ #' @importFrom stats fivenum sd mad #' @importFrom grDevices boxplot.stats #' @importFrom dplyr %>% select pull n_distinct group_by arrange desc mutate summarise n_distinct -#' @importFrom utils browseVignettes +#' @importFrom utils browseVignettes installed.packages #' @importFrom tibble tibble #' @keywords summary summarise frequency freq #' @rdname freq @@ -72,20 +72,15 @@ #' septic_patients$hospital_id %>% freq() #' septic_patients[, "hospital_id"] %>% freq() #' septic_patients %>% freq("hospital_id") -#' septic_patients %>% freq(hospital_id) # <- easiest to remember when used to tidyverse +#' septic_patients %>% freq(hospital_id) #<- easiest to remember when you're used to tidyverse #' -#' # you could use `select`... +#' # you could also use `select` or `pull` to get your variables #' septic_patients %>% #' filter(hospital_id == "A") %>% #' select(bactid) %>% #' freq() #' -#' # ... or you use `freq` to select it immediately -#' septic_patients %>% -#' filter(hospital_id == "A") %>% -#' freq(bactid) -#' -#' # select multiple columns; they will be pasted together +#' # multiple selected variables will be pasted together #' septic_patients %>% #' left_join_microorganisms %>% #' filter(hospital_id == "A") %>% @@ -102,13 +97,40 @@ #' mutate(year = format(date, "%Y")) %>% #' freq(year) #' -#' # print only top 5 +#' # show only the top 5 #' years %>% print(nmax = 5) #' -#' # transform to plain data.frame +#' # print a histogram of numeric values +#' septic_patients %>% +#' freq(age) %>% +#' hist() # prettier: ggplot(septic_patients, aes(age)) + geom_histogram() +#' +#' # or print all points to a regular plot +#' septic_patients %>% +#' freq(age) %>% +#' plot() +#' +#' # transform to a data.frame or tibble #' septic_patients %>% #' freq(age) %>% #' as.data.frame() +#' +#' # or transform (back) to a vector +#' septic_patients %>% +#' freq(age) %>% +#' as.vector() +#' +#' identical(septic_patients %>% +#' freq(age) %>% +#' as.vector() %>% +#' sort(), +#' sort(septic_patients$age) +#' ) # TRUE +#' +#' # also supports table: +#' table(septic_patients$sex, +#' septic_patients$age) %>% +#' freq() frequency_tbl <- function(x, ..., sort.count = TRUE, @@ -138,6 +160,24 @@ frequency_tbl <- function(x, } else { cols <- NULL } + } else if (any(class(x) == 'table')) { + if (!"tidyr" %in% rownames(installed.packages())) { + stop('transformation from `table` to frequency table requires the tidyr package.', call. = FALSE) + } + values <- x %>% + as.data.frame(stringsAsFactors = FALSE) %>% + # delete last variable: these are frequencies + select(-ncol(.)) %>% + # paste all other columns: + tidyr::unite(sep = sep) %>% + .[, 1] + counts <- x %>% + as.data.frame(stringsAsFactors = FALSE) %>% + # get last variable: these are frequencies + pull(ncol(.)) + x <- rep(values, counts) + x.name <- NULL + cols <- NULL } else { x.name <- NULL cols <- NULL @@ -523,41 +563,47 @@ as.data.frame.frequency_tbl <- function(x, ...) { as.data.frame.data.frame(x, ...) } +#' @noRd +#' @exportMethod as_tibble.frequency_tbl +#' @export +#' @importFrom dplyr as_tibble +as_tibble.frequency_tbl <- function(x, validate = TRUE, ..., rownames = NA) { + attr(x, 'package') <- NULL + attr(x, 'package.version') <- NULL + attr(x, 'opt') <- NULL + as_tibble(x = as.data.frame(x), validate = validate, ..., rownames = rownames) +} + #' @noRd #' @exportMethod hist.frequency_tbl #' @export -#' @importFrom dplyr %>% pull #' @importFrom graphics hist hist.frequency_tbl <- function(x, ...) { - opt <- attr(x, 'opt') - if (!is.null(opt$vars)) { title <- opt$vars } else { title <- "" } - - items <- x %>% pull(item) - counts <- x %>% pull(count) - vect <- rep(items, counts) - hist(vect, main = paste("Histogram of", title), xlab = title, ...) + hist(as.vector(x), main = paste("Histogram of", title), xlab = title, ...) } #' @noRd #' @exportMethod plot.frequency_tbl #' @export -#' @importFrom dplyr %>% pull plot.frequency_tbl <- function(x, y, ...) { opt <- attr(x, 'opt') - if (!is.null(opt$vars)) { title <- opt$vars } else { title <- "" } - - items <- x %>% pull(item) - counts <- x %>% pull(count) - plot(x = items, y = counts, ylab = "Count", xlab = title, ...) + plot(x = x$item, y = x$count, ylab = "Count", xlab = title, ...) +} + +#' @noRd +#' @exportMethod as.vector.frequency_tbl +#' @export +as.vector.frequency_tbl <- function(x, mode = "any") { + as.vector(rep(x$item, x$count), mode = mode) } diff --git a/R/globals.R b/R/globals.R index 66c3c07c..b865b41e 100755 --- a/R/globals.R +++ b/R/globals.R @@ -22,6 +22,7 @@ globalVariables(c('abname', 'bactid', 'cnt', 'count', + 'counts', 'cum_count', 'cum_percent', 'date_lab', @@ -50,6 +51,7 @@ globalVariables(c('abname', 'septic_patients', 'species', 'umcg', + 'values', 'View', 'y', '.')) diff --git a/README.md b/README.md index 6080297d..c3e31208 100755 --- a/README.md +++ b/README.md @@ -47,9 +47,12 @@ With the `MDRO` function (abbreviation of Multi Drug Resistant Organisms), you c This package is available on CRAN and also here on GitHub. ### From CRAN (recommended) +Latest released version on CRAN: + [![CRAN_Badge](https://img.shields.io/cran/v/AMR.svg?label=CRAN&colorB=3679BC)](http://cran.r-project.org/package=AMR) -Downloads via RStudio CRAN server (downloads by all other CRAN mirrors not measured): +Downloads via RStudio CRAN server (downloads by all other CRAN mirrors **not** measured, including the official https://cran.r-project.org): + [![CRAN_Downloads](https://cranlogs.r-pkg.org/badges/grand-total/AMR)](http://cran.r-project.org/package=AMR) [![CRAN_Downloads](https://cranlogs.r-pkg.org/badges/AMR)](https://cranlogs.r-pkg.org/downloads/daily/last-month/AMR) @@ -122,80 +125,91 @@ after ``` ### Frequency tables -Base R lacks a simple function to create frequency tables. We created such a function that works with almost all data types: `freq` (or `frequency_tbl`). +Base R lacks a simple function to create frequency tables. We created such a function that works with almost all data types: `freq` (or `frequency_tbl`). It can be used in two ways: ```r -## Factors sort on item by default: +# Like base R: +freq(mydata$myvariable) -freq(septic_patients$hospital_id) +# And like tidyverse: +mydata %>% freq(myvariable) +``` + +Factors sort on item by default: +```r +septic_patients %>% freq(hospital_id) +# Frequency table of `hospital_id` # Class: factor # Length: 2000 (of which NA: 0 = 0.0%) # Unique: 5 # -# Item Count Percent Cum. Count Cum. Percent (Factor Level) -# ----- ------ -------- ----------- ------------- --------------- -# A 233 11.7% 233 11.7% 1 -# B 583 29.1% 816 40.8% 2 -# C 221 11.1% 1037 51.8% 3 -# D 650 32.5% 1687 84.4% 4 -# E 313 15.7% 2000 100.0% 5 +# Item Count Percent Cum. Count Cum. Percent (Factor Level) +# --- ----- ------ -------- ----------- ------------- --------------- +# 1 A 233 11.7% 233 11.7% 1 +# 2 B 583 29.1% 816 40.8% 2 +# 3 C 221 11.1% 1037 51.8% 3 +# 4 D 650 32.5% 1687 84.4% 4 +# 5 E 313 15.7% 2000 100.0% 5 +``` - -## This can be changed with the `sort.count` parameter: - -freq(septic_patients$hospital_id, sort.count = TRUE) +This can be changed with the `sort.count` parameter: +```r +septic_patients %>% freq(hospital_id, sort.count = TRUE) +# Frequency table of `hospital_id` # Class: factor # Length: 2000 (of which NA: 0 = 0.0%) # Unique: 5 # -# Item Count Percent Cum. Count Cum. Percent (Factor Level) -# ----- ------ -------- ----------- ------------- --------------- -# D 650 32.5% 650 32.5% 4 -# B 583 29.1% 1233 61.7% 2 -# E 313 15.7% 1546 77.3% 5 -# A 233 11.7% 1779 88.9% 1 -# C 221 11.1% 2000 100.0% 3 +# Item Count Percent Cum. Count Cum. Percent (Factor Level) +# --- ----- ------ -------- ----------- ------------- --------------- +# 1 D 650 32.5% 650 32.5% 4 +# 2 B 583 29.1% 1233 61.7% 2 +# 3 E 313 15.7% 1546 77.3% 5 +# 4 A 233 11.7% 1779 88.9% 1 +# 5 C 221 11.1% 2000 100.0% 3 +``` - -## Other types, like numbers or dates, sort on count by default: - -> freq(septic_patients$date) +All other types, like numbers, characters and dates, sort on count by default: +```r +septic_patients %>% freq(date) +# Frequency table of `date` # Class: Date # Length: 2000 (of which NA: 0 = 0.0%) # Unique: 1662 # # Oldest: 2 January 2001 # Newest: 18 October 2017 (+6133) +# Median: 6 December 2009 (~53%) # -# Item Count Percent Cum. Count Cum. Percent -# ----------- ------ -------- ----------- ------------- -# 2008-12-24 5 0.2% 5 0.2% -# 2010-12-10 4 0.2% 9 0.4% -# 2011-03-03 4 0.2% 13 0.6% -# 2013-06-24 4 0.2% 17 0.8% -# 2017-09-01 4 0.2% 21 1.1% -# 2002-09-02 3 0.2% 24 1.2% -# 2003-10-14 3 0.2% 27 1.4% -# 2004-06-25 3 0.2% 30 1.5% -# 2004-06-27 3 0.2% 33 1.7% -# 2004-10-29 3 0.2% 36 1.8% -# 2005-09-27 3 0.2% 39 2.0% -# 2006-08-01 3 0.2% 42 2.1% -# 2006-10-10 3 0.2% 45 2.2% -# 2007-11-16 3 0.2% 48 2.4% -# 2008-03-09 3 0.2% 51 2.5% -# ... and 1647 more (n = 1949; 97.5%). Use `nmax` to show more rows. - - -## For numeric values, some extra descriptive statistics will be calculated: - -> freq(runif(n = 10, min = 1, max = 5)) +# Item Count Percent Cum. Count Cum. Percent +# --- ----------- ------ -------- ----------- ------------- +# 1 2008-12-24 5 0.2% 5 0.2% +# 2 2010-12-10 4 0.2% 9 0.4% +# 3 2011-03-03 4 0.2% 13 0.6% +# 4 2013-06-24 4 0.2% 17 0.8% +# 5 2017-09-01 4 0.2% 21 1.1% +# 6 2002-09-02 3 0.2% 24 1.2% +# 7 2003-10-14 3 0.2% 27 1.4% +# 8 2004-06-25 3 0.2% 30 1.5% +# 9 2004-06-27 3 0.2% 33 1.7% +# 10 2004-10-29 3 0.2% 36 1.8% +# 11 2005-09-27 3 0.2% 39 2.0% +# 12 2006-08-01 3 0.2% 42 2.1% +# 13 2006-10-10 3 0.2% 45 2.2% +# 14 2007-11-16 3 0.2% 48 2.4% +# 15 2008-03-09 3 0.2% 51 2.5% +# [ reached getOption("max.print.freq") -- omitted 1647 entries, n = 1949 (97.5%) ] +``` +For numeric values, some extra descriptive statistics will be calculated: +```r +freq(runif(n = 10, min = 1, max = 5)) +# Frequency table # Class: numeric # Length: 10 (of which NA: 0 = 0.0%) # Unique: 10 # -# Mean: 3 -# Std. dev.: 0.93 (CV: 0.31) -# Five-Num: 1.1 | 2.3 | 3.1 | 3.8 | 4.0 (CQV: 0.25) +# Mean: 2.9 +# Std. dev.: 1.3 (CV: 0.43, MAD: 1.5) +# Five-Num: 1.5 | 1.7 | 2.6 | 4.0 | 4.7 (IQR: 2.3, CQV: 0.4) # Outliers: 0 # # Item Count Percent Cum. Count Cum. Percent diff --git a/man/freq.Rd b/man/freq.Rd index b213db3c..f4745802 100755 --- a/man/freq.Rd +++ b/man/freq.Rd @@ -21,9 +21,9 @@ top_freq(f, n) 15), ...) } \arguments{ -\item{x}{vector with items, or a \code{data.frame}} +\item{x}{vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}} -\item{...}{up to nine different columns of \code{x} to calculate frequencies from, see Examples} +\item{...}{up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples} \item{sort.count}{sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.} @@ -83,20 +83,15 @@ freq(septic_patients[, "hospital_id"]) septic_patients$hospital_id \%>\% freq() septic_patients[, "hospital_id"] \%>\% freq() septic_patients \%>\% freq("hospital_id") -septic_patients \%>\% freq(hospital_id) # <- easiest to remember when used to tidyverse +septic_patients \%>\% freq(hospital_id) #<- easiest to remember when you're used to tidyverse -# you could use `select`... +# you could also use `select` or `pull` to get your variables septic_patients \%>\% filter(hospital_id == "A") \%>\% select(bactid) \%>\% freq() -# ... or you use `freq` to select it immediately -septic_patients \%>\% - filter(hospital_id == "A") \%>\% - freq(bactid) - -# select multiple columns; they will be pasted together +# multiple selected variables will be pasted together septic_patients \%>\% left_join_microorganisms \%>\% filter(hospital_id == "A") \%>\% @@ -113,13 +108,40 @@ years <- septic_patients \%>\% mutate(year = format(date, "\%Y")) \%>\% freq(year) -# print only top 5 +# show only the top 5 years \%>\% print(nmax = 5) -# transform to plain data.frame +# print a histogram of numeric values +septic_patients \%>\% + freq(age) \%>\% + hist() # prettier: ggplot(septic_patients, aes(age)) + geom_histogram() + +# or print all points to a regular plot +septic_patients \%>\% + freq(age) \%>\% + plot() + +# transform to a data.frame or tibble septic_patients \%>\% freq(age) \%>\% as.data.frame() + +# or transform (back) to a vector +septic_patients \%>\% + freq(age) \%>\% + as.vector() + +identical(septic_patients \%>\% + freq(age) \%>\% + as.vector() \%>\% + sort(), + sort(septic_patients$age) +) # TRUE + +# also supports table: +table(septic_patients$sex, + septic_patients$age) \%>\% + freq() } \keyword{freq} \keyword{frequency} diff --git a/tests/testthat/test-freq.R b/tests/testthat/test-freq.R index b96730cc..be3de4c3 100755 --- a/tests/testthat/test-freq.R +++ b/tests/testthat/test-freq.R @@ -9,12 +9,16 @@ test_that("frequency table works", { expect_equal(nrow(freq(septic_patients$date)), length(unique(septic_patients$date))) - # int + # character + expect_output(print(freq(septic_patients$bactid))) + # integer expect_output(print(freq(septic_patients$age))) # date expect_output(print(freq(septic_patients$date))) # factor expect_output(print(freq(septic_patients$hospital_id))) + # table + expect_output(print(freq(table(septic_patients$sex, septic_patients$age)))) library(dplyr) expect_output(septic_patients %>% select(1:2) %>% freq() %>% print()) @@ -53,5 +57,14 @@ test_that("frequency table works", { plot(freq(septic_patients, age)) hist(freq(septic_patients, age)) + # check vector + expect_identical(septic_patients %>% + freq(age) %>% + as.vector() %>% + sort(), + septic_patients %>% + pull(age) %>% + sort()) + })