freq: support for table

This commit is contained in:
dr. M.S. (Matthijs) Berends 2018-07-09 14:02:58 +02:00
parent 18c91786bf
commit fc30d3fb13
9 changed files with 199 additions and 93 deletions

View File

@ -1,3 +1,4 @@
^.*\.Rproj$
^\.Rproj\.user$
.travis.yml
.zenodo.json

View File

@ -43,7 +43,8 @@ Suggests:
testthat (>= 1.0.2),
covr (>= 3.0.1),
rmarkdown,
rstudioapi
rstudioapi,
tidyr
VignetteBuilder: knitr
URL: https://github.com/msberends/AMR
BugReports: https://github.com/msberends/AMR/issues

View File

@ -4,6 +4,8 @@ S3method(as.data.frame,frequency_tbl)
S3method(as.double,mic)
S3method(as.integer,mic)
S3method(as.numeric,mic)
S3method(as.vector,frequency_tbl)
S3method(as_tibble,frequency_tbl)
S3method(barplot,mic)
S3method(barplot,rsi)
S3method(hist,frequency_tbl)
@ -69,6 +71,8 @@ exportMethods(as.data.frame.frequency_tbl)
exportMethods(as.double.mic)
exportMethods(as.integer.mic)
exportMethods(as.numeric.mic)
exportMethods(as.vector.frequency_tbl)
exportMethods(as_tibble.frequency_tbl)
exportMethods(barplot.mic)
exportMethods(barplot.rsi)
exportMethods(hist.frequency_tbl)
@ -147,6 +151,7 @@ importFrom(stats,sd)
importFrom(tibble,tibble)
importFrom(utils,View)
importFrom(utils,browseVignettes)
importFrom(utils,installed.packages)
importFrom(utils,object.size)
importFrom(utils,packageDescription)
importFrom(utils,read.delim)

View File

@ -4,7 +4,9 @@
* For convience, descriptive statistical functions `kurtosis` and `skewness` that are lacking in base R - they are generic functions and have support for vectors, data.frames and matrices
* New for frequency tables (function `freq`):
* A vignette to explain its usage
* Support for `table` to use as input: `freq(table(x, y))`
* Support for existing functions `hist` and `plot` to use a frequency table as input: `hist(freq(df$age))`
* Support for `as.vector`, `as.data.frame` and `as_tibble`
* Support for quasiquotation: `freq(mydata, mycolumn)` is the same as `mydata %>% freq(mycolumn)`
* Function `top_freq` function to return the top/below *n* items as vector
* Header of frequency tables now also show Mean Absolute Deviaton (MAD) and Interquartile Range (IQR)

100
R/freq.R
View File

@ -19,8 +19,8 @@
#' Frequency table
#'
#' Create a frequency table of a vector with items or a data frame. Supports quasiquotation and markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names.
#' @param x vector with items, or a \code{data.frame}
#' @param ... up to nine different columns of \code{x} to calculate frequencies from, see Examples
#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}
#' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples
#' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.
#' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.
#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header will always print the amount of \code{NA}s.
@ -56,7 +56,7 @@
#' @importFrom stats fivenum sd mad
#' @importFrom grDevices boxplot.stats
#' @importFrom dplyr %>% select pull n_distinct group_by arrange desc mutate summarise n_distinct
#' @importFrom utils browseVignettes
#' @importFrom utils browseVignettes installed.packages
#' @importFrom tibble tibble
#' @keywords summary summarise frequency freq
#' @rdname freq
@ -72,20 +72,15 @@
#' septic_patients$hospital_id %>% freq()
#' septic_patients[, "hospital_id"] %>% freq()
#' septic_patients %>% freq("hospital_id")
#' septic_patients %>% freq(hospital_id) # <- easiest to remember when used to tidyverse
#' septic_patients %>% freq(hospital_id) #<- easiest to remember when you're used to tidyverse
#'
#' # you could use `select`...
#' # you could also use `select` or `pull` to get your variables
#' septic_patients %>%
#' filter(hospital_id == "A") %>%
#' select(bactid) %>%
#' freq()
#'
#' # ... or you use `freq` to select it immediately
#' septic_patients %>%
#' filter(hospital_id == "A") %>%
#' freq(bactid)
#'
#' # select multiple columns; they will be pasted together
#' # multiple selected variables will be pasted together
#' septic_patients %>%
#' left_join_microorganisms %>%
#' filter(hospital_id == "A") %>%
@ -102,13 +97,40 @@
#' mutate(year = format(date, "%Y")) %>%
#' freq(year)
#'
#' # print only top 5
#' # show only the top 5
#' years %>% print(nmax = 5)
#'
#' # transform to plain data.frame
#' # print a histogram of numeric values
#' septic_patients %>%
#' freq(age) %>%
#' hist() # prettier: ggplot(septic_patients, aes(age)) + geom_histogram()
#'
#' # or print all points to a regular plot
#' septic_patients %>%
#' freq(age) %>%
#' plot()
#'
#' # transform to a data.frame or tibble
#' septic_patients %>%
#' freq(age) %>%
#' as.data.frame()
#'
#' # or transform (back) to a vector
#' septic_patients %>%
#' freq(age) %>%
#' as.vector()
#'
#' identical(septic_patients %>%
#' freq(age) %>%
#' as.vector() %>%
#' sort(),
#' sort(septic_patients$age)
#' ) # TRUE
#'
#' # also supports table:
#' table(septic_patients$sex,
#' septic_patients$age) %>%
#' freq()
frequency_tbl <- function(x,
...,
sort.count = TRUE,
@ -138,6 +160,24 @@ frequency_tbl <- function(x,
} else {
cols <- NULL
}
} else if (any(class(x) == 'table')) {
if (!"tidyr" %in% rownames(installed.packages())) {
stop('transformation from `table` to frequency table requires the tidyr package.', call. = FALSE)
}
values <- x %>%
as.data.frame(stringsAsFactors = FALSE) %>%
# delete last variable: these are frequencies
select(-ncol(.)) %>%
# paste all other columns:
tidyr::unite(sep = sep) %>%
.[, 1]
counts <- x %>%
as.data.frame(stringsAsFactors = FALSE) %>%
# get last variable: these are frequencies
pull(ncol(.))
x <- rep(values, counts)
x.name <- NULL
cols <- NULL
} else {
x.name <- NULL
cols <- NULL
@ -523,41 +563,47 @@ as.data.frame.frequency_tbl <- function(x, ...) {
as.data.frame.data.frame(x, ...)
}
#' @noRd
#' @exportMethod as_tibble.frequency_tbl
#' @export
#' @importFrom dplyr as_tibble
as_tibble.frequency_tbl <- function(x, validate = TRUE, ..., rownames = NA) {
attr(x, 'package') <- NULL
attr(x, 'package.version') <- NULL
attr(x, 'opt') <- NULL
as_tibble(x = as.data.frame(x), validate = validate, ..., rownames = rownames)
}
#' @noRd
#' @exportMethod hist.frequency_tbl
#' @export
#' @importFrom dplyr %>% pull
#' @importFrom graphics hist
hist.frequency_tbl <- function(x, ...) {
opt <- attr(x, 'opt')
if (!is.null(opt$vars)) {
title <- opt$vars
} else {
title <- ""
}
items <- x %>% pull(item)
counts <- x %>% pull(count)
vect <- rep(items, counts)
hist(vect, main = paste("Histogram of", title), xlab = title, ...)
hist(as.vector(x), main = paste("Histogram of", title), xlab = title, ...)
}
#' @noRd
#' @exportMethod plot.frequency_tbl
#' @export
#' @importFrom dplyr %>% pull
plot.frequency_tbl <- function(x, y, ...) {
opt <- attr(x, 'opt')
if (!is.null(opt$vars)) {
title <- opt$vars
} else {
title <- ""
}
items <- x %>% pull(item)
counts <- x %>% pull(count)
plot(x = items, y = counts, ylab = "Count", xlab = title, ...)
plot(x = x$item, y = x$count, ylab = "Count", xlab = title, ...)
}
#' @noRd
#' @exportMethod as.vector.frequency_tbl
#' @export
as.vector.frequency_tbl <- function(x, mode = "any") {
as.vector(rep(x$item, x$count), mode = mode)
}

View File

@ -22,6 +22,7 @@ globalVariables(c('abname',
'bactid',
'cnt',
'count',
'counts',
'cum_count',
'cum_percent',
'date_lab',
@ -50,6 +51,7 @@ globalVariables(c('abname',
'septic_patients',
'species',
'umcg',
'values',
'View',
'y',
'.'))

118
README.md
View File

@ -47,9 +47,12 @@ With the `MDRO` function (abbreviation of Multi Drug Resistant Organisms), you c
This package is available on CRAN and also here on GitHub.
### From CRAN (recommended)
Latest released version on CRAN:
[![CRAN_Badge](https://img.shields.io/cran/v/AMR.svg?label=CRAN&colorB=3679BC)](http://cran.r-project.org/package=AMR)
Downloads via RStudio CRAN server (downloads by all other CRAN mirrors not measured):
Downloads via RStudio CRAN server (downloads by all other CRAN mirrors **not** measured, including the official https://cran.r-project.org):
[![CRAN_Downloads](https://cranlogs.r-pkg.org/badges/grand-total/AMR)](http://cran.r-project.org/package=AMR)
[![CRAN_Downloads](https://cranlogs.r-pkg.org/badges/AMR)](https://cranlogs.r-pkg.org/downloads/daily/last-month/AMR)
@ -122,80 +125,91 @@ after
```
### Frequency tables
Base R lacks a simple function to create frequency tables. We created such a function that works with almost all data types: `freq` (or `frequency_tbl`).
Base R lacks a simple function to create frequency tables. We created such a function that works with almost all data types: `freq` (or `frequency_tbl`). It can be used in two ways:
```r
## Factors sort on item by default:
# Like base R:
freq(mydata$myvariable)
freq(septic_patients$hospital_id)
# And like tidyverse:
mydata %>% freq(myvariable)
```
Factors sort on item by default:
```r
septic_patients %>% freq(hospital_id)
# Frequency table of `hospital_id`
# Class: factor
# Length: 2000 (of which NA: 0 = 0.0%)
# Unique: 5
#
# Item Count Percent Cum. Count Cum. Percent (Factor Level)
# ----- ------ -------- ----------- ------------- ---------------
# A 233 11.7% 233 11.7% 1
# B 583 29.1% 816 40.8% 2
# C 221 11.1% 1037 51.8% 3
# D 650 32.5% 1687 84.4% 4
# E 313 15.7% 2000 100.0% 5
# Item Count Percent Cum. Count Cum. Percent (Factor Level)
# --- ----- ------ -------- ----------- ------------- ---------------
# 1 A 233 11.7% 233 11.7% 1
# 2 B 583 29.1% 816 40.8% 2
# 3 C 221 11.1% 1037 51.8% 3
# 4 D 650 32.5% 1687 84.4% 4
# 5 E 313 15.7% 2000 100.0% 5
```
## This can be changed with the `sort.count` parameter:
freq(septic_patients$hospital_id, sort.count = TRUE)
This can be changed with the `sort.count` parameter:
```r
septic_patients %>% freq(hospital_id, sort.count = TRUE)
# Frequency table of `hospital_id`
# Class: factor
# Length: 2000 (of which NA: 0 = 0.0%)
# Unique: 5
#
# Item Count Percent Cum. Count Cum. Percent (Factor Level)
# ----- ------ -------- ----------- ------------- ---------------
# D 650 32.5% 650 32.5% 4
# B 583 29.1% 1233 61.7% 2
# E 313 15.7% 1546 77.3% 5
# A 233 11.7% 1779 88.9% 1
# C 221 11.1% 2000 100.0% 3
# Item Count Percent Cum. Count Cum. Percent (Factor Level)
# --- ----- ------ -------- ----------- ------------- ---------------
# 1 D 650 32.5% 650 32.5% 4
# 2 B 583 29.1% 1233 61.7% 2
# 3 E 313 15.7% 1546 77.3% 5
# 4 A 233 11.7% 1779 88.9% 1
# 5 C 221 11.1% 2000 100.0% 3
```
## Other types, like numbers or dates, sort on count by default:
> freq(septic_patients$date)
All other types, like numbers, characters and dates, sort on count by default:
```r
septic_patients %>% freq(date)
# Frequency table of `date`
# Class: Date
# Length: 2000 (of which NA: 0 = 0.0%)
# Unique: 1662
#
# Oldest: 2 January 2001
# Newest: 18 October 2017 (+6133)
# Median: 6 December 2009 (~53%)
#
# Item Count Percent Cum. Count Cum. Percent
# ----------- ------ -------- ----------- -------------
# 2008-12-24 5 0.2% 5 0.2%
# 2010-12-10 4 0.2% 9 0.4%
# 2011-03-03 4 0.2% 13 0.6%
# 2013-06-24 4 0.2% 17 0.8%
# 2017-09-01 4 0.2% 21 1.1%
# 2002-09-02 3 0.2% 24 1.2%
# 2003-10-14 3 0.2% 27 1.4%
# 2004-06-25 3 0.2% 30 1.5%
# 2004-06-27 3 0.2% 33 1.7%
# 2004-10-29 3 0.2% 36 1.8%
# 2005-09-27 3 0.2% 39 2.0%
# 2006-08-01 3 0.2% 42 2.1%
# 2006-10-10 3 0.2% 45 2.2%
# 2007-11-16 3 0.2% 48 2.4%
# 2008-03-09 3 0.2% 51 2.5%
# ... and 1647 more (n = 1949; 97.5%). Use `nmax` to show more rows.
## For numeric values, some extra descriptive statistics will be calculated:
> freq(runif(n = 10, min = 1, max = 5))
# Item Count Percent Cum. Count Cum. Percent
# --- ----------- ------ -------- ----------- -------------
# 1 2008-12-24 5 0.2% 5 0.2%
# 2 2010-12-10 4 0.2% 9 0.4%
# 3 2011-03-03 4 0.2% 13 0.6%
# 4 2013-06-24 4 0.2% 17 0.8%
# 5 2017-09-01 4 0.2% 21 1.1%
# 6 2002-09-02 3 0.2% 24 1.2%
# 7 2003-10-14 3 0.2% 27 1.4%
# 8 2004-06-25 3 0.2% 30 1.5%
# 9 2004-06-27 3 0.2% 33 1.7%
# 10 2004-10-29 3 0.2% 36 1.8%
# 11 2005-09-27 3 0.2% 39 2.0%
# 12 2006-08-01 3 0.2% 42 2.1%
# 13 2006-10-10 3 0.2% 45 2.2%
# 14 2007-11-16 3 0.2% 48 2.4%
# 15 2008-03-09 3 0.2% 51 2.5%
# [ reached getOption("max.print.freq") -- omitted 1647 entries, n = 1949 (97.5%) ]
```
For numeric values, some extra descriptive statistics will be calculated:
```r
freq(runif(n = 10, min = 1, max = 5))
# Frequency table
# Class: numeric
# Length: 10 (of which NA: 0 = 0.0%)
# Unique: 10
#
# Mean: 3
# Std. dev.: 0.93 (CV: 0.31)
# Five-Num: 1.1 | 2.3 | 3.1 | 3.8 | 4.0 (CQV: 0.25)
# Mean: 2.9
# Std. dev.: 1.3 (CV: 0.43, MAD: 1.5)
# Five-Num: 1.5 | 1.7 | 2.6 | 4.0 | 4.7 (IQR: 2.3, CQV: 0.4)
# Outliers: 0
#
# Item Count Percent Cum. Count Cum. Percent

View File

@ -21,9 +21,9 @@ top_freq(f, n)
15), ...)
}
\arguments{
\item{x}{vector with items, or a \code{data.frame}}
\item{x}{vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}}
\item{...}{up to nine different columns of \code{x} to calculate frequencies from, see Examples}
\item{...}{up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples}
\item{sort.count}{sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.}
@ -83,20 +83,15 @@ freq(septic_patients[, "hospital_id"])
septic_patients$hospital_id \%>\% freq()
septic_patients[, "hospital_id"] \%>\% freq()
septic_patients \%>\% freq("hospital_id")
septic_patients \%>\% freq(hospital_id) # <- easiest to remember when used to tidyverse
septic_patients \%>\% freq(hospital_id) #<- easiest to remember when you're used to tidyverse
# you could use `select`...
# you could also use `select` or `pull` to get your variables
septic_patients \%>\%
filter(hospital_id == "A") \%>\%
select(bactid) \%>\%
freq()
# ... or you use `freq` to select it immediately
septic_patients \%>\%
filter(hospital_id == "A") \%>\%
freq(bactid)
# select multiple columns; they will be pasted together
# multiple selected variables will be pasted together
septic_patients \%>\%
left_join_microorganisms \%>\%
filter(hospital_id == "A") \%>\%
@ -113,13 +108,40 @@ years <- septic_patients \%>\%
mutate(year = format(date, "\%Y")) \%>\%
freq(year)
# print only top 5
# show only the top 5
years \%>\% print(nmax = 5)
# transform to plain data.frame
# print a histogram of numeric values
septic_patients \%>\%
freq(age) \%>\%
hist() # prettier: ggplot(septic_patients, aes(age)) + geom_histogram()
# or print all points to a regular plot
septic_patients \%>\%
freq(age) \%>\%
plot()
# transform to a data.frame or tibble
septic_patients \%>\%
freq(age) \%>\%
as.data.frame()
# or transform (back) to a vector
septic_patients \%>\%
freq(age) \%>\%
as.vector()
identical(septic_patients \%>\%
freq(age) \%>\%
as.vector() \%>\%
sort(),
sort(septic_patients$age)
) # TRUE
# also supports table:
table(septic_patients$sex,
septic_patients$age) \%>\%
freq()
}
\keyword{freq}
\keyword{frequency}

View File

@ -9,12 +9,16 @@ test_that("frequency table works", {
expect_equal(nrow(freq(septic_patients$date)),
length(unique(septic_patients$date)))
# int
# character
expect_output(print(freq(septic_patients$bactid)))
# integer
expect_output(print(freq(septic_patients$age)))
# date
expect_output(print(freq(septic_patients$date)))
# factor
expect_output(print(freq(septic_patients$hospital_id)))
# table
expect_output(print(freq(table(septic_patients$sex, septic_patients$age))))
library(dplyr)
expect_output(septic_patients %>% select(1:2) %>% freq() %>% print())
@ -53,5 +57,14 @@ test_that("frequency table works", {
plot(freq(septic_patients, age))
hist(freq(septic_patients, age))
# check vector
expect_identical(septic_patients %>%
freq(age) %>%
as.vector() %>%
sort(),
septic_patients %>%
pull(age) %>%
sort())
})