diff --git a/.gitignore b/.gitignore index e4a31cd9..d7381655 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ .Ruserdata AMR.Rproj tests/testthat/Rplots.pdf +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index f06f8f3f..26eb8ac1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 0.2.0 -Date: 2018-05-02 +Version: 0.2.0.9000 +Date: 2018-05-09 Title: Antimicrobial Resistance Analysis Authors@R: c( person( @@ -37,7 +37,10 @@ Imports: tibble Suggests: testthat (>= 1.0.2), - covr (>= 3.0.1) + covr (>= 3.0.1), + knitr, + rmarkdown +VignetteBuilder: knitr URL: https://github.com/msberends/AMR BugReports: https://github.com/msberends/AMR/issues License: GPL-2 | file LICENSE diff --git a/NEWS.md b/NEWS.md index 51194f3d..cffdf5e7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,13 @@ -# 0.2.0 +# 0.2.9000 (development version) +#### New +* Vignettes about frequency tables: [vignettes/freq.html](vignettes/freq.html) +* Possibility to globally set the default for the amount of items to print in frequency tables (`freq` function), with `options(max.print.freq = n)` + +#### Changed +* Renamed `toConsole` parameter of `freq` to `as.data.frame` +* Small translational improvements to the `septic_patients` dataset + +# 0.2.0 (latest stable version) #### New * Full support for Windows, Linux and macOS * Full support for old R versions, only R-3.0.0 (April 2013) or later is needed (needed packages may have other dependencies) diff --git a/R/freq.R b/R/freq.R index bb592c0e..00a791ad 100644 --- a/R/freq.R +++ b/R/freq.R @@ -21,10 +21,10 @@ #' Create a frequency table of a vector of data, a single column or a maximum of 9 columns of a data frame. Supports markdown for reports. #' @param x data #' @param sort.count Sort on count. Use \code{FALSE} to sort alphabetically on item. -#' @param nmax number of row to print. Use \code{nmax = 0} or \code{nmax = NA} to print all rows. -#' @param na.rm a logical value indicating whether NA values should be removed from the frequency table. The header will always print the amount of\code{NA}s. +#' @param nmax number of row to print. The default, \code{15}, uses \code{\link[base]{getOption}("max.print.freq")}. Use \code{nmax = 0} or \code{nmax = NA} to print all rows. +#' @param na.rm a logical value indicating whether NA values should be removed from the frequency table. The header will always print the amount of \code{NA}s. #' @param markdown print table in markdown format (this forces \code{nmax = NA}) -#' @param toConsole Print table to the console. Use \code{FALSE} to assign the table to an object. +#' @param as.data.frame return frequency table without header as a \code{data.frame} (e.g. to assign the table to an object) #' @param digits how many significant digits are to be used for numeric values (not for the items themselves, that depends on \code{\link{getOption}("digits")}) #' @param sep a character string to separate the terms when selecting multiple columns #' @details For numeric values, the next values will be calculated and shown into the header: @@ -32,7 +32,7 @@ #' \item{Mean, using \code{\link[base]{mean}}} #' \item{Standard deviation, using \code{\link[stats]{sd}}} #' \item{Five numbers of Tukey (min, Q1, median, Q3, max), using \code{\link[stats]{fivenum}}} -#' \item{Outliers (count and list), using \code{\link{boxplot.stats}}} +#' \item{Outliers (total count and unique count), using \code{\link{boxplot.stats}}} #' \item{Coefficient of variation (CV), the standard deviation divided by the mean} #' \item{Coefficient of quartile variation (CQV, sometimes called coefficient of dispersion), calculated as \code{(Q3 - Q1) / (Q3 + Q1)} using \code{\link{quantile}} with \code{type = 6} as quantile algorithm to comply with SPSS standards} #' } @@ -63,13 +63,13 @@ #' years <- septic_patients %>% #' mutate(year = format(date, "%Y")) %>% #' select(year) %>% -#' freq(toConsole = FALSE) +#' freq(as.data.frame = TRUE) freq <- function(x, sort.count = TRUE, - nmax = 15, + nmax = getOption("max.print.freq"), na.rm = TRUE, markdown = FALSE, - toConsole = TRUE, + as.data.frame = FALSE, digits = 2, sep = " ") { @@ -156,8 +156,8 @@ freq <- function(x, stop('A maximum of 9 columns can be analysed at the same time.', call. = FALSE) } } - if (markdown == TRUE & toConsole == FALSE) { - warning('`toConsole = FALSE` will be ignored when `markdown = TRUE`.') + if (markdown == TRUE & as.data.frame == TRUE) { + warning('`as.data.frame = TRUE` will be ignored when `markdown = TRUE`.') } if (mult.columns > 1) { @@ -232,7 +232,7 @@ freq <- function(x, x <- x %>% format(formatdates) } - if (toConsole == TRUE) { + if (as.data.frame == FALSE) { cat(header) } @@ -244,22 +244,30 @@ freq <- function(x, warning('All observations are unique.', call. = FALSE) } - if (nmax == 0 | is.na(nmax)) { + nmax.set <- !missing(nmax) + if (is.null(nmax) & is.null(base::getOption("max.print.freq", default = NULL))) { + # default for max print setting + nmax <- 15 + } + + if (nmax == 0 | is.na(nmax) | is.null(nmax)) { nmax <- length(x) } nmax.1 <- min(length(x), nmax + 1) # create table with counts and percentages + column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent', '(Factor Level)') + column_names_df <- c('item', 'count', 'percent', 'cum_count', 'cum_percent', 'factor_level') if (any(class(x) == 'factor')) { df <- tibble::tibble(Item = x, Fctlvl = x %>% as.integer()) %>% group_by(Item, Fctlvl) - column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent', '(Factor Level)') column_align <- c('l', 'r', 'r', 'r', 'r', 'r') } else { df <- tibble::tibble(Item = x) %>% group_by(Item) - column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent') + column_names <- column_names[1:5] # strip factor lvl + column_names_df <- column_names_df[1:5] # strip factor lvl column_align <- c(x_align, 'r', 'r', 'r', 'r') } df <- df %>% @@ -276,10 +284,10 @@ freq <- function(x, # sort according to setting if (sort.count == TRUE) { - df <- df %>% arrange(desc(Count)) + df <- df %>% arrange(desc(Count), Item) } else { if (any(class(x) == 'factor')) { - df <- df %>% arrange(Fctlvl) + df <- df %>% arrange(Fctlvl, Item) } else { df <- df %>% arrange(Item) } @@ -295,65 +303,68 @@ freq <- function(x, df <- df %>% select(Item, Count, Percent, Cum, CumTot, Fctlvl) } + if (as.data.frame == TRUE) { + # assign to object + df[, 3] <- df[, 2] / sum(df[, 2], na.rm = TRUE) + df[, 4] <- cumsum(df[, 2]) + df[, 5] <- df[, 4] / sum(df[, 2], na.rm = TRUE) + colnames(df) <- column_names_df + return(as.data.frame(df, stringsAsFactors = FALSE)) + } + if (markdown == TRUE) { tblformat <- 'markdown' } else { tblformat <- 'pandoc' } - if (toConsole == FALSE) { - # assign to object - df[, 3] <- df[, 2] / sum(df[, 2], na.rm = TRUE) - df[, 4] <- cumsum(df[, 2]) - df[, 5] <- df[, 4] / sum(df[, 2], na.rm = TRUE) - return(df) + # save old NA setting for kable + opt.old <- options()$knitr.kable.NA + options(knitr.kable.NA = "") - } else { + Count.rest <- sum(df[nmax.1:nrow(df), 'Count'], na.rm = TRUE) + if (any(class(x) %in% c('double', 'integer', 'numeric', 'raw', 'single'))) { + df <- df %>% mutate(Item = format(Item)) + } + df <- df %>% mutate(Count = format(Count)) - # save old NA setting for kable - opt.old <- options()$knitr.kable.NA - options(knitr.kable.NA = "") - - Count.rest <- sum(df[nmax.1:nrow(df), 'Count'], na.rm = TRUE) - if (any(class(x) %in% c('double', 'integer', 'numeric', 'raw', 'single'))) { - df <- df %>% mutate(Item = format(Item)) - } - df <- df %>% mutate(Count = format(Count)) - - if (nrow(df) > nmax.1 & markdown == FALSE) { - df2 <- df[1:nmax,] - print( - knitr::kable(df2, - format = tblformat, - col.names = column_names, - align = column_align, - padding = 1) - ) - cat('... and ', - format(nrow(df) - nmax), - ' more ', - paste0('(n = ', - format(Count.rest), - '; ', - (Count.rest / length(x)) %>% percent(force_zero = TRUE), - ')'), - '. Use `nmax` to show more rows.\n', sep = '') - - } else { - print( - knitr::kable(df, - format = tblformat, - col.names = column_names, - align = column_align, - padding = 1) - ) + if (nrow(df) > nmax.1 & markdown == FALSE) { + df2 <- df[1:nmax,] + print( + knitr::kable(df2, + format = tblformat, + col.names = column_names, + align = column_align, + padding = 1) + ) + cat('... and ', + format(nrow(df) - nmax), + ' more ', + paste0('(n = ', + format(Count.rest), + '; ', + (Count.rest / length(x)) %>% percent(force_zero = TRUE), + ')'), + '.', sep = '') + if (nmax.set == FALSE) { + cat(' Use `nmax` to show more or less rows.') } cat('\n') - # reset old kable setting - options(knitr.kable.NA = opt.old) - return(invisible()) + } else { + print( + knitr::kable(df, + format = tblformat, + col.names = column_names, + align = column_align, + padding = 1) + ) } + cat('\n') + + # reset old kable setting + options(knitr.kable.NA = opt.old) + return(invisible()) } #' @rdname freq diff --git a/README.md b/README.md index 4d889336..8107df26 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![logo_rug](man/figures/logo_rug.png)](https://www.rug.nl)[![logo_umcg](man/figures/logo_umcg.png)](https://www.umcg.nl) -This R package was created for academic research by PhD students of the Faculty of Medical Sciences of the [University of Groningen)](https://www.rug.nl) and the Medical Microbiology & Infection Prevention (MMBI) department of the [University Medical Center Groningen (UMCG)](https://www.umcg.nl). See [Authors](#authors). +This R package was created for academic research by PhD students of the Faculty of Medical Sciences of the [University of Groningen](https://www.rug.nl) and the Medical Microbiology & Infection Prevention (MMBI) department of the [University Medical Center Groningen (UMCG)](https://www.umcg.nl). See [Authors](#authors). ## Why this package? This R package contains functions to make **microbiological, epidemiological data analysis easier**. It allows the use of some new classes to work with MIC values and antimicrobial interpretations (i.e. values S, I and R). diff --git a/data/septic_patients.rda b/data/septic_patients.rda index 272d28f9..9a4cb8c3 100644 Binary files a/data/septic_patients.rda and b/data/septic_patients.rda differ diff --git a/man/freq.Rd b/man/freq.Rd index ced3a297..a50bb03e 100644 --- a/man/freq.Rd +++ b/man/freq.Rd @@ -5,24 +5,26 @@ \alias{frequency_tbl} \title{Frequency table} \usage{ -freq(x, sort.count = TRUE, nmax = 15, na.rm = TRUE, markdown = FALSE, - toConsole = TRUE, digits = 2, sep = " ") +freq(x, sort.count = TRUE, nmax = getOption("max.print.freq"), + na.rm = TRUE, markdown = FALSE, as.data.frame = FALSE, digits = 2, + sep = " ") -frequency_tbl(x, sort.count = TRUE, nmax = 15, na.rm = TRUE, - markdown = FALSE, toConsole = TRUE, digits = 2, sep = " ") +frequency_tbl(x, sort.count = TRUE, nmax = getOption("max.print.freq"), + na.rm = TRUE, markdown = FALSE, as.data.frame = FALSE, digits = 2, + sep = " ") } \arguments{ \item{x}{data} \item{sort.count}{Sort on count. Use \code{FALSE} to sort alphabetically on item.} -\item{nmax}{number of row to print. Use \code{nmax = 0} or \code{nmax = NA} to print all rows.} +\item{nmax}{number of row to print. The default, \code{15}, uses \code{\link[base]{getOption}("max.print.freq")}. Use \code{nmax = 0} or \code{nmax = NA} to print all rows.} -\item{na.rm}{a logical value indicating whether NA values should be removed from the frequency table. The header will always print the amount of\code{NA}s.} +\item{na.rm}{a logical value indicating whether NA values should be removed from the frequency table. The header will always print the amount of \code{NA}s.} \item{markdown}{print table in markdown format (this forces \code{nmax = NA})} -\item{toConsole}{Print table to the console. Use \code{FALSE} to assign the table to an object.} +\item{as.data.frame}{return frequency table without header as a \code{data.frame} (e.g. to assign the table to an object)} \item{digits}{how many significant digits are to be used for numeric values (not for the items themselves, that depends on \code{\link{getOption}("digits")})} @@ -37,7 +39,7 @@ For numeric values, the next values will be calculated and shown into the header \item{Mean, using \code{\link[base]{mean}}} \item{Standard deviation, using \code{\link[stats]{sd}}} \item{Five numbers of Tukey (min, Q1, median, Q3, max), using \code{\link[stats]{fivenum}}} - \item{Outliers (count and list), using \code{\link{boxplot.stats}}} + \item{Outliers (total count and unique count), using \code{\link{boxplot.stats}}} \item{Coefficient of variation (CV), the standard deviation divided by the mean} \item{Coefficient of quartile variation (CQV, sometimes called coefficient of dispersion), calculated as \code{(Q3 - Q1) / (Q3 + Q1)} using \code{\link{quantile}} with \code{type = 6} as quantile algorithm to comply with SPSS standards} } @@ -63,7 +65,7 @@ septic_patients \%>\% years <- septic_patients \%>\% mutate(year = format(date, "\%Y")) \%>\% select(year) \%>\% - freq(toConsole = FALSE) + freq(as.data.frame = TRUE) } \keyword{freq} \keyword{frequency} diff --git a/tests/testthat/test-eucast.R b/tests/testthat/test-eucast.R index a7d8a0e1..666f5970 100644 --- a/tests/testthat/test-eucast.R +++ b/tests/testthat/test-eucast.R @@ -1,7 +1,7 @@ context("eucast.R") test_that("EUCAST rules work", { - a <- EUCAST_rules(septic_patients) + a <- suppressWarnings(EUCAST_rules(septic_patients)) a <- data.frame(bactid = c("KLEPNE", # Klebsiella pneumoniae "PSEAER", # Pseudomonas aeruginosa diff --git a/tests/testthat/test-freq.R b/tests/testthat/test-freq.R index 5aa65cb9..02bf695d 100644 --- a/tests/testthat/test-freq.R +++ b/tests/testthat/test-freq.R @@ -1,12 +1,12 @@ context("freq.R") test_that("frequency table works", { - expect_equal(nrow(freq(c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), toConsole = FALSE)), 5) - expect_equal(nrow(frequency_tbl(c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), toConsole = FALSE)), 5) + expect_equal(nrow(freq(c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), as.data.frame = TRUE)), 5) + expect_equal(nrow(frequency_tbl(c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), as.data.frame = TRUE)), 5) # date column of septic_patients should contain 1662 unique dates - expect_equal(nrow(freq(septic_patients$date, toConsole = FALSE)), 1662) - expect_equal(nrow(freq(septic_patients$date, toConsole = FALSE)), + expect_equal(nrow(freq(septic_patients$date, as.data.frame = TRUE)), 1662) + expect_equal(nrow(freq(septic_patients$date, as.data.frame = TRUE)), length(unique(septic_patients$date))) expect_output(freq(septic_patients$age)) diff --git a/tests/testthat/test-mdro.R b/tests/testthat/test-mdro.R index a8fc1a00..d2d38986 100644 --- a/tests/testthat/test-mdro.R +++ b/tests/testthat/test-mdro.R @@ -13,7 +13,7 @@ test_that("MDRO works", { expect_equal(outcome %>% class(), c('ordered', 'factor')) # septic_patients should have these finding using Dutch guidelines - expect_equal(outcome %>% freq(toConsole = FALSE) %>% pull(Count), c(3, 21)) + expect_equal(outcome %>% freq(as.data.frame = TRUE) %>% pull(count), c(3, 21)) expect_equal(BRMO(septic_patients, info = FALSE), MDRO(septic_patients, "nl", info = FALSE)) diff --git a/vignettes/freq.R b/vignettes/freq.R new file mode 100644 index 00000000..84ddea08 --- /dev/null +++ b/vignettes/freq.R @@ -0,0 +1,91 @@ +## ----setup, include = FALSE, results = 'markup'-------------------------- +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#" +) +library(dplyr) +library(AMR) + +## ---- echo = TRUE, results = 'hide'-------------------------------------- +# # just using base R +freq(septic_patients$sex) + +# # using base R to select the variable and pass it on with a pipe +septic_patients$sex %>% freq() + +# # do it all with pipes, using the `select` function of the dplyr package +septic_patients %>% + select(sex) %>% + freq() + +## ---- echo = TRUE-------------------------------------------------------- +freq(septic_patients$sex) + +## ---- echo = TRUE, results = 'hide'-------------------------------------- +my_patients <- septic_patients %>% + left_join_microorganisms() + +## ---- echo = TRUE-------------------------------------------------------- +colnames(microorganisms) + +## ---- echo = TRUE-------------------------------------------------------- +dim(septic_patients) +dim(my_patients) + +## ---- echo = TRUE-------------------------------------------------------- +my_patients %>% + select(genus, species) %>% + freq() + +## ---- echo = TRUE-------------------------------------------------------- +# # get age distribution of unique patients +septic_patients %>% + distinct(patient_id, .keep_all = TRUE) %>% + select(age) %>% + freq(nmax = 5) + +## ---- echo = TRUE-------------------------------------------------------- +septic_patients %>% + select(hospital_id) %>% + freq() + +## ---- echo = TRUE-------------------------------------------------------- +septic_patients %>% + select(hospital_id) %>% + freq(sort.count = TRUE) + +## ---- echo = TRUE-------------------------------------------------------- +septic_patients %>% + select(amox) %>% + freq() + +## ---- echo = TRUE-------------------------------------------------------- +septic_patients %>% + select(date) %>% + freq(nmax = 5) + +## ---- echo = TRUE-------------------------------------------------------- +septic_patients %>% + select(amox) %>% + freq(na.rm = FALSE) + +## ---- echo = TRUE-------------------------------------------------------- +septic_patients %>% + select(hospital_id) %>% + freq(markdown = TRUE) + +## ---- echo = TRUE-------------------------------------------------------- +my_df <- septic_patients %>% + select(hospital_id) %>% + freq(as.data.frame = TRUE) + +my_df + +class(my_df) + +## ---- echo = FALSE------------------------------------------------------- +# this will print "2018" in 2018, and "2018-yyyy" after 2018. +yrs <- c(2018:format(Sys.Date(), "%Y")) +yrs <- c(min(yrs), max(yrs)) +yrs <- paste(unique(yrs), collapse = "-") + diff --git a/vignettes/freq.Rmd b/vignettes/freq.Rmd new file mode 100644 index 00000000..7eb895e7 --- /dev/null +++ b/vignettes/freq.Rmd @@ -0,0 +1,183 @@ +--- +title: "Creating Frequency Tables" +author: "Matthijs S. Berends" +output: + rmarkdown::html_vignette: + toc: true +vignette: > + %\VignetteIndexEntry{Vignette Title} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE, results = 'markup'} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#" +) +library(dplyr) +library(AMR) +``` + +## Introduction + +Frequency tables (or frequency distributions) are summaries of the distribution of values in a sample. With the `freq` function, you can create univariate frequency tables. Multiple variables will be pasted into one variable, so it forces a univariate distribution. We take the `septic_patients` dataset (included in this AMR package) as example. + +## Frequencies of one variable + +To only show and quickly review the content of one variable, you can just select this variable in various ways. Let's say we want to get the frequencies of the `sex` variable of the `septic_patients` dataset: +```{r, echo = TRUE, results = 'hide'} +# # just using base R +freq(septic_patients$sex) + +# # using base R to select the variable and pass it on with a pipe +septic_patients$sex %>% freq() + +# # do it all with pipes, using the `select` function of the dplyr package +septic_patients %>% + select(sex) %>% + freq() +``` +This will all lead to the following table: +```{r, echo = TRUE} +freq(septic_patients$sex) +``` +This immediately shows the class of the variable, its length and availability (i.e. the amount of `NA`), the amount of unique values and (most importantly) that among septic patients men are more prevalent than women. + +## Frequencies of more than one variable + +Multiple variables will be pasted into one variable to review individual cases, keeping a univariate frequency table. + +For illustration, we could add some more variables to the `septic_patients` dataset to learn about bacterial properties: +```{r, echo = TRUE, results = 'hide'} +my_patients <- septic_patients %>% + left_join_microorganisms() +``` +Now all variables of the `microorganisms` dataset have been joined to the `septic_patients` dataset. The `microorganisms` dataset consists of the following variables: +```{r, echo = TRUE} +colnames(microorganisms) +``` + +If we compare the dimensions between the old and new dataset, we can see that these `r ncol(my_patients) - ncol(septic_patients)` variables were added: +```{r, echo = TRUE} +dim(septic_patients) +dim(my_patients) +``` + +So now the `genus` and `species` variables are available. A frequency table of these combined variables can be created like this: +```{r, echo = TRUE} +my_patients %>% + select(genus, species) %>% + freq() +``` + +## Frequencies of numeric values + +Frequency tables can be created of any input. + +In case of numeric values (like integers, doubles, etc.) additional descriptive statistics will be calculated and shown into the header: + +```{r, echo = TRUE} +# # get age distribution of unique patients +septic_patients %>% + distinct(patient_id, .keep_all = TRUE) %>% + select(age) %>% + freq(nmax = 5) +``` + +So the following properties are determined, where `NA` values are always ignored: + +* **Mean** + +* **Standard deviation** + +* **Coefficient of variation** (CV), the standard deviation divided by the mean + +* **Five numbers of Tukey** (min, Q1, median, Q3, max) + +* **Coefficient of quartile variation** (CQV, sometimes called coefficient of dispersion), calculated as (Q3 - Q1) / (Q3 + Q1) using quantile with `type = 6` as quantile algorithm to comply with SPSS standards + +* **Outliers** (total count and unique count) + +So for example, the above frequency table quickly shows the median age of patients being `r my_patients %>% distinct(patient_id, .keep_all = TRUE) %>% pull(age) %>% median(na.rm = TRUE)`. + +## Frequencies of factors + +Frequencies of factors will be sorted on factor level instead of item count by default. This can be changed with the `sort.count` parameter. Frequency tables of factors always show the factor level as an additional last column. + +`sort.count` is `TRUE` by default, except for factors. Compare this default behaviour: + +```{r, echo = TRUE} +septic_patients %>% + select(hospital_id) %>% + freq() +``` + +To this, where items are now sorted on item count: + +```{r, echo = TRUE} +septic_patients %>% + select(hospital_id) %>% + freq(sort.count = TRUE) +``` + +All classes will be printed into the header. Variables with the new `rsi` class of this AMR package are actually ordered factors and have three classes (look at `Class` in the header): + +```{r, echo = TRUE} +septic_patients %>% + select(amox) %>% + freq() +``` + +## Frequencies of dates + +Frequencies of dates will show the oldest and newest date in the data, and the amount of days between them: + +```{r, echo = TRUE} +septic_patients %>% + select(date) %>% + freq(nmax = 5) +``` + +## Additional parameters + +### Parameter `na.rm` +With the `na.rm` parameter (defaults to `TRUE`, but they will always be shown into the header), you can include `NA` values in the frequency table: +```{r, echo = TRUE} +septic_patients %>% + select(amox) %>% + freq(na.rm = FALSE) +``` + +### Parameter `markdown` +The `markdown` parameter can be used in reports created with R Markdown. This will always print all rows: + +```{r, echo = TRUE} +septic_patients %>% + select(hospital_id) %>% + freq(markdown = TRUE) +``` + +### Parameter `as.data.frame` +With the `as.data.frame` parameter you can assign the frequency table to an object, or just print it as a `data.frame` to the console: + +```{r, echo = TRUE} +my_df <- septic_patients %>% + select(hospital_id) %>% + freq(as.data.frame = TRUE) + +my_df + +class(my_df) +``` + +---- +```{r, echo = FALSE} +# this will print "2018" in 2018, and "2018-yyyy" after 2018. +yrs <- c(2018:format(Sys.Date(), "%Y")) +yrs <- c(min(yrs), max(yrs)) +yrs <- paste(unique(yrs), collapse = "-") +``` +AMR, (c) `r yrs`, `r packageDescription("AMR")$URL` + +Licensed under the [GNU General Public License v2.0](https://github.com/msberends/AMR/blob/master/LICENSE). diff --git a/vignettes/freq.html b/vignettes/freq.html new file mode 100644 index 00000000..f956a698 --- /dev/null +++ b/vignettes/freq.html @@ -0,0 +1,344 @@ + + + + + + + + + + + + + + + +Creating Frequency Tables + + + + + + + + + + + + + + + + + +

Creating Frequency Tables

+

Matthijs S. Berends

+ + + + +
+

Introduction

+

Frequency tables (or frequency distributions) are summaries of the distribution of values in a sample. With the freq function, you can create univariate frequency tables. Multiple variables will be pasted into one variable, so it forces a univariate distribution. We take the septic_patients dataset (included in this AMR package) as example.

+
+
+

Frequencies of one variable

+

To only show and quickly review the content of one variable, you can just select this variable in various ways. Let’s say we want to get the frequencies of the sex variable of the septic_patients dataset:

+
# # just using base R
+freq(septic_patients$sex)
+
+# # using base R to select the variable and pass it on with a pipe
+septic_patients$sex %>% freq()
+
+# # do it all with pipes, using the `select` function of the dplyr package
+septic_patients %>%
+  select(sex) %>%
+  freq()
+

This will all lead to the following table:

+
freq(septic_patients$sex)
+# Class:     character
+# Length:    2000 (of which NA: 0 = 0.0%)
+# Unique:    2
+# 
+# Item    Count   Percent   Cum. Count   Cum. Percent
+# -----  ------  --------  -----------  -------------
+# M        1112     55.6%         1112          55.6%
+# F         888     44.4%         2000         100.0%
+

This immediately shows the class of the variable, its length and availability (i.e. the amount of NA), the amount of unique values and (most importantly) that among septic patients men are more prevalent than women.

+
+
+

Frequencies of more than one variable

+

Multiple variables will be pasted into one variable to review individual cases, keeping a univariate frequency table.

+

For illustration, we could add some more variables to the septic_patients dataset to learn about bacterial properties:

+
my_patients <- septic_patients %>% 
+  left_join_microorganisms()
+

Now all variables of the microorganisms dataset have been joined to the septic_patients dataset. The microorganisms dataset consists of the following variables:

+
colnames(microorganisms)
+#  [1] "bactid"       "bactsys"      "family"       "genus"       
+#  [5] "species"      "subspecies"   "fullname"     "type"        
+#  [9] "gramstain"    "aerobic"      "type_nl"      "gramstain_nl"
+

If we compare the dimensions between the old and new dataset, we can see that these 11 variables were added:

+
dim(septic_patients)
+# [1] 2000   47
+dim(my_patients)
+# [1] 2000   58
+

So now the genus and species variables are available. A frequency table of these combined variables can be created like this:

+
my_patients %>%
+  select(genus, species) %>%
+  freq()
+# Columns:   2
+# Length:    2000 (of which NA: 0 = 0.0%)
+# Unique:    137
+# 
+# Item                                 Count   Percent   Cum. Count   Cum. Percent
+# ----------------------------------  ------  --------  -----------  -------------
+# Escherichia coli                       485     24.2%          485          24.2%
+# Staphylococcus coagulase negatief      297     14.8%          782          39.1%
+# Staphylococcus aureus                  200     10.0%          982          49.1%
+# Staphylococcus epidermidis             150      7.5%         1132          56.6%
+# Streptococcus pneumoniae                97      4.9%         1229          61.5%
+# Staphylococcus hominis                  67      3.4%         1296          64.8%
+# Klebsiella pneumoniae                   65      3.2%         1361          68.0%
+# Enterococcus faecalis                   44      2.2%         1405          70.2%
+# Proteus mirabilis                       33      1.7%         1438          71.9%
+# Pseudomonas aeruginosa                  31      1.6%         1469          73.5%
+# Streptococcus pyogenes                  30      1.5%         1499          75.0%
+# Enterococcus faecium                    27      1.4%         1526          76.3%
+# Bacteroides fragilis                    26      1.3%         1552          77.6%
+# Enterobacter cloacae                    25      1.2%         1577          78.8%
+# Klebsiella oxytoca                      23      1.1%         1600          80.0%
+# ... and 122 more (n = 400; 20.0%). Use `nmax` to show more or less rows.
+
+
+

Frequencies of numeric values

+

Frequency tables can be created of any input.

+

In case of numeric values (like integers, doubles, etc.) additional descriptive statistics will be calculated and shown into the header:

+
# # get age distribution of unique patients
+septic_patients %>% 
+  distinct(patient_id, .keep_all = TRUE) %>% 
+  select(age) %>% 
+  freq(nmax = 5)
+# Class:     integer
+# Length:    1920 (of which NA: 0 = 0.0%)
+# Unique:    94
+#   
+# Mean:      68
+# Std. dev.: 18 (CV: 0.27)
+# Five-Num:  0  |  61  |  72  |  80  |  101 (CQV: 0.13)
+# Outliers:  94 (unique: 26)
+# 
+#  Item   Count   Percent   Cum. Count   Cum. Percent
+# -----  ------  --------  -----------  -------------
+#     0      34      1.8%           34           1.8%
+#     1       5      0.3%           39           2.0%
+#     2       5      0.3%           44           2.3%
+#     3       2      0.1%           46           2.4%
+#     4       1      0.1%           47           2.4%
+# ... and 89 more (n = 1873; 97.6%).
+

So the following properties are determined, where NA values are always ignored:

+
    +
  • Mean

  • +
  • Standard deviation

  • +
  • Coefficient of variation (CV), the standard deviation divided by the mean

  • +
  • Five numbers of Tukey (min, Q1, median, Q3, max)

  • +
  • Coefficient of quartile variation (CQV, sometimes called coefficient of dispersion), calculated as (Q3 - Q1) / (Q3 + Q1) using quantile with type = 6 as quantile algorithm to comply with SPSS standards

  • +
  • Outliers (total count and unique count)

  • +
+

So for example, the above frequency table quickly shows the median age of patients being 72.

+
+
+

Frequencies of factors

+

Frequencies of factors will be sorted on factor level instead of item count by default. This can be changed with the sort.count parameter. Frequency tables of factors always show the factor level as an additional last column.

+

sort.count is TRUE by default, except for factors. Compare this default behaviour:

+
septic_patients %>%
+  select(hospital_id) %>% 
+  freq()
+# Class:     factor
+# Length:    2000 (of which NA: 0 = 0.0%)
+# Unique:    5
+# 
+# Item    Count   Percent   Cum. Count   Cum. Percent   (Factor Level)
+# -----  ------  --------  -----------  -------------  ---------------
+# A         233     11.7%          233          11.7%                1
+# B         583     29.1%          816          40.8%                2
+# C         221     11.1%         1037          51.8%                3
+# D         650     32.5%         1687          84.4%                4
+# E         313     15.7%         2000         100.0%                5
+

To this, where items are now sorted on item count:

+
septic_patients %>%
+  select(hospital_id) %>% 
+  freq(sort.count = TRUE)
+# Class:     factor
+# Length:    2000 (of which NA: 0 = 0.0%)
+# Unique:    5
+# 
+# Item    Count   Percent   Cum. Count   Cum. Percent   (Factor Level)
+# -----  ------  --------  -----------  -------------  ---------------
+# D         650     32.5%          650          32.5%                4
+# B         583     29.1%         1233          61.7%                2
+# E         313     15.7%         1546          77.3%                5
+# A         233     11.7%         1779          88.9%                1
+# C         221     11.1%         2000         100.0%                3
+

All classes will be printed into the header. Variables with the new rsi class of this AMR package are actually ordered factors and have three classes (look at Class in the header):

+
septic_patients %>%
+  select(amox) %>% 
+  freq()
+# Class:     factor > ordered > rsi
+# Length:    2000 (of which NA: 678 = 33.9%)
+# Unique:    3
+# 
+# Item    Count   Percent   Cum. Count   Cum. Percent   (Factor Level)
+# -----  ------  --------  -----------  -------------  ---------------
+# S         561     42.4%          561          42.4%                1
+# I          49      3.7%          610          46.1%                2
+# R         712     53.9%         1322         100.0%                3
+
+
+

Frequencies of dates

+

Frequencies of dates will show the oldest and newest date in the data, and the amount of days between them:

+
septic_patients %>%
+  select(date) %>% 
+  freq(nmax = 5)
+# Class:     Date
+# Length:    2000 (of which NA: 0 = 0.0%)
+# Unique:    1662
+# 
+# Oldest:    2 januari 2001
+# Newest:    18 oktober 2017 (+6133)
+# 
+# Item          Count   Percent   Cum. Count   Cum. Percent
+# -----------  ------  --------  -----------  -------------
+# 2008-12-24        5      0.2%            5           0.2%
+# 2010-12-10        4      0.2%            9           0.4%
+# 2011-03-03        4      0.2%           13           0.6%
+# 2013-06-24        4      0.2%           17           0.8%
+# 2017-09-01        4      0.2%           21           1.1%
+# ... and 1657 more (n = 1979; 99.0%).
+
+
+

Additional parameters

+
+

Parameter na.rm

+

With the na.rm parameter (defaults to TRUE, but they will always be shown into the header), you can include NA values in the frequency table:

+
septic_patients %>%
+  select(amox) %>% 
+  freq(na.rm = FALSE)
+# Class:     factor > ordered > rsi
+# Length:    2678 (of which NA: 678 = 25.3%)
+# Unique:    4
+# 
+# Item    Count   Percent   Cum. Count   Cum. Percent   (Factor Level)
+# -----  ------  --------  -----------  -------------  ---------------
+# S         561     28.1%          561          28.1%                1
+# I          49      2.5%          610          30.5%                2
+# R         712     35.6%         1322          66.1%                3
+# <NA>      678     33.9%         2000         100.0%             <NA>
+
+
+

Parameter markdown

+

The markdown parameter can be used in reports created with R Markdown. This will always print all rows:

+
septic_patients %>%
+  select(hospital_id) %>% 
+  freq(markdown = TRUE)
+# 
+# Class:     factor
+# 
+# Length:    2000 (of which NA: 0 = 0.0%)
+# 
+# Unique:    5
+# 
+# |Item | Count| Percent| Cum. Count| Cum. Percent| (Factor Level)|
+# |:----|-----:|-------:|----------:|------------:|--------------:|
+# |A    |   233|   11.7%|        233|        11.7%|              1|
+# |B    |   583|   29.1%|        816|        40.8%|              2|
+# |C    |   221|   11.1%|       1037|        51.8%|              3|
+# |D    |   650|   32.5%|       1687|        84.4%|              4|
+# |E    |   313|   15.7%|       2000|       100.0%|              5|
+
+
+

Parameter as.data.frame

+

With the as.data.frame parameter you can assign the frequency table to an object, or just print it as a data.frame to the console:

+
my_df <- septic_patients %>%
+  select(hospital_id) %>% 
+  freq(as.data.frame = TRUE)
+
+my_df
+#   item count percent cum_count cum_percent factor_level
+# 1    A   233  0.1165       233      0.1165            1
+# 2    B   583  0.2915       816      0.4080            2
+# 3    C   221  0.1105      1037      0.5185            3
+# 4    D   650  0.3250      1687      0.8435            4
+# 5    E   313  0.1565      2000      1.0000            5
+
+class(my_df)
+# [1] "data.frame"
+
+

AMR, (c) 2018, https://github.com/msberends/AMR

+

Licensed under the GNU General Public License v2.0.

+
+
+ + + + + + + +