added vignette of freq

This commit is contained in:
dr. M.S. (Matthijs) Berends 2018-05-09 11:44:46 +02:00
parent 25b3346d9a
commit f05e7178cb
13 changed files with 726 additions and 82 deletions

1
.gitignore vendored
View File

@ -4,3 +4,4 @@
.Ruserdata
AMR.Rproj
tests/testthat/Rplots.pdf
inst/doc

View File

@ -1,6 +1,6 @@
Package: AMR
Version: 0.2.0
Date: 2018-05-02
Version: 0.2.0.9000
Date: 2018-05-09
Title: Antimicrobial Resistance Analysis
Authors@R: c(
person(
@ -37,7 +37,10 @@ Imports:
tibble
Suggests:
testthat (>= 1.0.2),
covr (>= 3.0.1)
covr (>= 3.0.1),
knitr,
rmarkdown
VignetteBuilder: knitr
URL: https://github.com/msberends/AMR
BugReports: https://github.com/msberends/AMR/issues
License: GPL-2 | file LICENSE

11
NEWS.md
View File

@ -1,4 +1,13 @@
# 0.2.0
# 0.2.9000 (development version)
#### New
* Vignettes about frequency tables: [vignettes/freq.html](vignettes/freq.html)
* Possibility to globally set the default for the amount of items to print in frequency tables (`freq` function), with `options(max.print.freq = n)`
#### Changed
* Renamed `toConsole` parameter of `freq` to `as.data.frame`
* Small translational improvements to the `septic_patients` dataset
# 0.2.0 (latest stable version)
#### New
* Full support for Windows, Linux and macOS
* Full support for old R versions, only R-3.0.0 (April 2013) or later is needed (needed packages may have other dependencies)

135
R/freq.R
View File

@ -21,10 +21,10 @@
#' Create a frequency table of a vector of data, a single column or a maximum of 9 columns of a data frame. Supports markdown for reports.
#' @param x data
#' @param sort.count Sort on count. Use \code{FALSE} to sort alphabetically on item.
#' @param nmax number of row to print. Use \code{nmax = 0} or \code{nmax = NA} to print all rows.
#' @param na.rm a logical value indicating whether NA values should be removed from the frequency table. The header will always print the amount of\code{NA}s.
#' @param nmax number of row to print. The default, \code{15}, uses \code{\link[base]{getOption}("max.print.freq")}. Use \code{nmax = 0} or \code{nmax = NA} to print all rows.
#' @param na.rm a logical value indicating whether NA values should be removed from the frequency table. The header will always print the amount of \code{NA}s.
#' @param markdown print table in markdown format (this forces \code{nmax = NA})
#' @param toConsole Print table to the console. Use \code{FALSE} to assign the table to an object.
#' @param as.data.frame return frequency table without header as a \code{data.frame} (e.g. to assign the table to an object)
#' @param digits how many significant digits are to be used for numeric values (not for the items themselves, that depends on \code{\link{getOption}("digits")})
#' @param sep a character string to separate the terms when selecting multiple columns
#' @details For numeric values, the next values will be calculated and shown into the header:
@ -32,7 +32,7 @@
#' \item{Mean, using \code{\link[base]{mean}}}
#' \item{Standard deviation, using \code{\link[stats]{sd}}}
#' \item{Five numbers of Tukey (min, Q1, median, Q3, max), using \code{\link[stats]{fivenum}}}
#' \item{Outliers (count and list), using \code{\link{boxplot.stats}}}
#' \item{Outliers (total count and unique count), using \code{\link{boxplot.stats}}}
#' \item{Coefficient of variation (CV), the standard deviation divided by the mean}
#' \item{Coefficient of quartile variation (CQV, sometimes called coefficient of dispersion), calculated as \code{(Q3 - Q1) / (Q3 + Q1)} using \code{\link{quantile}} with \code{type = 6} as quantile algorithm to comply with SPSS standards}
#' }
@ -63,13 +63,13 @@
#' years <- septic_patients %>%
#' mutate(year = format(date, "%Y")) %>%
#' select(year) %>%
#' freq(toConsole = FALSE)
#' freq(as.data.frame = TRUE)
freq <- function(x,
sort.count = TRUE,
nmax = 15,
nmax = getOption("max.print.freq"),
na.rm = TRUE,
markdown = FALSE,
toConsole = TRUE,
as.data.frame = FALSE,
digits = 2,
sep = " ") {
@ -156,8 +156,8 @@ freq <- function(x,
stop('A maximum of 9 columns can be analysed at the same time.', call. = FALSE)
}
}
if (markdown == TRUE & toConsole == FALSE) {
warning('`toConsole = FALSE` will be ignored when `markdown = TRUE`.')
if (markdown == TRUE & as.data.frame == TRUE) {
warning('`as.data.frame = TRUE` will be ignored when `markdown = TRUE`.')
}
if (mult.columns > 1) {
@ -232,7 +232,7 @@ freq <- function(x,
x <- x %>% format(formatdates)
}
if (toConsole == TRUE) {
if (as.data.frame == FALSE) {
cat(header)
}
@ -244,22 +244,30 @@ freq <- function(x,
warning('All observations are unique.', call. = FALSE)
}
if (nmax == 0 | is.na(nmax)) {
nmax.set <- !missing(nmax)
if (is.null(nmax) & is.null(base::getOption("max.print.freq", default = NULL))) {
# default for max print setting
nmax <- 15
}
if (nmax == 0 | is.na(nmax) | is.null(nmax)) {
nmax <- length(x)
}
nmax.1 <- min(length(x), nmax + 1)
# create table with counts and percentages
column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent', '(Factor Level)')
column_names_df <- c('item', 'count', 'percent', 'cum_count', 'cum_percent', 'factor_level')
if (any(class(x) == 'factor')) {
df <- tibble::tibble(Item = x,
Fctlvl = x %>% as.integer()) %>%
group_by(Item, Fctlvl)
column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent', '(Factor Level)')
column_align <- c('l', 'r', 'r', 'r', 'r', 'r')
} else {
df <- tibble::tibble(Item = x) %>%
group_by(Item)
column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent')
column_names <- column_names[1:5] # strip factor lvl
column_names_df <- column_names_df[1:5] # strip factor lvl
column_align <- c(x_align, 'r', 'r', 'r', 'r')
}
df <- df %>%
@ -276,10 +284,10 @@ freq <- function(x,
# sort according to setting
if (sort.count == TRUE) {
df <- df %>% arrange(desc(Count))
df <- df %>% arrange(desc(Count), Item)
} else {
if (any(class(x) == 'factor')) {
df <- df %>% arrange(Fctlvl)
df <- df %>% arrange(Fctlvl, Item)
} else {
df <- df %>% arrange(Item)
}
@ -295,65 +303,68 @@ freq <- function(x,
df <- df %>% select(Item, Count, Percent, Cum, CumTot, Fctlvl)
}
if (as.data.frame == TRUE) {
# assign to object
df[, 3] <- df[, 2] / sum(df[, 2], na.rm = TRUE)
df[, 4] <- cumsum(df[, 2])
df[, 5] <- df[, 4] / sum(df[, 2], na.rm = TRUE)
colnames(df) <- column_names_df
return(as.data.frame(df, stringsAsFactors = FALSE))
}
if (markdown == TRUE) {
tblformat <- 'markdown'
} else {
tblformat <- 'pandoc'
}
if (toConsole == FALSE) {
# assign to object
df[, 3] <- df[, 2] / sum(df[, 2], na.rm = TRUE)
df[, 4] <- cumsum(df[, 2])
df[, 5] <- df[, 4] / sum(df[, 2], na.rm = TRUE)
return(df)
# save old NA setting for kable
opt.old <- options()$knitr.kable.NA
options(knitr.kable.NA = "<NA>")
} else {
Count.rest <- sum(df[nmax.1:nrow(df), 'Count'], na.rm = TRUE)
if (any(class(x) %in% c('double', 'integer', 'numeric', 'raw', 'single'))) {
df <- df %>% mutate(Item = format(Item))
}
df <- df %>% mutate(Count = format(Count))
# save old NA setting for kable
opt.old <- options()$knitr.kable.NA
options(knitr.kable.NA = "<NA>")
Count.rest <- sum(df[nmax.1:nrow(df), 'Count'], na.rm = TRUE)
if (any(class(x) %in% c('double', 'integer', 'numeric', 'raw', 'single'))) {
df <- df %>% mutate(Item = format(Item))
}
df <- df %>% mutate(Count = format(Count))
if (nrow(df) > nmax.1 & markdown == FALSE) {
df2 <- df[1:nmax,]
print(
knitr::kable(df2,
format = tblformat,
col.names = column_names,
align = column_align,
padding = 1)
)
cat('... and ',
format(nrow(df) - nmax),
' more ',
paste0('(n = ',
format(Count.rest),
'; ',
(Count.rest / length(x)) %>% percent(force_zero = TRUE),
')'),
'. Use `nmax` to show more rows.\n', sep = '')
} else {
print(
knitr::kable(df,
format = tblformat,
col.names = column_names,
align = column_align,
padding = 1)
)
if (nrow(df) > nmax.1 & markdown == FALSE) {
df2 <- df[1:nmax,]
print(
knitr::kable(df2,
format = tblformat,
col.names = column_names,
align = column_align,
padding = 1)
)
cat('... and ',
format(nrow(df) - nmax),
' more ',
paste0('(n = ',
format(Count.rest),
'; ',
(Count.rest / length(x)) %>% percent(force_zero = TRUE),
')'),
'.', sep = '')
if (nmax.set == FALSE) {
cat(' Use `nmax` to show more or less rows.')
}
cat('\n')
# reset old kable setting
options(knitr.kable.NA = opt.old)
return(invisible())
} else {
print(
knitr::kable(df,
format = tblformat,
col.names = column_names,
align = column_align,
padding = 1)
)
}
cat('\n')
# reset old kable setting
options(knitr.kable.NA = opt.old)
return(invisible())
}
#' @rdname freq

View File

@ -3,7 +3,7 @@
[![logo_rug](man/figures/logo_rug.png)](https://www.rug.nl)[![logo_umcg](man/figures/logo_umcg.png)](https://www.umcg.nl)
This R package was created for academic research by PhD students of the Faculty of Medical Sciences of the [University of Groningen)](https://www.rug.nl) and the Medical Microbiology & Infection Prevention (MMBI) department of the [University Medical Center Groningen (UMCG)](https://www.umcg.nl). See [Authors](#authors).
This R package was created for academic research by PhD students of the Faculty of Medical Sciences of the [University of Groningen](https://www.rug.nl) and the Medical Microbiology & Infection Prevention (MMBI) department of the [University Medical Center Groningen (UMCG)](https://www.umcg.nl). See [Authors](#authors).
## Why this package?
This R package contains functions to make **microbiological, epidemiological data analysis easier**. It allows the use of some new classes to work with MIC values and antimicrobial interpretations (i.e. values S, I and R).

Binary file not shown.

View File

@ -5,24 +5,26 @@
\alias{frequency_tbl}
\title{Frequency table}
\usage{
freq(x, sort.count = TRUE, nmax = 15, na.rm = TRUE, markdown = FALSE,
toConsole = TRUE, digits = 2, sep = " ")
freq(x, sort.count = TRUE, nmax = getOption("max.print.freq"),
na.rm = TRUE, markdown = FALSE, as.data.frame = FALSE, digits = 2,
sep = " ")
frequency_tbl(x, sort.count = TRUE, nmax = 15, na.rm = TRUE,
markdown = FALSE, toConsole = TRUE, digits = 2, sep = " ")
frequency_tbl(x, sort.count = TRUE, nmax = getOption("max.print.freq"),
na.rm = TRUE, markdown = FALSE, as.data.frame = FALSE, digits = 2,
sep = " ")
}
\arguments{
\item{x}{data}
\item{sort.count}{Sort on count. Use \code{FALSE} to sort alphabetically on item.}
\item{nmax}{number of row to print. Use \code{nmax = 0} or \code{nmax = NA} to print all rows.}
\item{nmax}{number of row to print. The default, \code{15}, uses \code{\link[base]{getOption}("max.print.freq")}. Use \code{nmax = 0} or \code{nmax = NA} to print all rows.}
\item{na.rm}{a logical value indicating whether NA values should be removed from the frequency table. The header will always print the amount of\code{NA}s.}
\item{na.rm}{a logical value indicating whether NA values should be removed from the frequency table. The header will always print the amount of \code{NA}s.}
\item{markdown}{print table in markdown format (this forces \code{nmax = NA})}
\item{toConsole}{Print table to the console. Use \code{FALSE} to assign the table to an object.}
\item{as.data.frame}{return frequency table without header as a \code{data.frame} (e.g. to assign the table to an object)}
\item{digits}{how many significant digits are to be used for numeric values (not for the items themselves, that depends on \code{\link{getOption}("digits")})}
@ -37,7 +39,7 @@ For numeric values, the next values will be calculated and shown into the header
\item{Mean, using \code{\link[base]{mean}}}
\item{Standard deviation, using \code{\link[stats]{sd}}}
\item{Five numbers of Tukey (min, Q1, median, Q3, max), using \code{\link[stats]{fivenum}}}
\item{Outliers (count and list), using \code{\link{boxplot.stats}}}
\item{Outliers (total count and unique count), using \code{\link{boxplot.stats}}}
\item{Coefficient of variation (CV), the standard deviation divided by the mean}
\item{Coefficient of quartile variation (CQV, sometimes called coefficient of dispersion), calculated as \code{(Q3 - Q1) / (Q3 + Q1)} using \code{\link{quantile}} with \code{type = 6} as quantile algorithm to comply with SPSS standards}
}
@ -63,7 +65,7 @@ septic_patients \%>\%
years <- septic_patients \%>\%
mutate(year = format(date, "\%Y")) \%>\%
select(year) \%>\%
freq(toConsole = FALSE)
freq(as.data.frame = TRUE)
}
\keyword{freq}
\keyword{frequency}

View File

@ -1,7 +1,7 @@
context("eucast.R")
test_that("EUCAST rules work", {
a <- EUCAST_rules(septic_patients)
a <- suppressWarnings(EUCAST_rules(septic_patients))
a <- data.frame(bactid = c("KLEPNE", # Klebsiella pneumoniae
"PSEAER", # Pseudomonas aeruginosa

View File

@ -1,12 +1,12 @@
context("freq.R")
test_that("frequency table works", {
expect_equal(nrow(freq(c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), toConsole = FALSE)), 5)
expect_equal(nrow(frequency_tbl(c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), toConsole = FALSE)), 5)
expect_equal(nrow(freq(c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), as.data.frame = TRUE)), 5)
expect_equal(nrow(frequency_tbl(c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5), as.data.frame = TRUE)), 5)
# date column of septic_patients should contain 1662 unique dates
expect_equal(nrow(freq(septic_patients$date, toConsole = FALSE)), 1662)
expect_equal(nrow(freq(septic_patients$date, toConsole = FALSE)),
expect_equal(nrow(freq(septic_patients$date, as.data.frame = TRUE)), 1662)
expect_equal(nrow(freq(septic_patients$date, as.data.frame = TRUE)),
length(unique(septic_patients$date)))
expect_output(freq(septic_patients$age))

View File

@ -13,7 +13,7 @@ test_that("MDRO works", {
expect_equal(outcome %>% class(), c('ordered', 'factor'))
# septic_patients should have these finding using Dutch guidelines
expect_equal(outcome %>% freq(toConsole = FALSE) %>% pull(Count), c(3, 21))
expect_equal(outcome %>% freq(as.data.frame = TRUE) %>% pull(count), c(3, 21))
expect_equal(BRMO(septic_patients, info = FALSE), MDRO(septic_patients, "nl", info = FALSE))

91
vignettes/freq.R Normal file
View File

@ -0,0 +1,91 @@
## ----setup, include = FALSE, results = 'markup'--------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#"
)
library(dplyr)
library(AMR)
## ---- echo = TRUE, results = 'hide'--------------------------------------
# # just using base R
freq(septic_patients$sex)
# # using base R to select the variable and pass it on with a pipe
septic_patients$sex %>% freq()
# # do it all with pipes, using the `select` function of the dplyr package
septic_patients %>%
select(sex) %>%
freq()
## ---- echo = TRUE--------------------------------------------------------
freq(septic_patients$sex)
## ---- echo = TRUE, results = 'hide'--------------------------------------
my_patients <- septic_patients %>%
left_join_microorganisms()
## ---- echo = TRUE--------------------------------------------------------
colnames(microorganisms)
## ---- echo = TRUE--------------------------------------------------------
dim(septic_patients)
dim(my_patients)
## ---- echo = TRUE--------------------------------------------------------
my_patients %>%
select(genus, species) %>%
freq()
## ---- echo = TRUE--------------------------------------------------------
# # get age distribution of unique patients
septic_patients %>%
distinct(patient_id, .keep_all = TRUE) %>%
select(age) %>%
freq(nmax = 5)
## ---- echo = TRUE--------------------------------------------------------
septic_patients %>%
select(hospital_id) %>%
freq()
## ---- echo = TRUE--------------------------------------------------------
septic_patients %>%
select(hospital_id) %>%
freq(sort.count = TRUE)
## ---- echo = TRUE--------------------------------------------------------
septic_patients %>%
select(amox) %>%
freq()
## ---- echo = TRUE--------------------------------------------------------
septic_patients %>%
select(date) %>%
freq(nmax = 5)
## ---- echo = TRUE--------------------------------------------------------
septic_patients %>%
select(amox) %>%
freq(na.rm = FALSE)
## ---- echo = TRUE--------------------------------------------------------
septic_patients %>%
select(hospital_id) %>%
freq(markdown = TRUE)
## ---- echo = TRUE--------------------------------------------------------
my_df <- septic_patients %>%
select(hospital_id) %>%
freq(as.data.frame = TRUE)
my_df
class(my_df)
## ---- echo = FALSE-------------------------------------------------------
# this will print "2018" in 2018, and "2018-yyyy" after 2018.
yrs <- c(2018:format(Sys.Date(), "%Y"))
yrs <- c(min(yrs), max(yrs))
yrs <- paste(unique(yrs), collapse = "-")

183
vignettes/freq.Rmd Normal file
View File

@ -0,0 +1,183 @@
---
title: "Creating Frequency Tables"
author: "Matthijs S. Berends"
output:
rmarkdown::html_vignette:
toc: true
vignette: >
%\VignetteIndexEntry{Vignette Title}
%\VignetteEngine{knitr::rmarkdown}
%\VignetteEncoding{UTF-8}
---
```{r setup, include = FALSE, results = 'markup'}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#"
)
library(dplyr)
library(AMR)
```
## Introduction
Frequency tables (or frequency distributions) are summaries of the distribution of values in a sample. With the `freq` function, you can create univariate frequency tables. Multiple variables will be pasted into one variable, so it forces a univariate distribution. We take the `septic_patients` dataset (included in this AMR package) as example.
## Frequencies of one variable
To only show and quickly review the content of one variable, you can just select this variable in various ways. Let's say we want to get the frequencies of the `sex` variable of the `septic_patients` dataset:
```{r, echo = TRUE, results = 'hide'}
# # just using base R
freq(septic_patients$sex)
# # using base R to select the variable and pass it on with a pipe
septic_patients$sex %>% freq()
# # do it all with pipes, using the `select` function of the dplyr package
septic_patients %>%
select(sex) %>%
freq()
```
This will all lead to the following table:
```{r, echo = TRUE}
freq(septic_patients$sex)
```
This immediately shows the class of the variable, its length and availability (i.e. the amount of `NA`), the amount of unique values and (most importantly) that among septic patients men are more prevalent than women.
## Frequencies of more than one variable
Multiple variables will be pasted into one variable to review individual cases, keeping a univariate frequency table.
For illustration, we could add some more variables to the `septic_patients` dataset to learn about bacterial properties:
```{r, echo = TRUE, results = 'hide'}
my_patients <- septic_patients %>%
left_join_microorganisms()
```
Now all variables of the `microorganisms` dataset have been joined to the `septic_patients` dataset. The `microorganisms` dataset consists of the following variables:
```{r, echo = TRUE}
colnames(microorganisms)
```
If we compare the dimensions between the old and new dataset, we can see that these `r ncol(my_patients) - ncol(septic_patients)` variables were added:
```{r, echo = TRUE}
dim(septic_patients)
dim(my_patients)
```
So now the `genus` and `species` variables are available. A frequency table of these combined variables can be created like this:
```{r, echo = TRUE}
my_patients %>%
select(genus, species) %>%
freq()
```
## Frequencies of numeric values
Frequency tables can be created of any input.
In case of numeric values (like integers, doubles, etc.) additional descriptive statistics will be calculated and shown into the header:
```{r, echo = TRUE}
# # get age distribution of unique patients
septic_patients %>%
distinct(patient_id, .keep_all = TRUE) %>%
select(age) %>%
freq(nmax = 5)
```
So the following properties are determined, where `NA` values are always ignored:
* **Mean**
* **Standard deviation**
* **Coefficient of variation** (CV), the standard deviation divided by the mean
* **Five numbers of Tukey** (min, Q1, median, Q3, max)
* **Coefficient of quartile variation** (CQV, sometimes called coefficient of dispersion), calculated as (Q3 - Q1) / (Q3 + Q1) using quantile with `type = 6` as quantile algorithm to comply with SPSS standards
* **Outliers** (total count and unique count)
So for example, the above frequency table quickly shows the median age of patients being `r my_patients %>% distinct(patient_id, .keep_all = TRUE) %>% pull(age) %>% median(na.rm = TRUE)`.
## Frequencies of factors
Frequencies of factors will be sorted on factor level instead of item count by default. This can be changed with the `sort.count` parameter. Frequency tables of factors always show the factor level as an additional last column.
`sort.count` is `TRUE` by default, except for factors. Compare this default behaviour:
```{r, echo = TRUE}
septic_patients %>%
select(hospital_id) %>%
freq()
```
To this, where items are now sorted on item count:
```{r, echo = TRUE}
septic_patients %>%
select(hospital_id) %>%
freq(sort.count = TRUE)
```
All classes will be printed into the header. Variables with the new `rsi` class of this AMR package are actually ordered factors and have three classes (look at `Class` in the header):
```{r, echo = TRUE}
septic_patients %>%
select(amox) %>%
freq()
```
## Frequencies of dates
Frequencies of dates will show the oldest and newest date in the data, and the amount of days between them:
```{r, echo = TRUE}
septic_patients %>%
select(date) %>%
freq(nmax = 5)
```
## Additional parameters
### Parameter `na.rm`
With the `na.rm` parameter (defaults to `TRUE`, but they will always be shown into the header), you can include `NA` values in the frequency table:
```{r, echo = TRUE}
septic_patients %>%
select(amox) %>%
freq(na.rm = FALSE)
```
### Parameter `markdown`
The `markdown` parameter can be used in reports created with R Markdown. This will always print all rows:
```{r, echo = TRUE}
septic_patients %>%
select(hospital_id) %>%
freq(markdown = TRUE)
```
### Parameter `as.data.frame`
With the `as.data.frame` parameter you can assign the frequency table to an object, or just print it as a `data.frame` to the console:
```{r, echo = TRUE}
my_df <- septic_patients %>%
select(hospital_id) %>%
freq(as.data.frame = TRUE)
my_df
class(my_df)
```
----
```{r, echo = FALSE}
# this will print "2018" in 2018, and "2018-yyyy" after 2018.
yrs <- c(2018:format(Sys.Date(), "%Y"))
yrs <- c(min(yrs), max(yrs))
yrs <- paste(unique(yrs), collapse = "-")
```
AMR, (c) `r yrs`, `r packageDescription("AMR")$URL`
Licensed under the [GNU General Public License v2.0](https://github.com/msberends/AMR/blob/master/LICENSE).

344
vignettes/freq.html Normal file

File diff suppressed because one or more lines are too long