2018-04-18 12:24:54 +02:00
# ==================================================================== #
# TITLE #
# Antimicrobial Resistance (AMR) Analysis #
# #
# AUTHORS #
# Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) #
# #
# LICENCE #
# This program is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License version 2.0, #
# as published by the Free Software Foundation. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# ==================================================================== #
#' Frequency table
#'
2018-06-20 14:47:37 +02:00
#' Create a frequency table of a vector of data, a single column or a maximum of 9 columns of a data frame. Supports markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names.
2018-04-18 12:24:54 +02:00
#' @param x data
2018-05-30 23:02:16 +02:00
#' @param sort.count sort on count. Use \code{FALSE} to sort alphabetically on item.
2018-05-22 16:34:22 +02:00
#' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0} or \code{nmax = NA} to print all rows.
2018-05-09 11:44:46 +02:00
#' @param na.rm a logical value indicating whether NA values should be removed from the frequency table. The header will always print the amount of \code{NA}s.
2018-06-19 15:20:14 +02:00
#' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)}
2018-04-18 12:24:54 +02:00
#' @param markdown print table in markdown format (this forces \code{nmax = NA})
2018-05-09 11:44:46 +02:00
#' @param as.data.frame return frequency table without header as a \code{data.frame} (e.g. to assign the table to an object)
2018-04-18 12:24:54 +02:00
#' @param digits how many significant digits are to be used for numeric values (not for the items themselves, that depends on \code{\link{getOption}("digits")})
#' @param sep a character string to separate the terms when selecting multiple columns
2018-06-20 14:47:37 +02:00
#' @param f a frequency table as \code{data.frame}, used as \code{freq(..., as.data.frame = TRUE)}
#' @param n number of top \emph{n} items to return, use -n for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties.
#' @details This package also has a vignette available about this function, run: \code{browseVignettes("AMR")} to read it.
#'
#' For numeric values of any class, these additional values will be calculated and shown into the header:
2018-04-18 12:24:54 +02:00
#' \itemize{
#' \item{Mean, using \code{\link[base]{mean}}}
#' \item{Standard deviation, using \code{\link[stats]{sd}}}
#' \item{Five numbers of Tukey (min, Q1, median, Q3, max), using \code{\link[stats]{fivenum}}}
2018-05-09 11:44:46 +02:00
#' \item{Outliers (total count and unique count), using \code{\link{boxplot.stats}}}
2018-04-18 12:24:54 +02:00
#' \item{Coefficient of variation (CV), the standard deviation divided by the mean}
#' \item{Coefficient of quartile variation (CQV, sometimes called coefficient of dispersion), calculated as \code{(Q3 - Q1) / (Q3 + Q1)} using \code{\link{quantile}} with \code{type = 6} as quantile algorithm to comply with SPSS standards}
#' }
2018-06-20 14:47:37 +02:00
#'
#' For dates and times of any class, these additional values will be calculated and shown into the header:
#' \itemize{
#' \item{Oldest, using \code{\link[base]{min}}}
#' \item{Newest, using \code{\link[base]{max}}, with difference between newest and oldest}
#' \item{Median, using \code{\link[stats]{median}}, with percentage since oldest}
#' }
#'
#' The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will include more than \code{n} rows if there are ties.
2018-04-18 12:24:54 +02:00
#' @importFrom stats fivenum sd quantile
#' @importFrom grDevices boxplot.stats
#' @importFrom dplyr %>% select pull n_distinct group_by arrange desc mutate summarise
2018-06-20 14:47:37 +02:00
#' @importFrom utils browseVignettes
2018-04-18 12:24:54 +02:00
#' @keywords summary summarise frequency freq
#' @rdname freq
2018-06-20 14:47:37 +02:00
#' @return \itemize{
#' \item{When using \code{as.data.frame = FALSE} (default): only printed text}
#' \item{When using \code{as.data.frame = TRUE}: a \code{data.frame} object with an additional class \code{"frequency_tbl"}}
#' }
2018-04-18 12:24:54 +02:00
#' @export
#' @examples
#' library(dplyr)
#'
#' freq(septic_patients$hospital_id)
#'
#' septic_patients %>%
#' filter(hospital_id == "A") %>%
#' select(bactid) %>%
#' freq()
#'
#' # select multiple columns; they will be pasted together
#' septic_patients %>%
#' left_join_microorganisms %>%
#' filter(hospital_id == "A") %>%
#' select(genus, species) %>%
#' freq()
#'
#' # save frequency table to an object
#' years <- septic_patients %>%
#' mutate(year = format(date, "%Y")) %>%
#' select(year) %>%
2018-05-09 11:44:46 +02:00
#' freq(as.data.frame = TRUE)
2018-06-20 14:47:37 +02:00
#'
#' # get top 10 bugs of hospital A as a vector
#' septic_patients %>%
#' filter(hospital_id == "A") %>%
#' select(bactid) %>%
#' freq(as.data.frame = TRUE) %>%
#' top_freq(10)
2018-04-18 12:24:54 +02:00
freq <- function ( x ,
sort.count = TRUE ,
2018-05-09 11:44:46 +02:00
nmax = getOption ( " max.print.freq" ) ,
2018-04-18 12:24:54 +02:00
na.rm = TRUE ,
2018-06-19 15:20:14 +02:00
row.names = TRUE ,
2018-04-18 12:24:54 +02:00
markdown = FALSE ,
2018-05-09 11:44:46 +02:00
as.data.frame = FALSE ,
2018-04-18 12:24:54 +02:00
digits = 2 ,
sep = " " ) {
mult.columns <- 0
if ( NROW ( x ) == 0 ) {
cat ( ' \nNo observations.\n' )
return ( invisible ( ) )
}
if ( ! is.null ( ncol ( x ) ) ) {
if ( ncol ( x ) == 1 & any ( class ( x ) == ' data.frame' ) ) {
x <- x %>% pull ( 1 )
} else if ( ncol ( x ) < 10 ) {
mult.columns <- ncol ( x )
colnames ( x ) <- LETTERS [1 : ncol ( x ) ]
if ( ncol ( x ) == 2 ) {
x $ total <- paste ( x $ A %>% as.character ( ) ,
x $ B %>% as.character ( ) ,
sep = sep )
} else if ( ncol ( x ) == 3 ) {
x $ total <- paste ( x $ A %>% as.character ( ) ,
x $ B %>% as.character ( ) ,
x $ C %>% as.character ( ) ,
sep = sep )
} else if ( ncol ( x ) == 4 ) {
x $ total <- paste ( x $ A %>% as.character ( ) ,
x $ B %>% as.character ( ) ,
x $ C %>% as.character ( ) ,
x $ D %>% as.character ( ) ,
sep = sep )
} else if ( ncol ( x ) == 5 ) {
x $ total <- paste ( x $ A %>% as.character ( ) ,
x $ B %>% as.character ( ) ,
x $ C %>% as.character ( ) ,
x $ D %>% as.character ( ) ,
x $ E %>% as.character ( ) ,
sep = sep )
} else if ( ncol ( x ) == 6 ) {
x $ total <- paste ( x $ A %>% as.character ( ) ,
x $ B %>% as.character ( ) ,
x $ C %>% as.character ( ) ,
x $ D %>% as.character ( ) ,
x $ E %>% as.character ( ) ,
x $ F %>% as.character ( ) ,
sep = sep )
} else if ( ncol ( x ) == 7 ) {
x $ total <- paste ( x $ A %>% as.character ( ) ,
x $ B %>% as.character ( ) ,
x $ C %>% as.character ( ) ,
x $ D %>% as.character ( ) ,
x $ E %>% as.character ( ) ,
x $ F %>% as.character ( ) ,
x $ G %>% as.character ( ) ,
sep = sep )
} else if ( ncol ( x ) == 8 ) {
x $ total <- paste ( x $ A %>% as.character ( ) ,
x $ B %>% as.character ( ) ,
x $ C %>% as.character ( ) ,
x $ D %>% as.character ( ) ,
x $ E %>% as.character ( ) ,
x $ F %>% as.character ( ) ,
x $ G %>% as.character ( ) ,
x $ H %>% as.character ( ) ,
sep = sep )
} else if ( ncol ( x ) == 9 ) {
x $ total <- paste ( x $ A %>% as.character ( ) ,
x $ B %>% as.character ( ) ,
x $ C %>% as.character ( ) ,
x $ D %>% as.character ( ) ,
x $ E %>% as.character ( ) ,
x $ F %>% as.character ( ) ,
x $ G %>% as.character ( ) ,
x $ H %>% as.character ( ) ,
x $ I %>% as.character ( ) ,
sep = sep )
}
x <- x $ total
} else {
stop ( ' A maximum of 9 columns can be analysed at the same time.' , call. = FALSE )
}
}
2018-05-09 11:44:46 +02:00
if ( markdown == TRUE & as.data.frame == TRUE ) {
warning ( ' `as.data.frame = TRUE` will be ignored when `markdown = TRUE`.' )
2018-04-18 12:24:54 +02:00
}
if ( mult.columns > 1 ) {
2018-04-19 14:10:57 +02:00
NAs <- x [is.na ( x ) | x == trimws ( strrep ( ' NA ' , mult.columns ) ) ]
2018-04-18 12:24:54 +02:00
} else {
NAs <- x [is.na ( x ) ]
}
if ( na.rm == TRUE ) {
x <- x [ ! x %in% NAs ]
}
if ( missing ( sort.count ) & any ( class ( x ) %in% c ( ' double' , ' integer' , ' numeric' , ' raw' , ' single' , ' factor' ) ) ) {
# sort on item/level at default when x is numeric or a factor and sort.count is not set
sort.count <- FALSE
}
header <- character ( 0 )
markdown_line <- ' '
if ( markdown == TRUE ) {
markdown_line <- ' \n'
}
x_align <- ' l'
if ( mult.columns > 0 ) {
header <- header %>% paste0 ( markdown_line , ' Columns: ' , mult.columns )
} else {
header <- header %>% paste0 ( markdown_line , ' Class: ' , class ( x ) %>% rev ( ) %>% paste ( collapse = " > " ) )
}
if ( is.list ( x ) | is.matrix ( x ) | is.environment ( x ) | is.function ( x ) ) {
cat ( header , " \n" )
stop ( ' `freq()` does not support lists, matrices, environments or functions.' , call. = FALSE )
}
header <- header %>% paste0 ( markdown_line , ' \nLength: ' , ( NAs %>% length ( ) + x %>% length ( ) ) %>% format ( ) ,
' (of which NA: ' , NAs %>% length ( ) %>% format ( ) ,
' = ' , ( NAs %>% length ( ) / ( NAs %>% length ( ) + x %>% length ( ) ) ) %>% percent ( force_zero = TRUE ) , ' )' )
header <- header %>% paste0 ( markdown_line , ' \nUnique: ' , x %>% n_distinct ( ) %>% format ( ) )
if ( any ( class ( x ) %in% c ( ' double' , ' integer' , ' numeric' , ' raw' , ' single' ) ) ) {
# right align number
x_align <- ' r'
header <- header %>% paste0 ( ' \n' )
header <- header %>% paste ( markdown_line , ' \nMean: ' , x %>% base :: mean ( na.rm = TRUE ) %>% format ( digits = digits ) )
header <- header %>% paste0 ( markdown_line , ' \nStd. dev.: ' , x %>% stats :: sd ( na.rm = TRUE ) %>% format ( digits = digits ) ,
' (CV: ' , x %>% cv ( na.rm = TRUE ) %>% format ( digits = digits ) , ' )' )
header <- header %>% paste0 ( markdown_line , ' \nFive-Num: ' , x %>% stats :: fivenum ( na.rm = TRUE ) %>% format ( digits = digits ) %>% trimws ( ) %>% paste ( collapse = ' | ' ) ,
' (CQV: ' , x %>% cqv ( na.rm = TRUE ) %>% format ( digits = digits ) , ' )' )
outlier_length <- length ( boxplot.stats ( x ) $ out )
header <- header %>% paste0 ( markdown_line , ' \nOutliers: ' , outlier_length )
if ( outlier_length > 0 ) {
header <- header %>% paste0 ( ' (unique: ' , boxplot.stats ( x ) $ out %>% unique ( ) %>% length ( ) , ' )' )
}
}
formatdates <- " %e %B %Y" # = d mmmm yyyy
if ( any ( class ( x ) == ' hms' ) ) {
x <- x %>% as.POSIXlt ( )
formatdates <- " %H:%M:%S"
}
if ( any ( class ( x ) %in% c ( ' Date' , ' POSIXct' , ' POSIXlt' ) ) ) {
header <- header %>% paste0 ( ' \n' )
2018-06-19 15:20:14 +02:00
mindate <- x %>% min ( na.rm = TRUE )
maxdate <- x %>% max ( na.rm = TRUE )
2018-06-20 14:47:37 +02:00
maxdate_days <- difftime ( maxdate , mindate , units = ' auto' ) %>% as.double ( )
2018-06-19 15:20:14 +02:00
mediandate <- x %>% median ( na.rm = TRUE )
2018-06-20 14:47:37 +02:00
median_days <- difftime ( mediandate , mindate , units = ' auto' ) %>% as.double ( )
2018-06-19 15:20:14 +02:00
header <- header %>% paste0 ( markdown_line , ' \nOldest: ' , mindate %>% format ( formatdates ) %>% trimws ( ) )
header <- header %>% paste0 ( markdown_line , ' \nNewest: ' , maxdate %>% format ( formatdates ) %>% trimws ( ) ,
' (+' , difftime ( maxdate , mindate , units = ' auto' ) %>% as.double ( ) %>% format ( ) , ' )' )
2018-06-20 14:47:37 +02:00
header <- header %>% paste0 ( markdown_line , ' \nMedian: ' , mediandate %>% format ( formatdates ) %>% trimws ( ) ,
' (~' , percent ( median_days / maxdate_days , round = 0 ) , ' )' )
2018-04-18 12:24:54 +02:00
}
if ( any ( class ( x ) == ' POSIXlt' ) ) {
x <- x %>% format ( formatdates )
}
2018-05-09 11:44:46 +02:00
if ( as.data.frame == FALSE ) {
2018-04-18 12:24:54 +02:00
cat ( header )
}
if ( all ( is.na ( x ) ) ) {
cat ( ' \n\nNo observations.\n' )
return ( invisible ( ) )
}
if ( n_distinct ( x ) == length ( x ) ) {
warning ( ' All observations are unique.' , call. = FALSE )
}
2018-05-09 11:44:46 +02:00
nmax.set <- ! missing ( nmax )
if ( is.null ( nmax ) & is.null ( base :: getOption ( " max.print.freq" , default = NULL ) ) ) {
# default for max print setting
nmax <- 15
}
if ( nmax == 0 | is.na ( nmax ) | is.null ( nmax ) ) {
2018-04-18 12:24:54 +02:00
nmax <- length ( x )
}
nmax.1 <- min ( length ( x ) , nmax + 1 )
# create table with counts and percentages
2018-05-09 11:44:46 +02:00
column_names <- c ( ' Item' , ' Count' , ' Percent' , ' Cum. Count' , ' Cum. Percent' , ' (Factor Level)' )
column_names_df <- c ( ' item' , ' count' , ' percent' , ' cum_count' , ' cum_percent' , ' factor_level' )
2018-04-18 12:24:54 +02:00
if ( any ( class ( x ) == ' factor' ) ) {
df <- tibble :: tibble ( Item = x ,
Fctlvl = x %>% as.integer ( ) ) %>%
group_by ( Item , Fctlvl )
column_align <- c ( ' l' , ' r' , ' r' , ' r' , ' r' , ' r' )
} else {
df <- tibble :: tibble ( Item = x ) %>%
group_by ( Item )
2018-06-19 15:20:14 +02:00
# strip factor lvl from col names
column_names <- column_names [1 : length ( column_names ) - 1 ]
column_names_df <- column_names_df [1 : length ( column_names_df ) - 1 ]
2018-04-18 12:24:54 +02:00
column_align <- c ( x_align , ' r' , ' r' , ' r' , ' r' )
}
df <- df %>%
summarise ( Count = n ( ) ,
Percent = ( n ( ) / length ( x ) ) %>% percent ( force_zero = TRUE ) )
if ( df $ Item %>% paste ( collapse = ' ,' ) %like% ' \033' ) {
df <- df %>%
mutate ( Item = Item %>%
# remove escape char
# see https://en.wikipedia.org/wiki/Escape_character#ASCII_escape_character
gsub ( ' \033' , ' ' , ., fixed = TRUE ) )
}
# sort according to setting
if ( sort.count == TRUE ) {
2018-05-09 11:44:46 +02:00
df <- df %>% arrange ( desc ( Count ) , Item )
2018-04-18 12:24:54 +02:00
} else {
if ( any ( class ( x ) == ' factor' ) ) {
2018-05-09 11:44:46 +02:00
df <- df %>% arrange ( Fctlvl , Item )
2018-04-18 12:24:54 +02:00
} else {
df <- df %>% arrange ( Item )
}
}
# add cumulative values
df $ Cum <- cumsum ( df $ Count )
df $ CumTot <- ( df $ Cum / sum ( df $ Count , na.rm = TRUE ) ) %>% percent ( force_zero = TRUE )
df $ Cum <- df $ Cum %>% format ( )
if ( any ( class ( x ) == ' factor' ) ) {
# put factor last
df <- df %>% select ( Item , Count , Percent , Cum , CumTot , Fctlvl )
}
2018-05-09 11:44:46 +02:00
if ( as.data.frame == TRUE ) {
2018-04-18 12:24:54 +02:00
# assign to object
df [ , 3 ] <- df [ , 2 ] / sum ( df [ , 2 ] , na.rm = TRUE )
df [ , 4 ] <- cumsum ( df [ , 2 ] )
df [ , 5 ] <- df [ , 4 ] / sum ( df [ , 2 ] , na.rm = TRUE )
2018-05-09 11:44:46 +02:00
colnames ( df ) <- column_names_df
2018-06-20 14:47:37 +02:00
df <- as.data.frame ( df , stringsAsFactors = FALSE )
class ( df ) <- c ( ' frequency_tbl' , class ( df ) )
return ( df )
2018-05-09 11:44:46 +02:00
}
2018-04-18 12:24:54 +02:00
2018-05-09 11:44:46 +02:00
if ( markdown == TRUE ) {
tblformat <- ' markdown'
2018-04-18 12:24:54 +02:00
} else {
2018-05-09 11:44:46 +02:00
tblformat <- ' pandoc'
}
2018-04-18 12:24:54 +02:00
2018-05-09 11:44:46 +02:00
# save old NA setting for kable
opt.old <- options ( ) $ knitr.kable.NA
options ( knitr.kable.NA = " <NA>" )
2018-04-18 12:24:54 +02:00
2018-05-09 11:44:46 +02:00
Count.rest <- sum ( df [nmax.1 : nrow ( df ) , ' Count' ] , na.rm = TRUE )
if ( any ( class ( x ) %in% c ( ' double' , ' integer' , ' numeric' , ' raw' , ' single' ) ) ) {
df <- df %>% mutate ( Item = format ( Item ) )
}
df <- df %>% mutate ( Count = format ( Count ) )
if ( nrow ( df ) > nmax.1 & markdown == FALSE ) {
df2 <- df [1 : nmax , ]
print (
knitr :: kable ( df2 ,
format = tblformat ,
2018-06-19 15:20:14 +02:00
row.names = row.names ,
2018-05-09 11:44:46 +02:00
col.names = column_names ,
align = column_align ,
padding = 1 )
)
2018-05-22 16:34:22 +02:00
if ( nmax.set == TRUE ) {
cat ( ' [ reached `nmax = ' , nmax , ' `' , sep = ' ' )
} else {
cat ( ' [ reached getOption("max.print.freq")' )
2018-04-18 12:24:54 +02:00
}
2018-05-22 16:34:22 +02:00
cat ( ' -- omitted ' ,
format ( nrow ( df ) - nmax ) ,
' entries, n = ' ,
format ( Count.rest ) ,
' (' ,
( Count.rest / length ( x ) ) %>% percent ( force_zero = TRUE ) ,
' ) ]\n' , sep = ' ' )
2018-04-18 12:24:54 +02:00
2018-05-09 11:44:46 +02:00
} else {
print (
knitr :: kable ( df ,
format = tblformat ,
2018-06-19 15:20:14 +02:00
row.names = row.names ,
2018-05-09 11:44:46 +02:00
col.names = column_names ,
align = column_align ,
padding = 1 )
)
2018-04-18 12:24:54 +02:00
}
2018-05-09 11:44:46 +02:00
cat ( ' \n' )
# reset old kable setting
options ( knitr.kable.NA = opt.old )
return ( invisible ( ) )
2018-04-18 12:24:54 +02:00
}
#' @rdname freq
#' @export
frequency_tbl <- freq
2018-06-20 14:47:37 +02:00
#' @rdname freq
#' @export
#' @importFrom dplyr top_n pull
top_freq <- function ( f , n ) {
if ( ! ' frequency_tbl' %in% class ( f ) ) {
stop ( ' top_freq can only be applied to frequency tables' , call. = FALSE )
}
if ( ! is.numeric ( n ) | length ( n ) != 1L ) {
stop ( ' For top_freq, `nmax` must be a number of length 1' , call. = FALSE )
}
top <- f %>% top_n ( n , count )
vect <- top %>% pull ( item )
names ( vect ) <- top %>% pull ( count )
if ( length ( vect ) > abs ( n ) ) {
message ( " top_freq: selecting " , length ( vect ) , " items instead of " , abs ( n ) , " , because of ties" )
}
vect
}