2018-04-18 12:24:54 +02:00
# ==================================================================== #
# TITLE #
# Antimicrobial Resistance (AMR) Analysis #
# #
# AUTHORS #
# Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) #
# #
# LICENCE #
# This program is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License version 2.0, #
# as published by the Free Software Foundation. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# ==================================================================== #
#' Frequency table
#'
2018-07-01 21:40:37 +02:00
#' Create a frequency table of a vector with items or a data frame. Supports quasiquotation and markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names.
2018-07-09 14:02:58 +02:00
#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}
#' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples
2018-07-03 11:30:40 +02:00
#' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.
#' @param nmax number of rows to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.
2018-10-23 09:42:26 +02:00
#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the amount of \code{NA}s.
2018-06-19 15:20:14 +02:00
#' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)}
2018-10-22 12:32:59 +02:00
#' @param markdown a logical value indicating whether the frequency table should be printed in markdown format. This will print all rows and is default behaviour in non-interactive R sessions (like when knitting RMarkdown files).
#' @param digits how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")})
2018-09-10 15:45:25 +02:00
#' @param quote a logical value indicating whether or not strings should be printed with surrounding quotes
2018-10-19 21:52:08 +02:00
#' @param header a logical value indicating whether an informative header should be printed
2018-10-23 09:42:26 +02:00
#' @param na a character string that should be used to show empty (\code{NA}) values (only useful when \code{na.rm = FALSE})
2018-04-18 12:24:54 +02:00
#' @param sep a character string to separate the terms when selecting multiple columns
2018-07-01 21:40:37 +02:00
#' @param f a frequency table
2018-06-20 14:47:37 +02:00
#' @param n number of top \emph{n} items to return, use -n for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties.
2018-07-03 11:30:40 +02:00
#' @details Frequency tables (or frequency distributions) are summaries of the distribution of values in a sample. With the \code{freq} function, you can create univariate frequency tables. Multiple variables will be pasted into one variable, so it forces a univariate distribution. This package also has a vignette available that explains the use of this function further; run \code{browseVignettes("AMR")} to read it.
2018-06-20 14:47:37 +02:00
#'
2018-10-22 12:32:59 +02:00
#' For numeric values of any class, these additional values will all be calculated with \code{na.rm = TRUE} and shown in the header:
2018-04-18 12:24:54 +02:00
#' \itemize{
#' \item{Mean, using \code{\link[base]{mean}}}
2018-07-03 11:30:40 +02:00
#' \item{Standard Deviation, using \code{\link[stats]{sd}}}
#' \item{Coefficient of Variation (CV), the standard deviation divided by the mean}
#' \item{Mean Absolute Deviation (MAD), using \code{\link[stats]{mad}}}
#' \item{Tukey Five-Number Summaries (minimum, Q1, median, Q3, maximum), using \code{\link[stats]{fivenum}}}
#' \item{Interquartile Range (IQR) calculated as \code{Q3 - Q1} using the Tukey Five-Number Summaries, i.e. \strong{not} using the \code{\link[stats]{quantile}} function}
#' \item{Coefficient of Quartile Variation (CQV, sometimes called coefficient of dispersion), calculated as \code{(Q3 - Q1) / (Q3 + Q1)} using the Tukey Five-Number Summaries}
#' \item{Outliers (total count and unique count), using \code{\link[grDevices]{boxplot.stats}}}
2018-04-18 12:24:54 +02:00
#' }
2018-06-20 14:47:37 +02:00
#'
2018-10-22 12:32:59 +02:00
#' For dates and times of any class, these additional values will be calculated with \code{na.rm = TRUE} and shown in the header:
2018-06-20 14:47:37 +02:00
#' \itemize{
2018-07-08 22:14:55 +02:00
#' \item{Oldest, using \code{\link{min}}}
#' \item{Newest, using \code{\link{max}}, with difference between newest and oldest}
2018-06-20 14:47:37 +02:00
#' \item{Median, using \code{\link[stats]{median}}, with percentage since oldest}
#' }
#'
2018-07-03 11:30:40 +02:00
#'
2018-06-20 14:47:37 +02:00
#' The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will include more than \code{n} rows if there are ties.
2018-07-03 11:30:40 +02:00
#' @importFrom stats fivenum sd mad
2018-04-18 12:24:54 +02:00
#' @importFrom grDevices boxplot.stats
2018-08-23 00:40:36 +02:00
#' @importFrom dplyr %>% select pull n_distinct group_by arrange desc mutate summarise n_distinct tibble
2018-07-09 14:02:58 +02:00
#' @importFrom utils browseVignettes installed.packages
2018-10-09 13:53:33 +02:00
#' @importFrom hms is.hms
2018-10-18 12:10:10 +02:00
#' @importFrom crayon red silver
2018-04-18 12:24:54 +02:00
#' @keywords summary summarise frequency freq
#' @rdname freq
2018-07-01 21:40:37 +02:00
#' @name freq
#' @return A \code{data.frame} with an additional class \code{"frequency_tbl"}
2018-04-18 12:24:54 +02:00
#' @export
#' @examples
#' library(dplyr)
#'
2018-07-01 21:40:37 +02:00
#' # this all gives the same result:
2018-04-18 12:24:54 +02:00
#' freq(septic_patients$hospital_id)
2018-07-01 21:40:37 +02:00
#' freq(septic_patients[, "hospital_id"])
#' septic_patients$hospital_id %>% freq()
#' septic_patients[, "hospital_id"] %>% freq()
#' septic_patients %>% freq("hospital_id")
2018-07-09 14:02:58 +02:00
#' septic_patients %>% freq(hospital_id) #<- easiest to remember when you're used to tidyverse
2018-04-18 12:24:54 +02:00
#'
2018-07-09 14:02:58 +02:00
#' # you could also use `select` or `pull` to get your variables
2018-04-18 12:24:54 +02:00
#' septic_patients %>%
#' filter(hospital_id == "A") %>%
2018-08-31 13:36:19 +02:00
#' select(mo) %>%
2018-04-18 12:24:54 +02:00
#' freq()
#'
2018-07-09 14:02:58 +02:00
#' # multiple selected variables will be pasted together
2018-04-18 12:24:54 +02:00
#' septic_patients %>%
#' left_join_microorganisms %>%
#' filter(hospital_id == "A") %>%
2018-07-01 21:40:37 +02:00
#' freq(genus, species)
2018-04-18 12:24:54 +02:00
#'
2018-07-03 11:30:40 +02:00
#' # get top 10 bugs of hospital A as a vector
#' septic_patients %>%
#' filter(hospital_id == "A") %>%
2018-08-31 13:36:19 +02:00
#' freq(mo) %>%
2018-07-03 11:30:40 +02:00
#' top_freq(10)
#'
2018-04-18 12:24:54 +02:00
#' # save frequency table to an object
#' years <- septic_patients %>%
#' mutate(year = format(date, "%Y")) %>%
2018-07-01 21:40:37 +02:00
#' freq(year)
2018-06-20 14:47:37 +02:00
#'
2018-07-09 14:02:58 +02:00
#' # show only the top 5
2018-07-03 11:30:40 +02:00
#' years %>% print(nmax = 5)
#'
2018-07-16 16:41:48 +02:00
#' # save to an object with formatted percentages
#' years <- format(years)
#'
2018-07-09 14:02:58 +02:00
#' # print a histogram of numeric values
#' septic_patients %>%
#' freq(age) %>%
2018-10-01 11:39:43 +02:00
#' hist()
2018-07-09 14:02:58 +02:00
#'
#' # or print all points to a regular plot
#' septic_patients %>%
#' freq(age) %>%
#' plot()
#'
#' # transform to a data.frame or tibble
2018-06-20 14:47:37 +02:00
#' septic_patients %>%
2018-07-03 11:30:40 +02:00
#' freq(age) %>%
#' as.data.frame()
2018-07-09 14:02:58 +02:00
#'
#' # or transform (back) to a vector
#' septic_patients %>%
#' freq(age) %>%
#' as.vector()
#'
#' identical(septic_patients %>%
#' freq(age) %>%
#' as.vector() %>%
#' sort(),
2018-07-16 16:41:48 +02:00
#' sort(septic_patients$age)) # TRUE
2018-07-09 14:02:58 +02:00
#'
2018-07-16 16:41:48 +02:00
#' # it also supports `table` objects:
2018-09-29 21:54:32 +02:00
#' table(septic_patients$gender,
2018-07-09 14:02:58 +02:00
#' septic_patients$age) %>%
2018-07-16 16:41:48 +02:00
#' freq(sep = " **sep** ")
#'
2018-10-01 11:39:43 +02:00
#' # check differences between frequency tables
#' diff(freq(septic_patients$trim),
#' freq(septic_patients$trsu))
2018-07-01 21:40:37 +02:00
frequency_tbl <- function ( x ,
... ,
sort.count = TRUE ,
nmax = getOption ( " max.print.freq" ) ,
na.rm = TRUE ,
row.names = TRUE ,
2018-10-22 12:32:59 +02:00
markdown = ! interactive ( ) ,
2018-07-01 21:40:37 +02:00
digits = 2 ,
2018-09-10 15:45:25 +02:00
quote = FALSE ,
2018-10-19 21:52:08 +02:00
header = ! markdown ,
2018-10-23 09:42:26 +02:00
na = " <NA>" ,
2018-07-01 21:40:37 +02:00
sep = " " ) {
# Builds a frequency table of `x` with the columns item, count, percent,
# cum_count and cum_percent, and returns it through structure() with an extra
# class and an `opt` attribute that carries the print settings (header text,
# column names/alignment, table format, nmax, ...).
# NOTE(review): the date-time lines interleaved with the code and the space
# after the opening quote of the string literals look like extraction residue;
# they are left untouched here - confirm against the upstream file.
#
# mult.columns: number of columns that were pasted together into one item
# x.name / cols: description of the data source and the selected columns,
# stored in `opt` (as `data` and `vars`) for use when printing
2018-07-10 12:27:07 +02:00
mult.columns <- 0
2018-08-24 11:08:20 +02:00
x.name <- NULL
cols <- NULL
# lists and matrices are first converted to a data.frame
if ( any ( class ( x ) == ' list' ) ) {
cols <- names ( x )
x <- as.data.frame ( x , stringsAsFactors = FALSE )
x.name <- " a list"
} else if ( any ( class ( x ) == ' matrix' ) ) {
x <- as.data.frame ( x , stringsAsFactors = FALSE )
x.name <- " a matrix"
cols <- colnames ( x )
# column names that all match the pattern below are discarded
if ( all ( cols %like% ' V[0-9]' ) ) {
cols <- NULL
}
}
2018-07-01 21:40:37 +02:00
# data.frame input: the expressions passed through `...` are captured
# unevaluated and used as column names to select from x
if ( any ( class ( x ) == ' data.frame' ) ) {
2018-08-24 11:08:20 +02:00
if ( is.null ( x.name ) ) {
x.name <- deparse ( substitute ( x ) )
}
2018-07-01 21:40:37 +02:00
# NOTE(review): presumably meant to drop the pipe placeholder name - confirm
if ( x.name == " ." ) {
x.name <- NULL
}
2018-07-01 22:23:34 +02:00
dots <- base :: eval ( base :: substitute ( base :: alist ( ... ) ) )
2018-07-01 21:40:37 +02:00
ndots <- length ( dots )
2018-08-24 11:08:20 +02:00
if ( ndots < 10 ) {
2018-07-01 21:40:37 +02:00
cols <- as.character ( dots )
2018-07-01 22:23:34 +02:00
if ( ! all ( cols %in% colnames ( x ) ) ) {
stop ( " one or more columns not found: `" , paste ( cols , collapse = " `, `" ) , ' `' , call. = FALSE )
}
2018-08-24 11:08:20 +02:00
if ( length ( cols ) > 0 ) {
x <- x [ , cols ]
}
2018-07-01 21:40:37 +02:00
} else if ( ndots >= 10 ) {
stop ( ' A maximum of 9 columns can be analysed at the same time.' , call. = FALSE )
} else {
# NOTE(review): not reachable, the two conditions above cover every ndots
cols <- NULL
}
2018-07-09 14:02:58 +02:00
} else if ( any ( class ( x ) == ' table' ) ) {
2018-10-22 12:32:59 +02:00
x <- as.data.frame ( x , stringsAsFactors = FALSE )
# now this DF contains 3 columns: the 2 vars and a Freq column
# paste the first 2 cols and repeat them Freq times:
x <- rep ( x = do.call ( paste , c ( x [colnames ( x ) [1 : 2 ] ] , sep = sep ) ) ,
2018-10-23 09:42:26 +02:00
times = x $ Freq )
2018-07-10 12:27:07 +02:00
x.name <- " a `table` object"
2018-07-09 14:02:58 +02:00
cols <- NULL
2018-10-22 12:32:59 +02:00
#mult.columns <- 2
2018-07-01 21:40:37 +02:00
} else {
x.name <- NULL
cols <- NULL
}
2018-04-18 12:24:54 +02:00
# x still has columns: a one-column data.frame is pulled out as a vector,
# fewer than ten columns are pasted into one vector using `sep`
if ( ! is.null ( ncol ( x ) ) ) {
if ( ncol ( x ) == 1 & any ( class ( x ) == ' data.frame' ) ) {
x <- x %>% pull ( 1 )
} else if ( ncol ( x ) < 10 ) {
mult.columns <- ncol ( x )
2018-10-22 12:32:59 +02:00
x <- do.call ( paste , c ( x [colnames ( x ) ] , sep = sep ) )
2018-04-18 12:24:54 +02:00
} else {
stop ( ' A maximum of 9 columns can be analysed at the same time.' , call. = FALSE )
}
}
# collect the missing values; their number is reported in the header
# NOTE(review): for pasted columns the all-NA item is rebuilt with strrep(),
# which does not use `sep` - looks like it assumes the default sep, confirm
if ( mult.columns > 1 ) {
2018-04-19 14:10:57 +02:00
NAs <- x [is.na ( x ) | x == trimws ( strrep ( ' NA ' , mult.columns ) ) ]
2018-04-18 12:24:54 +02:00
} else {
NAs <- x [is.na ( x ) ]
}
2018-07-23 14:14:03 +02:00
2018-04-18 12:24:54 +02:00
# remove the missing values and put the class of x back afterwards
if ( na.rm == TRUE ) {
2018-07-23 14:14:03 +02:00
x_class <- class ( x )
2018-04-18 12:24:54 +02:00
x <- x [ ! x %in% NAs ]
2018-07-23 14:14:03 +02:00
class ( x ) <- x_class
2018-04-18 12:24:54 +02:00
}
2018-10-23 09:42:26 +02:00
# this branch does nothing: the warning inside is commented out
if ( sort.count == FALSE & ' factor' %in% class ( x ) ) {
# warning("Sorting a factor sorts on factor level, not necessarily alphabetically.", call. = FALSE)
2018-04-18 12:24:54 +02:00
}
2018-10-19 21:52:08 +02:00
# header_txt accumulates the header text; markdown_line is put in front of
# every header line and differs between markdown and regular output
header_txt <- character ( 0 )
2018-04-18 12:24:54 +02:00
markdown_line <- ' '
if ( markdown == TRUE ) {
markdown_line <- ' \n'
}
# alignment of the item column, changed to 'r' further down for numeric x
x_align <- ' l'
if ( mult.columns > 0 ) {
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( markdown_line , ' Columns: ' , mult.columns )
2018-04-18 12:24:54 +02:00
} else {
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( markdown_line , ' Class: ' , class ( x ) %>% rev ( ) %>% paste ( collapse = " > " ) )
2018-09-17 09:42:09 +02:00
# also show mode(x) when it is not already one of the class names
if ( ! mode ( x ) %in% class ( x ) ) {
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( silver ( paste0 ( " (" , mode ( x ) , " )" ) ) )
2018-10-18 12:10:10 +02:00
}
}
# helper: passes x through red() unless it equals one of the two strings
# listed below, in which case x is returned unchanged
NAs_to_red <- function ( x ) {
if ( ! x %in% c ( " 0" , " 0.00%" ) ) {
red ( x )
} else {
x
2018-09-17 09:42:09 +02:00
}
2018-04-18 12:24:54 +02:00
}
2018-10-19 21:52:08 +02:00
# total length (missing + remaining values) with the count and share of NAs;
# the sub() call replaces a 'NaN' in the formatted percentage
header_txt <- header_txt %>% paste0 ( markdown_line , ' \nLength: ' , ( NAs %>% length ( ) + x %>% length ( ) ) %>% format ( ) ,
2018-10-23 09:42:26 +02:00
' (of which NA: ' , NAs %>% length ( ) %>% format ( ) %>% NAs_to_red ( ) ,
' = ' , ( NAs %>% length ( ) / ( NAs %>% length ( ) + x %>% length ( ) ) ) %>%
percent ( force_zero = TRUE , round = digits ) %>%
sub ( ' NaN' , ' 0' , ., fixed = TRUE ) %>%
NAs_to_red ( ) , ' )' )
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( markdown_line , ' \nUnique: ' , x %>% n_distinct ( ) %>% format ( ) )
2018-04-18 12:24:54 +02:00
2018-09-17 09:42:09 +02:00
# character input: shortest and longest nchar()
if ( NROW ( x ) > 0 & any ( class ( x ) == " character" ) ) {
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( ' \n' )
header_txt <- header_txt %>% paste0 ( markdown_line , ' \nShortest: ' , x %>% base :: nchar ( ) %>% base :: min ( na.rm = TRUE ) )
header_txt <- header_txt %>% paste0 ( markdown_line , ' \nLongest: ' , x %>% base :: nchar ( ) %>% base :: max ( na.rm = TRUE ) )
2018-09-17 09:42:09 +02:00
}
2018-10-12 16:35:18 +02:00
# difftime input: report the units attribute, then convert x to double
if ( NROW ( x ) > 0 & any ( class ( x ) == " difftime" ) ) {
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( ' \n' )
header_txt <- header_txt %>% paste ( markdown_line , ' \nUnits: ' , attributes ( x ) $ units )
2018-10-12 16:35:18 +02:00
x <- as.double ( x )
2018-10-19 21:52:08 +02:00
# after this, the numeric header_txt continues
2018-10-12 16:35:18 +02:00
}
2018-07-03 11:30:40 +02:00
# numeric input: mean, sd (with cv() and mad), fivenum (with IQR computed as
# 4th minus 2nd value, and cqv()) and the number of boxplot.stats() outliers
if ( NROW ( x ) > 0 & any ( class ( x ) %in% c ( ' double' , ' integer' , ' numeric' , ' raw' , ' single' ) ) ) {
2018-04-18 12:24:54 +02:00
# right align number
2018-07-03 11:30:40 +02:00
Tukey_five <- stats :: fivenum ( x , na.rm = TRUE )
2018-04-18 12:24:54 +02:00
x_align <- ' r'
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( ' \n' )
header_txt <- header_txt %>% paste ( markdown_line , ' \nMean: ' , x %>% base :: mean ( na.rm = TRUE ) %>% format ( digits = digits ) )
header_txt <- header_txt %>% paste0 ( markdown_line , ' \nStd. dev.: ' , x %>% stats :: sd ( na.rm = TRUE ) %>% format ( digits = digits ) ,
2018-10-23 09:42:26 +02:00
' (CV: ' , x %>% cv ( na.rm = TRUE ) %>% format ( digits = digits ) ,
' , MAD: ' , x %>% stats :: mad ( na.rm = TRUE ) %>% format ( digits = digits ) , ' )' )
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( markdown_line , ' \nFive-Num: ' , Tukey_five %>% format ( digits = digits ) %>% trimws ( ) %>% paste ( collapse = ' | ' ) ,
2018-10-23 09:42:26 +02:00
' (IQR: ' , ( Tukey_five [4 ] - Tukey_five [2 ] ) %>% format ( digits = digits ) ,
' , CQV: ' , x %>% cqv ( na.rm = TRUE ) %>% format ( digits = digits ) , ' )' )
2018-04-18 12:24:54 +02:00
outlier_length <- length ( boxplot.stats ( x ) $ out )
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( markdown_line , ' \nOutliers: ' , outlier_length )
2018-04-18 12:24:54 +02:00
if ( outlier_length > 0 ) {
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( ' (unique count: ' , boxplot.stats ( x ) $ out %>% n_distinct ( ) , ' )' )
2018-04-18 12:24:54 +02:00
}
}
2018-08-03 09:59:39 +02:00
# rsi input: share of I + R among the non-missing values, and the counts of
# I and R expressed relative to the count of S
# NOTE(review): cnt_S of zero is not guarded in the ratio - confirm intended
if ( NROW ( x ) > 0 & any ( class ( x ) == " rsi" ) ) {
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( ' \n' )
2018-10-23 09:42:26 +02:00
cnt_S <- sum ( x == " S" , na.rm = TRUE )
cnt_I <- sum ( x == " I" , na.rm = TRUE )
cnt_R <- sum ( x == " R" , na.rm = TRUE )
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste ( markdown_line , ' \n%IR: ' ,
2018-10-23 09:42:26 +02:00
( ( cnt_I + cnt_R ) / sum ( ! is.na ( x ) , na.rm = TRUE ) ) %>% percent ( force_zero = TRUE , round = digits ) )
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( markdown_line , ' \nRatio SIR: 1.0 : ' ,
2018-10-23 09:42:26 +02:00
( cnt_I / cnt_S ) %>% format ( digits = 1 , nsmall = 1 ) , " : " ,
( cnt_R / cnt_S ) %>% format ( digits = 1 , nsmall = 1 ) )
2018-08-01 22:37:28 +02:00
}
2018-04-18 12:24:54 +02:00
formatdates <- " %e %B %Y" # = d mmmm yyyy
2018-10-09 13:53:33 +02:00
# hms input is converted to POSIXlt and formatted as a time of day
if ( is.hms ( x ) ) {
2018-04-18 12:24:54 +02:00
x <- x %>% as.POSIXlt ( )
formatdates <- " %H:%M:%S"
}
2018-07-03 11:30:40 +02:00
# date/time input: minimum, maximum (with the difference to the minimum) and
# median (with its position between minimum and maximum as a percentage)
if ( NROW ( x ) > 0 & any ( class ( x ) %in% c ( ' Date' , ' POSIXct' , ' POSIXlt' ) ) ) {
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( ' \n' )
2018-06-19 15:20:14 +02:00
mindate <- x %>% min ( na.rm = TRUE )
maxdate <- x %>% max ( na.rm = TRUE )
2018-06-20 14:47:37 +02:00
maxdate_days <- difftime ( maxdate , mindate , units = ' auto' ) %>% as.double ( )
2018-06-19 15:20:14 +02:00
mediandate <- x %>% median ( na.rm = TRUE )
2018-06-20 14:47:37 +02:00
median_days <- difftime ( mediandate , mindate , units = ' auto' ) %>% as.double ( )
2018-08-23 21:27:15 +02:00
if ( formatdates == " %H:%M:%S" ) {
# hms
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( markdown_line , ' \nEarliest: ' , mindate %>% format ( formatdates ) %>% trimws ( ) )
header_txt <- header_txt %>% paste0 ( markdown_line , ' \nLatest: ' , maxdate %>% format ( formatdates ) %>% trimws ( ) ,
2018-10-23 09:42:26 +02:00
' (+' , difftime ( maxdate , mindate , units = ' mins' ) %>% as.double ( ) %>% format ( digits = digits ) , ' min.)' )
2018-08-23 21:27:15 +02:00
} else {
# other date formats
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( markdown_line , ' \nOldest: ' , mindate %>% format ( formatdates ) %>% trimws ( ) )
header_txt <- header_txt %>% paste0 ( markdown_line , ' \nNewest: ' , maxdate %>% format ( formatdates ) %>% trimws ( ) ,
2018-10-23 09:42:26 +02:00
' (+' , difftime ( maxdate , mindate , units = ' auto' ) %>% as.double ( ) %>% format ( digits = digits ) , ' )' )
2018-08-23 21:27:15 +02:00
}
2018-10-19 21:52:08 +02:00
header_txt <- header_txt %>% paste0 ( markdown_line , ' \nMedian: ' , mediandate %>% format ( formatdates ) %>% trimws ( ) ,
2018-10-23 09:42:26 +02:00
' (~' , percent ( median_days / maxdate_days , round = 0 ) , ' )' )
2018-04-18 12:24:54 +02:00
}
# POSIXlt values are turned into formatted character strings before counting
if ( any ( class ( x ) == ' POSIXlt' ) ) {
x <- x %>% format ( formatdates )
}
2018-05-09 11:44:46 +02:00
# nmax.set records whether the caller supplied nmax explicitly; a NULL nmax
# becomes 15 (not supplied and option unset) or otherwise the length of x
nmax.set <- ! missing ( nmax )
2018-07-01 21:40:37 +02:00
if ( ! nmax.set & is.null ( nmax ) & is.null ( base :: getOption ( " max.print.freq" , default = NULL ) ) ) {
2018-05-09 11:44:46 +02:00
# default for max print setting
nmax <- 15
2018-07-01 21:40:37 +02:00
} else if ( is.null ( nmax ) ) {
nmax <- length ( x )
2018-05-09 11:44:46 +02:00
}
2018-07-03 11:30:40 +02:00
# 0, Inf and NA mean "all rows" (the NULL inside c() has no effect, a NULL
# nmax was already replaced above)
if ( nmax %in% c ( 0 , Inf , NA , NULL ) ) {
2018-04-18 12:24:54 +02:00
nmax <- length ( x )
}
# create table with counts and percentages
2018-10-23 09:42:26 +02:00
# column_names are the printed headings, column_names_df the names in the
# returned data.frame
column_names <- c ( ' Item' , ' Count' , ' Percent' , ' Cum. Count' , ' Cum. Percent' )
column_names_df <- c ( ' item' , ' count' , ' percent' , ' cum_count' , ' cum_percent' )
df <- tibble ( item = x ) %>%
group_by ( item ) %>%
summarise ( count = n ( ) )
column_align <- c ( x_align , ' r' , ' r' , ' r' , ' r' )
2018-04-18 12:24:54 +02:00
2018-07-01 21:40:37 +02:00
if ( df $ item %>% paste ( collapse = ' ,' ) %like% ' \033' ) {
2018-08-24 14:18:38 +02:00
# remove escape char
# see https://en.wikipedia.org/wiki/Escape_character#ASCII_escape_character
df <- df %>% mutate ( item = item %>% gsub ( ' \033' , ' ' , ., fixed = TRUE ) )
2018-04-18 12:24:54 +02:00
}
# sort according to setting
if ( sort.count == TRUE ) {
2018-07-01 21:40:37 +02:00
df <- df %>% arrange ( desc ( count ) , item )
2018-04-18 12:24:54 +02:00
} else {
2018-10-23 09:42:26 +02:00
df <- df %>% arrange ( item )
2018-04-18 12:24:54 +02:00
}
2018-09-10 15:45:25 +02:00
# optionally surround every item with double quotes
if ( quote == TRUE ) {
df $ item <- paste0 ( ' "' , df $ item , ' "' )
}
2018-07-01 21:40:37 +02:00
# percentages are stored as fractions of the total count; they are only
# formatted when the table is printed or formatted
df <- as.data.frame ( df , stringsAsFactors = FALSE )
df $ percent <- df $ count / base :: sum ( df $ count , na.rm = TRUE )
df $ cum_count <- base :: cumsum ( df $ count )
df $ cum_percent <- df $ cum_count / base :: sum ( df $ count , na.rm = TRUE )
2018-04-18 12:24:54 +02:00
2018-07-01 21:40:37 +02:00
colnames ( df ) <- column_names_df
2018-04-18 12:24:54 +02:00
2018-07-01 21:40:37 +02:00
# table format handed to the print method
if ( markdown == TRUE ) {
tbl_format <- ' markdown'
2018-05-09 11:44:46 +02:00
} else {
2018-07-01 21:40:37 +02:00
tbl_format <- ' pandoc'
2018-04-18 12:24:54 +02:00
}
2018-05-09 11:44:46 +02:00
2018-10-23 09:42:26 +02:00
# return the data.frame with the extra class in front and all print
# settings stored in the attribute `opt`
structure ( .Data = df ,
class = c ( ' frequency_tbl' , class ( df ) ) ,
opt = list ( data = x.name ,
vars = cols ,
header = header ,
header_txt = header_txt ,
row_names = row.names ,
column_names = column_names ,
column_align = column_align ,
tbl_format = tbl_format ,
na = na ,
nmax = nmax ,
nmax.set = nmax.set ) )
2018-04-18 12:24:54 +02:00
}
#' @rdname freq
#' @export
2018-07-01 21:40:37 +02:00
freq <- frequency_tbl # `freq` is a second name bound to the same function as `frequency_tbl`
2018-06-20 14:47:37 +02:00
#' @rdname freq
#' @export
#' @importFrom dplyr top_n pull
top_freq <- function(f, n) {
  # Returns the items with the `n` highest counts of a frequency table (the
  # lowest counts when `n` is negative) as a vector named by their counts.
  #
  # f  a frequency table (class "frequency_tbl")
  # n  a single number; ties are kept by dplyr::top_n(), so the result can
  #    hold more than abs(n) items - a message is shown when that happens
  #
  # Stops when `f` is not a frequency table or `n` is not a single,
  # non-missing number.
  if (!inherits(f, "frequency_tbl")) {
    stop("top_freq can only be applied to frequency tables", call. = FALSE)
  }
  # scalar `||` so the checks short-circuit; the message names the real
  # argument (`n`)
  if (!is.numeric(n) || length(n) != 1L || is.na(n)) {
    stop("For top_freq, `n` must be a number of length 1", call. = FALSE)
  }

  top <- f %>% top_n(n, count)
  vect <- top %>% pull(item)
  names(vect) <- top %>% pull(count)

  if (length(vect) > abs(n)) {
    message("top_freq: selecting ", length(vect), " items instead of ", abs(n), ", because of ties")
  }
  vect
}
2018-10-01 11:39:43 +02:00
#' @noRd
2018-09-29 21:54:32 +02:00
#' @exportMethod diff.frequency_tbl
#' @importFrom dplyr %>% full_join mutate
#' @export
diff.frequency_tbl <- function(x, y, ...) {
  # Prints the differences in counts between two frequency tables.
  #
  # x, y  frequency tables (class "frequency_tbl"); they are joined on their
  #       first column and only the first two columns (item, count) are used
  # ...   not used
  #
  # Output is a knitr::kable() table with, per item, both counts, the
  # difference (y minus x) and that difference relative to the count in x.
  # Positive values get a leading "+". Returns NULL invisibly when the two
  # tables are identical.
  if (!inherits(x, "frequency_tbl") || !inherits(y, "frequency_tbl")) {
    stop("Both x and y must be a frequency table.", call. = FALSE)
  }

  cat("Differences between frequency tables")
  if (identical(x, y)) {
    cat("\n\nNo differences found.\n")
    return(invisible())
  }

  # table format and item alignment are taken from the settings stored in x
  print_opt <- attr(x, "opt", exact = TRUE)

  # only keep item and count
  x <- x[, 1:2]
  y <- y[, 1:2]

  compared <- x %>%
    full_join(y,
              by = colnames(x)[1],
              suffix = c(".x", ".y")) %>%
    mutate(
      # an item missing from one table counts as 0 there;
      # case_when is namespace-qualified because this block does not import it
      diff = dplyr::case_when(
        is.na(count.y) ~ -count.x,
        is.na(count.x) ~ count.y,
        TRUE ~ count.y - count.x)) %>%
    mutate(
      diff.percent = percent(
        diff / count.x,
        force_zero = TRUE)) %>%
    # values that do not start with a minus sign get an explicit plus sign
    mutate(diff = ifelse(diff %like% "^-",
                         diff,
                         paste0("+", diff)),
           diff.percent = ifelse(diff.percent %like% "^-",
                                 diff.percent,
                                 paste0("+", diff.percent)))

  print(
    knitr::kable(compared,
                 format = print_opt$tbl_format,
                 col.names = c("Item", "Count #1", "Count #2", "Difference", "Diff. percent"),
                 align = paste0(print_opt$column_align[1], "rrrr"),
                 padding = 1)
  )
}
2018-07-03 11:30:40 +02:00
#' @rdname freq
2018-07-01 21:40:37 +02:00
#' @exportMethod print.frequency_tbl
#' @importFrom knitr kable
#' @importFrom dplyr n_distinct
2018-10-22 12:32:59 +02:00
#' @importFrom crayon bold silver
2018-07-01 21:40:37 +02:00
#' @export
2018-07-03 11:30:40 +02:00
print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default = 15), ...) {
  # Print method for frequency tables: writes an optional title and header,
  # the table itself through knitr::kable(), and a footer when rows were
  # left out.
  #
  # x     a frequency table; print settings are read from its `opt` attribute
  # nmax  maximum number of rows to print. When supplied it overrides the
  #       stored setting; 0, Inf, NA and NULL all mean "print all rows"
  # ...   only `markdown = TRUE/FALSE` is used, to override the table format
  #
  # Rows are never cut in markdown format. Returns NULL invisibly.
  opt <- attr(x, "opt")

  if (length(opt$vars) == 0) {
    opt$vars <- NULL
  }

  # title: "of `var1` and `var2` from <data>", depending on what is known
  has_data <- !is.null(opt$data)
  has_vars <- !is.null(opt$vars)
  if (has_data && has_vars) {
    title <- paste0("of `", paste0(opt$vars, collapse = "` and `"), "` from ", opt$data)
  } else if (has_data) {
    title <- paste("of", opt$data)
  } else if (has_vars) {
    title <- paste0("of `", paste0(opt$vars, collapse = "` and `"), "`")
  } else {
    title <- ""
  }

  if (!missing(nmax)) {
    # assigning NULL removes the element; that is treated as "all rows" below
    opt$nmax <- nmax
    opt$nmax.set <- TRUE
  }

  dots <- list(...)
  if ("markdown" %in% names(dots)) {
    if (isTRUE(dots$markdown)) {
      opt$tbl_format <- "markdown"
    } else {
      opt$tbl_format <- "pandoc"
    }
  }

  title <- paste("Frequency table", title)

  # bold title
  if (opt$tbl_format == "pandoc") {
    title <- bold(title)
  } else if (opt$tbl_format == "markdown") {
    title <- paste0("**", title, "**")
  }

  if (isTRUE(opt$header)) {
    cat(title, "\n")
    if (!is.null(opt$header_txt)) {
      cat(opt$header_txt)
    }
  } else if (opt$tbl_format == "markdown") {
    # do print title as caption in markdown
    cat("\n", title, sep = "")
  }

  if (NROW(x) == 0) {
    cat("\n\nNo observations.\n")
    return(invisible())
  }

  if (all(x$count == 1)) {
    warning("All observations are unique.", call. = FALSE)
  }

  # set the NA placeholder for kable; on.exit() restores the previous value
  # even when printing fails
  if (is.null(opt$na)) {
    opt$na <- "<NA>"
  }
  old_opts <- options(knitr.kable.NA = opt$na)
  on.exit(options(old_opts), add = TRUE)

  # one row limit for the cut-off, the printed rows and the footer: the
  # explicit nmax when one was set, otherwise the current option value
  if (isTRUE(opt$nmax.set)) {
    row_limit <- opt$nmax
  } else {
    row_limit <- getOption("max.print.freq", default = 15)
  }
  # 0, Inf, NA and NULL (and anything that is not one positive number) mean
  # "all rows"
  if (is.null(row_limit) || length(row_limit) != 1L || is.na(row_limit) ||
      row_limit <= 0 || is.infinite(row_limit)) {
    row_limit <- nrow(x)
  }

  footer <- NULL
  if (nrow(x) > row_limit && opt$tbl_format != "markdown") {
    total_rows <- nrow(x)
    total_n <- base::sum(x$count, na.rm = TRUE)
    x <- x[seq_len(row_limit), ]
    omitted_n <- total_n - base::sum(x$count, na.rm = TRUE)

    if (isTRUE(opt$nmax.set)) {
      footer <- paste0("[ reached `nmax = ", row_limit, "`")
    } else {
      footer <- "[ reached getOption(\"max.print.freq\")"
    }
    footer <- paste0(footer,
                     " -- omitted ",
                     format(total_rows - row_limit),
                     " entries, n = ",
                     format(omitted_n),
                     " (",
                     (omitted_n / total_n) %>% percent(force_zero = TRUE),
                     ") ]\n")
    if (opt$tbl_format == "pandoc") {
      footer <- silver(footer) # only silver in regular printing
    }
  }

  # format all columns as text before handing them to kable
  if (any(class(x$item) %in% c("double", "integer", "numeric", "raw", "single"))) {
    x$item <- format(x$item)
  }
  x$count <- format(x$count)
  x$percent <- percent(x$percent, force_zero = TRUE)
  x$cum_count <- format(x$cum_count)
  x$cum_percent <- percent(x$cum_percent, force_zero = TRUE)

  if (opt$tbl_format == "markdown") {
    cat("\n\n")
  }

  print(
    knitr::kable(x,
                 format = opt$tbl_format,
                 row.names = opt$row_names,
                 col.names = opt$column_names,
                 align = opt$column_align,
                 padding = 1)
  )

  if (!is.null(footer)) {
    cat(footer)
  }

  if (opt$tbl_format == "markdown") {
    cat("\n\n")
  } else {
    cat("\n")
  }

  invisible()
}
2018-06-20 14:47:37 +02:00
2018-07-03 11:30:40 +02:00
#' @noRd
#' @exportMethod as.data.frame.frequency_tbl
#' @export
as.data.frame.frequency_tbl <- function(x, ...) {
  # Converts a frequency table to a regular data.frame.
  #
  # x    a frequency table
  # ...  passed on to as.data.frame.data.frame()
  #
  # The attributes `package` and `opt` (print settings) are removed first;
  # as.data.frame.data.frame() then returns the object as a data.frame.
  for (attr_name in c("package", "opt")) {
    attr(x, attr_name) <- NULL
  }
  as.data.frame.data.frame(x, ...)
}
2018-07-08 22:14:55 +02:00
2018-07-09 14:02:58 +02:00
#' @noRd
#' @exportMethod as_tibble.frequency_tbl
#' @export
#' @importFrom dplyr as_tibble
as_tibble.frequency_tbl <- function(x, validate = TRUE, ..., rownames = NA) {
  # Converts a frequency table to a tibble.
  #
  # x         a frequency table
  # validate  forwarded to as_tibble() only when the caller supplied it
  #           NOTE(review): recent tibble versions deprecate `validate`;
  #           not forwarding the default avoids that warning - confirm
  #           against the tibble version in use
  # ...       passed on to as_tibble()
  # rownames  passed on to as_tibble()
  #
  # The attributes `package` and `opt` are removed before the conversion.
  attr(x, "package") <- NULL
  attr(x, "opt") <- NULL
  plain <- as.data.frame(x)
  if (missing(validate)) {
    as_tibble(x = plain, ..., rownames = rownames)
  } else {
    as_tibble(x = plain, validate = validate, ..., rownames = rownames)
  }
}
2018-07-08 22:14:55 +02:00
#' @noRd
#' @exportMethod hist.frequency_tbl
#' @export
#' @importFrom graphics hist
hist.frequency_tbl <- function(x, ...) {
  # Draws a histogram of the values behind a frequency table.
  #
  # x    a frequency table; every item is repeated `count` times to rebuild
  #      the underlying vector
  # ...  passed on to hist()
  #
  # The variable names stored in the `opt` attribute (if any) are used for
  # the plot title and the x axis label; several names are joined with
  # " and ".
  opt <- attr(x, "opt")
  if (!is.null(opt$vars)) {
    title <- paste(opt$vars, collapse = " and ")
  } else {
    title <- ""
  }
  # same expansion as as.vector() on a frequency table, written out here so
  # it does not depend on S3 dispatch
  values <- as.vector(rep(x$item, x$count), mode = "any")
  hist(values, main = paste("Histogram of", title), xlab = title, ...)
}
#' @noRd
#' @exportMethod plot.frequency_tbl
#' @export
plot.frequency_tbl <- function(x, y, ...) {
  # Plots the counts of a frequency table against its items.
  #
  # x    a frequency table
  # y    not used
  # ...  passed on to plot()
  #
  # The variable names stored in the `opt` attribute (if any) become the
  # x axis label; several names are joined with " and ".
  opt <- attr(x, "opt")
  if (!is.null(opt$vars)) {
    title <- paste(opt$vars, collapse = " and ")
  } else {
    title <- ""
  }
  plot(x = x$item, y = x$count, ylab = "Count", xlab = title, ...)
}
2018-07-08 22:14:55 +02:00
2018-07-09 14:02:58 +02:00
#' @noRd
#' @exportMethod as.vector.frequency_tbl
#' @export
as.vector.frequency_tbl <- function(x, mode = "any") {
  # Rebuilds the vector behind a frequency table by repeating every item
  # `count` times, in the row order of the table.
  #
  # x     a frequency table
  # mode  passed on to as.vector()
  expanded <- rep(x$item, x$count)
  as.vector(expanded, mode = mode)
}
2018-07-16 16:41:48 +02:00
#' @noRd
#' @exportMethod format.frequency_tbl
#' @export
format.frequency_tbl <- function(x, digits = 1, ...) {
  # Formats a frequency table: keeps at most `nmax` rows, turns the percent
  # and cum_percent columns into formatted percentages and formats the rest
  # with format.data.frame().
  #
  # x       a frequency table
  # digits  rounding passed to percent() for both percentage columns
  # ...     passed on to format.data.frame()
  #
  # The row limit is the `nmax` stored in the `opt` attribute when it was set
  # explicitly, otherwise getOption("max.print.freq") with a default of 15.
  opt <- attr(x, "opt")
  if (isTRUE(opt$nmax.set)) {
    row_limit <- opt$nmax
  } else {
    row_limit <- getOption("max.print.freq", default = 15)
  }
  # 0, NA and NULL mean "all rows"
  if (is.null(row_limit) || length(row_limit) != 1L || is.na(row_limit) || row_limit <= 0) {
    row_limit <- nrow(x)
  }
  # never index past the last row: x[1:nmax, ] would add rows filled with NA
  x <- x[seq_len(min(row_limit, nrow(x))), ]
  x$percent <- percent(x$percent, round = digits, force_zero = TRUE)
  x$cum_percent <- percent(x$cum_percent, round = digits, force_zero = TRUE)
  base::format.data.frame(x, ...)
}