2018-04-18 12:24:54 +02:00
# ==================================================================== #
# TITLE #
# Antimicrobial Resistance (AMR) Analysis #
# #
# AUTHORS #
# Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) #
# #
# LICENCE #
# This program is free software; you can redistribute it and/or modify #
# it under the terms of the GNU General Public License version 2.0, #
# as published by the Free Software Foundation. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# ==================================================================== #
#' Frequency table
#'
2018-07-01 21:40:37 +02:00
#' Create a frequency table of a vector with items or a data frame. Supports quasiquotation and markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names.
2018-07-09 14:02:58 +02:00
#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}
#' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples
2018-07-03 11:30:40 +02:00
#' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} by default for everything except factors.
#' @param nmax number of rows to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.
#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header will always print the amount of \code{NA}s.
2018-06-19 15:20:14 +02:00
#' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)}
2018-04-18 12:24:54 +02:00
#' @param markdown print table in markdown format (this forces \code{nmax = NA})
2018-07-01 21:40:37 +02:00
#' @param digits how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")})
2018-04-18 12:24:54 +02:00
#' @param sep a character string to separate the terms when selecting multiple columns
2018-07-01 21:40:37 +02:00
#' @param f a frequency table
2018-06-20 14:47:37 +02:00
#' @param n number of top \emph{n} items to return, use -n for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties.
2018-07-03 11:30:40 +02:00
#' @details Frequency tables (or frequency distributions) are summaries of the distribution of values in a sample. With the \code{freq} function, you can create univariate frequency tables. Multiple variables will be pasted into one variable, so it forces a univariate distribution. This package also has a vignette available to explain the use of this function further; run \code{browseVignettes("AMR")} to read it.
2018-06-20 14:47:37 +02:00
#'
2018-07-03 11:30:40 +02:00
#' For numeric values of any class, these additional values will all be calculated with \code{na.rm = TRUE} and shown in the header:
2018-04-18 12:24:54 +02:00
#' \itemize{
#' \item{Mean, using \code{\link[base]{mean}}}
2018-07-03 11:30:40 +02:00
#' \item{Standard Deviation, using \code{\link[stats]{sd}}}
#' \item{Coefficient of Variation (CV), the standard deviation divided by the mean}
#' \item{Mean Absolute Deviation (MAD), using \code{\link[stats]{mad}}}
#' \item{Tukey Five-Number Summaries (minimum, Q1, median, Q3, maximum), using \code{\link[stats]{fivenum}}}
#' \item{Interquartile Range (IQR) calculated as \code{Q3 - Q1} using the Tukey Five-Number Summaries, i.e. \strong{not} using the \code{\link[stats]{quantile}} function}
#' \item{Coefficient of Quartile Variation (CQV, sometimes called coefficient of dispersion), calculated as \code{(Q3 - Q1) / (Q3 + Q1)} using the Tukey Five-Number Summaries}
#' \item{Outliers (total count and unique count), using \code{\link[grDevices]{boxplot.stats}}}
2018-04-18 12:24:54 +02:00
#' }
2018-06-20 14:47:37 +02:00
#'
2018-07-03 11:30:40 +02:00
#' For dates and times of any class, these additional values will be calculated with \code{na.rm = TRUE} and shown in the header:
2018-06-20 14:47:37 +02:00
#' \itemize{
2018-07-08 22:14:55 +02:00
#' \item{Oldest, using \code{\link{min}}}
#' \item{Newest, using \code{\link{max}}, with difference between newest and oldest}
2018-06-20 14:47:37 +02:00
#' \item{Median, using \code{\link[stats]{median}}, with percentage since oldest}
#' }
#'
2018-07-03 11:30:40 +02:00
#'
2018-06-20 14:47:37 +02:00
#' The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will include more than \code{n} rows if there are ties.
2018-07-03 11:30:40 +02:00
#' @importFrom stats fivenum sd mad
2018-04-18 12:24:54 +02:00
#' @importFrom grDevices boxplot.stats
2018-07-03 11:30:40 +02:00
#' @importFrom dplyr %>% select pull n_distinct group_by arrange desc mutate summarise n_distinct
2018-07-09 14:02:58 +02:00
#' @importFrom utils browseVignettes installed.packages
2018-07-01 21:40:37 +02:00
#' @importFrom tibble tibble
2018-04-18 12:24:54 +02:00
#' @keywords summary summarise frequency freq
#' @rdname freq
2018-07-01 21:40:37 +02:00
#' @name freq
#' @return A \code{data.frame} with an additional class \code{"frequency_tbl"}
2018-04-18 12:24:54 +02:00
#' @export
#' @examples
#' library(dplyr)
#'
2018-07-01 21:40:37 +02:00
#' # this all gives the same result:
2018-04-18 12:24:54 +02:00
#' freq(septic_patients$hospital_id)
2018-07-01 21:40:37 +02:00
#' freq(septic_patients[, "hospital_id"])
#' septic_patients$hospital_id %>% freq()
#' septic_patients[, "hospital_id"] %>% freq()
#' septic_patients %>% freq("hospital_id")
2018-07-09 14:02:58 +02:00
#' septic_patients %>% freq(hospital_id) #<- easiest to remember when you're used to tidyverse
2018-04-18 12:24:54 +02:00
#'
2018-07-09 14:02:58 +02:00
#' # you could also use `select` or `pull` to get your variables
2018-04-18 12:24:54 +02:00
#' septic_patients %>%
#' filter(hospital_id == "A") %>%
#' select(bactid) %>%
#' freq()
#'
2018-07-09 14:02:58 +02:00
#' # multiple selected variables will be pasted together
2018-04-18 12:24:54 +02:00
#' septic_patients %>%
#' left_join_microorganisms %>%
#' filter(hospital_id == "A") %>%
2018-07-01 21:40:37 +02:00
#' freq(genus, species)
2018-04-18 12:24:54 +02:00
#'
2018-07-03 11:30:40 +02:00
#' # get top 10 bugs of hospital A as a vector
#' septic_patients %>%
#' filter(hospital_id == "A") %>%
#' freq(bactid) %>%
#' top_freq(10)
#'
2018-04-18 12:24:54 +02:00
#' # save frequency table to an object
#' years <- septic_patients %>%
#' mutate(year = format(date, "%Y")) %>%
2018-07-01 21:40:37 +02:00
#' freq(year)
2018-06-20 14:47:37 +02:00
#'
2018-07-09 14:02:58 +02:00
#' # show only the top 5
2018-07-03 11:30:40 +02:00
#' years %>% print(nmax = 5)
#'
2018-07-16 16:41:48 +02:00
#' # save to an object with formatted percentages
#' years <- format(years)
#'
2018-07-09 14:02:58 +02:00
#' # print a histogram of numeric values
#' septic_patients %>%
#' freq(age) %>%
2018-07-16 16:41:48 +02:00
#' hist() # prettier: ggplot(septic_patients, aes(age)) + geom_histogram()
2018-07-09 14:02:58 +02:00
#'
#' # or print all points to a regular plot
#' septic_patients %>%
#' freq(age) %>%
#' plot()
#'
#' # transform to a data.frame or tibble
2018-06-20 14:47:37 +02:00
#' septic_patients %>%
2018-07-03 11:30:40 +02:00
#' freq(age) %>%
#' as.data.frame()
2018-07-09 14:02:58 +02:00
#'
#' # or transform (back) to a vector
#' septic_patients %>%
#' freq(age) %>%
#' as.vector()
#'
#' identical(septic_patients %>%
#' freq(age) %>%
#' as.vector() %>%
#' sort(),
2018-07-16 16:41:48 +02:00
#' sort(septic_patients$age)) # TRUE
2018-07-09 14:02:58 +02:00
#'
2018-07-16 16:41:48 +02:00
#' # it also supports `table` objects:
2018-07-09 14:02:58 +02:00
#' table(septic_patients$sex,
#' septic_patients$age) %>%
2018-07-16 16:41:48 +02:00
#' freq(sep = " **sep** ")
#'
#' \dontrun{
#' # send frequency table to clipboard (e.g. for pasting in Excel)
#' septic_patients %>%
#' freq(age) %>%
#' format() %>% # this will format the percentages
#' clipboard_export()
#' }
2018-07-01 21:40:37 +02:00
frequency_tbl <- function(x,
                          ...,
                          sort.count = TRUE,
                          nmax = getOption("max.print.freq"),
                          na.rm = TRUE,
                          row.names = TRUE,
                          markdown = FALSE,
                          digits = 2,
                          sep = " ") {
  # Build a univariate frequency table (class "frequency_tbl") from a vector,
  # a data.frame/tibble (optionally restricted to the columns named in `...`)
  # or a `table`. Multiple columns are pasted into one value using `sep`.
  # The result is a data.frame with columns item, count, percent, cum_count,
  # cum_percent (and factor_level for factors); everything the print method
  # needs (header text, column labels, alignment, nmax) is stored in the
  # "opt" attribute.

  # must be captured before `nmax` is touched below
  nmax.set <- !missing(nmax)

  mult.columns <- 0
  # defined up front so that every input path (including a zero-row
  # data.frame) has them available when the "opt" attribute is built
  x.name <- NULL
  cols <- NULL

  if (is.data.frame(x)) {
    # deparse() may return several strings for long expressions
    x.name <- paste(deparse(substitute(x)), collapse = "")
    if (identical(x.name, ".")) {
      # piped input: there is no meaningful name
      x.name <- NULL
    }
    dots <- base::eval(base::substitute(base::alist(...)))
    ndots <- length(dots)
    if (NROW(x) == 0) {
      x <- NA
    } else if (ndots >= 10) {
      stop("A maximum of 9 columns can be analysed at the same time.", call. = FALSE)
    } else if (ndots > 0) {
      cols <- as.character(dots)
      if (!all(cols %in% colnames(x))) {
        stop("one or more columns not found: `", paste(cols, collapse = "`, `"), "`", call. = FALSE)
      }
      x <- x[, cols]
    }
  } else if (inherits(x, "table")) {
    if (!requireNamespace("tidyr", quietly = TRUE)) {
      stop("transformation from `table` to frequency table requires the tidyr package.", call. = FALSE)
    }
    x <- x %>%
      as.data.frame(stringsAsFactors = FALSE) %>%
      # paste first two columns
      tidyr::unite(col = "Pasted", 1:2, sep = sep, remove = TRUE)
    x <- rep(x %>% pull(Pasted), x %>% pull(Freq))
    x.name <- "a `table` object"
    mult.columns <- 2
  }

  # collapse a (still) tabular input into a single vector
  if (is.data.frame(x)) {
    n_cols <- ncol(x)
    if (n_cols == 1) {
      x <- x %>% pull(1)
    } else if (n_cols < 10) {
      mult.columns <- n_cols
      # unnamed list, so column names can never clash with paste() arguments
      parts <- lapply(seq_len(n_cols), function(i) as.character(x[[i]]))
      x <- do.call(paste, c(parts, list(sep = sep)))
    } else {
      stop("A maximum of 9 columns can be analysed at the same time.", call. = FALSE)
    }
  }

  # checked before any subsetting, so unsupported input gets this message
  # instead of an unrelated subsetting error
  if (is.list(x) || is.matrix(x) || is.environment(x) || is.function(x)) {
    stop("frequency tables do not support lists, matrices, environments and functions.", call. = FALSE)
  }

  if (mult.columns > 1) {
    # a row that was NA in every column has been pasted to "NA<sep>NA<sep>..."
    na_label <- paste(rep("NA", mult.columns), collapse = sep)
    NAs <- x[is.na(x) | x == na_label]
  } else {
    NAs <- x[is.na(x)]
  }
  # taken before NA removal, so it is also right when na.rm = FALSE
  n_total <- length(x)
  n_na <- length(NAs)

  if (isTRUE(na.rm)) {
    x_class <- class(x)
    x <- x[!x %in% NAs]
    # subsetting can drop the class of some vector types
    class(x) <- x_class
  }

  if (missing(sort.count) && is.factor(x)) {
    # sort on factor level at default when x is a factor and sort.count is not set
    sort.count <- FALSE
  }

  markdown_line <- ""
  if (isTRUE(markdown)) {
    markdown_line <- "\n"
  }
  x_align <- "l"
  fmt <- function(value) format(value, digits = digits)

  if (mult.columns > 0) {
    header <- paste0(markdown_line, "Columns: ", mult.columns)
  } else {
    header <- paste0(markdown_line, "Class: ", paste(rev(class(x)), collapse = " > "))
  }

  na_share <- sub("NaN", "0", percent(n_na / n_total, force_zero = TRUE), fixed = TRUE)
  header <- paste0(header, markdown_line, "\nLength: ", format(n_total),
                   " (of which NA: ", format(n_na), " = ", na_share, ")")
  header <- paste0(header, markdown_line, "\nUnique: ", format(n_distinct(x)))

  if (NROW(x) > 0 && any(class(x) %in% c("double", "integer", "numeric", "raw", "single"))) {
    # right align number
    x_align <- "r"
    Tukey_five <- stats::fivenum(x, na.rm = TRUE)
    outliers <- boxplot.stats(x)$out
    header <- paste0(header, "\n")
    header <- paste0(header, markdown_line, "\nMean: ", fmt(base::mean(x, na.rm = TRUE)))
    header <- paste0(header, markdown_line, "\nStd. dev.: ", fmt(stats::sd(x, na.rm = TRUE)),
                     " (CV: ", fmt(cv(x, na.rm = TRUE)),
                     ", MAD: ", fmt(stats::mad(x, na.rm = TRUE)), ")")
    header <- paste0(header, markdown_line, "\nFive-Num: ",
                     paste(trimws(fmt(Tukey_five)), collapse = " | "),
                     " (IQR: ", fmt(Tukey_five[4] - Tukey_five[2]),
                     ", CQV: ", fmt(cqv(x, na.rm = TRUE)), ")")
    header <- paste0(header, markdown_line, "\nOutliers: ", length(outliers))
    if (length(outliers) > 0) {
      header <- paste0(header, " (unique: ", n_distinct(outliers), ")")
    }
  }

  formatdates <- "%e %B %Y" # = d mmmm yyyy
  if (inherits(x, "hms")) {
    x <- as.POSIXlt(x)
    formatdates <- "%H:%M:%S"
  }
  if (NROW(x) > 0 && any(class(x) %in% c("Date", "POSIXct", "POSIXlt"))) {
    mindate <- min(x, na.rm = TRUE)
    maxdate <- max(x, na.rm = TRUE)
    mediandate <- stats::median(x, na.rm = TRUE)
    # a fixed unit for both differences: with units = "auto" the two values
    # could be expressed in different units, making their ratio meaningless
    span_secs <- as.double(difftime(maxdate, mindate, units = "secs"))
    median_secs <- as.double(difftime(mediandate, mindate, units = "secs"))
    header <- paste0(header, "\n")
    header <- paste0(header, markdown_line, "\nOldest: ", trimws(format(mindate, formatdates)))
    header <- paste0(header, markdown_line, "\nNewest: ", trimws(format(maxdate, formatdates)),
                     " (+", format(as.double(difftime(maxdate, mindate, units = "auto"))), ")")
    header <- paste0(header, markdown_line, "\nMedian: ", trimws(format(mediandate, formatdates)),
                     " (~", percent(median_secs / span_secs, round = 0), ")")
  }
  if (inherits(x, "POSIXlt")) {
    # count the formatted text rather than the POSIXlt object itself
    x <- format(x, formatdates)
  }

  if (is.null(nmax)) {
    # explicit NULL means "all rows"; NULL coming from the unset option
    # means the built-in default for max print setting
    nmax <- if (nmax.set) length(x) else 15
  }
  if (nmax %in% c(0, Inf, NA)) {
    nmax <- length(x)
  }

  # create table with counts and percentages
  column_names <- c("Item", "Count", "Percent", "Cum. Count", "Cum. Percent", "(Factor Level)")
  column_names_df <- c("item", "count", "percent", "cum_count", "cum_percent", "factor_level")
  x_is_factor <- is.factor(x)

  if (x_is_factor) {
    df <- tibble::tibble(item = x,
                         fctlvl = as.integer(x)) %>%
      group_by(item, fctlvl)
    column_align <- c("l", "r", "r", "r", "r", "r")
  } else {
    df <- tibble::tibble(item = x) %>%
      group_by(item)
    # strip factor lvl from col names
    column_names <- column_names[-length(column_names)]
    column_names_df <- column_names_df[-length(column_names_df)]
    column_align <- c(x_align, "r", "r", "r", "r")
  }
  df <- df %>% summarise(count = dplyr::n())
  # drop any remaining grouping before the columns are modified
  df <- as.data.frame(df, stringsAsFactors = FALSE)

  if (any(grepl("\033", df$item, fixed = TRUE))) {
    # remove escape char
    # see https://en.wikipedia.org/wiki/Escape_character#ASCII_escape_character
    df$item <- gsub("\033", " ", df$item, fixed = TRUE)
  }

  # sort according to setting
  if (isTRUE(sort.count)) {
    df <- df %>% arrange(desc(count), item)
  } else if (x_is_factor) {
    df <- df %>% arrange(fctlvl, item)
  } else {
    df <- df %>% arrange(item)
  }

  count_total <- base::sum(df$count, na.rm = TRUE)
  df$percent <- df$count / count_total
  df$cum_count <- base::cumsum(df$count)
  df$cum_percent <- df$cum_count / count_total

  if (x_is_factor) {
    # put factor last
    df <- df[, c("item", "count", "percent", "cum_count", "cum_percent", "fctlvl")]
  }

  colnames(df) <- column_names_df

  class(df) <- c("frequency_tbl", class(df))
  attr(df, "package") <- "AMR"

  if (isTRUE(markdown)) {
    tbl_format <- "markdown"
  } else {
    tbl_format <- "pandoc"
  }

  attr(df, "opt") <- list(data = x.name,
                          vars = cols,
                          header = header,
                          row_names = row.names,
                          column_names = column_names,
                          column_align = column_align,
                          tbl_format = tbl_format,
                          nmax = nmax,
                          nmax.set = nmax.set)
  df
}
#' @rdname freq
#' @export
2018-07-01 21:40:37 +02:00
freq <- frequency_tbl # `freq` is the same function object as `frequency_tbl`, under a shorter name
2018-06-20 14:47:37 +02:00
#' @rdname freq
#' @export
#' @importFrom dplyr top_n pull
top_freq <- function(f, n) {
  # Return the items of the `n` highest counts of frequency table `f` (or of
  # the `n` lowest counts when `n` is negative) as a vector whose names are
  # the counts. Selection is delegated to dplyr::top_n(), so ties can yield
  # more than abs(n) items; a message is shown when that happens.
  if (!inherits(f, "frequency_tbl")) {
    stop("top_freq can only be applied to frequency tables", call. = FALSE)
  }
  if (!is.numeric(n) || length(n) != 1L || is.na(n)) {
    stop("For top_freq, `n` must be a number of length 1", call. = FALSE)
  }
  top <- f %>% top_n(n, count)
  vect <- top %>% pull(item)
  names(vect) <- top %>% pull(count)
  if (length(vect) > abs(n)) {
    message("top_freq: selecting ", length(vect), " items instead of ", abs(n), ", because of ties")
  }
  vect
}
2018-07-03 11:30:40 +02:00
#' @rdname freq
2018-07-01 21:40:37 +02:00
#' @exportMethod print.frequency_tbl
#' @importFrom knitr kable
#' @importFrom dplyr n_distinct
#' @export
2018-07-03 11:30:40 +02:00
print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default = 15), ...) {
  # Print a frequency table: a title line, the stored header, then the rows
  # rendered with knitr::kable(). At most `nmax` rows are shown unless the
  # table was created for markdown output; a footer reports what was left
  # out. `nmax` of 0, Inf, NA or NULL means "all rows". Returns NULL
  # invisibly.
  opt <- attr(x, "opt")

  has_data <- !is.null(opt$data)
  has_vars <- !is.null(opt$vars)
  vars_txt <- paste0("`", paste0(opt$vars, collapse = "` and `"), "`")
  if (has_data && has_vars) {
    title <- paste0("of ", vars_txt, " from ", opt$data)
  } else if (has_data) {
    title <- paste("of", opt$data)
  } else if (has_vars) {
    title <- paste0("of ", vars_txt)
  } else {
    title <- ""
  }

  # Resolve the row limit once, so the truncation test, the row selection
  # and the footer all use the same number: an explicit argument wins, then
  # the value given when the table was created, then the current option.
  if (!missing(nmax)) {
    limit <- nmax
    limit_set <- TRUE
  } else if (isTRUE(opt$nmax.set)) {
    limit <- opt$nmax
    limit_set <- TRUE
  } else {
    limit <- getOption("max.print.freq", default = 15)
    limit_set <- FALSE
  }
  if (is.null(limit) || length(limit) != 1L || is.na(limit) || limit %in% c(0, Inf)) {
    limit <- NROW(x)
  }

  cat("Frequency table", title, "\n")
  if (!is.null(opt$header)) {
    cat(opt$header)
  }

  if (NROW(x) == 0) {
    cat("\n\nNo observations.\n")
    return(invisible())
  }

  if (all(x$count == 1)) {
    warning("All observations are unique.", call. = FALSE)
  }

  # show NA as <NA> in kable; the previous setting is restored on exit,
  # also when printing fails halfway
  old_opts <- options(knitr.kable.NA = "<NA>")
  on.exit(options(old_opts), add = TRUE)

  footer <- NULL
  if (nrow(x) > limit && !identical(opt$tbl_format, "markdown")) {
    n_rows <- nrow(x)
    n_all <- base::sum(x$count)
    n_hidden <- base::sum(x$count[(limit + 1):n_rows], na.rm = TRUE)
    x <- x[seq_len(limit), , drop = FALSE]

    if (limit_set) {
      footer <- paste0("[ reached `nmax = ", limit, "`")
    } else {
      footer <- "[ reached getOption(\"max.print.freq\")"
    }
    footer <- paste0(footer,
                     " -- omitted ",
                     format(n_rows - limit),
                     " entries, n = ",
                     format(n_hidden),
                     " (",
                     percent(n_hidden / n_all, force_zero = TRUE),
                     ") ]\n")
  }

  if (any(class(x$item) %in% c("double", "integer", "numeric", "raw", "single"))) {
    x$item <- format(x$item)
  }
  x$count <- format(x$count)
  x$percent <- percent(x$percent, force_zero = TRUE)
  x$cum_count <- format(x$cum_count)
  x$cum_percent <- percent(x$cum_percent, force_zero = TRUE)

  print(
    knitr::kable(x,
                 format = opt$tbl_format,
                 row.names = opt$row_names,
                 col.names = opt$column_names,
                 align = opt$column_align,
                 padding = 1)
  )

  if (!is.null(footer)) {
    cat(footer)
  }
  cat("\n")

  return(invisible())
}
2018-06-20 14:47:37 +02:00
2018-07-03 11:30:40 +02:00
#' @noRd
#' @exportMethod as.data.frame.frequency_tbl
#' @export
as.data.frame.frequency_tbl <- function(x, ...) {
  # Turn a frequency table back into a plain data.frame: remove the metadata
  # attributes set by frequency_tbl() and let the data.frame method drop the
  # "frequency_tbl" class.
  attr(x, "package") <- NULL
  attr(x, "opt") <- NULL
  as.data.frame.data.frame(x, ...)
}
2018-07-08 22:14:55 +02:00
2018-07-09 14:02:58 +02:00
#' @noRd
#' @exportMethod as_tibble.frequency_tbl
#' @export
#' @importFrom dplyr as_tibble
as_tibble.frequency_tbl <- function(x, validate = TRUE, ..., rownames = NA) {
  # Convert a frequency table to a tibble: remove the metadata attributes set
  # by frequency_tbl(), go through as.data.frame() and hand the result to
  # as_tibble() together with the remaining arguments.
  attr(x, "package") <- NULL
  attr(x, "opt") <- NULL
  as_tibble(x = as.data.frame(x), validate = validate, ..., rownames = rownames)
}
2018-07-08 22:14:55 +02:00
#' @noRd
#' @exportMethod hist.frequency_tbl
#' @export
#' @importFrom graphics hist
hist.frequency_tbl <- function(x, ...) {
  # Histogram of the observations behind a frequency table. as.vector(x)
  # supplies the values passed to hist(); the variable name(s) stored in the
  # "opt" attribute, if any, label the plot.
  opt <- attr(x, "opt")
  if (!is.null(opt$vars)) {
    title <- opt$vars
  } else {
    title <- ""
  }
  hist(as.vector(x), main = paste("Histogram of", title), xlab = title, ...)
}
#' @noRd
#' @exportMethod plot.frequency_tbl
#' @export
plot.frequency_tbl <- function(x, y, ...) {
  # Plot the count of every item of a frequency table. `y` is not used by
  # this method; the variable name(s) stored in the "opt" attribute, if any,
  # label the x axis.
  opt <- attr(x, "opt")
  if (!is.null(opt$vars)) {
    title <- opt$vars
  } else {
    title <- ""
  }
  plot(x = x$item, y = x$count, ylab = "Count", xlab = title, ...)
}
2018-07-08 22:14:55 +02:00
2018-07-09 14:02:58 +02:00
#' @noRd
#' @exportMethod as.vector.frequency_tbl
#' @export
as.vector.frequency_tbl <- function(x, mode = "any") {
  # Expand a frequency table to a vector of observations: every item is
  # repeated `count` times, in table order, and then coerced to `mode`.
  as.vector(rep(x$item, x$count), mode = mode)
}
2018-07-16 16:41:48 +02:00
#' @noRd
#' @exportMethod format.frequency_tbl
#' @export
format.frequency_tbl <- function(x, digits = 1, ...) {
  # Format a frequency table for export: keep at most `nmax` rows (the value
  # given when the table was created, otherwise the current
  # "max.print.freq" option, default 15), turn both percentage columns into
  # text rounded to `digits`, and format the rest with format.data.frame().
  opt <- attr(x, "opt")
  if (isTRUE(opt$nmax.set)) {
    limit <- opt$nmax
  } else {
    limit <- getOption("max.print.freq", default = 15)
  }
  # 0, Inf, NA and NULL all mean "no limit"
  if (is.null(limit) || length(limit) != 1L || is.na(limit) || limit %in% c(0, Inf)) {
    limit <- nrow(x)
  }
  # never index past the last row: that would append rows filled with NA
  limit <- min(limit, nrow(x))
  x <- x[seq_len(limit), , drop = FALSE]
  x$percent <- percent(x$percent, round = digits, force_zero = TRUE)
  x$cum_percent <- percent(x$cum_percent, round = digits, force_zero = TRUE)
  base::format.data.frame(x, ...)
}