2018-04-18 12:24:54 +02:00
# ==================================================================== #
# TITLE #
# Antimicrobial Resistance (AMR) Analysis #
# #
2019-01-02 23:24:07 +01:00
# SOURCE #
# https://gitlab.com/msberends/AMR #
2018-04-18 12:24:54 +02:00
# #
# LICENCE #
2019-01-02 23:24:07 +01:00
# (c) 2019 Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) #
2018-04-18 12:24:54 +02:00
# #
2019-01-02 23:24:07 +01:00
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# #
# This R package was created for academic research and was publicly #
# released in the hope that it will be useful, but it comes WITHOUT #
# ANY WARRANTY OR LIABILITY. #
# Visit our website for more info: https://msberends.gitab.io/AMR. #
2018-04-18 12:24:54 +02:00
# ==================================================================== #
#' Frequency table
#'
2019-01-28 11:20:32 +01:00
#' Create a frequency table of a vector with items or a \code{data.frame}. Supports quasiquotation and markdown for reports. Best practice is: \code{data \%>\% freq(var)}.\cr
2018-12-31 10:35:26 +01:00
#' \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names.
2018-11-06 16:41:59 +01:00
#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} (may contain a grouping variable) or \code{\link{table}}
2019-01-28 11:20:32 +01:00
#' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples. Also supports quasiquotion.
2018-11-06 16:41:59 +01:00
#' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except when using grouping variables.
2018-07-03 11:30:40 +02:00
#' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.
2018-10-23 09:42:26 +02:00
#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the amount of \code{NA}s.
2018-06-19 15:20:14 +02:00
#' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)}
2018-12-29 22:24:19 +01:00
#' @param markdown a logical value indicating whether the frequency table should be printed in markdown format. This will print all rows (except when \code{nmax} is defined) and is default behaviour in non-interactive R sessions (like when knitting RMarkdown files).
2018-10-22 12:32:59 +02:00
#' @param digits how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")})
2018-09-10 15:45:25 +02:00
#' @param quote a logical value indicating whether or not strings should be printed with surrounding quotes
2018-10-19 21:52:08 +02:00
#' @param header a logical value indicating whether an informative header should be printed
2018-10-31 12:10:49 +01:00
#' @param title text to show above frequency table, at default to tries to coerce from the variables passed to \code{x}
2018-12-31 10:35:26 +01:00
#' @param na a character string that should be used to show empty (\code{NA}) values (only useful when \code{na.rm = FALSE})
2018-12-22 22:39:34 +01:00
#' @param droplevels a logical value indicating whether in factors empty levels should be dropped
2018-04-18 12:24:54 +02:00
#' @param sep a character string to separate the terms when selecting multiple columns
2018-12-10 10:13:40 +01:00
#' @inheritParams base::format
2018-07-01 21:40:37 +02:00
#' @param f a frequency table
2018-06-20 14:47:37 +02:00
#' @param n number of top \emph{n} items to return, use -n for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties.
2018-12-29 22:24:19 +01:00
#' @param property property in header to return this value directly
2018-07-03 11:30:40 +02:00
#' @details Frequency tables (or frequency distributions) are summaries of the distribution of values in a sample. With the `freq` function, you can create univariate frequency tables. Multiple variables will be pasted into one variable, so it forces a univariate distribution. This package also has a vignette available to explain the use of this function further, run \code{browseVignettes("AMR")} to read it.
2018-06-20 14:47:37 +02:00
#'
2018-10-22 12:32:59 +02:00
#' For numeric values of any class, these additional values will all be calculated with \code{na.rm = TRUE} and shown into the header:
2018-04-18 12:24:54 +02:00
#' \itemize{
#' \item{Mean, using \code{\link[base]{mean}}}
2018-07-03 11:30:40 +02:00
#' \item{Standard Deviation, using \code{\link[stats]{sd}}}
#' \item{Coefficient of Variation (CV), the standard deviation divided by the mean}
#' \item{Mean Absolute Deviation (MAD), using \code{\link[stats]{mad}}}
#' \item{Tukey Five-Number Summaries (minimum, Q1, median, Q3, maximum), using \code{\link[stats]{fivenum}}}
#' \item{Interquartile Range (IQR) calculated as \code{Q3 - Q1} using the Tukey Five-Number Summaries, i.e. \strong{not} using the \code{\link[stats]{quantile}} function}
#' \item{Coefficient of Quartile Variation (CQV, sometimes called coefficient of dispersion), calculated as \code{(Q3 - Q1) / (Q3 + Q1)} using the Tukey Five-Number Summaries}
#' \item{Outliers (total count and unique count), using \code{\link[grDevices]{boxplot.stats}}}
2018-04-18 12:24:54 +02:00
#' }
2018-06-20 14:47:37 +02:00
#'
2018-10-22 12:32:59 +02:00
#' For dates and times of any class, these additional values will be calculated with \code{na.rm = TRUE} and shown into the header:
2018-06-20 14:47:37 +02:00
#' \itemize{
2018-07-08 22:14:55 +02:00
#' \item{Oldest, using \code{\link{min}}}
#' \item{Newest, using \code{\link{max}}, with difference between newest and oldest}
2018-06-20 14:47:37 +02:00
#' \item{Median, using \code{\link[stats]{median}}, with percentage since oldest}
#' }
#'
2018-12-22 22:39:34 +01:00
#' In factors, all factor levels that are not existing in the input data will be dropped.
2018-07-03 11:30:40 +02:00
#'
2018-06-20 14:47:37 +02:00
#' The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will include more than \code{n} rows if there are ties.
2018-07-03 11:30:40 +02:00
#' @importFrom stats fivenum sd mad
2018-04-18 12:24:54 +02:00
#' @importFrom grDevices boxplot.stats
2019-01-28 11:20:32 +01:00
#' @importFrom dplyr %>% arrange arrange_at bind_cols desc filter_at funs group_by mutate mutate_at n n_distinct pull select summarise tibble ungroup vars all_vars
2018-11-06 16:41:59 +01:00
#' @importFrom utils browseVignettes
2018-10-09 13:53:33 +02:00
#' @importFrom hms is.hms
2018-11-02 10:27:57 +01:00
#' @importFrom crayon red green silver
2019-01-29 20:20:09 +01:00
#' @importFrom rlang enquos eval_tidy as_label
2018-04-18 12:24:54 +02:00
#' @keywords summary summarise frequency freq
#' @rdname freq
2018-07-01 21:40:37 +02:00
#' @name freq
2018-11-19 13:00:22 +01:00
#' @return A \code{data.frame} (with an additional class \code{"frequency_tbl"}) with five columns: \code{item}, \code{count}, \code{percent}, \code{cum_count} and \code{cum_percent}.
2018-04-18 12:24:54 +02:00
#' @export
2019-01-02 23:24:07 +01:00
#' @inheritSection AMR Read more on our website!
2018-04-18 12:24:54 +02:00
#' @examples
#' library(dplyr)
#'
2018-07-01 21:40:37 +02:00
#' # this all gives the same result:
2018-04-18 12:24:54 +02:00
#' freq(septic_patients$hospital_id)
2018-07-01 21:40:37 +02:00
#' freq(septic_patients[, "hospital_id"])
#' septic_patients$hospital_id %>% freq()
#' septic_patients[, "hospital_id"] %>% freq()
#' septic_patients %>% freq("hospital_id")
2018-11-06 16:41:59 +01:00
#' septic_patients %>% freq(hospital_id) #<- easiest to remember (tidyverse)
2018-04-18 12:24:54 +02:00
#'
2018-11-19 13:00:22 +01:00
#'
2018-07-09 14:02:58 +02:00
#' # you could also use `select` or `pull` to get your variables
2018-04-18 12:24:54 +02:00
#' septic_patients %>%
#' filter(hospital_id == "A") %>%
2018-08-31 13:36:19 +02:00
#' select(mo) %>%
2018-04-18 12:24:54 +02:00
#' freq()
#'
2018-11-19 13:00:22 +01:00
#'
2018-07-09 14:02:58 +02:00
#' # multiple selected variables will be pasted together
2018-04-18 12:24:54 +02:00
#' septic_patients %>%
#' left_join_microorganisms %>%
2018-07-01 21:40:37 +02:00
#' freq(genus, species)
2018-04-18 12:24:54 +02:00
#'
2019-01-28 11:20:32 +01:00
#' # functions as quasiquotation are also supported
#' septic_patients %>%
#' freq(mo_genus(mo), mo_species(mo))
#'
2018-11-19 13:00:22 +01:00
#'
2018-11-06 16:41:59 +01:00
#' # group a variable and analyse another
#' septic_patients %>%
#' group_by(hospital_id) %>%
#' freq(gender)
#'
2018-11-19 13:00:22 +01:00
#'
2018-07-03 11:30:40 +02:00
#' # get top 10 bugs of hospital A as a vector
#' septic_patients %>%
#' filter(hospital_id == "A") %>%
2018-08-31 13:36:19 +02:00
#' freq(mo) %>%
2018-07-03 11:30:40 +02:00
#' top_freq(10)
#'
2018-11-19 13:00:22 +01:00
#'
2018-04-18 12:24:54 +02:00
#' # save frequency table to an object
#' years <- septic_patients %>%
#' mutate(year = format(date, "%Y")) %>%
2018-07-01 21:40:37 +02:00
#' freq(year)
2018-06-20 14:47:37 +02:00
#'
2018-11-19 13:00:22 +01:00
#'
2018-07-09 14:02:58 +02:00
#' # show only the top 5
2018-07-03 11:30:40 +02:00
#' years %>% print(nmax = 5)
#'
2018-11-19 13:00:22 +01:00
#'
2018-07-16 16:41:48 +02:00
#' # save to an object with formatted percentages
#' years <- format(years)
#'
2018-11-19 13:00:22 +01:00
#'
2018-07-09 14:02:58 +02:00
#' # print a histogram of numeric values
#' septic_patients %>%
#' freq(age) %>%
2018-10-01 11:39:43 +02:00
#' hist()
2018-07-09 14:02:58 +02:00
#'
2018-11-19 13:00:22 +01:00
#'
2018-07-09 14:02:58 +02:00
#' # or print all points to a regular plot
#' septic_patients %>%
#' freq(age) %>%
#' plot()
#'
2018-11-19 13:00:22 +01:00
#'
2018-07-09 14:02:58 +02:00
#' # transform to a data.frame or tibble
2018-06-20 14:47:37 +02:00
#' septic_patients %>%
2018-07-03 11:30:40 +02:00
#' freq(age) %>%
#' as.data.frame()
2018-07-09 14:02:58 +02:00
#'
2018-11-19 13:00:22 +01:00
#'
2018-07-09 14:02:58 +02:00
#' # or transform (back) to a vector
#' septic_patients %>%
#' freq(age) %>%
#' as.vector()
#'
#' identical(septic_patients %>%
#' freq(age) %>%
#' as.vector() %>%
#' sort(),
2018-07-16 16:41:48 +02:00
#' sort(septic_patients$age)) # TRUE
2018-07-09 14:02:58 +02:00
#'
2018-11-19 13:00:22 +01:00
#'
#' # it also supports `table` objects
2018-09-29 21:54:32 +02:00
#' table(septic_patients$gender,
2018-07-09 14:02:58 +02:00
#' septic_patients$age) %>%
2018-07-16 16:41:48 +02:00
#' freq(sep = " **sep** ")
#'
2018-11-19 13:00:22 +01:00
#'
#' # only get selected columns
#' septic_patients %>%
#' freq(hospital_id) %>%
#' select(item, percent)
#'
#' septic_patients %>%
#' freq(hospital_id) %>%
#' select(-count, -cum_count)
#'
#'
2018-10-01 11:39:43 +02:00
#' # check differences between frequency tables
#' diff(freq(septic_patients$trim),
#' freq(septic_patients$trsu))
2018-07-01 21:40:37 +02:00
frequency_tbl <- function ( x ,
... ,
sort.count = TRUE ,
nmax = getOption ( " max.print.freq" ) ,
na.rm = TRUE ,
row.names = TRUE ,
2018-10-22 12:32:59 +02:00
markdown = ! interactive ( ) ,
2018-07-01 21:40:37 +02:00
digits = 2 ,
2018-09-10 15:45:25 +02:00
quote = FALSE ,
2019-01-30 16:00:55 +01:00
header = TRUE ,
2018-10-31 12:10:49 +01:00
title = NULL ,
2018-10-23 09:42:26 +02:00
na = " <NA>" ,
2018-12-22 22:39:34 +01:00
droplevels = TRUE ,
2018-12-10 10:13:40 +01:00
sep = " " ,
decimal.mark = getOption ( " OutDec" ) ,
big.mark = ifelse ( decimal.mark != " ," , " ," , " ." ) ) {
2018-07-01 21:40:37 +02:00
2018-07-10 12:27:07 +02:00
mult.columns <- 0
2018-11-06 16:41:59 +01:00
x.group = character ( 0 )
df <- NULL
2018-08-24 11:08:20 +02:00
x.name <- NULL
cols <- NULL
2019-01-29 20:20:09 +01:00
cols.names <- NULL
2018-12-22 22:39:34 +01:00
if ( any ( class ( x ) == " list" ) ) {
2018-08-24 11:08:20 +02:00
cols <- names ( x )
x <- as.data.frame ( x , stringsAsFactors = FALSE )
x.name <- " a list"
2018-12-22 22:39:34 +01:00
} else if ( any ( class ( x ) == " matrix" ) ) {
2018-08-24 11:08:20 +02:00
x <- as.data.frame ( x , stringsAsFactors = FALSE )
x.name <- " a matrix"
cols <- colnames ( x )
2018-12-22 22:39:34 +01:00
if ( all ( cols %like% " V[0-9]" ) ) {
2018-08-24 11:08:20 +02:00
cols <- NULL
}
}
2018-12-22 22:39:34 +01:00
if ( any ( class ( x ) == " data.frame" ) ) {
2018-11-06 16:41:59 +01:00
2018-08-24 11:08:20 +02:00
if ( is.null ( x.name ) ) {
x.name <- deparse ( substitute ( x ) )
}
2019-01-29 20:20:09 +01:00
if ( x.name %like% " (%>%)" ) {
x.name <- x.name %>% strsplit ( " %>%" , fixed = TRUE ) %>% unlist ( ) %>% .[1 ] %>% trimws ( )
}
2018-07-01 21:40:37 +02:00
if ( x.name == " ." ) {
2019-02-08 16:06:54 +01:00
x.name <- " a data.frame"
2019-01-30 16:00:55 +01:00
} else {
x.name <- paste0 ( " `" , x.name , " `" )
2018-07-01 21:40:37 +02:00
}
2019-01-29 20:20:09 +01:00
x.name.dims <- x %>%
dim ( ) %>%
format ( decimal.mark = decimal.mark , big.mark = big.mark ) %>%
trimws ( ) %>%
paste ( collapse = " x " )
x.name <- paste0 ( x.name , " (" , x.name.dims , " )" )
2018-07-01 21:40:37 +02:00
2019-01-28 11:20:32 +01:00
x.group <- group_vars ( x )
if ( length ( x.group ) > 1 ) {
x.group <- x.group [1L ]
warning ( " freq supports one grouping variable, only `" , x.group , " ` will be kept." , call. = FALSE )
}
user_exprs <- enquos ( ... )
if ( length ( user_exprs ) > 0 ) {
new_list <- list ( 0 )
for ( i in 1 : length ( user_exprs ) ) {
new_list [ [i ] ] <- eval_tidy ( user_exprs [ [i ] ] , data = x )
2019-01-30 19:52:58 +01:00
if ( length ( new_list [ [i ] ] ) == 1 ) {
if ( is.character ( new_list [ [i ] ] ) & new_list [ [i ] ] %in% colnames ( x ) ) {
# support septic_patients %>% freq("hospital_id")
new_list [ [i ] ] <- x %>% pull ( new_list [ [i ] ] )
}
}
2019-01-29 20:20:09 +01:00
cols <- c ( cols , as_label ( user_exprs [ [i ] ] ) )
2019-01-28 11:20:32 +01:00
}
2018-11-06 16:41:59 +01:00
2019-01-28 11:20:32 +01:00
if ( length ( new_list ) == 1 & length ( x.group ) == 0 ) {
# is now character
x <- new_list [ [1 ] ]
df <- NULL
} else {
# create data frame
2019-01-29 20:20:09 +01:00
df <- as.data.frame ( new_list , col.names = cols , stringsAsFactors = FALSE )
cols.names <- colnames ( df )
2019-01-28 11:20:32 +01:00
}
} else {
# complete data frame
df <- x
}
2018-12-22 22:39:34 +01:00
2019-01-28 11:20:32 +01:00
# support grouping variables
if ( length ( x.group ) > 0 ) {
2019-01-29 20:20:09 +01:00
x.group_cols <- c ( x.group , cols.names )
2019-01-28 11:20:32 +01:00
x <- bind_cols ( x , df )
# if (droplevels == TRUE) {
# x <- x %>% mutate_at(vars(x.group_cols), droplevels)
# }
suppressWarnings (
df <- x %>%
group_by_at ( vars ( x.group_cols ) ) %>%
summarise ( count = n ( ) )
)
if ( na.rm == TRUE ) {
df <- df %>% filter_at ( vars ( x.group_cols ) , all_vars ( ! is.na ( .) ) )
}
if ( ! missing ( sort.count ) ) {
if ( sort.count == TRUE ) {
df <- df %>% arrange_at ( c ( x.group_cols , " count" ) , desc )
2018-12-22 22:39:34 +01:00
}
2018-11-06 16:41:59 +01:00
}
2019-01-28 11:20:32 +01:00
df <- df %>%
mutate ( cum_count = cumsum ( count ) )
df.topleft <- df [1 , 1 ]
df <- df %>%
ungroup ( ) %>%
# do not repeat group labels
mutate_at ( vars ( x.group ) , funs ( ifelse ( lag ( .) == ., " " , .) ) )
df [1 , 1 ] <- df.topleft
colnames ( df ) [1 : 2 ] <- c ( " group" , " item" )
if ( ! is.null ( levels ( df $ item ) ) & droplevels == TRUE ) {
# is factor
df <- df %>% filter ( count != 0 )
2018-08-24 11:08:20 +02:00
}
2018-07-01 21:40:37 +02:00
} else {
2019-01-28 11:20:32 +01:00
if ( ! is.null ( df ) ) {
# no groups, multiple values like: septic_patients %>% freq(mo, mo_genus(mo))
x <- df
df <- NULL
}
}
if ( length ( cols ) > 0 & is.data.frame ( x ) ) {
2019-01-29 20:20:09 +01:00
x <- x [ , cols.names ]
2018-07-01 21:40:37 +02:00
}
2019-01-28 11:20:32 +01:00
2018-12-22 22:39:34 +01:00
} else if ( any ( class ( x ) == " table" ) ) {
2018-10-22 12:32:59 +02:00
x <- as.data.frame ( x , stringsAsFactors = FALSE )
# now this DF contains 3 columns: the 2 vars and a Freq column
# paste the first 2 cols and repeat them Freq times:
x <- rep ( x = do.call ( paste , c ( x [colnames ( x ) [1 : 2 ] ] , sep = sep ) ) ,
2018-10-23 09:42:26 +02:00
times = x $ Freq )
2018-07-10 12:27:07 +02:00
x.name <- " a `table` object"
2018-07-09 14:02:58 +02:00
cols <- NULL
2019-01-28 11:20:32 +01:00
# mult.columns <- 2
2018-07-01 21:40:37 +02:00
} else {
x.name <- NULL
cols <- NULL
}
2018-04-18 12:24:54 +02:00
if ( ! is.null ( ncol ( x ) ) ) {
2018-12-22 22:39:34 +01:00
if ( ncol ( x ) == 1 & any ( class ( x ) == " data.frame" ) ) {
2018-04-18 12:24:54 +02:00
x <- x %>% pull ( 1 )
} else if ( ncol ( x ) < 10 ) {
mult.columns <- ncol ( x )
2019-01-29 20:20:09 +01:00
# paste old columns together
2018-10-22 12:32:59 +02:00
x <- do.call ( paste , c ( x [colnames ( x ) ] , sep = sep ) )
2018-04-18 12:24:54 +02:00
} else {
2018-12-22 22:39:34 +01:00
stop ( " A maximum of 9 columns can be analysed at the same time." , call. = FALSE )
2018-04-18 12:24:54 +02:00
}
}
if ( mult.columns > 1 ) {
2018-12-22 22:39:34 +01:00
NAs <- x [is.na ( x ) | x == trimws ( strrep ( " NA " , mult.columns ) ) ]
2018-04-18 12:24:54 +02:00
} else {
NAs <- x [is.na ( x ) ]
}
2018-07-23 14:14:03 +02:00
2018-04-18 12:24:54 +02:00
if ( na.rm == TRUE ) {
2018-07-23 14:14:03 +02:00
x_class <- class ( x )
2018-04-18 12:24:54 +02:00
x <- x [ ! x %in% NAs ]
2018-07-23 14:14:03 +02:00
class ( x ) <- x_class
2018-04-18 12:24:54 +02:00
}
2018-12-22 22:39:34 +01:00
markdown_line <- " "
2018-04-18 12:24:54 +02:00
if ( markdown == TRUE ) {
2018-12-29 22:24:19 +01:00
markdown_line <- " "
2018-04-18 12:24:54 +02:00
}
2018-12-22 22:39:34 +01:00
x_align <- " l"
2018-04-18 12:24:54 +02:00
if ( mult.columns > 0 ) {
2018-12-29 22:24:19 +01:00
header_list <- list ( columns = mult.columns )
2018-04-18 12:24:54 +02:00
} else {
2018-12-29 22:24:19 +01:00
header_list <- list ( class = class ( x ) ,
mode = mode ( x ) )
2018-04-18 12:24:54 +02:00
}
2018-11-24 20:25:09 +01:00
2018-12-22 22:39:34 +01:00
if ( ! is.null ( levels ( x ) ) ) {
2018-12-29 22:24:19 +01:00
header_list $ levels <- levels ( x )
header_list $ ordered <- is.ordered ( x )
2018-12-22 22:39:34 +01:00
# drop levels of non-existing factor values,
# since dplyr >= 0.8.0 does not do this anymore in group_by
if ( droplevels == TRUE ) {
x <- droplevels ( x )
}
}
2018-12-29 22:24:19 +01:00
header_list $ length <- length ( x )
header_list $ na_length <- length ( NAs )
header_list $ unique <- n_distinct ( x )
2018-04-18 12:24:54 +02:00
2018-09-17 09:42:09 +02:00
if ( NROW ( x ) > 0 & any ( class ( x ) == " character" ) ) {
2018-12-29 22:24:19 +01:00
header_list $ shortest <- x %>% base :: nchar ( ) %>% base :: min ( na.rm = TRUE )
header_list $ longest <- x %>% base :: nchar ( ) %>% base :: max ( na.rm = TRUE )
2018-12-10 10:13:40 +01:00
}
if ( NROW ( x ) > 0 & any ( class ( x ) == " mo" ) ) {
2018-12-29 22:24:19 +01:00
header_list $ families <- x %>% mo_family ( ) %>% n_distinct ( )
header_list $ genera <- x %>% mo_genus ( ) %>% n_distinct ( )
header_list $ species <- x %>% mo_species ( ) %>% n_distinct ( )
2018-09-17 09:42:09 +02:00
}
2018-11-30 12:05:59 +01:00
if ( NROW ( x ) > 0 & any ( class ( x ) == " difftime" ) & ! is.hms ( x ) ) {
2018-12-29 22:24:19 +01:00
header_list $ units <- attributes ( x ) $ units
2018-10-12 16:35:18 +02:00
x <- as.double ( x )
2018-10-19 21:52:08 +02:00
# after this, the numeric header_txt continues
2018-10-12 16:35:18 +02:00
}
2018-12-22 22:39:34 +01:00
if ( NROW ( x ) > 0 & any ( class ( x ) %in% c ( " double" , " integer" , " numeric" , " raw" , " single" ) ) ) {
2018-04-18 12:24:54 +02:00
# right align number
2018-12-22 22:39:34 +01:00
x_align <- " r"
2018-12-29 22:24:19 +01:00
header_list $ mean <- base :: mean ( x , na.rm = TRUE )
header_list $ sd <- stats :: sd ( x , na.rm = TRUE )
header_list $ cv <- cv ( x , na.rm = TRUE )
header_list $ mad <- stats :: mad ( x , na.rm = TRUE )
Tukey_five <- stats :: fivenum ( x , na.rm = TRUE )
header_list $ fivenum <- Tukey_five
header_list $ IQR <- Tukey_five [4 ] - Tukey_five [2 ]
header_list $ cqv <- cqv ( x , na.rm = TRUE )
header_list $ outliers_total <- length ( boxplot.stats ( x ) $ out )
header_list $ outliers_unique <- n_distinct ( boxplot.stats ( x ) $ out )
2018-04-18 12:24:54 +02:00
}
2018-12-29 22:24:19 +01:00
2018-08-03 09:59:39 +02:00
if ( NROW ( x ) > 0 & any ( class ( x ) == " rsi" ) ) {
2018-12-29 22:24:19 +01:00
header_list $ count_S <- sum ( x == " S" , na.rm = TRUE )
header_list $ count_IR <- sum ( x %in% c ( " I" , " R" ) , na.rm = TRUE )
2018-08-01 22:37:28 +02:00
}
2018-04-18 12:24:54 +02:00
formatdates <- " %e %B %Y" # = d mmmm yyyy
2018-10-09 13:53:33 +02:00
if ( is.hms ( x ) ) {
2018-04-18 12:24:54 +02:00
x <- x %>% as.POSIXlt ( )
formatdates <- " %H:%M:%S"
}
2018-12-22 22:39:34 +01:00
if ( NROW ( x ) > 0 & any ( class ( x ) %in% c ( " Date" , " POSIXct" , " POSIXlt" ) ) ) {
2018-08-23 21:27:15 +02:00
if ( formatdates == " %H:%M:%S" ) {
# hms
2018-12-29 22:24:19 +01:00
header_list $ earliest <- min ( x , na.rm = TRUE )
header_list $ latest <- max ( x , na.rm = TRUE )
2018-08-23 21:27:15 +02:00
} else {
# other date formats
2018-12-29 22:24:19 +01:00
header_list $ oldest <- min ( x , na.rm = TRUE )
header_list $ newest <- max ( x , na.rm = TRUE )
2018-08-23 21:27:15 +02:00
}
2018-12-29 22:24:19 +01:00
header_list $ median <- median ( x , na.rm = TRUE )
header_list $ date_format <- formatdates
2018-04-18 12:24:54 +02:00
}
2018-12-22 22:39:34 +01:00
if ( any ( class ( x ) == " POSIXlt" ) ) {
2018-04-18 12:24:54 +02:00
x <- x %>% format ( formatdates )
}
2018-05-09 11:44:46 +02:00
nmax.set <- ! missing ( nmax )
2018-07-01 21:40:37 +02:00
if ( ! nmax.set & is.null ( nmax ) & is.null ( base :: getOption ( " max.print.freq" , default = NULL ) ) ) {
2018-05-09 11:44:46 +02:00
# default for max print setting
nmax <- 15
2018-07-01 21:40:37 +02:00
} else if ( is.null ( nmax ) ) {
nmax <- length ( x )
2018-05-09 11:44:46 +02:00
}
2018-07-03 11:30:40 +02:00
if ( nmax %in% c ( 0 , Inf , NA , NULL ) ) {
2018-04-18 12:24:54 +02:00
nmax <- length ( x )
}
2018-12-22 22:39:34 +01:00
column_names <- c ( " Item" , " Count" , " Percent" , " Cum. Count" , " Cum. Percent" )
column_names_df <- c ( " item" , " count" , " percent" , " cum_count" , " cum_percent" )
column_align <- c ( x_align , " r" , " r" , " r" , " r" )
2018-10-23 09:42:26 +02:00
2018-11-06 16:41:59 +01:00
if ( is.null ( df ) ) {
2018-12-29 22:24:19 +01:00
suppressWarnings ( # suppress since dplyr 0.8.0, which idiotly warns about included NAs :(
# create table with counts and percentages
df <- tibble ( item = x ) %>%
group_by ( item ) %>%
summarise ( count = n ( ) )
)
2018-10-23 09:42:26 +02:00
2018-11-06 16:41:59 +01:00
# sort according to setting
if ( sort.count == TRUE ) {
df <- df %>% arrange ( desc ( count ) , item )
} else {
df <- df %>% arrange ( item )
}
} else {
column_names <- c ( " Group" , column_names )
column_names_df <- c ( " group" , column_names_df )
column_align <- c ( " l" , column_align )
}
2018-04-18 12:24:54 +02:00
2018-12-22 22:39:34 +01:00
if ( df $ item %>% paste ( collapse = " ," ) %like% " \033" ) {
2018-08-24 14:18:38 +02:00
# remove escape char
# see https://en.wikipedia.org/wiki/Escape_character#ASCII_escape_character
2018-12-22 22:39:34 +01:00
df <- df %>% mutate ( item = item %>% gsub ( " \033" , " " , ., fixed = TRUE ) )
2018-04-18 12:24:54 +02:00
}
2018-09-10 15:45:25 +02:00
if ( quote == TRUE ) {
df $ item <- paste0 ( ' "' , df $ item , ' "' )
2018-11-06 16:41:59 +01:00
if ( length ( x.group ) != 0 ) {
df $ group <- paste0 ( ' "' , df $ group , ' "' )
}
2018-09-10 15:45:25 +02:00
}
2018-07-01 21:40:37 +02:00
df <- as.data.frame ( df , stringsAsFactors = FALSE )
df $ percent <- df $ count / base :: sum ( df $ count , na.rm = TRUE )
2018-11-06 16:41:59 +01:00
if ( length ( x.group ) == 0 ) {
df $ cum_count <- base :: cumsum ( df $ count )
}
2018-07-01 21:40:37 +02:00
df $ cum_percent <- df $ cum_count / base :: sum ( df $ count , na.rm = TRUE )
2018-11-06 16:41:59 +01:00
if ( length ( x.group ) != 0 ) {
# sort columns
df <- df [ , column_names_df ]
}
2018-04-18 12:24:54 +02:00
2018-07-01 21:40:37 +02:00
if ( markdown == TRUE ) {
2018-12-22 22:39:34 +01:00
tbl_format <- " markdown"
2018-05-09 11:44:46 +02:00
} else {
2018-12-22 22:39:34 +01:00
tbl_format <- " pandoc"
2018-04-18 12:24:54 +02:00
}
2018-05-09 11:44:46 +02:00
2018-10-31 12:10:49 +01:00
if ( ! is.null ( title ) ) {
2018-11-06 16:41:59 +01:00
title <- trimws ( gsub ( " ^Frequency table of" , " " , title [1L ] , ignore.case = TRUE ) )
2018-10-31 12:10:49 +01:00
}
2018-12-29 22:24:19 +01:00
# if (nmax.set == FALSE) {
# nmax <- nrow(df)
# }
2018-10-23 09:42:26 +02:00
structure ( .Data = df ,
2018-12-22 22:39:34 +01:00
class = c ( " frequency_tbl" , class ( df ) ) ,
2018-12-29 22:24:19 +01:00
header = header_list ,
2018-11-06 16:41:59 +01:00
opt = list ( title = title ,
data = x.name ,
2018-10-23 09:42:26 +02:00
vars = cols ,
2018-11-06 16:41:59 +01:00
group_var = x.group ,
2018-10-23 09:42:26 +02:00
header = header ,
row_names = row.names ,
column_names = column_names ,
column_align = column_align ,
2018-12-10 10:13:40 +01:00
decimal.mark = decimal.mark ,
big.mark = big.mark ,
2018-10-23 09:42:26 +02:00
tbl_format = tbl_format ,
na = na ,
2018-12-29 22:24:19 +01:00
digits = digits ,
2018-10-23 09:42:26 +02:00
nmax = nmax ,
nmax.set = nmax.set ) )
2018-04-18 12:24:54 +02:00
}
#' @rdname freq
#' @export
2018-07-01 21:40:37 +02:00
freq <- frequency_tbl
2018-06-20 14:47:37 +02:00
2018-12-29 22:24:19 +01:00
#' @importFrom crayon silver green red
#' @importFrom dplyr %>%
format_header <- function ( x , markdown = FALSE , decimal.mark = " ." , big.mark = " ," , digits = 2 ) {
newline <- " \n"
if ( markdown == TRUE ) {
newline <- " \n"
2019-01-30 16:00:55 +01:00
# no colours in markdown
silver <- function ( x ) x
green <- function ( x ) x
red <- function ( x ) x
2018-12-29 22:24:19 +01:00
}
header <- header ( x )
x_class <- header $ class
has_length <- header $ length + header $ na_length > 0
# FORMATTING
# rsi
if ( has_length == TRUE & any ( x_class == " rsi" ) ) {
header $ `%IR` <- paste ( ( header $ count_IR / header $ length ) %>% percent ( force_zero = TRUE , round = digits , decimal.mark = decimal.mark ) ,
paste0 ( " (ratio S : IR = 1.0 : " , ( header $ count_IR / header $ count_S ) %>% format ( digits = 1 , nsmall = 1 , decimal.mark = decimal.mark , big.mark = big.mark ) , " )" ) )
header <- header [ ! names ( header ) %in% c ( " count_S" , " count_IR" ) ]
}
# dates
if ( ! is.null ( header $ date_format ) ) {
if ( header $ date_format == " %H:%M:%S" ) {
header $ median <- paste0 ( format ( header $ median , header $ date_format ) ,
" (" ,
( as.double ( difftime ( header $ median , header $ earliest , units = " auto" ) ) /
as.double ( difftime ( header $ latest , header $ earliest , units = " auto" ) ) ) %>%
percent ( round = digits , decimal.mark = decimal.mark ) , " )" )
header $ latest <- paste0 ( format ( header $ latest , header $ date_format ) ,
" (+" ,
difftime ( header $ latest , header $ earliest , units = " mins" ) %>%
as.double ( ) %>%
format ( digits = digits , decimal.mark = decimal.mark , big.mark = big.mark ) ,
" min.)" )
header $ earliest <- format ( header $ earliest , header $ date_format )
header $ median <- trimws ( header $ median )
header $ latest <- trimws ( header $ latest )
header $ earliest <- trimws ( header $ earliest )
} else {
header $ median <- paste0 ( format ( header $ median , header $ date_format ) ,
" (" ,
( as.double ( difftime ( header $ median , header $ oldest , units = " auto" ) ) /
as.double ( difftime ( header $ newest , header $ oldest , units = " auto" ) ) ) %>%
percent ( round = digits , decimal.mark = decimal.mark ) , " )" )
header $ newest <- paste0 ( format ( header $ newest , header $ date_format ) ,
" (+" ,
difftime ( header $ newest , header $ oldest , units = " auto" ) %>%
as.double ( ) %>%
format ( digits = digits , decimal.mark = decimal.mark , big.mark = big.mark ) ,
" )" )
header $ oldest <- format ( header $ oldest , header $ date_format )
header $ median <- trimws ( header $ median )
header $ newest <- trimws ( header $ newest )
header $ oldest <- trimws ( header $ oldest )
}
header <- header [names ( header ) != " date_format" ]
}
# class and mode
if ( is.null ( header $ columns ) ) {
2019-02-01 16:55:55 +01:00
if ( markdown == TRUE ) {
header $ class <- paste0 ( " `" , header $ class , " `" )
}
2018-12-29 22:24:19 +01:00
if ( ! header $ mode %in% header $ class ) {
2019-02-01 16:55:55 +01:00
if ( markdown == TRUE ) {
header $ mode <- paste0 ( " `" , header $ mode , " `" )
}
2018-12-29 22:24:19 +01:00
header $ class <- header $ class %>% rev ( ) %>% paste ( collapse = " > " ) %>% paste0 ( silver ( paste0 ( " (" , header $ mode , " )" ) ) )
} else {
header $ class <- header $ class %>% rev ( ) %>% paste ( collapse = " > " )
}
header <- header [names ( header ) != " mode" ]
}
# levels
if ( ! is.null ( header $ levels ) ) {
n_levels <- header $ levels %>% length ( )
n_levels_list <- header $ levels
if ( n_levels > 5 ) {
n_levels_list <- c ( n_levels_list [1 : 5 ] , " ..." )
}
if ( header $ ordered == TRUE ) {
2018-12-31 01:48:53 +01:00
n_levels_list <- paste0 ( n_levels_list , collapse = " < " )
2018-12-29 22:24:19 +01:00
} else {
2018-12-31 01:48:53 +01:00
n_levels_list <- paste0 ( n_levels_list , collapse = " , " )
2018-12-29 22:24:19 +01:00
}
header $ levels <- n_levels_list
header <- header [names ( header ) != " ordered" ]
}
# length and NAs
if ( has_length == TRUE ) {
na_txt <- paste0 ( header $ na_length %>% format ( decimal.mark = decimal.mark , big.mark = big.mark ) , " = " ,
( header $ na_length / ( header $ na_length + header $ length ) ) %>% percent ( force_zero = TRUE , round = digits , decimal.mark = decimal.mark ) %>%
sub ( " NaN" , " 0" , ., fixed = TRUE ) )
if ( ! na_txt %like% " ^0 =" ) {
na_txt <- red ( na_txt )
} else {
na_txt <- green ( na_txt )
}
na_txt <- paste0 ( " (of which NA: " , na_txt , " )" )
} else {
na_txt <- " "
}
header $ length <- paste ( ( header $ na_length + header $ length ) %>% format ( decimal.mark = decimal.mark , big.mark = big.mark ) ,
na_txt )
header <- header [names ( header ) != " na_length" ]
# format all numeric values
header <- lapply ( header , function ( x )
if ( is.numeric ( x ) )
if ( any ( x < 1000 ) ) {
format ( round2 ( x , digits = digits ) , decimal.mark = decimal.mark , big.mark = big.mark )
} else {
format ( x , digits = digits , decimal.mark = decimal.mark , big.mark = big.mark )
}
else
x
)
# numeric values
if ( has_length == TRUE & any ( x_class %in% c ( " double" , " integer" , " numeric" , " raw" , " single" ) ) ) {
header $ sd <- paste0 ( header $ sd , " (CV: " , header $ cv , " , MAD: " , header $ mad , " )" )
header $ fivenum <- paste0 ( paste ( header $ fivenum , collapse = " | " ) , " (IQR: " , header $ IQR , " , CQV: " , header $ cqv , " )" )
header $ outliers_total <- paste0 ( header $ outliers_total , " (unique count: " , header $ outliers_unique , " )" )
header <- header [ ! names ( header ) %in% c ( " cv" , " mad" , " IQR" , " cqv" , " outliers_unique" ) ]
}
# header names
header_names <- paste0 ( names ( header ) , " : " )
header_names <- gsub ( " sd" , " SD" , header_names )
header_names <- gsub ( " fivenum" , " Five-Num" , header_names )
header_names <- gsub ( " outliers_total" , " Outliers" , header_names )
# capitalise first character
header_names <- gsub ( " ^(.)" , " \\U\\1" , header_names , perl = TRUE )
# make all header captions equal size
header_names <- gsub ( " \\s" , " " , format ( header_names ,
width = max ( nchar ( header_names ) ,
na.rm = TRUE ) ) )
header <- paste0 ( header_names , header )
header <- paste ( header , collapse = newline )
# add newline after 'Unique'
gsub ( " (.*Unique.*\\n)(.*?)" , paste0 ( " \\1" , newline , " \\2" ) , header )
}
2018-06-20 14:47:37 +02:00
#' @rdname freq
#' @export
#' @importFrom dplyr top_n pull
top_freq <- function ( f , n ) {
2018-12-22 22:39:34 +01:00
if ( ! " frequency_tbl" %in% class ( f ) ) {
2019-01-30 16:00:55 +01:00
stop ( " `top_freq` can only be applied to frequency tables" , call. = FALSE )
2018-06-20 14:47:37 +02:00
}
if ( ! is.numeric ( n ) | length ( n ) != 1L ) {
2019-01-30 16:00:55 +01:00
stop ( " For `top_freq`, 'n' must be a number of length 1" , call. = FALSE )
2018-06-20 14:47:37 +02:00
}
top <- f %>% top_n ( n , count )
vect <- top %>% pull ( item )
names ( vect ) <- top %>% pull ( count )
if ( length ( vect ) > abs ( n ) ) {
message ( " top_freq: selecting " , length ( vect ) , " items instead of " , abs ( n ) , " , because of ties" )
}
vect
}
2018-12-29 22:24:19 +01:00
#' @rdname freq
#' @export
header <- function ( f , property = NULL ) {
2019-01-30 16:00:55 +01:00
if ( ! " frequency_tbl" %in% class ( f ) ) {
stop ( " `header` can only be applied to frequency tables" , call. = FALSE )
}
2018-12-29 22:24:19 +01:00
if ( is.null ( property ) ) {
attributes ( f ) $ header
} else {
a <- attributes ( f ) $ header
if ( any ( property %in% names ( f ) ) ) {
a [names ( a ) %in% property ]
}
}
}
2018-10-01 11:39:43 +02:00
#' @noRd
2018-09-29 21:54:32 +02:00
#' @exportMethod diff.frequency_tbl
#' @importFrom dplyr %>% full_join mutate
#' @export
diff.frequency_tbl <- function ( x , y , ... ) {
# check classes
if ( ! " frequency_tbl" %in% class ( x )
| ! " frequency_tbl" %in% class ( y ) ) {
stop ( " Both x and y must be a frequency table." )
}
2018-10-01 14:44:40 +02:00
cat ( " Differences between frequency tables" )
if ( identical ( x , y ) ) {
cat ( " \n\nNo differences found.\n" )
return ( invisible ( ) )
}
2018-09-29 21:54:32 +02:00
x.attr <- attributes ( x ) $ opt
# only keep item and count
x <- x [ , 1 : 2 ]
y <- y [ , 1 : 2 ]
x <- x %>%
full_join ( y ,
by = colnames ( x ) [1 ] ,
suffix = c ( " .x" , " .y" ) ) %>%
mutate (
diff = case_when (
is.na ( count.y ) ~ - count.x ,
is.na ( count.x ) ~ count.y ,
TRUE ~ count.y - count.x ) ) %>%
mutate (
diff.percent = percent (
diff / count.x ,
2018-10-01 11:39:43 +02:00
force_zero = TRUE ) ) %>%
2018-12-22 22:39:34 +01:00
mutate ( diff = ifelse ( diff %like% " ^-" ,
2018-10-01 11:39:43 +02:00
diff ,
paste0 ( " +" , diff ) ) ,
2018-12-22 22:39:34 +01:00
diff.percent = ifelse ( diff.percent %like% " ^-" ,
2018-10-01 11:39:43 +02:00
diff.percent ,
paste0 ( " +" , diff.percent ) ) )
2018-09-29 21:54:32 +02:00
print (
knitr :: kable ( x ,
format = x.attr $ tbl_format ,
col.names = c ( " Item" , " Count #1" , " Count #2" , " Difference" , " Diff. percent" ) ,
2018-10-01 14:44:40 +02:00
align = paste0 ( x.attr $ column_align [1 ] , " rrrr" ) ,
2018-09-29 21:54:32 +02:00
padding = 1 )
)
}
2018-07-03 11:30:40 +02:00
#' @rdname freq
2018-07-01 21:40:37 +02:00
#' @exportMethod print.frequency_tbl
#' @importFrom knitr kable
#' @importFrom dplyr n_distinct
2018-10-22 12:32:59 +02:00
#' @importFrom crayon bold silver
2018-07-01 21:40:37 +02:00
#' @export
2018-12-29 22:24:19 +01:00
print.frequency_tbl <- function ( x ,
nmax = getOption ( " max.print.freq" , default = 15 ) ,
markdown = ! interactive ( ) ,
2019-01-30 16:00:55 +01:00
header = TRUE ,
2018-12-10 10:13:40 +01:00
decimal.mark = getOption ( " OutDec" ) ,
big.mark = ifelse ( decimal.mark != " ," , " ," , " ." ) ,
... ) {
2018-07-01 21:40:37 +02:00
2018-12-22 22:39:34 +01:00
opt <- attr ( x , " opt" )
2018-12-29 22:24:19 +01:00
opt $ header_txt <- header ( x )
2018-07-01 21:40:37 +02:00
2019-02-08 16:06:54 +01:00
dots <- list ( ... )
if ( " markdown" %in% names ( dots ) ) {
if ( dots $ markdown == TRUE ) {
opt $ tbl_format <- " markdown"
} else {
opt $ tbl_format <- " pandoc"
}
}
if ( ! missing ( markdown ) ) {
if ( markdown == TRUE ) {
opt $ tbl_format <- " markdown"
} else {
opt $ tbl_format <- " pandoc"
}
}
2018-08-24 11:08:20 +02:00
if ( length ( opt $ vars ) == 0 ) {
opt $ vars <- NULL
}
2018-11-06 16:41:59 +01:00
if ( is.null ( opt $ title ) ) {
2019-02-08 16:06:54 +01:00
if ( isTRUE ( opt $ data %like% " ^a data.frame" ) & opt $ tbl_format == " markdown" ) {
opt $ data <- gsub ( " data.frame" , " `data.frame`" , opt $ data , fixed = TRUE )
}
2018-11-06 16:41:59 +01:00
if ( ! is.null ( opt $ data ) & ! is.null ( opt $ vars ) ) {
title <- paste0 ( " `" , paste0 ( opt $ vars , collapse = " ` and `" ) , " ` from " , opt $ data )
} else if ( ! is.null ( opt $ data ) & is.null ( opt $ vars ) ) {
title <- opt $ data
} else if ( is.null ( opt $ data ) & ! is.null ( opt $ vars ) ) {
title <- paste0 ( " `" , paste0 ( opt $ vars , collapse = " ` and `" ) , " `" )
} else {
title <- " "
}
if ( title != " " & length ( opt $ group_var ) != 0 ) {
group_var <- paste0 ( " (grouped by `" , opt $ group_var , " `)" )
if ( opt $ tbl_format == " pandoc" ) {
group_var <- silver ( group_var )
}
title <- paste ( title , group_var )
}
2018-11-24 20:25:09 +01:00
title <- trimws ( title )
if ( title == " " ) {
title <- " Frequency table"
} else {
title <- paste ( " Frequency table of" , trimws ( title ) )
}
2018-07-01 21:40:37 +02:00
} else {
2018-11-06 16:41:59 +01:00
title <- opt $ title
2018-07-01 21:40:37 +02:00
}
2018-07-03 11:30:40 +02:00
if ( ! missing ( nmax ) ) {
opt $ nmax <- nmax
opt $ nmax.set <- TRUE
}
2018-12-29 22:24:19 +01:00
if ( opt $ nmax %in% c ( 0 , Inf , NA , NULL ) ) {
opt $ nmax <- NROW ( x )
opt $ nmax.set <- FALSE
} else if ( opt $ nmax >= NROW ( x ) ) {
opt $ nmax.set <- FALSE
}
2018-12-10 10:13:40 +01:00
if ( ! missing ( decimal.mark ) ) {
opt $ decimal.mark <- decimal.mark
}
if ( ! missing ( big.mark ) ) {
opt $ big.mark <- big.mark
}
2018-11-16 21:57:55 +01:00
if ( ! missing ( header ) ) {
opt $ header <- header
}
2018-07-03 11:30:40 +02:00
2018-10-22 13:06:54 +02:00
# bold title
2018-10-18 12:10:10 +02:00
if ( opt $ tbl_format == " pandoc" ) {
2018-10-22 13:06:54 +02:00
title <- bold ( title )
} else if ( opt $ tbl_format == " markdown" ) {
2019-01-02 23:24:07 +01:00
title <- paste0 ( " \n**" , title , " ** " ) # two space for newline
2018-10-18 12:10:10 +02:00
}
2018-10-19 21:52:08 +02:00
if ( opt $ header == TRUE ) {
cat ( title , " \n" )
if ( ! is.null ( opt $ header_txt ) ) {
2018-12-29 22:24:19 +01:00
if ( is.null ( opt $ digits ) ) {
opt $ digits <- 2
}
2019-01-30 16:00:55 +01:00
cat ( format_header ( x , digits = opt $ digits , markdown = ( opt $ tbl_format == " markdown" ) ,
decimal.mark = decimal.mark , big.mark = big.mark ) )
2018-10-19 21:52:08 +02:00
}
2018-10-22 12:32:59 +02:00
} else if ( opt $ tbl_format == " markdown" ) {
# do print title as caption in markdown
2019-01-02 23:24:07 +01:00
cat ( " \n" , title , sep = " " ) # two trailing spaces for markdown
2018-07-01 21:40:37 +02:00
}
if ( NROW ( x ) == 0 ) {
2018-12-22 22:39:34 +01:00
cat ( " \n\nNo observations.\n" )
2018-07-01 21:40:37 +02:00
return ( invisible ( ) )
}
# save old NA setting for kable
opt.old <- options ( ) $ knitr.kable.NA
2018-10-23 09:42:26 +02:00
if ( is.null ( opt $ na ) ) {
opt $ na <- " <NA>"
}
2019-01-02 23:24:07 +01:00
if ( opt $ tbl_format == " markdown" ) {
# no HTML tags
opt $ na <- gsub ( " <" , " (" , opt $ na , fixed = TRUE )
opt $ na <- gsub ( " >" , " )" , opt $ na , fixed = TRUE )
}
2018-10-23 09:42:26 +02:00
options ( knitr.kable.NA = opt $ na )
2018-07-01 21:40:37 +02:00
2018-12-29 22:24:19 +01:00
x.rows <- nrow ( x )
x.unprinted <- base :: sum ( x [ ( opt $ nmax + 1 ) : nrow ( x ) , " count" ] , na.rm = TRUE )
x.printed <- base :: sum ( x $ count ) - x.unprinted
2018-07-01 21:40:37 +02:00
2018-12-29 22:24:19 +01:00
if ( nrow ( x ) > opt $ nmax & opt $ tbl_format != " markdown" ) {
2018-07-01 21:40:37 +02:00
2018-07-03 11:30:40 +02:00
if ( opt $ nmax.set == TRUE ) {
nmax <- opt $ nmax
} else {
nmax <- getOption ( " max.print.freq" , default = 15 )
}
x <- x [1 : nmax , ]
2018-07-01 21:40:37 +02:00
if ( opt $ nmax.set == TRUE ) {
2018-12-22 22:39:34 +01:00
footer <- paste ( " [ reached `nmax = " , opt $ nmax , " `" , sep = " " )
2018-07-01 21:40:37 +02:00
} else {
footer <- ' [ reached getOption("max.print.freq")'
}
footer <- paste ( footer ,
2018-12-22 22:39:34 +01:00
" -- omitted " ,
2018-12-29 22:24:19 +01:00
format ( x.rows - opt $ nmax , big.mark = opt $ big.mark , decimal.mark = opt $ decimal.mark ) ,
2018-12-22 22:39:34 +01:00
" entries, n = " ,
2018-12-29 22:24:19 +01:00
format ( x.unprinted , big.mark = opt $ big.mark , decimal.mark = opt $ decimal.mark ) ,
2018-12-22 22:39:34 +01:00
" (" ,
2018-12-10 10:13:40 +01:00
( x.unprinted / ( x.unprinted + x.printed ) ) %>% percent ( force_zero = TRUE , decimal.mark = opt $ decimal.mark ) ,
2018-12-22 22:39:34 +01:00
" ) ]\n" , sep = " " )
2018-10-22 12:32:59 +02:00
if ( opt $ tbl_format == " pandoc" ) {
footer <- silver ( footer ) # only silver in regular printing
}
2018-12-29 22:24:19 +01:00
} else if ( opt $ tbl_format == " markdown" ) {
if ( opt $ nmax.set == TRUE ) {
x <- x [1 : opt $ nmax , ]
footer <- paste ( " \n(omitted " ,
format ( x.rows - opt $ nmax , big.mark = opt $ big.mark , decimal.mark = opt $ decimal.mark ) ,
" entries, n = " ,
format ( x.unprinted , big.mark = opt $ big.mark , decimal.mark = opt $ decimal.mark ) ,
" [" ,
( x.unprinted / ( x.unprinted + x.printed ) ) %>% percent ( force_zero = TRUE , decimal.mark = opt $ decimal.mark ) ,
" ])\n" , sep = " " )
} else {
footer <- NULL
}
2018-07-01 21:40:37 +02:00
} else {
footer <- NULL
}
2018-11-19 13:00:22 +01:00
if ( " item" %in% colnames ( x ) ) {
2018-12-22 22:39:34 +01:00
if ( any ( class ( x $ item ) %in% c ( " double" , " integer" , " numeric" , " raw" , " single" ) ) ) {
2018-12-10 10:13:40 +01:00
x $ item <- format ( x $ item , decimal.mark = opt $ decimal.mark , big.mark = opt $ big.mark )
2018-11-19 13:06:07 +01:00
}
2018-11-19 13:00:22 +01:00
} else {
opt $ column_names <- opt $ column_names [ ! opt $ column_names == " Item" ]
}
if ( " count" %in% colnames ( x ) ) {
if ( all ( x $ count == 1 ) ) {
2018-12-22 22:39:34 +01:00
warning ( " All observations are unique." , call. = FALSE )
2018-11-19 13:00:22 +01:00
}
2018-12-10 10:13:40 +01:00
x $ count <- format ( x $ count , decimal.mark = opt $ decimal.mark , big.mark = opt $ big.mark )
2018-11-19 13:00:22 +01:00
} else {
opt $ column_names <- opt $ column_names [ ! opt $ column_names == " Count" ]
}
if ( " percent" %in% colnames ( x ) ) {
2018-12-10 10:13:40 +01:00
x $ percent <- percent ( x $ percent , force_zero = TRUE , decimal.mark = opt $ decimal.mark )
2018-11-19 13:00:22 +01:00
} else {
opt $ column_names <- opt $ column_names [ ! opt $ column_names == " Percent" ]
}
if ( " cum_count" %in% colnames ( x ) ) {
2018-12-10 10:13:40 +01:00
x $ cum_count <- format ( x $ cum_count , decimal.mark = opt $ decimal.mark , big.mark = opt $ big.mark )
2018-11-19 13:00:22 +01:00
} else {
opt $ column_names <- opt $ column_names [ ! opt $ column_names == " Cum. Count" ]
}
if ( " cum_percent" %in% colnames ( x ) ) {
2018-12-10 10:13:40 +01:00
x $ cum_percent <- percent ( x $ cum_percent , force_zero = TRUE , decimal.mark = opt $ decimal.mark )
2018-11-19 13:00:22 +01:00
} else {
opt $ column_names <- opt $ column_names [ ! opt $ column_names == " Cum. Percent" ]
}
2018-07-01 21:40:37 +02:00
2018-10-23 09:42:26 +02:00
if ( opt $ tbl_format == " markdown" ) {
2018-11-02 10:27:57 +01:00
cat ( " \n" )
2018-10-23 09:42:26 +02:00
}
2018-07-01 21:40:37 +02:00
print (
knitr :: kable ( x ,
format = opt $ tbl_format ,
row.names = opt $ row_names ,
col.names = opt $ column_names ,
align = opt $ column_align ,
padding = 1 )
)
if ( ! is.null ( footer ) ) {
cat ( footer )
}
2018-10-23 09:42:26 +02:00
if ( opt $ tbl_format == " markdown" ) {
cat ( " \n\n" )
} else {
2018-12-22 22:39:34 +01:00
cat ( " \n" )
2018-10-23 09:42:26 +02:00
}
2018-07-01 21:40:37 +02:00
# reset old kable setting
options ( knitr.kable.NA = opt.old )
return ( invisible ( ) )
}
2018-06-20 14:47:37 +02:00
2018-07-03 11:30:40 +02:00
#' @noRd
#' @exportMethod as.data.frame.frequency_tbl
#' @export
as.data.frame.frequency_tbl <- function ( x , ... ) {
2018-12-22 22:39:34 +01:00
attr ( x , " package" ) <- NULL
attr ( x , " opt" ) <- NULL
2018-07-03 11:30:40 +02:00
as.data.frame.data.frame ( x , ... )
}
2018-07-08 22:14:55 +02:00
2019-01-17 12:08:04 +01:00
#' @exportMethod select.frequency_tbl
#' @export
#' @importFrom dplyr select
#' @noRd
select.frequency_tbl <- function ( .data , ... ) {
select ( as.data.frame ( .data ) , ... )
}
2018-07-09 14:02:58 +02:00
#' @noRd
#' @exportMethod as_tibble.frequency_tbl
#' @export
#' @importFrom dplyr as_tibble
as_tibble.frequency_tbl <- function ( x , validate = TRUE , ... , rownames = NA ) {
2018-12-22 22:39:34 +01:00
attr ( x , " package" ) <- NULL
attr ( x , " opt" ) <- NULL
2018-07-09 14:02:58 +02:00
as_tibble ( x = as.data.frame ( x ) , validate = validate , ... , rownames = rownames )
}
2018-07-08 22:14:55 +02:00
#' @noRd
#' @exportMethod hist.frequency_tbl
#' @export
#' @importFrom graphics hist
2018-12-22 22:39:34 +01:00
hist.frequency_tbl <- function ( x , breaks = " Sturges" , main = NULL , xlab = NULL , ... ) {
opt <- attr ( x , " opt" )
2018-11-01 17:06:08 +01:00
if ( ! class ( x $ item ) %in% c ( " numeric" , " double" , " integer" , " Date" ) ) {
2018-12-22 22:39:34 +01:00
stop ( " `x` must be numeric or Date." , call. = FALSE )
2018-11-01 17:06:08 +01:00
}
2018-07-08 22:14:55 +02:00
if ( ! is.null ( opt $ vars ) ) {
title <- opt $ vars
2018-11-01 17:06:08 +01:00
} else if ( ! is.null ( opt $ data ) ) {
title <- opt $ data
2018-07-08 22:14:55 +02:00
} else {
2018-11-01 17:06:08 +01:00
title <- " frequency table"
}
if ( class ( x $ item ) == " Date" ) {
x <- as.Date ( as.vector ( x ) , origin = " 1970-01-01" )
} else {
x <- as.vector ( x )
}
if ( is.null ( main ) ) {
main <- paste ( " Histogram of" , title )
2018-07-08 22:14:55 +02:00
}
2018-12-22 22:39:34 +01:00
if ( is.null ( xlab ) ) {
xlab <- title
}
hist ( x , main = main , xlab = xlab , breaks = breaks , ... )
2018-07-08 22:14:55 +02:00
}
#' @noRd
#' @exportMethod plot.frequency_tbl
#' @export
plot.frequency_tbl <- function ( x , y , ... ) {
2018-12-22 22:39:34 +01:00
opt <- attr ( x , " opt" )
2018-07-08 22:14:55 +02:00
if ( ! is.null ( opt $ vars ) ) {
title <- opt $ vars
} else {
title <- " "
}
2018-07-09 14:02:58 +02:00
plot ( x = x $ item , y = x $ count , ylab = " Count" , xlab = title , ... )
}
2018-07-08 22:14:55 +02:00
2018-07-09 14:02:58 +02:00
#' @noRd
#' @exportMethod as.vector.frequency_tbl
#' @export
as.vector.frequency_tbl <- function ( x , mode = " any" ) {
as.vector ( rep ( x $ item , x $ count ) , mode = mode )
2018-07-08 22:14:55 +02:00
}
2018-07-16 16:41:48 +02:00
#' @noRd
#' @exportMethod format.frequency_tbl
#' @export
format.frequency_tbl <- function ( x , digits = 1 , ... ) {
2018-12-22 22:39:34 +01:00
opt <- attr ( x , " opt" )
2018-07-16 16:41:48 +02:00
if ( opt $ nmax.set == TRUE ) {
nmax <- opt $ nmax
} else {
nmax <- getOption ( " max.print.freq" , default = 15 )
}
x <- x [1 : nmax , ]
x $ percent <- percent ( x $ percent , round = digits , force_zero = TRUE )
x $ cum_percent <- percent ( x $ cum_percent , round = digits , force_zero = TRUE )
base :: format.data.frame ( x , ... )
}