grouping var for freq

This commit is contained in:
dr. M.S. (Matthijs) Berends 2018-11-06 16:41:59 +01:00
parent 54368fafef
commit 84e08f516a
6 changed files with 116 additions and 41 deletions

View File

@ -212,6 +212,7 @@ importFrom(dplyr,summarise)
importFrom(dplyr,summarise_if)
importFrom(dplyr,tibble)
importFrom(dplyr,top_n)
importFrom(dplyr,ungroup)
importFrom(dplyr,vars)
importFrom(grDevices,boxplot.stats)
importFrom(graphics,axis)
@ -235,5 +236,4 @@ importFrom(stats,predict)
importFrom(stats,sd)
importFrom(utils,View)
importFrom(utils,browseVignettes)
importFrom(utils,installed.packages)
importFrom(xml2,read_html)

View File

@ -25,6 +25,12 @@
* Using `portion_*` functions now throws a warning when total available isolate is below parameter `minimum`
* Functions `as.mo`, `as.rsi`, `as.mic`, `as.atc` and `freq` will not set package name as attribute anymore
* Frequency tables - `freq()`:
* Support for grouping variables, test with:
```r
septic_patients %>%
group_by(hospital_id) %>%
freq(gender)
```
* Check for `hms::is.hms`
* Now prints in markdown at default in non-interactive sessions
* No longer adds the factor level column and sorts factors on count again

133
R/freq.R
View File

@ -19,9 +19,9 @@
#' Frequency table
#'
#' Create a frequency table of a vector with items or a data frame. Supports quasiquotation and markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names.
#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}
#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} (may contain a grouping variable) or \code{\link{table}}
#' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples
#' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.
#' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except when using grouping variables.
#' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.
#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the amount of \code{NA}s.
#' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)}
@ -59,8 +59,8 @@
#' The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will include more than \code{n} rows if there are ties.
#' @importFrom stats fivenum sd mad
#' @importFrom grDevices boxplot.stats
#' @importFrom dplyr %>% select pull n_distinct group_by arrange desc mutate summarise n_distinct tibble
#' @importFrom utils browseVignettes installed.packages
#' @importFrom dplyr %>% arrange arrange_at desc funs group_by mutate mutate_at n_distinct pull select summarise tibble ungroup vars
#' @importFrom utils browseVignettes
#' @importFrom hms is.hms
#' @importFrom crayon red green silver
#' @keywords summary summarise frequency freq
@ -77,7 +77,7 @@
#' septic_patients$hospital_id %>% freq()
#' septic_patients[, "hospital_id"] %>% freq()
#' septic_patients %>% freq("hospital_id")
#' septic_patients %>% freq(hospital_id) #<- easiest to remember when you're used to tidyverse
#' septic_patients %>% freq(hospital_id) #<- easiest to remember (tidyverse)
#'
#' # you could also use `select` or `pull` to get your variables
#' septic_patients %>%
@ -91,6 +91,11 @@
#' filter(hospital_id == "A") %>%
#' freq(genus, species)
#'
#' # group a variable and analyse another
#' septic_patients %>%
#' group_by(hospital_id) %>%
#' freq(gender)
#'
#' # get top 10 bugs of hospital A as a vector
#' septic_patients %>%
#' filter(hospital_id == "A") %>%
@ -157,6 +162,8 @@ frequency_tbl <- function(x,
sep = " ") {
mult.columns <- 0
x.group = character(0)
df <- NULL
x.name <- NULL
cols <- NULL
@ -174,6 +181,12 @@ frequency_tbl <- function(x,
}
if (any(class(x) == 'data.frame')) {
x.group <- group_vars(x)
if (length(x.group) > 1) {
x.group <- x.group[1L]
warning("freq supports one grouping variable, only `", x.group, "` will be kept.", call. = FALSE)
}
if (is.null(x.name)) {
x.name <- deparse(substitute(x))
}
@ -188,6 +201,27 @@ frequency_tbl <- function(x,
if (!all(cols %in% colnames(x))) {
stop("one or more columns not found: `", paste(cols, collapse = "`, `"), '`', call. = FALSE)
}
if (length(x.group) > 0) {
x.group_cols <- c(x.group, cols)
df <- x %>%
group_by_at(vars(x.group_cols)) %>%
summarise(count = n())
if (!missing(sort.count)) {
if (sort.count == TRUE) {
df <- df %>% arrange_at(c(x.group, "count"), desc)
}
}
df <- df %>%
mutate(cum_count = cumsum(count))
df.topleft <- df[1, 1]
df <- df %>%
ungroup() %>%
# do not repeat group labels
mutate_at(vars(x.group), funs(ifelse(lag(.) == ., "", .)))
df[1, 1] <- df.topleft
colnames(df)[1:2] <- c("group", "item")
}
if (length(cols) > 0) {
x <- x[, cols]
}
@ -233,10 +267,9 @@ frequency_tbl <- function(x,
class(x) <- x_class
}
if (sort.count == FALSE & 'factor' %in% class(x)) {
# warning("Sorting a factor sorts on factor level, not necessarily alphabetically.", call. = FALSE)
}
# if (sort.count == FALSE & 'factor' %in% class(x)) {
# warning("Sorting a factor sorts on factor level, not necessarily alphabetically.", call. = FALSE)
# }
header_txt <- character(0)
markdown_line <- ''
@ -352,40 +385,52 @@ frequency_tbl <- function(x,
nmax <- length(x)
}
# create table with counts and percentages
column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent')
column_names_df <- c('item', 'count', 'percent', 'cum_count', 'cum_percent')
df <- tibble(item = x) %>%
group_by(item) %>%
summarise(count = n())
column_align <- c(x_align, 'r', 'r', 'r', 'r')
if (is.null(df)) {
# create table with counts and percentages
df <- tibble(item = x) %>%
group_by(item) %>%
summarise(count = n())
# sort according to setting
if (sort.count == TRUE) {
df <- df %>% arrange(desc(count), item)
} else {
df <- df %>% arrange(item)
}
} else {
column_names <- c("Group", column_names)
column_names_df <-c("group", column_names_df)
column_align <- c("l", column_align)
}
if (df$item %>% paste(collapse = ',') %like% '\033') {
# remove escape char
# see https://en.wikipedia.org/wiki/Escape_character#ASCII_escape_character
df <- df %>% mutate(item = item %>% gsub('\033', ' ', ., fixed = TRUE))
}
# sort according to setting
if (sort.count == TRUE) {
df <- df %>% arrange(desc(count), item)
} else {
df <- df %>% arrange(item)
}
if (quote == TRUE) {
df$item <- paste0('"', df$item, '"')
if (length(x.group) != 0) {
df$group <- paste0('"', df$group, '"')
}
}
df <- as.data.frame(df, stringsAsFactors = FALSE)
df$percent <- df$count / base::sum(df$count, na.rm = TRUE)
df$cum_count <- base::cumsum(df$count)
if (length(x.group) == 0) {
df$cum_count <- base::cumsum(df$count)
}
df$cum_percent <- df$cum_count / base::sum(df$count, na.rm = TRUE)
colnames(df) <- column_names_df
if (length(x.group) != 0) {
# sort columns
df <- df[, column_names_df]
}
if (markdown == TRUE) {
tbl_format <- 'markdown'
@ -394,14 +439,15 @@ frequency_tbl <- function(x,
}
if (!is.null(title)) {
x.name <- trimws(gsub("^Frequency table of", "", title[1L], ignore.case = TRUE))
cols <- NULL
title <- trimws(gsub("^Frequency table of", "", title[1L], ignore.case = TRUE))
}
structure(.Data = df,
class = c('frequency_tbl', class(df)),
opt = list(data = x.name,
opt = list(title = title,
data = x.name,
vars = cols,
group_var = x.group,
header = header,
header_txt = header_txt,
row_names = row.names,
@ -502,14 +548,25 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
opt$vars <- NULL
}
if (!is.null(opt$data) & !is.null(opt$vars)) {
title <- paste0("of `", paste0(opt$vars, collapse = "` and `"), "` from ", opt$data)
} else if (!is.null(opt$data) & is.null(opt$vars)) {
title <- paste("of", opt$data)
} else if (is.null(opt$data) & !is.null(opt$vars)) {
title <- paste0("of `", paste0(opt$vars, collapse = "` and `"), "`")
if (is.null(opt$title)) {
if (!is.null(opt$data) & !is.null(opt$vars)) {
title <- paste0("`", paste0(opt$vars, collapse = "` and `"), "` from ", opt$data)
} else if (!is.null(opt$data) & is.null(opt$vars)) {
title <- opt$data
} else if (is.null(opt$data) & !is.null(opt$vars)) {
title <- paste0("`", paste0(opt$vars, collapse = "` and `"), "`")
} else {
title <- ""
}
if (title != "" & length(opt$group_var) != 0) {
group_var <- paste0("(grouped by `", opt$group_var, "`)")
if (opt$tbl_format == "pandoc") {
group_var <- silver(group_var)
}
title <- paste(title, group_var)
}
} else {
title <- ""
title <- opt$title
}
if (!missing(nmax)) {
@ -525,7 +582,11 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
}
}
title <- paste("Frequency table", trimws(title))
if (trimws(title) == "") {
title <- "Frequency table"
} else {
title <- paste("Frequency table of", trimws(title))
}
# bold title
if (opt$tbl_format == "pandoc") {

View File

@ -23,11 +23,11 @@ top_freq(f, n)
default = 15), ...)
}
\arguments{
\item{x}{vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}}
\item{x}{vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} (may contain a grouping variable) or \code{\link{table}}}
\item{...}{up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples}
\item{sort.count}{sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.}
\item{sort.count}{sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except when using grouping variables.}
\item{nmax}{number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.}
@ -93,7 +93,7 @@ freq(septic_patients[, "hospital_id"])
septic_patients$hospital_id \%>\% freq()
septic_patients[, "hospital_id"] \%>\% freq()
septic_patients \%>\% freq("hospital_id")
septic_patients \%>\% freq(hospital_id) #<- easiest to remember when you're used to tidyverse
septic_patients \%>\% freq(hospital_id) #<- easiest to remember (tidyverse)
# you could also use `select` or `pull` to get your variables
septic_patients \%>\%
@ -107,6 +107,11 @@ septic_patients \%>\%
filter(hospital_id == "A") \%>\%
freq(genus, species)
# group a variable and analyse another
septic_patients \%>\%
group_by(hospital_id) \%>\%
freq(gender)
# get top 10 bugs of hospital A as a vector
septic_patients \%>\%
filter(hospital_id == "A") \%>\%

View File

@ -61,6 +61,9 @@ test_that("frequency table works", {
expect_output(septic_patients %>% select(1:9) %>% freq() %>% print())
expect_output(print(freq(septic_patients$age), nmax = 20))
# grouping variable
expect_output(print(septic_patients %>% group_by(gender) %>% freq(hospital_id)))
# top 5
expect_equal(
septic_patients %>%

View File

@ -56,7 +56,7 @@ test_that("mo_property works", {
# check vector with random values
library(dplyr)
df_sample <- AMR::microorganisms %>% sample_n(100)
expect_identical(df_sample %>% pull(mo) %>% mo_fullname(),
expect_identical(df_sample %>% pull(mo) %>% mo_fullname(language = "en"),
df_sample %>% pull(fullname))
})