mirror of
https://github.com/msberends/AMR.git
synced 2024-12-26 18:46:13 +01:00
grouping var for freq
This commit is contained in:
parent
54368fafef
commit
84e08f516a
@ -212,6 +212,7 @@ importFrom(dplyr,summarise)
|
|||||||
importFrom(dplyr,summarise_if)
|
importFrom(dplyr,summarise_if)
|
||||||
importFrom(dplyr,tibble)
|
importFrom(dplyr,tibble)
|
||||||
importFrom(dplyr,top_n)
|
importFrom(dplyr,top_n)
|
||||||
|
importFrom(dplyr,ungroup)
|
||||||
importFrom(dplyr,vars)
|
importFrom(dplyr,vars)
|
||||||
importFrom(grDevices,boxplot.stats)
|
importFrom(grDevices,boxplot.stats)
|
||||||
importFrom(graphics,axis)
|
importFrom(graphics,axis)
|
||||||
@ -235,5 +236,4 @@ importFrom(stats,predict)
|
|||||||
importFrom(stats,sd)
|
importFrom(stats,sd)
|
||||||
importFrom(utils,View)
|
importFrom(utils,View)
|
||||||
importFrom(utils,browseVignettes)
|
importFrom(utils,browseVignettes)
|
||||||
importFrom(utils,installed.packages)
|
|
||||||
importFrom(xml2,read_html)
|
importFrom(xml2,read_html)
|
||||||
|
6
NEWS.md
6
NEWS.md
@ -25,6 +25,12 @@
|
|||||||
* Using `portion_*` functions now throws a warning when total available isolate is below parameter `minimum`
|
* Using `portion_*` functions now throws a warning when total available isolate is below parameter `minimum`
|
||||||
* Functions `as.mo`, `as.rsi`, `as.mic`, `as.atc` and `freq` will not set package name as attribute anymore
|
* Functions `as.mo`, `as.rsi`, `as.mic`, `as.atc` and `freq` will not set package name as attribute anymore
|
||||||
* Frequency tables - `freq()`:
|
* Frequency tables - `freq()`:
|
||||||
|
* Support for grouping variables, test with:
|
||||||
|
```r
|
||||||
|
septic_patients %>%
|
||||||
|
group_by(hospital_id) %>%
|
||||||
|
freq(gender)
|
||||||
|
```
|
||||||
* Check for `hms::is.hms`
|
* Check for `hms::is.hms`
|
||||||
* Now prints in markdown at default in non-interactive sessions
|
* Now prints in markdown at default in non-interactive sessions
|
||||||
* No longer adds the factor level column and sorts factors on count again
|
* No longer adds the factor level column and sorts factors on count again
|
||||||
|
113
R/freq.R
113
R/freq.R
@ -19,9 +19,9 @@
|
|||||||
#' Frequency table
|
#' Frequency table
|
||||||
#'
|
#'
|
||||||
#' Create a frequency table of a vector with items or a data frame. Supports quasiquotation and markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names.
|
#' Create a frequency table of a vector with items or a data frame. Supports quasiquotation and markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names.
|
||||||
#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}
|
#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} (may contain a grouping variable) or \code{\link{table}}
|
||||||
#' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples
|
#' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples
|
||||||
#' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.
|
#' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except when using grouping variables.
|
||||||
#' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.
|
#' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.
|
||||||
#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the amount of \code{NA}s.
|
#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the amount of \code{NA}s.
|
||||||
#' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)}
|
#' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)}
|
||||||
@ -59,8 +59,8 @@
|
|||||||
#' The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will include more than \code{n} rows if there are ties.
|
#' The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will include more than \code{n} rows if there are ties.
|
||||||
#' @importFrom stats fivenum sd mad
|
#' @importFrom stats fivenum sd mad
|
||||||
#' @importFrom grDevices boxplot.stats
|
#' @importFrom grDevices boxplot.stats
|
||||||
#' @importFrom dplyr %>% select pull n_distinct group_by arrange desc mutate summarise n_distinct tibble
|
#' @importFrom dplyr %>% arrange arrange_at desc funs group_by mutate mutate_at n_distinct pull select summarise tibble ungroup vars
|
||||||
#' @importFrom utils browseVignettes installed.packages
|
#' @importFrom utils browseVignettes
|
||||||
#' @importFrom hms is.hms
|
#' @importFrom hms is.hms
|
||||||
#' @importFrom crayon red green silver
|
#' @importFrom crayon red green silver
|
||||||
#' @keywords summary summarise frequency freq
|
#' @keywords summary summarise frequency freq
|
||||||
@ -77,7 +77,7 @@
|
|||||||
#' septic_patients$hospital_id %>% freq()
|
#' septic_patients$hospital_id %>% freq()
|
||||||
#' septic_patients[, "hospital_id"] %>% freq()
|
#' septic_patients[, "hospital_id"] %>% freq()
|
||||||
#' septic_patients %>% freq("hospital_id")
|
#' septic_patients %>% freq("hospital_id")
|
||||||
#' septic_patients %>% freq(hospital_id) #<- easiest to remember when you're used to tidyverse
|
#' septic_patients %>% freq(hospital_id) #<- easiest to remember (tidyverse)
|
||||||
#'
|
#'
|
||||||
#' # you could also use `select` or `pull` to get your variables
|
#' # you could also use `select` or `pull` to get your variables
|
||||||
#' septic_patients %>%
|
#' septic_patients %>%
|
||||||
@ -91,6 +91,11 @@
|
|||||||
#' filter(hospital_id == "A") %>%
|
#' filter(hospital_id == "A") %>%
|
||||||
#' freq(genus, species)
|
#' freq(genus, species)
|
||||||
#'
|
#'
|
||||||
|
#' # group a variable and analyse another
|
||||||
|
#' septic_patients %>%
|
||||||
|
#' group_by(hospital_id) %>%
|
||||||
|
#' freq(gender)
|
||||||
|
#'
|
||||||
#' # get top 10 bugs of hospital A as a vector
|
#' # get top 10 bugs of hospital A as a vector
|
||||||
#' septic_patients %>%
|
#' septic_patients %>%
|
||||||
#' filter(hospital_id == "A") %>%
|
#' filter(hospital_id == "A") %>%
|
||||||
@ -157,6 +162,8 @@ frequency_tbl <- function(x,
|
|||||||
sep = " ") {
|
sep = " ") {
|
||||||
|
|
||||||
mult.columns <- 0
|
mult.columns <- 0
|
||||||
|
x.group = character(0)
|
||||||
|
df <- NULL
|
||||||
|
|
||||||
x.name <- NULL
|
x.name <- NULL
|
||||||
cols <- NULL
|
cols <- NULL
|
||||||
@ -174,6 +181,12 @@ frequency_tbl <- function(x,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (any(class(x) == 'data.frame')) {
|
if (any(class(x) == 'data.frame')) {
|
||||||
|
x.group <- group_vars(x)
|
||||||
|
if (length(x.group) > 1) {
|
||||||
|
x.group <- x.group[1L]
|
||||||
|
warning("freq supports one grouping variable, only `", x.group, "` will be kept.", call. = FALSE)
|
||||||
|
}
|
||||||
|
|
||||||
if (is.null(x.name)) {
|
if (is.null(x.name)) {
|
||||||
x.name <- deparse(substitute(x))
|
x.name <- deparse(substitute(x))
|
||||||
}
|
}
|
||||||
@ -188,6 +201,27 @@ frequency_tbl <- function(x,
|
|||||||
if (!all(cols %in% colnames(x))) {
|
if (!all(cols %in% colnames(x))) {
|
||||||
stop("one or more columns not found: `", paste(cols, collapse = "`, `"), '`', call. = FALSE)
|
stop("one or more columns not found: `", paste(cols, collapse = "`, `"), '`', call. = FALSE)
|
||||||
}
|
}
|
||||||
|
if (length(x.group) > 0) {
|
||||||
|
x.group_cols <- c(x.group, cols)
|
||||||
|
df <- x %>%
|
||||||
|
group_by_at(vars(x.group_cols)) %>%
|
||||||
|
summarise(count = n())
|
||||||
|
if (!missing(sort.count)) {
|
||||||
|
if (sort.count == TRUE) {
|
||||||
|
df <- df %>% arrange_at(c(x.group, "count"), desc)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
df <- df %>%
|
||||||
|
mutate(cum_count = cumsum(count))
|
||||||
|
|
||||||
|
df.topleft <- df[1, 1]
|
||||||
|
df <- df %>%
|
||||||
|
ungroup() %>%
|
||||||
|
# do not repeat group labels
|
||||||
|
mutate_at(vars(x.group), funs(ifelse(lag(.) == ., "", .)))
|
||||||
|
df[1, 1] <- df.topleft
|
||||||
|
colnames(df)[1:2] <- c("group", "item")
|
||||||
|
}
|
||||||
if (length(cols) > 0) {
|
if (length(cols) > 0) {
|
||||||
x <- x[, cols]
|
x <- x[, cols]
|
||||||
}
|
}
|
||||||
@ -233,10 +267,9 @@ frequency_tbl <- function(x,
|
|||||||
class(x) <- x_class
|
class(x) <- x_class
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sort.count == FALSE & 'factor' %in% class(x)) {
|
# if (sort.count == FALSE & 'factor' %in% class(x)) {
|
||||||
# warning("Sorting a factor sorts on factor level, not necessarily alphabetically.", call. = FALSE)
|
# warning("Sorting a factor sorts on factor level, not necessarily alphabetically.", call. = FALSE)
|
||||||
}
|
# }
|
||||||
|
|
||||||
header_txt <- character(0)
|
header_txt <- character(0)
|
||||||
|
|
||||||
markdown_line <- ''
|
markdown_line <- ''
|
||||||
@ -352,21 +385,15 @@ frequency_tbl <- function(x,
|
|||||||
nmax <- length(x)
|
nmax <- length(x)
|
||||||
}
|
}
|
||||||
|
|
||||||
# create table with counts and percentages
|
|
||||||
column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent')
|
column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent')
|
||||||
column_names_df <- c('item', 'count', 'percent', 'cum_count', 'cum_percent')
|
column_names_df <- c('item', 'count', 'percent', 'cum_count', 'cum_percent')
|
||||||
|
column_align <- c(x_align, 'r', 'r', 'r', 'r')
|
||||||
|
|
||||||
|
if (is.null(df)) {
|
||||||
|
# create table with counts and percentages
|
||||||
df <- tibble(item = x) %>%
|
df <- tibble(item = x) %>%
|
||||||
group_by(item) %>%
|
group_by(item) %>%
|
||||||
summarise(count = n())
|
summarise(count = n())
|
||||||
column_align <- c(x_align, 'r', 'r', 'r', 'r')
|
|
||||||
|
|
||||||
if (df$item %>% paste(collapse = ',') %like% '\033') {
|
|
||||||
# remove escape char
|
|
||||||
# see https://en.wikipedia.org/wiki/Escape_character#ASCII_escape_character
|
|
||||||
df <- df %>% mutate(item = item %>% gsub('\033', ' ', ., fixed = TRUE))
|
|
||||||
}
|
|
||||||
|
|
||||||
# sort according to setting
|
# sort according to setting
|
||||||
if (sort.count == TRUE) {
|
if (sort.count == TRUE) {
|
||||||
@ -374,18 +401,36 @@ frequency_tbl <- function(x,
|
|||||||
} else {
|
} else {
|
||||||
df <- df %>% arrange(item)
|
df <- df %>% arrange(item)
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
column_names <- c("Group", column_names)
|
||||||
|
column_names_df <-c("group", column_names_df)
|
||||||
|
column_align <- c("l", column_align)
|
||||||
|
}
|
||||||
|
|
||||||
|
if (df$item %>% paste(collapse = ',') %like% '\033') {
|
||||||
|
# remove escape char
|
||||||
|
# see https://en.wikipedia.org/wiki/Escape_character#ASCII_escape_character
|
||||||
|
df <- df %>% mutate(item = item %>% gsub('\033', ' ', ., fixed = TRUE))
|
||||||
|
}
|
||||||
|
|
||||||
if (quote == TRUE) {
|
if (quote == TRUE) {
|
||||||
df$item <- paste0('"', df$item, '"')
|
df$item <- paste0('"', df$item, '"')
|
||||||
|
if (length(x.group) != 0) {
|
||||||
|
df$group <- paste0('"', df$group, '"')
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
df <- as.data.frame(df, stringsAsFactors = FALSE)
|
df <- as.data.frame(df, stringsAsFactors = FALSE)
|
||||||
|
|
||||||
df$percent <- df$count / base::sum(df$count, na.rm = TRUE)
|
df$percent <- df$count / base::sum(df$count, na.rm = TRUE)
|
||||||
|
if (length(x.group) == 0) {
|
||||||
df$cum_count <- base::cumsum(df$count)
|
df$cum_count <- base::cumsum(df$count)
|
||||||
|
}
|
||||||
df$cum_percent <- df$cum_count / base::sum(df$count, na.rm = TRUE)
|
df$cum_percent <- df$cum_count / base::sum(df$count, na.rm = TRUE)
|
||||||
|
if (length(x.group) != 0) {
|
||||||
colnames(df) <- column_names_df
|
# sort columns
|
||||||
|
df <- df[, column_names_df]
|
||||||
|
}
|
||||||
|
|
||||||
if (markdown == TRUE) {
|
if (markdown == TRUE) {
|
||||||
tbl_format <- 'markdown'
|
tbl_format <- 'markdown'
|
||||||
@ -394,14 +439,15 @@ frequency_tbl <- function(x,
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (!is.null(title)) {
|
if (!is.null(title)) {
|
||||||
x.name <- trimws(gsub("^Frequency table of", "", title[1L], ignore.case = TRUE))
|
title <- trimws(gsub("^Frequency table of", "", title[1L], ignore.case = TRUE))
|
||||||
cols <- NULL
|
|
||||||
}
|
}
|
||||||
|
|
||||||
structure(.Data = df,
|
structure(.Data = df,
|
||||||
class = c('frequency_tbl', class(df)),
|
class = c('frequency_tbl', class(df)),
|
||||||
opt = list(data = x.name,
|
opt = list(title = title,
|
||||||
|
data = x.name,
|
||||||
vars = cols,
|
vars = cols,
|
||||||
|
group_var = x.group,
|
||||||
header = header,
|
header = header,
|
||||||
header_txt = header_txt,
|
header_txt = header_txt,
|
||||||
row_names = row.names,
|
row_names = row.names,
|
||||||
@ -502,15 +548,26 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
|
|||||||
opt$vars <- NULL
|
opt$vars <- NULL
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (is.null(opt$title)) {
|
||||||
if (!is.null(opt$data) & !is.null(opt$vars)) {
|
if (!is.null(opt$data) & !is.null(opt$vars)) {
|
||||||
title <- paste0("of `", paste0(opt$vars, collapse = "` and `"), "` from ", opt$data)
|
title <- paste0("`", paste0(opt$vars, collapse = "` and `"), "` from ", opt$data)
|
||||||
} else if (!is.null(opt$data) & is.null(opt$vars)) {
|
} else if (!is.null(opt$data) & is.null(opt$vars)) {
|
||||||
title <- paste("of", opt$data)
|
title <- opt$data
|
||||||
} else if (is.null(opt$data) & !is.null(opt$vars)) {
|
} else if (is.null(opt$data) & !is.null(opt$vars)) {
|
||||||
title <- paste0("of `", paste0(opt$vars, collapse = "` and `"), "`")
|
title <- paste0("`", paste0(opt$vars, collapse = "` and `"), "`")
|
||||||
} else {
|
} else {
|
||||||
title <- ""
|
title <- ""
|
||||||
}
|
}
|
||||||
|
if (title != "" & length(opt$group_var) != 0) {
|
||||||
|
group_var <- paste0("(grouped by `", opt$group_var, "`)")
|
||||||
|
if (opt$tbl_format == "pandoc") {
|
||||||
|
group_var <- silver(group_var)
|
||||||
|
}
|
||||||
|
title <- paste(title, group_var)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
title <- opt$title
|
||||||
|
}
|
||||||
|
|
||||||
if (!missing(nmax)) {
|
if (!missing(nmax)) {
|
||||||
opt$nmax <- nmax
|
opt$nmax <- nmax
|
||||||
@ -525,7 +582,11 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
title <- paste("Frequency table", trimws(title))
|
if (trimws(title) == "") {
|
||||||
|
title <- "Frequency table"
|
||||||
|
} else {
|
||||||
|
title <- paste("Frequency table of", trimws(title))
|
||||||
|
}
|
||||||
|
|
||||||
# bold title
|
# bold title
|
||||||
if (opt$tbl_format == "pandoc") {
|
if (opt$tbl_format == "pandoc") {
|
||||||
|
11
man/freq.Rd
11
man/freq.Rd
@ -23,11 +23,11 @@ top_freq(f, n)
|
|||||||
default = 15), ...)
|
default = 15), ...)
|
||||||
}
|
}
|
||||||
\arguments{
|
\arguments{
|
||||||
\item{x}{vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} or \code{\link{table}}}
|
\item{x}{vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} (may contain a grouping variable) or \code{\link{table}}}
|
||||||
|
|
||||||
\item{...}{up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples}
|
\item{...}{up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples}
|
||||||
|
|
||||||
\item{sort.count}{sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.}
|
\item{sort.count}{sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except when using grouping variables.}
|
||||||
|
|
||||||
\item{nmax}{number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.}
|
\item{nmax}{number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.}
|
||||||
|
|
||||||
@ -93,7 +93,7 @@ freq(septic_patients[, "hospital_id"])
|
|||||||
septic_patients$hospital_id \%>\% freq()
|
septic_patients$hospital_id \%>\% freq()
|
||||||
septic_patients[, "hospital_id"] \%>\% freq()
|
septic_patients[, "hospital_id"] \%>\% freq()
|
||||||
septic_patients \%>\% freq("hospital_id")
|
septic_patients \%>\% freq("hospital_id")
|
||||||
septic_patients \%>\% freq(hospital_id) #<- easiest to remember when you're used to tidyverse
|
septic_patients \%>\% freq(hospital_id) #<- easiest to remember (tidyverse)
|
||||||
|
|
||||||
# you could also use `select` or `pull` to get your variables
|
# you could also use `select` or `pull` to get your variables
|
||||||
septic_patients \%>\%
|
septic_patients \%>\%
|
||||||
@ -107,6 +107,11 @@ septic_patients \%>\%
|
|||||||
filter(hospital_id == "A") \%>\%
|
filter(hospital_id == "A") \%>\%
|
||||||
freq(genus, species)
|
freq(genus, species)
|
||||||
|
|
||||||
|
# group a variable and analyse another
|
||||||
|
septic_patients \%>\%
|
||||||
|
group_by(hospital_id) \%>\%
|
||||||
|
freq(gender)
|
||||||
|
|
||||||
# get top 10 bugs of hospital A as a vector
|
# get top 10 bugs of hospital A as a vector
|
||||||
septic_patients \%>\%
|
septic_patients \%>\%
|
||||||
filter(hospital_id == "A") \%>\%
|
filter(hospital_id == "A") \%>\%
|
||||||
|
@ -61,6 +61,9 @@ test_that("frequency table works", {
|
|||||||
expect_output(septic_patients %>% select(1:9) %>% freq() %>% print())
|
expect_output(septic_patients %>% select(1:9) %>% freq() %>% print())
|
||||||
expect_output(print(freq(septic_patients$age), nmax = 20))
|
expect_output(print(freq(septic_patients$age), nmax = 20))
|
||||||
|
|
||||||
|
# grouping variable
|
||||||
|
expect_output(print(septic_patients %>% group_by(gender) %>% freq(hospital_id)))
|
||||||
|
|
||||||
# top 5
|
# top 5
|
||||||
expect_equal(
|
expect_equal(
|
||||||
septic_patients %>%
|
septic_patients %>%
|
||||||
|
@ -56,7 +56,7 @@ test_that("mo_property works", {
|
|||||||
# check vector with random values
|
# check vector with random values
|
||||||
library(dplyr)
|
library(dplyr)
|
||||||
df_sample <- AMR::microorganisms %>% sample_n(100)
|
df_sample <- AMR::microorganisms %>% sample_n(100)
|
||||||
expect_identical(df_sample %>% pull(mo) %>% mo_fullname(),
|
expect_identical(df_sample %>% pull(mo) %>% mo_fullname(language = "en"),
|
||||||
df_sample %>% pull(fullname))
|
df_sample %>% pull(fullname))
|
||||||
|
|
||||||
})
|
})
|
||||||
|
Loading…
Reference in New Issue
Block a user