mirror of
https://github.com/msberends/AMR.git
synced 2024-12-25 18:46:11 +01:00
na freq, removed factors
This commit is contained in:
parent
9c56257da1
commit
299c5bea43
@ -1,6 +1,6 @@
|
||||
Package: AMR
|
||||
Version: 0.4.0.9006
|
||||
Date: 2018-10-22
|
||||
Date: 2018-10-23
|
||||
Title: Antimicrobial Resistance Analysis
|
||||
Authors@R: c(
|
||||
person(
|
||||
|
2
NEWS.md
2
NEWS.md
@ -22,7 +22,9 @@
|
||||
* Check for `hms::is.hms` in frequency tables (`freq()`)
|
||||
* New parameter `header` for frequency tables to turn them off (default when `markdown = TRUE`)
|
||||
* Freq now prints in markdown at default in non-interactive sessions
|
||||
* Freq no longer add the factor level column and sorts factors on count again
|
||||
* Removed diacritics from all authors (columns `microorganisms$ref` and `microorganisms.old$ref`) to comply with CRAN policy to only allow ASCII characters
|
||||
* Freq gained `na` parameter, to choose with character to print for empty values
|
||||
* Fix for `mo_property` not working properly
|
||||
* Fix for `EUCAST_rules` where some Streptococci would become ceftazidime R in EUCAST rule 4.5
|
||||
* Support for class `difftime` in frequency tables
|
||||
|
126
R/freq.R
126
R/freq.R
@ -23,12 +23,13 @@
|
||||
#' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples
|
||||
#' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except for factors.
|
||||
#' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.
|
||||
#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header will always print the amount of \code{NA}s.
|
||||
#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the amount of \code{NA}s.
|
||||
#' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)}
|
||||
#' @param markdown a logical value indicating whether the frequency table should be printed in markdown format. This will print all rows and is default behaviour in non-interactive R sessions (like when knitting RMarkdown files).
|
||||
#' @param digits how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")})
|
||||
#' @param quote a logical value indicating whether or not strings should be printed with surrounding quotes
|
||||
#' @param header a logical value indicating whether an informative header should be printed
|
||||
#' @param na a character string to should be used to show empty (\code{NA}) values (only useful when \code{na.rm = FALSE})
|
||||
#' @param sep a character string to separate the terms when selecting multiple columns
|
||||
#' @param f a frequency table
|
||||
#' @param n number of top \emph{n} items to return, use -n for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties.
|
||||
@ -150,6 +151,7 @@ frequency_tbl <- function(x,
|
||||
digits = 2,
|
||||
quote = FALSE,
|
||||
header = !markdown,
|
||||
na = "<NA>",
|
||||
sep = " ") {
|
||||
|
||||
mult.columns <- 0
|
||||
@ -197,7 +199,7 @@ frequency_tbl <- function(x,
|
||||
# now this DF contains 3 columns: the 2 vars and a Freq column
|
||||
# paste the first 2 cols and repeat them Freq times:
|
||||
x <- rep(x = do.call(paste, c(x[colnames(x)[1:2]], sep = sep)),
|
||||
times = x$Freq)
|
||||
times = x$Freq)
|
||||
x.name <- "a `table` object"
|
||||
cols <- NULL
|
||||
#mult.columns <- 2
|
||||
@ -229,9 +231,8 @@ frequency_tbl <- function(x,
|
||||
class(x) <- x_class
|
||||
}
|
||||
|
||||
if (missing(sort.count) & 'factor' %in% class(x)) {
|
||||
# sort on factor level at default when x is a factor and sort.count is not set
|
||||
sort.count <- FALSE
|
||||
if (sort.count == FALSE & 'factor' %in% class(x)) {
|
||||
# warning("Sorting a factor sorts on factor level, not necessarily alphabetically.", call. = FALSE)
|
||||
}
|
||||
|
||||
header_txt <- character(0)
|
||||
@ -260,11 +261,11 @@ frequency_tbl <- function(x,
|
||||
}
|
||||
|
||||
header_txt <- header_txt %>% paste0(markdown_line, '\nLength: ', (NAs %>% length() + x %>% length()) %>% format(),
|
||||
' (of which NA: ', NAs %>% length() %>% format() %>% NAs_to_red(),
|
||||
' = ', (NAs %>% length() / (NAs %>% length() + x %>% length())) %>%
|
||||
percent(force_zero = TRUE, round = digits) %>%
|
||||
sub('NaN', '0', ., fixed = TRUE) %>%
|
||||
NAs_to_red(), ')')
|
||||
' (of which NA: ', NAs %>% length() %>% format() %>% NAs_to_red(),
|
||||
' = ', (NAs %>% length() / (NAs %>% length() + x %>% length())) %>%
|
||||
percent(force_zero = TRUE, round = digits) %>%
|
||||
sub('NaN', '0', ., fixed = TRUE) %>%
|
||||
NAs_to_red(), ')')
|
||||
header_txt <- header_txt %>% paste0(markdown_line, '\nUnique: ', x %>% n_distinct() %>% format())
|
||||
|
||||
if (NROW(x) > 0 & any(class(x) == "character")) {
|
||||
@ -287,11 +288,11 @@ frequency_tbl <- function(x,
|
||||
header_txt <- header_txt %>% paste0('\n')
|
||||
header_txt <- header_txt %>% paste(markdown_line, '\nMean: ', x %>% base::mean(na.rm = TRUE) %>% format(digits = digits))
|
||||
header_txt <- header_txt %>% paste0(markdown_line, '\nStd. dev.: ', x %>% stats::sd(na.rm = TRUE) %>% format(digits = digits),
|
||||
' (CV: ', x %>% cv(na.rm = TRUE) %>% format(digits = digits),
|
||||
', MAD: ', x %>% stats::mad(na.rm = TRUE) %>% format(digits = digits), ')')
|
||||
' (CV: ', x %>% cv(na.rm = TRUE) %>% format(digits = digits),
|
||||
', MAD: ', x %>% stats::mad(na.rm = TRUE) %>% format(digits = digits), ')')
|
||||
header_txt <- header_txt %>% paste0(markdown_line, '\nFive-Num: ', Tukey_five %>% format(digits = digits) %>% trimws() %>% paste(collapse = ' | '),
|
||||
' (IQR: ', (Tukey_five[4] - Tukey_five[2]) %>% format(digits = digits),
|
||||
', CQV: ', x %>% cqv(na.rm = TRUE) %>% format(digits = digits), ')')
|
||||
' (IQR: ', (Tukey_five[4] - Tukey_five[2]) %>% format(digits = digits),
|
||||
', CQV: ', x %>% cqv(na.rm = TRUE) %>% format(digits = digits), ')')
|
||||
outlier_length <- length(boxplot.stats(x)$out)
|
||||
header_txt <- header_txt %>% paste0(markdown_line, '\nOutliers: ', outlier_length)
|
||||
if (outlier_length > 0) {
|
||||
@ -300,14 +301,14 @@ frequency_tbl <- function(x,
|
||||
}
|
||||
if (NROW(x) > 0 & any(class(x) == "rsi")) {
|
||||
header_txt <- header_txt %>% paste0('\n')
|
||||
cnt_S <- sum(x == "S")
|
||||
cnt_I <- sum(x == "I")
|
||||
cnt_R <- sum(x == "R")
|
||||
cnt_S <- sum(x == "S", na.rm = TRUE)
|
||||
cnt_I <- sum(x == "I", na.rm = TRUE)
|
||||
cnt_R <- sum(x == "R", na.rm = TRUE)
|
||||
header_txt <- header_txt %>% paste(markdown_line, '\n%IR: ',
|
||||
((cnt_I + cnt_R) / sum(!is.na(x))) %>% percent(force_zero = TRUE, round = digits))
|
||||
((cnt_I + cnt_R) / sum(!is.na(x), na.rm = TRUE)) %>% percent(force_zero = TRUE, round = digits))
|
||||
header_txt <- header_txt %>% paste0(markdown_line, '\nRatio SIR: 1.0 : ',
|
||||
(cnt_I / cnt_S) %>% format(digits = 1, nsmall = 1), " : ",
|
||||
(cnt_R / cnt_S) %>% format(digits = 1, nsmall = 1))
|
||||
(cnt_I / cnt_S) %>% format(digits = 1, nsmall = 1), " : ",
|
||||
(cnt_R / cnt_S) %>% format(digits = 1, nsmall = 1))
|
||||
}
|
||||
|
||||
formatdates <- "%e %B %Y" # = d mmmm yyyy
|
||||
@ -327,15 +328,15 @@ frequency_tbl <- function(x,
|
||||
# hms
|
||||
header_txt <- header_txt %>% paste0(markdown_line, '\nEarliest: ', mindate %>% format(formatdates) %>% trimws())
|
||||
header_txt <- header_txt %>% paste0(markdown_line, '\nLatest: ', maxdate %>% format(formatdates) %>% trimws(),
|
||||
' (+', difftime(maxdate, mindate, units = 'mins') %>% as.double() %>% format(digits = digits), ' min.)')
|
||||
' (+', difftime(maxdate, mindate, units = 'mins') %>% as.double() %>% format(digits = digits), ' min.)')
|
||||
} else {
|
||||
# other date formats
|
||||
header_txt <- header_txt %>% paste0(markdown_line, '\nOldest: ', mindate %>% format(formatdates) %>% trimws())
|
||||
header_txt <- header_txt %>% paste0(markdown_line, '\nNewest: ', maxdate %>% format(formatdates) %>% trimws(),
|
||||
' (+', difftime(maxdate, mindate, units = 'auto') %>% as.double() %>% format(digits = digits), ')')
|
||||
' (+', difftime(maxdate, mindate, units = 'auto') %>% as.double() %>% format(digits = digits), ')')
|
||||
}
|
||||
header_txt <- header_txt %>% paste0(markdown_line, '\nMedian: ', mediandate %>% format(formatdates) %>% trimws(),
|
||||
' (~', percent(median_days / maxdate_days, round = 0), ')')
|
||||
' (~', percent(median_days / maxdate_days, round = 0), ')')
|
||||
}
|
||||
if (any(class(x) == 'POSIXlt')) {
|
||||
x <- x %>% format(formatdates)
|
||||
@ -354,23 +355,14 @@ frequency_tbl <- function(x,
|
||||
}
|
||||
|
||||
# create table with counts and percentages
|
||||
column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent', '(Factor Level)')
|
||||
column_names_df <- c('item', 'count', 'percent', 'cum_count', 'cum_percent', 'factor_level')
|
||||
column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent')
|
||||
column_names_df <- c('item', 'count', 'percent', 'cum_count', 'cum_percent')
|
||||
|
||||
if (any(class(x) == 'factor')) {
|
||||
df <- tibble(item = x,
|
||||
fctlvl = x %>% as.integer()) %>%
|
||||
group_by(item, fctlvl)
|
||||
column_align <- c('l', 'r', 'r', 'r', 'r', 'r')
|
||||
} else {
|
||||
df <- tibble(item = x) %>%
|
||||
group_by(item)
|
||||
# strip factor lvl from col names
|
||||
column_names <- column_names[1:length(column_names) - 1]
|
||||
column_names_df <- column_names_df[1:length(column_names_df) - 1]
|
||||
column_align <- c(x_align, 'r', 'r', 'r', 'r')
|
||||
}
|
||||
df <- df %>% summarise(count = n())
|
||||
|
||||
df <- tibble(item = x) %>%
|
||||
group_by(item) %>%
|
||||
summarise(count = n())
|
||||
column_align <- c(x_align, 'r', 'r', 'r', 'r')
|
||||
|
||||
if (df$item %>% paste(collapse = ',') %like% '\033') {
|
||||
# remove escape char
|
||||
@ -382,11 +374,7 @@ frequency_tbl <- function(x,
|
||||
if (sort.count == TRUE) {
|
||||
df <- df %>% arrange(desc(count), item)
|
||||
} else {
|
||||
if (any(class(x) == 'factor')) {
|
||||
df <- df %>% arrange(fctlvl, item)
|
||||
} else {
|
||||
df <- df %>% arrange(item)
|
||||
}
|
||||
df <- df %>% arrange(item)
|
||||
}
|
||||
|
||||
if (quote == TRUE) {
|
||||
@ -399,34 +387,27 @@ frequency_tbl <- function(x,
|
||||
df$cum_count <- base::cumsum(df$count)
|
||||
df$cum_percent <- df$cum_count / base::sum(df$count, na.rm = TRUE)
|
||||
|
||||
if (any(class(x) == 'factor')) {
|
||||
# put factor last
|
||||
df <- df %>% select(item, count, percent, cum_count, cum_percent, fctlvl)
|
||||
}
|
||||
|
||||
colnames(df) <- column_names_df
|
||||
|
||||
class(df) <- c('frequency_tbl', class(df))
|
||||
attr(df, 'package') <- 'AMR'
|
||||
|
||||
if (markdown == TRUE) {
|
||||
tbl_format <- 'markdown'
|
||||
} else {
|
||||
tbl_format <- 'pandoc'
|
||||
}
|
||||
|
||||
attr(df, 'opt') <- list(data = x.name,
|
||||
vars = cols,
|
||||
header = header,
|
||||
header_txt = header_txt,
|
||||
row_names = row.names,
|
||||
column_names = column_names,
|
||||
column_align = column_align,
|
||||
tbl_format = tbl_format,
|
||||
nmax = nmax,
|
||||
nmax.set = nmax.set)
|
||||
|
||||
df
|
||||
structure(.Data = df,
|
||||
class = c('frequency_tbl', class(df)),
|
||||
opt = list(data = x.name,
|
||||
vars = cols,
|
||||
header = header,
|
||||
header_txt = header_txt,
|
||||
row_names = row.names,
|
||||
column_names = column_names,
|
||||
column_align = column_align,
|
||||
tbl_format = tbl_format,
|
||||
na = na,
|
||||
nmax = nmax,
|
||||
nmax.set = nmax.set))
|
||||
}
|
||||
|
||||
#' @rdname freq
|
||||
@ -547,7 +528,7 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
|
||||
if (opt$tbl_format == "pandoc") {
|
||||
title <- bold(title)
|
||||
} else if (opt$tbl_format == "markdown") {
|
||||
title <- paste0("**", title, "**")
|
||||
title <- paste0("**", title, "**")
|
||||
}
|
||||
|
||||
if (opt$header == TRUE) {
|
||||
@ -571,7 +552,10 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
|
||||
|
||||
# save old NA setting for kable
|
||||
opt.old <- options()$knitr.kable.NA
|
||||
options(knitr.kable.NA = "<NA>")
|
||||
if (is.null(opt$na)) {
|
||||
opt$na <- "<NA>"
|
||||
}
|
||||
options(knitr.kable.NA = opt$na)
|
||||
|
||||
if (nrow(x) > opt$nmax & opt$tbl_format != "markdown") {
|
||||
|
||||
@ -615,6 +599,10 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
|
||||
x$cum_count <- format(x$cum_count)
|
||||
x$cum_percent <- percent(x$cum_percent, force_zero = TRUE)
|
||||
|
||||
if (opt$tbl_format == "markdown") {
|
||||
cat("\n\n")
|
||||
}
|
||||
|
||||
print(
|
||||
knitr::kable(x,
|
||||
format = opt$tbl_format,
|
||||
@ -628,7 +616,11 @@ print.frequency_tbl <- function(x, nmax = getOption("max.print.freq", default =
|
||||
cat(footer)
|
||||
}
|
||||
|
||||
cat('\n')
|
||||
if (opt$tbl_format == "markdown") {
|
||||
cat("\n\n")
|
||||
} else {
|
||||
cat('\n')
|
||||
}
|
||||
|
||||
# reset old kable setting
|
||||
options(knitr.kable.NA = opt.old)
|
||||
|
61
README.md
61
README.md
@ -361,69 +361,36 @@ freq(mydata$myvariable)
|
||||
mydata %>% freq(myvariable)
|
||||
```
|
||||
|
||||
Factors sort on item by default:
|
||||
Frequency are of course sorted by count at default:
|
||||
```r
|
||||
septic_patients %>% freq(hospital_id)
|
||||
# Frequency table of `hospital_id`
|
||||
# Class: factor (numeric)
|
||||
# Length: 2000 (of which NA: 0 = 0.00%)
|
||||
# Unique: 4
|
||||
#
|
||||
# Item Count Percent Cum. Count Cum. Percent (Factor Level)
|
||||
# --- ----- ------ -------- ----------- ------------- ---------------
|
||||
# 1 A 321 16.1% 321 16.1% 1
|
||||
# 2 B 663 33.1% 984 49.2% 2
|
||||
# 3 C 254 12.7% 1238 61.9% 3
|
||||
# 4 D 762 38.1% 2000 100.0% 4
|
||||
# Item Count Percent Cum. Count Cum. Percent
|
||||
# --- ----- ------ -------- ----------- -------------
|
||||
# 1 D 762 38.1% 762 38.1%
|
||||
# 2 B 663 33.1% 1425 71.2%
|
||||
# 3 A 321 16.1% 1746 87.3%
|
||||
# 4 C 254 12.7% 2000 100.0%
|
||||
```
|
||||
|
||||
This can be changed with the `sort.count` parameter:
|
||||
```r
|
||||
septic_patients %>% freq(hospital_id, sort.count = TRUE)
|
||||
# Frequency table of `hospital_id`
|
||||
septic_patients %>% freq(hospital_id, sort.count = FALSE)
|
||||
# Class: factor (numeric)
|
||||
# Length: 2000 (of which NA: 0 = 0.00%)
|
||||
# Unique: 4
|
||||
#
|
||||
# Item Count Percent Cum. Count Cum. Percent (Factor Level)
|
||||
# --- ----- ------ -------- ----------- ------------- ---------------
|
||||
# 1 D 762 38.1% 762 38.1% 4
|
||||
# 2 B 663 33.1% 1425 71.2% 2
|
||||
# 3 A 321 16.1% 1746 87.3% 1
|
||||
# 4 C 254 12.7% 2000 100.0% 3
|
||||
# Item Count Percent Cum. Count Cum. Percent
|
||||
# --- ----- ------ -------- ----------- -------------
|
||||
# 1 A 321 16.1% 321 16.1%
|
||||
# 2 B 663 33.1% 984 49.2%
|
||||
# 3 C 254 12.7% 1238 61.9%
|
||||
# 4 D 762 38.1% 2000 100.0%
|
||||
```
|
||||
|
||||
All other types, like numbers, characters and dates, sort on count by default:
|
||||
```r
|
||||
septic_patients %>% freq(date)
|
||||
# Frequency table of `date`
|
||||
# Class: Date
|
||||
# Length: 2000 (of which NA: 0 = 0.0%)
|
||||
# Unique: 1151
|
||||
#
|
||||
# Oldest: 2 January 2002
|
||||
# Newest: 28 December 2017 (+5839)
|
||||
# Median: 31 July 2009 (~47%)
|
||||
#
|
||||
# Item Count Percent Cum. Count Cum. Percent
|
||||
# --- ----------- ------ -------- ----------- -------------
|
||||
# 1 2016-05-21 10 0.5% 10 0.5%
|
||||
# 2 2004-11-15 8 0.4% 18 0.9%
|
||||
# 3 2013-07-29 8 0.4% 26 1.3%
|
||||
# 4 2017-06-12 8 0.4% 34 1.7%
|
||||
# 5 2015-11-19 7 0.4% 41 2.1%
|
||||
# 6 2005-12-22 6 0.3% 47 2.4%
|
||||
# 7 2015-10-12 6 0.3% 53 2.6%
|
||||
# 8 2002-02-27 5 0.2% 58 2.9%
|
||||
# 9 2003-10-20 5 0.2% 63 3.1%
|
||||
# 10 2004-02-02 5 0.2% 68 3.4%
|
||||
# 11 2004-02-18 5 0.2% 73 3.6%
|
||||
# 12 2004-06-22 5 0.2% 78 3.9%
|
||||
# 13 2004-12-01 5 0.2% 83 4.2%
|
||||
# 14 2005-08-16 5 0.2% 88 4.4%
|
||||
# 15 2005-09-01 5 0.2% 93 4.7%
|
||||
# [ reached getOption("max.print.freq") -- omitted 1136 entries, n = 1907 (95.3%) ]
|
||||
```
|
||||
For numeric values, some extra descriptive statistics will be calculated:
|
||||
```r
|
||||
freq(runif(n = 10, min = 1, max = 5))
|
||||
|
Loading…
Reference in New Issue
Block a user