From 3527894b49bc7fc7cf8037cdafe20ef2d946ddb4 Mon Sep 17 00:00:00 2001 From: "Matthijs S. Berends" Date: Sun, 1 Jul 2018 21:40:37 +0200 Subject: [PATCH] new g.test() and edited freq() --- DESCRIPTION | 5 +- NAMESPACE | 7 + NEWS.md | 11 +- R/freq.R | 395 ++++++++++++++++++++--------------- R/g.test.R | 264 +++++++++++++++++++++++ R/globals.R | 9 +- R/p.symbol.R | 57 +++++ R/print.R | 7 - man/freq.Rd | 55 ++--- man/g.test.Rd | 108 ++++++++++ man/p.symbol.Rd | 19 ++ man/print.Rd | 14 +- man/vector2ratio.Rd | 64 ++++++ tests/testthat/test-freq.R | 31 ++- tests/testthat/test-g.test.R | 19 ++ tests/testthat/test-mdro.R | 2 +- vignettes/freq.R | 61 +++--- vignettes/freq.Rmd | 61 +++--- 18 files changed, 893 insertions(+), 296 deletions(-) create mode 100644 R/g.test.R create mode 100644 R/p.symbol.R create mode 100644 man/g.test.Rd create mode 100644 man/p.symbol.Rd create mode 100644 man/vector2ratio.Rd create mode 100644 tests/testthat/test-g.test.R diff --git a/DESCRIPTION b/DESCRIPTION index caa918db..7a5463c0 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 0.2.0.9006 -Date: 2018-06-29 +Version: 0.2.0.9007 +Date: 2018-07-01 Title: Antimicrobial Resistance Analysis Authors@R: c( person( @@ -36,6 +36,7 @@ Imports: xml2 (>= 1.0.0), knitr (>= 1.0.0), readr, + rlang, rvest (>= 0.3.2), tibble Suggests: diff --git a/NAMESPACE b/NAMESPACE index e739d96c..7d23858c 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -34,6 +34,7 @@ export(first_isolate) export(freq) export(frequency_tbl) export(full_join_microorganisms) +export(g.test) export(guess_atc) export(guess_bactid) export(inner_join_microorganisms) @@ -44,12 +45,14 @@ export(key_antibiotics) export(left_join_microorganisms) export(mo_property) export(n_rsi) +export(p.symbol) export(right_join_microorganisms) export(rsi) export(rsi_df) export(rsi_predict) export(semi_join_microorganisms) export(top_freq) +export(vector2ratio) exportMethods(as.double.mic) exportMethods(as.integer.mic) exportMethods(as.numeric.mic) @@ -103,16 +106,20 @@ importFrom(graphics,axis) importFrom(graphics,barplot) importFrom(graphics,plot) importFrom(graphics,text) +importFrom(knitr,kable) importFrom(readr,locale) importFrom(readr,parse_guess) importFrom(reshape2,dcast) +importFrom(rlang,ensyms) importFrom(rvest,html_children) importFrom(rvest,html_node) importFrom(rvest,html_nodes) importFrom(rvest,html_table) importFrom(stats,fivenum) +importFrom(stats,pchisq) importFrom(stats,quantile) importFrom(stats,sd) +importFrom(tibble,tibble) importFrom(utils,browseVignettes) importFrom(utils,object.size) importFrom(utils,packageDescription) diff --git a/NEWS.md b/NEWS.md index 9e1c176e..094e92e2 100755 --- a/NEWS.md +++ b/NEWS.md @@ -4,10 +4,17 @@ * Vignette about frequency tables * Possibility to globally set the default for the amount of items to print in frequency tables (`freq` function), with `options(max.print.freq = n)` * Functions `clipboard_import` and `clipboard_export` as helper functions to quickly copy and paste from/to software like Excel and SPSS +* Function `g.test` to perform the Χ2 distributed [*G*-test](https://en.wikipedia.org/wiki/G-test) +* Function `p.symbol` to transform p value to their related symbol: `0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1` +* Function `vector2ratio` to transform a vector of values to a preset ratio. For example: +```r +vector2ratio(c(772, 1611, 737), ratio = "1:2:1") +# [1] 780 1560 780 +``` #### Changed -* Renamed `toConsole` parameter of `freq` function to `as.data.frame` -* Added pretty printing for frequency tables when returned as `data.frame` +* Frequency tables (function `freq`) now supports quasiquotation: `freq(mydata, mycolumn)`, or `mydata %>% freq(mycolumn)` +* Frequency tables are now actual `data.frame`s with altered console printing to make it look like a frequency table. Because of this, the parameter `toConsole` is not longer needed. * Small translational improvements to the `septic_patients` dataset * Combined MIC/RSI values will now be coerced by the `rsi` and `mic` functions: * `as.rsi("<=0.002; S")` will return `S` diff --git a/R/freq.R b/R/freq.R index afcbbcc3..0edcdadf 100755 --- a/R/freq.R +++ b/R/freq.R @@ -18,17 +18,17 @@ #' Frequency table #' -#' Create a frequency table of a vector of data, a single column or a maximum of 9 columns of a data frame. Supports markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names. -#' @param x data -#' @param sort.count sort on count. Use \code{FALSE} to sort alphabetically on item. -#' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0} or \code{nmax = NA} to print all rows. +#' Create a frequency table of a vector with items or a data frame. Supports quasiquotation and markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names. +#' @param x vector with items, or \code{data.frame} +#' @param ... up to nine different columns of \code{x} to calculate frequencies from, see Examples +#' @param sort.count sort on count, i.e. frequencies. Use \code{FALSE} to sort alphabetically on item. +#' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = NULL} or \code{nmax = NA} to print all rows. #' @param na.rm a logical value indicating whether NA values should be removed from the frequency table. The header will always print the amount of \code{NA}s. #' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)} #' @param markdown print table in markdown format (this forces \code{nmax = NA}) -#' @param as.data.frame return frequency table without header as a \code{data.frame} (e.g. to assign the table to an object) -#' @param digits how many significant digits are to be used for numeric values (not for the items themselves, that depends on \code{\link{getOption}("digits")}) +#' @param digits how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")}) #' @param sep a character string to separate the terms when selecting multiple columns -#' @param f a frequency table as \code{data.frame}, used as \code{freq(..., as.data.frame = TRUE)} +#' @param f a frequency table #' @param n number of top \emph{n} items to return, use -n for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties. #' @details This package also has a vignette available about this function, run: \code{browseVignettes("AMR")} to read it. #' @@ -54,51 +54,82 @@ #' @importFrom grDevices boxplot.stats #' @importFrom dplyr %>% select pull n_distinct group_by arrange desc mutate summarise #' @importFrom utils browseVignettes +#' @importFrom tibble tibble +#' @importFrom rlang ensyms #' @keywords summary summarise frequency freq #' @rdname freq -#' @return \itemize{ -#' \item{When using \code{as.data.frame = FALSE} (default): only printed text} -#' \item{When using \code{as.data.frame = TRUE}: a \code{data.frame} object with an additional class \code{"frequency_tbl"}} -#' } +#' @name freq +#' @return A \code{data.frame} with an additional class \code{"frequency_tbl"} #' @export #' @examples #' library(dplyr) #' +#' # this all gives the same result: #' freq(septic_patients$hospital_id) +#' freq(septic_patients[, "hospital_id"]) +#' septic_patients$hospital_id %>% freq() +#' septic_patients[, "hospital_id"] %>% freq() +#' septic_patients %>% freq("hospital_id") +#' septic_patients %>% freq(hospital_id) # <- easiest to remember when used to tidyverse #' +#' # you could use `select`... #' septic_patients %>% #' filter(hospital_id == "A") %>% #' select(bactid) %>% #' freq() #' +#' # ... or you use `freq` to select it immediately +#' septic_patients %>% +#' filter(hospital_id == "A") %>% +#' freq(bactid) +#' #' # select multiple columns; they will be pasted together #' septic_patients %>% #' left_join_microorganisms %>% #' filter(hospital_id == "A") %>% -#' select(genus, species) %>% -#' freq() +#' freq(genus, species) #' #' # save frequency table to an object #' years <- septic_patients %>% #' mutate(year = format(date, "%Y")) %>% -#' select(year) %>% -#' freq(as.data.frame = TRUE) +#' freq(year) +#' years %>% pull(item) #' #' # get top 10 bugs of hospital A as a vector #' septic_patients %>% #' filter(hospital_id == "A") %>% -#' select(bactid) %>% -#' freq(as.data.frame = TRUE) %>% +#' freq(bactid) %>% #' top_freq(10) -freq <- function(x, - sort.count = TRUE, - nmax = getOption("max.print.freq"), - na.rm = TRUE, - row.names = TRUE, - markdown = FALSE, - as.data.frame = FALSE, - digits = 2, - sep = " ") { +frequency_tbl <- function(x, + ..., + sort.count = TRUE, + nmax = getOption("max.print.freq"), + na.rm = TRUE, + row.names = TRUE, + markdown = FALSE, + digits = 2, + sep = " ") { + + if (any(class(x) == 'data.frame')) { + x.name <- deparse(substitute(x)) + if (x.name == ".") { + x.name <- NULL + } + dots <- rlang::ensyms(...) + ndots <- length(dots) + + if (ndots > 0 & ndots < 10) { + cols <- as.character(dots) + x <- x[, cols] + } else if (ndots >= 10) { + stop('A maximum of 9 columns can be analysed at the same time.', call. = FALSE) + } else { + cols <- NULL + } + } else { + x.name <- NULL + cols <- NULL + } mult.columns <- 0 @@ -117,64 +148,64 @@ freq <- function(x, colnames(x) <- LETTERS[1:ncol(x)] if (ncol(x) == 2) { x$total <- paste(x$A %>% as.character(), - x$B %>% as.character(), - sep = sep) + x$B %>% as.character(), + sep = sep) } else if (ncol(x) == 3) { x$total <- paste(x$A %>% as.character(), - x$B %>% as.character(), - x$C %>% as.character(), - sep = sep) + x$B %>% as.character(), + x$C %>% as.character(), + sep = sep) } else if (ncol(x) == 4) { x$total <- paste(x$A %>% as.character(), - x$B %>% as.character(), - x$C %>% as.character(), - x$D %>% as.character(), - sep = sep) + x$B %>% as.character(), + x$C %>% as.character(), + x$D %>% as.character(), + sep = sep) } else if (ncol(x) == 5) { x$total <- paste(x$A %>% as.character(), - x$B %>% as.character(), - x$C %>% as.character(), - x$D %>% as.character(), - x$E %>% as.character(), - sep = sep) + x$B %>% as.character(), + x$C %>% as.character(), + x$D %>% as.character(), + x$E %>% as.character(), + sep = sep) } else if (ncol(x) == 6) { x$total <- paste(x$A %>% as.character(), - x$B %>% as.character(), - x$C %>% as.character(), - x$D %>% as.character(), - x$E %>% as.character(), - x$F %>% as.character(), - sep = sep) + x$B %>% as.character(), + x$C %>% as.character(), + x$D %>% as.character(), + x$E %>% as.character(), + x$F %>% as.character(), + sep = sep) } else if (ncol(x) == 7) { x$total <- paste(x$A %>% as.character(), - x$B %>% as.character(), - x$C %>% as.character(), - x$D %>% as.character(), - x$E %>% as.character(), - x$F %>% as.character(), - x$G %>% as.character(), - sep = sep) + x$B %>% as.character(), + x$C %>% as.character(), + x$D %>% as.character(), + x$E %>% as.character(), + x$F %>% as.character(), + x$G %>% as.character(), + sep = sep) } else if (ncol(x) == 8) { x$total <- paste(x$A %>% as.character(), - x$B %>% as.character(), - x$C %>% as.character(), - x$D %>% as.character(), - x$E %>% as.character(), - x$F %>% as.character(), - x$G %>% as.character(), - x$H %>% as.character(), - sep = sep) + x$B %>% as.character(), + x$C %>% as.character(), + x$D %>% as.character(), + x$E %>% as.character(), + x$F %>% as.character(), + x$G %>% as.character(), + x$H %>% as.character(), + sep = sep) } else if (ncol(x) == 9) { x$total <- paste(x$A %>% as.character(), - x$B %>% as.character(), - x$C %>% as.character(), - x$D %>% as.character(), - x$E %>% as.character(), - x$F %>% as.character(), - x$G %>% as.character(), - x$H %>% as.character(), - x$I %>% as.character(), - sep = sep) + x$B %>% as.character(), + x$C %>% as.character(), + x$D %>% as.character(), + x$E %>% as.character(), + x$F %>% as.character(), + x$G %>% as.character(), + x$H %>% as.character(), + x$I %>% as.character(), + sep = sep) } x <- x$total @@ -183,9 +214,6 @@ freq <- function(x, stop('A maximum of 9 columns can be analysed at the same time.', call. = FALSE) } } - if (markdown == TRUE & as.data.frame == TRUE) { - warning('`as.data.frame = TRUE` will be ignored when `markdown = TRUE`.') - } if (mult.columns > 1) { NAs <- x[is.na(x) | x == trimws(strrep('NA ', mult.columns))] @@ -264,22 +292,12 @@ freq <- function(x, x <- x %>% format(formatdates) } - if (as.data.frame == FALSE) { - cat(header) - } - - if (all(is.na(x))) { - cat('\n\nNo observations.\n') - return(invisible()) - } - if (n_distinct(x) == length(x)) { - warning('All observations are unique.', call. = FALSE) - } - nmax.set <- !missing(nmax) - if (is.null(nmax) & is.null(base::getOption("max.print.freq", default = NULL))) { + if (!nmax.set & is.null(nmax) & is.null(base::getOption("max.print.freq", default = NULL))) { # default for max print setting nmax <- 15 + } else if (is.null(nmax)) { + nmax <- length(x) } if (nmax == 0 | is.na(nmax) | is.null(nmax)) { @@ -290,26 +308,25 @@ freq <- function(x, # create table with counts and percentages column_names <- c('Item', 'Count', 'Percent', 'Cum. Count', 'Cum. Percent', '(Factor Level)') column_names_df <- c('item', 'count', 'percent', 'cum_count', 'cum_percent', 'factor_level') + if (any(class(x) == 'factor')) { - df <- tibble::tibble(Item = x, - Fctlvl = x %>% as.integer()) %>% - group_by(Item, Fctlvl) + df <- tibble::tibble(item = x, + fctlvl = x %>% as.integer()) %>% + group_by(item, fctlvl) column_align <- c('l', 'r', 'r', 'r', 'r', 'r') } else { - df <- tibble::tibble(Item = x) %>% - group_by(Item) + df <- tibble::tibble(item = x) %>% + group_by(item) # strip factor lvl from col names column_names <- column_names[1:length(column_names) - 1] column_names_df <- column_names_df[1:length(column_names_df) - 1] column_align <- c(x_align, 'r', 'r', 'r', 'r') } - df <- df %>% - summarise(Count = n(), - Percent = (n() / length(x)) %>% percent(force_zero = TRUE)) + df <- df %>% summarise(count = n()) - if (df$Item %>% paste(collapse = ',') %like% '\033') { + if (df$item %>% paste(collapse = ',') %like% '\033') { df <- df %>% - mutate(Item = Item %>% + mutate(item = item %>% # remove escape char # see https://en.wikipedia.org/wiki/Escape_character#ASCII_escape_character gsub('\033', ' ', ., fixed = TRUE)) @@ -317,95 +334,54 @@ freq <- function(x, # sort according to setting if (sort.count == TRUE) { - df <- df %>% arrange(desc(Count), Item) + df <- df %>% arrange(desc(count), item) } else { if (any(class(x) == 'factor')) { - df <- df %>% arrange(Fctlvl, Item) + df <- df %>% arrange(fctlvl, item) } else { - df <- df %>% arrange(Item) + df <- df %>% arrange(item) } } - # add cumulative values - df$Cum <- cumsum(df$Count) - df$CumTot <- (df$Cum / sum(df$Count, na.rm = TRUE)) %>% percent(force_zero = TRUE) - df$Cum <- df$Cum %>% format() + df <- as.data.frame(df, stringsAsFactors = FALSE) + + df$percent <- df$count / base::sum(df$count, na.rm = TRUE) + df$cum_count <- base::cumsum(df$count) + df$cum_percent <- df$cum_count / base::sum(df$count, na.rm = TRUE) if (any(class(x) == 'factor')) { # put factor last - df <- df %>% select(Item, Count, Percent, Cum, CumTot, Fctlvl) + df <- df %>% select(item, count, percent, cum_count, cum_percent, fctlvl) } - if (as.data.frame == TRUE) { - # assign to object - df[, 3] <- df[, 2] / sum(df[, 2], na.rm = TRUE) - df[, 4] <- cumsum(df[, 2]) - df[, 5] <- df[, 4] / sum(df[, 2], na.rm = TRUE) - colnames(df) <- column_names_df - df <- as.data.frame(df, stringsAsFactors = FALSE) - class(df) <- c('frequency_tbl', class(df)) - return(df) - } + colnames(df) <- column_names_df + + class(df) <- c('frequency_tbl', class(df)) + attr(df, 'package') <- 'AMR' + attr(df, 'package.version') <- packageDescription('AMR')$Version if (markdown == TRUE) { - tblformat <- 'markdown' + tbl_format <- 'markdown' } else { - tblformat <- 'pandoc' + tbl_format <- 'pandoc' } - # save old NA setting for kable - opt.old <- options()$knitr.kable.NA - options(knitr.kable.NA = "") + attr(df, 'opt') <- list(data = x.name, + vars = cols, + header = header, + row_names = row.names, + column_names = column_names, + column_align = column_align, + tbl_format = tbl_format, + nmax = nmax, + nmax.set = nmax.set) - Count.rest <- sum(df[nmax.1:nrow(df), 'Count'], na.rm = TRUE) - if (any(class(x) %in% c('double', 'integer', 'numeric', 'raw', 'single'))) { - df <- df %>% mutate(Item = format(Item)) - } - df <- df %>% mutate(Count = format(Count)) - - if (nrow(df) > nmax.1 & markdown == FALSE) { - df2 <- df[1:nmax,] - print( - knitr::kable(df2, - format = tblformat, - row.names = row.names, - col.names = column_names, - align = column_align, - padding = 1) - ) - if (nmax.set == TRUE) { - cat('[ reached `nmax = ', nmax, '`', sep = '') - } else { - cat('[ reached getOption("max.print.freq")') - } - cat(' -- omitted ', - format(nrow(df) - nmax), - ' entries, n = ', - format(Count.rest), - ' (', - (Count.rest / length(x)) %>% percent(force_zero = TRUE), - ') ]\n', sep = '') - - } else { - print( - knitr::kable(df, - format = tblformat, - row.names = row.names, - col.names = column_names, - align = column_align, - padding = 1) - ) - } - cat('\n') - - # reset old kable setting - options(knitr.kable.NA = opt.old) - return(invisible()) + df } #' @rdname freq #' @export -frequency_tbl <- freq +freq <- frequency_tbl #' @rdname freq #' @export @@ -426,4 +402,95 @@ top_freq <- function(f, n) { vect } +#' @rdname print +#' @exportMethod print.frequency_tbl +#' @importFrom knitr kable +#' @importFrom dplyr n_distinct +#' @export +print.frequency_tbl <- function(x, ...) { + + opt <- attr(x, 'opt') + + if (!is.null(opt$data) & !is.null(opt$vars)) { + title <- paste0("of `", paste0(opt$vars, collapse = "` and `"), "` from ", opt$data) + } else if (!is.null(opt$data) & is.null(opt$vars)) { + title <- paste("of", opt$data) + } else if (is.null(opt$data) & !is.null(opt$vars)) { + title <- paste0("of `", paste0(opt$vars, collapse = "` and `"), "`") + } else { + title <- "" + } + + cat("Frequency table", title, "\n\n") + + if (!is.null(opt$header)) { + cat(opt$header) + } + + if (NROW(x) == 0) { + cat('\n\nNo observations.\n') + return(invisible()) + } + + if (all(x$count == 1)) { + warning('All observations are unique.', call. = FALSE) + } + + # save old NA setting for kable + opt.old <- options()$knitr.kable.NA + options(knitr.kable.NA = "") + + if (nrow(x) > opt$nmax & opt$tbl_format != "markdown") { + + x.rows <- nrow(x) + x.unprinted <- base::sum(x[(opt$nmax + 1):nrow(x), 'count'], na.rm = TRUE) + x.printed <- base::sum(x$count) - x.unprinted + + x <- x[1:opt$nmax,] + + if (opt$nmax.set == TRUE) { + footer <- paste('[ reached `nmax = ', opt$nmax, '`', sep = '') + } else { + footer <- '[ reached getOption("max.print.freq")' + } + footer <- paste(footer, + ' -- omitted ', + format(x.rows - opt$nmax), + ' entries, n = ', + format(x.unprinted), + ' (', + (x.unprinted / (x.unprinted + x.printed)) %>% percent(force_zero = TRUE), + ') ]\n', sep = '') + } else { + footer <- NULL + } + + if (any(class(x$item) %in% c('double', 'integer', 'numeric', 'raw', 'single'))) { + x$item <- format(x$item) + } + x$count <- format(x$count) + x$percent <- percent(x$percent, force_zero = TRUE) + x$cum_count <- format(x$cum_count) + x$cum_percent <- percent(x$cum_percent, force_zero = TRUE) + + print( + knitr::kable(x, + format = opt$tbl_format, + row.names = opt$row_names, + col.names = opt$column_names, + align = opt$column_align, + padding = 1) + ) + + if (!is.null(footer)) { + cat(footer) + } + + cat('\n') + + # reset old kable setting + options(knitr.kable.NA = opt.old) + return(invisible()) + +} diff --git a/R/g.test.R b/R/g.test.R new file mode 100644 index 00000000..5932e8c0 --- /dev/null +++ b/R/g.test.R @@ -0,0 +1,264 @@ +# ==================================================================== # +# TITLE # +# Antimicrobial Resistance (AMR) Analysis # +# # +# AUTHORS # +# Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) # +# # +# LICENCE # +# This program is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License version 2.0, # +# as published by the Free Software Foundation. # +# # +# This program is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# ==================================================================== # + +#' \emph{G}-test of matrix or vector +#' +#' A \emph{G}-test can be used to see whether the number of observations in each category fits a theoretical expectation (called a \strong{\emph{G}-test of goodness-of-fit}), or to see whether the proportions of one variable are different for different values of the other variable (called a \strong{\emph{G}-test of independence}). +#' @param x numeric vector or matrix +#' @param y expected value of \code{x}. Leave empty to determine automatically. This can also be ratios of \code{x}, e.g. calculated with \code{\link{vector2ratio}}. +#' @param alpha value to test the p value against +#' @param info logical to determine whether the analysis should be printed +#' @param minimum the test with fail if any of the observed values is below this value. Use \code{minimum = 30} for microbial epidemiology, to prevent calculating a p value when less than 30 isolates are available. +#' @section \emph{G}-test of goodness-of-fit (likelihood ratio test): +#' Use the \emph{G}-test of goodness-of-fit when you have one nominal variable with two or more values (such as male and female, or red, pink and white flowers). You compare the observed counts of numbers of observations in each category with the expected counts, which you calculate using some kind of theoretical expectation (such as a 1:1 sex ratio or a 1:2:1 ratio in a genetic cross). +#' +#' If the expected number of observations in any category is too small, the \emph{G}-test may give inaccurate results, and you should use an exact test instead. See the web page on small sample sizes for discussion of what "small" means. +#' +#' The \emph{G}-test of goodness-of-fit is an alternative to the chi-square test of goodness-of-fit; each of these tests has some advantages and some disadvantages, and the results of the two tests are usually very similar. +#' +#' @section \emph{G}-test of independence: +#' Use the \emph{G}-test of independence when you have two nominal variables, each with two or more possible values. You want to know whether the proportions for one variable are different among values of the other variable. +#' +#' It is also possible to do a \emph{G}-test of independence with more than two nominal variables. For example, Jackson et al. (2013) also had data for children under 3, so you could do an analysis of old vs. young, thigh vs. arm, and reaction vs. no reaction, all analyzed together. +#' +#' Fisher's exact test is more accurate than the \emph{G}-test of independence when the expected numbers are small, so it is recommend to only use the \emph{G}-test if your total sample size is greater than 1000. +#' +#' The \emph{G}-test of independence is an alternative to the chi-square test of independence, and they will give approximately the same results. +#' @section How the test works: +#' Unlike the exact test of goodness-of-fit, the \emph{G}-test does not directly calculate the probability of obtaining the observed results or something more extreme. Instead, like almost all statistical tests, the \emph{G}-test has an intermediate step; it uses the data to calculate a test statistic that measures how far the observed data are from the null expectation. You then use a mathematical relationship, in this case the chi-square distribution, to estimate the probability of obtaining that value of the test statistic. +#' +#' The \emph{G}-test uses the log of the ratio of two likelihoods as the test statistic, which is why it is also called a likelihood ratio test or log-likelihood ratio test. The formula to calculate a \emph{G}-statistic is: +#' +#' \code{G <- 2 * sum(x * log(x / x.expected))} +#' +#' Since this is chi-square distributed, the p value can be calculated with: +#' +#' \code{p <- 1 - stats::pchisq(G, df))} +#' +#' where \code{df} are the degrees of freedom: \code{max(NROW(x) - 1, 1) * max(NCOL(x) - 1, 1)}. +#' +#' If there are more than two categories and you want to find out which ones are significantly different from their null expectation, you can use the same method of testing each category vs. the sum of all categories, with the Bonferroni correction. You use \emph{G}-tests for each category, of course. +#' @keywords chi +#' @seealso \code{\link{chisq.test}} +#' @references McDonald, J.H. 2014. \strong{Handbook of Biological Statistics (3rd ed.)}. Sparky House Publishing, Baltimore, Maryland. \url{http://www.biostathandbook.com/gtestgof.html}. +#' @export +#' @importFrom stats pchisq +#' @importFrom dplyr %>% +#' @examples +#' # = EXAMPLE 1 = +#' # Shivrain et al. (2006) crossed clearfield rice (which are resistant +#' # to the herbicide imazethapyr) with red rice (which are susceptible to +#' # imazethapyr). They then crossed the hybrid offspring and examined the +#' # F2 generation, where they found 772 resistant plants, 1611 moderately +#' # resistant plants, and 737 susceptible plants. If resistance is controlled +#' # by a single gene with two co-dominant alleles, you would expect a 1:2:1 +#' # ratio. +#' +#' x <- c(772, 1611, 737) +#' x.expected <- vector2ratio(x, ratio = "1:2:1") +#' x.expected +#' # 780 1560 780 +#' +#' g.test(x, x.expected) +#' # p = 0.12574. +#' +#' # There is no significant difference from a 1:2:1 ratio. +#' # Meaning: resistance controlled by a single gene with two co-dominant +#' # alleles, is plausible. +#' +#' +#' # = EXAMPLE 2 = +#' # Red crossbills (Loxia curvirostra) have the tip of the upper bill either +#' # right or left of the lower bill, which helps them extract seeds from pine +#' # cones. Some have hypothesized that frequency-dependent selection would +#' # keep the number of right and left-billed birds at a 1:1 ratio. Groth (1992) +#' # observed 1752 right-billed and 1895 left-billed crossbills. +#' +#' x <- c(1752, 1895) +#' x.expected <- vector2ratio(x, ratio = c(1, 1)) +#' x.expected +#' # 1823.5 1823.5 +#' +#' g.test(x, x.expected) +#' # p = 0.01787343 +#' +#' # There is a significant difference from a 1:1 ratio. +#' # Meaning: there are significantly more left-billed birds. +#' +g.test <- function(x, + y = NULL, + alpha = 0.05, + info = TRUE, + minimum = 0) { + + if (sum(x) < 1000) { + warning('the sum of all observations is < 1000, consider using a Fishers Exact test instead.') + } + + if (!is.numeric(x)) { + stop('`x` must be a vector or matrix with numeric values.') + } + if (!is.matrix(x) & is.null(y)) { + stop('if `x` is not a matrix, `y` must be given.') + } + + # if (!is.matrix(x)) { + # x <- matrix(x, dimnames = list(rep("", length(x)), "")) + # } + + x.expected <- y + # if (!is.null(x.expected) & !is.matrix(x.expected)) { + # x.expected <- matrix(x.expected, dimnames = list(rep("", length(x.expected)), "")) + # } + + if (NCOL(x) > 1) { + matrix2tbl <- function(x) { + if (!is.matrix(x)) { + x <- matrix(x, dimnames = list(rep("", length(x)), "")) + } + x <- rbind(cbind(x, rowSums(x)), colSums(cbind(x, rowSums(x)))) %>% + as.data.frame() + colnames(x) <- c(paste0('c', 1:(NCOL(x) - 1)), 'Total') + rownames(x) <- c(strrep(" ", 1:(NROW(x) - 1)), 'Total') + x + } + + x.with_totals <- matrix2tbl(x) + if (is.null(x.expected)) { + x.expected <- outer(rowSums(x), colSums(x), "*") / sum(x) + x.expected.with_totals <- matrix2tbl(x.expected) + } + + } else { + x.with_totals <- x + if (is.null(x.expected)) { + x.expected <- matrix(rep(mean(x), length(x)), dimnames = list(rep("", length(x)), "")) + warning('Expected values set to the mean of `x`, because it consists of only one column.') + } + x.expected.with_totals <- x.expected + } + + if (any(x < minimum)) { + warning('One of the observed values is lower than the required minimum of ', minimum, '.') + return(NA_real_) + } + if (any(x.expected < 5)) { + warning('One of the expected values is lower than 5, thus G-statistic approximation may be incorrect.', + '\n Consider doing an Exact test (exact.test).') + } + + Gstat <- 2 * base::sum(x * log(x / x.expected), na.rm = TRUE) + df <- base::max(NROW(x) - 1, 1, na.rm = TRUE) * base::max(NCOL(x) - 1, 1, na.rm = TRUE) + + pval <- 1 - stats::pchisq(Gstat, df = df) + + if (info == TRUE) { + + if (NROW(x) == 2 & NCOL(x) == 2) { + cat('G-test of independence\n\n') + } else { + cat('G-test of goodness-of-fit\n') + cat('(likelihood ratio test)\n\n') + } + + cat(paste0("(O) Observed values:\n")) + print(x.with_totals %>% round(2)) + + cat('\n') + cat('(E) Expected values under null hypothesis:\n') + print(x.expected.with_totals %>% round(2)) + + cat('\n============[G-test]============\n') + cat(' G-statistic :', round(Gstat, 4), '\n') + cat(' Degrees of freedom :', df, '\n') + cat(' P value :', round(pval, 4), '\n') + cat(' Alpha :', round(alpha, 2), '\n') + cat(' Significance :', p.symbol(pval, "-"), '\n') + cat('================================\n\n') + + } + + pval + +} + +#' Transform vector to ratio +#' @param x vector of values +#' @param ratio vector with ratios of \code{x} and with same length (like \code{ratio = c(1, 2, 1)}) or a text with characters \code{":"}, \code{"-"} or \code{","} (like \code{ratio = "1:2:1"} or even \code{ratio = "1:2:1.25"}) +#' @export +#' @seealso \code{\link{g.test}} +#' @references McDonald, J.H. 2014. \strong{Handbook of Biological Statistics (3rd ed.)}. Sparky House Publishing, Baltimore, Maryland. +#' @importFrom dplyr %>% +#' @examples +#' # = EXAMPLE 1 = +#' # Shivrain et al. (2006) crossed clearfield rice (which are resistant +#' # to the herbicide imazethapyr) with red rice (which are susceptible to +#' # imazethapyr). They then crossed the hybrid offspring and examined the +#' # F2 generation, where they found 772 resistant plants, 1611 moderately +#' # resistant plants, and 737 susceptible plants. If resistance is controlled +#' # by a single gene with two co-dominant alleles, you would expect a 1:2:1 +#' # ratio. +#' +#' x <- c(772, 1611, 737) +#' x.expected <- vector2ratio(x, ratio = "1:2:1") +#' x.expected +#' # 780 1560 780 +#' +#' g.test(x, x.expected) +#' # p = 0.12574. +#' +#' # There is no significant difference from a 1:2:1 ratio. +#' # Meaning: resistance controlled by a single gene with two co-dominant +#' # alleles, is plausible. +#' +#' +#' # = EXAMPLE 2 = +#' # Red crossbills (Loxia curvirostra) have the tip of the upper bill either +#' # right or left of the lower bill, which helps them extract seeds from pine +#' # cones. Some have hypothesized that frequency-dependent selection would +#' # keep the number of right and left-billed birds at a 1:1 ratio. Groth (1992) +#' # observed 1752 right-billed and 1895 left-billed crossbills. +#' +#' x <- c(1752, 1895) +#' x.expected <- vector2ratio(x, ratio = c(1, 1)) +#' x.expected +#' # 1823.5 1823.5 +#' +#' g.test(x, x.expected) +#' # p = 0.01787343 +#' +#' # There is a significant difference from a 1:1 ratio. +#' # Meaning: there are significantly more left-billed birds. +#' +vector2ratio <- function(x, ratio) { + if (!all(is.numeric(x))) { + stop('`x` must be a vector of numeric values.') + } + if (length(ratio) == 1) { + if (ratio %like% '^([0-9]+([.][0-9]+)?[-,:])+[0-9]+([.][0-9]+)?$') { + # support for "1:2:1", "1-2-1", "1,2,1" and even "1.75:2:1.5" + ratio <- ratio %>% base::strsplit("[-,:]") %>% base::unlist() %>% base::as.double() + } else { + stop('Invalid `ratio`: ', ratio, '.') + } + } + if (length(x) != length(ratio)) { + stop('`x` and `ratio` must be of same size.') + } + base::sum(x, na.rm = TRUE) * (ratio / base::sum(ratio, na.rm = TRUE)) +} diff --git a/R/globals.R b/R/globals.R index 19b0f2f8..28463b21 100755 --- a/R/globals.R +++ b/R/globals.R @@ -20,18 +20,16 @@ globalVariables(c('abname', 'atc', 'bactid', 'cnt', - 'Count', 'count', - 'Cum', - 'CumTot', + 'cum_count', + 'cum_percent', 'date_lab', 'days_diff', - 'Fctlvl', + 'fctlvl', 'first_isolate_row_index', 'fullname', 'genus', 'gramstain', - 'Item', 'item', 'key_ab', 'key_ab_lag', @@ -43,7 +41,6 @@ globalVariables(c('abname', 'n', 'other_pat_or_mo', 'patient_id', - 'Percent', 'quantile', 'real_first_isolate', 'species', diff --git a/R/p.symbol.R b/R/p.symbol.R new file mode 100644 index 00000000..c11e2727 --- /dev/null +++ b/R/p.symbol.R @@ -0,0 +1,57 @@ +# ==================================================================== # +# TITLE # +# Antimicrobial Resistance (AMR) Analysis # +# # +# AUTHORS # +# Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) # +# # +# LICENCE # +# This program is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License version 2.0, # +# as published by the Free Software Foundation. # +# # +# This program is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# ==================================================================== # + +#' Symbol of a p value +#' +#' Return the symbol related to the p value: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 +#' @param p p value +#' @param emptychar text to show when \code{p > 0.1} +#' @return Text +#' @export +p.symbol <- function(p, emptychar = " ") { + instelling.oud <- options()$scipen + options(scipen = 999) + s <- '' + s[1:length(p)] <- '' + for (i in 1:length(p)) { + if (is.na(p[i])) { + s[i] <- NA + next + } + if (p[i] > 1) { + s[i] <- NA + next + } else { + p_test <- p[i] + } + + if (p_test > 0.1) { + s[i] <- emptychar + } else if (p_test > 0.05) { + s[i] <- '.' + } else if (p_test > 0.01) { + s[i] <- '*' + } else if (p_test > 0.001) { + s[i] <- '**' + } else if (p_test >= 0) { + s[i] <- '***' + } + } + options(scipen = instelling.oud) + s +} diff --git a/R/print.R b/R/print.R index cee58061..30a3248c 100755 --- a/R/print.R +++ b/R/print.R @@ -74,13 +74,6 @@ print.tbl <- function(x, ...) { prettyprint_df(x, ...) } -#' @rdname print -#' @exportMethod print.frequency_tbl -#' @export -print.frequency_tbl <- function(x, ...) { - prettyprint_df(x, ...) -} - #' @rdname print #' @exportMethod print.data.table #' @export diff --git a/man/freq.Rd b/man/freq.Rd index ccf1840e..4c897911 100755 --- a/man/freq.Rd +++ b/man/freq.Rd @@ -6,22 +6,24 @@ \alias{top_freq} \title{Frequency table} \usage{ -freq(x, sort.count = TRUE, nmax = getOption("max.print.freq"), - na.rm = TRUE, row.names = TRUE, markdown = FALSE, - as.data.frame = FALSE, digits = 2, sep = " ") +frequency_tbl(x, ..., sort.count = TRUE, nmax = getOption("max.print.freq"), + na.rm = TRUE, row.names = TRUE, markdown = FALSE, digits = 2, + sep = " ") -frequency_tbl(x, sort.count = TRUE, nmax = getOption("max.print.freq"), - na.rm = TRUE, row.names = TRUE, markdown = FALSE, - as.data.frame = FALSE, digits = 2, sep = " ") +freq(x, ..., sort.count = TRUE, nmax = getOption("max.print.freq"), + na.rm = TRUE, row.names = TRUE, markdown = FALSE, digits = 2, + sep = " ") top_freq(f, n) } \arguments{ -\item{x}{data} +\item{x}{vector with items, or \code{data.frame}} -\item{sort.count}{sort on count. Use \code{FALSE} to sort alphabetically on item.} +\item{...}{up to nine different columns of \code{x} to calculate frequencies from, see Examples} -\item{nmax}{number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0} or \code{nmax = NA} to print all rows.} +\item{sort.count}{sort on count, i.e. frequencies. Use \code{FALSE} to sort alphabetically on item.} + +\item{nmax}{number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.} \item{na.rm}{a logical value indicating whether NA values should be removed from the frequency table. The header will always print the amount of \code{NA}s.} @@ -29,24 +31,19 @@ top_freq(f, n) \item{markdown}{print table in markdown format (this forces \code{nmax = NA})} -\item{as.data.frame}{return frequency table without header as a \code{data.frame} (e.g. to assign the table to an object)} - -\item{digits}{how many significant digits are to be used for numeric values (not for the items themselves, that depends on \code{\link{getOption}("digits")})} +\item{digits}{how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")})} \item{sep}{a character string to separate the terms when selecting multiple columns} -\item{f}{a frequency table as \code{data.frame}, used as \code{freq(..., as.data.frame = TRUE)}} +\item{f}{a frequency table} \item{n}{number of top \emph{n} items to return, use -n for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties.} } \value{ -\itemize{ - \item{When using \code{as.data.frame = FALSE} (default): only printed text} - \item{When using \code{as.data.frame = TRUE}: a \code{data.frame} object with an additional class \code{"frequency_tbl"}} -} +A \code{data.frame} with an additional class \code{"frequency_tbl"} } \description{ -Create a frequency table of a vector of data, a single column or a maximum of 9 columns of a data frame. Supports markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names. +Create a frequency table of a vector with items or a data frame. Supports quasiquotation and markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names. } \details{ This package also has a vignette available about this function, run: \code{browseVignettes("AMR")} to read it. @@ -73,31 +70,41 @@ The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will \examples{ library(dplyr) +# this all gives the same result: freq(septic_patients$hospital_id) +freq(septic_patients[, "hospital_id"]) +septic_patients$hospital_id \%>\% freq() +septic_patients[, "hospital_id"] \%>\% freq() +septic_patients \%>\% freq("hospital_id") +septic_patients \%>\% freq(hospital_id) # <- easiest to remember when used to tidyverse +# you could use `select`... septic_patients \%>\% filter(hospital_id == "A") \%>\% select(bactid) \%>\% freq() +# ... or you use `freq` to select it immediately +septic_patients \%>\% + filter(hospital_id == "A") \%>\% + freq(bactid) + # select multiple columns; they will be pasted together septic_patients \%>\% left_join_microorganisms \%>\% filter(hospital_id == "A") \%>\% - select(genus, species) \%>\% - freq() + freq(genus, species) # save frequency table to an object years <- septic_patients \%>\% mutate(year = format(date, "\%Y")) \%>\% - select(year) \%>\% - freq(as.data.frame = TRUE) + freq(year) +years \%>\% pull(item) # get top 10 bugs of hospital A as a vector septic_patients \%>\% filter(hospital_id == "A") \%>\% - select(bactid) \%>\% - freq(as.data.frame = TRUE) \%>\% + freq(bactid) \%>\% top_freq(10) } \keyword{freq} diff --git a/man/g.test.Rd b/man/g.test.Rd new file mode 100644 index 00000000..7745cb1f --- /dev/null +++ b/man/g.test.Rd @@ -0,0 +1,108 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/g.test.R +\name{g.test} +\alias{g.test} +\title{\emph{G}-test of matrix or vector} +\usage{ +g.test(x, y = NULL, alpha = 0.05, info = TRUE, minimum = 0) +} +\arguments{ +\item{x}{numeric vector or matrix} + +\item{y}{expected value of \code{x}. Leave empty to determine automatically. This can also be ratios of \code{x}, e.g. calculated with \code{\link{vector2ratio}}.} + +\item{alpha}{value to test the p value against} + +\item{info}{logical to determine whether the analysis should be printed} + +\item{minimum}{the test with fail if any of the observed values is below this value. Use \code{minimum = 30} for microbial epidemiology, to prevent calculating a p value when less than 30 isolates are available.} +} +\description{ +A \emph{G}-test can be used to see whether the number of observations in each category fits a theoretical expectation (called a \strong{\emph{G}-test of goodness-of-fit}), or to see whether the proportions of one variable are different for different values of the other variable (called a \strong{\emph{G}-test of independence}). +} +\section{\emph{G}-test of goodness-of-fit (likelihood ratio test)}{ + +Use the \emph{G}-test of goodness-of-fit when you have one nominal variable with two or more values (such as male and female, or red, pink and white flowers). You compare the observed counts of numbers of observations in each category with the expected counts, which you calculate using some kind of theoretical expectation (such as a 1:1 sex ratio or a 1:2:1 ratio in a genetic cross). + +If the expected number of observations in any category is too small, the \emph{G}-test may give inaccurate results, and you should use an exact test instead. See the web page on small sample sizes for discussion of what "small" means. + +The \emph{G}-test of goodness-of-fit is an alternative to the chi-square test of goodness-of-fit; each of these tests has some advantages and some disadvantages, and the results of the two tests are usually very similar. +} + +\section{\emph{G}-test of independence}{ + +Use the \emph{G}-test of independence when you have two nominal variables, each with two or more possible values. You want to know whether the proportions for one variable are different among values of the other variable. + +It is also possible to do a \emph{G}-test of independence with more than two nominal variables. For example, Jackson et al. (2013) also had data for children under 3, so you could do an analysis of old vs. young, thigh vs. arm, and reaction vs. no reaction, all analyzed together. + +Fisher's exact test is more accurate than the \emph{G}-test of independence when the expected numbers are small, so it is recommend to only use the \emph{G}-test if your total sample size is greater than 1000. + +The \emph{G}-test of independence is an alternative to the chi-square test of independence, and they will give approximately the same results. +} + +\section{How the test works}{ + +Unlike the exact test of goodness-of-fit, the \emph{G}-test does not directly calculate the probability of obtaining the observed results or something more extreme. Instead, like almost all statistical tests, the \emph{G}-test has an intermediate step; it uses the data to calculate a test statistic that measures how far the observed data are from the null expectation. You then use a mathematical relationship, in this case the chi-square distribution, to estimate the probability of obtaining that value of the test statistic. + +The \emph{G}-test uses the log of the ratio of two likelihoods as the test statistic, which is why it is also called a likelihood ratio test or log-likelihood ratio test. The formula to calculate a \emph{G}-statistic is: + +\code{G <- 2 * sum(x * log(x / x.expected))} + +Since this is chi-square distributed, the p value can be calculated with: + +\code{p <- 1 - stats::pchisq(G, df))} + +where \code{df} are the degrees of freedom: \code{max(NROW(x) - 1, 1) * max(NCOL(x) - 1, 1)}. + +If there are more than two categories and you want to find out which ones are significantly different from their null expectation, you can use the same method of testing each category vs. the sum of all categories, with the Bonferroni correction. You use \emph{G}-tests for each category, of course. +} + +\examples{ +# = EXAMPLE 1 = +# Shivrain et al. (2006) crossed clearfield rice (which are resistant +# to the herbicide imazethapyr) with red rice (which are susceptible to +# imazethapyr). They then crossed the hybrid offspring and examined the +# F2 generation, where they found 772 resistant plants, 1611 moderately +# resistant plants, and 737 susceptible plants. If resistance is controlled +# by a single gene with two co-dominant alleles, you would expect a 1:2:1 +# ratio. + +x <- c(772, 1611, 737) +x.expected <- vector2ratio(x, ratio = "1:2:1") +x.expected +# 780 1560 780 + +g.test(x, x.expected) +# p = 0.12574. + +# There is no significant difference from a 1:2:1 ratio. +# Meaning: resistance controlled by a single gene with two co-dominant +# alleles, is plausible. + + +# = EXAMPLE 2 = +# Red crossbills (Loxia curvirostra) have the tip of the upper bill either +# right or left of the lower bill, which helps them extract seeds from pine +# cones. Some have hypothesized that frequency-dependent selection would +# keep the number of right and left-billed birds at a 1:1 ratio. Groth (1992) +# observed 1752 right-billed and 1895 left-billed crossbills. + +x <- c(1752, 1895) +x.expected <- vector2ratio(x, ratio = c(1, 1)) +x.expected +# 1823.5 1823.5 + +g.test(x, x.expected) +# p = 0.01787343 + +# There is a significant difference from a 1:1 ratio. +# Meaning: there are significantly more left-billed birds. + +} +\references{ +McDonald, J.H. 2014. \strong{Handbook of Biological Statistics (3rd ed.)}. Sparky House Publishing, Baltimore, Maryland. \url{http://www.biostathandbook.com/gtestgof.html}. +} +\seealso{ +\code{\link{chisq.test}} +} +\keyword{chi} diff --git a/man/p.symbol.Rd b/man/p.symbol.Rd new file mode 100644 index 00000000..d1f91f25 --- /dev/null +++ b/man/p.symbol.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/p.symbol.R +\name{p.symbol} +\alias{p.symbol} +\title{Symbol of a p value} +\usage{ +p.symbol(p, emptychar = " ") +} +\arguments{ +\item{p}{p value} + +\item{emptychar}{text to show when \code{p > 0.1}} +} +\value{ +Text +} +\description{ +Return the symbol related to the p value: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 +} diff --git a/man/print.Rd b/man/print.Rd index b0e47a78..45761131 100755 --- a/man/print.Rd +++ b/man/print.Rd @@ -1,25 +1,27 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/print.R -\name{print} +% Please edit documentation in R/freq.R, R/print.R +\name{print.frequency_tbl} +\alias{print.frequency_tbl} \alias{print} \alias{print.tbl_df} \alias{print.tbl} -\alias{print.frequency_tbl} \alias{print.data.table} \title{Printing Data Tables and Tibbles} \usage{ +\method{print}{frequency_tbl}(x, ...) + \method{print}{tbl_df}(x, nmax = 10, header = TRUE, row.names = TRUE, right = FALSE, width = 1, na = "", ...) \method{print}{tbl}(x, ...) -\method{print}{frequency_tbl}(x, ...) - \method{print}{data.table}(x, print.keys = FALSE, ...) } \arguments{ \item{x}{object of class \code{data.frame}.} +\item{...}{optional arguments to \code{print} or \code{plot} methods.} + \item{nmax}{amount of rows to print in total. When the total amount of rows exceeds this limit, the first and last \code{nmax / 2} rows will be printed. Use \code{nmax = NA} to print all rows.} \item{header}{print header with information about data size and tibble grouping} @@ -34,8 +36,6 @@ \item{na}{value to print instead of NA} -\item{...}{optional arguments to \code{print} or \code{plot} methods.} - \item{print.keys}{print keys for \code{data.table}} } \description{ diff --git a/man/vector2ratio.Rd b/man/vector2ratio.Rd new file mode 100644 index 00000000..3d8801c5 --- /dev/null +++ b/man/vector2ratio.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/g.test.R +\name{vector2ratio} +\alias{vector2ratio} +\title{Transform vector to ratio} +\usage{ +vector2ratio(x, ratio) +} +\arguments{ +\item{x}{vector of values} + +\item{ratio}{vector with ratios of \code{x} and with same length (like \code{ratio = c(1, 2, 1)}) or a text with characters \code{":"}, \code{"-"} or \code{","} (like \code{ratio = "1:2:1"} or even \code{ratio = "1:2:1.25"})} +} +\description{ +Transform vector to ratio +} +\examples{ +# = EXAMPLE 1 = +# Shivrain et al. (2006) crossed clearfield rice (which are resistant +# to the herbicide imazethapyr) with red rice (which are susceptible to +# imazethapyr). They then crossed the hybrid offspring and examined the +# F2 generation, where they found 772 resistant plants, 1611 moderately +# resistant plants, and 737 susceptible plants. If resistance is controlled +# by a single gene with two co-dominant alleles, you would expect a 1:2:1 +# ratio. + +x <- c(772, 1611, 737) +x.expected <- vector2ratio(x, ratio = "1:2:1") +x.expected +# 780 1560 780 + +g.test(x, x.expected) +# p = 0.12574. + +# There is no significant difference from a 1:2:1 ratio. +# Meaning: resistance controlled by a single gene with two co-dominant +# alleles, is plausible. + + +# = EXAMPLE 2 = +# Red crossbills (Loxia curvirostra) have the tip of the upper bill either +# right or left of the lower bill, which helps them extract seeds from pine +# cones. Some have hypothesized that frequency-dependent selection would +# keep the number of right and left-billed birds at a 1:1 ratio. Groth (1992) +# observed 1752 right-billed and 1895 left-billed crossbills. + +x <- c(1752, 1895) +x.expected <- vector2ratio(x, ratio = c(1, 1)) +x.expected +# 1823.5 1823.5 + +g.test(x, x.expected) +# p = 0.01787343 + +# There is a significant difference from a 1:1 ratio. +# Meaning: there are significantly more left-billed birds. + +} +\references{ +McDonald, J.H. 2014. \strong{Handbook of Biological Statistics (3rd ed.)}. Sparky House Publishing, Baltimore, Maryland. +} +\seealso{ +\code{\link{g.test}} +} diff --git a/tests/testthat/test-freq.R b/tests/testthat/test-freq.R index c9fbf11c..18099652 100755 --- a/tests/testthat/test-freq.R +++ b/tests/testthat/test-freq.R @@ -10,43 +10,40 @@ test_that("frequency table works", { length(unique(septic_patients$date))) # int - expect_output(freq(septic_patients$age)) + expect_output(print(freq(septic_patients$age))) # date - expect_output(freq(septic_patients$date)) + expect_output(print(freq(septic_patients$date))) # factor - expect_output(freq(septic_patients$hospital_id)) + expect_output(print(freq(septic_patients$hospital_id))) library(dplyr) - expect_output(septic_patients %>% select(1:2) %>% freq()) - expect_output(septic_patients %>% select(1:3) %>% freq()) - expect_output(septic_patients %>% select(1:4) %>% freq()) - expect_output(septic_patients %>% select(1:5) %>% freq()) - expect_output(septic_patients %>% select(1:6) %>% freq()) - expect_output(septic_patients %>% select(1:7) %>% freq()) - expect_output(septic_patients %>% select(1:8) %>% freq()) - expect_output(septic_patients %>% select(1:9) %>% freq()) + expect_output(septic_patients %>% select(1:2) %>% freq() %>% print()) + expect_output(septic_patients %>% select(1:3) %>% freq() %>% print()) + expect_output(septic_patients %>% select(1:4) %>% freq() %>% print()) + expect_output(septic_patients %>% select(1:5) %>% freq() %>% print()) + expect_output(septic_patients %>% select(1:6) %>% freq() %>% print()) + expect_output(septic_patients %>% select(1:7) %>% freq() %>% print()) + expect_output(septic_patients %>% select(1:8) %>% freq() %>% print()) + expect_output(septic_patients %>% select(1:9) %>% freq() %>% print()) # top 5 expect_equal( septic_patients %>% - select(bactid) %>% - freq(as.data.frame = TRUE) %>% + freq(bactid) %>% top_freq(5) %>% length(), 5) # there're more than 5 lowest values expect_gt( septic_patients %>% - select(bactid) %>% - freq(as.data.frame = TRUE) %>% + freq(bactid) %>% top_freq(-5) %>% length(), 5) # n has length > 1 expect_error( septic_patients %>% - select(bactid) %>% - freq(as.data.frame = TRUE) %>% + freq(bactid) %>% top_freq(n = c(1, 2)) ) # input must be freq tbl diff --git a/tests/testthat/test-g.test.R b/tests/testthat/test-g.test.R new file mode 100644 index 00000000..14febb54 --- /dev/null +++ b/tests/testthat/test-g.test.R @@ -0,0 +1,19 @@ +context("g.test.R") + +test_that("G-test works", { + + # example 1: clearfield rice vs. red rice + x <- c(772, 1611, 737) + x.expected <- vector2ratio(x, ratio = "1:2:1") + expect_equal(g.test(x, x.expected), + expected = 0.12574, + tolerance = 0.00001) + + # example 2: red crossbills + x <- c(1752, 1895) + x.expected <- vector2ratio(x, ratio = c(1, 1)) + expect_equal(g.test(x, x.expected), + expected = 0.01787343, + tolerance = 0.00000001) + +}) diff --git a/tests/testthat/test-mdro.R b/tests/testthat/test-mdro.R index d2d38986..9433be36 100755 --- a/tests/testthat/test-mdro.R +++ b/tests/testthat/test-mdro.R @@ -13,7 +13,7 @@ test_that("MDRO works", { expect_equal(outcome %>% class(), c('ordered', 'factor')) # septic_patients should have these finding using Dutch guidelines - expect_equal(outcome %>% freq(as.data.frame = TRUE) %>% pull(count), c(3, 21)) + expect_equal(outcome %>% freq() %>% pull(count), c(3, 21)) expect_equal(BRMO(septic_patients, info = FALSE), MDRO(septic_patients, "nl", info = FALSE)) diff --git a/vignettes/freq.R b/vignettes/freq.R index 74701955..bb94ee97 100644 --- a/vignettes/freq.R +++ b/vignettes/freq.R @@ -10,20 +10,23 @@ library(AMR) # just using base R freq(septic_patients$sex) -# using base R to select the variable and pass it on with a pipe +# using base R to select the variable and pass it on with a pipe from the dplyr package septic_patients$sex %>% freq() -# do it all with pipes, using the `select` function of the dplyr package +# do it all with pipes, using the `select` function from the dplyr package septic_patients %>% select(sex) %>% freq() +# or the preferred way: using a pipe to pass the variable on to the freq function +septic_patients %>% freq(sex) # this also shows 'age' in the title + + ## ---- echo = TRUE-------------------------------------------------------- freq(septic_patients$sex) ## ---- echo = TRUE, results = 'hide'-------------------------------------- -my_patients <- septic_patients %>% - left_join_microorganisms() +my_patients <- septic_patients %>% left_join_microorganisms() ## ---- echo = TRUE-------------------------------------------------------- colnames(microorganisms) @@ -33,26 +36,21 @@ dim(septic_patients) dim(my_patients) ## ---- echo = TRUE-------------------------------------------------------- -my_patients %>% - select(genus, species) %>% - freq() +my_patients %>% freq(genus, species) ## ---- echo = TRUE-------------------------------------------------------- # # get age distribution of unique patients septic_patients %>% distinct(patient_id, .keep_all = TRUE) %>% - select(age) %>% - freq(nmax = 5) + freq(age, nmax = 5) ## ---- echo = TRUE-------------------------------------------------------- septic_patients %>% - select(hospital_id) %>% - freq() + freq(hospital_id) ## ---- echo = TRUE-------------------------------------------------------- septic_patients %>% - select(hospital_id) %>% - freq(sort.count = TRUE) + freq(hospital_id, sort.count = TRUE) ## ---- echo = TRUE-------------------------------------------------------- septic_patients %>% @@ -65,29 +63,24 @@ septic_patients %>% freq(nmax = 5) ## ---- echo = TRUE-------------------------------------------------------- -septic_patients %>% - select(amox) %>% - freq(na.rm = FALSE) - -## ---- echo = TRUE-------------------------------------------------------- -septic_patients %>% - select(hospital_id) %>% - freq(row.names = FALSE) - -## ---- echo = TRUE-------------------------------------------------------- -septic_patients %>% - select(hospital_id) %>% - freq(markdown = TRUE) - -## ---- echo = TRUE-------------------------------------------------------- -my_df <- septic_patients %>% - select(hospital_id) %>% - freq(as.data.frame = TRUE) - -my_df - +my_df <- septic_patients %>% freq(age) class(my_df) +## ---- echo = TRUE-------------------------------------------------------- +dim(my_df) + +## ---- echo = TRUE-------------------------------------------------------- +septic_patients %>% + freq(amox, na.rm = FALSE) + +## ---- echo = TRUE-------------------------------------------------------- +septic_patients %>% + freq(hospital_id, row.names = FALSE) + +## ---- echo = TRUE-------------------------------------------------------- +septic_patients %>% + freq(hospital_id, markdown = TRUE) + ## ---- echo = FALSE------------------------------------------------------- # this will print "2018" in 2018, and "2018-yyyy" after 2018. yrs <- c(2018:format(Sys.Date(), "%Y")) diff --git a/vignettes/freq.Rmd b/vignettes/freq.Rmd index 7d625bc4..0457b42d 100755 --- a/vignettes/freq.Rmd +++ b/vignettes/freq.Rmd @@ -30,13 +30,17 @@ To only show and quickly review the content of one variable, you can just select # just using base R freq(septic_patients$sex) -# using base R to select the variable and pass it on with a pipe +# using base R to select the variable and pass it on with a pipe from the dplyr package septic_patients$sex %>% freq() -# do it all with pipes, using the `select` function of the dplyr package +# do it all with pipes, using the `select` function from the dplyr package septic_patients %>% select(sex) %>% freq() + +# or the preferred way: using a pipe to pass the variable on to the freq function +septic_patients %>% freq(sex) # this also shows 'age' in the title + ``` This will all lead to the following table: ```{r, echo = TRUE} @@ -50,8 +54,7 @@ Multiple variables will be pasted into one variable to review individual cases, For illustration, we could add some more variables to the `septic_patients` dataset to learn about bacterial properties: ```{r, echo = TRUE, results = 'hide'} -my_patients <- septic_patients %>% - left_join_microorganisms() +my_patients <- septic_patients %>% left_join_microorganisms() ``` Now all variables of the `microorganisms` dataset have been joined to the `septic_patients` dataset. The `microorganisms` dataset consists of the following variables: ```{r, echo = TRUE} @@ -66,9 +69,7 @@ dim(my_patients) So now the `genus` and `species` variables are available. A frequency table of these combined variables can be created like this: ```{r, echo = TRUE} -my_patients %>% - select(genus, species) %>% - freq() +my_patients %>% freq(genus, species) ``` ## Frequencies of numeric values @@ -81,8 +82,7 @@ In case of numeric values (like integers, doubles, etc.) additional descriptive # # get age distribution of unique patients septic_patients %>% distinct(patient_id, .keep_all = TRUE) %>% - select(age) %>% - freq(nmax = 5) + freq(age, nmax = 5) ``` So the following properties are determined, where `NA` values are always ignored: @@ -109,16 +109,14 @@ Frequencies of factors will be sorted on factor level instead of item count by d ```{r, echo = TRUE} septic_patients %>% - select(hospital_id) %>% - freq() + freq(hospital_id) ``` ... with this, where items are now sorted on count: ```{r, echo = TRUE} septic_patients %>% - select(hospital_id) %>% - freq(sort.count = TRUE) + freq(hospital_id, sort.count = TRUE) ``` All classes will be printed into the header. Variables with the new `rsi` class of this AMR package are actually ordered factors and have three classes (look at `Class` in the header): @@ -139,6 +137,21 @@ septic_patients %>% freq(nmax = 5) ``` +## Assigning a frequency table to an object + +A frequency table is actaually a regular `data.frame`, with the exception that it contains an additional class. + +```{r, echo = TRUE} +my_df <- septic_patients %>% freq(age) +class(my_df) +``` + +Because of this additional class, a frequency table prints like the examples above. But the object itself contains the complete table without a row limitation: + +```{r, echo = TRUE} +dim(my_df) +``` + ## Additional parameters ### Parameter `na.rm` @@ -146,8 +159,7 @@ With the `na.rm` parameter (defaults to `TRUE`, but they will always be shown in ```{r, echo = TRUE} septic_patients %>% - select(amox) %>% - freq(na.rm = FALSE) + freq(amox, na.rm = FALSE) ``` ### Parameter `row.names` @@ -155,8 +167,7 @@ The default frequency tables shows row indices. To remove them, use `row.names = ```{r, echo = TRUE} septic_patients %>% - select(hospital_id) %>% - freq(row.names = FALSE) + freq(hospital_id, row.names = FALSE) ``` ### Parameter `markdown` @@ -164,21 +175,7 @@ The `markdown` parameter can be used in reports created with R Markdown. This wi ```{r, echo = TRUE} septic_patients %>% - select(hospital_id) %>% - freq(markdown = TRUE) -``` - -### Parameter `as.data.frame` -With the `as.data.frame` parameter you can assign the frequency table to an object, or just print it as a `data.frame` to the console: - -```{r, echo = TRUE} -my_df <- septic_patients %>% - select(hospital_id) %>% - freq(as.data.frame = TRUE) - -my_df - -class(my_df) + freq(hospital_id, markdown = TRUE) ``` ----