1
0
mirror of https://github.com/msberends/AMR.git synced 2025-01-13 22:51:37 +01:00
AMR/man/freq.Rd

194 lines
7.1 KiB
Plaintext
Raw Normal View History

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/freq.R
\name{freq}
\alias{freq}
\alias{frequency_tbl}
2018-06-20 14:47:37 +02:00
\alias{top_freq}
2018-07-03 11:30:40 +02:00
\alias{print.frequency_tbl}
\title{Frequency table}
\usage{
2018-08-11 21:30:00 +02:00
frequency_tbl(x, ..., sort.count = TRUE,
nmax = getOption("max.print.freq"), na.rm = TRUE, row.names = TRUE,
markdown = !interactive(), digits = 2, quote = FALSE,
2018-10-31 12:10:49 +01:00
header = !markdown, title = NULL, na = "<NA>", sep = " ")
2018-07-01 21:40:37 +02:00
freq(x, ..., sort.count = TRUE, nmax = getOption("max.print.freq"),
na.rm = TRUE, row.names = TRUE, markdown = !interactive(),
2018-10-31 12:10:49 +01:00
digits = 2, quote = FALSE, header = !markdown, title = NULL,
na = "<NA>", sep = " ")
2018-06-20 14:47:37 +02:00
top_freq(f, n)
2018-07-03 11:30:40 +02:00
2018-08-11 21:30:00 +02:00
\method{print}{frequency_tbl}(x, nmax = getOption("max.print.freq",
2018-11-16 22:28:55 +01:00
default = 15), markdown = !interactive(), header = !markdown, ...)
}
\arguments{
2018-11-06 16:41:59 +01:00
\item{x}{vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} (may contain a grouping variable) or \code{\link{table}}}
2018-07-09 14:02:58 +02:00
\item{...}{up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples}
2018-11-06 16:41:59 +01:00
\item{sort.count}{sort on count, i.e. frequencies. This will be \code{TRUE} by default for everything except when using grouping variables.}
2018-07-01 21:40:37 +02:00
2018-07-03 11:30:40 +02:00
\item{nmax}{number of rows to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.}
2018-10-23 11:15:05 +02:00
\item{na.rm}{a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the number of \code{NA}s.}
\item{row.names}{a logical value indicating whether row indices should be printed as \code{1:nrow(x)}}
\item{markdown}{a logical value indicating whether the frequency table should be printed in markdown format. This will print all rows and is default behaviour in non-interactive R sessions (like when knitting RMarkdown files).}
\item{digits}{how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")})}
\item{quote}{a logical value indicating whether or not strings should be printed with surrounding quotes}
2018-10-19 21:52:08 +02:00
\item{header}{a logical value indicating whether an informative header should be printed}
2018-10-31 12:10:49 +01:00
\item{title}{text to show above the frequency table; by default, it is coerced from the variables passed to \code{x}}
2018-10-23 11:15:05 +02:00
\item{na}{a character string that should be used to show empty (\code{NA}) values (only useful when \code{na.rm = FALSE})}
\item{sep}{a character string to separate the terms when selecting multiple columns}
2018-06-20 14:47:37 +02:00
2018-07-01 21:40:37 +02:00
\item{f}{a frequency table}
2018-06-20 14:47:37 +02:00
\item{n}{number of top \emph{n} items to return, use \code{-n} for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties.}
}
\value{
A \code{data.frame} (with an additional class \code{"frequency_tbl"}) with five columns: \code{item}, \code{count}, \code{percent}, \code{cum_count} and \code{cum_percent}.
}
\description{
2018-07-01 21:40:37 +02:00
Create a frequency table of a vector with items or a data frame. Supports quasiquotation and markdown for reports. \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names.
}
\details{
2018-07-03 11:30:40 +02:00
Frequency tables (or frequency distributions) are summaries of the distribution of values in a sample. With the \code{freq} function, you can create univariate frequency tables. Multiple variables will be pasted into one variable, so it forces a univariate distribution. This package also has a vignette available that explains the use of this function further; run \code{browseVignettes("AMR")} to read it.
2018-06-20 14:47:37 +02:00
For numeric values of any class, these additional values will all be calculated with \code{na.rm = TRUE} and shown in the header:
\itemize{
\item{Mean, using \code{\link[base]{mean}}}
2018-07-03 11:30:40 +02:00
\item{Standard Deviation, using \code{\link[stats]{sd}}}
\item{Coefficient of Variation (CV), the standard deviation divided by the mean}
\item{Median Absolute Deviation (MAD), using \code{\link[stats]{mad}}}
\item{Tukey Five-Number Summaries (minimum, Q1, median, Q3, maximum), using \code{\link[stats]{fivenum}}}
\item{Interquartile Range (IQR) calculated as \code{Q3 - Q1} using the Tukey Five-Number Summaries, i.e. \strong{not} using the \code{\link[stats]{quantile}} function}
\item{Coefficient of Quartile Variation (CQV, sometimes called coefficient of dispersion), calculated as \code{(Q3 - Q1) / (Q3 + Q1)} using the Tukey Five-Number Summaries}
\item{Outliers (total count and unique count), using \code{\link[grDevices]{boxplot.stats}}}
}
2018-06-20 14:47:37 +02:00
For dates and times of any class, these additional values will be calculated with \code{na.rm = TRUE} and shown in the header:
2018-06-20 14:47:37 +02:00
\itemize{
2018-07-08 22:14:55 +02:00
\item{Oldest, using \code{\link{min}}}
\item{Newest, using \code{\link{max}}, with difference between newest and oldest}
2018-06-20 14:47:37 +02:00
\item{Median, using \code{\link[stats]{median}}, with percentage since oldest}
}
2018-07-03 11:30:40 +02:00
2018-06-20 14:47:37 +02:00
The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will include more than \code{n} rows if there are ties.
}
\examples{
library(dplyr)
2018-07-01 21:40:37 +02:00
# this all gives the same result:
freq(septic_patients$hospital_id)
2018-07-01 21:40:37 +02:00
freq(septic_patients[, "hospital_id"])
septic_patients$hospital_id \%>\% freq()
septic_patients[, "hospital_id"] \%>\% freq()
septic_patients \%>\% freq("hospital_id")
2018-11-06 16:41:59 +01:00
septic_patients \%>\% freq(hospital_id) #<- easiest to remember (tidyverse)
2018-07-09 14:02:58 +02:00
# you could also use `select` or `pull` to get your variables
septic_patients \%>\%
filter(hospital_id == "A") \%>\%
2018-08-31 13:36:19 +02:00
select(mo) \%>\%
freq()
2018-07-09 14:02:58 +02:00
# multiple selected variables will be pasted together
septic_patients \%>\%
left_join_microorganisms \%>\%
filter(hospital_id == "A") \%>\%
2018-07-01 21:40:37 +02:00
freq(genus, species)
2018-11-06 16:41:59 +01:00
# group a variable and analyse another
septic_patients \%>\%
group_by(hospital_id) \%>\%
freq(gender)
2018-07-03 11:30:40 +02:00
# get top 10 bugs of hospital A as a vector
septic_patients \%>\%
filter(hospital_id == "A") \%>\%
2018-08-31 13:36:19 +02:00
freq(mo) \%>\%
2018-07-03 11:30:40 +02:00
top_freq(10)
# save frequency table to an object
years <- septic_patients \%>\%
mutate(year = format(date, "\%Y")) \%>\%
2018-07-01 21:40:37 +02:00
freq(year)
2018-06-20 14:47:37 +02:00
2018-07-09 14:02:58 +02:00
# show only the top 5
2018-07-03 11:30:40 +02:00
years \%>\% print(nmax = 5)
2018-07-16 16:41:48 +02:00
# save to an object with formatted percentages
years <- format(years)
2018-07-09 14:02:58 +02:00
# print a histogram of numeric values
septic_patients \%>\%
freq(age) \%>\%
2018-10-01 14:44:40 +02:00
hist()
2018-07-09 14:02:58 +02:00
2018-07-09 14:02:58 +02:00
# or print all points to a regular plot
septic_patients \%>\%
freq(age) \%>\%
plot()
2018-07-09 14:02:58 +02:00
# transform to a data.frame or tibble
2018-06-20 14:47:37 +02:00
septic_patients \%>\%
2018-07-03 11:30:40 +02:00
freq(age) \%>\%
as.data.frame()
2018-07-09 14:02:58 +02:00
2018-07-09 14:02:58 +02:00
# or transform (back) to a vector
septic_patients \%>\%
freq(age) \%>\%
as.vector()
identical(septic_patients \%>\%
freq(age) \%>\%
as.vector() \%>\%
sort(),
2018-07-16 16:41:48 +02:00
sort(septic_patients$age)) # TRUE
2018-07-09 14:02:58 +02:00
# it also supports `table` objects
2018-09-29 21:54:32 +02:00
table(septic_patients$gender,
2018-07-09 14:02:58 +02:00
septic_patients$age) \%>\%
2018-07-16 16:41:48 +02:00
freq(sep = " **sep** ")
# only get selected columns
septic_patients \%>\%
freq(hospital_id) \%>\%
select(item, percent)
septic_patients \%>\%
freq(hospital_id) \%>\%
select(-count, -cum_count)
2018-10-01 14:44:40 +02:00
# check differences between frequency tables
diff(freq(septic_patients$trim),
freq(septic_patients$trsu))
}
\keyword{freq}
\keyword{frequency}
\keyword{summarise}
\keyword{summary}