This commit is contained in:
dr. M.S. (Matthijs) Berends 2018-04-03 16:07:32 +02:00
parent 4a47e59e6f
commit 3165c50d06
13 changed files with 226 additions and 158 deletions

5
NEWS
View File

@ -1,10 +1,11 @@
## 0.1.2
- Added full support for Windows, Linux and macOS; this package now works everywhere :)
- New function `guess_bactid` to determine the ID of a microorganism based on genus/species
- Added full support for Windows, Linux and macOS
- New function `guess_bactid` to determine the ID of a microorganism based on genus/species or known abbreviations like MRSA
- New functions `clipboard_import` and `clipboard_export` as helper functions to quickly copy and paste from/to software like Excel and SPSS
- New algorithm to determine weighted isolates, can now be `"points"` or `"keyantibiotics"`, see `?first_isolate`
- Renamed dataset `ablist` to `antibiotics`
- Renamed dataset `bactlist` to `microorganisms`
- Added more microorganisms to the `microorganisms` dataset (formerly `bactlist`)
- Added analysis examples on help page of dataset `septic_patients`
- Added support for character vector in join functions
- Added warnings when applying a join results in more rows after than before the join

130
R/atc.R
View File

@ -237,3 +237,133 @@ abname <- function(abcode, from = c("guess", "atc", "molis", "umcg"), to = 'offi
abcode
}
#' Find bacteria ID based on genus/species
#'
#' Use this function to determine a valid ID based on a genus (and species). This input could be a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), or just a genus. You could also use a \code{\link{paste}} of a genus and species column to use the full name as input: \code{x = paste(df$genus, df$species)}, where \code{df} is your dataframe.
#' @param x character vector to determine \code{bactid}
#' @export
#' @importFrom dplyr %>% filter slice pull
#' @return Character vector with one element per element of \code{x}. An empty string (\code{""}) is returned for every element for which no ID could be determined, including \code{NA} and input that contains no letters.
#' @seealso \code{\link{microorganisms}} for the dataframe that is being used to determine IDs.
#' @examples
#' # These examples all return "STAAUR", the ID of S. aureus:
#' guess_bactid("stau")
#' guess_bactid("STAU")
#' guess_bactid("staaur")
#' guess_bactid("S. aureus")
#' guess_bactid("S aureus")
#' guess_bactid("Staphylococcus aureus")
#' guess_bactid("MRSA") # Methicillin-resistant S. aureus
#' guess_bactid("VISA") # Vancomycin Intermediate S. aureus
guess_bactid <- function(x) {
  # paste0() below would turn a zero-length input into the single pattern "^$"
  if (length(x) == 0) {
    return(character(0))
  }

  # remove dots and other non-text in case of "E. coli" except spaces
  x <- gsub("[^a-zA-Z ]+", "", x)
  # but spaces before and after should be omitted
  x <- trimws(x, which = "both")
  # cleaned input without regex decoration, used for ID/abbreviation lookups
  x.bak <- x
  # replace space by regex sign
  x <- gsub(" ", ".*", x, fixed = TRUE)
  # genus-only pattern, built before the anchors are added
  x_species <- paste(x, 'species')
  # add start and stop
  x <- paste0('^', x, '$')

  # abbreviated names that would otherwise hit another species first:
  # Entamoeba coli, Haematobacter influenzae, Staphylococcus auricularis
  # and Pasteurella aerogenes
  ambiguous <- c('^e.*coli$' = 'Escherichia coli',
                 '^h.*influenzae$' = 'Haemophilus influenzae',
                 '^st.*au$' = 'Staphylococcus aureus',
                 '^stau$' = 'Staphylococcus aureus',
                 '^staaur$' = 'Staphylococcus aureus',
                 '^p.*aer$' = 'Pseudomonas aeruginosa')
  # known trivial names (resistance phenotypes) and their genus+species
  trivial <- c('MRSA' = 'Staphylococcus aureus',
               'VISA' = 'Staphylococcus aureus',
               'VRSA' = 'Staphylococcus aureus',
               'MRSE' = 'Staphylococcus epidermidis',
               'VRE' = 'Enterococcus',
               'MRPA' = 'Pseudomonas aeruginosa',
               'PISP' = 'Streptococcus pneumoniae',
               'PRSP' = 'Streptococcus pneumoniae',
               'VISP' = 'Streptococcus pneumoniae',
               'VRSP' = 'Streptococcus pneumoniae')

  # "" means 'no ID found'; every element keeps that value unless matched
  result <- character(length(x))

  for (i in seq_along(x)) {
    # NA would break the scalar conditions below, and an empty search string
    # would match arbitrary rows (e.g. any '... species'), so skip both
    if (is.na(x.bak[i]) || x.bak[i] == "") {
      next
    }

    pattern_lower <- tolower(x[i])
    if (pattern_lower %in% names(ambiguous)) {
      x[i] <- ambiguous[[pattern_lower]]
    }
    # translate known trivial names to genus+species
    abbreviation <- toupper(x.bak[i])
    if (abbreviation %in% names(trivial)) {
      x[i] <- trivial[[abbreviation]]
    }

    # let's try the ID's first
    found <- AMR::microorganisms %>% filter(bactid == x.bak[i])
    if (nrow(found) == 0) {
      # now try exact match
      found <- AMR::microorganisms %>% filter(fullname == x[i])
    }
    if (nrow(found) == 0) {
      # try any match
      found <- AMR::microorganisms %>% filter(fullname %like% x[i])
    }
    if (nrow(found) == 0) {
      # try only genus, with 'species' attached
      found <- AMR::microorganisms %>% filter(fullname %like% x_species[i])
    }
    if (nrow(found) == 0) {
      # search for GLIMS code
      found <- AMR::microorganisms.umcg %>%
        filter(toupper(mocode) == toupper(x.bak[i]))
    }
    if (nrow(found) == 0) {
      # try splitting of characters and then find ID
      # like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus
      x_length <- nchar(x.bak[i])
      # integer division: substr() truncated the fractional positions anyway
      half <- x_length %/% 2
      x_split <- paste0(trimws(substr(x.bak[i], 1, half)),
                        '.* ',
                        trimws(substr(x.bak[i], half + 1, x_length)))
      found <- AMR::microorganisms %>% filter(fullname %like% paste0('^', x_split))
    }
    if (nrow(found) == 0) {
      # try any match with text before and after original search string
      # so "negative rods" will be "GNR"
      if (x.bak[i] %like% "^Gram") {
        x.bak[i] <- gsub("^Gram", "", x.bak[i], ignore.case = TRUE)
        # remove leading and trailing spaces again
        x.bak[i] <- trimws(x.bak[i], which = "both")
      }
      # a bare "Gram" leaves nothing to search for and would match every row
      if (x.bak[i] != "") {
        found <- AMR::microorganisms %>% filter(fullname %like% x.bak[i])
      }
    }

    if (nrow(found) != 0) {
      result[i] <- found %>%
        slice(1) %>%
        pull(bactid)
    }
  }
  result
}

View File

@ -361,26 +361,19 @@ print.mic <- function(x, ...) {
#' @exportMethod summary.mic
#' @export
#' @importFrom dplyr %>% tibble group_by summarise pull
#' @importFrom dplyr %>%
#' @noRd
summary.mic <- function(object, ...) {
  # Summarise a 'mic' vector as a named character vector with four elements:
  # "Mode" (always 'mic'), "<NA>" (number of missing values), and "Min." and
  # "Max." (lowest and highest non-missing value, as text).
  n_total <- length(object)
  x <- object[!is.na(object)]
  n <- length(x)
  if (n > 0) {
    x_sorted <- sort(x)
    x_range <- as.character(x_sorted[c(1, n)])
  } else {
    # only missing values: keep all four elements so the names still fit
    x_range <- rep(NA_character_, 2)
  }
  lst <- c('mic', n_total - n, x_range)
  names(lst) <- c("Mode", "<NA>", "Min.", "Max.")
  lst
}
#' @exportMethod plot.mic

View File

@ -1,17 +1,35 @@
#' Import/export from clipboard
#'
#' These are helper functions around \code{\link{read.table}} and \code{\link{write.table}} to import from and export to clipboard, with support for Windows, Linux and macOS. The data will be read and written as tab-separated by default, which makes it possible to copy and paste from other software like Excel and SPSS without further transformation.
#' These are helper functions around \code{\link{read.table}} and \code{\link{write.table}} to import from and export to clipboard with support for Windows, Linux and macOS. The data will be read and written as tab-separated by default, which makes it possible to copy and paste from other software like Excel and SPSS without further transformation. See Details for an example.
#' @rdname clipboard
#' @name clipboard
#' @inheritParams utils::read.table
#' @inheritParams utils::write.table
#' @param startrow \emph{n}th row to start importing from. For \code{clipboard_import}, when \code{header = TRUE} the import will start on row \code{startrow} \emph{below} the header.
#' @param startrow \emph{n}th row to start importing from. When \code{header = TRUE}, the import will start on row \code{startrow} \emph{below} the header.
#' @param as_vector a logical value indicating whether data consisting of only one column should be imported as vector using \code{\link[dplyr]{pull}}. This will strip off the header.
#' @param info print info about copying
#' @keywords clipboard clipboard_import clipboard_export import export
#' @importFrom dplyr %>% pull as_tibble
#' @importFrom utils read.delim write.table object.size
#' @details For \code{clipboard_export}, the reserved clipboard size for exporting will be set automatically to 125\% of the object size of \code{x}. This way, it is possible to export data with thousands of rows as the only limit will be your system's RAM.
#' @details For \code{clipboard_export()}, the reserved clipboard size for exporting will be set to 125\% of the object size of \code{x}. This way, it is possible to export data with thousands of rows as the only limit will be your system's RAM.
#'
#' Example for copying from Excel:
#' \if{html}{
#' \out{<div style="text-align: left">}\figure{Excel_copy.png}\out{</div>}
#' }
#' \if{latex}{
#' \out{\begin{flushleft}}\figure{Excel_copy.png}\out{\end{flushleft}}
#' }
#' \cr
#' And pasting in R: \cr \cr
#' \code{> data <- clipboard_import()} \cr
#' \code{> data} \cr
#' \if{html}{
#' \out{<div style="text-align: left">}\figure{Excel_paste.png}\out{</div>}
#' }
#' \if{latex}{
#' \out{\begin{flushleft}}\figure{Excel_paste.png}\out{\end{flushleft}}
#' }
#' @export
#' @return data.frame
clipboard_import <- function(sep = '\t',

View File

@ -610,105 +610,3 @@ key_antibiotics_equal <- function(x,
}
result
}
#' Find bacteria ID based on genus/species
#'
#' Use this function to determine a valid ID based on a genus (and species). This input could be a full name (like \code{"Staphylococcus aureus"}), an abbreviated name (like \code{"S. aureus"}), or just a genus. You could also use a \code{\link{paste}} of a genus and species column to use the full name as input: \code{x = paste(df$genus, df$species)}, where \code{df} is your dataframe.
#' @param x character vector to determine \code{bactid}
#' @export
#' @importFrom dplyr %>% filter slice pull
#' @return Character (vector).
#' @seealso \code{\link{microorganisms}} for the dataframe that is being used to determine ID's.
#' @examples
#' # These examples all return "STAAUR", the ID of S. aureus:
#' guess_bactid("stau")
#' guess_bactid("STAU")
#' guess_bactid("staaur")
#' guess_bactid("S. aureus")
#' guess_bactid("S aureus")
#' guess_bactid("Staphylococcus aureus")
#' guess_bactid("MRSA") # Methicillin-resistant S. aureus
#' guess_bactid("VISA") # Vancomycin Intermediate S. aureus
guess_bactid <- function(x) {
# Resolve each element of x to a bactid by trying a fixed cascade of lookups
# against AMR::microorganisms (and AMR::microorganisms.umcg); elements that
# match nothing are returned as "".
# remove dots and other non-text in case of "E. coli" except spaces
x <- gsub("[^a-zA-Z ]+", "", x)
# keep the cleaned input without regex decoration; it is used below for the
# ID, abbreviation, GLIMS and split lookups
x.bak <- x
# replace space by regex sign
x <- gsub(" ", ".*", x, fixed = TRUE)
# genus-only pattern with ' species' attached, built before the anchors are added
x_species <- paste(x, 'species')
# add start and stop
x <- paste0('^', x, '$')
# NOTE(review): the scalar conditions below yield NA for NA input, which `if`
# rejects - confirm callers never pass NA
for (i in 1:length(x)) {
if (tolower(x[i]) == '^e.*coli$') {
# avoid detection of Entamoeba coli in case of E. coli
x[i] <- 'Escherichia coli'
}
if (tolower(x[i]) == '^h.*influenzae$') {
# avoid detection of Haematobacter influenzae in case of H. influenzae
x[i] <- 'Haemophilus influenzae'
}
if (tolower(x[i]) == '^st.*au$'
| tolower(x[i]) == '^stau$'
| tolower(x[i]) == '^staaur$') {
# avoid detection of Staphylococcus auricularis in case of S. aureus
x[i] <- 'Staphylococcus aureus'
}
if (tolower(x[i]) == '^p.*aer$') {
# avoid detection of Pasteurella aerogenes in case of Pseudomonas aeruginosa
x[i] <- 'Pseudomonas aeruginosa'
}
# translate known trivial names to genus+species
if (toupper(x.bak[i]) == 'MRSA'
| toupper(x.bak[i]) == 'VISA'
| toupper(x.bak[i]) == 'VRSA') {
x[i] <- 'Staphylococcus aureus'
}
if (toupper(x.bak[i]) == 'MRSE') {
x[i] <- 'Staphylococcus epidermidis'
}
if (toupper(x.bak[i]) == 'VRE') {
x[i] <- 'Enterococcus'
}
# let's try the ID's first (compared as typed, without changing case)
found <- AMR::microorganisms %>% filter(bactid == x.bak[i])
if (nrow(found) == 0) {
# now try exact match
found <- AMR::microorganisms %>% filter(fullname == x[i])
}
if (nrow(found) == 0) {
# try any match
# (%like% is defined elsewhere; presumably a regex match - verify)
found <- AMR::microorganisms %>% filter(fullname %like% x[i])
}
if (nrow(found) == 0) {
# try only genus, with 'species' attached
found <- AMR::microorganisms %>% filter(fullname %like% x_species[i])
}
if (nrow(found) == 0) {
# search for GLIMS code, ignoring case on both sides
if (toupper(x.bak[i]) %in% toupper(AMR::microorganisms.umcg$mocode)) {
found <- AMR::microorganisms.umcg %>% filter(toupper(mocode) == toupper(x.bak[i]))
}
}
if (nrow(found) == 0) {
# try splitting of characters and then find ID
# like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus
x_length <- nchar(x.bak[i])
# x[i] is overwritten with the split pattern: first half, '.* ', second half
x[i] <- paste0(x.bak[i] %>% substr(1, x_length / 2) %>% trimws(),
'.* ',
x.bak[i] %>% substr((x_length / 2) + 1, x_length) %>% trimws())
found <- AMR::microorganisms %>% filter(fullname %like% paste0('^', x[i]))
}
# from here on x[i] holds the result: the bactid of the first match, or ""
if (nrow(found) != 0) {
x[i] <- found %>%
slice(1) %>%
pull(bactid)
} else {
x[i] <- ""
}
}
x
}

Binary file not shown.

View File

@ -28,7 +28,7 @@ clipboard_export(x, sep = "\\t", dec = ".", na = "", header = TRUE,
\item{na}{the string to use for missing values in the data.}
\item{startrow}{\emph{n}th row to start importing from. For \code{clipboard_import}, when \code{header = TRUE} the import will start on row \code{startrow} \emph{below} the header.}
\item{startrow}{\emph{n}th row to start importing from. When \code{header = TRUE}, the import will start on row \code{startrow} \emph{below} the header.}
\item{as_vector}{a logical value indicating whether data consisting of only one column should be imported as vector using \code{\link[dplyr]{pull}}. This will strip off the header.}
@ -41,10 +41,28 @@ clipboard_export(x, sep = "\\t", dec = ".", na = "", header = TRUE,
data.frame
}
\description{
These are helper functions around \code{\link{read.table}} and \code{\link{write.table}} to import from and export to clipboard, with support for Windows, Linux and macOS. The data will be read and written as tab-separated by default, which makes it possible to copy and paste from other software like Excel and SPSS without further transformation.
These are helper functions around \code{\link{read.table}} and \code{\link{write.table}} to import from and export to clipboard with support for Windows, Linux and macOS. The data will be read and written as tab-separated by default, which makes it possible to copy and paste from other software like Excel and SPSS without further transformation. See Details for an example.
}
\details{
For \code{clipboard_export}, the reserved clipboard size for exporting will be set automatically to 125\% of the object size of \code{x}. This way, it is possible to export data with thousands of rows as the only limit will be your system's RAM.
For \code{clipboard_export()}, the reserved clipboard size for exporting will be set to 125\% of the object size of \code{x}. This way, it is possible to export data with thousands of rows as the only limit will be your system's RAM.
Example for copying from Excel:
\if{html}{
\out{<div style="text-align: left">}\figure{Excel_copy.png}\out{</div>}
}
\if{latex}{
\out{\begin{flushleft}}\figure{Excel_copy.png}\out{\end{flushleft}}
}
\cr
And pasting in R: \cr \cr
\code{> data <- clipboard_import()} \cr
\code{> data} \cr
\if{html}{
\out{<div style="text-align: left">}\figure{Excel_paste.png}\out{</div>}
}
\if{latex}{
\out{\begin{flushleft}}\figure{Excel_paste.png}\out{\end{flushleft}}
}
}
\keyword{clipboard}
\keyword{clipboard_export}

BIN
man/figures/Excel_copy.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.2 KiB

BIN
man/figures/Excel_paste.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.6 KiB

View File

@ -1,5 +1,5 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/first_isolates.R
% Please edit documentation in R/atc.R
\name{guess_bactid}
\alias{guess_bactid}
\title{Find bacteria ID based on genus/species}

View File

@ -14,3 +14,18 @@ test_that("abname works", {
expect_equal(abname("amox", from = 'molis'), "Amoxicillin")
expect_equal(abname("J01CA04", from = 'atc'), "Amoxicillin")
})
test_that("guess_bactid works", {
  # vectorised input returns one ID per element, in order
  expect_identical(
    guess_bactid(c("E. coli", "H. influenzae")),
    c("ESCCOL", "HAEINF")
  )

  # full names and descriptive names
  expect_equal(guess_bactid("Escherichia coli"), "ESCCOL")
  expect_equal(guess_bactid("Negative rods"), "GNR")

  # every supported spelling of S. aureus maps to the same ID
  s_aureus_inputs <- c(
    "stau",
    "STAU",
    "staaur",
    "S. aureus",
    "S aureus",
    "Staphylococcus aureus",
    "MRSA",
    "VISA"
  )
  expect_equal(
    guess_bactid(s_aureus_inputs),
    rep("STAAUR", length(s_aureus_inputs))
  )
})

View File

@ -7,11 +7,6 @@ test_that("keyantibiotics work", {
expect_false(key_antibiotics_equal("SSS", "SIS", ignore_I = FALSE))
})
test_that("guess_bactid works", {
  # the abbreviated and the full name must resolve to the same E. coli ID
  for (input in c("E. coli", "Escherichia coli")) {
    expect_equal(guess_bactid(input), "ESCCOL")
  }
})
test_that("first isolates work", {
# septic_patients contains 1960 out of 2000 first isolates
#septic_ptns <- septic_patients