From d15bb2ab3ff06ab18a77f7c06fcd61ff4b372eaf Mon Sep 17 00:00:00 2001 From: "Matthijs S. Berends" Date: Mon, 5 Nov 2018 15:30:49 +0100 Subject: [PATCH] read_4D --- NAMESPACE | 1 + NEWS.md | 1 + R/read_4d.R | 134 +++++++++++++++++++++++++++++++ README.md | 20 +++-- man/read_4D.Rd | 104 ++++++++++++++++++++++++ tests/testthat/test-get_locale.R | 2 +- 6 files changed, 256 insertions(+), 6 deletions(-) create mode 100644 R/read_4d.R create mode 100644 man/read_4D.Rd diff --git a/NAMESPACE b/NAMESPACE index d9f80425..25d69444 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -116,6 +116,7 @@ export(portion_S) export(portion_SI) export(portion_df) export(ratio) +export(read_4D) export(resistance_predict) export(right_join_microorganisms) export(rsi) diff --git a/NEWS.md b/NEWS.md index dbd0c954..ec6860a9 100755 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,7 @@ * Function `count_all` to get all available isolates (that like all `portion_*` and `count_*` functions also supports `summarise` and `group_by`), the old `n_rsi` is now an alias of `count_all` * Function `get_locale` to determine language for language-dependent output for some `mo_*` functions. This is now the default value for their `language` parameter, by which the system language will be used at default. * Data sets `microorganismsDT`, `microorganisms.prevDT`, `microorganisms.unprevDT` and `microorganisms.oldDT` to improve the speed of `as.mo`. They are for reference only, since they are primarily for internal use of `as.mo`. +* Function `read_4D` to read from the 4D database of the MMB department of the UMCG #### Changed * Big changes to the `EUCAST_rules` function: diff --git a/R/read_4d.R b/R/read_4d.R new file mode 100644 index 00000000..bdf1bf2b --- /dev/null +++ b/R/read_4d.R @@ -0,0 +1,134 @@ +# ==================================================================== # +# TITLE # +# Antimicrobial Resistance (AMR) Analysis # +# # +# AUTHORS # +# Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) # +# # +# LICENCE # +# This program is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License version 2.0, # +# as published by the Free Software Foundation. # +# # +# This program is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# ==================================================================== # + +#' Read data from 4D database +#' +#' This function is only useful for the MMB department of the UMCG. Use this function to \strong{import data by just defining the \code{file} parameter}. It will automatically transform birth dates and calculate patients age, translate the data set to English, transform the \code{mo} with \code{\link{as.mo}} and transform all antimicrobial columns with \code{\link{as.rsi}}. +#' @inheritParams utils::read.table +#' @export +read_4D <- function(file, + header = TRUE, + sep = "\t", + quote = "\"'", + dec = ",", + na.strings = c("NA", "", "."), + skip = 2, + check.names = TRUE, + strip.white = TRUE, + fill = TRUE, + blank.lines.skip = TRUE, + stringsAsFactors = FALSE, + fileEncoding = "UTF-8", + encoding = "UTF-8") { + + data_4D <- utils::read.table(file = file, + header = header, + sep = sep, + quote = quote, + dec = dec, + na.strings = na.strings, + skip = skip, + check.names = check.names, + strip.white = strip.white, + fill = fill, + blank.lines.skip = blank.lines.skip, + stringsAsFactors = stringsAsFactors, + fileEncoding = fileEncoding, + encoding = encoding) + + # helper functions + to_date_4D <- function(x) { + date_regular <- as.Date(x, format = "%d-%m-%y") + posixlt <- as.POSIXlt(date_regular) + # born after today will be born 100 years ago + # based on https://stackoverflow.com/a/3312971/4575331 + posixlt[date_regular > Sys.Date()]$year <- posixlt[date_regular > Sys.Date()]$year - 100 + as.Date(posixlt) + } + to_age_4D <- function(from, to) { + from_lt = as.POSIXlt(from) + to_lt = as.POSIXlt(to) + + age = to_lt$year - from_lt$year + + ifelse(to_lt$mon < from_lt$mon | + (to_lt$mon == from_lt$mon & to_lt$mday < from_lt$mday), + age - 1, age) + } + + colnames(data_4D) <- tolower(colnames(data_4D)) + if (all(c("afnamedat", "gebdatum") %in% colnames(data_4D))) { + # add age + data_4D$age <- NA_integer_ + } + cols_wanted <- c("patientnr", "gebdatum", "age", "mv", "monsternr", "afnamedat", "bepaling", + "afd.", "spec", "mat", "matbijz.", "mocode", + "amfo", "amox", "anid", "azit", "casp", "cecl", "cefe", "cfcl", + "cfot", "cfox", "cfta", "cftr", "cfur", "chlo", "cipr", "clin", + "cocl", "ctta", "dapt", "doxy", "eryt", "fluo", "fluz", "fosf", + "fusi", "gehi", "gent", "imip", "kana", "levo", "line", "mero", + "metr", "mico", "mino", "moxi", "mupi", "nali", "nitr", "norf", + "oxac", "peni", "pipe", "pita", "poly", "posa", "quda", "rifa", + "spat", "teic", "tige", "tobr", "trim", "trsu", "vana", "vanb", + "vanc", "vori") + # this ones actually exist + cols_wanted <- cols_wanted[cols_wanted %in% colnames(data_4D)] + # order of columns + data_4D <- data_4D[, cols_wanted] + + # rename of columns + colnames(data_4D) <- gsub("patientnr", "patient_id", colnames(data_4D), fixed = TRUE) + colnames(data_4D) <- gsub("gebdatum", "date_birth", colnames(data_4D), fixed = TRUE) + colnames(data_4D) <- gsub("mv", "gender", colnames(data_4D), fixed = TRUE) + colnames(data_4D) <- gsub("monsternr", "sample_id", colnames(data_4D), fixed = TRUE) + colnames(data_4D) <- gsub("afnamedat", "date_received", colnames(data_4D), fixed = TRUE) + colnames(data_4D) <- gsub("bepaling", "sample_test", colnames(data_4D), fixed = TRUE) + colnames(data_4D) <- gsub("afd.", "department", colnames(data_4D), fixed = TRUE) + colnames(data_4D) <- gsub("spec", "specialty", colnames(data_4D), fixed = TRUE) + colnames(data_4D) <- gsub("matbijz.", "specimen_type", colnames(data_4D), fixed = TRUE) + colnames(data_4D) <- gsub("mat", "specimen_group", colnames(data_4D), fixed = TRUE) + colnames(data_4D) <- gsub("mocode", "mo", colnames(data_4D), fixed = TRUE) + + if ("date_birth" %in% colnames(data_4D)) { + data_4D$date_birth <- to_date_4D(data_4D$date_birth) + + } + if ("date_received" %in% colnames(data_4D)) { + data_4D$date_received <- to_date_4D(data_4D$date_received) + } + if ("age" %in% colnames(data_4D)) { + data_4D$age <- to_age_4D(data_4D$date_birth, data_4D$date_received) + } + if ("gender" %in% colnames(data_4D)) { + data_4D$gender[data_4D$gender == "V"] <- "F" + } + if ("mo" %in% colnames(data_4D)) { + data_4D$mo <- as.mo(data_4D$mo) + # column right of mo is: + drug1 <- colnames(data_4D)[grep("^mo$", colnames(data_4D)) + 1] + if (!is.na(drug1)) { + # and last is: + drug_last <- colnames(data_4D)[length(data_4D)] + # transform those to rsi: + data_4D <- suppressWarnings(mutate_at(data_4D, vars(drug1:drug_last), as.rsi)) + } + } + + data_4D +} + diff --git a/README.md b/README.md index 6b8dc33a..be345a50 100755 --- a/README.md +++ b/README.md @@ -599,14 +599,24 @@ mo_fullname("CoNS", language = "en") # or just mo_fullname("CoNS") on an English mo_fullname("CoNS", language = "fr") # or just mo_fullname("CoNS") on a French system # "Staphylococcus à coagulase négative (CoNS)" -microbenchmark(A = mo_fullname("CoNS", language = "en"), - B = mo_fullname("CoNS", language = "fr"), +microbenchmark(en = mo_fullname("CoNS", language = "en"), + de = mo_fullname("CoNS", language = "de"), + nl = mo_fullname("CoNS", language = "nl"), + es = mo_fullname("CoNS", language = "es"), + it = mo_fullname("CoNS", language = "it"), + fr = mo_fullname("CoNS", language = "fr"), + pt = mo_fullname("CoNS", language = "pt"), times = 10, unit = "ms") # Unit: milliseconds -# expr min lq mean median uq max neval -# A 6.080733 6.33684 6.467129 6.493773 6.593926 6.963666 10 -# B 14.076651 14.10452 14.446035 14.315893 14.636918 15.254106 10 +# expr min lq mean median uq max neval +# en 6.093583 6.51724 6.555105 6.562986 6.630663 6.99698 100 +# de 13.934874 14.35137 16.891587 14.462210 14.764658 43.63956 100 +# nl 13.900092 14.34729 15.943268 14.424565 14.581535 43.76283 100 +# es 13.833813 14.34596 14.574783 14.439757 14.653994 17.49168 100 +# it 13.811883 14.36621 15.179060 14.453515 14.812359 43.64284 100 +# fr 13.798683 14.37019 16.344731 14.468775 14.697610 48.62923 100 +# pt 13.789674 14.36244 15.706321 14.443772 14.679905 44.76701 100 ``` Currently supported are German, Dutch, Spanish, Italian, French and Portuguese. diff --git a/man/read_4D.Rd b/man/read_4D.Rd new file mode 100644 index 00000000..03e64d56 --- /dev/null +++ b/man/read_4D.Rd @@ -0,0 +1,104 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read_4d.R +\name{read_4D} +\alias{read_4D} +\title{Read data from 4D database} +\usage{ +read_4D(file, header = TRUE, sep = "\\t", quote = "\\"'", + dec = ",", na.strings = c("NA", "", "."), skip = 2, + check.names = TRUE, strip.white = TRUE, fill = TRUE, + blank.lines.skip = TRUE, stringsAsFactors = FALSE, + fileEncoding = "UTF-8", encoding = "UTF-8") +} +\arguments{ +\item{file}{the name of the file which the data are to be read from. + Each row of the table appears as one line of the file. If it does + not contain an \emph{absolute} path, the file name is + \emph{relative} to the current working directory, + \code{\link{getwd}()}. Tilde-expansion is performed where supported. + This can be a compressed file (see \code{\link{file}}). + + Alternatively, \code{file} can be a readable text-mode + \link{connection} (which will be opened for reading if + necessary, and if so \code{\link{close}}d (and hence destroyed) at + the end of the function call). (If \code{\link{stdin}()} is used, + the prompts for lines may be somewhat confusing. Terminate input + with a blank line or an EOF signal, \code{Ctrl-D} on Unix and + \code{Ctrl-Z} on Windows. Any pushback on \code{stdin()} will be + cleared before return.) + + \code{file} can also be a complete URL. (For the supported URL + schemes, see the \sQuote{URLs} section of the help for + \code{\link{url}}.) + } + +\item{header}{a logical value indicating whether the file contains the + names of the variables as its first line. If missing, the value is + determined from the file format: \code{header} is set to \code{TRUE} + if and only if the first row contains one fewer field than the + number of columns.} + +\item{sep}{the field separator character. Values on each line of the + file are separated by this character. If \code{sep = ""} (the + default for \code{read.table}) the separator is \sQuote{white space}, + that is one or more spaces, tabs, newlines or carriage returns.} + +\item{quote}{the set of quoting characters. To disable quoting + altogether, use \code{quote = ""}. See \code{\link{scan}} for the + behaviour on quotes embedded in quotes. Quoting is only considered + for columns read as character, which is all of them unless + \code{colClasses} is specified.} + +\item{dec}{the character used in the file for decimal points.} + +\item{na.strings}{a character vector of strings which are to be + interpreted as \code{\link{NA}} values. Blank fields are also + considered to be missing values in logical, integer, numeric and + complex fields. Note that the test happens \emph{after} + white space is stripped from the input, so \code{na.strings} + values may need their own white space stripped in advance.} + +\item{skip}{integer: the number of lines of the data file to skip before + beginning to read data.} + +\item{check.names}{logical. If \code{TRUE} then the names of the + variables in the data frame are checked to ensure that they are + syntactically valid variable names. If necessary they are adjusted + (by \code{\link{make.names}}) so that they are, and also to ensure + that there are no duplicates.} + +\item{strip.white}{logical. Used only when \code{sep} has + been specified, and allows the stripping of leading and trailing + white space from unquoted \code{character} fields (\code{numeric} fields + are always stripped). See \code{\link{scan}} for further details + (including the exact meaning of \sQuote{white space}), + remembering that the columns may include the row names.} + +\item{fill}{logical. If \code{TRUE} then in case the rows have unequal + length, blank fields are implicitly added. See \sQuote{Details}.} + +\item{blank.lines.skip}{logical: if \code{TRUE} blank lines in the + input are ignored.} + +\item{stringsAsFactors}{logical: should character vectors be converted + to factors? Note that this is overridden by \code{as.is} and + \code{colClasses}, both of which allow finer control.} + +\item{fileEncoding}{character string: if non-empty declares the + encoding used on a file (not a connection) so the character data can + be re-encoded. See the \sQuote{Encoding} section of the help for + \code{\link{file}}, the \sQuote{R Data Import/Export Manual} and + \sQuote{Note}. + } + +\item{encoding}{encoding to be assumed for input strings. It is + used to mark character strings as known to be in + Latin-1 or UTF-8 (see \code{\link{Encoding}}): it is not used to + re-encode the input, but allows \R to handle encoded strings in + their native encoding (if one of those two). See \sQuote{Value} + and \sQuote{Note}. + } +} +\description{ +This function is only useful for the MMB department of the UMCG. Use this function to \strong{import data by just defining the \code{file} parameter}. It will automatically transform birth dates and calculate patients age, translate the data set to English, transform the \code{mo} with \code{\link{as.mo}} and transform all antimicrobial columns with \code{\link{as.rsi}}. +} diff --git a/tests/testthat/test-get_locale.R b/tests/testthat/test-get_locale.R index e8fa9dbc..c31fd38b 100644 --- a/tests/testthat/test-get_locale.R +++ b/tests/testthat/test-get_locale.R @@ -9,7 +9,7 @@ test_that("get_locale works", { expect_identical(mo_fullname("CoNS", "nl"), "Coagulase-negatieve Staphylococcus (CNS)") expect_identical(mo_fullname("CoNS", "es"), "Staphylococcus coagulasa negativo (CoNS)") expect_identical(mo_fullname("CoNS", "it"), "Staphylococcus negativo coagulasi (CoNS)") - expect_identical(mo_fullname("CoNS", "fr"), "Staphylococcus à coagulase négative (CoNS)") + expect_identical(mo_fullname("CoNS", "fr"), "Staphylococcus \u00e0 coagulase n\u00e9gative (CoNS)") expect_identical(mo_fullname("CoNS", "pt"), "Staphylococcus coagulase negativo (CoNS)") })