From d15bb2ab3ff06ab18a77f7c06fcd61ff4b372eaf Mon Sep 17 00:00:00 2001
From: "Matthijs S. Berends" <m.s.berends@umcg.nl>
Date: Mon, 5 Nov 2018 15:30:49 +0100
Subject: [PATCH] read_4D

---
 NAMESPACE                        |   1 +
 NEWS.md                          |   1 +
 R/read_4d.R                      | 134 +++++++++++++++++++++++++++++++
 README.md                        |  20 +++--
 man/read_4D.Rd                   | 104 ++++++++++++++++++++++++
 tests/testthat/test-get_locale.R |   2 +-
 6 files changed, 256 insertions(+), 6 deletions(-)
 create mode 100644 R/read_4d.R
 create mode 100644 man/read_4D.Rd

diff --git a/NAMESPACE b/NAMESPACE
index d9f80425..25d69444 100755
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -116,6 +116,7 @@ export(portion_S)
 export(portion_SI)
 export(portion_df)
 export(ratio)
+export(read_4D)
 export(resistance_predict)
 export(right_join_microorganisms)
 export(rsi)
diff --git a/NEWS.md b/NEWS.md
index dbd0c954..ec6860a9 100755
--- a/NEWS.md
+++ b/NEWS.md
@@ -5,6 +5,7 @@
 * Function `count_all` to get all available isolates (that like all `portion_*` and `count_*` functions also supports `summarise` and `group_by`), the old `n_rsi` is now an alias of `count_all`
 * Function `get_locale` to determine language for language-dependent output for some `mo_*` functions. This is now the default value for their `language` parameter, by which the system language will be used at default.
 * Data sets `microorganismsDT`, `microorganisms.prevDT`, `microorganisms.unprevDT` and `microorganisms.oldDT` to improve the speed of `as.mo`. They are for reference only, since they are primarily for internal use of `as.mo`.
+* Function `read_4D` to read from the 4D database of the MMB department of the UMCG
 
 #### Changed
 * Big changes to the `EUCAST_rules` function:
diff --git a/R/read_4d.R b/R/read_4d.R
new file mode 100644
index 00000000..bdf1bf2b
--- /dev/null
+++ b/R/read_4d.R
@@ -0,0 +1,134 @@
+# ==================================================================== #
+# TITLE                                                                #
+# Antimicrobial Resistance (AMR) Analysis                              #
+#                                                                      #
+# AUTHORS                                                              #
+# Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl)           #
+#                                                                      #
+# LICENCE                                                              #
+# This program is free software; you can redistribute it and/or modify #
+# it under the terms of the GNU General Public License version 2.0,    #
+# as published by the Free Software Foundation.                        #
+#                                                                      #
+# This program is distributed in the hope that it will be useful,      #
+# but WITHOUT ANY WARRANTY; without even the implied warranty of       #
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        #
+# GNU General Public License for more details.                         #
+# ==================================================================== #
+
+#' Read data from 4D database
+#'
+#' This function is only useful for the MMB department of the UMCG. Use this function to \strong{import data by just defining the \code{file} parameter}. It will automatically transform birth dates and calculate patients age, translate the data set to English, transform the \code{mo} with \code{\link{as.mo}} and transform all antimicrobial columns with \code{\link{as.rsi}}.
+#' @inheritParams utils::read.table
+#' @export
+read_4D <- function(file,
+                    header = TRUE,
+                    sep = "\t",
+                    quote = "\"'",
+                    dec = ",",
+                    na.strings = c("NA", "", "."),
+                    skip = 2,
+                    check.names = TRUE,
+                    strip.white = TRUE,
+                    fill = TRUE,
+                    blank.lines.skip = TRUE,
+                    stringsAsFactors = FALSE,
+                    fileEncoding = "UTF-8",
+                    encoding = "UTF-8") {
+
+  data_4D <- utils::read.table(file = file,
+                               header = header,
+                               sep = sep,
+                               quote = quote,
+                               dec = dec,
+                               na.strings = na.strings,
+                               skip = skip,
+                               check.names = check.names,
+                               strip.white = strip.white,
+                               fill = fill,
+                               blank.lines.skip = blank.lines.skip,
+                               stringsAsFactors = stringsAsFactors,
+                               fileEncoding = fileEncoding,
+                               encoding = encoding)
+
+  # helper functions
+  to_date_4D <- function(x) {
+    date_regular <- as.Date(x, format = "%d-%m-%y")
+    posixlt <- as.POSIXlt(date_regular)
+    # born after today will be born 100 years ago
+    # based on https://stackoverflow.com/a/3312971/4575331
+    posixlt[date_regular > Sys.Date()]$year <- posixlt[date_regular > Sys.Date()]$year - 100
+    as.Date(posixlt)
+  }
+  to_age_4D <- function(from, to) {
+    from_lt = as.POSIXlt(from)
+    to_lt = as.POSIXlt(to)
+
+    age = to_lt$year - from_lt$year
+
+    ifelse(to_lt$mon < from_lt$mon |
+             (to_lt$mon == from_lt$mon & to_lt$mday < from_lt$mday),
+           age - 1, age)
+  }
+
+  colnames(data_4D) <- tolower(colnames(data_4D))
+  if (all(c("afnamedat", "gebdatum") %in% colnames(data_4D))) {
+    # add age
+    data_4D$age <- NA_integer_
+  }
+  cols_wanted <- c("patientnr", "gebdatum", "age", "mv", "monsternr", "afnamedat", "bepaling",
+                   "afd.", "spec", "mat", "matbijz.",  "mocode",
+                   "amfo", "amox", "anid", "azit", "casp", "cecl", "cefe", "cfcl",
+                   "cfot", "cfox", "cfta", "cftr", "cfur", "chlo", "cipr", "clin",
+                   "cocl", "ctta", "dapt", "doxy", "eryt", "fluo", "fluz", "fosf",
+                   "fusi", "gehi", "gent", "imip", "kana", "levo", "line", "mero",
+                   "metr", "mico", "mino", "moxi", "mupi", "nali", "nitr", "norf",
+                   "oxac", "peni", "pipe", "pita", "poly", "posa", "quda", "rifa",
+                   "spat", "teic", "tige", "tobr", "trim", "trsu", "vana", "vanb",
+                   "vanc", "vori")
+  # this ones actually exist
+  cols_wanted <- cols_wanted[cols_wanted %in% colnames(data_4D)]
+  # order of columns
+  data_4D <- data_4D[, cols_wanted]
+
+  # rename of columns
+  colnames(data_4D) <- gsub("patientnr", "patient_id", colnames(data_4D), fixed = TRUE)
+  colnames(data_4D) <- gsub("gebdatum", "date_birth", colnames(data_4D), fixed = TRUE)
+  colnames(data_4D) <- gsub("mv", "gender", colnames(data_4D), fixed = TRUE)
+  colnames(data_4D) <- gsub("monsternr", "sample_id", colnames(data_4D), fixed = TRUE)
+  colnames(data_4D) <- gsub("afnamedat", "date_received", colnames(data_4D), fixed = TRUE)
+  colnames(data_4D) <- gsub("bepaling", "sample_test", colnames(data_4D), fixed = TRUE)
+  colnames(data_4D) <- gsub("afd.", "department", colnames(data_4D), fixed = TRUE)
+  colnames(data_4D) <- gsub("spec", "specialty", colnames(data_4D), fixed = TRUE)
+  colnames(data_4D) <- gsub("matbijz.", "specimen_type", colnames(data_4D), fixed = TRUE)
+  colnames(data_4D) <- gsub("mat", "specimen_group", colnames(data_4D), fixed = TRUE)
+  colnames(data_4D) <- gsub("mocode", "mo", colnames(data_4D), fixed = TRUE)
+
+  if ("date_birth" %in% colnames(data_4D)) {
+    data_4D$date_birth <- to_date_4D(data_4D$date_birth)
+
+  }
+  if ("date_received" %in% colnames(data_4D)) {
+    data_4D$date_received <- to_date_4D(data_4D$date_received)
+  }
+  if ("age" %in% colnames(data_4D)) {
+    data_4D$age <- to_age_4D(data_4D$date_birth, data_4D$date_received)
+  }
+  if ("gender" %in% colnames(data_4D)) {
+    data_4D$gender[data_4D$gender == "V"] <- "F"
+  }
+  if ("mo" %in% colnames(data_4D)) {
+    data_4D$mo <- as.mo(data_4D$mo)
+    # column right of mo is:
+    drug1 <- colnames(data_4D)[grep("^mo$", colnames(data_4D)) + 1]
+    if (!is.na(drug1)) {
+      # and last is:
+      drug_last <- colnames(data_4D)[length(data_4D)]
+      # transform those to rsi:
+      data_4D <- suppressWarnings(mutate_at(data_4D, vars(drug1:drug_last), as.rsi))
+    }
+  }
+
+  data_4D
+}
+
diff --git a/README.md b/README.md
index 6b8dc33a..be345a50 100755
--- a/README.md
+++ b/README.md
@@ -599,14 +599,24 @@ mo_fullname("CoNS", language = "en") # or just mo_fullname("CoNS") on an English
 mo_fullname("CoNS", language = "fr") # or just mo_fullname("CoNS") on a French system
 # "Staphylococcus à coagulase négative (CoNS)"
 
-microbenchmark(A = mo_fullname("CoNS", language = "en"),
-               B = mo_fullname("CoNS", language = "fr"),
+microbenchmark(en = mo_fullname("CoNS", language = "en"),
+               de = mo_fullname("CoNS", language = "de"),
+               nl = mo_fullname("CoNS", language = "nl"),
+               es = mo_fullname("CoNS", language = "es"),
+               it = mo_fullname("CoNS", language = "it"),
+               fr = mo_fullname("CoNS", language = "fr"),
+               pt = mo_fullname("CoNS", language = "pt"),
                times = 10,
                unit = "ms")
 # Unit: milliseconds
-#  expr       min       lq      mean    median        uq       max neval
-#     A  6.080733  6.33684  6.467129  6.493773  6.593926  6.963666    10
-#     B 14.076651 14.10452 14.446035 14.315893 14.636918 15.254106    10
+#  expr       min       lq      mean    median        uq      max neval
+#    en  6.093583  6.51724  6.555105  6.562986  6.630663  6.99698   100
+#    de 13.934874 14.35137 16.891587 14.462210 14.764658 43.63956   100
+#    nl 13.900092 14.34729 15.943268 14.424565 14.581535 43.76283   100
+#    es 13.833813 14.34596 14.574783 14.439757 14.653994 17.49168   100
+#    it 13.811883 14.36621 15.179060 14.453515 14.812359 43.64284   100
+#    fr 13.798683 14.37019 16.344731 14.468775 14.697610 48.62923   100
+#    pt 13.789674 14.36244 15.706321 14.443772 14.679905 44.76701   100
 ```
 
 Currently supported are German, Dutch, Spanish, Italian, French and Portuguese.
diff --git a/man/read_4D.Rd b/man/read_4D.Rd
new file mode 100644
index 00000000..03e64d56
--- /dev/null
+++ b/man/read_4D.Rd
@@ -0,0 +1,104 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/read_4d.R
+\name{read_4D}
+\alias{read_4D}
+\title{Read data from 4D database}
+\usage{
+read_4D(file, header = TRUE, sep = "\\t", quote = "\\"'",
+  dec = ",", na.strings = c("NA", "", "."), skip = 2,
+  check.names = TRUE, strip.white = TRUE, fill = TRUE,
+  blank.lines.skip = TRUE, stringsAsFactors = FALSE,
+  fileEncoding = "UTF-8", encoding = "UTF-8")
+}
+\arguments{
+\item{file}{the name of the file which the data are to be read from.
+    Each row of the table appears as one line of the file.  If it does
+    not contain an \emph{absolute} path, the file name is
+    \emph{relative} to the current working directory,
+    \code{\link{getwd}()}. Tilde-expansion is performed where supported.
+    This can be a compressed file (see \code{\link{file}}).
+
+    Alternatively, \code{file} can be a readable text-mode
+    \link{connection} (which will be opened for reading if
+    necessary, and if so \code{\link{close}}d (and hence destroyed) at
+    the end of the function call).  (If \code{\link{stdin}()} is used,
+    the prompts for lines may be somewhat confusing.  Terminate input
+    with a blank line or an EOF signal, \code{Ctrl-D} on Unix and
+    \code{Ctrl-Z} on Windows.  Any pushback on \code{stdin()} will be
+    cleared before return.)
+
+    \code{file} can also be a complete URL.  (For the supported URL
+    schemes, see the \sQuote{URLs} section of the help for
+    \code{\link{url}}.)
+  }
+
+\item{header}{a logical value indicating whether the file contains the
+    names of the variables as its first line.  If missing, the value is
+    determined from the file format: \code{header} is set to \code{TRUE}
+    if and only if the first row contains one fewer field than the
+    number of columns.}
+
+\item{sep}{the field separator character.  Values on each line of the
+    file are separated by this character.  If \code{sep = ""} (the
+    default for \code{read.table}) the separator is \sQuote{white space},
+    that is one or more spaces, tabs, newlines or carriage returns.}
+
+\item{quote}{the set of quoting characters. To disable quoting
+    altogether, use \code{quote = ""}.  See \code{\link{scan}} for the
+    behaviour on quotes embedded in quotes.  Quoting is only considered
+    for columns read as character, which is all of them unless
+    \code{colClasses} is specified.}
+
+\item{dec}{the character used in the file for decimal points.}
+
+\item{na.strings}{a character vector of strings which are to be
+    interpreted as \code{\link{NA}} values.  Blank fields are also
+    considered to be missing values in logical, integer, numeric and
+    complex fields.  Note that the test happens \emph{after} 
+    white space is stripped from the input, so \code{na.strings} 
+    values may need their own white space stripped in advance.}
+
+\item{skip}{integer: the number of lines of the data file to skip before
+    beginning to read data.}
+
+\item{check.names}{logical.  If \code{TRUE} then the names of the
+    variables in the data frame are checked to ensure that they are
+    syntactically valid variable names.  If necessary they are adjusted
+    (by \code{\link{make.names}}) so that they are, and also to ensure
+    that there are no duplicates.}
+
+\item{strip.white}{logical. Used only when \code{sep} has
+    been specified, and allows the stripping of leading and trailing
+    white space from unquoted \code{character} fields (\code{numeric} fields
+    are always stripped).  See \code{\link{scan}} for further details
+    (including the exact meaning of \sQuote{white space}),
+    remembering that the columns may include the row names.}
+
+\item{fill}{logical. If \code{TRUE} then in case the rows have unequal
+    length, blank fields are implicitly added.  See \sQuote{Details}.}
+
+\item{blank.lines.skip}{logical: if \code{TRUE} blank lines in the
+    input are ignored.}
+
+\item{stringsAsFactors}{logical: should character vectors be converted
+    to factors?  Note that this is overridden by \code{as.is} and
+    \code{colClasses}, both of which allow finer control.}
+
+\item{fileEncoding}{character string: if non-empty declares the
+    encoding used on a file (not a connection) so the character data can
+    be re-encoded.  See the \sQuote{Encoding} section of the help for
+    \code{\link{file}}, the \sQuote{R Data Import/Export Manual} and
+    \sQuote{Note}.
+  }
+
+\item{encoding}{encoding to be assumed for input strings.  It is
+    used to mark character strings as known to be in
+    Latin-1 or UTF-8 (see \code{\link{Encoding}}): it is not used to
+    re-encode the input, but allows \R to handle encoded strings in
+    their native encoding (if one of those two).  See \sQuote{Value}
+    and \sQuote{Note}.
+  }
+}
+\description{
+This function is only useful for the MMB department of the UMCG. Use this function to \strong{import data by just defining the \code{file} parameter}. It will automatically transform birth dates and calculate patients age, translate the data set to English, transform the \code{mo} with \code{\link{as.mo}} and transform all antimicrobial columns with \code{\link{as.rsi}}.
+}
diff --git a/tests/testthat/test-get_locale.R b/tests/testthat/test-get_locale.R
index e8fa9dbc..c31fd38b 100644
--- a/tests/testthat/test-get_locale.R
+++ b/tests/testthat/test-get_locale.R
@@ -9,7 +9,7 @@ test_that("get_locale works", {
   expect_identical(mo_fullname("CoNS", "nl"), "Coagulase-negatieve Staphylococcus (CNS)")
   expect_identical(mo_fullname("CoNS", "es"), "Staphylococcus coagulasa negativo (CoNS)")
   expect_identical(mo_fullname("CoNS", "it"), "Staphylococcus negativo coagulasi (CoNS)")
-  expect_identical(mo_fullname("CoNS", "fr"), "Staphylococcus à coagulase négative (CoNS)")
+  expect_identical(mo_fullname("CoNS", "fr"), "Staphylococcus \u00e0 coagulase n\u00e9gative (CoNS)")
   expect_identical(mo_fullname("CoNS", "pt"), "Staphylococcus coagulase negativo (CoNS)")
 
 })