From fe803f7279f42202207ef4259b6e452c549caf82 Mon Sep 17 00:00:00 2001
From: "Matthijs S. Berends" <m.s.berends@umcg.nl>
Date: Mon, 19 Mar 2018 21:23:21 +0100
Subject: [PATCH] use guess_bactid for GLIMS codes

---
 R/data.R             |  4 ++--
 R/first_isolates.R   | 14 ++++++++++----
 R/globals.R          |  1 +
 man/bactlist.umcg.Rd |  4 ++--
 man/first_isolate.Rd |  8 ++++----
 5 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/R/data.R b/R/data.R
index 5dfb1ca03..435f86f2f 100644
--- a/R/data.R
+++ b/R/data.R
@@ -85,14 +85,14 @@
 
 #' Translation table for UMCG with ~1100 microorganisms
 #'
-#' A dataset containing all bacteria codes of UMCG MMB. These codes can be joined to data with an ID from \code{\link{bactlist}$bactid}, using \code{\link{left_join_bactlist}}.
+#' A dataset containing all bacteria codes of UMCG MMB. These codes can be joined to data with an ID from \code{\link{bactlist}$bactid} (using \code{\link{left_join_bactlist}}). GLIMS codes can also be translated to valid \code{bactid}s with \code{\link{guess_bactid}}.
 #' @format A data.frame with 1090 observations and 2 variables:
 #' \describe{
 #'   \item{\code{mocode}}{Code of microorganism according to UMCG MMB}
 #'   \item{\code{bactid}}{Code of microorganism in \code{\link{bactlist}}}
 #' }
 #' @source MOLIS (LIS of Certe) - \url{https://www.certe.nl} \cr \cr GLIMS (LIS of UMCG) - \url{https://www.umcg.nl}
-#' @seealso \code{\link{bactlist}}
+#' @seealso \code{\link{guess_bactid}} \code{\link{bactlist}}
 "bactlist.umcg"
 
 #' Dataset with 2000 blood culture isolates of septic patients
diff --git a/R/first_isolates.R b/R/first_isolates.R
index c8d8bbe0b..677995113 100644
--- a/R/first_isolates.R
+++ b/R/first_isolates.R
@@ -41,10 +41,10 @@
 #'     To conduct an analysis of antimicrobial resistance, you should only include the first isolate of every patient per episode \href{https://www.ncbi.nlm.nih.gov/pubmed/17304462}{[1]}. If you would not do this, you could easily get an overestimate or underestimate of the resistance of an antibiotic. Imagine that a patient was admitted with an MRSA and that it was found in 5 different blood cultures the following week. The resistance percentage of oxacillin of all \emph{S. aureus} isolates would be overestimated, because you included this MRSA more than once. It would be \href{https://en.wikipedia.org/wiki/Selection_bias}{selection bias}.
 #'
 #'     \strong{DETERMINING WEIGHTED ISOLATES} \cr
-#'     \strong{1. Using \code{type = "keyantibiotics"} and parameter \code{ignore_I}} \cr
-#'     To determine weighted isolates, the difference between key antibiotics will be checked. Any difference from S to R (or vice versa) will (re)select an isolate as a first weighted isolate. With \code{ignore_I == FALSE}, also differences from I to S|R (or vice versa) will lead to this. This is a reliable and fast method. \cr
-#'     \strong{2. Using \code{type = "points"} and parameter \code{points_threshold}} \cr
-#'     To determine weighted isolates, difference between antimicrobial interpretations will be measured with points. A difference from I to S|R (or vice versa) means 0.5 points. A difference from S to R (or vice versa) means 1 point. When the sum of points exceeds \code{points_threshold}, an isolate will be (re)selected as a first weighted isolate. This method is being used by the Infection Prevention department (Dr M. Lokate) of the University Medical Center Groningen (UMCG).
+#'     \strong{1. Using} \code{type = "keyantibiotics"} \strong{and parameter} \code{ignore_I} \cr
+#'     To determine weighted isolates, the difference between key antibiotics will be checked. Any difference from S to R (or vice versa) will (re)select an isolate as a first weighted isolate. With \code{ignore_I = FALSE}, also differences from I to S|R (or vice versa) will lead to this. This is a reliable method and 30-35 times faster than method 2. \cr
+#'     \strong{2. Using} \code{type = "points"} \strong{and parameter} \code{points_threshold} \cr
+#'     To determine weighted isolates, the difference between antimicrobial interpretations will be measured with points. A difference from I to S|R (or vice versa) means 0.5 points; a difference from S to R (or vice versa) means 1 point. When the sum of points exceeds \code{points_threshold}, an isolate will be (re)selected as a first weighted isolate. This method is being used by the Infection Prevention department (Dr M. Lokate) of the University Medical Center Groningen (UMCG).
 #' @keywords isolate isolates first
 #' @export
 #' @importFrom dplyr arrange_at lag between row_number filter mutate arrange
@@ -676,6 +676,12 @@ guess_bactid <- function(x) {
       # try only genus, with 'species' attached
       found <- AMR::bactlist %>% filter(fullname %like% x_species[i])
     }
+    if (nrow(found) == 0) {
+      # search for GLIMS code
+      if (toupper(x.bak[i]) %in% toupper(AMR::bactlist.umcg$mocode)) {
+        found <- AMR::bactlist.umcg %>% filter(toupper(mocode) == toupper(x.bak[i]))
+      }
+    }
     if (nrow(found) == 0) {
       # try splitting of characters and then find ID
       # like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus
diff --git a/R/globals.R b/R/globals.R
index ece183b25..2eb98d09d 100644
--- a/R/globals.R
+++ b/R/globals.R
@@ -30,6 +30,7 @@ globalVariables(c('.',
                   'key_ab_lag',
                   'key_ab_other',
                   'mic',
+                  'mocode',
                   'n',
                   'other_pat_or_mo',
                   'patient_id',
diff --git a/man/bactlist.umcg.Rd b/man/bactlist.umcg.Rd
index 0e789c654..bc14b777f 100644
--- a/man/bactlist.umcg.Rd
+++ b/man/bactlist.umcg.Rd
@@ -16,9 +16,9 @@ MOLIS (LIS of Certe) - \url{https://www.certe.nl} \cr \cr GLIMS (LIS of UMCG) -
 bactlist.umcg
 }
 \description{
-A dataset containing all bacteria codes of UMCG MMB. These codes can be joined to data with an ID from \code{\link{bactlist}$bactid}, using \code{\link{left_join_bactlist}}.
+A dataset containing all bacteria codes of UMCG MMB. These codes can be joined to data with an ID from \code{\link{bactlist}$bactid} (using \code{\link{left_join_bactlist}}). GLIMS codes can also be translated to valid \code{bactid}s with \code{\link{guess_bactid}}.
 }
 \seealso{
-\code{\link{bactlist}}
+\code{\link{guess_bactid}} \code{\link{bactlist}}
 }
 \keyword{datasets}
diff --git a/man/first_isolate.Rd b/man/first_isolate.Rd
index 7083c81d1..f8054f891 100644
--- a/man/first_isolate.Rd
+++ b/man/first_isolate.Rd
@@ -59,10 +59,10 @@ Determine first (weighted) isolates of all microorganisms of every patient per e
     To conduct an analysis of antimicrobial resistance, you should only include the first isolate of every patient per episode \href{https://www.ncbi.nlm.nih.gov/pubmed/17304462}{[1]}. If you would not do this, you could easily get an overestimate or underestimate of the resistance of an antibiotic. Imagine that a patient was admitted with an MRSA and that it was found in 5 different blood cultures the following week. The resistance percentage of oxacillin of all \emph{S. aureus} isolates would be overestimated, because you included this MRSA more than once. It would be \href{https://en.wikipedia.org/wiki/Selection_bias}{selection bias}.
 
     \strong{DETERMINING WEIGHTED ISOLATES} \cr
-    \strong{1. Using \code{type = "keyantibiotics"} and parameter \code{ignore_I}} \cr
-    To determine weighted isolates, the difference between key antibiotics will be checked. Any difference from S to R (or vice versa) will (re)select an isolate as a first weighted isolate. With \code{ignore_I == FALSE}, also differences from I to S|R (or vice versa) will lead to this. This is a reliable and fast method. \cr
-    \strong{2. Using \code{type = "points"} and parameter \code{points_threshold}} \cr
-    To determine weighted isolates, difference between antimicrobial interpretations will be measured with points. A difference from I to S|R (or vice versa) means 0.5 points. A difference from S to R (or vice versa) means 1 point. When the sum of points exceeds \code{points_threshold}, an isolate will be (re)selected as a first weighted isolate. This method is being used by the Infection Prevention department (Dr M. Lokate) of the University Medical Center Groningen (UMCG).
+    \strong{1. Using} \code{type = "keyantibiotics"} \strong{and parameter} \code{ignore_I} \cr
+    To determine weighted isolates, the difference between key antibiotics will be checked. Any difference from S to R (or vice versa) will (re)select an isolate as a first weighted isolate. With \code{ignore_I = FALSE}, also differences from I to S|R (or vice versa) will lead to this. This is a reliable method and 30-35 times faster than method 2. \cr
+    \strong{2. Using} \code{type = "points"} \strong{and parameter} \code{points_threshold} \cr
+    To determine weighted isolates, the difference between antimicrobial interpretations will be measured with points. A difference from I to S|R (or vice versa) means 0.5 points; a difference from S to R (or vice versa) means 1 point. When the sum of points exceeds \code{points_threshold}, an isolate will be (re)selected as a first weighted isolate. This method is being used by the Infection Prevention department (Dr M. Lokate) of the University Medical Center Groningen (UMCG).
 }
 \examples{
 # septic_patients is a dataset available in the AMR package