(v0.9.0.9020) as.mo() improvement

2025-08-25 07:42:09 +02:00 · 2020-02-09 22:04:29 +01:00
parent 4e1c16c5a5
commit c7a76ba713
35 changed files with 70179 additions and 70116 deletions
--- a/R/data.R
+++ b/R/data.R
@@ -56,8 +56,8 @@
 #' ### Direct download
 #' These data sets are available as 'flat files' for use even without R - you can find the files here:
 #' 
-#' * <https://gitlab.com/msberends/AMR/blob/master/data-raw/antibiotics.R>
-#' * <https://gitlab.com/msberends/AMR/blob/master/data-raw/antivirals.R>
+#' * <https://gitlab.com/msberends/AMR/raw/master/data-raw/antibiotics.txt>
+#' * <https://gitlab.com/msberends/AMR/raw/master/data-raw/antivirals.txt>
 #' @source World Health Organization (WHO) Collaborating Centre for Drug Statistics Methodology (WHOCC): <https://www.whocc.no/atc_ddd_index/>
 #'
 #' WHONET 2019 software: <http://www.whonet.org/software.html>
@@ -98,7 +98,7 @@
 #'  ### Direct download
 #'  This data set is available as a 'flat file' for use even without R - you can find the file here:
 #' 
-#' * <https://gitlab.com/msberends/AMR/blob/master/data-raw/microorganisms.R>
+#' * <https://gitlab.com/msberends/AMR/raw/master/data-raw/microorganisms.txt>
 #' @section About the records from DSMZ (see source):
 #' Names of prokaryotes are defined as being validly published by the International Code of Nomenclature of Bacteria. Validly published are all names which are included in the Approved Lists of Bacterial Names and the names subsequently published in the International Journal of Systematic Bacteriology (IJSB) and, from January 2000, in the International Journal of Systematic and Evolutionary Microbiology (IJSEM) as original articles or in the validation lists.
 #'
@@ -135,8 +135,8 @@ catalogue_of_life <- list(

 #' Translation table for common microorganism codes
 #'
-#' A data set containing commonly used codes for microorganisms, from laboratory systems and WHONET. Define your own with [set_mo_source()].
-#' @format A [`data.frame`] with 5,433 observations and 2 variables:
+#' A data set containing commonly used codes for microorganisms, from laboratory systems and WHONET. Define your own with [set_mo_source()]. All of these codes are searched when using [as.mo()] and, consequently, all the [`mo_*`][mo_property()] functions.
+#' @format A [`data.frame`] with 5,450 observations and 2 variables:
 #' - `code`\cr Commonly used code of a microorganism
 #' - `mo`\cr ID of the microorganism in the [microorganisms] data set
 #' @inheritSection catalogue_of_life Catalogue of Life
--- a/R/mo.R
+++ b/R/mo.R
@@ -57,7 +57,7 @@
 #'
 #' Values that cannot be coerced will be considered 'unknown' and will get the MO code `UNKNOWN`.
 #'
-#' Use the [`mo_property_*`][mo_property()] functions to get properties based on the returned code, see Examples.
+#' Use the [`mo_*`][mo_property()] functions to get properties based on the returned code, see Examples.
 #'
 #' The algorithm uses data from the Catalogue of Life (see below) and from one other source (see [microorganisms]).
 #'
@@ -67,7 +67,7 @@
 #' 2. Taxonomic kingdom: the function starts with determining Bacteria, then Fungi, then Protozoa, then others;
 #' 3. Breakdown of input values to identify possible matches.
 #'
-#' This will lead to the effect that e.g. `"E. coli"` (a highly prevalent microorganism found in humans) will return the microbial ID of *Escherichia coli* and not *Entamoeba coli* (a less prevalent microorganism in humans), although the latter would alphabetically come first. 
+#' This will lead to the effect that e.g. `"E. coli"` (a microorganism highly prevalent in humans) will return the microbial ID of *Escherichia coli* and not *Entamoeba coli* (a microorganism less prevalent in humans), although the latter would alphabetically come first. 
 #' 
 #' ## Coping with uncertain results
 #' 
@@ -456,6 +456,8 @@ exec_as.mo <- function(x,
    x <- gsub("fungus[ph|f]rya", "fungiphrya", x)
    # remove non-text in case of "E. coli" except dots and spaces
    x <- trimws(gsub("[^.a-zA-Z0-9/ \\-]+", " ", x))
+    # but make sure that dots are followed by a space
+    x <- gsub("[.] ?", ". ", x)
    # replace minus by a space
    x <- gsub("-+", " ", x)
    # replace hemolytic by haemolytic
@@ -1160,7 +1162,7 @@ exec_as.mo <- function(x,
            if (isTRUE(debug)) {
              cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (6) try to strip off half an element from end and check the remains\n")
            }
-            x_strip <- a.x_backup %>% strsplit(" ") %>% unlist()
+            x_strip <- a.x_backup %>% strsplit("[ .]") %>% unlist()
            if (length(x_strip) > 1) {
              for (i in seq_len(length(x_strip) - 1)) {
                lastword <- x_strip[length(x_strip) - i + 1]
@@ -1250,7 +1252,7 @@ exec_as.mo <- function(x,
            if (isTRUE(debug)) {
              cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome)\n")
            }
-            x_strip <- a.x_backup %>% strsplit(" ") %>% unlist()
+            x_strip <- a.x_backup %>% strsplit("[ .]") %>% unlist()
            if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) {
              for (i in 2:(length(x_strip))) {
                x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ")
@@ -1288,7 +1290,7 @@ exec_as.mo <- function(x,
            if (isTRUE(debug)) {
              cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (10) try to strip off one element from start and check the remains (any text size)\n")
            }
-            x_strip <- a.x_backup %>% strsplit(" ") %>% unlist()
+            x_strip <- a.x_backup %>% strsplit("[ .]") %>% unlist()
            if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) {
              for (i in 2:(length(x_strip))) {
                x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ")