1
0
mirror of https://github.com/msberends/AMR.git synced 2025-07-09 04:02:19 +02:00

(v0.9.0.9020) as.mo() improvement

This commit is contained in:
2020-02-09 22:04:29 +01:00
parent 4e1c16c5a5
commit c7a76ba713
35 changed files with 70179 additions and 70116 deletions

12
R/mo.R
View File

@ -57,7 +57,7 @@
#'
#' Values that cannot be coerced will be considered 'unknown' and will get the MO code `UNKNOWN`.
#'
#' Use the [`mo_property_*`][mo_property()] functions to get properties based on the returned code, see Examples.
#' Use the [`mo_*`][mo_property()] functions to get properties based on the returned code, see Examples.
#'
#' The algorithm uses data from the Catalogue of Life (see below) and from one other source (see [microorganisms]).
#'
@ -67,7 +67,7 @@
#' 2. Taxonomic kingdom: the function starts with determining Bacteria, then Fungi, then Protozoa, then others;
#' 3. Breakdown of input values to identify possible matches.
#'
#' This will lead to the effect that e.g. `"E. coli"` (a highly prevalent microorganism found in humans) will return the microbial ID of *Escherichia coli* and not *Entamoeba coli* (a less prevalent microorganism in humans), although the latter would alphabetically come first.
#' This will lead to the effect that e.g. `"E. coli"` (a microorganism highly prevalent in humans) will return the microbial ID of *Escherichia coli* and not *Entamoeba coli* (a microorganism less prevalent in humans), although the latter would alphabetically come first.
#'
#' ## Coping with uncertain results
#'
@ -456,6 +456,8 @@ exec_as.mo <- function(x,
x <- gsub("fungus[ph|f]rya", "fungiphrya", x)
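# replace every run of characters other than letters, digits, dots, slashes, spaces and hyphens by a single space (so that e.g. "E. coli" keeps its dot and space)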
x <- trimws(gsub("[^.a-zA-Z0-9/ \\-]+", " ", x))
# but make sure that dots are followed by a space
x <- gsub("[.] ?", ". ", x)
# replace minus by a space
x <- gsub("-+", " ", x)
# replace hemolytic by haemolytic
@ -1160,7 +1162,7 @@ exec_as.mo <- function(x,
if (isTRUE(debug)) {
cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (6) try to strip off half an element from end and check the remains\n")
}
x_strip <- a.x_backup %>% strsplit(" ") %>% unlist()
x_strip <- a.x_backup %>% strsplit("[ .]") %>% unlist()
if (length(x_strip) > 1) {
for (i in seq_len(length(x_strip) - 1)) {
lastword <- x_strip[length(x_strip) - i + 1]
@ -1250,7 +1252,7 @@ exec_as.mo <- function(x,
if (isTRUE(debug)) {
cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (9) try to strip off one element from start and check the remains (only allow >= 2-part name outcome)\n")
}
x_strip <- a.x_backup %>% strsplit(" ") %>% unlist()
x_strip <- a.x_backup %>% strsplit("[ .]") %>% unlist()
if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) {
for (i in 2:(length(x_strip))) {
x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ")
@ -1288,7 +1290,7 @@ exec_as.mo <- function(x,
if (isTRUE(debug)) {
cat("\n[ UNCERTAINTY LEVEL", now_checks_for_uncertainty_level, "] (10) try to strip off one element from start and check the remains (any text size)\n")
}
x_strip <- a.x_backup %>% strsplit(" ") %>% unlist()
x_strip <- a.x_backup %>% strsplit("[ .]") %>% unlist()
if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) {
for (i in 2:(length(x_strip))) {
x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ")