1
0
mirror of https://github.com/msberends/AMR.git synced 2024-12-25 18:06:12 +01:00

(v0.7.1.9038) mo algorithm inprovements

This commit is contained in:
dr. M.S. (Matthijs) Berends 2019-08-12 19:07:15 +02:00
parent c44c5e3913
commit 5f2733349e
10 changed files with 226 additions and 130 deletions

View File

@ -1,5 +1,5 @@
Package: AMR
Version: 0.7.1.9037
Version: 0.7.1.9038
Date: 2019-08-12
Title: Antimicrobial Resistance Analysis
Authors@R: c(

View File

@ -1,4 +1,4 @@
# AMR 0.7.1.9037
# AMR 0.7.1.9038
### Breaking
* Function `freq()` has moved to a new package, [`clean`](https://github.com/msberends/clean) ([CRAN link](https://cran.r-project.org/package=clean)). Creating frequency tables is actually not the scope of this package (never was) and this function has matured a lot over the last two years. Therefore, a new package was created for data cleaning and checking and it perfectly fits the `freq()` function. The [`clean`](https://github.com/msberends/clean) package is available on CRAN and will be installed automatically when updating the `AMR` package, that now imports it. In a later stage, the `skewness()` and `kurtosis()` functions will be moved to the `clean` package too.

142
R/mo.R
View File

@ -286,6 +286,7 @@ is.mo <- function(x) {
#' @importFrom crayon magenta red blue silver italic
# param property a column name of AMR::microorganisms
# param initial_search logical - is FALSE when coming from uncertain tries, which uses exec_as.mo internally too
# param dyslexia_mode logical - also check for characters that resemble others
# param force_mo_history logical - whether found result must be saved with set_mo_history (default FALSE on non-interactive sessions)
# param debug logical - show different lookup texts while searching
exec_as.mo <- function(x,
@ -295,6 +296,7 @@ exec_as.mo <- function(x,
reference_df = get_mo_source(),
property = "mo",
initial_search = TRUE,
dyslexia_mode = FALSE,
force_mo_history = FALSE,
debug = FALSE) {
@ -490,7 +492,7 @@ exec_as.mo <- function(x,
# remove genus as first word
x <- gsub("^Genus ", "", x)
# allow characters that resemble others ----
if (initial_search == FALSE) {
if (dyslexia_mode == TRUE) {
x <- tolower(x)
x <- gsub("[iy]+", "[iy]+", x)
x <- gsub("(c|k|q|qu|s|z|x|ks)+", "(c|k|q|qu|s|z|x|ks)+", x)
@ -1047,10 +1049,15 @@ exec_as.mo <- function(x,
}
if (uncertainty_level >= 1) {
now_checks_for_uncertainty_level <- 1
# (1) look again for old taxonomic names, now for G. species ----
if (isTRUE(debug)) {
cat("\n[UNCERTAINLY LEVEL 1] (1) look again for old taxonomic names, now for G. species\n")
}
if (isTRUE(debug)) {
message("Running '", c.x_withspaces_start_end, "' and '", d.x_withspaces_start_only, "'")
}
found <- microorganisms.oldDT[fullname %like% c.x_withspaces_start_end
| fullname %like% d.x_withspaces_start_only]
if (NROW(found) > 0 & nchar(g.x_backup_without_spp) >= 6) {
@ -1068,7 +1075,7 @@ exec_as.mo <- function(x,
ref_new = microorganismsDT[col_id == found[1, col_id_new], ref],
mo = microorganismsDT[col_id == found[1, col_id_new], mo])
uncertainties <<- rbind(uncertainties,
data.frame(uncertainty = 1,
data.frame(uncertainty = now_checks_for_uncertainty_level,
input = a.x_backup,
fullname = found[1, fullname],
mo = paste("CoL", found[1, col_id])))
@ -1079,16 +1086,24 @@ exec_as.mo <- function(x,
}
# (2) Try with misspelled input ----
# just rerun with initial_search = FALSE will used the extensive regex part above
# just rerun with dyslexia_mode = TRUE will used the extensive regex part above
if (isTRUE(debug)) {
cat("\n[UNCERTAINLY LEVEL 1] (2) Try with misspelled input\n")
}
found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, allow_uncertain = FALSE, debug = debug)))
if (isTRUE(debug)) {
message("Running '", a.x_backup, "'")
}
# first try without dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug)))
if (empty_result(found)) {
# then with dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug)))
}
if (!empty_result(found)) {
found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]]
uncertainties <<- rbind(uncertainties,
data.frame(uncertainty = 1,
data.frame(uncertainty = now_checks_for_uncertainty_level,
input = a.x_backup,
fullname = microorganismsDT[mo == found_result[1L], fullname][[1]],
mo = found_result[1L]))
@ -1100,6 +1115,7 @@ exec_as.mo <- function(x,
}
if (uncertainty_level >= 2) {
now_checks_for_uncertainty_level <- 2
# (3) look for genus only, part of name ----
if (isTRUE(debug)) {
@ -1107,12 +1123,15 @@ exec_as.mo <- function(x,
}
if (nchar(g.x_backup_without_spp) > 4 & !b.x_trimmed %like% " ") {
if (!grepl("^[A-Z][a-z]+", b.x_trimmed, ignore.case = FALSE)) {
if (isTRUE(debug)) {
message("Running '", paste(b.x_trimmed, "species"), "'")
}
# not when input is like Genustext, because then Neospora would lead to Actinokineospora
found <- microorganismsDT[fullname_lower %like% paste(b.x_trimmed, "species"), ..property][[1]]
if (length(found) > 0) {
x[i] <- found[1L]
uncertainties <<- rbind(uncertainties,
data.frame(uncertainty = 2,
data.frame(uncertainty = now_checks_for_uncertainty_level,
input = a.x_backup,
fullname = microorganismsDT[mo == found[1L], fullname][[1]],
mo = found[1L]))
@ -1130,12 +1149,20 @@ exec_as.mo <- function(x,
}
a.x_backup_stripped <- gsub("( *[(].*[)] *)", " ", a.x_backup)
a.x_backup_stripped <- trimws(gsub(" +", " ", a.x_backup_stripped))
found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, allow_uncertain = FALSE, debug = debug)))
if (isTRUE(debug)) {
message("Running '", a.x_backup_stripped, "'")
}
# first try without dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug)))
if (empty_result(found)) {
# then with dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug)))
}
if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) {
found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]]
uncertainties <<- rbind(uncertainties,
data.frame(uncertainty = 2,
data.frame(uncertainty = now_checks_for_uncertainty_level,
input = a.x_backup,
fullname = microorganismsDT[mo == found_result[1L], fullname][[1]],
mo = found_result[1L]))
@ -1156,13 +1183,21 @@ exec_as.mo <- function(x,
lastword_half <- substr(lastword, 1, as.integer(nchar(lastword) / 2))
# remove last half of the second term
x_strip_collapsed <- paste(c(x_strip[1:(length(x_strip) - i)], lastword_half), collapse = " ")
if (nchar(x_strip_collapsed) >= 4) {
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE, debug = debug)))
if (nchar(x_strip_collapsed) >= 4 & nchar(lastword_half) > 2) {
if (isTRUE(debug)) {
message("Running '", x_strip_collapsed, "'")
}
# first try without dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug)))
if (empty_result(found)) {
# then with dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug)))
}
if (!empty_result(found)) {
found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]]
uncertainties <<- rbind(uncertainties,
data.frame(uncertainty = 2,
data.frame(uncertainty = now_checks_for_uncertainty_level,
input = a.x_backup,
fullname = microorganismsDT[mo == found_result[1L], fullname][[1]],
mo = found_result[1L]))
@ -1181,13 +1216,21 @@ exec_as.mo <- function(x,
if (length(x_strip) > 1) {
for (i in 1:(length(x_strip) - 1)) {
x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ")
if (nchar(x_strip_collapsed) >= 4) {
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE, debug = debug)))
if (nchar(x_strip_collapsed) >= 6) {
if (isTRUE(debug)) {
message("Running '", x_strip_collapsed, "'")
}
# first try without dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug)))
if (empty_result(found)) {
# then with dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug)))
}
if (!empty_result(found)) {
found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]]
uncertainties <<- rbind(uncertainties,
data.frame(uncertainty = 2,
data.frame(uncertainty = now_checks_for_uncertainty_level,
input = a.x_backup,
fullname = microorganismsDT[mo == found_result[1L], fullname][[1]],
mo = found_result[1L]))
@ -1208,7 +1251,7 @@ exec_as.mo <- function(x,
found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]]
uncertainties <<- rbind(uncertainties,
data.frame(uncertainty = 2,
data.frame(uncertainty = now_checks_for_uncertainty_level,
input = a.x_backup,
fullname = microorganismsDT[mo == found_result[1L], fullname][[1]],
mo = found_result[1L]))
@ -1222,7 +1265,7 @@ exec_as.mo <- function(x,
found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]]
uncertainties <<- rbind(uncertainties,
data.frame(uncertainty = 2,
data.frame(uncertainty = now_checks_for_uncertainty_level,
input = a.x_backup,
fullname = microorganismsDT[mo == found_result[1L], fullname][[1]],
mo = found_result[1L]))
@ -1239,14 +1282,22 @@ exec_as.mo <- function(x,
if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) {
for (i in 2:(length(x_strip))) {
x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ")
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE, debug = debug)))
if (isTRUE(debug)) {
message("Running '", x_strip_collapsed, "'")
}
# first try without dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug)))
if (empty_result(found)) {
# then with dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug)))
}
if (!empty_result(found)) {
found_result <- found
found <- microorganismsDT[mo == found_result[1L], ..property][[1]]
# uncertainty level 2 only if searched part contains a space (otherwise it will be found with lvl 3)
if (x_strip_collapsed %like% " ") {
uncertainties <<- rbind(uncertainties,
data.frame(uncertainty = 2,
data.frame(uncertainty = now_checks_for_uncertainty_level,
input = a.x_backup,
fullname = microorganismsDT[mo == found_result[1L], fullname][[1]],
mo = found_result[1L]))
@ -1261,20 +1312,30 @@ exec_as.mo <- function(x,
}
if (uncertainty_level >= 3) {
# (7) try to strip off one element from start and check the remains ----
now_checks_for_uncertainty_level <- 3
# (7a) try to strip off one element from start and check the remains (any text size) ----
if (isTRUE(debug)) {
cat("\n[UNCERTAINLY LEVEL 3] (7) try to strip off one element from start and check the remains\n")
cat("\n[UNCERTAINLY LEVEL 3] (7a) try to strip off one element from start and check the remains (any text size)\n")
}
x_strip <- a.x_backup %>% strsplit(" ") %>% unlist()
if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) {
for (i in 2:(length(x_strip))) {
x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ")
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE, debug = debug)))
if (isTRUE(debug)) {
message("Running '", x_strip_collapsed, "'")
}
# first try without dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug)))
if (empty_result(found)) {
# then with dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug)))
}
if (!empty_result(found)) {
found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]]
uncertainties <<- rbind(uncertainties,
data.frame(uncertainty = 3,
data.frame(uncertainty = now_checks_for_uncertainty_level,
input = a.x_backup,
fullname = microorganismsDT[mo == found_result[1L], fullname][[1]],
mo = found_result[1L]))
@ -1285,18 +1346,53 @@ exec_as.mo <- function(x,
}
}
}
# (7b) try to strip off one element from end and check the remains (any text size) ----
# (this is in fact 5b but without nchar limit of >=6)
if (isTRUE(debug)) {
cat("\n[UNCERTAINLY LEVEL 3] (7b) try to strip off one element from end and check the remains (any text size)\n")
}
if (length(x_strip) > 1) {
for (i in 1:(length(x_strip) - 1)) {
x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ")
if (isTRUE(debug)) {
message("Running '", x_strip_collapsed, "'")
}
# first try without dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = FALSE, allow_uncertain = FALSE, debug = debug)))
if (empty_result(found)) {
# then with dyslexia mode
found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, dyslexia_mode = TRUE, allow_uncertain = FALSE, debug = debug)))
}
if (!empty_result(found)) {
found_result <- found
found <- microorganismsDT[mo == found, ..property][[1]]
uncertainties <<- rbind(uncertainties,
data.frame(uncertainty = now_checks_for_uncertainty_level,
input = a.x_backup,
fullname = microorganismsDT[mo == found_result[1L], fullname][[1]],
mo = found_result[1L]))
if (initial_search == TRUE) {
set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history)
}
return(found[1L])
}
}
}
# (8) part of a name (very unlikely match) ----
if (isTRUE(debug)) {
cat("\n[UNCERTAINLY LEVEL 3] (8) part of a name (very unlikely match)\n")
}
if (isTRUE(debug)) {
message("Running '", f.x_withspaces_end_only, "'")
}
found <- microorganismsDT[fullname %like% f.x_withspaces_end_only]
if (nrow(found) > 0) {
found_result <- found[["mo"]]
if (!empty_result(found_result) & nchar(g.x_backup_without_spp) >= 6) {
found <- microorganismsDT[mo == found_result[1L], ..property][[1]]
uncertainties <<- rbind(uncertainties,
data.frame(uncertainty = 3,
data.frame(uncertainty = now_checks_for_uncertainty_level,
input = a.x_backup,
fullname = microorganismsDT[mo == found_result[1L], fullname][[1]],
mo = found_result[1L]))

View File

@ -78,7 +78,7 @@
</button>
<span class="navbar-brand">
<a class="navbar-link" href="index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">0.7.1.9037</span>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">0.7.1.9038</span>
</span>
</div>

View File

@ -78,7 +78,7 @@
</button>
<span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">0.7.1.9037</span>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">0.7.1.9038</span>
</span>
</div>

View File

@ -78,7 +78,7 @@
</button>
<span class="navbar-brand">
<a class="navbar-link" href="index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">0.7.1.9037</span>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">0.7.1.9038</span>
</span>
</div>

View File

@ -42,7 +42,7 @@
</button>
<span class="navbar-brand">
<a class="navbar-link" href="index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">0.7.1.9037</span>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">0.7.1.9038</span>
</span>
</div>

View File

@ -78,7 +78,7 @@
</button>
<span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">0.7.1.9037</span>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">0.7.1.9038</span>
</span>
</div>
@ -225,9 +225,9 @@
</div>
<div id="amr-0-7-1-9037" class="section level1">
<div id="amr-0-7-1-9038" class="section level1">
<h1 class="page-header">
<a href="#amr-0-7-1-9037" class="anchor"></a>AMR 0.7.1.9037<small> Unreleased </small>
<a href="#amr-0-7-1-9038" class="anchor"></a>AMR 0.7.1.9038<small> Unreleased </small>
</h1>
<div id="breaking" class="section level3">
<h3 class="hasAnchor">
@ -1236,7 +1236,7 @@ Using <code><a href="../reference/as.mo.html">as.mo(..., allow_uncertain = 3)</a
<div id="tocnav">
<h2>Contents</h2>
<ul class="nav nav-pills nav-stacked">
<li><a href="#amr-0-7-1-9037">0.7.1.9037</a></li>
<li><a href="#amr-0-7-1-9038">0.7.1.9038</a></li>
<li><a href="#amr-0-7-1">0.7.1</a></li>
<li><a href="#amr-0-7-0">0.7.0</a></li>
<li><a href="#amr-0-6-1">0.6.1</a></li>

View File

@ -78,7 +78,7 @@
</button>
<span class="navbar-brand">
<a class="navbar-link" href="../index.html">AMR (for R)</a>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">0.7.1.9037</span>
<span class="version label label-default" data-toggle="tooltip" data-placement="bottom" title="Latest development version">0.7.1.9038</span>
</span>
</div>

View File

@ -197,8 +197,8 @@ test_that("as.mo works", {
print(mo_renamed())
# check uncertain names
expect_equal(suppressWarnings(as.character(as.mo("esco extra_text", allow_uncertain = FALSE))), "UNKNOWN")
expect_equal(suppressWarnings(as.character(as.mo("esco extra_text", allow_uncertain = TRUE))), "B_ESCHR_COL")
expect_equal(suppressWarnings(as.character(as.mo("staaur extratest", allow_uncertain = TRUE))), "B_STPHY_AUR")
expect_equal(suppressWarnings(as.character(as.mo("staaur extratest", allow_uncertain = FALSE))), "UNKNOWN")
expect_warning(as.mo("esco extra_text", allow_uncertain = TRUE))
expect_equal(suppressWarnings(as.character(as.mo("unexisting aureus", allow_uncertain = 3))), "B_STPHY_AUR")
expect_equal(suppressWarnings(as.character(as.mo("unexisting staphy", allow_uncertain = 3))), "B_STPHY")
@ -270,7 +270,7 @@ test_that("as.mo works", {
expect_equal(as.character(as.mo("F_CANDD_GLB")), "F_CANDD_GLA")
# debug mode
expect_output(print(suppressWarnings(as.mo("kshgcjkhsdgkshjdfsfvsdfv", debug = TRUE, allow_uncertain = 3))))
expect_output(print(suppressMessages(suppressWarnings(as.mo("kshgcjkhsdgkshjdfsfvsdfv", debug = TRUE, allow_uncertain = 3)))))
# ..coccus
expect_equal(as.character(as.mo(c("meningococ", "gonococ", "pneumococ"))),