1
0
mirror of https://github.com/msberends/AMR.git synced 2025-07-08 11:51:59 +02:00

(v1.8.1.9013) add Toxoplasma

This commit is contained in:
2022-06-10 13:15:23 +02:00
parent b84d647cac
commit 7f981e7778
29 changed files with 432 additions and 124 deletions

279
R/mo.R
View File

@ -2208,3 +2208,282 @@ strip_words <- function(text, n, side = "right") {
})
vapply(FUN.VALUE = character(1), out, paste, collapse = " ")
}
as.mo2 <- function(x,
Becker = FALSE,
Lancefield = FALSE,
allow_uncertain = TRUE,
reference_df = get_mo_source(),
info = interactive(),
property = "mo",
initial_search = TRUE,
dyslexia_mode = FALSE,
debug = FALSE,
ignore_pattern = getOption("AMR_ignore_pattern"),
reference_data_to_use = MO_lookup,
actual_uncertainty = 1,
actual_input = NULL,
language = get_AMR_locale()) {
meet_criteria(x, allow_class = c("mo", "data.frame", "list", "character", "numeric", "integer", "factor"), allow_NA = TRUE)
meet_criteria(Becker, allow_class = c("logical", "character"), has_length = 1)
meet_criteria(Lancefield, allow_class = c("logical", "character"), has_length = 1)
meet_criteria(allow_uncertain, allow_class = c("logical", "numeric", "integer"), has_length = 1)
meet_criteria(reference_df, allow_class = "data.frame", allow_NULL = TRUE)
meet_criteria(property, allow_class = "character", has_length = 1, is_in = colnames(microorganisms))
meet_criteria(initial_search, allow_class = "logical", has_length = 1)
meet_criteria(dyslexia_mode, allow_class = "logical", has_length = 1)
meet_criteria(debug, allow_class = "logical", has_length = 1)
meet_criteria(ignore_pattern, allow_class = "character", has_length = 1, allow_NULL = TRUE)
meet_criteria(reference_data_to_use, allow_class = "data.frame")
meet_criteria(actual_uncertainty, allow_class = "numeric", has_length = 1)
meet_criteria(actual_input, allow_class = "character", allow_NULL = TRUE)
meet_criteria(language, has_length = 1, is_in = c(LANGUAGES_SUPPORTED, ""), allow_NULL = TRUE, allow_NA = TRUE)
check_dataset_integrity()
if (isTRUE(debug) && initial_search == TRUE) {
time_start_tracking()
}
lookup <- function(needle,
column = property,
haystack = reference_data_to_use,
n = 1,
debug_mode = debug,
initial = initial_search,
uncertainty = actual_uncertainty,
input_actual = actual_input) {
if (!is.null(input_actual)) {
input <- input_actual
} else {
input <- tryCatch(x_backup[i], error = function(e) "")
}
# `column` can be NULL for all columns, or a selection
# returns a [character] (vector) - if `column` > length 1 then with columns as names
if (isTRUE(debug_mode)) {
cat(font_silver("Looking up: ", substitute(needle), collapse = ""),
"\n ", time_track())
}
if (length(column) == 1) {
res_df <- haystack[which(eval(substitute(needle), envir = haystack, enclos = parent.frame())), , drop = FALSE]
if (NROW(res_df) > 1 & uncertainty != -1) {
# sort the findings on matching score
scores <- mo_matching_score(x = input,
n = res_df[, "fullname", drop = TRUE])
res_df <- res_df[order(scores, decreasing = TRUE), , drop = FALSE]
}
res <- as.character(res_df[, column, drop = TRUE])
if (length(res) == 0) {
if (isTRUE(debug_mode)) {
cat(font_red(" (no match)\n"))
}
NA_character_
} else {
if (isTRUE(debug_mode)) {
cat(font_green(paste0(" MATCH (", NROW(res_df), " results)\n")))
}
if ((length(res) > n | uncertainty > 1) & uncertainty != -1) {
# save the other possible results as well, but not for forced certain results (then uncertainty == -1)
uncertainties <<- rbind(uncertainties,
format_uncertainty_as_df(uncertainty_level = uncertainty,
input = input,
result_mo = res_df[1, "mo", drop = TRUE],
candidates = as.character(res_df[, "fullname", drop = TRUE])),
stringsAsFactors = FALSE)
}
res[seq_len(min(n, length(res)))]
}
} else {
if (is.null(column)) {
column <- names(haystack)
}
res <- haystack[which(eval(substitute(needle), envir = haystack, enclos = parent.frame())), , drop = FALSE]
res <- res[seq_len(min(n, nrow(res))), column, drop = TRUE]
if (NROW(res) == 0) {
if (isTRUE(debug_mode)) {
cat(font_red(" (no rows)\n"))
}
res <- rep(NA_character_, length(column))
} else {
if (isTRUE(debug_mode)) {
cat(font_green(paste0(" MATCH (", NROW(res), " rows)\n")))
}
}
res <- as.character(res)
names(res) <- column
res
}
}
# start off with replaced language-specific non-ASCII characters with ASCII characters
x <- parse_and_convert(x)
# replace mo codes used in older package versions
x <- replace_old_mo_codes(x, property)
# ignore cases that match the ignore pattern
x <- replace_ignore_pattern(x, ignore_pattern)
# WHONET: xxx = no growth
x[tolower(as.character(paste0(x, ""))) %in% c("", "xxx", "na", "nan")] <- NA_character_
# Laboratory systems: remove (translated) entries like "no growth", etc.
x[trimws2(x) %like% translate_AMR("no .*growth", language = language)] <- NA_character_
x[trimws2(x) %like% paste0("^(", translate_AMR("no|not", language = language), ") [a-z]+")] <- "UNKNOWN"
if (initial_search == TRUE) {
# keep track of time - give some hints to improve speed if it takes a long time
start_time <- Sys.time()
pkg_env$mo_failures <- NULL
pkg_env$mo_uncertainties <- NULL
pkg_env$mo_renamed <- NULL
}
pkg_env$mo_renamed_last_run <- NULL
failures <- character(0)
uncertainty_level <- translate_allow_uncertain(allow_uncertain)
uncertainties <- data.frame(uncertainty = integer(0),
input = character(0),
fullname = character(0),
renamed_to = character(0),
mo = character(0),
candidates = character(0),
stringsAsFactors = FALSE)
x_input <- x
# already strip leading and trailing spaces
x <- trimws(x)
# only check the uniques, which is way faster
x <- unique(x)
# remove empty values (to later fill them in again with NAs)
# ("xxx" is WHONET code for 'no growth')
x <- x[!is.na(x)
& !is.null(x)
& !identical(x, "")
& !identical(x, "xxx")]
# defined df to check for
if (!is.null(reference_df)) {
check_validity_mo_source(reference_df)
reference_df <- repair_reference_df(reference_df)
}
# all empty
if (all(identical(trimws(x_input), "") | is.na(x_input) | length(x) == 0)) {
if (property == "mo") {
return(set_clean_class(rep(NA_character_, length(x_input)),
new_class = c("mo", "character")))
} else {
return(rep(NA_character_, length(x_input)))
}
} else if (all(x %in% reference_df[, 1][[1]])) {
# all in reference df
colnames(reference_df)[1] <- "x"
suppressWarnings(
x <- MO_lookup[match(reference_df[match(x, reference_df$x), "mo", drop = TRUE], MO_lookup$mo), property, drop = TRUE]
)
} else if (all(x %in% reference_data_to_use$mo)) {
x <- MO_lookup[match(x, MO_lookup$mo), property, drop = TRUE]
} else if (all(tolower(x) %in% reference_data_to_use$fullname_lower)) {
# we need special treatment for very prevalent full names, they are likely!
# e.g. as.mo("Staphylococcus aureus")
x <- MO_lookup[match(tolower(x), MO_lookup$fullname_lower), property, drop = TRUE]
} else if (all(x %in% reference_data_to_use$fullname)) {
# we need special treatment for very prevalent full names, they are likely!
# e.g. as.mo("Staphylococcus aureus")
x <- MO_lookup[match(x, MO_lookup$fullname), property, drop = TRUE]
} else if (all(toupper(x) %in% microorganisms.codes$code)) {
# commonly used MO codes
x <- MO_lookup[match(microorganisms.codes[match(toupper(x),
microorganisms.codes$code),
"mo",
drop = TRUE],
MO_lookup$mo),
property,
drop = TRUE]
} else if (!all(x %in% microorganisms[, property])) {
strip_whitespace <- function(x, dyslexia_mode) {
# all whitespaces (tab, new lines, etc.) should be one space
# and spaces before and after should be left blank
trimmed <- trimws2(x)
# also, make sure the trailing and leading characters are a-z or 0-9
# in case of non-regex
if (dyslexia_mode == FALSE) {
trimmed <- gsub("^[^a-zA-Z0-9)(]+", "", trimmed, perl = TRUE)
trimmed <- gsub("[^a-zA-Z0-9)(]+$", "", trimmed, perl = TRUE)
}
trimmed
}
x_backup_untouched <- x
x <- strip_whitespace(x, dyslexia_mode)
# translate 'unknown' names back to English
if (any(tolower(x) %like_case% "unbekannt|onbekend|desconocid|sconosciut|iconnu|desconhecid", na.rm = TRUE)) {
trns <- subset(TRANSLATIONS, pattern %like% "unknown")
langs <- LANGUAGES_SUPPORTED[LANGUAGES_SUPPORTED != "en"]
for (l in langs) {
for (i in seq_len(nrow(trns))) {
if (!is.na(trns[i, l, drop = TRUE])) {
x <- gsub(pattern = trns[i, l, drop = TRUE],
replacement = trns$pattern[i],
x = x,
ignore.case = TRUE,
perl = TRUE)
}
}
}
}
# remove spp and species
x <- gsub("(^| )[ .]*(spp|ssp|ss|sp|subsp|subspecies|biovar|biotype|serovar|species)[ .]*( |$)", "", x, ignore.case = TRUE, perl = TRUE)
x <- strip_whitespace(x, dyslexia_mode)
x_backup <- x
# from here on case-insensitive
x <- tolower(x)
x_backup[x %like_case% "^(fungus|fungi)$"] <- "(unknown fungus)" # will otherwise become the kingdom
x_backup[x_backup_untouched == "Fungi"] <- "Fungi" # is literally the kingdom
# Fill in fullnames and MO codes directly
known_names <- tolower(x_backup) %in% MO_lookup$fullname_lower
x[known_names] <- MO_lookup[match(tolower(x_backup)[known_names], MO_lookup$fullname_lower), property, drop = TRUE]
known_codes_mo <- toupper(x_backup) %in% MO_lookup$mo
x[known_codes_mo] <- MO_lookup[match(toupper(x_backup)[known_codes_mo], MO_lookup$mo), property, drop = TRUE]
known_codes_lis <- toupper(x_backup) %in% microorganisms.codes$code
x[known_codes_lis] <- MO_lookup[match(microorganisms.codes[match(toupper(x_backup)[known_codes_lis],
microorganisms.codes$code), "mo", drop = TRUE],
MO_lookup$mo), property, drop = TRUE]
already_known <- known_names | known_codes_mo | known_codes_lis
# now only continue where the right taxonomic output is not already known
if (any(!already_known)) {
x_unknown <- x[!already_known]
x_unknown <- gsub(" ?[(].*[)] ?", "", x_unknown, perl = TRUE)
x_unknown <- gsub("[^a-z ]", " ", x_unknown, perl = TRUE)
x_unknown <- gsub(" +", " ", x_unknown, perl = TRUE)
print(x_unknown)
x_search <- gsub("([a-z])[a-z]*( ([a-z])[a-z]*)?( ([a-z])[a-z]*)?", "^\\1.* \\3.* \\5.*", x_unknown, perl = TRUE)
x_search <- gsub("( [.][*])+$", "", x_search, perl = TRUE)
print(x_search)
for (i in seq_len(length(x_unknown))) {
# search first, second and third part
mos_to_search <- MO_lookup[which(MO_lookup$fullname_lower %like_case% x_search[i]), "fullname", drop = TRUE]
score <- mo_matching_score(x_unknown[i], mos_to_search)
out <- mos_to_search[order(score, decreasing = TRUE)][1:25] # keep first 25
print(score[order(score, decreasing = TRUE)][1])
x[!already_known][i] <- MO_lookup$mo[match(out[1], MO_lookup$fullname)]
}
}
}
x
}