1
0
mirror of https://github.com/msberends/AMR.git synced 2025-07-10 06:21:50 +02:00

(v1.2.0.9023) ab_from_text() improvement

This commit is contained in:
2020-07-02 21:12:52 +02:00
parent 298e67a45b
commit 152ac5bcad
29 changed files with 175 additions and 118 deletions

View File

@ -27,6 +27,7 @@
#' @param type type of property to search for, either `"drug"`, `"dose"` or `"administration"`, see *Examples*
#' @param collapse character to pass on to `paste(..., collapse = ...)` to only return one character per element of `text`, see *Examples*
#' @param translate_ab if `type = "drug"`: a column name of the [antibiotics] data set to translate the antibiotic abbreviations to, using [ab_property()]. Defaults to `FALSE`. Using `TRUE` is equal to using "name".
#' @param thorough_search logical to indicate whether the input must be extensively searched for misspelling and other faulty input values. Setting this to `TRUE` will take considerably more time than when using `FALSE`. At default, it will turn `TRUE` when all input elements contain a maximum of three words.
#' @param ... parameters passed on to [as.ab()]
#' @details This function is also internally used by [as.ab()], although it then only searches for the first drug name and will throw a note if more drug names could have been returned.
#'
@ -85,6 +86,7 @@ ab_from_text <- function(text,
type = c("drug", "dose", "administration"),
collapse = NULL,
translate_ab = FALSE,
thorough_search = NULL,
...) {
if (missing(type)) {
@ -95,30 +97,54 @@ ab_from_text <- function(text,
text <- tolower(as.character(text))
text_split_all <- strsplit(text, "[ ;.,:\\|]")
progress <- progress_estimated(n = length(text_split_all), n_min = 5)
on.exit(close(progress))
if (type %like% "(drug|ab|anti)") {
translate_ab <- get_translate_ab(translate_ab)
abbr <- unlist(antibiotics$abbreviations)
abbr <- abbr[nchar(abbr) >= 4]
names_atc <- substr(c(antibiotics$name, antibiotics$atc), 1, 5)
synonyms <- unlist(antibiotics$synonyms)
synonyms <- synonyms[nchar(synonyms) >= 4]
to_regex <- function(x) {
paste0("^(",
paste0(unique(gsub("[^a-z0-9]", ".*", sort(tolower(x)))), collapse = "|"),
").*")
if (isTRUE(thorough_search) |
(isTRUE(is.null(thorough_search)) & max(sapply(text_split_all, length), na.rm = TRUE) <= 3)) {
text_split_all <- text_split_all[nchar(text_split_all) >= 4 & grepl("[a-z]+", text_split_all)]
result <- lapply(text_split_all, function(text_split) {
progress$tick()
suppressWarnings(
out <- as.ab(text_split, ...)
)
})
} else {
# no thorough search
abbr <- unlist(antibiotics$abbreviations)
abbr <- abbr[nchar(abbr) >= 4]
names_atc <- substr(c(antibiotics$name, antibiotics$atc), 1, 5)
synonyms <- unlist(antibiotics$synonyms)
synonyms <- synonyms[nchar(synonyms) >= 4]
# regular expression must not be too long, so split synonyms in two:
synonyms_part1 <- synonyms[seq_len(0.5 * length(synonyms))]
synonyms_part2 <- synonyms[!synonyms %in% synonyms_part1]
to_regex <- function(x) {
paste0("^(",
paste0(unique(gsub("[^a-z0-9]+", "", sort(tolower(x)))), collapse = "|"),
").*")
}
result <- lapply(text_split_all, function(text_split) {
progress$tick()
suppressWarnings(
out <- as.ab(unique(c(text_split[text_split %like_case% to_regex(abbr)],
text_split[text_split %like_case% to_regex(names_atc)],
text_split[text_split %like_case% to_regex(synonyms_part1)],
text_split[text_split %like_case% to_regex(synonyms_part2)])
),
...)
)
})
}
result <- lapply(text_split_all, function(text_split) {
suppressWarnings(
out <- as.ab(unique(c(text_split[grep(to_regex(abbr), text_split)],
text_split[grep(to_regex(names_atc), text_split)],
# regular expression must not be too long, so split synonyms in two:
text_split[grep(to_regex(synonyms[c(1:0.5 * length(synonyms))]), text_split)],
text_split[grep(to_regex(synonyms[c(0.5 * length(synonyms):length(synonyms))]), text_split)])),
...))
close(progress)
result <- lapply(result, function(out) {
out <- out[!is.na(out)]
if (length(out) == 0) {
as.ab(NA)
@ -128,6 +154,7 @@ ab_from_text <- function(text,
}
out
}
})
} else if (type %like% "dos") {
@ -167,7 +194,7 @@ ab_from_text <- function(text,
# collapse text if needed
if (!is.null(collapse)) {
result <- sapply(result, function(x) {
if(length(x) == 1 & all(is.na(x))) {
if (length(x) == 1 & all(is.na(x))) {
NA_character_
} else {
paste0(x, collapse = collapse)