mirror of
https://github.com/msberends/AMR.git
synced 2025-07-09 02:03:04 +02:00
(v1.2.0.9023) ab_from_text() improvement
This commit is contained in:
@ -184,7 +184,7 @@ stop_ifnot_installed <- function(package) {
|
||||
# https://developer.r-project.org/Blog/public/2019/02/14/staged-install/index.html
|
||||
sapply(package, function(pkg)
|
||||
tryCatch(get(".packageName", envir = asNamespace(pkg)),
|
||||
error = function(e) {
|
||||
error = function(e) {
|
||||
if (package == "rstudioapi") {
|
||||
stop("This function only works in RStudio.", call. = FALSE)
|
||||
} else if (pkg != "base") {
|
||||
|
@ -27,6 +27,7 @@
|
||||
#' @param type type of property to search for, either `"drug"`, `"dose"` or `"administration"`, see *Examples*
|
||||
#' @param collapse character to pass on to `paste(..., collapse = ...)` to only return one character per element of `text`, see *Examples*
|
||||
#' @param translate_ab if `type = "drug"`: a column name of the [antibiotics] data set to translate the antibiotic abbreviations to, using [ab_property()]. Defaults to `FALSE`. Using `TRUE` is equal to using "name".
|
||||
#' @param thorough_search logical to indicate whether the input must be extensively searched for misspelling and other faulty input values. Setting this to `TRUE` will take considerably more time than when using `FALSE`. At default, it will turn `TRUE` when all input elements contain a maximum of three words.
|
||||
#' @param ... parameters passed on to [as.ab()]
|
||||
#' @details This function is also internally used by [as.ab()], although it then only searches for the first drug name and will throw a note if more drug names could have been returned.
|
||||
#'
|
||||
@ -85,6 +86,7 @@ ab_from_text <- function(text,
|
||||
type = c("drug", "dose", "administration"),
|
||||
collapse = NULL,
|
||||
translate_ab = FALSE,
|
||||
thorough_search = NULL,
|
||||
...) {
|
||||
|
||||
if (missing(type)) {
|
||||
@ -95,30 +97,54 @@ ab_from_text <- function(text,
|
||||
|
||||
text <- tolower(as.character(text))
|
||||
text_split_all <- strsplit(text, "[ ;.,:\\|]")
|
||||
progress <- progress_estimated(n = length(text_split_all), n_min = 5)
|
||||
on.exit(close(progress))
|
||||
|
||||
if (type %like% "(drug|ab|anti)") {
|
||||
|
||||
translate_ab <- get_translate_ab(translate_ab)
|
||||
|
||||
abbr <- unlist(antibiotics$abbreviations)
|
||||
abbr <- abbr[nchar(abbr) >= 4]
|
||||
names_atc <- substr(c(antibiotics$name, antibiotics$atc), 1, 5)
|
||||
synonyms <- unlist(antibiotics$synonyms)
|
||||
synonyms <- synonyms[nchar(synonyms) >= 4]
|
||||
to_regex <- function(x) {
|
||||
paste0("^(",
|
||||
paste0(unique(gsub("[^a-z0-9]", ".*", sort(tolower(x)))), collapse = "|"),
|
||||
").*")
|
||||
if (isTRUE(thorough_search) |
|
||||
(isTRUE(is.null(thorough_search)) & max(sapply(text_split_all, length), na.rm = TRUE) <= 3)) {
|
||||
text_split_all <- text_split_all[nchar(text_split_all) >= 4 & grepl("[a-z]+", text_split_all)]
|
||||
result <- lapply(text_split_all, function(text_split) {
|
||||
progress$tick()
|
||||
suppressWarnings(
|
||||
out <- as.ab(text_split, ...)
|
||||
)
|
||||
})
|
||||
|
||||
} else {
|
||||
# no thorough search
|
||||
abbr <- unlist(antibiotics$abbreviations)
|
||||
abbr <- abbr[nchar(abbr) >= 4]
|
||||
names_atc <- substr(c(antibiotics$name, antibiotics$atc), 1, 5)
|
||||
synonyms <- unlist(antibiotics$synonyms)
|
||||
synonyms <- synonyms[nchar(synonyms) >= 4]
|
||||
# regular expression must not be too long, so split synonyms in two:
|
||||
synonyms_part1 <- synonyms[seq_len(0.5 * length(synonyms))]
|
||||
synonyms_part2 <- synonyms[!synonyms %in% synonyms_part1]
|
||||
to_regex <- function(x) {
|
||||
paste0("^(",
|
||||
paste0(unique(gsub("[^a-z0-9]+", "", sort(tolower(x)))), collapse = "|"),
|
||||
").*")
|
||||
}
|
||||
result <- lapply(text_split_all, function(text_split) {
|
||||
progress$tick()
|
||||
suppressWarnings(
|
||||
out <- as.ab(unique(c(text_split[text_split %like_case% to_regex(abbr)],
|
||||
text_split[text_split %like_case% to_regex(names_atc)],
|
||||
text_split[text_split %like_case% to_regex(synonyms_part1)],
|
||||
text_split[text_split %like_case% to_regex(synonyms_part2)])
|
||||
),
|
||||
...)
|
||||
)
|
||||
})
|
||||
}
|
||||
|
||||
result <- lapply(text_split_all, function(text_split) {
|
||||
suppressWarnings(
|
||||
out <- as.ab(unique(c(text_split[grep(to_regex(abbr), text_split)],
|
||||
text_split[grep(to_regex(names_atc), text_split)],
|
||||
# regular expression must not be too long, so split synonyms in two:
|
||||
text_split[grep(to_regex(synonyms[c(1:0.5 * length(synonyms))]), text_split)],
|
||||
text_split[grep(to_regex(synonyms[c(0.5 * length(synonyms):length(synonyms))]), text_split)])),
|
||||
...))
|
||||
close(progress)
|
||||
|
||||
result <- lapply(result, function(out) {
|
||||
out <- out[!is.na(out)]
|
||||
if (length(out) == 0) {
|
||||
as.ab(NA)
|
||||
@ -128,6 +154,7 @@ ab_from_text <- function(text,
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
})
|
||||
|
||||
} else if (type %like% "dos") {
|
||||
@ -167,7 +194,7 @@ ab_from_text <- function(text,
|
||||
# collapse text if needed
|
||||
if (!is.null(collapse)) {
|
||||
result <- sapply(result, function(x) {
|
||||
if(length(x) == 1 & all(is.na(x))) {
|
||||
if (length(x) == 1 & all(is.na(x))) {
|
||||
NA_character_
|
||||
} else {
|
||||
paste0(x, collapse = collapse)
|
||||
|
@ -135,7 +135,7 @@ format.bug_drug_combinations <- function(x,
|
||||
format <- tolower(format)
|
||||
ab_txt <- rep(format, length(ab))
|
||||
for (i in seq_len(length(ab_txt))) {
|
||||
ab_txt[i] <- gsub("ab", ab[i], ab_txt[i])
|
||||
ab_txt[i] <- gsub("ab", as.character(as.ab(ab[i])), ab_txt[i])
|
||||
ab_txt[i] <- gsub("cid", ab_cid(ab[i]), ab_txt[i])
|
||||
ab_txt[i] <- gsub("group", ab_group(ab[i], language = language), ab_txt[i])
|
||||
ab_txt[i] <- gsub("atc_group1", ab_atc_group1(ab[i], language = language), ab_txt[i])
|
||||
|
@ -259,25 +259,25 @@ first_isolate <- function(x,
|
||||
|
||||
# arrange data to the right sorting
|
||||
if (is.null(specimen_group)) {
|
||||
x <- x[order(x$newvar_patient_id,
|
||||
x$newvar_genus_species,
|
||||
x$newvar_date), ]
|
||||
rownames(x) <- NULL
|
||||
row.start <- 1
|
||||
row.end <- nrow(x)
|
||||
x <- x[order(x$newvar_patient_id,
|
||||
x$newvar_genus_species,
|
||||
x$newvar_date), ]
|
||||
rownames(x) <- NULL
|
||||
row.start <- 1
|
||||
row.end <- nrow(x)
|
||||
} else {
|
||||
# filtering on specimen and only analyse these rows to save time
|
||||
x <- x[order(pull(x, col_specimen),
|
||||
x$newvar_patient_id,
|
||||
x$newvar_genus_species,
|
||||
x$newvar_date), ]
|
||||
rownames(x) <- NULL
|
||||
suppressWarnings(
|
||||
row.start <- which(x %>% pull(col_specimen) == specimen_group) %>% min(na.rm = TRUE)
|
||||
)
|
||||
suppressWarnings(
|
||||
row.end <- which(x %>% pull(col_specimen) == specimen_group) %>% max(na.rm = TRUE)
|
||||
)
|
||||
x <- x[order(pull(x, col_specimen),
|
||||
x$newvar_patient_id,
|
||||
x$newvar_genus_species,
|
||||
x$newvar_date), ]
|
||||
rownames(x) <- NULL
|
||||
suppressWarnings(
|
||||
row.start <- which(x %>% pull(col_specimen) == specimen_group) %>% min(na.rm = TRUE)
|
||||
)
|
||||
suppressWarnings(
|
||||
row.end <- which(x %>% pull(col_specimen) == specimen_group) %>% max(na.rm = TRUE)
|
||||
)
|
||||
}
|
||||
|
||||
# no isolates found
|
||||
@ -290,7 +290,7 @@ first_isolate <- function(x,
|
||||
|
||||
# did find some isolates - add new index numbers of rows
|
||||
x$newvar_row_index_sorted <- seq_len(nrow(x))
|
||||
|
||||
|
||||
scope.size <- nrow(x[which(x$newvar_row_index_sorted %in% c(row.start + 1:row.end) &
|
||||
!is.na(x$newvar_mo)), , drop = FALSE])
|
||||
|
||||
@ -318,17 +318,17 @@ first_isolate <- function(x,
|
||||
|
||||
# Analysis of first isolate ----
|
||||
x$other_pat_or_mo <- if_else(x$newvar_patient_id == lag(x$newvar_patient_id) &
|
||||
x$newvar_genus_species == lag(x$newvar_genus_species),
|
||||
FALSE,
|
||||
TRUE)
|
||||
x$newvar_genus_species == lag(x$newvar_genus_species),
|
||||
FALSE,
|
||||
TRUE)
|
||||
x$episode_group <- paste(x$newvar_patient_id, x$newvar_genus_species)
|
||||
x$more_than_episode_ago <- unname(unlist(lapply(unique(x$episode_group),
|
||||
function(g,
|
||||
df = x,
|
||||
days = episode_days) {
|
||||
identify_new_year(x = df[which(df$episode_group == g), "newvar_date"],
|
||||
episode_days = days)
|
||||
})))
|
||||
x$more_than_episode_ago <- unlist(lapply(unique(x$episode_group),
|
||||
function(g,
|
||||
df = x,
|
||||
days = episode_days) {
|
||||
identify_new_year(x = df[which(df$episode_group == g), "newvar_date", drop = TRUE],
|
||||
episode_days = days)
|
||||
}))
|
||||
|
||||
weighted.notice <- ""
|
||||
if (!is.null(col_keyantibiotics)) {
|
||||
@ -336,38 +336,38 @@ first_isolate <- function(x,
|
||||
if (info == TRUE) {
|
||||
if (type == "keyantibiotics") {
|
||||
message(font_black(paste0("[Criterion] Base inclusion on key antibiotics, ",
|
||||
ifelse(ignore_I == FALSE, "not ", ""),
|
||||
"ignoring I")))
|
||||
ifelse(ignore_I == FALSE, "not ", ""),
|
||||
"ignoring I")))
|
||||
}
|
||||
if (type == "points") {
|
||||
message(font_black(paste0("[Criterion] Base inclusion on key antibiotics, using points threshold of "
|
||||
, points_threshold)))
|
||||
, points_threshold)))
|
||||
}
|
||||
}
|
||||
type_param <- type
|
||||
|
||||
x$other_key_ab <- !key_antibiotics_equal(y = x$newvar_key_ab,
|
||||
z = lag(x$newvar_key_ab),
|
||||
type = type_param,
|
||||
ignore_I = ignore_I,
|
||||
points_threshold = points_threshold,
|
||||
info = info)
|
||||
z = lag(x$newvar_key_ab),
|
||||
type = type_param,
|
||||
ignore_I = ignore_I,
|
||||
points_threshold = points_threshold,
|
||||
info = info)
|
||||
# with key antibiotics
|
||||
x$newvar_first_isolate <- if_else(x$newvar_row_index_sorted >= row.start &
|
||||
x$newvar_row_index_sorted <= row.end &
|
||||
x$newvar_genus_species != "" &
|
||||
(x$other_pat_or_mo | x$more_than_episode_ago | x$other_key_ab),
|
||||
TRUE,
|
||||
FALSE)
|
||||
x$newvar_row_index_sorted <= row.end &
|
||||
x$newvar_genus_species != "" &
|
||||
(x$other_pat_or_mo | x$more_than_episode_ago | x$other_key_ab),
|
||||
TRUE,
|
||||
FALSE)
|
||||
|
||||
} else {
|
||||
# no key antibiotics
|
||||
x$newvar_first_isolate <- if_else(x$newvar_row_index_sorted >= row.start &
|
||||
x$newvar_row_index_sorted <= row.end &
|
||||
x$newvar_genus_species != "" &
|
||||
(x$other_pat_or_mo | x$more_than_episode_ago),
|
||||
TRUE,
|
||||
FALSE)
|
||||
x$newvar_row_index_sorted <= row.end &
|
||||
x$newvar_genus_species != "" &
|
||||
(x$other_pat_or_mo | x$more_than_episode_ago),
|
||||
TRUE,
|
||||
FALSE)
|
||||
}
|
||||
|
||||
# first one as TRUE
|
||||
@ -391,17 +391,17 @@ first_isolate <- function(x,
|
||||
# handle empty microorganisms
|
||||
if (any(x$newvar_mo == "UNKNOWN", na.rm = TRUE) & info == TRUE) {
|
||||
message(font_blue(paste0("NOTE: ", ifelse(include_unknown == TRUE, "Included ", "Excluded "),
|
||||
format(sum(x$newvar_mo == "UNKNOWN", na.rm = TRUE),
|
||||
decimal.mark = decimal.mark, big.mark = big.mark),
|
||||
" isolates with a microbial ID 'UNKNOWN' (column `", font_bold(col_mo), "`)")))
|
||||
format(sum(x$newvar_mo == "UNKNOWN", na.rm = TRUE),
|
||||
decimal.mark = decimal.mark, big.mark = big.mark),
|
||||
" isolates with a microbial ID 'UNKNOWN' (column `", font_bold(col_mo), "`)")))
|
||||
}
|
||||
x[which(x$newvar_mo == "UNKNOWN"), "newvar_first_isolate"] <- include_unknown
|
||||
|
||||
# exclude all NAs
|
||||
if (any(is.na(x$newvar_mo)) & info == TRUE) {
|
||||
message(font_blue(paste0("NOTE: Excluded ", format(sum(is.na(x$newvar_mo), na.rm = TRUE),
|
||||
decimal.mark = decimal.mark, big.mark = big.mark),
|
||||
" isolates with a microbial ID 'NA' (column `", font_bold(col_mo), "`)")))
|
||||
decimal.mark = decimal.mark, big.mark = big.mark),
|
||||
" isolates with a microbial ID 'NA' (column `", font_bold(col_mo), "`)")))
|
||||
}
|
||||
x[which(is.na(x$newvar_mo)), "newvar_first_isolate"] <- FALSE
|
||||
|
||||
@ -465,7 +465,7 @@ filter_first_weighted_isolate <- function(x,
|
||||
col_keyantibiotics <- "keyab"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
subset(x, first_isolate(x = y,
|
||||
col_date = col_date,
|
||||
col_patient_id = col_patient_id,
|
||||
|
21
R/like.R
21
R/like.R
@ -102,14 +102,19 @@ like <- function(x, pattern, ignore.case = TRUE) {
|
||||
as.integer(x) %in% base::grep(pattern, levels(x), ignore.case = FALSE, fixed = fixed)
|
||||
} else {
|
||||
tryCatch(base::grepl(pattern, x, ignore.case = FALSE, fixed = fixed),
|
||||
error = function(e) ifelse(grepl("Invalid regexp", e$message),
|
||||
# try with perl = TRUE:
|
||||
return(base::grepl(pattern = pattern, x = x,
|
||||
ignore.case = FALSE,
|
||||
fixed = fixed,
|
||||
perl = TRUE)),
|
||||
# stop otherwise
|
||||
stop(e$message)))
|
||||
error = function(e) {
|
||||
if (grepl("invalid reg(ular )?exp", e$message, ignore.case = TRUE)) {
|
||||
# try with perl = TRUE:
|
||||
return(base::grepl(pattern = pattern,
|
||||
x = x,
|
||||
ignore.case = FALSE,
|
||||
fixed = fixed,
|
||||
perl = TRUE))
|
||||
} else {
|
||||
# stop otherwise
|
||||
stop(e$message)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
|
20
R/rsi_calc.R
20
R/rsi_calc.R
@ -40,6 +40,11 @@ rsi_calc <- function(...,
|
||||
data_vars <- dots2vars(...)
|
||||
|
||||
dots_df <- switch(1, ...)
|
||||
if (is.data.frame(dots_df)) {
|
||||
# make sure to remove all other classes like tibbles, data.tables, etc
|
||||
dots_df <- as.data.frame(dots_df, stringsAsFactors = FALSE)
|
||||
}
|
||||
|
||||
dots <- base::eval(base::substitute(base::alist(...)))
|
||||
stop_if(length(dots) == 0, "no variables selected", call = -2)
|
||||
|
||||
@ -50,6 +55,7 @@ rsi_calc <- function(...,
|
||||
|
||||
if (is.data.frame(dots_df)) {
|
||||
# data.frame passed with other columns, like: example_isolates %>% proportion_S(AMC, GEN)
|
||||
|
||||
dots <- as.character(dots)
|
||||
# remove first element, it's the data.frame
|
||||
if (length(dots) == 1) {
|
||||
@ -62,6 +68,10 @@ rsi_calc <- function(...,
|
||||
# and the old rsi function, which has "df" as name of the first parameter
|
||||
x <- dots_df
|
||||
} else {
|
||||
# get dots that are in column names already, and the ones that will be once evaluated using dots_df or global env
|
||||
# this is to support susceptibility(example_isolates, AMC, dplyr::all_of(some_vector_with_AB_names))
|
||||
dots <- c(dots[dots %in% colnames(dots_df)],
|
||||
eval(parse(text = dots[!dots %in% colnames(dots_df)]), envir = dots_df, enclos = globalenv()))
|
||||
dots_not_exist <- dots[!dots %in% colnames(dots_df)]
|
||||
stop_if(length(dots_not_exist) > 0, "column(s) not found: ", paste0("'", dots_not_exist, "'", collapse = ", "), call = -2)
|
||||
x <- dots_df[, dots, drop = FALSE]
|
||||
@ -72,10 +82,10 @@ rsi_calc <- function(...,
|
||||
} else {
|
||||
# multiple variables passed without pipe, like: proportion_S(example_isolates$AMC, example_isolates$GEN)
|
||||
x <- NULL
|
||||
try(x <- as.data.frame(dots), silent = TRUE)
|
||||
try(x <- as.data.frame(dots, stringsAsFactors = FALSE), silent = TRUE)
|
||||
if (is.null(x)) {
|
||||
# support for example_isolates %>% group_by(hospital_id) %>% summarise(amox = susceptibility(GEN, AMX))
|
||||
x <- as.data.frame(list(...))
|
||||
x <- as.data.frame(list(...), stringsAsFactors = FALSE)
|
||||
}
|
||||
}
|
||||
|
||||
@ -92,9 +102,9 @@ rsi_calc <- function(...,
|
||||
rsi_integrity_check <- character(0)
|
||||
for (i in seq_len(ncol(x))) {
|
||||
# check integrity of columns: force rsi class
|
||||
if (!is.rsi(x %>% pull(i))) {
|
||||
rsi_integrity_check <- c(rsi_integrity_check, x %>% pull(i) %>% as.character())
|
||||
x[, i] <- suppressWarnings(x %>% pull(i) %>% as.rsi()) # warning will be given later
|
||||
if (!is.rsi(x[, i, drop = TRUE])) {
|
||||
rsi_integrity_check <- c(rsi_integrity_check, as.character(x[, i, drop = TRUE]))
|
||||
x[, i] <- suppressWarnings(as.rsi(x[, i, drop = TRUE])) # warning will be given later
|
||||
print_warning <- TRUE
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user