1
0
mirror of https://github.com/msberends/AMR.git synced 2025-01-14 00:11:50 +01:00

bactid speed improvement

This commit is contained in:
dr. M.S. (Matthijs) Berends 2018-07-30 00:14:06 +02:00
parent 826694323b
commit d0a115d995
2 changed files with 76 additions and 45 deletions

View File

@ -102,6 +102,12 @@ as.bactid <- function(x) {
for (i in 1:length(x)) { for (i in 1:length(x)) {
if (identical(x.backup[i], "")) {
# empty values
x[i] <- NA
failures <- c(failures, x.fullbackup[i])
next
}
if (x.fullbackup[i] %in% AMR::microorganisms$bactid) { if (x.fullbackup[i] %in% AMR::microorganisms$bactid) {
# is already a valid bactid # is already a valid bactid
x[i] <- x.fullbackup[i] x[i] <- x.fullbackup[i]
@ -115,7 +121,8 @@ as.bactid <- function(x) {
if (tolower(x[i]) == '^e.*coli$') { if (tolower(x[i]) == '^e.*coli$') {
# avoid detection of Entamoeba coli in case of E. coli # avoid detection of Entamoeba coli in case of E. coli
x[i] <- 'Escherichia coli' x[i] <- 'ESCCOL'
next
} }
if (tolower(x[i]) == '^h.*influenzae$') { if (tolower(x[i]) == '^h.*influenzae$') {
# avoid detection of Haematobacter influenzae in case of H. influenzae # avoid detection of Haematobacter influenzae in case of H. influenzae
@ -168,32 +175,48 @@ as.bactid <- function(x) {
} }
# let's try the ID's first # let's try the ID's first
found <- AMR::microorganisms %>% filter(bactid == x.backup[i]) found <- AMR::microorganisms[which(AMR::microorganisms$bactid == x.backup[i]),]$bactid
if (length(found) > 0) {
x[i] <- found[1L]
next
}
if (nrow(found) == 0) {
# now try exact match # now try exact match
found <- AMR::microorganisms %>% filter(fullname == x[i]) found <- AMR::microorganisms[which(AMR::microorganisms$fullname == x[i]),]$bactid
if (length(found) > 0) {
x[i] <- found[1L]
next
} }
if (nrow(found) == 0) {
# try any match # try any match
found <- AMR::microorganisms %>% filter(fullname %like% x[i]) found <- AMR::microorganisms[which(AMR::microorganisms$fullname %like% x[i]),]$bactid
if (length(found) > 0) {
x[i] <- found[1L]
next
} }
if (nrow(found) == 0) {
# try exact match of only genus, with 'species' attached # try exact match of only genus, with 'species' attached
# (e.g. this prevents Streptococcus for becoming Peptostreptococcus, since "p" < "s") # (e.g. this prevents Streptococcus for becoming Peptostreptococcus, since "p" < "s")
found <- AMR::microorganisms %>% filter(fullname == x_species[i]) found <- AMR::microorganisms[which(AMR::microorganisms$fullname == x_species[i]),]$bactid
if (length(found) > 0) {
x[i] <- found[1L]
next
} }
if (nrow(found) == 0) {
# try any match of only genus, with 'species' attached # try any match of only genus, with 'species' attached
found <- AMR::microorganisms %>% filter(fullname %like% x_species[i]) found <- AMR::microorganisms[which(AMR::microorganisms$fullname %like% x_species[i]),]$bactid
if (length(found) > 0) {
x[i] <- found[1L]
next
} }
if (nrow(found) == 0) {
# search for GLIMS code # search for GLIMS code
if (toupper(x.backup[i]) %in% toupper(AMR::microorganisms.umcg$mocode)) { found <- AMR::microorganisms.umcg[which(toupper(AMR::microorganisms.umcg$mocode) == toupper(x.backup[i])),]$bactid
found <- AMR::microorganisms.umcg %>% filter(toupper(mocode) == toupper(x.backup[i])) if (length(found) > 0) {
x[i] <- found[1L]
next
} }
}
if (nrow(found) == 0) {
# try splitting of characters and then find ID # try splitting of characters and then find ID
# like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus # like esco = E. coli, klpn = K. pneumoniae, stau = S. aureus
x_split <- x x_split <- x
@ -201,9 +224,12 @@ as.bactid <- function(x) {
x_split[i] <- paste0(x.backup[i] %>% substr(1, x_length / 2) %>% trimws(), x_split[i] <- paste0(x.backup[i] %>% substr(1, x_length / 2) %>% trimws(),
'.* ', '.* ',
x.backup[i] %>% substr((x_length / 2) + 1, x_length) %>% trimws()) x.backup[i] %>% substr((x_length / 2) + 1, x_length) %>% trimws())
found <- AMR::microorganisms %>% filter(fullname %like% paste0('^', x_split[i])) found <- AMR::microorganisms[which(AMR::microorganisms$fullname %like% paste0('^', x_split[i])),]$bactid
if (length(found) > 0) {
x[i] <- found[1L]
next
} }
if (nrow(found) == 0) {
# try any match with text before and after original search string # try any match with text before and after original search string
# so "negative rods" will be "GNR" # so "negative rods" will be "GNR"
if (x.backup[i] %like% "^Gram") { if (x.backup[i] %like% "^Gram") {
@ -212,16 +238,17 @@ as.bactid <- function(x) {
x.backup[i] <- trimws(x.backup[i], which = "both") x.backup[i] <- trimws(x.backup[i], which = "both")
} }
if (!is.na(x.backup[i])) { if (!is.na(x.backup[i])) {
found <- AMR::microorganisms %>% filter(fullname %like% x.backup[i]) found <- AMR::microorganisms[which(AMR::microorganisms$fullname %like% x.backup[i]),]$bactid
if (length(found) > 0) {
x[i] <- found[1L]
next
} }
} }
if (nrow(found) != 0 & x.backup[i] != "") { # not found
x[i] <- as.character(found[1, 'bactid'])
} else {
x[i] <- NA_character_ x[i] <- NA_character_
failures <- c(failures, x.fullbackup[i]) failures <- c(failures, x.fullbackup[i])
}
} }
failures <- failures[!failures %in% c(NA, NULL, NaN)] failures <- failures[!failures %in% c(NA, NULL, NaN)]

View File

@ -68,4 +68,8 @@ test_that("as.bactid works", {
expect_equal(nrow(data.frame(test = as.bactid("ESCCOL"))), expect_equal(nrow(data.frame(test = as.bactid("ESCCOL"))),
1) 1)
# check empty values
expect_equal(as.character(suppressWarnings(as.bactid(""))),
NA_character_)
}) })