1
0
mirror of https://github.com/msberends/AMR.git synced 2026-04-28 12:23:54 +02:00

Add add_if_missing parameter to control NA handling in interpretive rules (#264)

This commit is contained in:
Matthijs Berends
2026-04-21 21:53:43 +02:00
committed by GitHub
parent fb8758f36b
commit 8ff5d4472a
46 changed files with 1232 additions and 1016 deletions

View File

@@ -262,9 +262,9 @@ get_synonyms <- function(CID, clean = TRUE) {
if (is.na(CID[i])) {
next
}
all_cids <- CID[i]
# we will now get the closest compounds with a 96% threshold
similar_cids <- tryCatch(
data.table::fread(
@@ -281,7 +281,7 @@ get_synonyms <- function(CID, clean = TRUE) {
# leave out all CIDs that we have in our antimicrobials dataset to prevent duplication
similar_cids <- similar_cids[!similar_cids %in% antimicrobials$cid[!is.na(antimicrobials$cid)]]
all_cids <- unique(c(all_cids, similar_cids))
# for each one, we are getting the synonyms
current_syns <- character(0)
for (j in seq_len(length(all_cids))) {
@@ -297,9 +297,9 @@ get_synonyms <- function(CID, clean = TRUE) {
)[[1]],
error = function(e) NA_character_
)
Sys.sleep(0.05)
if (clean == TRUE) {
# remove text between brackets
synonyms_txt <- trimws(gsub(
@@ -319,16 +319,16 @@ get_synonyms <- function(CID, clean = TRUE) {
synonyms_txt <- gsub("[^a-z]+$", "", ignore.case = TRUE, synonyms_txt)
# only length 5 to 20 and lower-case names starting with a capital letter
synonyms_txt <- synonyms_txt[nchar(synonyms_txt) %in% c(5:20) &
grepl("^[A-Z][a-z]+$", synonyms_txt, ignore.case = FALSE)]
grepl("^[A-Z][a-z]+$", synonyms_txt, ignore.case = FALSE)]
synonyms_txt <- unlist(strsplit(synonyms_txt, ";", fixed = TRUE))
}
# synonyms must not be set for other agents, so remove the duplicates
synonyms_txt <- synonyms_txt[!synonyms_txt %in% unlist(synonyms)]
current_syns <- c(current_syns, synonyms_txt)
}
current_syns <- unique(trimws(current_syns[tolower(current_syns) %in% unique(tolower(current_syns))]))
synonyms[i] <- list(sort(current_syns))
}
@@ -763,10 +763,12 @@ antimicrobials[which(antimicrobials$ab %in% c("CYC", "LNZ", "THA", "TZD")), "gro
# add efflux
effl <- antimicrobials |>
filter(ab == "ACM") |>
mutate(ab = as.character("EFF"),
cid = NA_real_,
name = "Efflux",
group = "Other")
mutate(
ab = as.character("EFF"),
cid = NA_real_,
name = "Efflux",
group = "Other"
)
antimicrobials <- antimicrobials |>
mutate(ab = as.character(ab)) |>
bind_rows(effl)
@@ -777,9 +779,11 @@ antimicrobials[which(antimicrobials$ab == "EFF"), "abbreviations"][[1]] <- list(
# add clindamycin inducible screening
clin <- antimicrobials |>
filter(ab == "FOX1") |>
mutate(ab = as.character("CLI-S"),
name = "Clindamycin inducible screening",
group = "Macrolides/lincosamides")
mutate(
ab = as.character("CLI-S"),
name = "Clindamycin inducible screening",
group = "Macrolides/lincosamides"
)
antimicrobials <- antimicrobials |>
mutate(ab = as.character(ab)) |>
bind_rows(clin)
@@ -791,109 +795,123 @@ antimicrobials <- antimicrobials |>
bind_rows(
antimicrobials |>
filter(ab == "EFF") |>
mutate(ab = "BLA-S",
name = paste("Beta-lactamase", "screening test"),
cid = NA_real_,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("beta-lactamase", "betalactamase", "bl screen", "blt screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0))),
mutate(
ab = "BLA-S",
name = paste("Beta-lactamase", "screening test"),
cid = NA_real_,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("beta-lactamase", "betalactamase", "bl screen", "blt screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0))
),
antimicrobials |>
filter(ab == "PEN") |>
mutate(ab = "PEN-S",
name = paste(name, "screening test"),
cid = NA,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("pen screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0))),
mutate(
ab = "PEN-S",
name = paste(name, "screening test"),
cid = NA,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("pen screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0))
),
antimicrobials |>
filter(ab == "OXA") |>
mutate(ab = "OXA-S",
name = paste(name, "screening test"),
cid = NA,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("oxa screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0))),
mutate(
ab = "OXA-S",
name = paste(name, "screening test"),
cid = NA,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("oxa screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0))
),
antimicrobials |>
filter(ab == "PEF") |>
mutate(ab = "PEF-S",
name = paste(name, "screening test"),
cid = NA,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("pef screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0))),
mutate(
ab = "PEF-S",
name = paste(name, "screening test"),
cid = NA,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("pef screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0))
),
antimicrobials |>
filter(ab == "NAL") |>
mutate(ab = "NAL-S",
name = paste(name, "screening test"),
cid = NA,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("nal screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0))),
mutate(
ab = "NAL-S",
name = paste(name, "screening test"),
cid = NA,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("nal screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0))
),
antimicrobials |>
filter(ab == "NOR") |>
mutate(ab = "NOR-S",
name = paste(name, "screening test"),
cid = NA,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("nor screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0))),
mutate(
ab = "NOR-S",
name = paste(name, "screening test"),
cid = NA,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("nor screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0))
),
antimicrobials |>
filter(ab == "TCY") |>
mutate(ab = "TCY-S",
name = paste(name, "screening test"),
cid = NA,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("tcy screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0)))
mutate(
ab = "TCY-S",
name = paste(name, "screening test"),
cid = NA,
atc = list(character(0)),
atc_group1 = NA_character_,
atc_group2 = NA_character_,
abbreviations = list(c("tcy screen")),
synonyms = list(character(0)),
oral_ddd = NA_real_,
oral_units = NA_character_,
iv_ddd = NA_real_,
iv_units = NA_character_,
loinc = list(character(0))
)
)
@@ -919,16 +937,20 @@ antimicrobials <- antimicrobials |>
antimicrobials |>
filter(ab == "FPE") |>
mutate(ab = as.character(ab)) |>
mutate(ab = "FTA",
name = "Cefepime/taniborbactam",
cid = NA_real_),
mutate(
ab = "FTA",
name = "Cefepime/taniborbactam",
cid = NA_real_
),
antimicrobials |>
filter(ab == "TBP") |>
mutate(ab = as.character(ab)) |>
mutate(ab = "TAN",
name = "Taniborbactam",
cid = 76902493,
abbreviations = list("VNRX-5133"))
mutate(
ab = "TAN",
name = "Taniborbactam",
cid = 76902493,
abbreviations = list("VNRX-5133")
)
)
antimicrobials <- antimicrobials |>
@@ -936,39 +958,51 @@ antimicrobials <- antimicrobials |>
bind_rows(
antimicrobials |>
filter(ab == "CTB") |>
mutate(ab = "CTA",
cid = NA_real_,
name = "Ceftibuten/avibactam") |>
mutate(
ab = "CTA",
cid = NA_real_,
name = "Ceftibuten/avibactam"
) |>
select(1:4),
antimicrobials |>
filter(ab == "KAC") |>
mutate(ab = "KAS",
cid = NA_real_,
name = "Kasugamycin") |>
mutate(
ab = "KAS",
cid = NA_real_,
name = "Kasugamycin"
) |>
select(1:4),
antimicrobials |>
filter(ab == "PRI") |>
mutate(ab = "OST",
cid = NA_real_,
name = "Ostreogrycin") |>
mutate(
ab = "OST",
cid = NA_real_,
name = "Ostreogrycin"
) |>
select(1:4),
antimicrobials |>
filter(ab == "PRI") |>
mutate(ab = "THS",
cid = NA_real_,
name = "Thiostrepton") |>
mutate(
ab = "THS",
cid = NA_real_,
name = "Thiostrepton"
) |>
select(1, 3),
antimicrobials |>
filter(ab == "CLA1") |>
mutate(ab = "XER",
cid = NA_real_,
name = "Xeruborbactam") |>
mutate(
ab = "XER",
cid = NA_real_,
name = "Xeruborbactam"
) |>
select(1:4),
antimicrobials |>
filter(ab == "BLM") |>
mutate(ab = "ZOR",
cid = NA_real_,
name = "Zorbamycin") |>
mutate(
ab = "ZOR",
cid = NA_real_,
name = "Zorbamycin"
) |>
select(1:4),
)
@@ -977,9 +1011,11 @@ antimicrobials <- antimicrobials |>
bind_rows(
antimicrobials |>
filter(ab == "NOV") |>
mutate(ab = "CLB",
cid = 54706138,
name = "Clorobiocin") |>
mutate(
ab = "CLB",
cid = 54706138,
name = "Clorobiocin"
) |>
select(1:4),
)
@@ -990,7 +1026,7 @@ get_atc_table <- function(ab_name, type = "human") {
if (type == "human") {
url <- "https://atcddd.fhi.no/atc_ddd_index/"
} else if (type == "veterinary") {
url <- "https://atcddd.fhi.no/atcvet/atcvet_index/"
url <- "https://atcddd.fhi.no/atcvet/atcvet_index/"
} else {
stop("invalid type")
}
@@ -1055,8 +1091,10 @@ to_update <- 1:nrow(antimicrobials)
# or just the empty ones:
to_update <- which(sapply(antimicrobials$atc, function(x) length(x[!is.na(x)])) == 0)
updated_atc <- lapply(seq_len(length(to_update)),
function(x) NA_character_)
updated_atc <- lapply(
seq_len(length(to_update)),
function(x) NA_character_
)
# this takes around 10 minutes for the whole table (some ABx are skipped and go faster)

View File

@@ -72,12 +72,12 @@ whonet_organisms <- whonet_organisms_raw |>
ORGANISM = if_else(ORGANISM_CODE == "ckr", "Candida krusei", ORGANISM)
) |>
# try to match on GBIF identifier
left_join(microorganisms |> distinct(mo, gbif, status) |> filter(!is.na(gbif)), by = c("GBIF_TAXON_ID" = "gbif")) |>
left_join(microorganisms |> distinct(mo, gbif, status) |> filter(!is.na(gbif)), by = c("GBIF_TAXON_ID" = "gbif")) |>
# remove duplicates
arrange(ORGANISM_CODE, GBIF_TAXON_ID, status) |>
distinct(ORGANISM_CODE, .keep_all = TRUE) |>
distinct(ORGANISM_CODE, .keep_all = TRUE) |>
# add Enterobacterales, which is a subkingdom code in their data
bind_rows(data.frame(ORGANISM_CODE = "ebc", ORGANISM = "Enterobacterales", mo = as.mo("Enterobacterales"))) |>
bind_rows(data.frame(ORGANISM_CODE = "ebc", ORGANISM = "Enterobacterales", mo = as.mo("Enterobacterales"))) |>
arrange(ORGANISM)
@@ -88,31 +88,39 @@ unmatched <- whonet_organisms |> filter(is.na(mo))
# generate the mo codes and add their names
message("Getting MO codes for WHONET input...")
unmatched <- unmatched |>
mutate(mo = as.mo(gsub("(sero[a-z]*| nontypable| non[-][a-zA-Z]+|var[.]| not .*|sp[.],.*|, .*variant.*|, .*toxin.*|, microaer.*| beta-haem[.])", "", ORGANISM),
minimum_matching_score = 0.55,
keep_synonyms = TRUE,
language = "en"),
mo = case_when(ORGANISM %like% "Anaerobic" & ORGANISM %like% "negative" ~ as.mo("B_ANAER-NEG"),
ORGANISM %like% "Anaerobic" & ORGANISM %like% "positive" ~ as.mo("B_ANAER-POS"),
ORGANISM %like% "Anaerobic" ~ as.mo("B_ANAER"),
TRUE ~ mo),
mo_name = mo_name(mo,
keep_synonyms = TRUE,
language = "en"))
unmatched <- unmatched |>
mutate(
mo = as.mo(gsub("(sero[a-z]*| nontypable| non[-][a-zA-Z]+|var[.]| not .*|sp[.],.*|, .*variant.*|, .*toxin.*|, microaer.*| beta-haem[.])", "", ORGANISM),
minimum_matching_score = 0.55,
keep_synonyms = TRUE,
language = "en"
),
mo = case_when(
ORGANISM %like% "Anaerobic" & ORGANISM %like% "negative" ~ as.mo("B_ANAER-NEG"),
ORGANISM %like% "Anaerobic" & ORGANISM %like% "positive" ~ as.mo("B_ANAER-POS"),
ORGANISM %like% "Anaerobic" ~ as.mo("B_ANAER"),
TRUE ~ mo
),
mo_name = mo_name(mo,
keep_synonyms = TRUE,
language = "en"
)
)
# check if coercion at least resembles the first part (genus)
unmatched <- unmatched |>
unmatched <- unmatched |>
mutate(
first_part = sapply(ORGANISM, function(x) strsplit(gsub("[^a-zA-Z _-]+", "", x), " ")[[1]][1], USE.NAMES = FALSE),
keep = mo_name %like_case% first_part | ORGANISM %like% "Gram " | ORGANISM == "Other" | ORGANISM %like% "anaerobic") |>
keep = mo_name %like_case% first_part | ORGANISM %like% "Gram " | ORGANISM == "Other" | ORGANISM %like% "anaerobic"
) |>
arrange(keep)
unmatched |> View()
unmatched <- unmatched |>
filter(keep == TRUE)
organisms <- matched |> transmute(code = toupper(ORGANISM_CODE), group = SPECIES_GROUP, mo) |>
bind_rows(unmatched |> transmute(code = toupper(ORGANISM_CODE), group = SPECIES_GROUP, mo)) |>
mutate(name = mo_name(mo, keep_synonyms = TRUE)) |>
organisms <- matched |>
transmute(code = toupper(ORGANISM_CODE), group = SPECIES_GROUP, mo) |>
bind_rows(unmatched |> transmute(code = toupper(ORGANISM_CODE), group = SPECIES_GROUP, mo)) |>
mutate(name = mo_name(mo, keep_synonyms = TRUE)) |>
arrange(code)
# self-defined codes in the MO table must be retained
@@ -125,25 +133,33 @@ organisms <- organisms |>
# some subspecies exist while their parent species do not; add them at the species level:
subspp <- organisms |>
filter(mo_species(mo, keep_synonyms = TRUE) == mo_subspecies(mo, keep_synonyms = TRUE) &
mo_species(mo, keep_synonyms = TRUE) != "" &
mo_genus(mo, keep_synonyms = TRUE) != "Salmonella") |>
mutate(mo = as.mo(paste(mo_genus(mo, keep_synonyms = TRUE),
mo_species(mo, keep_synonyms = TRUE)),
keep_synonyms = TRUE),
name = mo_name(mo, keep_synonyms = TRUE))
mo_species(mo, keep_synonyms = TRUE) != "" &
mo_genus(mo, keep_synonyms = TRUE) != "Salmonella") |>
mutate(
mo = as.mo(
paste(
mo_genus(mo, keep_synonyms = TRUE),
mo_species(mo, keep_synonyms = TRUE)
),
keep_synonyms = TRUE
),
name = mo_name(mo, keep_synonyms = TRUE)
)
organisms <- organisms |>
filter(!code %in% subspp$code) |>
bind_rows(subspp) |>
arrange(code)
# add the groups
organisms <- organisms |>
bind_rows(tibble(code = organisms |> filter(!is.na(group)) |> pull(group) |> unique(),
group = NA,
mo = organisms |> filter(!is.na(group)) |> pull(group) |> unique() |> as.mo(keep_synonyms = TRUE),
name = mo_name(mo, keep_synonyms = TRUE))) |>
arrange(code, group) |>
select(-group) |>
organisms <- organisms |>
bind_rows(tibble(
code = organisms |> filter(!is.na(group)) |> pull(group) |> unique(),
group = NA,
mo = organisms |> filter(!is.na(group)) |> pull(group) |> unique() |> as.mo(keep_synonyms = TRUE),
name = mo_name(mo, keep_synonyms = TRUE)
)) |>
arrange(code, group) |>
select(-group) |>
distinct()
# no XXX
organisms <- organisms |> filter(code != "XXX")
@@ -153,7 +169,7 @@ organisms <- organisms |> filter(code != "XXX")
# 2025-04-20 still the case
# 2026-03-27 still the case, but fixed using `existing_codes` above
organisms |> filter(code == "SGM")
# organisms <- organisms |>
# organisms <- organisms |>
# filter(!(code == "SGM" & name %like% "Streptococcus"))
# this must be empty:
organisms$code[organisms$code |> duplicated()]
@@ -165,12 +181,12 @@ saveRDS(organisms, "data-raw/organisms.rds", version = 2)
#---
# update microorganisms.codes with the latest WHONET codes
microorganisms.codes2 <- microorganisms.codes |>
microorganisms.codes2 <- microorganisms.codes |>
# remove all old WHONET codes, whether we (in the end) keep them or not
filter(!toupper(code) %in% toupper(organisms$code)) |>
filter(!toupper(code) %in% toupper(organisms$code)) |>
# and add the new ones
bind_rows(organisms |> select(code, mo)) |>
arrange(code) |>
bind_rows(organisms |> select(code, mo)) |>
arrange(code) |>
distinct(code, .keep_all = TRUE)
# new codes:
microorganisms.codes2$code[which(!microorganisms.codes2$code %in% microorganisms.codes$code)]
@@ -214,47 +230,53 @@ devtools::load_all()
# now that we have the correct MO codes, get the breakpoints and convert them
whonet_breakpoints_raw |>
count(GUIDELINES, BREAKPOINT_TYPE) |>
pivot_wider(names_from = BREAKPOINT_TYPE, values_from = n) |>
whonet_breakpoints_raw |>
count(GUIDELINES, BREAKPOINT_TYPE) |>
pivot_wider(names_from = BREAKPOINT_TYPE, values_from = n) |>
janitor::adorn_totals(where = c("row", "col"))
whonet_breakpoints_raw |>
whonet_breakpoints_raw |>
filter(YEAR == format(Sys.Date(), "%Y")) |>
count(GUIDELINES, YEAR, BREAKPOINT_TYPE) |>
pivot_wider(names_from = BREAKPOINT_TYPE, values_from = n) |>
count(GUIDELINES, YEAR, BREAKPOINT_TYPE) |>
pivot_wider(names_from = BREAKPOINT_TYPE, values_from = n) |>
janitor::adorn_totals(where = c("row", "col"))
# compared to current
AMR::clinical_breakpoints |>
count(GUIDELINES = gsub("[^a-zA-Z]", "", guideline), type) |>
arrange(tolower(type)) |>
pivot_wider(names_from = type, values_from = n) |>
pivot_wider(names_from = type, values_from = n) |>
as.data.frame() |>
janitor::adorn_totals(where = c("row", "col"))
breakpoints <- whonet_breakpoints_raw |>
mutate(code = toupper(ORGANISM_CODE)) |>
left_join(bind_rows(microorganisms.codes |> filter(!code %in% c("ALL", "GEN")),
# GEN (Generic) and ALL (All) are PK/PD codes
data.frame(code = c("ALL", "GEN"),
mo = rep(as.mo("UNKNOWN"), 2))))
left_join(bind_rows(
microorganisms.codes |> filter(!code %in% c("ALL", "GEN")),
# GEN (Generic) and ALL (All) are PK/PD codes
data.frame(
code = c("ALL", "GEN"),
mo = rep(as.mo("UNKNOWN"), 2)
)
))
# these ones lack an MO name, they cannot be used:
unknown <- breakpoints |>
filter(is.na(mo)) |>
pull(code) |>
unique()
breakpoints |>
filter(code %in% unknown) |>
breakpoints |>
filter(code %in% unknown) |>
count(GUIDELINES, YEAR, ORGANISM_CODE, BREAKPOINT_TYPE, sort = TRUE)
# 2025-04-20: these codes are currently: cps, fso. Their meaning is unknown (they are not in the MO list of WHONET), and they only have ECOFFs, so remove them:
breakpoints <- breakpoints |>
breakpoints <- breakpoints |>
filter(!is.na(mo))
# and these ones have unknown antibiotics according to WHONET itself:
breakpoints |>
filter(!WHONET_ABX_CODE %in% whonet_antibiotics_raw$WHONET_ABX_CODE) |>
breakpoints |>
filter(!WHONET_ABX_CODE %in% whonet_antibiotics_raw$WHONET_ABX_CODE) |>
count(GUIDELINES, WHONET_ABX_CODE) |>
mutate(ab = as.ab(WHONET_ABX_CODE, fast_mode = TRUE),
ab_name = ab_name(ab))
mutate(
ab = as.ab(WHONET_ABX_CODE, fast_mode = TRUE),
ab_name = ab_name(ab)
)
# 2025-04-20: these codes are currently: CFC, ROX, FIX, and N/A. All have the right replacements in `antimicrobials`, so we can safely use as.ab() later on
# the NAs are for M. tuberculosis, they are empty breakpoints
breakpoints <- breakpoints |>
@@ -264,7 +286,7 @@ breakpoints <- breakpoints |>
## Build new breakpoints table ----
breakpoints_new <- breakpoints |>
filter(!is.na(WHONET_ABX_CODE)) |>
filter(!is.na(WHONET_ABX_CODE)) |>
transmute(
guideline = paste(GUIDELINES, YEAR),
type = ifelse(BREAKPOINT_TYPE == "ECOFF", "ECOFF", tolower(BREAKPOINT_TYPE)),
@@ -301,22 +323,26 @@ breakpoints_new <- breakpoints |>
distinct(guideline, type, host, ab, mo, method, site, breakpoint_S, .keep_all = TRUE)
# fix reference table names
breakpoints_new |> filter(guideline %like% "EUCAST", is.na(ref_tbl)) |> View()
breakpoints_new <- breakpoints_new |>
mutate(ref_tbl = case_when(is.na(ref_tbl) & guideline %like% "EUCAST 202" ~ lead(ref_tbl),
is.na(ref_tbl) ~ "Unknown",
TRUE ~ ref_tbl))
breakpoints_new |>
filter(guideline %like% "EUCAST", is.na(ref_tbl)) |>
View()
breakpoints_new <- breakpoints_new |>
mutate(ref_tbl = case_when(
is.na(ref_tbl) & guideline %like% "EUCAST 202" ~ lead(ref_tbl),
is.na(ref_tbl) ~ "Unknown",
TRUE ~ ref_tbl
))
# clean disk zones
breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_S"] <- as.double(as.disk(breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_S", drop = TRUE]))
breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_R"] <- as.double(as.disk(breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_R", drop = TRUE]))
# regarding animal breakpoints, CLSI has adults and foals for horses, but only for amikacin - only keep adult horses
breakpoints_new |>
breakpoints_new |>
filter(host %like% "foal") |>
count(guideline, host, ab)
breakpoints_new <- breakpoints_new |>
filter(host %unlike% "foal") |>
breakpoints_new <- breakpoints_new |>
filter(host %unlike% "foal") |>
mutate(host = ifelse(host %like% "horse", "horse", host))
# FIXES FOR WHONET ERRORS ----
@@ -324,8 +350,12 @@ m <- unique(as.double(as.mic(levels(as.mic(1)))))
# WHONET has no >1024 but instead uses 1025, 513, and 129, so as.mic() cannot be used to clean.
# instead, raise these to the next higher valid MIC factor level:
breakpoints_new |> filter(method == "MIC" & (!breakpoint_S %in% c(m, NA))) |> distinct(breakpoint_S)
breakpoints_new |> filter(method == "MIC" & (!breakpoint_R %in% c(m, NA))) |> distinct(breakpoint_R)
breakpoints_new |>
filter(method == "MIC" & (!breakpoint_S %in% c(m, NA))) |>
distinct(breakpoint_S)
breakpoints_new |>
filter(method == "MIC" & (!breakpoint_R %in% c(m, NA))) |>
distinct(breakpoint_R)
breakpoints_new[which(breakpoints_new$breakpoint_R == 129), "breakpoint_R"] <- m[which(m == 128) + 1]
breakpoints_new[which(breakpoints_new$breakpoint_R == 257), "breakpoint_R"] <- m[which(m == 256) + 1]
breakpoints_new[which(breakpoints_new$breakpoint_R == 513), "breakpoint_R"] <- m[which(m == 512) + 1]
@@ -353,12 +383,12 @@ breakpoints_new$mo[breakpoints_new$guideline %like% "EUCAST" & breakpoints_new$m
breakpoints_new |>
filter(method == "MIC" & guideline %like% "EUCAST" & mo %like% as.mo("B_HMPHL")) |>
count(guideline, mo)
breakpoints_new <- breakpoints_new |>
breakpoints_new <- breakpoints_new |>
bind_rows(
breakpoints_new |>
filter(guideline %like% "EUCAST", mo == "B_HMPHL_INFL") |>
filter(guideline %like% "EUCAST", mo == "B_HMPHL_INFL") |>
mutate(mo = as.mo("B_HMPHL_PRNF"))
) |>
) |>
arrange(desc(guideline), mo, ab, type, host, method) |>
distinct()
# WHONET includes Achromobacter denitrificans in its A. xylosoxidans table; it must be removed
@@ -387,7 +417,9 @@ breakpoints_new <- breakpoints_new |> filter(!wrong)
# 2025-04-20/ fixed now
# For EUCAST 2026, WHONET sets TMP breakpoints for all Klebsiella, but these now only apply to non-aerogenes species
kleb_spp <- microorganisms |> filter(rank == "species", genus == "Klebsiella", !species %in% c("", "aerogenes")) |> pull(mo)
kleb_spp <- microorganisms |>
filter(rank == "species", genus == "Klebsiella", !species %in% c("", "aerogenes")) |>
pull(mo)
kleb_tmp_mic <- breakpoints_new |>
filter(guideline == "EUCAST 2026", method == "MIC", ab == "TMP", mo == as.mo("Klebsiella")) |>
uncount(length(kleb_spp)) |>
@@ -398,8 +430,10 @@ kleb_tmp_disk <- breakpoints_new |>
mutate(mo = kleb_spp)
breakpoints_new <- breakpoints_new |>
filter(!(guideline == "EUCAST 2026" & method == "MIC" & ab == "TMP" & mo == as.mo("Klebsiella"))) |>
bind_rows(kleb_tmp_mic,
kleb_tmp_disk)
bind_rows(
kleb_tmp_mic,
kleb_tmp_disk
)
# WHONET contains wrong EUCAST breakpoints for enterococci/SXT: disk should be 23/23, not 21/50, and MIC should be 1/1, not 0.032/1
# applies to all previous years, since v11 (2011)
@@ -441,14 +475,14 @@ breakpoints_new <- breakpoints_new |>
# check the strange duplicates
breakpoints_new |>
breakpoints_new |>
mutate(id = paste(guideline, type, host, method, site, mo, ab, uti)) %>%
filter(id %in% .$id[which(duplicated(id))]) |>
filter(id %in% .$id[which(duplicated(id))]) |>
arrange(desc(guideline)) |>
View()
# 2024-06-19/ mostly ECOFFs, but there's no explanation in the whonet_breakpoints_raw df, we have to remove duplicates
# 2025-04-20/ same, most important one seems M. tuberculosis in CLSI (also in 2025)
breakpoints_new <- breakpoints_new |>
breakpoints_new <- breakpoints_new |>
distinct(guideline, type, host, method, site, mo, ab, uti, .keep_all = TRUE)
@@ -469,7 +503,7 @@ dim(clinical_breakpoints)
# SAVE TO PACKAGE ----
# determine rank again now that some changes were made on taxonomic level (genus -> species)
breakpoints_new <- breakpoints_new |>
breakpoints_new <- breakpoints_new |>
mutate(rank_index = case_when(
mo_rank(mo, keep_synonyms = TRUE) %like% "(infra|sub)" ~ 1,
mo_rank(mo, keep_synonyms = TRUE) == "species" ~ 2,

View File

@@ -649,7 +649,9 @@ taxonomy_mycobank <- taxonomy_mycobank %>%
arrange(fullname)
taxonomy_mycobank %>% count(rank, sort = TRUE)
taxonomy_mycobank %>% filter(rank %like% "#") %>% count(rank)
taxonomy_mycobank %>%
filter(rank %like% "#") %>%
count(rank)
taxonomy_mycobank3 <- taxonomy_mycobank
@@ -2546,7 +2548,9 @@ taxonomy %>%
arrange(mo) %>%
View()
# keep the firsts
taxonomy <- taxonomy %>% arrange(mo) %>% distinct(mo, .keep_all = TRUE)
taxonomy <- taxonomy %>%
arrange(mo) %>%
distinct(mo, .keep_all = TRUE)
# are fullnames unique?
taxonomy %>%
@@ -2997,7 +3001,9 @@ taxonomy$rank[which(taxonomy$fullname %like% "unknown")] <- "(unknown rank)"
# this happened in early 2025, check that MO codes do not have repeated elements
# fixed it then like this: microorganisms$mo <- gsub("B_SCLLM_CNNM_LNSM_LNSM_LNSM_LNSM", "B_SCLLM_CNNM", microorganisms$mo)
taxonomy |> filter(mo %like% "_.*_.*_.*_") |> View()
taxonomy |>
filter(mo %like% "_.*_.*_.*_") |>
View()
fix_old_mos <- function(dataset) {
@@ -3085,7 +3091,9 @@ microorganisms <- taxonomy
# set class <mo>
class(microorganisms$mo) <- c("mo", "character")
microorganisms <- microorganisms %>% arrange(fullname) %>% df_remove_nonASCII()
microorganisms <- microorganisms %>%
arrange(fullname) %>%
df_remove_nonASCII()
usethis::use_data(
microorganisms,
overwrite = TRUE,

View File

@@ -59,72 +59,101 @@ whonet_organisms <- whonet_organisms %>%
mutate(
# this one was called Issatchenkia orientalis, but it should be:
ORGANISM = if_else(ORGANISM_CODE == "ckr", "Candida krusei", ORGANISM)
) %>%
) %>%
# try to match on GBIF identifier
left_join(microorganisms %>% distinct(mo, gbif, status) %>% filter(!is.na(gbif)), by = c("GBIF_TAXON_ID" = "gbif")) %>%
left_join(microorganisms %>% distinct(mo, gbif, status) %>% filter(!is.na(gbif)), by = c("GBIF_TAXON_ID" = "gbif")) %>%
# remove duplicates
arrange(ORGANISM_CODE, GBIF_TAXON_ID, status) %>%
distinct(ORGANISM_CODE, .keep_all = TRUE) %>%
distinct(ORGANISM_CODE, .keep_all = TRUE) %>%
# add Enterobacterales, which is a subkingdom code in their data
bind_rows(data.frame(ORGANISM_CODE = "ebc", ORGANISM = "Enterobacterales", mo = as.mo("Enterobacterales"))) %>%
bind_rows(data.frame(ORGANISM_CODE = "ebc", ORGANISM = "Enterobacterales", mo = as.mo("Enterobacterales"))) %>%
arrange(ORGANISM)
# check non-existing species groups in the microorganisms table
whonet_organisms %>%
filter(!is.na(SPECIES_GROUP)) %>%
group_by(SPECIES_GROUP) %>%
summarise(complex = ORGANISM[ORGANISM %like% " (group|complex)"][1],
organisms = paste0(n(), ": ", paste(sort(unique(ORGANISM)), collapse = ", "))) %>%
summarise(
complex = ORGANISM[ORGANISM %like% " (group|complex)"][1],
organisms = paste0(n(), ": ", paste(sort(unique(ORGANISM)), collapse = ", "))
) %>%
filter(!SPECIES_GROUP %in% microorganisms.codes$code)
# create the species group data set ----
microorganisms.groups <- whonet_organisms %>%
# these will not be translated well
filter(!ORGANISM %in% c("Trueperella pyogenes-like bacteria",
"Mycobacterium suricattae",
"Mycobacterium canetti")) %>%
filter(!ORGANISM %in% c(
"Trueperella pyogenes-like bacteria",
"Mycobacterium suricattae",
"Mycobacterium canetti"
)) %>%
filter(!is.na(SPECIES_GROUP), SPECIES_GROUP != ORGANISM_CODE) %>%
transmute(mo_group = as.mo(SPECIES_GROUP),
mo = ifelse(is.na(mo),
as.character(as.mo(ORGANISM, keep_synonyms = TRUE, minimum_matching_score = 0)),
mo)) %>%
transmute(
mo_group = as.mo(SPECIES_GROUP),
mo = ifelse(is.na(mo),
as.character(as.mo(ORGANISM, keep_synonyms = TRUE, minimum_matching_score = 0)),
mo
)
) %>%
# add our own CoNS and CoPS, WHONET does not strictly follow Becker et al. (2014, 2019, 2020)
filter(mo_group != as.mo("CoNS")) %>%
bind_rows(tibble(mo_group = as.mo("CoNS"), mo = MO_CONS)) %>%
filter(mo_group != as.mo("CoPS")) %>%
bind_rows(tibble(mo_group = as.mo("CoPS"), mo = MO_COPS)) %>%
filter(mo_group != as.mo("CoNS")) %>%
bind_rows(tibble(mo_group = as.mo("CoNS"), mo = MO_CONS)) %>%
filter(mo_group != as.mo("CoPS")) %>%
bind_rows(tibble(mo_group = as.mo("CoPS"), mo = MO_COPS)) %>%
# at least all our Lancefield-grouped streptococci must be in the beta-haemolytic group:
bind_rows(tibble(mo_group = as.mo("Beta-haemolytic streptococcus"),
mo = c(MO_LANCEFIELD,
microorganisms %>% filter(fullname %like% "^Streptococcus Group") %>% pull(mo)))) %>%
bind_rows(tibble(
mo_group = as.mo("Beta-haemolytic streptococcus"),
mo = c(
MO_LANCEFIELD,
microorganisms %>% filter(fullname %like% "^Streptococcus Group") %>% pull(mo)
)
)) %>%
# and per Streptococcus group as well:
# group A - S. pyogenes
bind_rows(tibble(mo_group = as.mo("Streptococcus Group A"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_PYGN(_|$)")])) %>%
bind_rows(tibble(
mo_group = as.mo("Streptococcus Group A"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_PYGN(_|$)")]
)) %>%
# group B - S. agalactiae
bind_rows(tibble(mo_group = as.mo("Streptococcus Group B"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_AGLC(_|$)")])) %>%
bind_rows(tibble(
mo_group = as.mo("Streptococcus Group B"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_AGLC(_|$)")]
)) %>%
# group C - all subspecies within S. dysgalactiae and S. equi (such as S. equi zooepidemicus)
bind_rows(tibble(mo_group = as.mo("Streptococcus Group C"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|EQUI)(_|$)")])) %>%
bind_rows(tibble(
mo_group = as.mo("Streptococcus Group C"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|EQUI)(_|$)")]
)) %>%
# group F - Milleri group == S. anginosus group, which incl. S. anginosus, S. constellatus, S. intermedius
bind_rows(tibble(mo_group = as.mo("Streptococcus Group F"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(ANGN|CNST|INTR)(_|$)")])) %>%
bind_rows(tibble(
mo_group = as.mo("Streptococcus Group F"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(ANGN|CNST|INTR)(_|$)")]
)) %>%
# group G - S. dysgalactiae and S. canis (though dysgalactiae is also group C and will be matched there)
bind_rows(tibble(mo_group = as.mo("Streptococcus Group G"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|CANS)(_|$)")])) %>%
bind_rows(tibble(
mo_group = as.mo("Streptococcus Group G"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|CANS)(_|$)")]
)) %>%
# group H - S. sanguinis
bind_rows(tibble(mo_group = as.mo("Streptococcus Group H"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SNGN(_|$)")])) %>%
bind_rows(tibble(
mo_group = as.mo("Streptococcus Group H"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SNGN(_|$)")]
)) %>%
# group K - S. salivarius, incl. S. salivarius salivarius and S. salivarius thermophilus
bind_rows(tibble(mo_group = as.mo("Streptococcus Group K"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SLVR(_|$)")])) %>%
bind_rows(tibble(
mo_group = as.mo("Streptococcus Group K"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SLVR(_|$)")]
)) %>%
# group L - only S. dysgalactiae
bind_rows(tibble(mo_group = as.mo("Streptococcus Group L"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_DYSG(_|$)")])) %>%
bind_rows(tibble(
mo_group = as.mo("Streptococcus Group L"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_DYSG(_|$)")]
)) %>%
# and for EUCAST: Strep group A, B, C, G
bind_rows(tibble(mo_group = as.mo("Streptococcus Group A, B, C, G"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(PYGN|AGLC|DYSG|EQUI|CANS|GRPA|GRPB|GRPC|GRPG)(_|$)")])) %>%
bind_rows(tibble(
mo_group = as.mo("Streptococcus Group A, B, C, G"),
mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(PYGN|AGLC|DYSG|EQUI|CANS|GRPA|GRPB|GRPC|GRPG)(_|$)")]
)) %>%
# HACEK is:
# - Haemophilus species
# - Aggregatibacter species
@@ -133,38 +162,46 @@ microorganisms.groups <- whonet_organisms %>%
# - Kingella species
# - and previously Actinobacillus actinomycetemcomitans
# see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3656887/
filter(mo_group != as.mo("HACEK")) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Haemophilus") %>% pull(mo))) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Aggregatibacter") %>% pull(mo))) %>%
filter(mo_group != as.mo("HACEK")) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Haemophilus") %>% pull(mo))) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Aggregatibacter") %>% pull(mo))) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Cardiobacterium hominis", keep_synonyms = TRUE))) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Eikenella corrodens", keep_synonyms = TRUE))) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Kingella") %>% pull(mo))) %>%
bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Actinobacillus actinomycetemcomitans", keep_synonyms = TRUE))) %>%
# Citrobacter freundii complex in the NCBI Taxonomy Browser:
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1344959
filter(mo_group != "B_CTRBC_FRND-C") %>%
bind_rows(tibble(mo_group = as.mo("B_CTRBC_FRND-C"),
mo = paste("Citrobacter", c("freundii", "braakii", "gillenii", "murliniae", "portucalensis", "sedlakii", "werkmanii", "youngae")) %>% as.mo(keep_synonyms = TRUE))) %>%
filter(mo_group != "B_CTRBC_FRND-C") %>%
bind_rows(tibble(
mo_group = as.mo("B_CTRBC_FRND-C"),
mo = paste("Citrobacter", c("freundii", "braakii", "gillenii", "murliniae", "portucalensis", "sedlakii", "werkmanii", "youngae")) %>% as.mo(keep_synonyms = TRUE)
)) %>%
# Klebsiella pneumoniae complex
filter(mo_group != "B_KLBSL_PNMN-C") %>%
bind_rows(tibble(mo_group = as.mo("B_KLBSL_PNMN-C"),
mo = paste("Klebsiella", c("africana", "pneumoniae", "quasipneumoniae", "quasivariicola", "variicola")) %>% as.mo(keep_synonyms = TRUE))) %>%
filter(mo_group != "B_KLBSL_PNMN-C") %>%
bind_rows(tibble(
mo_group = as.mo("B_KLBSL_PNMN-C"),
mo = paste("Klebsiella", c("africana", "pneumoniae", "quasipneumoniae", "quasivariicola", "variicola")) %>% as.mo(keep_synonyms = TRUE)
)) %>%
# Yersinia pseudotuberculosis complex in the NCBI Taxonomy Browser:
# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1649845
filter(mo_group != "B_YERSN_PSDT-C") %>%
bind_rows(tibble(mo_group = as.mo("B_YERSN_PSDT-C"),
mo = paste("Yersinia", c("pseudotuberculosis", "pestis", "similis", "wautersii")) %>% as.mo(keep_synonyms = TRUE))) %>%
filter(mo_group != "B_YERSN_PSDT-C") %>%
bind_rows(tibble(
mo_group = as.mo("B_YERSN_PSDT-C"),
mo = paste("Yersinia", c("pseudotuberculosis", "pestis", "similis", "wautersii")) %>% as.mo(keep_synonyms = TRUE)
)) %>%
# RGM are Rapidly-growing Mycobacteria, see https://pubmed.ncbi.nlm.nih.gov/28084211/
filter(mo_group != "B_MYCBC_RGM") %>%
bind_rows(tibble(mo_group = as.mo("B_MYCBC_RGM"),
mo = paste("Mycobacterium", c( "abscessus abscessus", "abscessus bolletii", "abscessus massiliense", "agri", "aichiense", "algericum", "alvei", "anyangense", "arabiense", "aromaticivorans", "aubagnense", "aubagnense", "aurum", "austroafricanum", "bacteremicum", "boenickei", "bourgelatii", "brisbanense", "brumae", "canariasense", "celeriflavum", "chelonae", "chitae", "chlorophenolicum", "chubuense", "confluentis", "cosmeticum", "crocinum", "diernhoferi", "duvalii", "elephantis", "fallax", "flavescens", "fluoranthenivorans", "fortuitum", "franklinii", "frederiksbergense", "gadium", "gilvum", "goodii", "hassiacum", "hippocampi", "hodleri", "holsaticum", "houstonense", "immunogenum", "insubricum", "iranicum", "komossense", "litorale", "llatzerense", "madagascariense", "mageritense", "monacense", "moriokaense", "mucogenicum", "mucogenicum", "murale", "neoaurum", "neworleansense", "novocastrense", "obuense", "pallens", "parafortuitum", "peregrinum", "phlei", "phocaicum", "phocaicum", "porcinum", "poriferae", "psychrotolerans", "pyrenivorans", "rhodesiae", "rufum", "rutilum", "salmoniphilum", "sediminis", "senegalense", "septicum", "setense", "smegmatis", "sphagni", "thermoresistibile", "tokaiense", "vaccae", "vanbaalenii", "wolinskyi")) %>% as.mo(keep_synonyms = TRUE)))
filter(mo_group != "B_MYCBC_RGM") %>%
bind_rows(tibble(
mo_group = as.mo("B_MYCBC_RGM"),
mo = paste("Mycobacterium", c("abscessus abscessus", "abscessus bolletii", "abscessus massiliense", "agri", "aichiense", "algericum", "alvei", "anyangense", "arabiense", "aromaticivorans", "aubagnense", "aubagnense", "aurum", "austroafricanum", "bacteremicum", "boenickei", "bourgelatii", "brisbanense", "brumae", "canariasense", "celeriflavum", "chelonae", "chitae", "chlorophenolicum", "chubuense", "confluentis", "cosmeticum", "crocinum", "diernhoferi", "duvalii", "elephantis", "fallax", "flavescens", "fluoranthenivorans", "fortuitum", "franklinii", "frederiksbergense", "gadium", "gilvum", "goodii", "hassiacum", "hippocampi", "hodleri", "holsaticum", "houstonense", "immunogenum", "insubricum", "iranicum", "komossense", "litorale", "llatzerense", "madagascariense", "mageritense", "monacense", "moriokaense", "mucogenicum", "mucogenicum", "murale", "neoaurum", "neworleansense", "novocastrense", "obuense", "pallens", "parafortuitum", "peregrinum", "phlei", "phocaicum", "phocaicum", "porcinum", "poriferae", "psychrotolerans", "pyrenivorans", "rhodesiae", "rufum", "rutilum", "salmoniphilum", "sediminis", "senegalense", "septicum", "setense", "smegmatis", "sphagni", "thermoresistibile", "tokaiense", "vaccae", "vanbaalenii", "wolinskyi")) %>% as.mo(keep_synonyms = TRUE)
))
# add subspecies to all species
for (group in unique(microorganisms.groups$mo_group)) {
spp <- microorganisms.groups %>%
filter(mo_group == group & mo_rank(mo, keep_synonyms = TRUE) == "species") %>%
pull(mo) %>%
paste0(collapse = "|") %>%
filter(mo_group == group & mo_rank(mo, keep_synonyms = TRUE) == "species") %>%
pull(mo) %>%
paste0(collapse = "|") %>%
paste0("^(", ., ")")
mos <- microorganisms %>%
filter(mo %like% spp & rank == "subspecies") %>%
@@ -175,9 +212,11 @@ for (group in unique(microorganisms.groups$mo_group)) {
# add full names, arrange and clean:
# - mutate() adds `mo_group_name` and `mo_name` by calling mo_name() on the
#   `mo_group` and `mo` columns (keep_synonyms = TRUE, language = NULL)
# - arrange() sorts the rows on those two name columns
# - filter() drops rows in which a group is listed as a member of itself
# - distinct() removes duplicate rows
# - dataset_UTF8_to_ASCII() is defined outside this view; presumably it
#   converts text columns to ASCII -- TODO confirm
# NOTE(review): the mutate()/arrange() steps appear twice below (one-line and
# multi-line layout) with identical arguments; this looks like the before and
# after sides of a reformatting change -- confirm only one pair should remain.
microorganisms.groups <- microorganisms.groups %>%
mutate(mo_group_name = mo_name(mo_group, keep_synonyms = TRUE, language = NULL),
mo_name = mo_name(mo, keep_synonyms = TRUE, language = NULL)) %>%
arrange(mo_group_name, mo_name) %>%
mutate(
mo_group_name = mo_name(mo_group, keep_synonyms = TRUE, language = NULL),
mo_name = mo_name(mo, keep_synonyms = TRUE, language = NULL)
) %>%
arrange(mo_group_name, mo_name) %>%
# a group must not contain itself
filter(mo_group != mo) %>%
distinct() %>%
dataset_UTF8_to_ASCII()