Add add_if_missing parameter to control NA handling in interpretive rules (#264)

2026-06-20 20:56:22 +02:00 · 2026-04-21 21:53:43 +02:00
parent fb8758f36b
commit 8ff5d4472a
46 changed files with 1232 additions and 1016 deletions
--- a/data-raw/_reproduction_scripts/reproduction_of_antimicrobials.R
+++ b/data-raw/_reproduction_scripts/reproduction_of_antimicrobials.R
@@ -262,9 +262,9 @@ get_synonyms <- function(CID, clean = TRUE) {
    if (is.na(CID[i])) {
      next
    }
-    
+
    all_cids <- CID[i]
-    
+
    # we will now get the closest compounds with a 96% threshold
    similar_cids <- tryCatch(
      data.table::fread(
@@ -281,7 +281,7 @@ get_synonyms <- function(CID, clean = TRUE) {
    # leave out all CIDs that we have in our antimicrobials dataset to prevent duplication
    similar_cids <- similar_cids[!similar_cids %in% antimicrobials$cid[!is.na(antimicrobials$cid)]]
    all_cids <- unique(c(all_cids, similar_cids))
-    
+
    # for each one, we are getting the synonyms
    current_syns <- character(0)
    for (j in seq_len(length(all_cids))) {
@@ -297,9 +297,9 @@ get_synonyms <- function(CID, clean = TRUE) {
        )[[1]],
        error = function(e) NA_character_
      )
-      
+
      Sys.sleep(0.05)
-      
+
      if (clean == TRUE) {
        # remove text between brackets
        synonyms_txt <- trimws(gsub(
@@ -319,16 +319,16 @@ get_synonyms <- function(CID, clean = TRUE) {
        synonyms_txt <- gsub("[^a-z]+$", "", ignore.case = TRUE, synonyms_txt)
        # only length 5 to 20 and lower-case names starting with a capital letter
        synonyms_txt <- synonyms_txt[nchar(synonyms_txt) %in% c(5:20) &
-                                       grepl("^[A-Z][a-z]+$", synonyms_txt, ignore.case = FALSE)]
+          grepl("^[A-Z][a-z]+$", synonyms_txt, ignore.case = FALSE)]
        synonyms_txt <- unlist(strsplit(synonyms_txt, ";", fixed = TRUE))
      }
-      
+
      # synonyms must not be set for other agents, so remove the duplicates
      synonyms_txt <- synonyms_txt[!synonyms_txt %in% unlist(synonyms)]
-      
+
      current_syns <- c(current_syns, synonyms_txt)
    }
-    
+
    current_syns <- unique(trimws(current_syns[tolower(current_syns) %in% unique(tolower(current_syns))]))
    synonyms[i] <- list(sort(current_syns))
  }
@@ -763,10 +763,12 @@ antimicrobials[which(antimicrobials$ab %in% c("CYC", "LNZ", "THA", "TZD")), "gro
 # add efflux
 effl <- antimicrobials |>
  filter(ab == "ACM") |>
-  mutate(ab = as.character("EFF"),
-         cid = NA_real_,
-         name = "Efflux",
-         group = "Other")
+  mutate(
+    ab = as.character("EFF"),
+    cid = NA_real_,
+    name = "Efflux",
+    group = "Other"
+  )
 antimicrobials <- antimicrobials |>
  mutate(ab = as.character(ab)) |>
  bind_rows(effl)
@@ -777,9 +779,11 @@ antimicrobials[which(antimicrobials$ab == "EFF"), "abbreviations"][[1]] <- list(
 # add clindamycin inducible screening
 clin <- antimicrobials |>
  filter(ab == "FOX1") |>
-  mutate(ab = as.character("CLI-S"),
-         name = "Clindamycin inducible screening",
-         group = "Macrolides/lincosamides")
+  mutate(
+    ab = as.character("CLI-S"),
+    name = "Clindamycin inducible screening",
+    group = "Macrolides/lincosamides"
+  )
 antimicrobials <- antimicrobials |>
  mutate(ab = as.character(ab)) |>
  bind_rows(clin)
@@ -791,109 +795,123 @@ antimicrobials <- antimicrobials |>
  bind_rows(
    antimicrobials |>
      filter(ab == "EFF") |>
-      mutate(ab = "BLA-S",
-             name = paste("Beta-lactamase", "screening test"),
-             cid = NA_real_,
-             atc = list(character(0)),
-             atc_group1 = NA_character_,
-             atc_group2 = NA_character_,
-             abbreviations = list(c("beta-lactamase", "betalactamase", "bl screen", "blt screen")),
-             synonyms = list(character(0)),
-             oral_ddd = NA_real_,
-             oral_units = NA_character_,
-             iv_ddd = NA_real_,
-             iv_units = NA_character_,
-             loinc = list(character(0))),
+      mutate(
+        ab = "BLA-S",
+        name = paste("Beta-lactamase", "screening test"),
+        cid = NA_real_,
+        atc = list(character(0)),
+        atc_group1 = NA_character_,
+        atc_group2 = NA_character_,
+        abbreviations = list(c("beta-lactamase", "betalactamase", "bl screen", "blt screen")),
+        synonyms = list(character(0)),
+        oral_ddd = NA_real_,
+        oral_units = NA_character_,
+        iv_ddd = NA_real_,
+        iv_units = NA_character_,
+        loinc = list(character(0))
+      ),
    antimicrobials |>
      filter(ab == "PEN") |>
-      mutate(ab = "PEN-S",
-             name = paste(name, "screening test"),
-             cid = NA,
-             atc = list(character(0)),
-             atc_group1 = NA_character_,
-             atc_group2 = NA_character_,
-             abbreviations = list(c("pen screen")),
-             synonyms = list(character(0)),
-             oral_ddd = NA_real_,
-             oral_units = NA_character_,
-             iv_ddd = NA_real_,
-             iv_units = NA_character_,
-             loinc = list(character(0))),
+      mutate(
+        ab = "PEN-S",
+        name = paste(name, "screening test"),
+        cid = NA,
+        atc = list(character(0)),
+        atc_group1 = NA_character_,
+        atc_group2 = NA_character_,
+        abbreviations = list(c("pen screen")),
+        synonyms = list(character(0)),
+        oral_ddd = NA_real_,
+        oral_units = NA_character_,
+        iv_ddd = NA_real_,
+        iv_units = NA_character_,
+        loinc = list(character(0))
+      ),
    antimicrobials |>
      filter(ab == "OXA") |>
-      mutate(ab = "OXA-S",
-             name = paste(name, "screening test"),
-             cid = NA,
-             atc = list(character(0)),
-             atc_group1 = NA_character_,
-             atc_group2 = NA_character_,
-             abbreviations = list(c("oxa screen")),
-             synonyms = list(character(0)),
-             oral_ddd = NA_real_,
-             oral_units = NA_character_,
-             iv_ddd = NA_real_,
-             iv_units = NA_character_,
-             loinc = list(character(0))),
+      mutate(
+        ab = "OXA-S",
+        name = paste(name, "screening test"),
+        cid = NA,
+        atc = list(character(0)),
+        atc_group1 = NA_character_,
+        atc_group2 = NA_character_,
+        abbreviations = list(c("oxa screen")),
+        synonyms = list(character(0)),
+        oral_ddd = NA_real_,
+        oral_units = NA_character_,
+        iv_ddd = NA_real_,
+        iv_units = NA_character_,
+        loinc = list(character(0))
+      ),
    antimicrobials |>
      filter(ab == "PEF") |>
-      mutate(ab = "PEF-S",
-             name = paste(name, "screening test"),
-             cid = NA,
-             atc = list(character(0)),
-             atc_group1 = NA_character_,
-             atc_group2 = NA_character_,
-             abbreviations = list(c("pef screen")),
-             synonyms = list(character(0)),
-             oral_ddd = NA_real_,
-             oral_units = NA_character_,
-             iv_ddd = NA_real_,
-             iv_units = NA_character_,
-             loinc = list(character(0))),
+      mutate(
+        ab = "PEF-S",
+        name = paste(name, "screening test"),
+        cid = NA,
+        atc = list(character(0)),
+        atc_group1 = NA_character_,
+        atc_group2 = NA_character_,
+        abbreviations = list(c("pef screen")),
+        synonyms = list(character(0)),
+        oral_ddd = NA_real_,
+        oral_units = NA_character_,
+        iv_ddd = NA_real_,
+        iv_units = NA_character_,
+        loinc = list(character(0))
+      ),
    antimicrobials |>
      filter(ab == "NAL") |>
-      mutate(ab = "NAL-S",
-             name = paste(name, "screening test"),
-             cid = NA,
-             atc = list(character(0)),
-             atc_group1 = NA_character_,
-             atc_group2 = NA_character_,
-             abbreviations = list(c("nal screen")),
-             synonyms = list(character(0)),
-             oral_ddd = NA_real_,
-             oral_units = NA_character_,
-             iv_ddd = NA_real_,
-             iv_units = NA_character_,
-             loinc = list(character(0))),
+      mutate(
+        ab = "NAL-S",
+        name = paste(name, "screening test"),
+        cid = NA,
+        atc = list(character(0)),
+        atc_group1 = NA_character_,
+        atc_group2 = NA_character_,
+        abbreviations = list(c("nal screen")),
+        synonyms = list(character(0)),
+        oral_ddd = NA_real_,
+        oral_units = NA_character_,
+        iv_ddd = NA_real_,
+        iv_units = NA_character_,
+        loinc = list(character(0))
+      ),
    antimicrobials |>
      filter(ab == "NOR") |>
-      mutate(ab = "NOR-S",
-             name = paste(name, "screening test"),
-             cid = NA,
-             atc = list(character(0)),
-             atc_group1 = NA_character_,
-             atc_group2 = NA_character_,
-             abbreviations = list(c("nor screen")),
-             synonyms = list(character(0)),
-             oral_ddd = NA_real_,
-             oral_units = NA_character_,
-             iv_ddd = NA_real_,
-             iv_units = NA_character_,
-             loinc = list(character(0))),
+      mutate(
+        ab = "NOR-S",
+        name = paste(name, "screening test"),
+        cid = NA,
+        atc = list(character(0)),
+        atc_group1 = NA_character_,
+        atc_group2 = NA_character_,
+        abbreviations = list(c("nor screen")),
+        synonyms = list(character(0)),
+        oral_ddd = NA_real_,
+        oral_units = NA_character_,
+        iv_ddd = NA_real_,
+        iv_units = NA_character_,
+        loinc = list(character(0))
+      ),
    antimicrobials |>
      filter(ab == "TCY") |>
-      mutate(ab = "TCY-S",
-             name = paste(name, "screening test"),
-             cid = NA,
-             atc = list(character(0)),
-             atc_group1 = NA_character_,
-             atc_group2 = NA_character_,
-             abbreviations = list(c("tcy screen")),
-             synonyms = list(character(0)),
-             oral_ddd = NA_real_,
-             oral_units = NA_character_,
-             iv_ddd = NA_real_,
-             iv_units = NA_character_,
-             loinc = list(character(0)))
+      mutate(
+        ab = "TCY-S",
+        name = paste(name, "screening test"),
+        cid = NA,
+        atc = list(character(0)),
+        atc_group1 = NA_character_,
+        atc_group2 = NA_character_,
+        abbreviations = list(c("tcy screen")),
+        synonyms = list(character(0)),
+        oral_ddd = NA_real_,
+        oral_units = NA_character_,
+        iv_ddd = NA_real_,
+        iv_units = NA_character_,
+        loinc = list(character(0))
+      )
  )


@@ -919,16 +937,20 @@ antimicrobials <- antimicrobials |>
    antimicrobials |>
      filter(ab == "FPE") |>
      mutate(ab = as.character(ab)) |>
-      mutate(ab = "FTA",
-             name = "Cefepime/taniborbactam",
-             cid = NA_real_),
+      mutate(
+        ab = "FTA",
+        name = "Cefepime/taniborbactam",
+        cid = NA_real_
+      ),
    antimicrobials |>
      filter(ab == "TBP") |>
      mutate(ab = as.character(ab)) |>
-      mutate(ab = "TAN",
-             name = "Taniborbactam",
-             cid = 76902493,
-             abbreviations = list("VNRX-5133"))
+      mutate(
+        ab = "TAN",
+        name = "Taniborbactam",
+        cid = 76902493,
+        abbreviations = list("VNRX-5133")
+      )
  )

 antimicrobials <- antimicrobials |>
@@ -936,39 +958,51 @@ antimicrobials <- antimicrobials |>
  bind_rows(
    antimicrobials |>
      filter(ab == "CTB") |>
-      mutate(ab = "CTA",
-             cid = NA_real_,
-             name = "Ceftibuten/avibactam") |>
+      mutate(
+        ab = "CTA",
+        cid = NA_real_,
+        name = "Ceftibuten/avibactam"
+      ) |>
      select(1:4),
    antimicrobials |>
      filter(ab == "KAC") |>
-      mutate(ab = "KAS",
-             cid = NA_real_,
-             name = "Kasugamycin") |>
+      mutate(
+        ab = "KAS",
+        cid = NA_real_,
+        name = "Kasugamycin"
+      ) |>
      select(1:4),
    antimicrobials |>
      filter(ab == "PRI") |>
-      mutate(ab = "OST",
-             cid = NA_real_,
-             name = "Ostreogrycin") |>
+      mutate(
+        ab = "OST",
+        cid = NA_real_,
+        name = "Ostreogrycin"
+      ) |>
      select(1:4),
    antimicrobials |>
      filter(ab == "PRI") |>
-      mutate(ab = "THS",
-             cid = NA_real_,
-             name = "Thiostrepton") |>
+      mutate(
+        ab = "THS",
+        cid = NA_real_,
+        name = "Thiostrepton"
+      ) |>
      select(1, 3),
    antimicrobials |>
      filter(ab == "CLA1") |>
-      mutate(ab = "XER",
-             cid = NA_real_,
-             name = "Xeruborbactam") |>
+      mutate(
+        ab = "XER",
+        cid = NA_real_,
+        name = "Xeruborbactam"
+      ) |>
      select(1:4),
    antimicrobials |>
      filter(ab == "BLM") |>
-      mutate(ab = "ZOR",
-             cid = NA_real_,
-             name = "Zorbamycin") |>
+      mutate(
+        ab = "ZOR",
+        cid = NA_real_,
+        name = "Zorbamycin"
+      ) |>
      select(1:4),
  )

@@ -977,9 +1011,11 @@ antimicrobials <- antimicrobials |>
  bind_rows(
    antimicrobials |>
      filter(ab == "NOV") |>
-      mutate(ab = "CLB",
-             cid = 54706138,
-             name = "Clorobiocin") |>
+      mutate(
+        ab = "CLB",
+        cid = 54706138,
+        name = "Clorobiocin"
+      ) |>
      select(1:4),
  )

@@ -990,7 +1026,7 @@ get_atc_table <- function(ab_name, type = "human") {
  if (type == "human") {
    url <- "https://atcddd.fhi.no/atc_ddd_index/"
  } else if (type == "veterinary") {
-    url <- "https://atcddd.fhi.no/atcvet/atcvet_index/"  
+    url <- "https://atcddd.fhi.no/atcvet/atcvet_index/"
  } else {
    stop("invalid type")
  }
@@ -1055,8 +1091,10 @@ to_update <- 1:nrow(antimicrobials)
 # or just the empty ones:
 to_update <- which(sapply(antimicrobials$atc, function(x) length(x[!is.na(x)])) == 0)

-updated_atc <- lapply(seq_len(length(to_update)),
-                      function(x) NA_character_)
+updated_atc <- lapply(
+  seq_len(length(to_update)),
+  function(x) NA_character_
+)


 # this takes around 10 minutes for the whole table (some ABx are skipped and go faster)
--- a/data-raw/_reproduction_scripts/reproduction_of_clinical_breakpoints.R
+++ b/data-raw/_reproduction_scripts/reproduction_of_clinical_breakpoints.R
@@ -72,12 +72,12 @@ whonet_organisms <- whonet_organisms_raw |>
    ORGANISM = if_else(ORGANISM_CODE == "ckr", "Candida krusei", ORGANISM)
  ) |>
  # try to match on GBIF identifier
-  left_join(microorganisms |> distinct(mo, gbif, status) |> filter(!is.na(gbif)), by = c("GBIF_TAXON_ID" = "gbif")) |> 
+  left_join(microorganisms |> distinct(mo, gbif, status) |> filter(!is.na(gbif)), by = c("GBIF_TAXON_ID" = "gbif")) |>
  # remove duplicates
  arrange(ORGANISM_CODE, GBIF_TAXON_ID, status) |>
-  distinct(ORGANISM_CODE, .keep_all = TRUE) |> 
+  distinct(ORGANISM_CODE, .keep_all = TRUE) |>
  # add Enterobacterales, which is a subkingdom code in their data
-  bind_rows(data.frame(ORGANISM_CODE = "ebc", ORGANISM = "Enterobacterales", mo = as.mo("Enterobacterales"))) |> 
+  bind_rows(data.frame(ORGANISM_CODE = "ebc", ORGANISM = "Enterobacterales", mo = as.mo("Enterobacterales"))) |>
  arrange(ORGANISM)


@@ -88,31 +88,39 @@ unmatched <- whonet_organisms |> filter(is.na(mo))

 # generate the mo codes and add their names
 message("Getting MO codes for WHONET input...")
-unmatched <- unmatched |> 
-  mutate(mo = as.mo(gsub("(sero[a-z]*| nontypable| non[-][a-zA-Z]+|var[.]| not .*|sp[.],.*|, .*variant.*|, .*toxin.*|, microaer.*| beta-haem[.])", "", ORGANISM),
-                    minimum_matching_score = 0.55,
-                    keep_synonyms = TRUE,
-                    language = "en"),
-         mo = case_when(ORGANISM %like% "Anaerobic" & ORGANISM %like% "negative" ~ as.mo("B_ANAER-NEG"),
-                        ORGANISM %like% "Anaerobic" & ORGANISM %like% "positive" ~ as.mo("B_ANAER-POS"),
-                        ORGANISM %like% "Anaerobic" ~ as.mo("B_ANAER"),
-                        TRUE ~ mo),
-         mo_name = mo_name(mo,
-                           keep_synonyms = TRUE,
-                           language = "en"))
+unmatched <- unmatched |>
+  mutate(
+    mo = as.mo(gsub("(sero[a-z]*| nontypable| non[-][a-zA-Z]+|var[.]| not .*|sp[.],.*|, .*variant.*|, .*toxin.*|, microaer.*| beta-haem[.])", "", ORGANISM),
+      minimum_matching_score = 0.55,
+      keep_synonyms = TRUE,
+      language = "en"
+    ),
+    mo = case_when(
+      ORGANISM %like% "Anaerobic" & ORGANISM %like% "negative" ~ as.mo("B_ANAER-NEG"),
+      ORGANISM %like% "Anaerobic" & ORGANISM %like% "positive" ~ as.mo("B_ANAER-POS"),
+      ORGANISM %like% "Anaerobic" ~ as.mo("B_ANAER"),
+      TRUE ~ mo
+    ),
+    mo_name = mo_name(mo,
+      keep_synonyms = TRUE,
+      language = "en"
+    )
+  )
 # check if coercion at least resembles the first part (genus)
-unmatched <- unmatched |> 
+unmatched <- unmatched |>
  mutate(
    first_part = sapply(ORGANISM, function(x) strsplit(gsub("[^a-zA-Z _-]+", "", x), " ")[[1]][1], USE.NAMES = FALSE),
-    keep = mo_name %like_case% first_part | ORGANISM %like% "Gram " | ORGANISM == "Other" | ORGANISM %like% "anaerobic") |> 
+    keep = mo_name %like_case% first_part | ORGANISM %like% "Gram " | ORGANISM == "Other" | ORGANISM %like% "anaerobic"
+  ) |>
  arrange(keep)
 unmatched |> View()
 unmatched <- unmatched |>
  filter(keep == TRUE)

-organisms <- matched |> transmute(code = toupper(ORGANISM_CODE), group = SPECIES_GROUP, mo) |> 
-  bind_rows(unmatched |> transmute(code = toupper(ORGANISM_CODE), group = SPECIES_GROUP, mo)) |> 
-  mutate(name = mo_name(mo, keep_synonyms = TRUE)) |> 
+organisms <- matched |>
+  transmute(code = toupper(ORGANISM_CODE), group = SPECIES_GROUP, mo) |>
+  bind_rows(unmatched |> transmute(code = toupper(ORGANISM_CODE), group = SPECIES_GROUP, mo)) |>
+  mutate(name = mo_name(mo, keep_synonyms = TRUE)) |>
  arrange(code)

 # self-defined codes in the MO table must be retained
@@ -125,25 +133,33 @@ organisms <- organisms |>
 # some subspecies exist, while their upper species do not, add them as the species level:
 subspp <- organisms |>
  filter(mo_species(mo, keep_synonyms = TRUE) == mo_subspecies(mo, keep_synonyms = TRUE) &
-           mo_species(mo, keep_synonyms = TRUE) != "" &
-           mo_genus(mo, keep_synonyms = TRUE) != "Salmonella") |> 
-  mutate(mo = as.mo(paste(mo_genus(mo, keep_synonyms = TRUE),
-                          mo_species(mo, keep_synonyms = TRUE)),
-                    keep_synonyms = TRUE),
-         name = mo_name(mo, keep_synonyms = TRUE))
+    mo_species(mo, keep_synonyms = TRUE) != "" &
+    mo_genus(mo, keep_synonyms = TRUE) != "Salmonella") |>
+  mutate(
+    mo = as.mo(
+      paste(
+        mo_genus(mo, keep_synonyms = TRUE),
+        mo_species(mo, keep_synonyms = TRUE)
+      ),
+      keep_synonyms = TRUE
+    ),
+    name = mo_name(mo, keep_synonyms = TRUE)
+  )
 organisms <- organisms |>
  filter(!code %in% subspp$code) |>
  bind_rows(subspp) |>
  arrange(code)

 # add the groups
-organisms <- organisms |> 
-  bind_rows(tibble(code = organisms |> filter(!is.na(group)) |> pull(group) |> unique(),
-                   group = NA,
-                   mo = organisms |> filter(!is.na(group)) |> pull(group) |> unique() |> as.mo(keep_synonyms = TRUE),
-                   name = mo_name(mo, keep_synonyms = TRUE))) |> 
-  arrange(code, group) |> 
-  select(-group) |> 
+organisms <- organisms |>
+  bind_rows(tibble(
+    code = organisms |> filter(!is.na(group)) |> pull(group) |> unique(),
+    group = NA,
+    mo = organisms |> filter(!is.na(group)) |> pull(group) |> unique() |> as.mo(keep_synonyms = TRUE),
+    name = mo_name(mo, keep_synonyms = TRUE)
+  )) |>
+  arrange(code, group) |>
+  select(-group) |>
  distinct()
 # no XXX
 organisms <- organisms |> filter(code != "XXX")
@@ -153,7 +169,7 @@ organisms <- organisms |> filter(code != "XXX")
 # 2025-04-20 still the case
 # 2026-03-27 still the case, but fixed using `existing_codes` above
 organisms |> filter(code == "SGM")
-# organisms <- organisms |> 
+# organisms <- organisms |>
 #   filter(!(code == "SGM" & name %like% "Streptococcus"))
 # this must be empty:
 organisms$code[organisms$code |> duplicated()]
@@ -165,12 +181,12 @@ saveRDS(organisms, "data-raw/organisms.rds", version = 2)
 #---

 # update microorganisms.codes with the latest WHONET codes
-microorganisms.codes2 <- microorganisms.codes |> 
+microorganisms.codes2 <- microorganisms.codes |>
  # remove all old WHONET codes, whether we (in the end) keep them or not
-  filter(!toupper(code) %in% toupper(organisms$code)) |> 
+  filter(!toupper(code) %in% toupper(organisms$code)) |>
  # and add the new ones
-  bind_rows(organisms |> select(code, mo)) |> 
-  arrange(code) |> 
+  bind_rows(organisms |> select(code, mo)) |>
+  arrange(code) |>
  distinct(code, .keep_all = TRUE)
 # new codes:
 microorganisms.codes2$code[which(!microorganisms.codes2$code %in% microorganisms.codes$code)]
@@ -214,47 +230,53 @@ devtools::load_all()

 # now that we have the correct MO codes, get the breakpoints and convert them

-whonet_breakpoints_raw |> 
-  count(GUIDELINES, BREAKPOINT_TYPE) |> 
-  pivot_wider(names_from = BREAKPOINT_TYPE, values_from = n) |> 
+whonet_breakpoints_raw |>
+  count(GUIDELINES, BREAKPOINT_TYPE) |>
+  pivot_wider(names_from = BREAKPOINT_TYPE, values_from = n) |>
  janitor::adorn_totals(where = c("row", "col"))
-whonet_breakpoints_raw |> 
+whonet_breakpoints_raw |>
  filter(YEAR == format(Sys.Date(), "%Y")) |>
-  count(GUIDELINES, YEAR, BREAKPOINT_TYPE) |> 
-  pivot_wider(names_from = BREAKPOINT_TYPE, values_from = n) |> 
+  count(GUIDELINES, YEAR, BREAKPOINT_TYPE) |>
+  pivot_wider(names_from = BREAKPOINT_TYPE, values_from = n) |>
  janitor::adorn_totals(where = c("row", "col"))
 # compared to current
 AMR::clinical_breakpoints |>
  count(GUIDELINES = gsub("[^a-zA-Z]", "", guideline), type) |>
  arrange(tolower(type)) |>
-  pivot_wider(names_from = type, values_from = n) |> 
+  pivot_wider(names_from = type, values_from = n) |>
  as.data.frame() |>
  janitor::adorn_totals(where = c("row", "col"))

 breakpoints <- whonet_breakpoints_raw |>
  mutate(code = toupper(ORGANISM_CODE)) |>
-  left_join(bind_rows(microorganisms.codes |> filter(!code %in% c("ALL", "GEN")),
-                      # GEN (Generic) and ALL (All) are PK/PD codes
-                      data.frame(code = c("ALL", "GEN"),
-                                 mo = rep(as.mo("UNKNOWN"), 2))))
+  left_join(bind_rows(
+    microorganisms.codes |> filter(!code %in% c("ALL", "GEN")),
+    # GEN (Generic) and ALL (All) are PK/PD codes
+    data.frame(
+      code = c("ALL", "GEN"),
+      mo = rep(as.mo("UNKNOWN"), 2)
+    )
+  ))
 # these ones lack an MO name, they cannot be used:
 unknown <- breakpoints |>
  filter(is.na(mo)) |>
  pull(code) |>
  unique()
-breakpoints |> 
-  filter(code %in% unknown) |> 
+breakpoints |>
+  filter(code %in% unknown) |>
  count(GUIDELINES, YEAR, ORGANISM_CODE, BREAKPOINT_TYPE, sort = TRUE)
 # 2025-04-20: these codes are currently: cps, fso. No clue (are not in MO list of WHONET), and they are only ECOFFs, so remove them:
-breakpoints <- breakpoints |> 
+breakpoints <- breakpoints |>
  filter(!is.na(mo))

 # and these ones have unknown antibiotics according to WHONET itself:
-breakpoints |> 
-  filter(!WHONET_ABX_CODE %in% whonet_antibiotics_raw$WHONET_ABX_CODE) |> 
+breakpoints |>
+  filter(!WHONET_ABX_CODE %in% whonet_antibiotics_raw$WHONET_ABX_CODE) |>
  count(GUIDELINES, WHONET_ABX_CODE) |>
-  mutate(ab = as.ab(WHONET_ABX_CODE, fast_mode = TRUE),
-         ab_name = ab_name(ab))
+  mutate(
+    ab = as.ab(WHONET_ABX_CODE, fast_mode = TRUE),
+    ab_name = ab_name(ab)
+  )
 # 2025-04-20: these codes are currently: CFC, ROX, FIX, and N/A. All have the right replacements in `antimicrobials`, so we can safely use as.ab() later on
 # the NAs are for M. tuberculosis, they are empty breakpoints
 breakpoints <- breakpoints |>
@@ -264,7 +286,7 @@ breakpoints <- breakpoints |>
 ## Build new breakpoints table ----

 breakpoints_new <- breakpoints |>
-  filter(!is.na(WHONET_ABX_CODE)) |> 
+  filter(!is.na(WHONET_ABX_CODE)) |>
  transmute(
    guideline = paste(GUIDELINES, YEAR),
    type = ifelse(BREAKPOINT_TYPE == "ECOFF", "ECOFF", tolower(BREAKPOINT_TYPE)),
@@ -301,22 +323,26 @@ breakpoints_new <- breakpoints |>
  distinct(guideline, type, host, ab, mo, method, site, breakpoint_S, .keep_all = TRUE)

 # fix reference table names
-breakpoints_new |> filter(guideline %like% "EUCAST", is.na(ref_tbl)) |> View()
-breakpoints_new <- breakpoints_new |> 
-  mutate(ref_tbl = case_when(is.na(ref_tbl) & guideline %like% "EUCAST 202" ~ lead(ref_tbl),
-                             is.na(ref_tbl) ~ "Unknown",
-                             TRUE ~ ref_tbl))
+breakpoints_new |>
+  filter(guideline %like% "EUCAST", is.na(ref_tbl)) |>
+  View()
+breakpoints_new <- breakpoints_new |>
+  mutate(ref_tbl = case_when(
+    is.na(ref_tbl) & guideline %like% "EUCAST 202" ~ lead(ref_tbl),
+    is.na(ref_tbl) ~ "Unknown",
+    TRUE ~ ref_tbl
+  ))

 # clean disk zones
 breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_S"] <- as.double(as.disk(breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_S", drop = TRUE]))
 breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_R"] <- as.double(as.disk(breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_R", drop = TRUE]))

 # regarding animal breakpoints, CLSI has adults and foals for horses, but only for amikacin - only keep adult horses
-breakpoints_new |> 
+breakpoints_new |>
  filter(host %like% "foal") |>
  count(guideline, host, ab)
-breakpoints_new <- breakpoints_new |> 
-  filter(host %unlike% "foal") |> 
+breakpoints_new <- breakpoints_new |>
+  filter(host %unlike% "foal") |>
  mutate(host = ifelse(host %like% "horse", "horse", host))

 # FIXES FOR WHONET ERRORS ----
@@ -324,8 +350,12 @@ m <- unique(as.double(as.mic(levels(as.mic(1)))))

 # WHONET has no >1024 but instead uses 1025, 513, and 129, so as.mic() cannot be used to clean.
 # instead, raise these one higher valid MIC factor level:
-breakpoints_new |> filter(method == "MIC" & (!breakpoint_S %in% c(m, NA))) |> distinct(breakpoint_S)
-breakpoints_new |> filter(method == "MIC" & (!breakpoint_R %in% c(m, NA))) |> distinct(breakpoint_R)
+breakpoints_new |>
+  filter(method == "MIC" & (!breakpoint_S %in% c(m, NA))) |>
+  distinct(breakpoint_S)
+breakpoints_new |>
+  filter(method == "MIC" & (!breakpoint_R %in% c(m, NA))) |>
+  distinct(breakpoint_R)
 breakpoints_new[which(breakpoints_new$breakpoint_R == 129), "breakpoint_R"] <- m[which(m == 128) + 1]
 breakpoints_new[which(breakpoints_new$breakpoint_R == 257), "breakpoint_R"] <- m[which(m == 256) + 1]
 breakpoints_new[which(breakpoints_new$breakpoint_R == 513), "breakpoint_R"] <- m[which(m == 512) + 1]
@@ -353,12 +383,12 @@ breakpoints_new$mo[breakpoints_new$guideline %like% "EUCAST" & breakpoints_new$m
 breakpoints_new |>
  filter(method == "MIC" & guideline %like% "EUCAST" & mo %like% as.mo("B_HMPHL")) |>
  count(guideline, mo)
-breakpoints_new <- breakpoints_new |> 
+breakpoints_new <- breakpoints_new |>
  bind_rows(
    breakpoints_new |>
-      filter(guideline %like% "EUCAST", mo == "B_HMPHL_INFL") |> 
+      filter(guideline %like% "EUCAST", mo == "B_HMPHL_INFL") |>
      mutate(mo = as.mo("B_HMPHL_PRNF"))
-  ) |> 
+  ) |>
  arrange(desc(guideline), mo, ab, type, host, method) |>
  distinct()
 # Achromobacter denitrificans is in WHONET included in their A. xylosoxidans table, must be removed
@@ -387,7 +417,9 @@ breakpoints_new <- breakpoints_new |> filter(!wrong)
 # 2025-04-20/ fixed now

 # WHONET sets for EUCAST 2026 TMP breakpoints for all Klebsiella, but this is now only for non-aerogenes species
-kleb_spp <- microorganisms |> filter(rank == "species", genus == "Klebsiella", !species %in% c("", "aerogenes")) |> pull(mo)
+kleb_spp <- microorganisms |>
+  filter(rank == "species", genus == "Klebsiella", !species %in% c("", "aerogenes")) |>
+  pull(mo)
 kleb_tmp_mic <- breakpoints_new |>
  filter(guideline == "EUCAST 2026", method == "MIC", ab == "TMP", mo == as.mo("Klebsiella")) |>
  uncount(length(kleb_spp)) |>
@@ -398,8 +430,10 @@ kleb_tmp_disk <- breakpoints_new |>
  mutate(mo = kleb_spp)
 breakpoints_new <- breakpoints_new |>
  filter(!(guideline == "EUCAST 2026" & method == "MIC" & ab == "TMP" & mo == as.mo("Klebsiella"))) |>
-  bind_rows(kleb_tmp_mic,
-            kleb_tmp_disk)
+  bind_rows(
+    kleb_tmp_mic,
+    kleb_tmp_disk
+  )

 # WHONET contains wrong EUCAST breakpoints for enterococci/SXT: disk should be 23/23, not 21/50, and MIC should be 1/1, not 0.032/1
 # applies to all previous years, since v11 (2011)
@@ -441,14 +475,14 @@ breakpoints_new <- breakpoints_new |>


 # check the strange duplicates
-breakpoints_new |> 
+breakpoints_new |>
  mutate(id = paste(guideline, type, host, method, site, mo, ab, uti)) %>%
-  filter(id %in% .$id[which(duplicated(id))]) |> 
+  filter(id %in% .$id[which(duplicated(id))]) |>
  arrange(desc(guideline)) |>
  View()
 # 2024-06-19/ mostly ECOFFs, but there's no explanation in the whonet_breakpoints_raw df, we have to remove duplicates
 # 2025-04-20/ same, most important one seems M. tuberculosis in CLSI (also in 2025)
-breakpoints_new <- breakpoints_new |> 
+breakpoints_new <- breakpoints_new |>
  distinct(guideline, type, host, method, site, mo, ab, uti, .keep_all = TRUE)


@@ -469,7 +503,7 @@ dim(clinical_breakpoints)
 # SAVE TO PACKAGE ----

 # determine rank again now that some changes were made on taxonomic level (genus -> species)
-breakpoints_new <- breakpoints_new |> 
+breakpoints_new <- breakpoints_new |>
  mutate(rank_index = case_when(
    mo_rank(mo, keep_synonyms = TRUE) %like% "(infra|sub)" ~ 1,
    mo_rank(mo, keep_synonyms = TRUE) == "species" ~ 2,
--- a/data-raw/_reproduction_scripts/reproduction_of_microorganisms.R
+++ b/data-raw/_reproduction_scripts/reproduction_of_microorganisms.R
@@ -649,7 +649,9 @@ taxonomy_mycobank <- taxonomy_mycobank %>%
  arrange(fullname)

 taxonomy_mycobank %>% count(rank, sort = TRUE)
-taxonomy_mycobank %>% filter(rank %like% "#") %>% count(rank)
+taxonomy_mycobank %>%
+  filter(rank %like% "#") %>%
+  count(rank)

 taxonomy_mycobank3 <- taxonomy_mycobank

@@ -2546,7 +2548,9 @@ taxonomy %>%
  arrange(mo) %>%
  View()
 # keep the firsts
-taxonomy <- taxonomy %>% arrange(mo) %>% distinct(mo, .keep_all = TRUE)
+taxonomy <- taxonomy %>%
+  arrange(mo) %>%
+  distinct(mo, .keep_all = TRUE)

 # are fullnames unique?
 taxonomy %>%
@@ -2997,7 +3001,9 @@ taxonomy$rank[which(taxonomy$fullname %like% "unknown")] <- "(unknown rank)"

 # this happened in early 2025, check that MO codes do not have repeated elements
 # fixed it then like this: microorganisms$mo <- gsub("B_SCLLM_CNNM_LNSM_LNSM_LNSM_LNSM", "B_SCLLM_CNNM", microorganisms$mo)
-taxonomy |> filter(mo %like% "_.*_.*_.*_") |> View()
+taxonomy |>
+  filter(mo %like% "_.*_.*_.*_") |>
+  View()


 fix_old_mos <- function(dataset) {
@@ -3085,7 +3091,9 @@ microorganisms <- taxonomy

 # set class <mo>
 class(microorganisms$mo) <- c("mo", "character")
-microorganisms <- microorganisms %>% arrange(fullname) %>% df_remove_nonASCII()
+microorganisms <- microorganisms %>%
+  arrange(fullname) %>%
+  df_remove_nonASCII()
 usethis::use_data(
  microorganisms,
  overwrite = TRUE,
--- a/data-raw/_reproduction_scripts/reproduction_of_microorganisms.groups.R
+++ b/data-raw/_reproduction_scripts/reproduction_of_microorganisms.groups.R
@@ -59,72 +59,101 @@ whonet_organisms <- whonet_organisms %>%
  mutate(
    # this one was called Issatchenkia orientalis, but it should be:
    ORGANISM = if_else(ORGANISM_CODE == "ckr", "Candida krusei", ORGANISM)
-  ) %>% 
+  ) %>%
  # try to match on GBIF identifier
-  left_join(microorganisms %>% distinct(mo, gbif, status) %>% filter(!is.na(gbif)), by = c("GBIF_TAXON_ID" = "gbif")) %>% 
+  left_join(microorganisms %>% distinct(mo, gbif, status) %>% filter(!is.na(gbif)), by = c("GBIF_TAXON_ID" = "gbif")) %>%
  # remove duplicates
  arrange(ORGANISM_CODE, GBIF_TAXON_ID, status) %>%
-  distinct(ORGANISM_CODE, .keep_all = TRUE) %>% 
+  distinct(ORGANISM_CODE, .keep_all = TRUE) %>%
  # add Enterobacterales, which is a subkingdom code in their data
-  bind_rows(data.frame(ORGANISM_CODE = "ebc", ORGANISM = "Enterobacterales", mo = as.mo("Enterobacterales"))) %>% 
+  bind_rows(data.frame(ORGANISM_CODE = "ebc", ORGANISM = "Enterobacterales", mo = as.mo("Enterobacterales"))) %>%
  arrange(ORGANISM)

 # check non-existing species groups in the microorganisms table
 whonet_organisms %>%
  filter(!is.na(SPECIES_GROUP)) %>%
  group_by(SPECIES_GROUP) %>%
-  summarise(complex = ORGANISM[ORGANISM %like% " (group|complex)"][1],
-            organisms = paste0(n(), ": ", paste(sort(unique(ORGANISM)), collapse = ", "))) %>% 
+  summarise(
+    complex = ORGANISM[ORGANISM %like% " (group|complex)"][1],
+    organisms = paste0(n(), ": ", paste(sort(unique(ORGANISM)), collapse = ", "))
+  ) %>%
  filter(!SPECIES_GROUP %in% microorganisms.codes$code)

 # create the species group data set ----
 microorganisms.groups <- whonet_organisms %>%
  # these will not be translated well
-  filter(!ORGANISM %in% c("Trueperella pyogenes-like bacteria",
-                          "Mycobacterium suricattae",
-                          "Mycobacterium canetti")) %>% 
+  filter(!ORGANISM %in% c(
+    "Trueperella pyogenes-like bacteria",
+    "Mycobacterium suricattae",
+    "Mycobacterium canetti"
+  )) %>%
  filter(!is.na(SPECIES_GROUP), SPECIES_GROUP != ORGANISM_CODE) %>%
-  transmute(mo_group = as.mo(SPECIES_GROUP),
-            mo = ifelse(is.na(mo),
-                        as.character(as.mo(ORGANISM, keep_synonyms = TRUE, minimum_matching_score = 0)),
-                        mo)) %>% 
+  transmute(
+    mo_group = as.mo(SPECIES_GROUP),
+    mo = ifelse(is.na(mo),
+      as.character(as.mo(ORGANISM, keep_synonyms = TRUE, minimum_matching_score = 0)),
+      mo
+    )
+  ) %>%
  # add our own CoNS and CoPS, WHONET does not strictly follow Becker et al. (2014, 2019, 2020)
-  filter(mo_group != as.mo("CoNS")) %>% 
-  bind_rows(tibble(mo_group = as.mo("CoNS"), mo = MO_CONS)) %>% 
-  filter(mo_group != as.mo("CoPS")) %>% 
-  bind_rows(tibble(mo_group = as.mo("CoPS"), mo = MO_COPS)) %>% 
+  filter(mo_group != as.mo("CoNS")) %>%
+  bind_rows(tibble(mo_group = as.mo("CoNS"), mo = MO_CONS)) %>%
+  filter(mo_group != as.mo("CoPS")) %>%
+  bind_rows(tibble(mo_group = as.mo("CoPS"), mo = MO_COPS)) %>%
  # at least all our Lancefield-grouped streptococci must be in the beta-haemolytic group:
-  bind_rows(tibble(mo_group = as.mo("Beta-haemolytic streptococcus"), 
-                   mo = c(MO_LANCEFIELD,
-                          microorganisms %>% filter(fullname %like% "^Streptococcus Group") %>% pull(mo)))) %>% 
+  bind_rows(tibble(
+    mo_group = as.mo("Beta-haemolytic streptococcus"),
+    mo = c(
+      MO_LANCEFIELD,
+      microorganisms %>% filter(fullname %like% "^Streptococcus Group") %>% pull(mo)
+    )
+  )) %>%
  # and per Streptococcus group as well:
  # group A - S. pyogenes
-  bind_rows(tibble(mo_group = as.mo("Streptococcus Group A"),
-                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_PYGN(_|$)")])) %>% 
+  bind_rows(tibble(
+    mo_group = as.mo("Streptococcus Group A"),
+    mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_PYGN(_|$)")]
+  )) %>%
  # group B - S. agalactiae
-  bind_rows(tibble(mo_group = as.mo("Streptococcus Group B"),
-                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_AGLC(_|$)")])) %>% 
+  bind_rows(tibble(
+    mo_group = as.mo("Streptococcus Group B"),
+    mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_AGLC(_|$)")]
+  )) %>%
  # group C - all subspecies within S. dysgalactiae and S. equi (such as S. equi zooepidemicus)
-  bind_rows(tibble(mo_group = as.mo("Streptococcus Group C"),
-                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|EQUI)(_|$)")])) %>% 
+  bind_rows(tibble(
+    mo_group = as.mo("Streptococcus Group C"),
+    mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|EQUI)(_|$)")]
+  )) %>%
  # group F - Milleri group == S. anginosus group, which incl. S. anginosus, S. constellatus, S. intermedius
-  bind_rows(tibble(mo_group = as.mo("Streptococcus Group F"),
-                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(ANGN|CNST|INTR)(_|$)")])) %>% 
+  bind_rows(tibble(
+    mo_group = as.mo("Streptococcus Group F"),
+    mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(ANGN|CNST|INTR)(_|$)")]
+  )) %>%
  # group G - S. dysgalactiae and S. canis (though dysgalactiae is also group C and will be matched there)
-  bind_rows(tibble(mo_group = as.mo("Streptococcus Group G"),
-                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|CANS)(_|$)")])) %>% 
+  bind_rows(tibble(
+    mo_group = as.mo("Streptococcus Group G"),
+    mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|CANS)(_|$)")]
+  )) %>%
  # group H - S. sanguinis
-  bind_rows(tibble(mo_group = as.mo("Streptococcus Group H"),
-                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SNGN(_|$)")])) %>% 
+  bind_rows(tibble(
+    mo_group = as.mo("Streptococcus Group H"),
+    mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SNGN(_|$)")]
+  )) %>%
  # group K - S. salivarius, incl. S. salivarius salivariuss and S. salivarius thermophilus
-  bind_rows(tibble(mo_group = as.mo("Streptococcus Group K"),
-                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SLVR(_|$)")])) %>%
+  bind_rows(tibble(
+    mo_group = as.mo("Streptococcus Group K"),
+    mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SLVR(_|$)")]
+  )) %>%
  # group L - only S. dysgalactiae
-  bind_rows(tibble(mo_group = as.mo("Streptococcus Group L"),
-                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_DYSG(_|$)")])) %>% 
+  bind_rows(tibble(
+    mo_group = as.mo("Streptococcus Group L"),
+    mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_DYSG(_|$)")]
+  )) %>%
  # and for EUCAST: Strep group A, B, C, G
-  bind_rows(tibble(mo_group = as.mo("Streptococcus Group A, B, C, G"),
-                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(PYGN|AGLC|DYSG|EQUI|CANS|GRPA|GRPB|GRPC|GRPG)(_|$)")])) %>%
+  bind_rows(tibble(
+    mo_group = as.mo("Streptococcus Group A, B, C, G"),
+    mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(PYGN|AGLC|DYSG|EQUI|CANS|GRPA|GRPB|GRPC|GRPG)(_|$)")]
+  )) %>%
  # HACEK is:
  # - Haemophilus species
  # - Aggregatibacter species
@@ -133,38 +162,46 @@ microorganisms.groups <- whonet_organisms %>%
  # - Kingella species
  # - and previously Actinobacillus actinomycetemcomitans
  # see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3656887/
-  filter(mo_group != as.mo("HACEK")) %>% 
-  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Haemophilus") %>% pull(mo))) %>% 
-  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Aggregatibacter") %>% pull(mo))) %>% 
+  filter(mo_group != as.mo("HACEK")) %>%
+  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Haemophilus") %>% pull(mo))) %>%
+  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Aggregatibacter") %>% pull(mo))) %>%
  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Cardiobacterium hominis", keep_synonyms = TRUE))) %>%
  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Eikenella corrodens", keep_synonyms = TRUE))) %>%
  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Kingella") %>% pull(mo))) %>%
  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Actinobacillus actinomycetemcomitans", keep_synonyms = TRUE))) %>%
  # Citrobacter freundii complex in the NCBI Taxonomy Browser:
  # https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1344959
-  filter(mo_group != "B_CTRBC_FRND-C") %>% 
-  bind_rows(tibble(mo_group = as.mo("B_CTRBC_FRND-C"),
-                   mo = paste("Citrobacter", c("freundii", "braakii", "gillenii", "murliniae", "portucalensis", "sedlakii", "werkmanii", "youngae")) %>% as.mo(keep_synonyms = TRUE))) %>% 
+  filter(mo_group != "B_CTRBC_FRND-C") %>%
+  bind_rows(tibble(
+    mo_group = as.mo("B_CTRBC_FRND-C"),
+    mo = paste("Citrobacter", c("freundii", "braakii", "gillenii", "murliniae", "portucalensis", "sedlakii", "werkmanii", "youngae")) %>% as.mo(keep_synonyms = TRUE)
+  )) %>%
  # Klebsiella pneumoniae complex
-  filter(mo_group != "B_KLBSL_PNMN-C") %>% 
-  bind_rows(tibble(mo_group = as.mo("B_KLBSL_PNMN-C"),
-                   mo = paste("Klebsiella", c("africana", "pneumoniae", "quasipneumoniae", "quasivariicola", "variicola")) %>% as.mo(keep_synonyms = TRUE))) %>% 
+  filter(mo_group != "B_KLBSL_PNMN-C") %>%
+  bind_rows(tibble(
+    mo_group = as.mo("B_KLBSL_PNMN-C"),
+    mo = paste("Klebsiella", c("africana", "pneumoniae", "quasipneumoniae", "quasivariicola", "variicola")) %>% as.mo(keep_synonyms = TRUE)
+  )) %>%
  # Yersinia pseudotuberculosis complex in the NCBI Taxonomy Browser:
  # https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1649845
-  filter(mo_group != "B_YERSN_PSDT-C") %>% 
-  bind_rows(tibble(mo_group = as.mo("B_YERSN_PSDT-C"),
-                   mo = paste("Yersinia", c("pseudotuberculosis", "pestis", "similis", "wautersii")) %>% as.mo(keep_synonyms = TRUE))) %>% 
+  filter(mo_group != "B_YERSN_PSDT-C") %>%
+  bind_rows(tibble(
+    mo_group = as.mo("B_YERSN_PSDT-C"),
+    mo = paste("Yersinia", c("pseudotuberculosis", "pestis", "similis", "wautersii")) %>% as.mo(keep_synonyms = TRUE)
+  )) %>%
  # RGM are Rapidly-growing Mycobacteria, see https://pubmed.ncbi.nlm.nih.gov/28084211/
-  filter(mo_group != "B_MYCBC_RGM") %>% 
-  bind_rows(tibble(mo_group = as.mo("B_MYCBC_RGM"),
-                   mo = paste("Mycobacterium", c( "abscessus abscessus", "abscessus bolletii", "abscessus massiliense", "agri", "aichiense", "algericum", "alvei", "anyangense", "arabiense", "aromaticivorans", "aubagnense", "aubagnense", "aurum", "austroafricanum", "bacteremicum", "boenickei", "bourgelatii", "brisbanense", "brumae", "canariasense", "celeriflavum", "chelonae", "chitae", "chlorophenolicum", "chubuense", "confluentis", "cosmeticum", "crocinum", "diernhoferi", "duvalii", "elephantis", "fallax", "flavescens", "fluoranthenivorans", "fortuitum", "franklinii", "frederiksbergense", "gadium", "gilvum", "goodii", "hassiacum", "hippocampi", "hodleri", "holsaticum", "houstonense", "immunogenum", "insubricum", "iranicum", "komossense", "litorale", "llatzerense", "madagascariense", "mageritense", "monacense", "moriokaense", "mucogenicum", "mucogenicum", "murale", "neoaurum", "neworleansense", "novocastrense", "obuense", "pallens", "parafortuitum", "peregrinum", "phlei", "phocaicum", "phocaicum", "porcinum", "poriferae", "psychrotolerans", "pyrenivorans", "rhodesiae", "rufum", "rutilum", "salmoniphilum", "sediminis", "senegalense", "septicum", "setense", "smegmatis", "sphagni", "thermoresistibile", "tokaiense", "vaccae", "vanbaalenii", "wolinskyi")) %>% as.mo(keep_synonyms = TRUE)))
+  filter(mo_group != "B_MYCBC_RGM") %>%
+  bind_rows(tibble(
+    mo_group = as.mo("B_MYCBC_RGM"),
+    mo = paste("Mycobacterium", c("abscessus abscessus", "abscessus bolletii", "abscessus massiliense", "agri", "aichiense", "algericum", "alvei", "anyangense", "arabiense", "aromaticivorans", "aubagnense", "aubagnense", "aurum", "austroafricanum", "bacteremicum", "boenickei", "bourgelatii", "brisbanense", "brumae", "canariasense", "celeriflavum", "chelonae", "chitae", "chlorophenolicum", "chubuense", "confluentis", "cosmeticum", "crocinum", "diernhoferi", "duvalii", "elephantis", "fallax", "flavescens", "fluoranthenivorans", "fortuitum", "franklinii", "frederiksbergense", "gadium", "gilvum", "goodii", "hassiacum", "hippocampi", "hodleri", "holsaticum", "houstonense", "immunogenum", "insubricum", "iranicum", "komossense", "litorale", "llatzerense", "madagascariense", "mageritense", "monacense", "moriokaense", "mucogenicum", "mucogenicum", "murale", "neoaurum", "neworleansense", "novocastrense", "obuense", "pallens", "parafortuitum", "peregrinum", "phlei", "phocaicum", "phocaicum", "porcinum", "poriferae", "psychrotolerans", "pyrenivorans", "rhodesiae", "rufum", "rutilum", "salmoniphilum", "sediminis", "senegalense", "septicum", "setense", "smegmatis", "sphagni", "thermoresistibile", "tokaiense", "vaccae", "vanbaalenii", "wolinskyi")) %>% as.mo(keep_synonyms = TRUE)
+  ))

 # add subspecies to all species
 for (group in unique(microorganisms.groups$mo_group)) {
  spp <- microorganisms.groups %>%
-    filter(mo_group == group & mo_rank(mo, keep_synonyms = TRUE) == "species") %>% 
-    pull(mo) %>% 
-    paste0(collapse = "|") %>% 
+    filter(mo_group == group & mo_rank(mo, keep_synonyms = TRUE) == "species") %>%
+    pull(mo) %>%
+    paste0(collapse = "|") %>%
    paste0("^(", ., ")")
  mos <- microorganisms %>%
    filter(mo %like% spp & rank == "subspecies") %>%
@@ -175,9 +212,11 @@ for (group in unique(microorganisms.groups$mo_group)) {

 # add full names, arrange and clean
 microorganisms.groups <- microorganisms.groups %>%
-  mutate(mo_group_name = mo_name(mo_group, keep_synonyms = TRUE, language = NULL),
-         mo_name = mo_name(mo, keep_synonyms = TRUE, language = NULL)) %>%
-  arrange(mo_group_name, mo_name) %>% 
+  mutate(
+    mo_group_name = mo_name(mo_group, keep_synonyms = TRUE, language = NULL),
+    mo_name = mo_name(mo, keep_synonyms = TRUE, language = NULL)
+  ) %>%
+  arrange(mo_group_name, mo_name) %>%
  filter(mo_group != mo) %>%
  distinct() %>%
  dataset_UTF8_to_ASCII()