update reproduction scripts links

2025-07-21 12:13:20 +02:00 · 2025-04-13 10:02:47 +02:00
parent d31371613e
commit 7c3320b967
9 changed files with 0 additions and 0 deletions
--- a/data-raw/_reproduction_scripts/reproduction_of_antimicrobials.R
+++ b/data-raw/_reproduction_scripts/reproduction_of_antimicrobials.R
--- a/data-raw/_reproduction_scripts/reproduction_of_antivirals.R
+++ b/data-raw/_reproduction_scripts/reproduction_of_antivirals.R
@ -0,0 +1,160 @@
+# ==================================================================== #
+# TITLE:                                                               #
+# AMR: An R Package for Working with Antimicrobial Resistance Data     #
+#                                                                      #
+# SOURCE CODE:                                                         #
+# https://github.com/msberends/AMR                                     #
+#                                                                      #
+# PLEASE CITE THIS SOFTWARE AS:                                        #
+# Berends MS, Luz CF, Friedrich AW, et al. (2022).                     #
+# AMR: An R Package for Working with Antimicrobial Resistance Data.    #
+# Journal of Statistical Software, 104(3), 1-31.                       #
+# https://doi.org/10.18637/jss.v104.i03                                #
+#                                                                      #
+# Developed at the University of Groningen and the University Medical  #
+# Center Groningen in The Netherlands, in collaboration with many      #
+# colleagues from around the world, see our website.                   #
+#                                                                      #
+# This R package is free software; you can freely use and distribute   #
+# it for both personal and commercial purposes under the terms of the  #
+# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
+# the Free Software Foundation.                                        #
+# We created this package for both routine data analysis and academic  #
+# research and it was publicly released in the hope that it will be    #
+# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
+#                                                                      #
+# Visit our website for the full manual and a complete tutorial about  #
+# how to conduct AMR data analysis: https://amr-for-r.org              #
+# ==================================================================== #
+
+library(dplyr)
+library(tidyr)
+library(rvest)
+
+# get all data from the WHOCC website
+get_atc_table <- function(atc_group) {
+  # give as input J0XXX, like atc_group = "J05AB"
+  downloaded <- read_html(paste0("https://atcddd.fhi.no/atc_ddd_index/?code=", atc_group, "&showdescription=no"))
+  table_title <- downloaded %>%
+    html_nodes(paste0('a[href^="./?code=', atc_group, '&"]')) %>%
+    html_text()
+  table_title <- table_title[tolower(table_title) != "show text from guidelines"][1]
+  table_content <- downloaded %>%
+    html_nodes("table") %>%
+    html_table(header = TRUE) %>%
+    # returns list, so make data.frame out of it
+    as.data.frame(stringsAsFactors = FALSE) %>%
+    # select right columns
+    select(atc = ATC.code, name = Name, ddd = DDD, unit = U, ddd_type = Adm.R) %>%
+    # fill empty rows
+    mutate(atc = ifelse(atc == "", lag(atc), atc), name = ifelse(name == "", lag(name), name)) %>%
+    pivot_wider(names_from = ddd_type, values_from = c(ddd, unit)) %>%
+    mutate(atc_group = table_title)
+  if (!"ddd_O" %in% colnames(table_content)) {
+    table_content <- table_content %>% mutate(ddd_O = NA_real_, unit_O = NA_character_)
+  }
+  if (!"ddd_P" %in% colnames(table_content)) {
+    table_content <- table_content %>% mutate(ddd_P = NA_real_, unit_P = NA_character_)
+  }
+  table_content %>% select(atc, name, atc_group,
+    oral_ddd = ddd_O, oral_units = unit_O,
+    iv_ddd = ddd_P, iv_units = unit_P
+  )
+}
+
+# these are the relevant groups for input: https://atcddd.fhi.no/atc_ddd_index/?code=J05A (J05 only contains J05A)
+atc_groups <- c("J05AA", "J05AB", "J05AC", "J05AD", "J05AE", "J05AF", "J05AG", "J05AH", "J05AJ", "J05AP", "J05AR", "J05AX")
+
+# get the first
+antivirals <- get_atc_table(atc_groups[1])
+# bind all others to it
+for (i in 2:length(atc_groups)) {
+  message(atc_groups[i], "...")
+  antivirals <- rbind(antivirals, get_atc_table(atc_groups[i]))
+}
+
+# arrange on name, untibble it
+antivirals <- antivirals %>%
+  arrange(name) %>%
+  as.data.frame(stringsAsFactors = FALSE)
+
+# add PubChem Compound ID (cid) and their trade names
+# see `data-raw/_reproduction_scripts/reproduction_of_antimicrobials.R` for get_CID() and get_synonyms()
+CIDs <- get_CID(antivirals$name)
+# these could not be found:
+antivirals[is.na(CIDs), ] %>% View()
+# get brand names from PubChem
+synonyms <- get_synonyms(CIDs)
+synonyms <- lapply(
+  synonyms,
+  function(x) {
+    if (length(x) == 0 | all(is.na(x))) {
+      ""
+    } else {
+      x
+    }
+  }
+)
+
+antivirals <- antivirals %>%
+  transmute(atc,
+    cid = as.double(CIDs),
+    name,
+    atc_group,
+    synonyms = unname(synonyms),
+    oral_ddd,
+    oral_units,
+    iv_ddd,
+    iv_units
+  ) %>%
+  AMR:::dataset_UTF8_to_ASCII()
+
+av_codes <- tibble(name = antivirals$name %>%
+  strsplit("(, | and )") %>%
+  unlist() %>%
+  unique() %>%
+  sort()) %>%
+  mutate(av_1st = toupper(abbreviate(name, minlength = 3, use.classes = FALSE))) %>%
+  filter(!name %in% c("acid", "dipivoxil", "disoproxil", "marboxil", "alafenamide"))
+
+replace_with_av_code <- function(name) {
+  unname(av_codes$av_1st[match(name, av_codes$name)])
+}
+
+names_codes <- antivirals %>%
+  separate(name,
+    into = paste0("name", c(1:7)),
+    sep = "(, | and )",
+    remove = FALSE,
+    fill = "right"
+  ) %>%
+  # remove empty columns
+  select(!where(function(x) all(is.na(x)))) %>%
+  mutate_at(vars(matches("name[1-9]")), replace_with_av_code) %>%
+  unite(av, matches("name[1-9]"), sep = "+", na.rm = TRUE) %>%
+  mutate(name = gsub("(, | and )", "/", name))
+substr(names_codes$name, 1, 1) <- toupper(substr(names_codes$name, 1, 1))
+
+antivirals <- bind_cols(
+  names_codes %>% select(av, name),
+  antivirals %>% select(-name)
+)
+class(antivirals$av) <- c("av", "character")
+antivirals <- antivirals %>% AMR:::dataset_UTF8_to_ASCII()
+
+# ! add loinc, run 'data-raw/loinc.R' !
+
+# de-duplicate synonyms
+for (i in 1:nrow(antivirals)) {
+  syn <- as.character(sort(unique(tolower(antivirals[i, "synonyms", drop = TRUE][[1]]))))
+  syn <- syn[!syn %in% tolower(antivirals[i, "name", drop = TRUE])]
+  antivirals[i, "synonyms"][[1]] <- ifelse(length(syn[!syn == ""]) == 0, list(""), list(syn))
+}
+
+antivirals <- antivirals %>% AMR:::dataset_UTF8_to_ASCII()
+
+# check it
+antivirals
+
+# save it
+usethis::use_data(antivirals, overwrite = TRUE, internal = FALSE, compress = "xz", version = 2)
--- a/data-raw/_reproduction_scripts/reproduction_of_clinical_breakpoints.R
+++ b/data-raw/_reproduction_scripts/reproduction_of_clinical_breakpoints.R
@ -0,0 +1,418 @@
+# ==================================================================== #
+# TITLE:                                                               #
+# AMR: An R Package for Working with Antimicrobial Resistance Data     #
+#                                                                      #
+# SOURCE CODE:                                                         #
+# https://github.com/msberends/AMR                                     #
+#                                                                      #
+# PLEASE CITE THIS SOFTWARE AS:                                        #
+# Berends MS, Luz CF, Friedrich AW, et al. (2022).                     #
+# AMR: An R Package for Working with Antimicrobial Resistance Data.    #
+# Journal of Statistical Software, 104(3), 1-31.                       #
+# https://doi.org/10.18637/jss.v104.i03                                #
+#                                                                      #
+# Developed at the University of Groningen and the University Medical  #
+# Center Groningen in The Netherlands, in collaboration with many      #
+# colleagues from around the world, see our website.                   #
+#                                                                      #
+# This R package is free software; you can freely use and distribute   #
+# it for both personal and commercial purposes under the terms of the  #
+# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
+# the Free Software Foundation.                                        #
+# We created this package for both routine data analysis and academic  #
+# research and it was publicly released in the hope that it will be    #
+# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
+#                                                                      #
+# Visit our website for the full manual and a complete tutorial about  #
+# how to conduct AMR data analysis: https://amr-for-r.org              #
+# ==================================================================== #
+
+# This script runs in 20-30 minutes and renews all guidelines of CLSI and EUCAST!
+# Run it with source("data-raw/_reproduction_scripts/reproduction_of_clinical_breakpoints.R")
+
+library(dplyr)
+library(readr)
+library(tidyr)
+devtools::load_all()
+
+# Install the WHONET software on Windows (http://www.whonet.org/software.html),
+# and copy the folder C:\WHONET\Resources to the data-raw/WHONET/ folder
+# (for ASIARS-Net update, also copy C:\WHONET\Codes to the data-raw/WHONET/ folder)
+
+# BE SURE TO RUN data-raw/_reproduction_scripts/reproduction_of_microorganisms.groups.R FIRST TO GET THE GROUPS!
+
+# READ DATA ----
+
+whonet_organisms <- read_tsv("data-raw/WHONET/Resources/Organisms.txt", na = c("", "NA", "-"), show_col_types = FALSE) %>%
+  # remove old taxonomic names
+  filter(TAXONOMIC_STATUS == "C") %>%
+  mutate(ORGANISM_CODE = toupper(WHONET_ORG_CODE))
+
+whonet_breakpoints <- read_tsv("data-raw/WHONET/Resources/Breakpoints.txt", na = c("", "NA", "-"),
+                               show_col_types = FALSE, guess_max = Inf) %>%
+  filter(GUIDELINES %in% c("CLSI", "EUCAST"))
+
+whonet_antibiotics <- read_tsv("data-raw/WHONET/Resources/Antibiotics.txt", na = c("", "NA", "-"), show_col_types = FALSE) %>%
+  arrange(WHONET_ABX_CODE) %>%
+  distinct(WHONET_ABX_CODE, .keep_all = TRUE)
+
+# MICROORGANISMS WHONET CODES ----
+
+whonet_organisms <- whonet_organisms %>%
+  select(ORGANISM_CODE, ORGANISM, SPECIES_GROUP, GBIF_TAXON_ID) %>%
+  mutate(
+    # this one was called Issatchenkia orientalis, but it should be:
+    ORGANISM = if_else(ORGANISM_CODE == "ckr", "Candida krusei", ORGANISM)
+  ) %>% 
+  # try to match on GBIF identifier
+  left_join(microorganisms %>% distinct(mo, gbif, status) %>% filter(!is.na(gbif)), by = c("GBIF_TAXON_ID" = "gbif")) %>% 
+  # remove duplicates
+  arrange(ORGANISM_CODE, GBIF_TAXON_ID, status) %>%
+  distinct(ORGANISM_CODE, .keep_all = TRUE) %>% 
+  # add Enterobacterales, which is a subkingdom code in their data
+  bind_rows(data.frame(ORGANISM_CODE = "ebc", ORGANISM = "Enterobacterales", mo = as.mo("Enterobacterales"))) %>% 
+  arrange(ORGANISM)
+
+
+## Add new WHO codes to microorganisms.codes ----
+
+matched <- whonet_organisms %>% filter(!is.na(mo))
+unmatched <- whonet_organisms %>% filter(is.na(mo))
+
+# generate the mo codes and add their names
+message("Getting MO codes for WHONET input...")
+unmatched <- unmatched %>% 
+  mutate(mo = as.mo(gsub("(sero[a-z]*| nontypable| non[-][a-zA-Z]+|var[.]| not .*|sp[.],.*|, .*variant.*|, .*toxin.*|, microaer.*| beta-haem[.])", "", ORGANISM),
+                    minimum_matching_score = 0.55,
+                    keep_synonyms = TRUE,
+                    language = "en"),
+         mo = case_when(ORGANISM %like% "Anaerobic" & ORGANISM %like% "negative" ~ as.mo("B_ANAER-NEG"),
+                        ORGANISM %like% "Anaerobic" & ORGANISM %like% "positive" ~ as.mo("B_ANAER-POS"),
+                        ORGANISM %like% "Anaerobic" ~ as.mo("B_ANAER"),
+                        TRUE ~ mo),
+         mo_name = mo_name(mo,
+                           keep_synonyms = TRUE,
+                           language = "en"))
+# check if coercion at least resembles the first part (genus)
+unmatched <- unmatched %>% 
+  mutate(
+    first_part = sapply(ORGANISM, function(x) strsplit(gsub("[^a-zA-Z _-]+", "", x), " ")[[1]][1], USE.NAMES = FALSE),
+    keep = mo_name %like_case% first_part | ORGANISM %like% "Gram " | ORGANISM == "Other" | ORGANISM %like% "anaerobic") %>% 
+  arrange(keep)
+unmatched %>% 
+  View()
+unmatched <- unmatched %>% 
+  filter(keep == TRUE)
+
+organisms <- matched %>% transmute(code = toupper(ORGANISM_CODE), group = SPECIES_GROUP, mo) %>% 
+  bind_rows(unmatched %>% transmute(code = toupper(ORGANISM_CODE), group = SPECIES_GROUP, mo)) %>% 
+  mutate(name = mo_name(mo, keep_synonyms = TRUE)) %>% 
+  arrange(code)
+
+# some subspecies exist, while their upper species do not, add them as the species level:
+subspp <- organisms %>%
+  filter(mo_species(mo, keep_synonyms = TRUE) == mo_subspecies(mo, keep_synonyms = TRUE) &
+           mo_species(mo, keep_synonyms = TRUE) != "" &
+           mo_genus(mo, keep_synonyms = TRUE) != "Salmonella") %>% 
+  mutate(mo = as.mo(paste(mo_genus(mo, keep_synonyms = TRUE),
+                          mo_species(mo, keep_synonyms = TRUE)),
+                    keep_synonyms = TRUE),
+         name = mo_name(mo, keep_synonyms = TRUE))
+organisms <- organisms %>%
+  filter(!code %in% subspp$code) %>%
+  bind_rows(subspp) %>%
+  arrange(code)
+
+# add the groups
+organisms <- organisms %>% 
+  bind_rows(tibble(code = organisms %>% filter(!is.na(group)) %>% pull(group) %>% unique(),
+                   group = NA,
+                   mo = organisms %>% filter(!is.na(group)) %>% pull(group) %>% unique() %>% as.mo(keep_synonyms = TRUE),
+                   name = mo_name(mo, keep_synonyms = TRUE))) %>% 
+  arrange(code, group) %>% 
+  select(-group) %>% 
+  distinct()
+
+# 2023-07-08 SGM is also Strep gamma in WHONET, must only be Slowly-growing Mycobacterium
+# 2024-06-14 still the case
+organisms <- organisms %>% 
+  filter(!(code == "SGM" & name %like% "Streptococcus"))
+# this must be empty:
+organisms$code[organisms$code %>% duplicated()]
+
+saveRDS(organisms, "data-raw/organisms.rds", version = 2)
+
+#---
+# AT THIS POINT, `organisms` is clean and all entries have an mo code
+#---
+
+# update microorganisms.codes with the latest WHONET codes
+microorganisms.codes2 <- microorganisms.codes %>% 
+  # remove all old WHONET codes, whether we (in the end) keep them or not
+  filter(!toupper(code) %in% toupper(organisms$code)) %>% 
+  # and add the new ones
+  bind_rows(organisms %>% select(code, mo)) %>% 
+  arrange(code) %>% 
+  distinct(code, .keep_all = TRUE)
+# new codes:
+microorganisms.codes2$code[which(!microorganisms.codes2$code %in% microorganisms.codes$code)]
+mo_name(microorganisms.codes2$mo[which(!microorganisms.codes2$code %in% microorganisms.codes$code)], keep_synonyms = TRUE)
+microorganisms.codes <- microorganisms.codes2
+
+# Run this part to update ASIARS-Net:
+# 2024-06-14: file not available anymore
+# # start
+# asiarsnet <- read_tsv("data-raw/WHONET/Codes/ASIARS_Net_Organisms_ForwardLookup.txt")
+# asiarsnet <- asiarsnet %>%
+#   mutate(WHONET_Code = toupper(WHONET_Code)) %>%
+#   left_join(whonet_organisms %>% mutate(WHONET_Code = toupper(ORGANISM_CODE))) %>%
+#   mutate(
+#     mo1 = as.mo(ORGANISM_CODE),
+#     mo2 = as.mo(ORGANISM)
+#   ) %>%
+#   mutate(mo = if_else(mo2 == "UNKNOWN" | is.na(mo2), mo1, mo2)) %>%
+#   filter(!is.na(mo))
+# insert1 <- asiarsnet %>% transmute(code = WHONET_Code, mo)
+# insert2 <- asiarsnet %>% transmute(code = as.character(ASIARS_Net_Code), mo)
+# # these will be updated
+# bind_rows(insert1, insert2) %>%
+#   rename(mo_new = mo) %>%
+#   left_join(microorganisms.codes) %>%
+#   filter(mo != mo_new)
+# microorganisms.codes <- microorganisms.codes %>%
+#   filter(!code %in% c(insert1$code, insert2$code)) %>%
+#   bind_rows(insert1, insert2) %>%
+#   arrange(code)
+# # end
+
+## Save to package ----
+class(microorganisms.codes$mo) <- c("mo", "character")
+usethis::use_data(microorganisms.codes, overwrite = TRUE, compress = "xz", version = 2)
+rm(microorganisms.codes)
+devtools::load_all()
+
+
+# BREAKPOINTS ----
+
+# now that we have the correct MO codes, get the breakpoints and convert them
+
+whonet_breakpoints %>% 
+  count(GUIDELINES, BREAKPOINT_TYPE) %>% 
+  pivot_wider(names_from = BREAKPOINT_TYPE, values_from = n) %>% 
+  janitor::adorn_totals(where = c("row", "col"))
+# compared to current
+AMR::clinical_breakpoints %>%
+  count(GUIDELINES = gsub("[^a-zA-Z]", "", guideline), type) %>%
+  arrange(tolower(type)) %>%
+  pivot_wider(names_from = type, values_from = n) %>% 
+  as.data.frame() %>%
+  janitor::adorn_totals(where = c("row", "col"))
+
+breakpoints <- whonet_breakpoints %>%
+  mutate(code = toupper(ORGANISM_CODE)) %>%
+  left_join(bind_rows(microorganisms.codes %>% filter(!code %in% c("ALL", "GEN")),
+                      # GEN (Generic) and ALL (All) are PK/PD codes
+                      data.frame(code = c("ALL", "GEN"),
+                                 mo = rep(as.mo("UNKNOWN"), 2))))
+# these ones lack an MO name, they cannot be used:
+unknown <- breakpoints %>%
+  filter(is.na(mo)) %>%
+  pull(code) %>%
+  unique()
+breakpoints %>% 
+  filter(code %in% unknown) %>% 
+  count(GUIDELINES, YEAR, ORGANISM_CODE, BREAKPOINT_TYPE, sort = TRUE)
+# 2025-03-11: these codes are currently: clu, kma, fso, tyi. No clue (are not in MO list of WHONET), and they are only ECOFFs, so remove them:
+breakpoints <- breakpoints %>% 
+  filter(!is.na(mo))
+
+# and these ones have unknown antibiotics according to WHONET itself:
+breakpoints %>% 
+  filter(!WHONET_ABX_CODE %in% whonet_antibiotics$WHONET_ABX_CODE) %>% 
+  count(YEAR, GUIDELINES, WHONET_ABX_CODE) %>% 
+  arrange(desc(YEAR))
+breakpoints %>% 
+  filter(!WHONET_ABX_CODE %in% whonet_antibiotics$WHONET_ABX_CODE) %>%
+  pull(WHONET_ABX_CODE) %>%
+  unique()
+# they are at the moment all old codes ("CFC", "ROX", "FIX") that have the right replacements in `antimicrobials`, so we can use as.ab()
+
+
+## Build new breakpoints table ----
+
+breakpoints_new <- breakpoints %>%
+  filter(!is.na(WHONET_ABX_CODE)) %>% 
+  transmute(
+    guideline = paste(GUIDELINES, YEAR),
+    type = ifelse(BREAKPOINT_TYPE == "ECOFF", "ECOFF", tolower(BREAKPOINT_TYPE)),
+    host = ifelse(BREAKPOINT_TYPE == "ECOFF", "ECOFF", tolower(HOST)),
+    method = TEST_METHOD,
+    site = SITE_OF_INFECTION,
+    mo,
+    rank_index = case_when(
+      is.na(mo_rank(mo, keep_synonyms = TRUE)) ~ 6, # for UNKNOWN, B_GRAMN, B_ANAER, B_ANAER-NEG, etc.
+      mo_rank(mo, keep_synonyms = TRUE) %like% "(infra|sub)" ~ 1,
+      mo_rank(mo, keep_synonyms = TRUE) == "species" ~ 2,
+      mo_rank(mo, keep_synonyms = TRUE) == "species group" ~ 2.5,
+      mo_rank(mo, keep_synonyms = TRUE) == "genus" ~ 3,
+      mo_rank(mo, keep_synonyms = TRUE) == "family" ~ 4,
+      mo_rank(mo, keep_synonyms = TRUE) == "order" ~ 5,
+      TRUE ~ 6
+    ),
+    ab = as.ab(WHONET_ABX_CODE),
+    ref_tbl = ifelse(type == "ECOFF" & is.na(REFERENCE_TABLE), "ECOFF", REFERENCE_TABLE),
+    disk_dose = POTENCY,
+    breakpoint_S = ifelse(type == "ECOFF" & is.na(S) & !is.na(ECV_ECOFF), ECV_ECOFF, S),
+    breakpoint_R = ifelse(type == "ECOFF" & is.na(R) & !is.na(ECV_ECOFF), ECV_ECOFF, R),
+    uti = ifelse(is.na(site), FALSE, gsub(".*(UTI|urinary|urine).*", "UTI", site) == "UTI"),
+    is_SDD = !is.na(SDD)
+  ) %>%
+  # Greek symbols and EM dash symbols are not allowed by CRAN, so replace them with ASCII:
+  mutate(disk_dose = disk_dose %>%
+    gsub("μ", "mc", ., fixed = TRUE) %>% # this is 'mu', \u03bc
+    gsub("µ", "mc", ., fixed = TRUE) %>% # this is 'micro', \u00b5 (yes, they look the same)
+    gsub("–", "-", ., fixed = TRUE) %>%
+    gsub("(?<=\\d)(?=[a-zA-Z])", " ", ., perl = TRUE)) %>% # make sure we keep a space after a number, e.g. "1mcg" to "1 mcg"
+  arrange(desc(guideline), mo, ab, type, method) %>%
+  filter(!(is.na(breakpoint_S) & is.na(breakpoint_R)) & !is.na(mo) & !is.na(ab)) %>%
+  distinct(guideline, type, host, ab, mo, method, site, breakpoint_S, .keep_all = TRUE)
+
+# fix reference table names
+breakpoints_new %>% filter(guideline %like% "EUCAST", is.na(ref_tbl)) %>% View()
+breakpoints_new <- breakpoints_new %>% 
+  mutate(ref_tbl = case_when(is.na(ref_tbl) & guideline %like% "EUCAST 202" ~ lead(ref_tbl),
+                             is.na(ref_tbl) ~ "Unknown",
+                             TRUE ~ ref_tbl))
+
+# clean disk zones
+breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_S"] <- as.double(as.disk(breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_S", drop = TRUE]))
+breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_R"] <- as.double(as.disk(breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_R", drop = TRUE]))
+
+# regarding animal breakpoints, CLSI has adults and foals for horses, but only for amikacin - only keep adult horses
+breakpoints_new %>% 
+  filter(host %like% "foal") %>%
+  count(guideline, host)
+breakpoints_new <- breakpoints_new %>% 
+  filter(host %unlike% "foal") %>% 
+  mutate(host = ifelse(host %like% "horse", "horse", host))
+
+# FIXES FOR WHONET ERRORS ----
+m <- unique(as.double(as.mic(levels(as.mic(1)))))
+
+# WHONET has no >1024 but instead uses 1025, 513, etc, so as.mic() cannot be used to clean.
+# instead, raise these one higher valid MIC factor level:
+breakpoints_new[which(breakpoints_new$breakpoint_R == 129), "breakpoint_R"] <- m[which(m == 128) + 1]
+breakpoints_new[which(breakpoints_new$breakpoint_R == 257), "breakpoint_R"] <- m[which(m == 256) + 1]
+breakpoints_new[which(breakpoints_new$breakpoint_R == 513), "breakpoint_R"] <- m[which(m == 512) + 1]
+breakpoints_new[which(breakpoints_new$breakpoint_R == 1025), "breakpoint_R"] <- m[which(m == 1024) + 1]
+
+# a lot of R breakpoints are missing, though none of the S breakpoints are missing:
+anyNA(breakpoints_new$breakpoint_S)
+
+breakpoints_new %>% 
+  filter(is.na(breakpoint_R)) %>% 
+  count(guideline, host) |>
+  pivot_wider(names_from = host,
+              values_from = n,
+              values_fill = list(n = 0)) |>
+  View()
+
+# 2025-03-12 don't do this anymore - we now use as.sir(..., substitute_missing_r_breakpoint = TRUE/FALSE, ...)
+# breakpoints_new[which(breakpoints_new$method == "MIC" &
+#                         is.na(breakpoints_new$breakpoint_R)), "breakpoint_R"] <- max(m)
+
+
+# fix streptococci in WHONET table of EUCAST: Strep A, B, C and G must only include these groups and not all streptococci:
+breakpoints_new$mo[breakpoints_new$mo == "B_STRPT" & breakpoints_new$ref_tbl %like% "^strep.* a.* b.*c.*g"] <- as.mo("B_STRPT_ABCG")
+# Haemophilus same error (must only be H. influenzae)
+breakpoints_new$mo[breakpoints_new$mo == "B_HMPHL" & breakpoints_new$ref_tbl %like% "^h.* influenzae"] <- as.mo("B_HMPHL_INFL")
+# EUCAST says that for H. parainfluenzae the H. influenza rules can be used, so add them
+breakpoints_new <- breakpoints_new %>% 
+  bind_rows(
+    breakpoints_new %>%
+      filter(guideline %like% "EUCAST", mo == "B_HMPHL_INFL") %>% 
+      mutate(mo = as.mo("B_HMPHL_PRNF"))
+  ) %>% 
+  arrange(desc(guideline), mo, ab, type, host, method)
+# Achromobacter denitrificans is in WHONET included in their A. xylosoxidans table, must be removed
+breakpoints_new <- breakpoints_new %>% filter(mo != as.mo("Achromobacter denitrificans"))
+# WHONET contains gentamicin breakpoints for viridans streptocci, which are intrinsic R - they meant genta-high, which is ALSO in their table, so we just remove gentamicin in viridans streptococci
+breakpoints_new <- breakpoints_new %>% filter(!(mo == as.mo("Streptococcus viridans") & ab == "GEN"))
+# Nitrofurantoin in Staph (EUCAST) only applies to S. saprophyticus, while WHONET has the DISK correct but the MIC on genus level
+breakpoints_new$mo[breakpoints_new$mo == "B_STPHY" & breakpoints_new$ab == "NIT" & breakpoints_new$guideline %like% "EUCAST"] <- as.mo("B_STPHY_SPRP")
+# WHONET sets the 2023 breakpoints for SAM to MIC of 16/32 for Enterobacterales, should be MIC 8/32 like AMC (see issue #123 on github.com/msberends/AMR)
+# UPDATE 2024-02-22: fixed now
+
+# There's a problem with C. diff in EUCAST where breakpoint_R is missing - they are listed as normal human breakpoints but are ECOFF
+rows <- which(breakpoints_new$guideline %like% "EUCAST" & breakpoints_new$mo == "B_CRDDS_DFFC" & is.na(breakpoints_new$breakpoint_R) & !is.na(breakpoints_new$breakpoint_S))
+breakpoints_new$type[rows] <- "ECOFF"
+breakpoints_new$host[rows] <- "ECOFF"
+breakpoints_new$ref_tbl[rows] <- "ECOFF"
+breakpoints_new$breakpoint_R[rows] <- breakpoints_new$breakpoint_S[rows]
+breakpoints_new <- distinct(breakpoints_new, .keep_all = TRUE)
+
+# determine rank again now that some changes were made on taxonomic level (genus -> species)
+breakpoints_new <- breakpoints_new %>% 
+  mutate(rank_index = case_when(
+    mo_rank(mo, keep_synonyms = TRUE) %like% "(infra|sub)" ~ 1,
+    mo_rank(mo, keep_synonyms = TRUE) == "species" ~ 2,
+    mo_rank(mo, keep_synonyms = TRUE) == "species group" ~ 2.5,
+    mo_rank(mo, keep_synonyms = TRUE) == "genus" ~ 3,
+    mo_rank(mo, keep_synonyms = TRUE) == "family" ~ 4,
+    mo_rank(mo, keep_synonyms = TRUE) == "order" ~ 5,
+    mo != "UNKNOWN" ~ 6, # for B_ANAER, etc.
+    TRUE ~ 7
+  ))
+
+# WHONET adds one log2 level to the R breakpoint for their software, e.g. in AMC in Enterobacterales:
+# EUCAST 2023 guideline: S <= 8 and R > 8
+#           WHONET file: S <= 8 and R >= 16
+breakpoints_new %>% filter(guideline == "EUCAST 2023", ab == "AMC", mo == "B_[ORD]_ENTRBCTR", method == "MIC")
+# but this will make an MIC of 12 I, which should be R according to EUCAST, so:
+breakpoints_new <- breakpoints_new %>%
+  mutate(breakpoint_R = ifelse(guideline %like% "EUCAST" & method == "MIC" & log2(breakpoint_R) - log2(breakpoint_S) != 0,
+    pmax(breakpoint_S, breakpoint_R / 2),
+    breakpoint_R
+  ))
+# fix disks as well
+breakpoints_new %>% filter(guideline == "EUCAST 2023", ab == "AMC", mo == "B_[ORD]_ENTRBCTR", method == "DISK")
+breakpoints_new <- breakpoints_new %>%
+  mutate(breakpoint_R = ifelse(guideline %like% "EUCAST" & method == "DISK" & breakpoint_S - breakpoint_R != 0,
+    breakpoint_R + 1,
+    breakpoint_R
+  ))
+# fill missing R breakpoint where there is an S breakpoint
+# 2025-03-12 don't do this anymore - we now use as.sir(..., substitute_missing_r_breakpoint = TRUE/FALSE, ...)
+# breakpoints_new[which(is.na(breakpoints_new$breakpoint_R)), "breakpoint_R"] <- breakpoints_new[which(is.na(breakpoints_new$breakpoint_R)), "breakpoint_S"]
+
+
+# check the strange duplicates
+breakpoints_new %>% 
+  mutate(id = paste(guideline, type, host, method, site, mo, ab, uti)) %>% 
+  filter(id %in% .$id[which(duplicated(id))]) %>% 
+  arrange(desc(guideline))
+# 2024-06-19 mostly ECOFFs, but there's no explanation in the whonet_breakpoints file, we have to remove duplicates
+# remove duplicates
+breakpoints_new <- breakpoints_new %>% 
+  distinct(guideline, type, host, method, site, mo, ab, uti, .keep_all = TRUE)
+
+
+# CHECKS AND SAVE TO PACKAGE ----
+
+# check again
+breakpoints_new %>% filter(guideline == "EUCAST 2024", ab == "AMC", mo == "B_[ORD]_ENTRBCTR", method == "MIC")
+# compare with current version
+clinical_breakpoints %>% filter(guideline == "EUCAST 2024", ab == "AMC", mo == "B_[ORD]_ENTRBCTR", method == "MIC")
+
+# must have "human" and "ECOFF"
+breakpoints_new %>% filter(mo == "B_STRPT_PNMN", ab == "AMP", guideline == "EUCAST 2020", method == "MIC")
+
+# check dimensions
+dim(breakpoints_new)
+dim(clinical_breakpoints)
+
+clinical_breakpoints <- breakpoints_new
+clinical_breakpoints <- clinical_breakpoints %>% dataset_UTF8_to_ASCII()
+usethis::use_data(clinical_breakpoints, overwrite = TRUE, compress = "xz", version = 2)
+rm(clinical_breakpoints)
+devtools::load_all(".")
--- a/data-raw/_reproduction_scripts/reproduction_of_dosage.R
+++ b/data-raw/_reproduction_scripts/reproduction_of_dosage.R
@ -0,0 +1,178 @@
+# ==================================================================== #
+# TITLE:                                                               #
+# AMR: An R Package for Working with Antimicrobial Resistance Data     #
+#                                                                      #
+# SOURCE CODE:                                                         #
+# https://github.com/msberends/AMR                                     #
+#                                                                      #
+# PLEASE CITE THIS SOFTWARE AS:                                        #
+# Berends MS, Luz CF, Friedrich AW, et al. (2022).                     #
+# AMR: An R Package for Working with Antimicrobial Resistance Data.    #
+# Journal of Statistical Software, 104(3), 1-31.                       #
+# https://doi.org/10.18637/jss.v104.i03                                #
+#                                                                      #
+# Developed at the University of Groningen and the University Medical  #
+# Center Groningen in The Netherlands, in collaboration with many      #
+# colleagues from around the world, see our website.                   #
+#                                                                      #
+# This R package is free software; you can freely use and distribute   #
+# it for both personal and commercial purposes under the terms of the  #
+# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
+# the Free Software Foundation.                                        #
+# We created this package for both routine data analysis and academic  #
+# research and it was publicly released in the hope that it will be    #
+# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
+#                                                                      #
+# Visit our website for the full manual and a complete tutorial about  #
+# how to conduct AMR data analysis: https://amr-for-r.org              #
+# ==================================================================== #
+
+library(dplyr)
+library(readxl)
+library(cleaner)
+
+# URL:
+# https://www.eucast.org/fileadmin/src/media/PDFs/EUCAST_files/Breakpoint_tables/Dosages_v_13.0_Breakpoint_Tables.pdf
+# download the PDF file, open in Adobe Acrobat and export as Excel workbook
+breakpoints_version <- 13
+
+dosage_source <- read_excel("data-raw/Dosages_v_12.0_Breakpoint_Tables.xlsx", skip = 4, na = "None") %>%
+  format_names(snake_case = TRUE, penicillins = "drug") %>%
+  filter(!tolower(standard_dosage) %in% c("standard dosage", "standard dosage_source", "under review")) %>%
+  filter(!is.na(standard_dosage)) %>%
+  # keep only one drug in the table
+  arrange(desc(drug)) %>%
+  mutate(drug = gsub("(.*) ([(]|iv|oral).*", "\\1", drug)) %>%
+  # distinct(drug, .keep_all = TRUE) %>%
+  arrange(drug) %>%
+  mutate(
+    ab = as.ab(drug),
+    ab_name = ab_name(ab, language = NULL)
+  )
+
+dosage_source <- bind_rows(
+  # oral
+  dosage_source %>%
+    filter(standard_dosage %like% " oral") %>%
+    mutate(
+      standard_dosage = gsub("oral.*", "oral", standard_dosage),
+      high_dosage = if_else(high_dosage %like% "oral",
+        gsub("oral.*", "oral", high_dosage),
+        NA_character_
+      )
+    ),
+  # iv
+  dosage_source %>%
+    filter(standard_dosage %like% " iv") %>%
+    mutate(
+      standard_dosage = gsub(".* or ", "", standard_dosage),
+      high_dosage = if_else(high_dosage %like% "( or | iv)",
+        gsub(".* or ", "", high_dosage),
+        NA_character_
+      )
+    ),
+  # im
+  dosage_source %>%
+    filter(standard_dosage %like% " im")
+) %>%
+  arrange(drug)
+
+
+get_dosage_lst <- function(col_data) {
+  standard <- col_data %>%
+    # remove new lines
+    gsub(" ?(\n|\t)+ ?", " ", .) %>%
+    # keep only the first suggestion, replace all after 'or' and more informative texts
+    gsub("(.*?) (or|with|loading|depending|over|by) .*", "\\1", .) %>%
+    # remove (1 MU)
+    gsub(" [(][0-9] [A-Z]+[)]", "", .) %>%
+    # remove parentheses
+    gsub("[)(]", "", .) %>%
+    # remove drug names
+    gsub(" [a-z]{5,99}( |$)", " ", .) %>%
+    gsub(" [a-z]{5,99}( |$)", " ", .) %>%
+    gsub(" (acid|dose)", "", .) # %>%
+  # keep lowest value only (25-30 mg -> 25 mg)
+  # gsub("[-].*? ", " ", .)
+
+  dosage_lst <- lapply(
+    strsplit(standard, " x "),
+    function(x) {
+      dose <- x[1]
+      if (dose %like% "under") {
+        dose <- NA_character_
+      }
+      admin <- x[2]
+
+      list(
+        dose = trimws(dose),
+        dose_times = gsub("^([0-9.]+).*", "\\1", admin),
+        administration = clean_character(admin),
+        notes = "",
+        original_txt = ""
+      )
+    }
+  )
+  for (i in seq_len(length(col_data))) {
+    dosage_lst[[i]]$original_txt <- gsub("\n", " ", col_data[i])
+    if (col_data[i] %like% " (or|with|loading|depending|over) ") {
+      dosage_lst[[i]]$notes <- gsub("\n", " ", gsub(".* ((or|with|loading|depending|over) .*)", "\\1", col_data[i]))
+    }
+  }
+  dosage_lst
+}
+
+standard <- get_dosage_lst(dosage_source$standard_dosage)
+high <- get_dosage_lst(dosage_source$high_dosage)
+uti <- get_dosage_lst(dosage_source$uncomplicated_uti)
+dosage_new <- bind_rows(
+  # standard dose
+  data.frame(
+    ab = dosage_source$ab,
+    name = dosage_source$ab_name,
+    type = "standard_dosage",
+    dose = sapply(standard, function(x) x$dose),
+    dose_times = sapply(standard, function(x) x$dose_times),
+    administration = sapply(standard, function(x) x$administration),
+    notes = sapply(standard, function(x) x$notes),
+    original_txt = sapply(standard, function(x) x$original_txt),
+    stringsAsFactors = FALSE
+  ),
+  # high dose
+  data.frame(
+    ab = dosage_source$ab,
+    name = dosage_source$ab_name,
+    type = "high_dosage",
+    dose = sapply(high, function(x) x$dose),
+    dose_times = sapply(high, function(x) x$dose_times),
+    administration = sapply(high, function(x) x$administration),
+    notes = sapply(high, function(x) x$notes),
+    original_txt = sapply(high, function(x) x$original_txt),
+    stringsAsFactors = FALSE
+  ),
+  # UTIs
+  data.frame(
+    ab = dosage_source$ab,
+    name = dosage_source$ab_name,
+    type = "uncomplicated_uti",
+    dose = sapply(uti, function(x) x$dose),
+    dose_times = sapply(uti, function(x) x$dose_times),
+    administration = sapply(uti, function(x) x$administration),
+    notes = sapply(uti, function(x) x$notes),
+    original_txt = sapply(uti, function(x) x$original_txt),
+    stringsAsFactors = FALSE
+  )
+) %>%
+  mutate(
+    eucast_version = breakpoints_version,
+    dose_times = as.integer(dose_times),
+    administration = gsub("([a-z]+) .*", "\\1", administration)
+  ) %>%
+  arrange(name, administration, type) %>%
+  filter(!is.na(dose), dose != ".") %>%
+  # this makes it a tibble as well:
+  dataset_UTF8_to_ASCII()
+
+dosage <- bind_rows(dosage_new, AMR::dosage)
+
+usethis::use_data(dosage, internal = FALSE, overwrite = TRUE, version = 2, compress = "xz")
--- a/data-raw/_reproduction_scripts/reproduction_of_example_isolates_unclean.R
+++ b/data-raw/_reproduction_scripts/reproduction_of_example_isolates_unclean.R
@ -0,0 +1,146 @@
+# ==================================================================== #
+# TITLE:                                                               #
+# AMR: An R Package for Working with Antimicrobial Resistance Data     #
+#                                                                      #
+# SOURCE CODE:                                                         #
+# https://github.com/msberends/AMR                                     #
+#                                                                      #
+# PLEASE CITE THIS SOFTWARE AS:                                        #
+# Berends MS, Luz CF, Friedrich AW, et al. (2022).                     #
+# AMR: An R Package for Working with Antimicrobial Resistance Data.    #
+# Journal of Statistical Software, 104(3), 1-31.                       #
+# https://doi.org/10.18637/jss.v104.i03                                #
+#                                                                      #
+# Developed at the University of Groningen and the University Medical  #
+# Center Groningen in The Netherlands, in collaboration with many      #
+# colleagues from around the world, see our website.                   #
+#                                                                      #
+# This R package is free software; you can freely use and distribute   #
+# it for both personal and commercial purposes under the terms of the  #
+# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
+# the Free Software Foundation.                                        #
+# We created this package for both routine data analysis and academic  #
+# research and it was publicly released in the hope that it will be    #
+# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
+#                                                                      #
+# Visit our website for the full manual and a complete tutorial about  #
+# how to conduct AMR data analysis: https://amr-for-r.org              #
+# ==================================================================== #
+
+patients <- unlist(lapply(LETTERS, paste0, 1:10))
+
+patients_table <- data.frame(
+  patient_id = patients,
+  gender = c(
+    rep("M", 135),
+    rep("F", 125)
+  )
+)
+
+dates <- seq(as.Date("2011-01-01"), as.Date("2020-01-01"), by = "day")
+
+bacteria_a <- c(
+  "E. coli", "S. aureus",
+  "S. pneumoniae", "K. pneumoniae"
+)
+
+bacteria_b <- c("esccol", "staaur", "strpne", "klepne")
+
+bacteria_c <- c(
+  "Escherichia coli", "Staphylococcus aureus",
+  "Streptococcus pneumoniae", "Klebsiella pneumoniae"
+)
+
+ab_interpretations <- c("S", "I", "R")
+
+ab_interpretations_messy <- c("R", "< 0.5 S", "I")
+
+sample_size <- 1000
+
+data_a <- data.frame(
+  date = sample(dates, size = sample_size, replace = TRUE),
+  hospital = "A",
+  bacteria = sample(bacteria_a,
+    size = sample_size, replace = TRUE,
+    prob = c(0.50, 0.25, 0.15, 0.10)
+  ),
+  AMX = sample(ab_interpretations,
+    size = sample_size, replace = TRUE,
+    prob = c(0.60, 0.05, 0.35)
+  ),
+  AMC = sample(ab_interpretations,
+    size = sample_size, replace = TRUE,
+    prob = c(0.75, 0.10, 0.15)
+  ),
+  CIP = sample(ab_interpretations,
+    size = sample_size, replace = TRUE,
+    prob = c(0.80, 0.00, 0.20)
+  ),
+  GEN = sample(ab_interpretations,
+    size = sample_size, replace = TRUE,
+    prob = c(0.92, 0.00, 0.08)
+  )
+)
+
+
+data_b <- data.frame(
+  date = sample(dates, size = sample_size, replace = TRUE),
+  hospital = "B",
+  bacteria = sample(bacteria_b,
+    size = sample_size, replace = TRUE,
+    prob = c(0.50, 0.25, 0.15, 0.10)
+  ),
+  AMX = sample(ab_interpretations_messy,
+    size = sample_size, replace = TRUE,
+    prob = c(0.60, 0.05, 0.35)
+  ),
+  AMC = sample(ab_interpretations_messy,
+    size = sample_size, replace = TRUE,
+    prob = c(0.75, 0.10, 0.15)
+  ),
+  CIP = sample(ab_interpretations_messy,
+    size = sample_size, replace = TRUE,
+    prob = c(0.80, 0.00, 0.20)
+  ),
+  GEN = sample(ab_interpretations_messy,
+    size = sample_size, replace = TRUE,
+    prob = c(0.92, 0.00, 0.08)
+  )
+)
+
+data_c <- data.frame(
+  date = sample(dates, size = sample_size, replace = TRUE),
+  hospital = "C",
+  bacteria = sample(bacteria_c,
+    size = sample_size, replace = TRUE,
+    prob = c(0.50, 0.25, 0.15, 0.10)
+  ),
+  AMX = sample(ab_interpretations,
+    size = sample_size, replace = TRUE,
+    prob = c(0.60, 0.05, 0.35)
+  ),
+  AMC = sample(ab_interpretations,
+    size = sample_size, replace = TRUE,
+    prob = c(0.75, 0.10, 0.15)
+  ),
+  CIP = sample(ab_interpretations,
+    size = sample_size, replace = TRUE,
+    prob = c(0.80, 0.00, 0.20)
+  ),
+  GEN = sample(ab_interpretations,
+    size = sample_size, replace = TRUE,
+    prob = c(0.92, 0.00, 0.08)
+  )
+)
+
+
+example_isolates_unclean <- data_a %>%
+  bind_rows(data_b, data_c)
+
+example_isolates_unclean$patient_id <- sample(patients, size = nrow(example_isolates_unclean), replace = TRUE)
+
+example_isolates_unclean <- example_isolates_unclean %>%
+  select(patient_id, hospital, date, bacteria, everything()) %>%
+  dataset_UTF8_to_ASCII()
+
+usethis::use_data(example_isolates_unclean, overwrite = TRUE, internal = FALSE, version = 2, compress = "xz")
--- a/data-raw/_reproduction_scripts/reproduction_of_intrinsic_resistant.R
+++ b/data-raw/_reproduction_scripts/reproduction_of_intrinsic_resistant.R
@ -0,0 +1,68 @@
+# ==================================================================== #
+# TITLE:                                                               #
+# AMR: An R Package for Working with Antimicrobial Resistance Data     #
+#                                                                      #
+# SOURCE CODE:                                                         #
+# https://github.com/msberends/AMR                                     #
+#                                                                      #
+# PLEASE CITE THIS SOFTWARE AS:                                        #
+# Berends MS, Luz CF, Friedrich AW, et al. (2022).                     #
+# AMR: An R Package for Working with Antimicrobial Resistance Data.    #
+# Journal of Statistical Software, 104(3), 1-31.                       #
+# https://doi.org/10.18637/jss.v104.i03                                #
+#                                                                      #
+# Developed at the University of Groningen and the University Medical  #
+# Center Groningen in The Netherlands, in collaboration with many      #
+# colleagues from around the world, see our website.                   #
+#                                                                      #
+# This R package is free software; you can freely use and distribute   #
+# it for both personal and commercial purposes under the terms of the  #
+# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
+# the Free Software Foundation.                                        #
+# We created this package for both routine data analysis and academic  #
+# research and it was publicly released in the hope that it will be    #
+# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
+#                                                                      #
+# Visit our website for the full manual and a complete tutorial about  #
+# how to conduct AMR data analysis: https://amr-for-r.org              #
+# ==================================================================== #
+
+library(AMR)
+library(dplyr)
+int_resis <- data.frame(mo = microorganisms$mo, stringsAsFactors = FALSE)
+for (i in seq_len(nrow(antimicrobials))) {
+  int_resis$new <- as.sir("S")
+  colnames(int_resis)[ncol(int_resis)] <- antimicrobials$ab[i]
+}
+
+int_resis <- eucast_rules(int_resis,
+  eucast_rules_df = subset(
+    AMR:::EUCAST_RULES_DF,
+    is.na(have_these_values) & reference.rule_group == "Expected phenotypes" & reference.version == 1.2
+  ),
+  overwrite = TRUE,
+  info = FALSE
+)
+
+int_resis2 <- int_resis[, sapply(int_resis, function(x) any(!is.sir(x) | x == "R")), drop = FALSE] %>%
+  tidyr::pivot_longer(-mo) %>%
+  filter(value == "R") %>%
+  select(mo, ab = name)
+
+# remove lab drugs
+untreatable <- antimicrobials[which(antimicrobials$name %like% "-high|EDTA|polysorbate|macromethod|screening|/nacubactam"), "ab", drop = TRUE]
+# takes ages with filter()..., weird
+int_resis3 <- int_resis2[which(!int_resis2$ab %in% untreatable), ]
+int_resis3$ab <- as.ab(int_resis3$ab)
+int_resis3
+
+all(int_resis3$mo %in% microorganisms$mo)
+all(int_resis3$ab %in% antimicrobials$ab)
+
+intrinsic_resistant <- int_resis3
+
+usethis::use_data(intrinsic_resistant, internal = FALSE, overwrite = TRUE, version = 2, compress = "xz")
+rm(intrinsic_resistant)
+
+# AFTER THIS:
+# DO NOT FORGET TO UPDATE THE VERSION NUMBER IN mo_is_intrinsic_resistant() AND R/data.R
--- a/data-raw/_reproduction_scripts/reproduction_of_microorganisms.R
+++ b/data-raw/_reproduction_scripts/reproduction_of_microorganisms.R
--- a/data-raw/_reproduction_scripts/reproduction_of_microorganisms.groups.R
+++ b/data-raw/_reproduction_scripts/reproduction_of_microorganisms.groups.R
@ -0,0 +1,189 @@
+# ==================================================================== #
+# TITLE:                                                               #
+# AMR: An R Package for Working with Antimicrobial Resistance Data     #
+#                                                                      #
+# SOURCE CODE:                                                         #
+# https://github.com/msberends/AMR                                     #
+#                                                                      #
+# PLEASE CITE THIS SOFTWARE AS:                                        #
+# Berends MS, Luz CF, Friedrich AW, et al. (2022).                     #
+# AMR: An R Package for Working with Antimicrobial Resistance Data.    #
+# Journal of Statistical Software, 104(3), 1-31.                       #
+# https://doi.org/10.18637/jss.v104.i03                                #
+#                                                                      #
+# Developed at the University of Groningen and the University Medical  #
+# Center Groningen in The Netherlands, in collaboration with many      #
+# colleagues from around the world, see our website.                   #
+#                                                                      #
+# This R package is free software; you can freely use and distribute   #
+# it for both personal and commercial purposes under the terms of the  #
+# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
+# the Free Software Foundation.                                        #
+# We created this package for both routine data analysis and academic  #
+# research and it was publicly released in the hope that it will be    #
+# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
+#                                                                      #
+# Visit our website for the full manual and a complete tutorial about  #
+# how to conduct AMR data analysis: https://amr-for-r.org              #
+# ==================================================================== #
+
+# This data set is being used in the clinical_breakpoints data set, and thus by as.sir().
+# It prevents the breakpoints table from being extremely long for species that are part of a species group.
+# Also used by eucast_rules() to expand group names.
+
+library(dplyr)
+library(readr)
+library(tidyr)
+devtools::load_all()
+
+# Install the WHONET software on Windows (http://www.whonet.org/software.html),
+# and copy the folder C:\WHONET\Resources to the data-raw/WHONET/ folder
+
+
+# BACTERIAL COMPLEXES
+# find all bacterial complex in the NCBI Taxonomy Browser here:
+# https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Undef&id=2&lvl=6&lin=f&keep=1&srchmode=1&unlock
+# and search the page for 'complex', then follow the link and add the missing ones in this R file
+
+
+# READ DATA ----
+
+whonet_organisms <- read_tsv("data-raw/WHONET/Resources/Organisms.txt", na = c("", "NA", "-"), show_col_types = FALSE) %>%
+  # remove old taxonomic names
+  filter(TAXONOMIC_STATUS == "C") %>%
+  mutate(ORGANISM_CODE = toupper(WHONET_ORG_CODE))
+
+whonet_organisms <- whonet_organisms %>%
+  select(ORGANISM_CODE, ORGANISM, SPECIES_GROUP, GBIF_TAXON_ID) %>%
+  mutate(
+    # this one was called Issatchenkia orientalis, but it should be:
+    ORGANISM = if_else(ORGANISM_CODE == "ckr", "Candida krusei", ORGANISM)
+  ) %>% 
+  # try to match on GBIF identifier
+  left_join(microorganisms %>% distinct(mo, gbif, status) %>% filter(!is.na(gbif)), by = c("GBIF_TAXON_ID" = "gbif")) %>% 
+  # remove duplicates
+  arrange(ORGANISM_CODE, GBIF_TAXON_ID, status) %>%
+  distinct(ORGANISM_CODE, .keep_all = TRUE) %>% 
+  # add Enterobacterales, which is a subkingdom code in their data
+  bind_rows(data.frame(ORGANISM_CODE = "ebc", ORGANISM = "Enterobacterales", mo = as.mo("Enterobacterales"))) %>% 
+  arrange(ORGANISM)
+
+# check non-existing species groups in the microorganisms table
+whonet_organisms %>%
+  filter(!is.na(SPECIES_GROUP)) %>%
+  group_by(SPECIES_GROUP) %>%
+  summarise(complex = ORGANISM[ORGANISM %like% " (group|complex)"][1],
+            organisms = paste0(n(), ": ", paste(sort(unique(ORGANISM)), collapse = ", "))) %>% 
+  filter(!SPECIES_GROUP %in% microorganisms.codes$code)
+
+# create the species group data set ----
+microorganisms.groups <- whonet_organisms %>%
+  # these will not be translated well
+  filter(!ORGANISM %in% c("Trueperella pyogenes-like bacteria",
+                          "Mycobacterium suricattae",
+                          "Mycobacterium canetti")) %>% 
+  filter(!is.na(SPECIES_GROUP), SPECIES_GROUP != ORGANISM_CODE) %>%
+  transmute(mo_group = as.mo(SPECIES_GROUP),
+            mo = ifelse(is.na(mo),
+                        as.character(as.mo(ORGANISM, keep_synonyms = TRUE, minimum_matching_score = 0)),
+                        mo)) %>% 
+  # add our own CoNS and CoPS, WHONET does not strictly follow Becker et al (2014, 2019, 2020)
+  filter(mo_group != as.mo("CoNS")) %>% 
+  bind_rows(tibble(mo_group = as.mo("CoNS"), mo = MO_CONS)) %>% 
+  filter(mo_group != as.mo("CoPS")) %>% 
+  bind_rows(tibble(mo_group = as.mo("CoPS"), mo = MO_COPS)) %>% 
+  # at least all our Lancefield-grouped streptococci must be in the beta-haemolytic group:
+  bind_rows(tibble(mo_group = as.mo("Beta-haemolytic streptococcus"), 
+                   mo = c(MO_LANCEFIELD,
+                          microorganisms %>% filter(fullname %like% "^Streptococcus Group") %>% pull(mo)))) %>% 
+  # and per Streptococcus group as well:
+  # group A - S. pyogenes
+  bind_rows(tibble(mo_group = as.mo("Streptococcus Group A"),
+                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_PYGN(_|$)")])) %>% 
+  # group B - S. agalactiae
+  bind_rows(tibble(mo_group = as.mo("Streptococcus Group B"),
+                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_AGLC(_|$)")])) %>% 
+  # group C - all subspecies within S. dysgalactiae and S. equi (such as S. equi zooepidemicus)
+  bind_rows(tibble(mo_group = as.mo("Streptococcus Group C"),
+                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|EQUI)(_|$)")])) %>% 
+  # group F - Milleri group == S. anginosus group, which incl. S. anginosus, S. constellatus, S. intermedius
+  bind_rows(tibble(mo_group = as.mo("Streptococcus Group F"),
+                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(ANGN|CNST|INTR)(_|$)")])) %>% 
+  # group G - S. dysgalactiae and S. canis (though dysgalactiae is also group C and will be matched there)
+  bind_rows(tibble(mo_group = as.mo("Streptococcus Group G"),
+                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(DYSG|CANS)(_|$)")])) %>% 
+  # group H - S. sanguinis
+  bind_rows(tibble(mo_group = as.mo("Streptococcus Group H"),
+                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SNGN(_|$)")])) %>% 
+  # group K - S. salivarius, incl. S. salivarius salivariuss and S. salivarius thermophilus
+  bind_rows(tibble(mo_group = as.mo("Streptococcus Group K"),
+                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_SLVR(_|$)")])) %>%
+  # group L - only S. dysgalactiae
+  bind_rows(tibble(mo_group = as.mo("Streptococcus Group L"),
+                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_DYSG(_|$)")])) %>% 
+  # and for EUCAST: Strep group A, B, C, G
+  bind_rows(tibble(mo_group = as.mo("Streptococcus Group A, B, C, G"),
+                   mo = microorganisms$mo[which(microorganisms$mo %like% "^B_STRPT_(PYGN|AGLC|DYSG|EQUI|CANS|GRPA|GRPB|GRPC|GRPG)(_|$)")])) %>%
+  # HACEK is:
+  # - Haemophilus species
+  # - Aggregatibacter species
+  # - Cardiobacterium hominis
+  # - Eikenella corrodens
+  # - Kingella species
+  # - and previously Actinobacillus actinomycetemcomitans
+  # see https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3656887/
+  filter(mo_group != as.mo("HACEK")) %>% 
+  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Haemophilus") %>% pull(mo))) %>% 
+  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Aggregatibacter") %>% pull(mo))) %>% 
+  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Cardiobacterium hominis", keep_synonyms = TRUE))) %>%
+  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Eikenella corrodens", keep_synonyms = TRUE))) %>%
+  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = microorganisms %>% filter(genus == "Kingella") %>% pull(mo))) %>%
+  bind_rows(tibble(mo_group = as.mo("HACEK"), mo = as.mo("Actinobacillus actinomycetemcomitans", keep_synonyms = TRUE))) %>%
+  # Citrobacter freundii complex in the NCBI Taxonomy Browser:
+  # https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1344959
+  filter(mo_group != "B_CTRBC_FRND-C") %>% 
+  bind_rows(tibble(mo_group = as.mo("B_CTRBC_FRND-C"),
+                   mo = paste("Citrobacter", c("freundii", "braakii", "gillenii", "murliniae", "portucalensis", "sedlakii", "werkmanii", "youngae")) %>% as.mo(keep_synonyms = TRUE))) %>% 
+  # Klebsiella pneumoniae complex
+  filter(mo_group != "B_KLBSL_PNMN-C") %>% 
+  bind_rows(tibble(mo_group = as.mo("B_KLBSL_PNMN-C"),
+                   mo = paste("Klebsiella", c("africana", "pneumoniae", "quasipneumoniae", "quasivariicola", "variicola")) %>% as.mo(keep_synonyms = TRUE))) %>% 
+  # Yersinia pseudotuberculosis complex in the NCBI Taxonomy Browser:
+  # https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=1649845
+  filter(mo_group != "B_YERSN_PSDT-C") %>% 
+  bind_rows(tibble(mo_group = as.mo("B_YERSN_PSDT-C"),
+                   mo = paste("Yersinia", c("pseudotuberculosis", "pestis", "similis", "wautersii")) %>% as.mo(keep_synonyms = TRUE))) %>% 
+  # RGM are Rapidly-grwoing Mycobacteria, see https://pubmed.ncbi.nlm.nih.gov/28084211/
+  filter(mo_group != "B_MYCBC_RGM") %>% 
+  bind_rows(tibble(mo_group = as.mo("B_MYCBC_RGM"),
+                   mo = paste("Mycobacterium", c( "abscessus abscessus", "abscessus bolletii", "abscessus massiliense", "agri", "aichiense", "algericum", "alvei", "anyangense", "arabiense", "aromaticivorans", "aubagnense", "aubagnense", "aurum", "austroafricanum", "bacteremicum", "boenickei", "bourgelatii", "brisbanense", "brumae", "canariasense", "celeriflavum", "chelonae", "chitae", "chlorophenolicum", "chubuense", "confluentis", "cosmeticum", "crocinum", "diernhoferi", "duvalii", "elephantis", "fallax", "flavescens", "fluoranthenivorans", "fortuitum", "franklinii", "frederiksbergense", "gadium", "gilvum", "goodii", "hassiacum", "hippocampi", "hodleri", "holsaticum", "houstonense", "immunogenum", "insubricum", "iranicum", "komossense", "litorale", "llatzerense", "madagascariense", "mageritense", "monacense", "moriokaense", "mucogenicum", "mucogenicum", "murale", "neoaurum", "neworleansense", "novocastrense", "obuense", "pallens", "parafortuitum", "peregrinum", "phlei", "phocaicum", "phocaicum", "porcinum", "poriferae", "psychrotolerans", "pyrenivorans", "rhodesiae", "rufum", "rutilum", "salmoniphilum", "sediminis", "senegalense", "septicum", "setense", "smegmatis", "sphagni", "thermoresistibile", "tokaiense", "vaccae", "vanbaalenii", "wolinskyi")) %>% as.mo(keep_synonyms = TRUE)))
+
+# add subspecies to all species
+for (group in unique(microorganisms.groups$mo_group)) {
+  spp <- microorganisms.groups %>%
+    filter(mo_group == group & mo_rank(mo, keep_synonyms = TRUE) == "species") %>% 
+    pull(mo) %>% 
+    paste0(collapse = "|") %>% 
+    paste0("^(", ., ")")
+  mos <- microorganisms %>%
+    filter(mo %like% spp & rank == "subspecies") %>%
+    pull(mo)
+  # add them
+  microorganisms.groups <- microorganisms.groups %>% bind_rows(tibble(mo_group = as.mo(group), mo = mos))
+}
+
+# add full names, arrange and clean
+microorganisms.groups <- microorganisms.groups %>%
+  mutate(mo_group_name = mo_name(mo_group, keep_synonyms = TRUE, language = NULL),
+         mo_name = mo_name(mo, keep_synonyms = TRUE, language = NULL)) %>%
+  arrange(mo_group_name, mo_name) %>% 
+  filter(mo_group != mo) %>%
+  distinct() %>%
+  dataset_UTF8_to_ASCII()
+mo_uncertainties()
+
+class(microorganisms.groups$mo_group) <- c("mo", "character")
+class(microorganisms.groups$mo) <- c("mo", "character")
+usethis::use_data(microorganisms.groups, internal = FALSE, overwrite = TRUE, compress = "xz", version = 2)
+rm(microorganisms.groups)
+devtools::load_all()
--- a/data-raw/_reproduction_scripts/reproduction_of_poorman.R
+++ b/data-raw/_reproduction_scripts/reproduction_of_poorman.R
@ -0,0 +1,110 @@
+# get complete filenames of all R files in the GitHub repository of nathaneastwood/poorman
+
+library(magrittr)
+`%like%` <- function(x, y) grepl(y, x, ignore.case = TRUE, perl = TRUE)
+`%unlike%` <- function(x, y) !grepl(y, x, ignore.case = TRUE, perl = TRUE)
+
+commit <- "3cc0a9920b1eb559dd166f548561244189586b3a"
+
+files <- xml2::read_html(paste0("https://github.com/nathaneastwood/poorman/tree/", commit, "/R")) %>%
+  rvest::html_nodes("a") %>%
+  rvest::html_attr("href")
+files <- files[files %like% "/blob/.*R$"]
+
+# get full URLs of all raw R files
+files <- sort(paste0("https://raw.githubusercontent.com", gsub("blob/", "", files[files %like% "/R/.*.R$"])))
+# remove files with only pkg specific code
+files <- files[files %unlike% "(zzz|init)[.]R$"]
+# also, there's a lot of functions we don't use
+files <- files[files %unlike% "/(between|coalesce|cumulative|fill|glimpse|group_cols|na_if|near|nest_by|check_filter|poorman-package|print|recode|reconstruct|replace_na|replace_with|rownames|slice|union_all|unite|window_rank|with_groups)[.]R$"]
+
+# add our prepend file, containing info about the source of the data
+intro <- readLines("data-raw/poorman_prepend.R") %>%
+  # add commit to intro part
+  gsub("{commit}", commit, ., fixed = TRUE) %>%
+  # add date to intro part
+  gsub("{date}", trimws(format(Sys.Date(), "%e %B %Y")), ., fixed = TRUE)
+# copyright info:
+copyright <- paste0("# ", readLines(paste0("https://raw.githubusercontent.com/nathaneastwood/poorman/", commit, "/LICENSE")))
+
+# read all contents to a character vector
+contents <- character(0)
+sapply(files, function(file) {
+  message("reading ", basename(file))
+  contents <<- c(contents, readLines(file))
+  invisible()
+})
+
+# remove lines starting with "#'" and NULL and write to file
+contents <- contents[!grepl("^(#'|NULL|\"_PACKAGE)", contents)]
+contents.bak <- contents
+
+# grouped attributes same as dplyr
+contents <- gsub("grouped_data", "grouped_df", contents, fixed = TRUE)
+# now make it independent on UseMethod, since we will not export these functions
+has_usemethods <- gsub("^([a-z_]+).*", "\\1", contents[which(contents %like% "usemethod") - 1])
+for (use in has_usemethods) {
+  relevant_row <- which(contents %like% paste0("^", use, " <- function")) + 1
+  function_call <- trimws(gsub(".*function(.*)\\{.*", "\\1", contents[relevant_row - 1]))
+  function_call1 <- trimws(gsub("[()]", "", strsplit(function_call, ",")[[1]][1]))
+  if (any(contents %like% paste0(use, ".grouped_df"))) {
+    # this function will have methods for data.frame and grouped_df
+    contents[relevant_row] <- paste0("  if (\"grouped_df\" %in% class(", function_call1, ")) ", use, ".grouped_df", function_call, " else ", use, ".data.frame", function_call)
+  } else {
+    # this function will only have data.frame as method
+    contents[relevant_row] <- paste0("  ", use, ".data.frame", function_call)
+  }
+  # add pm_ prefix
+  contents[relevant_row - 1] <- paste0("pm_", contents[relevant_row - 1])
+}
+# correct for NextMethod
+contents <- gsub("NextMethod\\(\"(.*)\"\\)", "\\1.data.frame(...)", contents)
+# correct for 'default' method
+contents <- gsub(".default <-", ".data.frame <-", contents, fixed = TRUE)
+contents <- gsub("pm_group_by_drop.data.frame", "pm_group_by_drop", contents, fixed = TRUE)
+contents <- gsub("(stats::)?setNames", "stats::setNames", contents)
+# now get all those pm_* functions to replace all untransformed function name calls as well
+new_pm_names <- sort(gsub("pm_(.*?) <-.*", "\\1", contents[grepl("^pm_", contents)]))
+for (i in seq_len(length(new_pm_names))) {
+  contents <- gsub(paste0("([^a-z._])", new_pm_names[i], "([^a-z._])"), paste0("\\1pm_", new_pm_names[i], "\\2"), contents)
+  # starting with a space or a straight bracket or an opening parenthesis, ending with nothing or a non-character or a closing parenthesis
+  contents <- gsub(paste0("( |\\[|\\()", new_pm_names[i], "($|[^a-z]|\\))"), paste0("\\1pm_", new_pm_names[i], "\\2"), contents)
+}
+# replace %>% with %pm>%
+contents[which(contents %like% "^\\|\\|") - 1] <- paste0(contents[which(contents %like% "^\\|\\|") - 1], " ||")
+contents[which(contents %like% "^\\|\\|")] <- gsub("^\\|\\|", "", contents[which(contents %like% "^\\|\\|")])
+contents <- gsub("%>%", "%pm>%", contents, fixed = TRUE)
+# fix for new lines, since n() also existed
+contents <- gsub("\\pm_n", "\\n", contents, fixed = TRUE)
+# prefix other functions also with "pm_"
+contents <- gsub("^([a-z_]+)(\\$|)", "pm_\\1\\2", contents)
+# prefix environmental objects and functions
+contents <- gsub("(add_group_columns|add_tally|apply_grouped_function|as_function|as_symbols|build_data_frame|calculate_groups|check_filter|check_if_types|check_name|check_context|collapse_to_sentence|context|deparse_|dotdotdot|drop_dup_list|eval_call|eval_env|eval_expr|eval_select_pos|find_used|flatten|get_group_details|gluestick|group_|groups|groups_set|has_groups|have_name|insert_dot|is.grouped_df|is_df_or_vector|is_empty_list|is_formula|is_named|is_negated_colon|is_nested|is_string|is_wholenumber|join_message|join_worker|names_are_invalid|nth|peek_vars|reconstruct_attrs|replace_na|replace_with|select_|select_context|select_env|select_positions|setup_|split_into_groups|squash|tally|tally_n|validate_case_when_length)", "pm_\\1", contents)
+# now a lot of items are overprefixed
+contents <- gsub("(pm_)+", "pm_", contents)
+contents <- gsub("_pm_", "_", contents)
+contents <- gsub("pm_if (\"grouped_df", "if (\"grouped_df", contents, fixed = TRUE)
+# remove comments and empty lines
+contents <- gsub("#.*", "", contents)
+contents <- contents[trimws(contents) != ""]
+# fix for their relocate()
+contents <- gsub("if (!missing(.before))", "if (!missing(.before) && !is.null(.before))", contents, fixed = TRUE)
+contents <- gsub("if (!missing(.after))", "if (!missing(.after) && !is.null(.after))", contents, fixed = TRUE)
+contents[which(contents %like% "reshape\\($") + 1] <- gsub("data", "as.data.frame(data, stringsAsFactors = FALSE)", contents[which(contents %like% "reshape\\($") + 1])
+contents <- gsub("pm_relocate(.data = long, values_to, .after = -1)", 'pm_relocate(.data = long, "value", .after = -1)', contents, fixed = TRUE)
+
+# who needs US spelling?
+contents <- contents[contents %unlike% "summarize"]
+
+# add intro
+contents <- c(
+  intro,
+  copyright,
+  "",
+  contents
+)
+
+writeLines(contents, "R/aa_helper_pm_functions.R")
+
+# note: pm_left_join() will be overwritten by aaa_helper_functions.R, which contains a faster implementation
+# replace `res <- as.data.frame(res)` with  `res <- as.data.frame(res, stringsAsFactors = FALSE)`