Fix clinical breakpoints

2025-06-07 20:33:59 +02:00 · 2023-04-14 23:14:34 +02:00 · 2023-04-14 23:14:34 +02:00 · ed70f95380
commit ed70f95380
parent 147f9112e9
30 changed files with 1226 additions and 1616 deletions
--- a/DESCRIPTION
+++ b/DESCRIPTION
@ -1,5 +1,5 @@
 Package: AMR
-Version: 2.0.0.9005
+Version: 2.0.0.9006
 Date: 2023-04-14
 Title: Antimicrobial Resistance Data Analysis
 Description: Functions to simplify and standardise antimicrobial resistance (AMR)
--- a/NEWS.md
+++ b/NEWS.md
@ -1,8 +1,8 @@
-# AMR 2.0.0.9005
+# AMR 2.0.0.9006

 ## Changed
 * formatting fix for `sir_interpretation_history()`
-* Fixed some WHONET codes for microorganisms
+* Fixed some WHONET codes for microorganisms and consequently a couple of entries in `clinical_breakpoints`


 # AMR 2.0.0
--- a/R/data.R
+++ b/R/data.R
@ -125,7 +125,7 @@
 #' - 2 entries of *Staphylococcus* (coagulase-negative (CoNS) and coagulase-positive (CoPS))
 #' - 1 entry of *Blastocystis* (*B.  hominis*), although it officially does not exist (Noel *et al.* 2005, PMID 15634993)
 #' - 1 entry of *Moraxella* (*M. catarrhalis*), which was formally named *Branhamella catarrhalis* (Catlin, 1970) though this change was never accepted within the field of clinical microbiology
-#' - 6 other 'undefined' entries (unknown, unknown Gram negatives, unknown Gram positives, unknown yeast, unknown fungus, and unknown anaerobic bacteria)
+#' - 8 other 'undefined' entries (unknown, unknown Gram-negatives, unknown Gram-positives, unknown yeast, unknown fungus, unknown anaerobic bacteria, unknown anaerobic Gram-negatives, and unknown anaerobic Gram-positives)
 #'
 #' The syntax used to transform the original data to a cleansed \R format, can be found here: <https://github.com/msberends/AMR/blob/main/data-raw/reproduction_of_microorganisms.R>.
 #'
--- a/R/sysdata.rda
+++ b/R/sysdata.rda
--- a/data-raw/clin_break.md5
+++ b/data-raw/clin_break.md5
@ -1 +1 @@
-68467f5179638ac5622281df53a5ea75
+e150d98b724ad979e176058c4197c469
--- a/data-raw/clinical_breakpoints.dta
+++ b/data-raw/clinical_breakpoints.dta
--- a/data-raw/clinical_breakpoints.feather
+++ b/data-raw/clinical_breakpoints.feather
--- a/data-raw/clinical_breakpoints.parquet
+++ b/data-raw/clinical_breakpoints.parquet
--- a/data-raw/clinical_breakpoints.rds
+++ b/data-raw/clinical_breakpoints.rds
--- a/data-raw/clinical_breakpoints.sas
+++ b/data-raw/clinical_breakpoints.sas
--- a/data-raw/clinical_breakpoints.sav
+++ b/data-raw/clinical_breakpoints.sav
--- a/data-raw/clinical_breakpoints.txt
+++ b/data-raw/clinical_breakpoints.txt
--- a/data-raw/clinical_breakpoints.xlsx
+++ b/data-raw/clinical_breakpoints.xlsx
--- a/data-raw/microorganisms.dta
+++ b/data-raw/microorganisms.dta
--- a/data-raw/microorganisms.feather
+++ b/data-raw/microorganisms.feather
--- a/data-raw/microorganisms.md5
+++ b/data-raw/microorganisms.md5
@ -1 +1 @@
-cf8b0db59dbfe8b42fbd0a6c51a7e9b0
+3d92820386230a7ac3c9367ce6d96db9
--- a/data-raw/microorganisms.parquet
+++ b/data-raw/microorganisms.parquet
--- a/data-raw/microorganisms.rds
+++ b/data-raw/microorganisms.rds
--- a/data-raw/microorganisms.sas
+++ b/data-raw/microorganisms.sas
--- a/data-raw/microorganisms.sav
+++ b/data-raw/microorganisms.sav
--- a/data-raw/microorganisms.txt
+++ b/data-raw/microorganisms.txt
@ -1,5 +1,7 @@
 "mo"	"fullname"	"status"	"kingdom"	"phylum"	"class"	"order"	"family"	"genus"	"species"	"subspecies"	"rank"	"ref"	"source"	"lpsn"	"lpsn_parent"	"lpsn_renamed_to"	"gbif"	"gbif_parent"	"gbif_renamed_to"	"prevalence"	"snomed"
 "B_ANAER"	"(unknown anaerobic bacteria)"	"accepted"	"Bacteria"	"(unknown phylum)"	"(unknown class)"	"(unknown order)"	"(unknown family)"	"(unknown Gram-negatives)"	"(unknown species)"	"(unknown subspecies)"	"subspecies"		"manually added"							2	""
+"B_ANAER-NEG"	"(unknown anaerobic Gram-negatives)"	"accepted"	"Bacteria"	"(unknown phylum)"	"(unknown class)"	"(unknown order)"	"(unknown family)"	"(unknown genus)"	"(unknown species)"	"(unknown subspecies)"	"subspecies"		"manually added"							2	""
+"B_ANAER-POS"	"(unknown anaerobic Gram-positives)"	"accepted"	"Bacteria"	"(unknown phylum)"	"(unknown class)"	"(unknown order)"	"(unknown family)"	"(unknown genus)"	"(unknown species)"	"(unknown subspecies)"	"subspecies"		"manually added"							2	""
 "F_FUNGUS"	"(unknown fungus)"	"accepted"	"Fungi"	"(unknown phylum)"	"(unknown class)"	"(unknown order)"	"(unknown family)"	"(unknown genus)"	"(unknown species)"	"(unknown subspecies)"	"subspecies"		"manually added"							2	""
 "B_GRAMN"	"(unknown Gram-negatives)"	"accepted"	"Bacteria"	"(unknown phylum)"	"(unknown class)"	"(unknown order)"	"(unknown family)"	"(unknown Gram-negatives)"	"(unknown species)"	"(unknown subspecies)"	"subspecies"		"manually added"							2	""
 "B_GRAMP"	"(unknown Gram-positives)"	"accepted"	"Bacteria"	"(unknown phylum)"	"(unknown class)"	"(unknown order)"	"(unknown family)"	"(unknown Gram-positives)"	"(unknown species)"	"(unknown subspecies)"	"subspecies"		"manually added"							2	""
--- a/data-raw/microorganisms.xlsx
+++ b/data-raw/microorganisms.xlsx
--- a/data-raw/reproduction_of_clinical_breakpoints.R
+++ b/data-raw/reproduction_of_clinical_breakpoints.R
@ -39,7 +39,9 @@ library(AMR)
 # and copy the folder C:\WHONET\Resources to the data-raw/WHONET/ folder
 # (for ASIARS-Net update, also copy C:\WHONET\Codes to the data-raw/WHONET/ folder)

-# Load source data ----
+
+# MICROORGANISMS WHONET CODES ----
+
 whonet_organisms <- read_tsv("data-raw/WHONET/Resources/Organisms.txt", na = c("", "NA", "-"), show_col_types = FALSE) %>%
  # remove old taxonomic names
  filter(TAXONOMIC_STATUS == "C") %>%
@ -55,97 +57,49 @@ whonet_organisms <- read_tsv("data-raw/WHONET/Resources/Organisms.txt", na = c("
    ORGANISM = if_else(ORGANISM_CODE == "fne", "Fusobacterium necrophorum", ORGANISM),
    ORGANISM = if_else(ORGANISM_CODE == "fnu", "Fusobacterium nucleatum", ORGANISM),
    ORGANISM = if_else(ORGANISM_CODE == "sdy", "Streptococcus dysgalactiae", ORGANISM),
-    ORGANISM = if_else(ORGANISM_CODE == "axy", "Achromobacter xylosoxidans", ORGANISM)
+    ORGANISM = if_else(ORGANISM_CODE == "axy", "Achromobacter xylosoxidans", ORGANISM),
+    # and this one was called Issatchenkia orientalis, but it should be:
+    ORGANISM = if_else(ORGANISM_CODE == "ckr", "Candida krusei", ORGANISM)
  )
-whonet_breakpoints <- read_tsv("data-raw/WHONET/Resources/Breakpoints.txt", na = c("", "NA", "-"), show_col_types = FALSE) %>%
-  filter(BREAKPOINT_TYPE == "Human", GUIDELINES %in% c("CLSI", "EUCAST"))
-whonet_antibiotics <- read_tsv("data-raw/WHONET/Resources/Antibiotics.txt", na = c("", "NA", "-"), show_col_types = FALSE) %>%
-  arrange(WHONET_ABX_CODE) %>%
-  distinct(WHONET_ABX_CODE, .keep_all = TRUE)
-
-
-# Transform data ----

+# add some general codes
 whonet_organisms <- whonet_organisms %>%
  bind_rows(data.frame(
    ORGANISM_CODE = c("ebc", "cof"),
    ORGANISM = c("Enterobacterales", "Campylobacter")
  ))

-mo_reset_session()
 whonet_organisms.bak <- whonet_organisms
+# generate the mo codes and add their names
 whonet_organisms <- whonet_organisms.bak %>% 
  mutate(mo = as.mo(gsub("(sero[a-z]*| complex| nontypable| non[-][a-zA-Z]+|var[.]| not .*|sp[.],.*|, .*variant.*|, .*toxin.*|, microaer.*| beta-haem[.])", "", ORGANISM),
                    keep_synonyms = TRUE,
                    language = "en"),
-         mo = as.mo(ifelse(ORGANISM %like% "Anaerobic", "B_ANAER", mo)),
+         mo = case_when(ORGANISM %like% "Anaerobic" & ORGANISM %like% "negative" ~ as.mo("B_ANAER-NEG"),
+                        ORGANISM %like% "Anaerobic" & ORGANISM %like% "positive" ~ as.mo("B_ANAER-POS"),
+                        ORGANISM %like% "Anaerobic" ~ as.mo("B_ANAER"),
+                        TRUE ~ mo),
         mo_name = mo_name(mo,
                           keep_synonyms = TRUE,
                           language = "en"))
-
-# update microorganisms.codes with the latest WHONET codes
+# check if coercion at least resembles the first part (genus)
 new_mo_codes <- whonet_organisms %>% 
  mutate(
    first_part = sapply(ORGANISM, function(x) strsplit(gsub("[^a-zA-Z _-]+", "", x), " ")[[1]][1], USE.NAMES = FALSE),
-    keep = mo_name %like_case% first_part | ORGANISM %like% "Gram " | ORGANISM == "Other")
+    keep = mo_name %like_case% first_part | ORGANISM %like% "Gram " | ORGANISM == "Other" | ORGANISM %like% "anaerobic")
+# update microorganisms.codes with the latest WHONET codes
 microorganisms.codes <- microorganisms.codes %>% 
  # remove all old WHONET codes, whether we (in the end) keep them or not
-  filter(!toupper(code) %in% toupper(new_mo_codes$ORGANISM_CODE)) %>% 
+  filter(!toupper(code) %in% toupper(whonet_organisms$ORGANISM_CODE)) %>% 
+  # and add the new ones
  bind_rows(new_mo_codes %>% 
              filter(keep == TRUE) %>% 
              transmute(code = toupper(ORGANISM_CODE),
                        mo = mo)) %>% 
  arrange(code)
-# save to package
-usethis::use_data(microorganisms.codes, overwrite = TRUE, compress = "xz", version = 2)
-rm(microorganisms.codes)
-devtools::load_all()

-
-breakpoints <- whonet_breakpoints %>%
-  mutate(ORGANISM_CODE = tolower(ORGANISM_CODE)) %>%
-  left_join(whonet_organisms) %>%
-  filter(ORGANISM %unlike% "(^cdc |Gram.*variable|virus)")
-# this ones lack a MO name, they will become "UNKNOWN":
-breakpoints %>%
-  filter(is.na(ORGANISM)) %>%
-  pull(ORGANISM_CODE) %>%
-  unique()
-
-
-# Generate new lookup table for microorganisms ----
-
-new_mo_codes <- breakpoints %>%
-  distinct(ORGANISM_CODE, ORGANISM) %>%
-  mutate(ORGANISM = ORGANISM %>%
-    gsub("Issatchenkia orientalis", "Candida krusei", .) %>%
-    gsub(", nutritionally variant", "", .) %>%
-    gsub(", toxin-.*producing", "", .)) %>%
-  mutate(
-    mo = as.mo(ORGANISM, language = NULL, keep_synonyms = FALSE),
-    mo_name = mo_name(mo, language = NULL)
-  )
-
-
-# Update microorganisms.codes with the latest WHONET codes ----
-
-# these will be changed :
-new_mo_codes %>%
-  mutate(code = toupper(ORGANISM_CODE)) %>%
-  rename(mo_new = mo) %>%
-  left_join(microorganisms.codes %>% rename(mo_old = mo)) %>%
-  filter(mo_old != mo_new)
-
-microorganisms.codes <- microorganisms.codes %>%
-  filter(!code %in% toupper(new_mo_codes$ORGANISM_CODE)) %>%
-  bind_rows(new_mo_codes %>% transmute(code = toupper(ORGANISM_CODE), mo = mo) %>% filter(!is.na(mo))) %>%
-  arrange(code) %>%
-  as_tibble()
-usethis::use_data(microorganisms.codes, overwrite = TRUE, compress = "xz", version = 2)
-rm(microorganisms.codes)
-devtools::load_all()
-
-# update ASIARS-Net?
+# Run this part to update ASIARS-Net:
+# start
 asiarsnet <- read_tsv("data-raw/WHONET/Codes/ASIARS_Net_Organisms_ForwardLookup.txt")
 asiarsnet <- asiarsnet %>%
  mutate(WHONET_Code = toupper(WHONET_Code)) %>%
@ -167,20 +121,59 @@ microorganisms.codes <- microorganisms.codes %>%
  filter(!code %in% c(insert1$code, insert2$code)) %>%
  bind_rows(insert1, insert2) %>%
  arrange(code)
+# end
+
+# save to package
+usethis::use_data(microorganisms.codes, overwrite = TRUE, compress = "xz", version = 2)
+rm(microorganisms.codes)
+devtools::load_all()


-# Create new breakpoint table ----
+# BREAKPOINTS ----

+# now that we have the right MO codes, get the breakpoints and convert them
+whonet_breakpoints <- read_tsv("data-raw/WHONET/Resources/Breakpoints.txt", na = c("", "NA", "-"), show_col_types = FALSE) %>%
+  filter(BREAKPOINT_TYPE == "Human", GUIDELINES %in% c("CLSI", "EUCAST"))
+whonet_antibiotics <- read_tsv("data-raw/WHONET/Resources/Antibiotics.txt", na = c("", "NA", "-"), show_col_types = FALSE) %>%
+  arrange(WHONET_ABX_CODE) %>%
+  distinct(WHONET_ABX_CODE, .keep_all = TRUE)
+
+breakpoints <- whonet_breakpoints %>%
+  mutate(code = toupper(ORGANISM_CODE)) %>%
+  left_join(microorganisms.codes)
+# these ones lack an MO code, so they cannot be used:
+unknown <- breakpoints %>%
+  filter(is.na(mo)) %>%
+  pull(code) %>%
+  unique()
+whonet_organisms %>% 
+  filter(toupper(ORGANISM_CODE) %in% unknown)
+breakpoints <- breakpoints %>% 
+  filter(!is.na(mo))
+
+# and these ones have unknown antibiotics according to WHONET itself:
+breakpoints %>% 
+  filter(!WHONET_ABX_CODE %in% whonet_antibiotics$WHONET_ABX_CODE) %>% 
+  count(YEAR, GUIDELINES, WHONET_ABX_CODE) %>% 
+  arrange(desc(YEAR))
+# we cannot use them
+breakpoints <- breakpoints %>% 
+  filter(WHONET_ABX_CODE %in% whonet_antibiotics$WHONET_ABX_CODE)
+# now check with our own antibiotics
+breakpoints %>% 
+  filter(!toupper(WHONET_ABX_CODE) %in% antibiotics$ab) %>% 
+  pull(WHONET_ABX_CODE) %>% 
+  unique()
+# at the moment these are all old codes that have correct replacements in `antibiotics`, so we can use as.ab()
+  
 breakpoints_new <- breakpoints %>%
-  # only last 10 years
-  filter(YEAR > as.double(format(Sys.Date(), "%Y")) - 10) %>%
-  # "all" and "gen" (general) must become UNKNOWNs:
-  mutate(ORGANISM_CODE = if_else(ORGANISM_CODE %in% c("all", "gen"), "UNKNOWN", ORGANISM_CODE)) %>%
+  # only last available 10 years
+  filter(YEAR > max(YEAR) - 10) %>%
  transmute(
    guideline = paste(GUIDELINES, YEAR),
    method = TEST_METHOD,
-    site = gsub("Urinary tract infection", "UTI", SITE_OF_INFECTION),
-    mo = as.mo(ORGANISM_CODE, keep_synonyms = FALSE),
+    site = gsub(".*(UTI|urinary|urine).*", "UTI", SITE_OF_INFECTION, ignore.case = TRUE),
+    mo,
    rank_index = case_when(
      mo_rank(mo) %like% "(infra|sub)" ~ 1,
      mo_rank(mo) == "species" ~ 2,
@ -194,17 +187,22 @@ breakpoints_new <- breakpoints %>%
    disk_dose = POTENCY,
    breakpoint_S = S,
    breakpoint_R = R,
-    uti = SITE_OF_INFECTION %like% "(UTI|urinary|urine)"
+    uti = ifelse(is.na(site), FALSE, site == "UTI")
  ) %>%
  # Greek symbols and EM dash symbols are not allowed by CRAN, so replace them with ASCII:
  mutate(disk_dose = disk_dose %>%
-    gsub("μ", "u", ., fixed = TRUE) %>%
-    gsub("µ", "u", ., fixed = TRUE) %>% # this is another micro sign, although we cannot see it
+    gsub("μ", "u", ., fixed = TRUE) %>% # this is 'mu', \u03bc
+    gsub("µ", "u", ., fixed = TRUE) %>% # this is 'micro', \u00b5 (yes, they look the same)
    gsub("–", "-", ., fixed = TRUE)) %>%
  arrange(desc(guideline), ab, mo, method) %>%
  filter(!(is.na(breakpoint_S) & is.na(breakpoint_R)) & !is.na(mo) & !is.na(ab)) %>%
  distinct(guideline, ab, mo, method, site, breakpoint_S, .keep_all = TRUE)

+# check the strange duplicates
+breakpoints_new %>% 
+  mutate(id = paste(guideline, ab, mo, method, site)) %>% 
+  filter(id %in% .$id[which(duplicated(id))])
+
 # clean disk zones and MICs
 breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_S"] <- as.double(as.disk(breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_S", drop = TRUE]))
 breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_R"] <- as.double(as.disk(breakpoints_new[which(breakpoints_new$method == "DISK"), "breakpoint_R", drop = TRUE]))
@ -223,7 +221,7 @@ breakpoints_new[which(breakpoints_new$breakpoint_R == 513), "breakpoint_R"] <- m
 breakpoints_new[which(breakpoints_new$breakpoint_R == 1025), "breakpoint_R"] <- m[which(m == 1024) + 1]

 # WHONET adds one log2 level to the R breakpoint for their software, e.g. in AMC in Enterobacterales:
-# EUCAST 2021 guideline: S <= 8 and R > 8
+# EUCAST 2022 guideline: S <= 8 and R > 8
 #           WHONET file: S <= 8 and R >= 16
 breakpoints_new %>% filter(guideline == "EUCAST 2022", ab == "AMC", mo == "B_[ORD]_ENTRBCTR", method == "MIC")
 # this will make an MIC of 12 I, which should be R, so:
--- a/data/clinical_breakpoints.rda
+++ b/data/clinical_breakpoints.rda
--- a/data/microorganisms.codes.rda
+++ b/data/microorganisms.codes.rda
--- a/data/microorganisms.rda
+++ b/data/microorganisms.rda
--- a/man/as.sir.Rd
+++ b/man/as.sir.Rd
@ -163,7 +163,7 @@ After using \code{\link[=as.sir]{as.sir()}}, you can use the \code{\link[=eucast

 \subsection{Machine-Readable Clinical Breakpoints}{

-The repository of this package \href{https://github.com/msberends/AMR/blob/main/data-raw/clinical_breakpoints.txt}{contains a machine-readable version} of all guidelines. This is a CSV file consisting of 18 308 rows and 11 columns. This file is machine-readable, since it contains one row for every unique combination of the test method (MIC or disk diffusion), the antimicrobial drug and the microorganism. \strong{This allows for easy implementation of these rules in laboratory information systems (LIS)}. Note that it only contains interpretation guidelines for humans - interpretation guidelines from CLSI for animals were removed.
+The repository of this package \href{https://github.com/msberends/AMR/blob/main/data-raw/clinical_breakpoints.txt}{contains a machine-readable version} of all guidelines. This is a CSV file consisting of 17 918 rows and 11 columns. This file is machine-readable, since it contains one row for every unique combination of the test method (MIC or disk diffusion), the antimicrobial drug and the microorganism. \strong{This allows for easy implementation of these rules in laboratory information systems (LIS)}. Note that it only contains interpretation guidelines for humans - interpretation guidelines from CLSI for animals were removed.
 }

 \subsection{Other}{
--- a/man/clinical_breakpoints.Rd
+++ b/man/clinical_breakpoints.Rd
@ -5,7 +5,7 @@
 \alias{clinical_breakpoints}
 \title{Data Set with Clinical Breakpoints for SIR Interpretation}
 \format{
-A \link[tibble:tibble]{tibble} with 18 308 observations and 11 variables:
+A \link[tibble:tibble]{tibble} with 17 918 observations and 11 variables:
 \itemize{
 \item \code{guideline}\cr Name of the guideline
 \item \code{method}\cr Either "DISK" or "MIC"
--- a/man/microorganisms.Rd
+++ b/man/microorganisms.Rd
@ -3,9 +3,9 @@
 \docType{data}
 \name{microorganisms}
 \alias{microorganisms}
-\title{Data Set with 52 149 Microorganisms}
+\title{Data Set with 52 151 Microorganisms}
 \format{
-A \link[tibble:tibble]{tibble} with 52 149 observations and 22 variables:
+A \link[tibble:tibble]{tibble} with 52 151 observations and 22 variables:
 \itemize{
 \item \code{mo}\cr ID of microorganism as used by this package
 \item \code{fullname}\cr Full name, like \code{"Escherichia coli"}. For the taxonomic ranks genus, species and subspecies, this is the 'pasted' text of genus, species, and subspecies. For all taxonomic ranks higher than genus, this is the name of the taxon.
@ -66,7 +66,7 @@ For convenience, some entries were added manually:
 \item 2 entries of \emph{Staphylococcus} (coagulase-negative (CoNS) and coagulase-positive (CoPS))
 \item 1 entry of \emph{Blastocystis} (\emph{B.  hominis}), although it officially does not exist (Noel \emph{et al.} 2005, PMID 15634993)
 \item 1 entry of \emph{Moraxella} (\emph{M. catarrhalis}), which was formally named \emph{Branhamella catarrhalis} (Catlin, 1970) though this change was never accepted within the field of clinical microbiology
-\item 6 other 'undefined' entries (unknown, unknown Gram negatives, unknown Gram positives, unknown yeast, unknown fungus, and unknown anaerobic bacteria)
+\item 8 other 'undefined' entries (unknown, unknown Gram-negatives, unknown Gram-positives, unknown yeast, unknown fungus, unknown anaerobic bacteria, unknown anaerobic Gram-negatives, and unknown anaerobic Gram-positives)
 }

 The syntax used to transform the original data to a cleansed \R format, can be found here: \url{https://github.com/msberends/AMR/blob/main/data-raw/reproduction_of_microorganisms.R}.
--- a/man/microorganisms.codes.Rd
+++ b/man/microorganisms.codes.Rd
@ -3,9 +3,9 @@
 \docType{data}
 \name{microorganisms.codes}
 \alias{microorganisms.codes}
-\title{Data Set with 5 751 Common Microorganism Codes}
+\title{Data Set with 5 754 Common Microorganism Codes}
 \format{
-A \link[tibble:tibble]{tibble} with 5 751 observations and 2 variables:
+A \link[tibble:tibble]{tibble} with 5 754 observations and 2 variables:
 \itemize{
 \item \code{code}\cr Commonly used code of a microorganism
 \item \code{mo}\cr ID of the microorganism in the \link{microorganisms} data set