(v0.7.1.9024) eucast_rules() fix, new MOs

2025-11-25 06:11:06 +01:00 · 2019-08-06 14:39:22 +02:00
parent 85b62aaf8f
commit 3a1f960f89
23 changed files with 252 additions and 411 deletions
--- a/data-raw/eucast_rules.tsv
+++ b/data-raw/eucast_rules.tsv
@@ -1,4 +1,14 @@
-if_mo_property	like_is_one_of	this_value	and_these_antibiotics	have_these_values	then_change_these_antibiotics	to_value	reference.rule	reference.rule_group
+# ---------------------------------------------------------------------------------------------------
+# For editing this EUCAST reference file, these values can all be used for target antibiotics:
+# all_betalactams, aminoglycosides, carbapenems, cephalosporins, cephalosporins_without_CAZ, fluoroquinolones, 
+# glycopeptides, macrolides, minopenicillins, polymyxins, streptogramins, tetracyclines, ureidopenicillins
+# and all separate EARS-Net letter codes like AMC. They can be separated by comma: 'AMC, fluoroquinolones'.
+# The if_mo_property column can be any column name from the AMR::microorganisms data set, or "genus_species" or "gramstain".
+# The like.is.one_of column must contain one of: like, is, one_of ('like' will read the first column as a regular expression)
+# The EUCAST guideline contains references to the 'Burkholderia cepacia complex'. All species in this group can be found in: LiPuma J, Curr Opin Pulm Med. 2005 Nov;11(6):528-33. (PMID 16217180).
+# >>>>> IF YOU WANT TO IMPORT THIS FILE INTO YOUR OWN SOFTWARE, HAVE THE FIRST 10 LINES SKIPPED <<<<<
+# ---------------------------------------------------------------------------------------------------
+if_mo_property	like.is.one_of	this_value	and_these_antibiotics	have_these_values	then_change_these_antibiotics	to_value	reference.rule	reference.rule_group
 order	is	Enterobacteriales	AMP	S	AMX	S	Enterobacteriales (Order)	Breakpoints
 order	is	Enterobacteriales	AMP	I	AMX	I	Enterobacteriales (Order)	Breakpoints
 order	is	Enterobacteriales	AMP	R	AMX	R	Enterobacteriales (Order)	Breakpoints
--- a/data-raw/internals.R
+++ b/data-raw/internals.R
@@ -1,14 +1,16 @@
-# EUCAST rules ----
-# For editing the reference file, these values can all be used for target antibiotics:
-# "aminoglycosides", "tetracyclines", "polymyxins", "macrolides", "glycopeptides",
-# "streptogramins", "cephalosporins", "cephalosporins_without_CAZ", "carbapenems",
-# "minopenicillins", "ureidopenicillins", "fluoroquinolones", "all_betalactams",
-# and all separate EARS-Net letter codes like "AMC". They can be separated by comma: "AMC, fluoroquinolones".
-# The mo_property can be any column name from the AMR::microorganisms data set, or "genus_species" or "gramstain".
-# The EUCAST guideline contains references to the 'Burkholderia cepacia complex'. The species in this group can be found in:
-# LiPuma JJ, Curr Opin Pulm Med. 2005 Nov;11(6):528-33. (PMID 16217180).
+# ---------------------------------------------------------------------------------------------------
+# For editing this EUCAST reference file, these values can all be used for target antibiotics:
+# all_betalactams, aminoglycosides, carbapenems, cephalosporins, cephalosporins_without_CAZ, fluoroquinolones, 
+# glycopeptides, macrolides, minopenicillins, polymyxins, streptogramins, tetracyclines, ureidopenicillins
+# and all separate EARS-Net letter codes like AMC. They can be separated by comma: 'AMC, fluoroquinolones'.
+# The if_mo_property column can be any column name from the AMR::microorganisms data set, or "genus_species" or "gramstain".
+# The EUCAST guideline contains references to the 'Burkholderia cepacia complex'. All species in this group can be found in:
+# LiPuma J, Curr Opin Pulm Med. 2005 Nov;11(6):528-33. (PMID 16217180).
+# >>>>> IF YOU WANT TO IMPORT THIS FILE INTO YOUR OWN SOFTWARE, HAVE THE FIRST 10 LINES SKIPPED <<<<<
+# ---------------------------------------------------------------------------------------------------
 eucast_rules_file <- dplyr::arrange(
  .data = utils::read.delim(file = "data-raw/eucast_rules.tsv",
+                            skip = 10,
                            sep = "\t",
                            stringsAsFactors = FALSE,
                            header = TRUE,
--- a/data-raw/reproduction_of_microorganisms.R
+++ b/data-raw/reproduction_of_microorganisms.R
@@ -99,7 +99,7 @@ MOs <- data_total %>%
      # and not all fungi: Aspergillus, Candida, Trichophyton and Pneumocystis are the most important,
      # so only keep these orders from the fungi:
      & !(kingdom == "Fungi"
-          & !order %in% c("Eurotiales", "Saccharomycetales", "Schizosaccharomycetales", "Tremellales", "Onygenales", "Pneumocystales"))
+          & !order %in% c("Eurotiales", "Mucorales", "Saccharomycetales", "Schizosaccharomycetales", "Tremellales", "Onygenales", "Pneumocystales"))
    )
    # or the genus has to be one of the genera we found in our hospitals last decades (Northern Netherlands, 2002-2018)
    | genus %in% c("Absidia", "Acremonium", "Actinotignum", "Alternaria", "Anaerosalibacter", "Ancylostoma", "Anisakis", "Apophysomyces",
@@ -123,7 +123,7 @@ MOs <- MOs %>%

 MOs <- MOs %>%
  # remove text if it contains 'Not assigned' like phylum in viruses
-  mutate_all(~gsub("Not assigned", "", .))
+  mutate_all(~gsub("(Not assigned|\\[homonym\\]|\\[mistake\\])", "", ., ignore.case = TRUE))

 MOs <- MOs %>%
  # Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011':
@@ -166,8 +166,10 @@ MOs <- MOs %>%

 # Remove non-ASCII characters (these are not allowed by CRAN)
 MOs <- MOs %>%
-  lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>%
-  as_tibble(stringsAsFactors = FALSE)
+  lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>% 
+  as_tibble(stringsAsFactors = FALSE) %>% 
+  # remove invalid characters
+  mutate_all(~gsub("[\"'`]+", "", .))

 # Split old taxonomic names - they refer in the original data to a new `taxonID` with `acceptedNameUsageID`
 MOs.old <- MOs %>%
@@ -219,6 +221,9 @@ MOs <- MOs %>%
         !(source == "DSMZ" & fullname %in% (MOs %>% filter(source == "CoL") %>% pull(fullname)))) %>%
  distinct(fullname, .keep_all = TRUE)

+# what characters are in the fullnames?
+paste(unique(sort(unlist(strsplit(x = paste(MOs$fullname, collapse = ""), split = "")))), collapse = "")
+
 # Add abbreviations so we can easily know which ones are which ones.
 # These will become valid and unique microbial IDs for the AMR package.
 MOs <- MOs %>%
@@ -295,7 +300,6 @@ MOs <- MOs %>%
  # put `mo` in front, followed by the rest
  select(mo, everything(), -abbr_other, -abbr_genus, -abbr_species, -abbr_subspecies)

-
 # add non-taxonomic entries
 MOs <- MOs %>%
  bind_rows(
@@ -348,6 +352,38 @@ MOs <- MOs %>%
               species_id = "",
               source = "manually added",
               stringsAsFactors = FALSE),
+    data.frame(mo = "F_YEAST",
+               col_id = NA_integer_,
+               fullname = "(unknown yeast)",
+               kingdom = "Fungi",
+               phylum = "(unknown phylum)",
+               class = "(unknown class)",
+               order = "(unknown order)",
+               family = "(unknown family)",
+               genus = "(unknown genus)",
+               species = "(unknown species)",
+               subspecies = "(unknown subspecies)",
+               rank = "species",
+               ref = NA_character_,
+               species_id = "",
+               source = "manually added",
+               stringsAsFactors = FALSE),
+    data.frame(mo = "F_FUNGUS",
+               col_id = NA_integer_,
+               fullname = "(unknown fungus)",
+               kingdom = "Fungi",
+               phylum = "(unknown phylum)",
+               class = "(unknown class)",
+               order = "(unknown order)",
+               family = "(unknown family)",
+               genus = "(unknown genus)",
+               species = "(unknown species)",
+               subspecies = "(unknown subspecies)",
+               rank = "species",
+               ref = NA_character_,
+               species_id = "",
+               source = "manually added",
+               stringsAsFactors = FALSE),
    # CoNS
    MOs %>%
      filter(genus == "Staphylococcus", species == "epidermidis") %>% .[1,] %>%
@@ -488,6 +524,11 @@ MOs <- MOs %>%
 sum(duplicated(MOs$mo))
 colnames(MOs)

+# here we welcome the new ones:
+MOs %>% filter(!fullname %in% AMR::microorganisms$fullname) %>% View()
+# and the ones we lost:
+AMR::microorganisms %>% filter(!fullname %in% MOs$fullname) %>% View()
+
 # set prevalence per species
 MOs <- MOs %>%
  mutate(prevalence = case_when(
@@ -534,12 +575,16 @@ MOs.old$col_id <- as.integer(MOs.old$col_id)
 MOs.old$col_id_new <- as.integer(MOs.old$col_id_new)

 # save
+### for other server
 saveRDS(MOs, "microorganisms.rds")
 saveRDS(MOs.old, "microorganisms.old.rds")
+### for same server
+microorganisms <- MOs
+microorganisms.old <- MOs.old

 # on the server, do:
 usethis::use_data(microorganisms, overwrite = TRUE, version = 2)
 usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2)
 rm(microorganisms)
 rm(microorganisms.old)
-# and update the year in R/data.R
+# and update the year and dimensions in R/data.R