From cead31bed03c290f6d0fd5bbd4a0ec20c8e09ab5 Mon Sep 17 00:00:00 2001 From: Matthijs Berends Date: Mon, 4 May 2026 22:57:42 +0200 Subject: [PATCH] prepare for morphology --- R/data.R | 1 + R/mo_property.R | 19 ++- .../reproduction_of_microorganisms.R | 131 +++++++++++++++++- 3 files changed, 148 insertions(+), 3 deletions(-) diff --git a/R/data.R b/R/data.R index 6f4fc92e6..15120706f 100755 --- a/R/data.R +++ b/R/data.R @@ -111,6 +111,7 @@ #' - `rank`\cr Text of the taxonomic rank of the microorganism, such as `"species"` or `"genus"` #' - `ref`\cr Abbreviated authority citation for the nomenclatural act that established the current name combination, following ICNP conventions. For species described in their current genus (*sp. nov.*), this is the original description author(s) and year. For species transferred to a different genus (*comb. nov.*), this is the reclassification author(s) and year. Emendations are excluded. For synonyms, this is the authority under which the synonym was originally published. This field is directly retrieved from the source specified in the column `source`. Diacritics were removed to comply with CRAN, that only allows ASCII characters. #' - `oxygen_tolerance` \cr Oxygen tolerance, either `r vector_or(microorganisms$oxygen_tolerance, documentation = TRUE)`. These data were retrieved from BacDive (see *Source*). Items that contain "likely" are missing from BacDive and were extrapolated from other species within the same genus to guess the oxygen tolerance. Currently `r round(length(microorganisms$oxygen_tolerance[which(!is.na(microorganisms$oxygen_tolerance))]) / nrow(microorganisms[which(microorganisms$kingdom == "Bacteria"), ]) * 100, 1)`% of all `r format_included_data_number(nrow(microorganisms[which(microorganisms$kingdom == "Bacteria"), ]))` bacteria in the data set contain an oxygen tolerance. +#' - `morphology` \cr Morphology (cell shape), either `r vector_or(microorganisms$morphology, documentation = TRUE)`. These data were retrieved from BacDive (see *Source*). Genera that are clinically established as coccobacilli (the HACEK group and beyond, such as *Haemophilus* and *Acinetobacter*) are classified as such regardless of BacDive majority vote. Items that contain "likely" are missing from BacDive and were extrapolated from other species within the same genus. Currently `r round(length(microorganisms$morphology[which(!is.na(microorganisms$morphology))]) / nrow(microorganisms[which(microorganisms$kingdom == "Bacteria"), ]) * 100, 1)`% of all `r format_included_data_number(nrow(microorganisms[which(microorganisms$kingdom == "Bacteria"), ]))` bacteria in the data set contain a morphology. #' - `source`\cr Either `r vector_or(microorganisms$source, documentation = TRUE)` (see *Source*) #' - `lpsn`\cr Identifier ('Record number') of `r TAXONOMY_VERSION$LPSN$name`. This will be the first/highest LPSN identifier to keep one identifier per row. For example, *Acetobacter ascendens* has LPSN Record number 7864 and 11011. Only the first is available in the `microorganisms` data set. ***This is a unique identifier***, though available for only `r format_included_data_number(sum(!is.na(microorganisms$lpsn)))` records. #' - `lpsn_parent`\cr LPSN identifier of the parent taxon diff --git a/R/mo_property.R b/R/mo_property.R index 1a6dddaef..0bde26c90 100755 --- a/R/mo_property.R +++ b/R/mo_property.R @@ -56,7 +56,7 @@ #' #' Determination of intrinsic resistance ([mo_is_intrinsic_resistant()]) is based on the [intrinsic_resistant] data set, which is based on `r format_eucast_version_nr(names(EUCAST_VERSION_EXPECTED_PHENOTYPES[1]))`. The [mo_is_intrinsic_resistant()] function can be vectorised over both argument `x` (input for microorganisms) and `ab` (input for antimicrobials). #' -#' Determination of bacterial oxygen tolerance ([mo_oxygen_tolerance()]) is based on BacDive, see *Source*. The function [mo_is_anaerobic()] only returns `TRUE` if the oxygen tolerance is `"anaerobe"`, indicting an obligate anaerobic species or genus. It always returns `FALSE` for species outside the taxonomic kingdom of Bacteria. +#' Determination of both bacterial oxygen tolerance ([mo_oxygen_tolerance()]) and morphology ([mo_morphology()]) are based on BacDive, see *Source*. The function [mo_is_anaerobic()] only returns `TRUE` if the oxygen tolerance is `"anaerobe"`, indicating an obligate anaerobic species or genus. It always returns `FALSE` for species outside the taxonomic kingdom of Bacteria. #' #' The function [mo_url()] will return the direct URL to the online database entry, which also shows the scientific reference of the concerned species. [This MycoBank URL](`r TAXONOMY_VERSION$MycoBank$url`) is used for fungi wherever available , [this LPSN URL](`r TAXONOMY_VERSION$MycoBank$url`) for bacteria wherever available, and [this GBIF link](`r TAXONOMY_VERSION$GBIF$url`) otherwise. #' @@ -462,8 +462,9 @@ mo_pathogenicity <- function(x, language = get_AMR_locale(), keep_synonyms = get } #' @rdname mo_property +#' @param add_morphology a [logical] to indicate whether the morphology (from [mo_morphology()]) should be added to the Gram stain result, e.g. `"Gram-negative rods"` instead of `"Gram-negative"`. The default is `FALSE`. #' @export -mo_gramstain <- function(x, language = get_AMR_locale(), keep_synonyms = getOption("AMR_keep_synonyms", FALSE), ...) { +mo_gramstain <- function(x, language = get_AMR_locale(), keep_synonyms = getOption("AMR_keep_synonyms", FALSE), add_morphology = FALSE, ...) { if (missing(x)) { # this tries to find the data and an 'mo' column x <- find_mo_col(fn = "mo_gramstain") @@ -636,6 +637,20 @@ mo_is_anaerobic <- function(x, language = get_AMR_locale(), keep_synonyms = getO out } +#' @rdname mo_property +#' @export +mo_morphology <- function(x, language = get_AMR_locale(), keep_synonyms = getOption("AMR_keep_synonyms", FALSE), ...) { + if (missing(x)) { + # this tries to find the data and an 'mo' column + x <- find_mo_col(fn = "mo_morphology") + } + meet_criteria(x, allow_NA = TRUE) + language <- validate_language(language) + meet_criteria(keep_synonyms, allow_class = "logical", has_length = 1) + + mo_validate(x = x, property = "morphology", language = language, keep_synonyms = keep_synonyms, ...) +} + #' @rdname mo_property #' @export mo_snomed <- function(x, language = get_AMR_locale(), keep_synonyms = getOption("AMR_keep_synonyms", FALSE), ...) { diff --git a/data-raw/_reproduction_scripts/reproduction_of_microorganisms.R b/data-raw/_reproduction_scripts/reproduction_of_microorganisms.R index d7a862747..dcc029bd6 100644 --- a/data-raw/_reproduction_scripts/reproduction_of_microorganisms.R +++ b/data-raw/_reproduction_scripts/reproduction_of_microorganisms.R @@ -746,7 +746,7 @@ taxonomy_mycobank <- taxonomy_mycobank %>% tax_h, tax_i %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ tax_i, - tax_k %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ + tax_j %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ tax_j, tax_k %in% taxonomy_mycobank$fullname[taxonomy_mycobank$rank == "genus"] ~ tax_k, @@ -2858,6 +2858,135 @@ taxonomy <- taxonomy %>% relocate(oxygen_tolerance, .after = ref) +# Add morphology --------------------------------------------------------------------- + +# We will use the BacDive data base for this: +# - go to https://bacdive.dsmz.de/advsearch +# - filter 'Cell shape' on "*" and click Submit +# - click on the 'Download table as CSV' button +bacdive_shape <- vroom::vroom("data-raw/bacdive_shape.csv", skip = 2) %>% + select(species, shape = `Cell shape`) +bacdive_shape <- bacdive_shape %>% + # fill in missing species from previous rows + mutate(fullname = if_else(is.na(species), lag(species), species)) %>% + filter( + !is.na(species), + !is.na(shape), + species %unlike% "unclassified" + ) %>% + select(-species) +bacdive_shape <- bacdive_shape %>% + # map raw BacDive values to a controlled vocabulary + mutate( + shape = case_when( + shape %in% c("coccus-shaped", "sphere-shaped", "diplococcus-shaped") ~ "cocci", + shape %in% c("oval-shaped", "ovoid-shaped") ~ "coccobacilli", + shape %in% c("rod-shaped", "curved-shaped", "vibrio-shaped", "flask-shaped") ~ "rods", + shape %in% c("spiral-shaped", "helical-shaped") ~ "spirilla", + shape == "filament-shaped" ~ "filamentous", + TRUE ~ NA_character_ + ) + ) %>% + filter(!is.na(shape)) %>% + # now determine shape per species by majority vote + group_by(fullname) %>% + summarise( + morphology = names(sort(table(shape), decreasing = TRUE))[1] + ) +# now find all synonyms and copy them from their current taxonomic names +synonyms_shape <- taxonomy %>% + filter(status == "synonym") %>% + transmute( + mo, + fullname_old = fullname, + current = synonym_mo_to_accepted_mo( + mo, + fill_in_accepted = FALSE, + dataset = taxonomy + ) + ) %>% + filter(!is.na(current)) %>% + mutate(fullname = taxonomy$fullname[match(current, taxonomy$mo)]) %>% + left_join(bacdive_shape, by = "fullname") %>% + filter(!is.na(morphology)) %>% + select(fullname, morphology) + +bacdive_shape <- bacdive_shape %>% + bind_rows(synonyms_shape) %>% + distinct() + +bacdive_shape_genus <- bacdive_shape %>% + mutate( + shape_raw = morphology, + genus = taxonomy$genus[match(fullname, taxonomy$fullname)] + ) %>% + group_by(fullname = genus) %>% + summarise( + morphology = names(sort(table(shape_raw), decreasing = TRUE))[1] + ) +bacdive_shape <- bacdive_shape %>% + bind_rows(bacdive_shape_genus) %>% + arrange(fullname) + +bacdive_shape_other <- taxonomy %>% + filter( + kingdom == "Bacteria", + rank == "species", + !fullname %in% bacdive_shape$fullname, + genus %in% bacdive_shape$fullname + ) %>% + select(fullname, genus) %>% + left_join(bacdive_shape, by = c("genus" = "fullname")) %>% + mutate( + morphology = paste("likely", morphology) + ) %>% + select(fullname, morphology) %>% + distinct(fullname, .keep_all = TRUE) + +bacdive_shape <- bacdive_shape %>% + bind_rows(bacdive_shape_other) %>% + arrange(fullname) %>% + distinct(fullname, .keep_all = TRUE) + +taxonomy <- taxonomy %>% + left_join(bacdive_shape, by = "fullname") %>% + relocate(morphology, .after = oxygen_tolerance) + +# Override: genera that are clinically established coccobacilli but where BacDive +# majority vote yields "rods" due to observer disagreement on the rod/oval boundary. +# These genera are universally reported as coccobacilli on Gram stain in clinical +# microbiology practice. +coccobacilli_genera <- c( + "Acinetobacter", "Aggregatibacter", "Brucella", + "Gardnerella", "Haemophilus", "Kingella", + "Moraxella", "Pasteurella" +) +taxonomy <- taxonomy %>% + mutate( + morphology = case_when( + genus %in% coccobacilli_genera & is.na(morphology) ~ "likely coccobacilli", + genus %in% coccobacilli_genera & + morphology %in% c("rods", "cocci") ~ "coccobacilli", + genus %in% coccobacilli_genera & + morphology %in% c("likely rods", "likely cocci") ~ "likely coccobacilli", + TRUE ~ morphology + ) + ) + +# Spirochaetes: the entire phylum is spirochaete by definition, fill in where missing +taxonomy <- taxonomy %>% + mutate( + morphology = case_when( + phylum %in% c("Spirochaetota", "Spirochaetes") & is.na(morphology) ~ "likely spirilla", + phylum %in% c("Spirochaetota", "Spirochaetes") & + morphology %in% c("rods", "likely rods") ~ "spirilla", + TRUE ~ morphology + ) + ) + + + + # Restore 'synonym' microorganisms to 'accepted' -------------------------------------------------- # If there are some synonyms that need to be corrected to 'accepted', you can do that here.