# ==================================================================== #
# TITLE                                                                #
# Antimicrobial Resistance (AMR) Data Analysis for R                   #
#                                                                      #
# SOURCE                                                               #
# https://github.com/msberends/AMR                                     #
#                                                                      #
# LICENCE                                                              #
# (c) 2018-2022 Berends MS, Luz CF et al.                              #
# Developed at the University of Groningen, the Netherlands, in        #
# collaboration with non-profit organisations Certe Medical            #
# Diagnostics & Advice, and University Medical Center Groningen.       # 
#                                                                      #
# This R package is free software; you can freely use and distribute   #
# it for both personal and commercial purposes under the terms of the  #
# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
# the Free Software Foundation.                                        #
# We created this package for both routine data analysis and academic  #
# research and it was publicly released in the hope that it will be    #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
#                                                                      #
# Visit our website for the full manual and a complete tutorial about  #
# how to conduct AMR data analysis: https://msberends.github.io/AMR/   #
# ==================================================================== #

library(AMR)
library(tidyverse)

# we will use Public Health Information Network Vocabulary Access and Distribution System (PHIN VADS)
# as a source, which copies directly from the latest US SNOMED CT version
# - go to https://phinvads.cdc.gov/vads/ViewValueSet.action?oid=2.16.840.1.114222.4.11.1009
# - check that current online version is higher than SNOMED_VERSION$current_version
# - if so, click on 'Download Value Set', choose 'TXT'
snomed <- read_tsv("data-raw/SNOMED_PHVS_Microorganism_CDC_V12.txt", skip = 3) %>% 
  select(1:2) %>% 
  set_names(c("snomed", "mo"))

# save all valid genera, species and subspecies
vctr <- unique(unlist(strsplit(c(microorganisms$fullname, microorganisms.old$fullname), " ")))
vctr <- tolower(vctr[vctr %like% "^[a-z]+$"])

# remove all parts of the name that are no valid values in genera, species or subspecies
# this takes ~20 seconds
snomed <- snomed %>% 
  mutate(fullname = vapply(FUN.VALUE = character(1),
                           # split on space and/or comma
                           strsplit(tolower(mo), "[ ,]"),
                           function(x) trimws(paste0(x[x %in% vctr], collapse = " "))),
         # remove " group"
         fullname = gsub(" group", "", fullname, fixed = TRUE))

snomed_keep <- snomed %>% 
  filter(fullname %in% tolower(c(microorganisms$fullname, microorganisms.old$fullname))) %>% 
  group_by(fullname_lower = fullname) %>% 
  summarise(snomed = list(snomed))

message(nrow(snomed_keep), " MO's will get a SNOMED code.")

# save to microorganisms data set
microorganisms <- microorganisms %>%
  # remove old snomed
  select(-snomed) %>%
  # create dummy var for joining
  mutate(fullname_lower = tolower(fullname)) %>%
  # join new snomed
  left_join(snomed_keep) %>%
  # remove dummy var
  select(-fullname_lower) %>% 
  AMR:::dataset_UTF8_to_ASCII()

# don't forget to update the version number in SNOMED_VERSION in ./R/globals.R!

# usethis::use_data(microorganisms, overwrite = TRUE, version = 2, compress = "xz")