AMR/data-raw/snomed.R

75 lines
3.7 KiB
R
Raw Normal View History

2020-01-26 20:20:00 +01:00
# ==================================================================== #
# TITLE #
# Antimicrobial Resistance (AMR) Data Analysis for R #
2020-01-26 20:20:00 +01:00
# #
# SOURCE #
# https://github.com/msberends/AMR #
2020-01-26 20:20:00 +01:00
# #
# LICENCE #
2021-12-23 18:56:28 +01:00
# (c) 2018-2022 Berends MS, Luz CF et al. #
2020-10-08 11:16:03 +02:00
# Developed at the University of Groningen, the Netherlands, in #
# collaboration with non-profit organisations Certe Medical #
# Diagnostics & Advice, and University Medical Center Groningen. #
2020-01-26 20:20:00 +01:00
# #
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
2020-01-26 20:20:00 +01:00
# ==================================================================== #
library(AMR)
2020-01-27 19:14:23 +01:00
library(tidyverse)
2021-03-11 21:42:30 +01:00
# we will use Public Health Information Network Vocabulary Access and Distribution System (PHIN VADS)
# as a source, which copies directly from the latest US SNOMED CT version
# - go to https://phinvads.cdc.gov/vads/ViewValueSet.action?oid=2.16.840.1.114222.4.11.1009
# - check that current online version is higher than SNOMED_VERSION$current_version
# - if so, click on 'Download Value Set', choose 'TXT'
# Import the downloaded PHIN VADS value set. The first three lines of the file
# are skipped; only the first two columns are kept, named `snomed` and `mo`.
# NOTE(review): column types are guessed by read_tsv() -- confirm that long
# SNOMED codes are not parsed as doubles with loss of precision.
snomed <- read_tsv("data-raw/SNOMED_PHVS_Microorganism_CDC_V12.txt", skip = 3) %>%
  select(snomed = 1, mo = 2)
2020-01-27 19:14:23 +01:00
2021-03-11 21:42:30 +01:00
# save all valid genera, species and subspecies
# Build the vocabulary of name parts: split every current and old full name on
# spaces, de-duplicate, keep the parts that match "^[a-z]+$" according to
# %like%, and lower-case what is left.
vctr <- local({
  name_parts <- unique(unlist(strsplit(
    c(microorganisms$fullname, microorganisms.old$fullname),
    " "
  )))
  tolower(name_parts[name_parts %like% "^[a-z]+$"])
})
2020-01-27 19:14:23 +01:00
2021-03-11 21:42:30 +01:00
# remove all parts of the name that are not valid genus, species or subspecies names
# this takes ~20 seconds
2021-03-11 21:42:30 +01:00
# Reduce every entry of `mo` to the words that occur in `vctr`: lower-case it,
# split it on spaces and commas, drop all other words and glue the remainder
# together with single spaces. Any " group" left in the result is removed
# afterwards.
snomed <- snomed %>%
  mutate(
    fullname = gsub(
      " group", "",
      vapply(
        strsplit(tolower(mo), "[ ,]"),
        function(words) trimws(paste(words[words %in% vctr], collapse = " ")),
        FUN.VALUE = character(1)
      ),
      fixed = TRUE
    )
  )
2020-01-27 19:14:23 +01:00
2021-03-11 21:42:30 +01:00
# Keep only the rows whose cleaned name equals a (lower-cased) current or old
# full name, then collect all SNOMED codes per name into a list column. The
# grouping column is called `fullname_lower` so it can serve as a join key.
snomed_keep <- local({
  known_names <- tolower(c(microorganisms$fullname, microorganisms.old$fullname))
  matched <- filter(snomed, fullname %in% known_names)
  per_name <- group_by(matched, fullname_lower = fullname)
  summarise(per_name, snomed = list(snomed))
})
2020-01-27 19:14:23 +01:00
# Report how many rows of `microorganisms` will actually receive a SNOMED code.
# `snomed_keep` can also contain names that only occur in `microorganisms.old`;
# those never match in the join on the lower-cased `fullname`, so counting the
# rows of `snomed_keep` could overstate the number.
message(sum(tolower(microorganisms$fullname) %in% snomed_keep$fullname_lower),
        " MO's will get a SNOMED code.")
2021-03-11 21:42:30 +01:00
# save to microorganisms data set
# Replace the `snomed` column of the microorganisms data set with the newly
# collected codes from `snomed_keep`.
microorganisms <- microorganisms %>%
  # drop the previous snomed column
  select(-snomed) %>%
  # temporary lower-case key: snomed_keep is keyed on lower-case names
  mutate(fullname_lower = tolower(fullname)) %>%
  # attach the new codes; the key is spelled out so the join cannot silently
  # pick up other shared columns and does not print a "Joining, by" message
  left_join(snomed_keep, by = "fullname_lower") %>%
  # drop the temporary key again
  select(-fullname_lower) %>%
  # internal AMR helper; presumably converts the data set to ASCII -- see its
  # definition in the package
  AMR:::dataset_UTF8_to_ASCII()
2020-01-26 20:20:00 +01:00
2021-03-11 21:42:30 +01:00
# don't forget to update the version number in SNOMED_VERSION in ./R/globals.R!
# usethis::use_data(microorganisms, overwrite = TRUE, version = 2, compress = "xz")