AMR/data-raw/reproduction_of_antivirals.R

# ==================================================================== #
# TITLE                                                                #
# AMR: An R Package for Working with Antimicrobial Resistance Data     #
#                                                                      #
# SOURCE                                                               #
# https://github.com/msberends/AMR                                     #
#                                                                      #
# CITE AS                                                              #
# Berends MS, Luz CF, Friedrich AW, Sinha BNM, Albers CJ, Glasner C    #
# (2022). AMR: An R Package for Working with Antimicrobial Resistance  #
# Data. Journal of Statistical Software, 104(3), 1-31.                 #
# doi:10.18637/jss.v104.i03                                            #
#                                                                      #
# Developed at the University of Groningen and the University Medical  #
# Center Groningen in The Netherlands, in collaboration with many      #
# colleagues from around the world, see our website.                   #
#                                                                      #
# This R package is free software; you can freely use and distribute   #
# it for both personal and commercial purposes under the terms of the  #
# GNU General Public License version 2.0 (GNU GPL-2), as published by  #
# the Free Software Foundation.                                        #
# We created this package for both routine data analysis and academic  #
# research and it was publicly released in the hope that it will be    #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY.              #
#                                                                      #
# Visit our website for the full manual and a complete tutorial about  #
# how to conduct AMR data analysis: https://msberends.github.io/AMR/   #
# ==================================================================== #

library(dplyr)
library(tidyr)
library(rvest)

# get all data from the WHOCC website
get_atc_table <- function(atc_group) {
  # give as input J0XXX, like atc_group = "J05AB"
  downloaded <- read_html(paste0("https://www.whocc.no/atc_ddd_index/?code=", atc_group, "&showdescription=no"))
  table_title <- downloaded %>%
    html_nodes(paste0('a[href^="./?code=', atc_group, '&"]')) %>%
    html_text()
  table_title <- table_title[tolower(table_title) != "show text from guidelines"][1]
  table_content <- downloaded %>%
    html_nodes("table") %>%
    html_table(header = TRUE) %>%
    # returns list, so make data.frame out of it
    as.data.frame(stringsAsFactors = FALSE) %>%
    # select right columns
    select(atc = ATC.code, name = Name, ddd = DDD, unit = U, ddd_type = Adm.R) %>%
    # fill empty rows
    mutate(atc = ifelse(atc == "", lag(atc), atc), name = ifelse(name == "", lag(name), name)) %>%
    pivot_wider(names_from = ddd_type, values_from = c(ddd, unit)) %>%
    mutate(atc_group = table_title)
  if (!"ddd_O" %in% colnames(table_content)) {
    table_content <- table_content %>% mutate(ddd_O = NA_real_, unit_O = NA_character_)
  }
  if (!"ddd_P" %in% colnames(table_content)) {
    table_content <- table_content %>% mutate(ddd_P = NA_real_, unit_P = NA_character_)
  }
  table_content %>% select(atc, name, atc_group,
    oral_ddd = ddd_O, oral_units = unit_O,
    iv_ddd = ddd_P, iv_units = unit_P
  )
}

# these are the relevant groups for input: https://www.whocc.no/atc_ddd_index/?code=J05A (J05 only contains J05A)
atc_groups <- c("J05AA", "J05AB", "J05AC", "J05AD", "J05AE", "J05AF", "J05AG", "J05AH", "J05AJ", "J05AP", "J05AR", "J05AX")

# get the first
antivirals <- get_atc_table(atc_groups[1])
# bind all others to it
for (i in 2:length(atc_groups)) {
  message(atc_groups[i], "...")
  antivirals <- rbind(antivirals, get_atc_table(atc_groups[i]))
}

# arrange on name, untibble it
antivirals <- antivirals %>%
  arrange(name) %>%
  as.data.frame(stringsAsFactors = FALSE)

# add PubChem Compound ID (cid) and their trade names
# see `data-raw/reproduction_of_antibiotics` for get_CID() and get_synonyms()
CIDs <- get_CID(antivirals$name)
# these could not be found:
antivirals[is.na(CIDs), ] %>% View()
# get brand names from PubChem
synonyms <- get_synonyms(CIDs)
synonyms <- lapply(
  synonyms,
  function(x) {
    if (length(x) == 0 | all(is.na(x))) {
      ""
    } else {
      x
    }
  }
)

antivirals <- antivirals %>%
  transmute(atc,
    cid = as.double(CIDs),
    name,
    atc_group,
    synonyms = unname(synonyms),
    oral_ddd,
    oral_units,
    iv_ddd,
    iv_units
  ) %>%
  AMR:::dataset_UTF8_to_ASCII()

av_codes <- tibble(name = antivirals$name %>%
  strsplit("(, | and )") %>%
  unlist() %>%
  unique() %>%
  sort()) %>%
  mutate(av_1st = toupper(abbreviate(name, minlength = 3, use.classes = FALSE))) %>%
  filter(!name %in% c("acid", "dipivoxil", "disoproxil", "marboxil", "alafenamide"))

replace_with_av_code <- function(name) {
  unname(av_codes$av_1st[match(name, av_codes$name)])
}

names_codes <- antivirals %>%
  separate(name,
    into = paste0("name", c(1:7)),
    sep = "(, | and )",
    remove = FALSE,
    fill = "right"
  ) %>%
  # remove empty columns
  select(!where(function(x) all(is.na(x)))) %>%
  mutate_at(vars(matches("name[1-9]")), replace_with_av_code) %>%
  unite(av, matches("name[1-9]"), sep = "+", na.rm = TRUE) %>%
  mutate(name = gsub("(, | and )", "/", name))
substr(names_codes$name, 1, 1) <- toupper(substr(names_codes$name, 1, 1))

antivirals <- bind_cols(
  names_codes %>% select(av, name),
  antivirals %>% select(-name)
)
class(antivirals$av) <- c("av", "character")
antivirals <- antivirals %>% AMR:::dataset_UTF8_to_ASCII()

# add loinc, see 'data-raw/loinc.R'
loinc_df <- read.csv("data-raw/Loinc.csv",
  row.names = NULL,
  stringsAsFactors = FALSE
)

loinc_df <- loinc_df %>% filter(CLASS == "DRUG/TOX")
av_names <- antivirals %>%
  pull(name) %>%
  paste0(collapse = "|") %>%
  paste0("(", ., ")")

antivirals$loinc <- as.list(rep(NA_character_, nrow(antivirals)))
for (i in seq_len(nrow(antivirals))) {
  message(i)
  loinc_ab <- loinc_df %>%
    filter(COMPONENT %like% paste0("^", antivirals$name[i])) %>%
    pull(LOINC_NUM)
  if (length(loinc_ab) > 0) {
    antivirals$loinc[i] <- list(loinc_ab)
  }
}
# sort and fix for empty values
for (i in 1:nrow(antivirals)) {
  loinc <- as.character(sort(unique(tolower(antivirals[i, "loinc", drop = TRUE][[1]]))))
  antivirals[i, "loinc"][[1]] <- ifelse(length(loinc[!loinc == ""]) == 0, list(""), list(loinc))
}

# de-duplicate synonyms
for (i in 1:nrow(antivirals)) {
  syn <- as.character(sort(unique(tolower(antivirals[i, "synonyms", drop = TRUE][[1]]))))
  syn <- syn[!syn %in% tolower(antivirals[i, "name", drop = TRUE])]
  antivirals[i, "synonyms"][[1]] <- ifelse(length(syn[!syn == ""]) == 0, list(""), list(syn))
}

antivirals <- antivirals %>% AMR:::dataset_UTF8_to_ASCII()

# check it
antivirals

# save it
usethis::use_data(antivirals, overwrite = TRUE, internal = FALSE, compress = "xz", version = 2)