radar2/stitchArchive.R

require(tidyverse)

source('globalData.R')
reqColumns = abAbbrevs

patchFirstColumns = c('sampleid', 'date', 'is_clinical', 'is_icu', 'is_outward', 
                      'IsolNr', 'Materiaal', 'specimen_type', 'specimen_group', 
                      'patientid', 'birth_date', 'department', 'type_dept', 
                      'specialism', 'age', 'gender', 'MIC', 'RAP', 'mo')

readPatch = function(filename, reqColumns) {
  patch = read_rds(filename)
  for (c in 1:length(reqColumns)) {
    if (!reqColumns[c] %in% colnames(patch)) { 
      patch = patch %>% 
        add_column(!!reqColumns[c] := as.rsi(NA)) %>%
        select(order(colnames(.))) %>%
        relocate(all_of(patchFirstColumns), .before = 1)
    }
  }
  return(patch)
}

readAllPatches = function() {
  tPatches = tibble(name = list.files(path = 'data', pattern = 'bk_.*\\.rds'))
  
  tPatches = tPatches %>% 
    separate(name, into = c('prefix', 'from', 'tail'), sep = '_') %>%
    separate(tail, into = c('to', 'ext'), sep = '\\.') %>%
    filter(!is.na(to), !str_detect(to, '[:alpha:]')) %>%
    mutate(name = paste0(prefix, '_', from, '_', to, '.', ext)) %>%
    arrange(from, to)
  
  return(tPatches)
}

stitchPatches = function(tPatches, dataPath = 'data', reqColumns) {
  for (i in 1:nrow(tPatches)) {
    # patch = read_rds(file.path(dataPath, tPatches[i,]$name))
    # for (c in 1:length(reqColumns)) {
    #   if (!reqColumns[c] %in% colnames(patch)) { patch = patch %>% add_column(!!reqColumns[c] := is.rsi(NA)) }
    # }
    cat(paste0('  patch ', i, ': ', tPatches$from[i], ' ', tPatches$to[i], '\n'))
    patch = readPatch(filename = file.path(dataPath, tPatches$name[i]), 
                      reqColumns = reqColumns)
    # print(colnames(patch))
    if (i == 1) {
      tStitched = patch
    } else {
      tStitched = tStitched %>% add_row(patch)
    }
  }
  tStitched = tStitched %>% arrange(date, sampleid)
  
  return(tStitched)
}

appendDailyData = function(fnTotal, fnTotalSofar, fnDay, reqColumns) {
  assert_that(file.exists(fnTotalSofar))#, 
              # msg = paste0("File ", fnTotalSofar, " doesn't exist"))
  assert_that(file.exists(fnDay))
  
  tTotalSofar = readPatch(fnTotalSofar, reqColumns = reqColumns)
  tDay        = readPatch(fnDay, reqColumns = reqColumns)
  
  tTotal = tTotalSofar %>% add_row(tDay)
  
  write_rds(x = tTotal, file = fnTotal, compress = 'gz')
  
  return(tTotal)
}

manualStitch_2022_09_30 = function() {
  a = 
    mutate(RAP = as.character('#')) %>% # correct for accidental inclusion in as.rsi conversion
    select(order(colnames(.))) %>%
    relocate(c(sampleid, date, is_clinical, is_icu, is_outward, IsolNr,
               Materiaal, specimen_type, specimen_group, uuid, patientid,
               old_patientid, birth_date, old_birth_date, department,
               type_dept, specialism, age, gender, MIC, RAP, mo), .before = 1)
  # a = a %>% mutate(across(AMC:VOR, as.rsi)) was already done before
  
  b = readPatch('bk_2022-06-17_2022-09-29.rds', reqColumns) %>%
    select(order(colnames(.))) %>%
    relocate(c(sampleid, date, is_clinical, is_icu, is_outward, IsolNr,
               Materiaal, specimen_type, specimen_group, patientid,
               birth_date, department,
               type_dept, specialism, age, gender, MIC, RAP, mo), .before = 1)
  b = b %>% mutate(across(AMC:VOR, as.rsi))
  
  tTotal = a %>% 
    add_row(b) %>%
    mutate(MIC = as.mic(MIC),
           mo = as.mo(mo))
  # Warning: Problem with `mutate()` column `mo`.
  # ℹ `mo = as.mo(mo)`.
  # ℹ 
  # in `as.mo()`: nine unique values (covering 0.0%) could not be coerced and were considered 'unknown': "}hemstre", "}Primair",
  # "acgsch", "anrhyd", "bclmeg", "BK_Isolaat_AEF", "corjimi", "strsalg" and "vvStrpne".
  # Use `mo_failures()` to review them. Edit the `allow_uncertain` argument if needed (see ?as.mo).
  # You can also use your own reference data with set_mo_source() or directly, e.g.:
  #   as.mo("mycode", reference_df = data.frame(own = "mycode", mo = "B_ESCHR_COLI"))
  # mo_name("mycode", reference_df = data.frame(own = "mycode", mo = "B_ESCHR_COLI"))
  # Warning message:
  #   Problem with `mutate()` column `MIC`.
  # ℹ `MIC = as.mic(MIC)`.
  # ℹ in `as.mic()`: 3 results truncated (0%) that were invalid MICs: "?", "13" and "17090355431" 
  
  write_rds(x = tTotal, file = 'data/total_2022-09-29.rds', compress = 'gz')
}

lastLargePatchesStitching = function() manualStitch_2022_09_30()

# > mo_uncertainties()
# Matching scores (in blue) are based on pathogenicity in humans and the resemblance between the input and the full taxonomic name.
# See `?mo_matching_score`.
# 
# "brelut" -> Brevundimonas lutea (B_BRVND_LUTE, 0.316)
# Also matched: Brevibacterium luteolum (0.304)
# 
# "vvEncspp" -> Brevibacterium ravenspurgense (B_BCTRM_RVNS, 0.284)

# read 1
if (F) {
  tPatches = readAllPatches()
  tStitched = stitchPatches(tPatches)
  tStitched %>% write_rds(file = 'data/bk_archive_2014-08_2022-06_v2022-09-15.rds',
                          compress = 'gz')

    x = read_rds('data/bk_archive_2014-08_2022-06_v2022-09-15.rds')
  x = x %>% relocate(all_of(cns), .after = 'date') %>% 
    relocate(specimen_group, .after = specimen_type) %>%
    relocate(MIC, .after = mo)
  y = x %>% mutate(across(AMB:DAP, as.rsi))
  z = y %>% mutate(mo = as.mo(mo))
  mo_failures = c('}AeNeSt', '}hemstre', '}nietfe', '}Primair', 'acgsan', 'acgsch', 'bclmeg', 'bctfae', 'BK_Isolaat_AEF', 'BK_Isolaat_ANF', 'BK_Isolaat_PEF', 'chcgle', 'corjimi', 'enbbug', 'encfcmva', 'Meng', 'psspsy', 'staaurar', 'stainp', 'strsalg', 'surwad', 'vvAeNeSt', 'vvAePoSt', 'vvMRSA')
  
  write_rds(z, 'data/bk_archive_2014-08_2022-06_v2022-09-15_3.rds', 
            compress = 'gz')
  write_rds(z %>% filter(date >= '2021-07-01'), 'data/bk_archive_2021-07_2022-06_v2022-09-23_3.rds', 
            compress = 'gz')
}