radar2/stitchArchive.R

144 lines
5.7 KiB
R
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

require(tidyverse)
source('globalData.R')
reqColumns = abAbbrevs
patchFirstColumns = c('sampleid', 'date', 'is_clinical', 'is_icu', 'is_outward',
'IsolNr', 'Materiaal', 'specimen_type', 'specimen_group',
'patientid', 'birth_date', 'department', 'type_dept',
'specialism', 'age', 'gender', 'MIC', 'RAP', 'mo')
readPatch = function(filename, reqColumns) {
patch = read_rds(filename)
for (c in 1:length(reqColumns)) {
if (!reqColumns[c] %in% colnames(patch)) {
patch = patch %>%
add_column(!!reqColumns[c] := as.rsi(NA)) %>%
select(order(colnames(.))) %>%
relocate(all_of(patchFirstColumns), .before = 1)
}
}
return(patch)
}
readAllPatches = function() {
tPatches = tibble(name = list.files(path = 'data', pattern = 'bk_.*\\.rds'))
tPatches = tPatches %>%
separate(name, into = c('prefix', 'from', 'tail'), sep = '_') %>%
separate(tail, into = c('to', 'ext'), sep = '\\.') %>%
filter(!is.na(to), !str_detect(to, '[:alpha:]')) %>%
mutate(name = paste0(prefix, '_', from, '_', to, '.', ext)) %>%
arrange(from, to)
return(tPatches)
}
stitchPatches = function(tPatches, dataPath = 'data', reqColumns) {
for (i in 1:nrow(tPatches)) {
# patch = read_rds(file.path(dataPath, tPatches[i,]$name))
# for (c in 1:length(reqColumns)) {
# if (!reqColumns[c] %in% colnames(patch)) { patch = patch %>% add_column(!!reqColumns[c] := is.rsi(NA)) }
# }
cat(paste0(' patch ', i, ': ', tPatches$from[i], ' ', tPatches$to[i], '\n'))
patch = readPatch(filename = file.path(dataPath, tPatches$name[i]),
reqColumns = reqColumns)
# print(colnames(patch))
if (i == 1) {
tStitched = patch
} else {
tStitched = tStitched %>% add_row(patch)
}
}
tStitched = tStitched %>% arrange(date, sampleid)
return(tStitched)
}
appendDailyData = function(fnTotal, fnTotalSofar, fnDay, reqColumns) {
assert_that(file.exists(fnTotalSofar))#,
# msg = paste0("File ", fnTotalSofar, " doesn't exist"))
assert_that(file.exists(fnDay))
tTotalSofar = readPatch(fnTotalSofar, reqColumns = reqColumns)
tDay = readPatch(fnDay, reqColumns = reqColumns)
tTotal = tTotalSofar %>% add_row(tDay)
write_rds(x = tTotal, file = fnTotal, compress = 'gz')
return(tTotal)
}
manualStitch_2022_09_30 = function() {
a =
mutate(RAP = as.character('#')) %>% # correct for accidental inclusion in as.rsi conversion
select(order(colnames(.))) %>%
relocate(c(sampleid, date, is_clinical, is_icu, is_outward, IsolNr,
Materiaal, specimen_type, specimen_group, uuid, patientid,
old_patientid, birth_date, old_birth_date, department,
type_dept, specialism, age, gender, MIC, RAP, mo), .before = 1)
# a = a %>% mutate(across(AMC:VOR, as.rsi)) was already done before
b = readPatch('bk_2022-06-17_2022-09-29.rds', reqColumns) %>%
select(order(colnames(.))) %>%
relocate(c(sampleid, date, is_clinical, is_icu, is_outward, IsolNr,
Materiaal, specimen_type, specimen_group, patientid,
birth_date, department,
type_dept, specialism, age, gender, MIC, RAP, mo), .before = 1)
b = b %>% mutate(across(AMC:VOR, as.rsi))
tTotal = a %>%
add_row(b) %>%
mutate(MIC = as.mic(MIC),
mo = as.mo(mo))
# Warning: Problem with `mutate()` column `mo`.
# `mo = as.mo(mo)`.
#
# in `as.mo()`: nine unique values (covering 0.0%) could not be coerced and were considered 'unknown': "}hemstre", "}Primair",
# "acgsch", "anrhyd", "bclmeg", "BK_Isolaat_AEF", "corjimi", "strsalg" and "vvStrpne".
# Use `mo_failures()` to review them. Edit the `allow_uncertain` argument if needed (see ?as.mo).
# You can also use your own reference data with set_mo_source() or directly, e.g.:
# as.mo("mycode", reference_df = data.frame(own = "mycode", mo = "B_ESCHR_COLI"))
# mo_name("mycode", reference_df = data.frame(own = "mycode", mo = "B_ESCHR_COLI"))
# Warning message:
# Problem with `mutate()` column `MIC`.
# `MIC = as.mic(MIC)`.
# in `as.mic()`: 3 results truncated (0%) that were invalid MICs: "?", "13" and "17090355431"
write_rds(x = tTotal, file = 'data/total_2022-09-29.rds', compress = 'gz')
}
lastLargePatchesStitching = function() manualStitch_2022_09_30()
# > mo_uncertainties()
# Matching scores (in blue) are based on pathogenicity in humans and the resemblance between the input and the full taxonomic name.
# See `?mo_matching_score`.
#
# "brelut" -> Brevundimonas lutea (B_BRVND_LUTE, 0.316)
# Also matched: Brevibacterium luteolum (0.304)
#
# "vvEncspp" -> Brevibacterium ravenspurgense (B_BCTRM_RVNS, 0.284)
# read 1
if (F) {
tPatches = readAllPatches()
tStitched = stitchPatches(tPatches)
tStitched %>% write_rds(file = 'data/bk_archive_2014-08_2022-06_v2022-09-15.rds',
compress = 'gz')
x = read_rds('data/bk_archive_2014-08_2022-06_v2022-09-15.rds')
x = x %>% relocate(all_of(cns), .after = 'date') %>%
relocate(specimen_group, .after = specimen_type) %>%
relocate(MIC, .after = mo)
y = x %>% mutate(across(AMB:DAP, as.rsi))
z = y %>% mutate(mo = as.mo(mo))
mo_failures = c('}AeNeSt', '}hemstre', '}nietfe', '}Primair', 'acgsan', 'acgsch', 'bclmeg', 'bctfae', 'BK_Isolaat_AEF', 'BK_Isolaat_ANF', 'BK_Isolaat_PEF', 'chcgle', 'corjimi', 'enbbug', 'encfcmva', 'Meng', 'psspsy', 'staaurar', 'stainp', 'strsalg', 'surwad', 'vvAeNeSt', 'vvAePoSt', 'vvMRSA')
write_rds(z, 'data/bk_archive_2014-08_2022-06_v2022-09-15_3.rds',
compress = 'gz')
write_rds(z %>% filter(date >= '2021-07-01'), 'data/bk_archive_2021-07_2022-06_v2022-09-23_3.rds',
compress = 'gz')
}