radar2/stitchArchive.R

144 lines
5.7 KiB
R
Raw Normal View History

2023-02-07 16:49:16 +01:00
require(tidyverse)
source('globalData.R')
reqColumns = abAbbrevs
patchFirstColumns = c('sampleid', 'date', 'is_clinical', 'is_icu', 'is_outward',
'IsolNr', 'Materiaal', 'specimen_type', 'specimen_group',
'patientid', 'birth_date', 'department', 'type_dept',
'specialism', 'age', 'gender', 'MIC', 'RAP', 'mo')
readPatch = function(filename, reqColumns) {
patch = read_rds(filename)
for (c in 1:length(reqColumns)) {
if (!reqColumns[c] %in% colnames(patch)) {
patch = patch %>%
add_column(!!reqColumns[c] := as.rsi(NA)) %>%
select(order(colnames(.))) %>%
relocate(all_of(patchFirstColumns), .before = 1)
}
}
return(patch)
}
readAllPatches = function() {
tPatches = tibble(name = list.files(path = 'data', pattern = 'bk_.*\\.rds'))
tPatches = tPatches %>%
separate(name, into = c('prefix', 'from', 'tail'), sep = '_') %>%
separate(tail, into = c('to', 'ext'), sep = '\\.') %>%
filter(!is.na(to), !str_detect(to, '[:alpha:]')) %>%
mutate(name = paste0(prefix, '_', from, '_', to, '.', ext)) %>%
arrange(from, to)
return(tPatches)
}
stitchPatches = function(tPatches, dataPath = 'data', reqColumns) {
for (i in 1:nrow(tPatches)) {
# patch = read_rds(file.path(dataPath, tPatches[i,]$name))
# for (c in 1:length(reqColumns)) {
# if (!reqColumns[c] %in% colnames(patch)) { patch = patch %>% add_column(!!reqColumns[c] := is.rsi(NA)) }
# }
cat(paste0(' patch ', i, ': ', tPatches$from[i], ' ', tPatches$to[i], '\n'))
patch = readPatch(filename = file.path(dataPath, tPatches$name[i]),
reqColumns = reqColumns)
# print(colnames(patch))
if (i == 1) {
tStitched = patch
} else {
tStitched = tStitched %>% add_row(patch)
}
}
tStitched = tStitched %>% arrange(date, sampleid)
return(tStitched)
}
appendDailyData = function(fnTotal, fnTotalSofar, fnDay, reqColumns) {
assert_that(file.exists(fnTotalSofar))#,
# msg = paste0("File ", fnTotalSofar, " doesn't exist"))
assert_that(file.exists(fnDay))
tTotalSofar = readPatch(fnTotalSofar, reqColumns = reqColumns)
tDay = readPatch(fnDay, reqColumns = reqColumns)
tTotal = tTotalSofar %>% add_row(tDay)
write_rds(x = tTotal, file = fnTotal, compress = 'gz')
return(tTotal)
}
manualStitch_2022_09_30 = function() {
a =
mutate(RAP = as.character('#')) %>% # correct for accidental inclusion in as.rsi conversion
select(order(colnames(.))) %>%
relocate(c(sampleid, date, is_clinical, is_icu, is_outward, IsolNr,
Materiaal, specimen_type, specimen_group, uuid, patientid,
old_patientid, birth_date, old_birth_date, department,
type_dept, specialism, age, gender, MIC, RAP, mo), .before = 1)
# a = a %>% mutate(across(AMC:VOR, as.rsi)) was already done before
b = readPatch('bk_2022-06-17_2022-09-29.rds', reqColumns) %>%
select(order(colnames(.))) %>%
relocate(c(sampleid, date, is_clinical, is_icu, is_outward, IsolNr,
Materiaal, specimen_type, specimen_group, patientid,
birth_date, department,
type_dept, specialism, age, gender, MIC, RAP, mo), .before = 1)
b = b %>% mutate(across(AMC:VOR, as.rsi))
tTotal = a %>%
add_row(b) %>%
mutate(MIC = as.mic(MIC),
mo = as.mo(mo))
# Warning: Problem with `mutate()` column `mo`.
# `mo = as.mo(mo)`.
#
# in `as.mo()`: nine unique values (covering 0.0%) could not be coerced and were considered 'unknown': "}hemstre", "}Primair",
# "acgsch", "anrhyd", "bclmeg", "BK_Isolaat_AEF", "corjimi", "strsalg" and "vvStrpne".
# Use `mo_failures()` to review them. Edit the `allow_uncertain` argument if needed (see ?as.mo).
# You can also use your own reference data with set_mo_source() or directly, e.g.:
# as.mo("mycode", reference_df = data.frame(own = "mycode", mo = "B_ESCHR_COLI"))
# mo_name("mycode", reference_df = data.frame(own = "mycode", mo = "B_ESCHR_COLI"))
# Warning message:
# Problem with `mutate()` column `MIC`.
# `MIC = as.mic(MIC)`.
# in `as.mic()`: 3 results truncated (0%) that were invalid MICs: "?", "13" and "17090355431"
write_rds(x = tTotal, file = 'data/total_2022-09-29.rds', compress = 'gz')
}
lastLargePatchesStitching = function() manualStitch_2022_09_30()
# > mo_uncertainties()
# Matching scores (in blue) are based on pathogenicity in humans and the resemblance between the input and the full taxonomic name.
# See `?mo_matching_score`.
#
# "brelut" -> Brevundimonas lutea (B_BRVND_LUTE, 0.316)
# Also matched: Brevibacterium luteolum (0.304)
#
# "vvEncspp" -> Brevibacterium ravenspurgense (B_BCTRM_RVNS, 0.284)
# read 1
if (F) {
tPatches = readAllPatches()
tStitched = stitchPatches(tPatches)
tStitched %>% write_rds(file = 'data/bk_archive_2014-08_2022-06_v2022-09-15.rds',
compress = 'gz')
x = read_rds('data/bk_archive_2014-08_2022-06_v2022-09-15.rds')
x = x %>% relocate(all_of(cns), .after = 'date') %>%
relocate(specimen_group, .after = specimen_type) %>%
relocate(MIC, .after = mo)
y = x %>% mutate(across(AMB:DAP, as.rsi))
z = y %>% mutate(mo = as.mo(mo))
mo_failures = c('}AeNeSt', '}hemstre', '}nietfe', '}Primair', 'acgsan', 'acgsch', 'bclmeg', 'bctfae', 'BK_Isolaat_AEF', 'BK_Isolaat_ANF', 'BK_Isolaat_PEF', 'chcgle', 'corjimi', 'enbbug', 'encfcmva', 'Meng', 'psspsy', 'staaurar', 'stainp', 'strsalg', 'surwad', 'vvAeNeSt', 'vvAePoSt', 'vvMRSA')
write_rds(z, 'data/bk_archive_2014-08_2022-06_v2022-09-15_3.rds',
compress = 'gz')
write_rds(z %>% filter(date >= '2021-07-01'), 'data/bk_archive_2021-07_2022-06_v2022-09-23_3.rds',
compress = 'gz')
}