Commit validated bk, 118922 Feb 7

This commit is contained in:
Gerolf de Boer
2023-02-07 16:49:16 +01:00
parent 2271c24b19
commit 32ca21ab60
21 changed files with 1946 additions and 77 deletions

View File

@ -0,0 +1,299 @@
require(tidyverse)
if (!require(lubridate)) {
install.packages('lubridate')
require(lubridate)
}
require(crayon)
require(stringr)
# TODO: check wat ProcessAndMergeDailyFiles ingaat: Monsternummer is.character?
# TODO: min # rows = bijv. 10 i.v.m. privacy
# TODO: 2 versies: MMBI & 'Algemeen'
memSize = function() {
result = tibble(object='', size=0, sizeMB=0)
objs = ls(envir = globalenv())
for (i in 1:length(objs) ) {
objName = objs[i]
sizeBytes = as.numeric(object.size(get(objName)))
result = result %>%
add_row(object = objName, size = sizeBytes, sizeMB = sizeBytes / 1024^2)
}
result = result %>%
arrange(desc(size))
return(list(result, sum(result$sizeMB)))
}
gatherAbIds = function() {
archFolders = list.dirs(file.path(dataFolder, 'archive'), full.names = F)
archFolders = archFolders[str_length(archFolders) == 26]
for (archFolder in archFolders) {
folder = archFolder
abFiles = list.files(file.path(dataFolder, 'archive', folder), pattern = 'AB-*')
for (abFileName in abFiles) {
abFile = read_4D_file(file.path(dataFolder, 'archive', folder, abFileName))
abNamesDict = tibble(name4d = abFile %>% pull(AB) %>% unique())
newNamesDict = abNamesDict %>% filter(!name4d %in% a$name4d, !is.na(name4d))
tNewNames = tibble(name4d = newNamesDict$name4d,
nameAMR = as.ab(newNamesDict$name4d),
nameAMRlong = ab_name(newNamesDict$name4d))
a = a %>% add_row(tNewNames)
}
}
return(a)
}
source('4D_day.R')
useCachedFileInfo = T
archivePath = './data/archive'
folders = list.files(path = archivePath, pattern = 'ImportedFromGlimms-*', full.names = T)
addFileInfo = function(x, filePath, fileName) {
mtime = file.mtime(filePath)
x = x %>%
add_row(filePath = filePath,
fileName = fileName,
fileType = str_sub(fileName, 1, 2),
fileDate = fileName %>% str_sub(4, 9) %>% ymd(),
modified = mtime,
# mdate = as.Date(ymd_hms(modified)),
size = file.size(filePath)) %>%
group_by(fileDate) %>%
mutate(nFilesPerDate = n()) %>%
ungroup() %>%
arrange(desc(fileDate))
return(x)
}
deriveFileOverview = function(tFiles) {
tFiles %>%
group_by(fileDate) %>%
mutate(AB = cur_data() %>%
filter(str_sub(fileName, 1, 2) == 'AB') %>%
nrow(),
MS = cur_data() %>%
filter(str_sub(fileName, 1, 2) == 'MS') %>%
nrow(),
ST = cur_data() %>%
filter(str_sub(fileName, 1, 2) == 'ST') %>%
nrow(),
TM = cur_data() %>%
filter(str_sub(fileName, 1, 2) == 'TM') %>%
nrow(),
TS = cur_data() %>%
filter(str_sub(fileName, 1, 2) == 'TS') %>%
nrow()
) %>%
slice(1) %>%
select(fileDate, AB, MS, ST, TM, TS)
}
getFilesInfo = function() {
tFiles = tibble(filePath = 'deleteme',
fileName = '',
fileType = '',
fileDate = ymd('2022-01-01'),
modified = ymd('2022-01-01'),
size = 0
)
for (folder in folders) {
# if (folder == 'ImportedFromGlimms-2014-09') {
folderPath = folder #file.path(archivePath, folder)
files = list.files(folderPath, pattern = '*')
for (file in files) {
filePath = file.path(folderPath, file)
tFiles = tFiles %>%
addFileInfo(filePath = filePath, fileName = file)
}
}
tFiles = tFiles %>%
mutate(mdate = as.Date(ymd_hms(modified))) %>%
filter(filePath != 'deleteme') %>%
arrange(fileDate, fileName)
tFiles = tFiles %>%
left_join(tFiles %>% count(fileDate, name = 'nFilesPerDate'), by = 'fileDate')
return(tFiles)
}
addFileInfoOfDay = function(x, folders, ymdDate, overwrite = F) {
datePattern = format(ymdDate, format = '%y%m%d')
dirPattern = format(today(), format = '%Y-%m')
folder = folders[which(folders %>% str_detect(dirPattern))]
dayFiles = list.files(path = folder, pattern = datePattern)
for (fileName in dayFiles) {
cat(paste0('Processing ', fileName, '\n'))
x = x %>% addFileInfo(filePath = file.path(folder, fileName),
fileName = fileName)
}
return(x)
}
addFileInfoOfToday = function(tFiles, folders) {
browser()
dirPattern = format(today(), format = '%Y-%m')
folder = folders[which(folders %>% str_detect(dirPattern))]
tFiles = tFiles %>%
addFileInfoOfDay(folderPath = file.path(archivePath, folder),
ymdDate = today(),
overwrite = F)
return(tFiles)
}
plotFileSize = function(tFiles) {
tFiles %>%
filter(fileDate > ymd('2014-10-01'),
fileDate <= ymd('2014-10-31'),
fileType != 'Ee') %>%
mutate(dayOfWeek = wday(fileDate),
dayOfWeekName = weekdays(fileDate),
size = size / (1024)
) %>%
ggplot(aes(x = dayOfWeek, y = size, group = dayOfWeek)) +
geom_boxplot(aes(fill = fileType)) +
scale_y_continuous(limits = c(0, 1250)) +
facet_wrap(facets = c(~fileType)) +
labs(y = 'size [KB]') +
theme_light() +
theme(
plot.title = element_text(face = 'bold')
)
}
if (useCachedFileInfo) {
tFiles = read_rds('./data/tFiles_2022-10-13.rds')
tFileOvw = read_rds('./data/tFileOvw_2022-10-13.rds')
} else {
tFiles = getFilesInfo()
tFileOvw = deriveFileOverview(tFiles)
write_rds(tFiles, paste0('./data/tFiles_', today(), '.rds'))
write_rds(tFileOvw, paste0('./data/tFileOvw_', today(), '.rds'))
}
readDailyFile = function(tFiles, date, prefix) {
result = NULL
fileNames = tFiles %>%
filter(fileDate == date,
fileType == prefix) %>%
pull(filePath)
if (length(fileNames) > 0) {
for (f in 1:length(fileNames)) {
fileName = fileNames[f]
cat(silver(paste0('Reading ', fileName)), '\n')
fileContent = read_4D_file(fileName)
if (f == 1) {
result = fileContent
} else {
result = result %>%
add_row(fileContent)
}
}
}
return(result)
}
readDailyMSFile = function(tFiles, date) {
result = NULL
fileNames = tFiles %>%
filter(fileDate == date,
fileType == 'MS') %>%
pull(filePath)
if (length(fileNames) > 0) {
for (f in 1:length(fileNames)) {
fileName = fileNames[f]
cat(silver(paste0('Reading ', fileName)), '\n')
msFileContent = read_4D_MS_file(fileName)
if (f == 1) {
result = msFileContent
} else {
result = result %>%
add_row(msFileContent)
}
}
}
return(result)
}
readDailyFiles = function(tFiles, date) {
abFile = readDailyFile(tFiles, date, 'AB')
msFile = readDailyMSFile(tFiles, date)
stFile = readDailyFile(tFiles, date, 'ST')
tmFile = readDailyFile(tFiles, date, 'TM')
tsFile = readDailyFile(tFiles, date, 'TS')
return(list(abFile, msFile, stFile, tmFile, tsFile))
}
readArchive = function(tFileInfo, tFileOvw, createDebugList = FALSE) {
tMergeDates = NULL
tsDates = NULL
tMergeDatesDebugList = list()
dates = tFileInfo %>% filter(!is.na(fileDate)) %>% pull(fileDate) %>% unique()
for (d in 1:length(dates)) {
currDate = dates[d]
cat(paste0('\nd = ', d, ', date: ', currDate, '\n'))
print(tFileOvw %>% filter(fileDate == currDate))
dateFiles = readDailyFiles(tFileInfo, currDate)
if (!is.null(dateFiles[[1]])) {
processed = ProcessAndMergeDailyFiles(
dateFiles[[1]] %>% select(-...15),
dateFiles[[2]],
dateFiles[[3]],
dateFiles[[4]],
dateFiles[[5]]
)
if (!is.null(processed$merged)) {
tMergeDate = processed$merged %>% # abFile, stFile and msFile joined
mutate(date = currDate, .before = 1)
if (createDebugList) tMergeDatesDebugList[[d]] = tMergeDate
# separate tables of (ab|st|ms|ts|tm)File of the current day can be found
# in processed$(ab|st|ms|ts|tm)File, for instance:
if (!is.null(processed$tsFile)) {
tsDate = processed$tsFile %>%
mutate(date = currDate, .before = 1)
}
if (is.null(tMergeDates)) {
if (nrow(tMergeDate) > 0) {
tMergeDates = tMergeDate
}
if (exists('tsDate')) {
if (nrow(tsDate) > 0) {
tsDates = tsDate
}
}
} else {
if (nrow(tMergeDates) > 0) {
tMergeDates = tMergeDates %>% add_row(tMergeDate)
}
if (exists('tsDates')) {
if (!is.null(tsDates)) {
if (nrow(tsDates)) {
tsDates = tsDates %>% add_row(tsDate)
}
}
}
}
tMergeDates = tMergeDates %>%
mutate(Monsternummer = str_trim(Monsternummer))
}
}
}
if (createDebugList) {
return(list(tMergeDates, tMergeDatesDebugList))
} else {
return(tMergeDates)
}
}

268
interfaces/4D/src/4D_day.R Normal file
View File

@ -0,0 +1,268 @@
# initially only interfacing 4D data
# TODO loggen zoveel mogelijk over ontvangen data (nieuwe GLIMS data)
# TODO pushen R scripts naar git server waar Gerhard en Kornelis bij kunnen
# require(stringr)
require(tidyverse)
require(AMR)
dataFolder = './data'
get_most_recent_4D_file = function(path, prefix) {
files = list.files(dataFolder, paste0('^', prefix, '*'))
tFiles = tibble(name = files,
mtime = file.mtime(file.path(dataFolder, files))) %>%
arrange(desc(mtime))
tmpFilename = tFiles$name[1]
return(tmpFilename)
}
read_4D_file = function(path) {
tFile = read_tsv(path, col_types = paste0(rep('c', 30), collapse = ''))
# remove comment lines
if (nrow(tFile) > 0) {
tFile = tFile %>%
slice( seq(4, nrow(tFile)) ) %>%
filter(str_sub(ID, 1, 3) != '___')
}
if (colnames(tFile)[1] == 'X1') {
if (str_detect(path, 'AB-')) {
tFile = tibble(ID = '0',
Monsternummer = '',
IsolNr = '',
AB = '',
V = '',
PrefMIC ='',
MIC = '',
D = '',
DIF = '',
PrefEtest = '',
E = '',
ETEST = '',
RIS = '',
RAP = '',
...15 = '')
} else if (str_detect(path, 'MS-')) {
tFile = tibble(UMCG.Monster= '0',
PIN.Lot= '0',
TypeObject= '0',
Lot= '0',Echtgenoot= '0',
AchternNaam= '0',
Geboortedatum= '0',
Geslacht= '0',
OntvOLabDatum= '0',
Tijdstip.materiaal.afgenomen= '0',
PC= '0',
Aanvrager= '0',
ligafdeling= '0',
A= '0',
B= '0',
C= '0',
Materiaal= '0',
Materiaal.variabelen= '0',
X = F)
} else if (str_detect(path, 'ST-')) {
tFile = tibble(ID = '0',
Monsternummer = '',
MMBProc = '',
Isolnr = '',
MO = '',
Waarde = '',
Pos = '',
Rap = '')
} else if (str_detect(path, 'TM-')) {
tFile = tibble(ID = '0',
Monsternummer = '',
Test = '',
Uitslag = '')
} else if (str_detect(path, 'TS-')) {
tFile = tibble(ID = '0',
Monsternummer = '',
Isolnr = '',
Test = '',
Waarde = '',
Rapport = '')
}
}
if ('ID' %in% colnames(tFile)) {
tFile = tFile %>%
filter(ID != '0')
} else if ('UMCG.Monster' %in% colnames(tFile)) {
tFile = tFile %>%
filter(UMCG.Monster != '0')
}
return(tFile)
}
read_4D_TM_file = function(path) {
tmFilename = str_remove(path, paste0(dirname(path), '/'))
tmFileL = readLines(path)
tmFileL = tmFileL[c(5:length(tmFileL))]
for (l in 1:length(tmFileL)) {
if (str_count(tmFileL[l], '\t') == 3) {
tmFileL = paste0(tmFileL, '\t')
}
}
newTmFilename = file.path(dirname(path),
tmFilename %>% str_replace('TM', 'bTM'))
if (file.exists(newTmFilename)) file.remove(newTmFilename)
writeLines(tmFileL, newTmFilename)
tFile = read_tsv(newTmFilename,
col_names = c('ID','Monsternummer','Test','Status.test.tm','Uitslag'),
col_types = 'ccccc')
# remove comment lines
tFile = tFile %>%
slice(c(5:(tFile %>% nrow()))) %>%
mutate(
Monsternummer = as.character(Monsternummer)
)
return(tFile)
}
get_most_recent_4D_MS_file = function(path) {
# get files
msFiles = list.files(path, '^MS*')
# create table with modification datetimes, for selection of the most recent one
tmsFiles = tibble(name = msFiles,
mtime = file.mtime(file.path(path, msFiles))) %>%
arrange(desc(mtime))
# get the name of the most recently modified file
msFilename = tmsFiles$name[1]
return(msFilename)
}
read_4D_MS_file = function(path) {
# print(path)
msFilename = str_remove(path, paste0(dirname(path), '/'))
# read content
msFileL = readLines(path)
# remove lines with repeated '_', useful for the human eye only
msFileL = msFileL[str_sub(msFileL, 1, 5) != '_____']
# replace (real) header
msHeader = readLines(file.path(dataFolder, 'headerMS.txt'))
msFileL[1] = paste0(msHeader, '\t')
# replace tabs within a date time combination with a space in line 2 and further,
# for example: replace '29/09/2021 14:05' by '29/09/2021 14:05'
# ↑ = '\t' ↑ = ' '
for (i in 2:length(msFileL)) {
msFileL[i] = msFileL[i] %>% str_replace_all('(?<=[:digit:]{4})\t(?=[:digit:]{2}:[:digit:]{2})', ' ')
}
# write to new machine readable file
newMsFilename = file.path(dirname(path),
msFilename %>% str_replace('MS', 'bMS'))
if (file.exists(newMsFilename)) file.remove(newMsFilename)
writeLines(msFileL, newMsFilename)
# finally: read the machine readable file
result = read.delim(file = newMsFilename, sep = '\t') %>%
as_tibble() %>%
mutate(
UMCG.Monster = as.character(UMCG.Monster),
PIN.Lot = as.character(PIN.Lot),
Aanvrager = as.character(Aanvrager),
C = as.character(C)
)
# Hurray!
return(result)
}
# abFile = read_4D_file(dataFolder, 'AB')
# msFile = read_4D_MS_file(dataFolder)
# stFile = read_4D_file(dataFolder, 'ST')
# tmFile = read_4D_file(dataFolder, 'TM')
# tsFile = read_4D_file(dataFolder, 'TS')
ProcessAndMergeDailyFiles = function(abFile, msFile, stFile, tmFile, tsFile) {
if (!is.null(abFile))
abFile = abFile %>%
filter(!is.na(RAP)) #%>%
# mutate(
# #Monsternummer = as.character(Monsternummer)
# AB.amr = AMR::ab_property(AB, 'ab') %>% as.character()
# )
if (!is.null(stFile))
stFile = stFile %>%
filter(!is.na(Rap)) %>%
# mutate(Monsternummer = as.character(Monsternummer) %>% str_trim()) %>%
rename(IsolNr = Isolnr)
if (!is.null(msFile))
msFile = msFile %>%
rename(Monsternummer = UMCG.Monster) %>%
# mutate(Monsternummer = as.character(Monsternummer) %>% str_trim()) %>%
select(-X)
# if (!is.null(tmFile))
# tmFile = tmFile %>%
# mutate(Monsternummer = as.character(Monsternummer) %>% str_trim())
if (!is.null(tsFile))
tsFile = tsFile %>%
filter(!is.na(Rapport))
# mutate(Monsternummer = as.character(Monsternummer) %>% str_trim())
if (!is.null(abFile) & !is.null(stFile))
t_ab_st = abFile %>%
full_join(stFile,
by = c('Monsternummer', 'IsolNr'),
suffix = c('.ab', '.st'))
if (exists('t_ab_st')) {
if (!is.null(t_ab_st) & !is.null(msFile)) {
t_ab_st_ms = t_ab_st %>%
full_join(msFile, by = 'Monsternummer') %>%
rename(Waarde.st = Waarde)
return(
list(
merged = t_ab_st_ms,
abFile = abFile,
stFile = stFile,
msFile = msFile,
tsFile = tsFile,
tmFile = tmFile
)
)
# t_ab_st_ms_ts = t_ab_st_ms %>%
# full_join(tsFile, by = c('Monsternummer', 'IsolNr' = 'Isolnr'))
#
# t_ab_st_ms_ts_tm = t_ab_st_ms_ts %>%
# add_column(Uitslag = NA,
# is_sampleResult = F) %>%
# add_row(Monsternummer = tmFile$Monsternummer,
# Test = tmFile$Test,
# Uitslag = tmFile$Uitslag,
# is_sampleResult = T)
#
# t_ab_st_ms_ts_tm = t_ab_st_ms_ts_tm %>%
# rename(Waarde.ts = Waarde) %>%
# filter(!Test %in% c('no_filter_yet'))
#
# # if ('X' %in% colnames(t_ab_st_ms_ts_tm)) {
# # t_ab_st_ms_ts_tm = t_ab_st_ms_ts_tm %>%
# # select(-'X')
# # print('Dit statement is voor debugging')
# # }
}
}
return(
list(
merged = NULL,
abFile = abFile,
stFile = stFile,
msFile = msFile,
tsFile = tsFile,
tmFile = tmFile
)
)
}

View File

@ -0,0 +1,26 @@
require(tidyverse)
require(readr)
v171 = read_rds('v171.rds')
v180 = read_rds('v180.rds')
new_v180 = v180 %>% anti_join(v171 %>% left_join(v180))
source('interfaces.R')
d = readGlimsArchive(NULL, NULL, T, NULL, NULL)
d %>% filter(str_starts(mo, 'B_ES')) %>% count(mo, CXM)
# deze aantallen zie ik niet terug in wat de applicatie presenteert:
# Ceftazidim: R (705), SI (2705), totaal dus 3410
# misschien dat data_select() voor andere aantallen zorgt?
data_select = d %>%
# filter(specialty_shiny %in% input$specialtyInput & department %in% input$departmentInput) %>%
filter_first_isolate(col_patient_id = "patientid", episode_days = 30) %>%
mutate(mo = as.mo(mo, Becker = TRUE))
data_select %>% filter(str_starts(mo, 'B_ES')) %>% count(mo, CXM)
# Ja dus!

View File

@ -0,0 +1,37 @@
require(tidyverse)
inspectRead = 10
dataPath = 'data'
abNames = read_rds('data/abNames.rds')
filename = list.files(dataPath, paste0('read_', inspectRead, '_*'))
d = read_rds(file.path(dataPath, filename)) #%>%
# left_join(abNames, by = c('ab.amr' = 'nameAMR'))
# list a sampleid that occurs more often but not too often
examples = d %>% count(sampleid) %>% arrange(desc(n)) %>% filter(n < 50)
view_d = d %>%
select(sampleid, date, IsolNr, MIC, mo, RIS, ab.amr, RAP) %>% #, name4d, nameAMRlong) %>%
mutate(asmo = suppressWarnings(as.mo(mo))) %>%
arrange(sampleid, IsolNr) #%>%
# filter(sampleid == examples$sampleid[i])
has_ecoli = view_d %>% filter(str_starts(asmo, 'B_ES'))
examples_having_ecoli = has_ecoli %>%
filter(sampleid %in% examples$sampleid) %>%
pull(sampleid) %>%
unique()
e = 2
view(view_d %>%
filter(sampleid == examples_having_ecoli[e]) %>%
mutate(IsolNr = as.numeric(IsolNr)) %>%
arrange(IsolNr),# %>%
#distinct(sampleid, date, IsolNr, mo, ab.amr, .keep_all = T),
title = 'example')
if (F) {
view(view_d)
view_d %>% filter(!is.na(mo)) %>% pull(mo) %>% unique()
}