# radar2/interfaces/4D/src/4D_archive.R
# (300 lines, 8.8 KiB, R)

require(tidyverse)
if (!require(lubridate)) {
install.packages('lubridate')
require(lubridate)
}
require(crayon)
require(stringr)
# TODO: check what goes into ProcessAndMergeDailyFiles: is Monsternummer a character column?
# TODO: enforce a minimum number of rows (e.g. 10) for privacy reasons
# TODO: 2 versions: MMBI & 'Algemeen' (general)
# Report the memory footprint of every object in the global environment.
#
# Returns an unnamed list of two elements:
#   [[1]] a tibble with columns `object` (name), `size` (bytes) and `sizeMB`,
#         sorted from largest to smallest object;
#   [[2]] the summed size of all listed objects in MB.
# An empty global environment yields a zero-row tibble and a total of 0.
memSize = function() {
  objNames = ls(envir = globalenv())
  # Look every name up in the global environment explicitly: a bare get()
  # starts in this function's own frame, so a global that happens to share
  # a name with one of the locals here would be measured incorrectly.
  sizeBytes = vapply(
    objNames,
    function(objName) as.numeric(object.size(get(objName, envir = globalenv()))),
    numeric(1),
    USE.NAMES = FALSE
  )
  # Building the table from whole vectors needs no placeholder row and
  # copes with zero objects (1:length(objs) would iterate over c(1, 0)).
  result = tibble(object = objNames,
                  size = sizeBytes,
                  sizeMB = sizeBytes / 1024^2) %>%
    arrange(desc(size))
  return(list(result, sum(result$sizeMB)))
}
# Extend the antibiotic-name dictionary `a` with names found in archived AB files.
#
# Scans every sub-folder of <dataFolder>/archive whose relative name is exactly
# 26 characters long, reads each file whose name starts with 'AB', and appends
# the not-yet-known, non-NA values of its `AB` column to the dictionary together
# with their as.ab() / ab_name() translations.
#
# NOTE(review): `a` and `dataFolder` are not defined in this function; they are
# read from the enclosing (global) environment. `a` must already be a tibble
# with the columns name4d, nameAMR and nameAMRlong. The first assignment below
# creates a local copy, so the caller's `a` is not modified; use the return value.
# NOTE(review): as.ab() and ab_name() are presumably provided by the AMR
# package, which is not loaded in this file -- confirm it is attached elsewhere.
#
# Returns the extended dictionary tibble.
gatherAbIds = function() {
  archiveRoot = file.path(dataFolder, 'archive')
  archFolders = list.dirs(archiveRoot, full.names = FALSE)
  archFolders = archFolders[str_length(archFolders) == 26]
  for (archFolder in archFolders) {
    folderPath = file.path(archiveRoot, archFolder)
    # `pattern` is a regular expression, not a glob: the former 'AB-*' matched
    # any file name merely containing "AB". Anchor on the two-letter type
    # prefix that the rest of this file uses to recognise AB files.
    abFiles = list.files(folderPath, pattern = '^AB')
    for (abFileName in abFiles) {
      abFile = read_4D_file(file.path(folderPath, abFileName))
      abNamesDict = tibble(name4d = abFile %>% pull(AB) %>% unique())
      newNamesDict = abNamesDict %>%
        filter(!name4d %in% a$name4d, !is.na(name4d))
      tNewNames = tibble(name4d = newNamesDict$name4d,
                         nameAMR = as.ab(newNamesDict$name4d),
                         nameAMRlong = ab_name(newNamesDict$name4d))
      a = a %>% add_row(tNewNames)
    }
  }
  return(a)
}
# Load the per-day helpers. NOTE(review): presumably this is where
# read_4D_file(), read_4D_MS_file() and ProcessAndMergeDailyFiles() come from,
# since they are used below but not defined in this file -- confirm.
source('4D_day.R')
# TRUE: load the file tables from the cached .rds files; FALSE: rescan the archive.
useCachedFileInfo = TRUE
archivePath = './data/archive'
# Full paths of the monthly import folders (e.g. 'ImportedFromGlimms-2014-09').
# `pattern` is a regular expression matched against the entry name, so it is
# anchored here; the former glob-style 'ImportedFromGlimms-*' matched any name
# that merely contained "ImportedFromGlimms".
folders = list.files(path = archivePath,
                     pattern = '^ImportedFromGlimms-',
                     full.names = TRUE)
# Append the metadata of one file to the file table `x`.
#
# x        : tibble of file metadata (filePath, fileName, fileType, fileDate,
#            modified, size, ...); must not be grouped.
# filePath : path used to look up the modification time and size on disk.
# fileName : bare file name; characters 1-2 give the file type and
#            characters 4-9 are parsed as a yymmdd date.
#
# Returns `x` with the extra row, a refreshed `nFilesPerDate` column (number
# of rows sharing the same fileDate) and rows sorted by descending fileDate.
addFileInfo = function(x, filePath, fileName) {
  modifiedAt = file.mtime(filePath)
  typePrefix = str_sub(fileName, 1, 2)
  parsedDate = ymd(str_sub(fileName, 4, 9))
  sizeOnDisk = file.size(filePath)

  # Columns of `x` that are not listed here (e.g. mdate) are filled with NA
  # for the new row.
  withNewRow = add_row(x,
                       filePath = filePath,
                       fileName = fileName,
                       fileType = typePrefix,
                       fileDate = parsedDate,
                       modified = modifiedAt,
                       size = sizeOnDisk)

  # Recount the files per date over the whole table, including the new row.
  perDate = group_by(withNewRow, fileDate)
  counted = ungroup(mutate(perDate, nFilesPerDate = n()))
  arrange(counted, desc(fileDate))
}
# Summarise the file table into one row per fileDate with the number of files
# of each type (AB, MS, ST, TM, TS) present for that date.
#
# tFiles : tibble with at least the columns `fileDate` and `fileName`; the
#          file type is taken from the first two characters of `fileName`.
#
# Returns a tibble with the columns fileDate, AB, MS, ST, TM, TS (integer
# counts), ordered by fileDate. For backward compatibility the result stays
# grouped by fileDate, exactly as the previous implementation returned it;
# call ungroup() on it if that is not wanted.
deriveFileOverview = function(tFiles) {
  # Vectorised count instead of filtering cur_data() once per type;
  # cur_data() is deprecated since dplyr 1.1.0. Rows whose prefix is NA are
  # not counted, matching what filter() did before.
  countOfType = function(prefixes, type) sum(prefixes == type, na.rm = TRUE)
  tFiles %>%
    group_by(fileDate) %>%
    summarise(AB = countOfType(str_sub(fileName, 1, 2), 'AB'),
              MS = countOfType(str_sub(fileName, 1, 2), 'MS'),
              ST = countOfType(str_sub(fileName, 1, 2), 'ST'),
              TM = countOfType(str_sub(fileName, 1, 2), 'TM'),
              TS = countOfType(str_sub(fileName, 1, 2), 'TS'),
              .groups = 'keep')
}
# Scan every folder in the global `folders` vector and build the file table.
#
# Each entry found in a folder becomes one row, produced by addFileInfo()
# (filePath, fileName, fileType, fileDate, modified, size). Afterwards `mdate`
# (calendar date of `modified`) and `nFilesPerDate` (number of files sharing
# the same fileDate) are added and the rows are sorted by fileDate, fileName.
#
# Returns a tibble; it has zero rows when no files are found.
getFilesInfo = function() {
  # Typed placeholder row: fixes the column types even when nothing is found.
  # It is removed again below.
  placeholder = tibble(filePath = 'deleteme',
                       fileName = '',
                       fileType = '',
                       fileDate = ymd('2022-01-01'),
                       modified = ymd('2022-01-01'),
                       size = 0
  )
  emptyTemplate = placeholder[0, ]

  # Build one single-row table per file and bind them once at the end.
  # Feeding the growing table through addFileInfo() for every file would
  # regroup and re-sort all rows each time (quadratic in the number of files).
  rowsPerFolder = lapply(folders, function(folderPath) {
    lapply(list.files(folderPath, pattern = '*'), function(file) {
      addFileInfo(emptyTemplate,
                  filePath = file.path(folderPath, file),
                  fileName = file)
    })
  })
  fileRows = unlist(rowsPerFolder, recursive = FALSE)

  tFiles = bind_rows(placeholder, fileRows) %>%
    # addFileInfo() already adds an nFilesPerDate column; drop it so the join
    # below yields one nFilesPerDate column rather than nFilesPerDate.x / .y.
    select(-any_of('nFilesPerDate')) %>%
    mutate(mdate = as.Date(ymd_hms(modified))) %>%
    filter(filePath != 'deleteme') %>%
    arrange(fileDate, fileName)
  tFiles = tFiles %>%
    left_join(tFiles %>% count(fileDate, name = 'nFilesPerDate'), by = 'fileDate')
  return(tFiles)
}
# Add the metadata of all files of one day to the file table `x`.
#
# x         : file table as produced by getFilesInfo() / addFileInfo().
# folders   : character vector of monthly archive folder paths; the folders
#             whose path contains the 'YYYY-MM' of `ymdDate` are searched.
# ymdDate   : the day (Date) whose files should be added.
# overwrite : currently unused; kept so existing calls keep working.
#
# Returns `x` with one row added per matching file. Files already present in
# `x` are not detected, so calling this twice for a day duplicates its rows.
addFileInfoOfDay = function(x, folders, ymdDate, overwrite = FALSE) {
  datePattern = format(ymdDate, format = '%y%m%d')
  # Select the month folder of the requested day. This used to be derived
  # from today(), which looked in the wrong folder for any other month.
  dirPattern = format(ymdDate, format = '%Y-%m')
  monthFolders = folders[which(str_detect(folders, fixed(dirPattern)))]
  # Loop over the matches: with more than one matching folder a single
  # list.files()/file.path() call would pair file names with the wrong folders.
  for (folder in monthFolders) {
    dayFiles = list.files(path = folder)
    # addFileInfo() reads the date from characters 4-9 of the file name, so
    # match exactly there rather than anywhere in the name.
    dayFiles = dayFiles[which(str_sub(dayFiles, 4, 9) == datePattern)]
    for (fileName in dayFiles) {
      cat(paste0('Processing ', fileName, '\n'))
      x = x %>% addFileInfo(filePath = file.path(folder, fileName),
                            fileName = fileName)
    }
  }
  return(x)
}
# Add the metadata of today's files to the file table.
#
# tFiles  : file table as produced by getFilesInfo() / addFileInfo().
# folders : character vector of monthly archive folder paths; selecting the
#           right month folder is left to addFileInfoOfDay().
#
# Returns `tFiles` with one row added per file of today.
addFileInfoOfToday = function(tFiles, folders) {
  # The previous version stopped in a leftover browser() call and then passed
  # a non-existent `folderPath` argument (and no `folders`) to
  # addFileInfoOfDay(), which failed with "unused argument".
  tFiles = tFiles %>%
    addFileInfoOfDay(folders = folders,
                     ymdDate = today(),
                     overwrite = FALSE)
  return(tFiles)
}
# Box plots of file size per day of the week, one facet per file type.
#
# tFiles : file table with the columns fileDate, fileType and size (bytes).
#
# Only files dated after 2014-10-01 up to and including 2014-10-31 are shown,
# and files of type 'Ee' are left out. Sizes are converted from bytes to KB
# and the y axis is limited to 0-1250 KB. Returns the ggplot object.
plotFileSize = function(tFiles) {
  selectedFiles = tFiles %>%
    filter(fileDate > ymd('2014-10-01')) %>%
    filter(fileDate <= ymd('2014-10-31')) %>%
    filter(fileType != 'Ee')

  plotData = selectedFiles %>%
    mutate(dayOfWeek = wday(fileDate),
           dayOfWeekName = weekdays(fileDate),
           size = size / (1024))

  sizePlot = ggplot(plotData, aes(x = dayOfWeek, y = size, group = dayOfWeek)) +
    geom_boxplot(aes(fill = fileType)) +
    scale_y_continuous(limits = c(0, 1250)) +
    facet_wrap(facets = c(~fileType)) +
    labs(y = 'size [KB]') +
    theme_light() +
    theme(plot.title = element_text(face = 'bold'))
  sizePlot
}
# Obtain the file table (tFiles) and the per-date overview (tFileOvw):
# either rebuild both from the archive and store them as .rds files stamped
# with today's date, or load the snapshot that was cached on 2022-10-13.
if (!useCachedFileInfo) {
  tFiles = getFilesInfo()
  tFileOvw = deriveFileOverview(tFiles)
  write_rds(tFiles, paste0('./data/tFiles_', today(), '.rds'))
  write_rds(tFileOvw, paste0('./data/tFileOvw_', today(), '.rds'))
} else {
  tFiles = read_rds('./data/tFiles_2022-10-13.rds')
  tFileOvw = read_rds('./data/tFileOvw_2022-10-13.rds')
}
# Read all files of one type for one day and stack them into a single table.
#
# tFiles : file table with the columns fileDate, fileType and filePath.
# date   : the day to read.
# prefix : two-letter file type, compared against the fileType column.
#
# Files are read with read_4D_file() in the order they appear in `tFiles`;
# the rows of every further file are appended to those of the first with
# add_row(). Returns NULL when there is no file for that day and type.
readDailyFile = function(tFiles, date, prefix) {
  matchingPaths = pull(filter(tFiles, fileDate == date, fileType == prefix),
                       filePath)
  if (length(matchingPaths) == 0) {
    return(NULL)
  }

  readOne = function(path) {
    cat(silver(paste0('Reading ', path)), '\n')
    read_4D_file(path)
  }

  firstContent = readOne(matchingPaths[1])
  # Fold over the remaining paths: read the next file, then append it.
  Reduce(
    function(soFar, path) {
      nextContent = readOne(path)
      add_row(soFar, nextContent)
    },
    matchingPaths[-1],
    firstContent
  )
}
# Read all MS files of one day and stack them into a single table.
#
# tFiles : file table with the columns fileDate, fileType and filePath.
# date   : the day to read.
#
# Works like readDailyFile() for file type 'MS', but parses each file with
# read_4D_MS_file(). Returns NULL when there is no MS file for that day.
readDailyMSFile = function(tFiles, date) {
  msPaths = tFiles %>%
    filter(fileDate == date, fileType == 'MS') %>%
    pull(filePath)

  if (length(msPaths) == 0) {
    return(NULL)
  }

  combined = NULL
  for (idx in seq_along(msPaths)) {
    msPath = msPaths[idx]
    cat(silver(paste0('Reading ', msPath)), '\n')
    parsed = read_4D_MS_file(msPath)
    # The first file starts the table; every later file is appended to it.
    combined = if (idx == 1) parsed else combined %>% add_row(parsed)
  }
  return(combined)
}
# Read the five daily files (AB, MS, ST, TM, TS) of one day.
#
# tFiles : file table with the columns fileDate, fileType and filePath.
# date   : the day to read.
#
# Returns an unnamed list in the fixed order AB, MS, ST, TM, TS; an element
# is NULL when no file of that type exists for the day.
readDailyFiles = function(tFiles, date) {
  readType = function(prefix) readDailyFile(tFiles, date, prefix)
  # list() evaluates its arguments left to right, so the files are read in
  # the order AB, MS, ST, TM, TS.
  list(
    readType('AB'),
    readDailyMSFile(tFiles, date),
    readType('ST'),
    readType('TM'),
    readType('TS')
  )
}
# Read and merge the daily files of every date present in the file table.
#
# tFileInfo       : file table (columns fileDate, fileType, filePath, ...);
#                   rows with an NA fileDate are ignored.
# tFileOvw        : per-date overview table; only printed for progress output.
# createDebugList : when TRUE, also return the merged table of every date.
#
# For each date the five daily files are read with readDailyFiles() and handed
# to ProcessAndMergeDailyFiles(); dates without an AB file, or for which the
# `merged` element of its result is NULL, are skipped. The merged rows get a
# leading `date` column and a whitespace-trimmed `Monsternummer`.
# NOTE(review): ProcessAndMergeDailyFiles() is defined outside this file; it
# is assumed to return a list with (at least) a `merged` table -- confirm.
#
# Returns the row-bound merged table of all dates, or NULL when no date
# produced any rows. With createDebugList = TRUE it returns
# list(<merged table>, <debug list>), where element d of the debug list is the
# untrimmed merged table of the d-th date (NULL for skipped dates).
readArchive = function(tFileInfo, tFileOvw, createDebugList = FALSE) {
  tMergeDatesDebugList = list()
  dates = tFileInfo %>% filter(!is.na(fileDate)) %>% pull(fileDate) %>% unique()
  # One slot per date, bound together once after the loop; appending to a
  # growing table with add_row() copied all earlier rows on every iteration.
  mergedPerDate = vector('list', length(dates))
  # seq_along() makes the loop a no-op when there are no dates at all.
  for (d in seq_along(dates)) {
    currDate = dates[d]
    cat(paste0('\nd = ', d, ', date: ', currDate, '\n'))
    print(tFileOvw %>% filter(fileDate == currDate))
    dateFiles = readDailyFiles(tFileInfo, currDate)
    if (is.null(dateFiles[[1]])) {
      next
    }
    processed = ProcessAndMergeDailyFiles(
      # Drop the auto-named column '...15' when present; any_of() does not
      # fail for AB files that lack it.
      dateFiles[[1]] %>% select(-any_of('...15')),
      dateFiles[[2]],
      dateFiles[[3]],
      dateFiles[[4]],
      dateFiles[[5]]
    )
    if (is.null(processed$merged)) {
      next
    }
    tMergeDate = processed$merged %>% # abFile, stFile and msFile joined
      mutate(date = currDate, .before = 1)
    if (createDebugList) tMergeDatesDebugList[[d]] = tMergeDate
    # Separate tables of (ab|st|ms|ts|tm)File of the current day can be found
    # in processed$(ab|st|ms|ts|tm)File. The TS table used to be accumulated
    # here as well, but it was never returned and could pick up a stale table
    # from an earlier date, so that bookkeeping has been removed.
    if (nrow(tMergeDate) > 0) {
      # Trim per day, before binding: this coerces Monsternummer to character
      # for every day and avoids re-trimming all earlier rows each iteration.
      mergedPerDate[[d]] = tMergeDate %>%
        mutate(Monsternummer = str_trim(Monsternummer))
    }
  }
  mergedPerDate = Filter(Negate(is.null), mergedPerDate)
  # Keep returning NULL (not an empty tibble) when nothing was merged.
  tMergeDates = if (length(mergedPerDate) > 0) bind_rows(mergedPerDate) else NULL
  if (createDebugList) {
    return(list(tMergeDates, tMergeDatesDebugList))
  } else {
    return(tMergeDates)
  }
}