# Source listing metadata: 300 lines, 8.8 KiB, R
require(tidyverse)
|
|
if (!require(lubridate)) {
|
|
install.packages('lubridate')
|
|
require(lubridate)
|
|
}
|
|
require(crayon)
|
|
require(stringr)
|
|
|
|
# TODO: check what goes into ProcessAndMergeDailyFiles: is Monsternummer a character (is.character)?
# TODO: minimum number of rows = e.g. 10, for privacy reasons
# TODO: 2 versions: MMBI & 'Algemeen' (general)
|
|
|
|
# Report the memory footprint of every object in the global environment.
#
# Returns an unnamed list of length 2:
#   [[1]] a tibble with one row per global object (columns `object`,
#         `size` in bytes and `sizeMB`), sorted by decreasing size;
#   [[2]] the summed size of all listed objects in MB.
#
# Unlike the earlier version, the table no longer contains an empty
# placeholder row (object = '', size = 0), and an empty global environment
# yields a zero-row table and a total of 0 instead of an error.
memSize <- function() {
  objNames <- ls(envir = globalenv())

  # Look every object up in the global environment explicitly: a bare get()
  # starts its search in this function's frame and would return a local
  # variable whenever a global object happens to share its name.
  sizesBytes <- vapply(
    objNames,
    function(objName) {
      as.numeric(object.size(get(objName, envir = globalenv())))
    },
    numeric(1),
    USE.NAMES = FALSE
  )

  result <- tibble(
    object = objNames,
    size = sizesBytes,
    sizeMB = sizesBytes / 1024^2
  ) %>%
    arrange(desc(size))

  return(list(result, sum(result$sizeMB)))
}
|
|
|
|
# Scan the archived AB files and extend the lookup table `a` with every
# antibiotic name that is not in it yet.
#
# Reads two names that are not defined in this function: `dataFolder` (root
# of the data directory) and `a`. `a` is first read from the enclosing
# environment and then updated locally; it presumably has the columns
# name4d, nameAMR and nameAMRlong - TODO confirm against the caller.
#
# Returns the extended copy of `a`; the outer `a` is not modified.
gatherAbIds <- function() {
  archiveRoot <- file.path(dataFolder, 'archive')

  # Only sub folders whose relative name is exactly 26 characters long are
  # considered.
  subFolders <- list.dirs(archiveRoot, full.names = FALSE)
  subFolders <- subFolders[str_length(subFolders) == 26]

  for (subFolder in subFolders) {
    folderPath <- file.path(archiveRoot, subFolder)

    for (abFileName in list.files(folderPath, pattern = 'AB-*')) {
      abData <- read_4D_file(file.path(folderPath, abFileName))

      # Distinct, non-missing names of this file that `a` does not know yet.
      seenNames <- unique(pull(abData, AB))
      isNew <- !(seenNames %in% a$name4d) & !is.na(seenNames)
      newNames <- seenNames[isNew]

      additions <- tibble(
        name4d = newNames,
        nameAMR = as.ab(newNames),
        nameAMRlong = ab_name(newNames)
      )
      a <- add_row(a, additions)
    }
  }

  return(a)
}
|
|
|
|
# Load the definitions from 4D_day.R (presumably the 4D file readers and
# ProcessAndMergeDailyFiles used further down - TODO confirm).
source('4D_day.R')

# TRUE: load the previously saved file tables instead of rescanning the
# archive (see the if/else block further down).
useCachedFileInfo <- TRUE

# Root of the archive and the full paths of its entries whose name matches
# the pattern below.
archivePath <- './data/archive'
folders <- list.files(
  path = archivePath,
  pattern = 'ImportedFromGlimms-*',
  full.names = TRUE
)
|
|
|
|
# Append one file to the file table `x` and refresh the per-date counter.
#
# x        : tibble of file info, one row per file.
# filePath : path of the file; used for its modification time and size.
# fileName : bare file name; characters 1-2 give the file type, characters
#            4-9 are parsed as a date with ymd().
#
# Returns `x` with the extra row, column nFilesPerDate recomputed for all
# rows, ungrouped and sorted by descending fileDate.
addFileInfo <- function(x, filePath, fileName) {
  typePrefix <- str_sub(fileName, 1, 2)
  dateStamp <- ymd(str_sub(fileName, 4, 9))
  lastModified <- file.mtime(filePath)
  sizeBytes <- file.size(filePath)

  extended <- add_row(
    x,
    filePath = filePath,
    fileName = fileName,
    fileType = typePrefix,
    fileDate = dateStamp,
    modified = lastModified,
    size = sizeBytes
  )

  # Recount the number of files per date over the whole table.
  counted <- ungroup(mutate(group_by(extended, fileDate), nFilesPerDate = n()))

  return(arrange(counted, desc(fileDate)))
}
|
|
|
|
# Summarise the file table into one row per fileDate with the number of
# files of each type.
#
# tFiles : tibble with at least the columns fileDate and fileName; the file
#          type is taken from the first two characters of fileName.
#
# Returns an ungrouped tibble with the columns fileDate, AB, MS, ST, TM, TS
# (integer counts), ordered by fileDate.
#
# This is a single summarise() instead of five cur_data() filters per group:
# cur_data() is deprecated in current dplyr, and the result is no longer
# left grouped by fileDate.
deriveFileOverview <- function(tFiles) {
  # Number of file names starting with `prefix`; missing names count as no
  # match, as they did with filter().
  countType <- function(fileName, prefix) {
    sum(str_sub(fileName, 1, 2) == prefix, na.rm = TRUE)
  }

  tFiles %>%
    group_by(fileDate) %>%
    summarise(
      AB = countType(fileName, 'AB'),
      MS = countType(fileName, 'MS'),
      ST = countType(fileName, 'ST'),
      TM = countType(fileName, 'TM'),
      TS = countType(fileName, 'TS'),
      .groups = 'drop'
    )
}
|
|
|
|
# Build the file table for every file found in the archive folders.
#
# Reads the vector `folders` (folder paths) from the enclosing environment.
#
# Returns a tibble with one row per file and the columns filePath, fileName,
# fileType (first two characters of the name), fileDate (characters 4-9 of
# the name parsed with ymd()), modified, size, mdate and nFilesPerDate,
# sorted by fileDate and fileName.
#
# The table is built in one vectorised step. The previous row-by-row
# construction regrouped and re-sorted the whole table for every file, needed
# a dummy seed row, and ended with two count columns (nFilesPerDate.x and
# nFilesPerDate.y) because the final join repeated a column that was already
# present. There is now exactly one nFilesPerDate column.
getFilesInfo <- function() {
  filePaths <- unlist(
    lapply(folders, function(folderPath) {
      file.path(folderPath, list.files(folderPath))
    }),
    use.names = FALSE
  )
  # unlist() of an empty list is NULL; keep a character vector so the
  # column types below stay well defined when there are no folders.
  if (is.null(filePaths)) {
    filePaths <- character(0)
  }
  fileNames <- basename(filePaths)

  tFiles <- tibble(
    filePath = filePaths,
    fileName = fileNames,
    fileType = str_sub(fileNames, 1, 2),
    fileDate = ymd(str_sub(fileNames, 4, 9)),
    modified = file.mtime(filePaths),
    size = file.size(filePaths)
  ) %>%
    mutate(mdate = as.Date(ymd_hms(modified))) %>%
    arrange(fileDate, fileName)

  tFiles <- tFiles %>%
    left_join(count(tFiles, fileDate, name = 'nFilesPerDate'), by = 'fileDate')

  return(tFiles)
}
|
|
|
|
# Add all files of one day to the file table.
#
# x         : file table to extend (see addFileInfo).
# folders   : character vector of archive folder paths; the folder(s) whose
#             path contains the year-month ('%Y-%m') of `ymdDate` are
#             searched.
# ymdDate   : the day to add; files whose name contains this date formatted
#             as '%y%m%d' are picked up.
# overwrite : currently unused; kept for interface compatibility.
#
# Returns `x` extended with one row per matching file.
#
# The month folder is now derived from `ymdDate`. It used to be derived from
# today's date, so any day outside the current month searched the wrong
# folder. Each matching folder is processed separately so that file paths
# stay valid when more than one folder matches.
addFileInfoOfDay <- function(x, folders, ymdDate, overwrite = F) {
  datePattern <- format(ymdDate, format = '%y%m%d')
  dirPattern <- format(ymdDate, format = '%Y-%m')
  monthFolders <- folders[which(str_detect(folders, dirPattern))]

  for (folder in monthFolders) {
    dayFiles <- list.files(path = folder, pattern = datePattern)
    for (fileName in dayFiles) {
      cat(paste0('Processing ', fileName, '\n'))
      x <- x %>% addFileInfo(filePath = file.path(folder, fileName),
                             fileName = fileName)
    }
  }

  return(x)
}
|
|
|
|
# Add all files dated today to the file table.
#
# tFiles  : file table to extend.
# folders : character vector of archive folder paths, handed on to
#           addFileInfoOfDay(), which selects the folder of the month itself.
#
# Returns the extended file table.
#
# Fixes: the leftover browser() call is removed, and addFileInfoOfDay() is
# called with its real arguments. The previous call passed a non-existent
# argument `folderPath` and omitted `folders`, so it always failed.
addFileInfoOfToday <- function(tFiles, folders) {
  tFiles <- tFiles %>%
    addFileInfoOfDay(folders = folders,
                     ymdDate = today(),
                     overwrite = FALSE)
  return(tFiles)
}
|
|
|
|
# Box plots of file size per day of the week, one panel per file type.
#
# tFiles    : file table with the columns fileDate, fileType and size (bytes).
# fromDate  : start of the period, exclusive (fileDate > fromDate).
# toDate    : end of the period, inclusive (fileDate <= toDate).
# maxSizeKB : upper limit of the y axis in KB (bytes / 1024); larger values
#             fall outside the scale limits.
#
# The defaults reproduce the previously hard-coded window (2014-10-01 to
# 2014-10-31) and axis limit (1250). Files of type 'Ee' are always excluded.
#
# Returns the ggplot object.
plotFileSize <- function(tFiles,
                         fromDate = ymd('2014-10-01'),
                         toDate = ymd('2014-10-31'),
                         maxSizeKB = 1250) {
  tFiles %>%
    filter(fileDate > fromDate,
           fileDate <= toDate,
           fileType != 'Ee') %>%
    mutate(dayOfWeek = wday(fileDate),
           dayOfWeekName = weekdays(fileDate),
           size = size / (1024)
    ) %>%
    ggplot(aes(x = dayOfWeek, y = size, group = dayOfWeek)) +
    geom_boxplot(aes(fill = fileType)) +
    scale_y_continuous(limits = c(0, maxSizeKB)) +
    facet_wrap(facets = c(~fileType)) +
    labs(y = 'size [KB]') +
    theme_light() +
    theme(
      plot.title = element_text(face = 'bold')
    )
}
|
|
|
|
# Obtain the file table (tFiles) and the per-date overview (tFileOvw):
# either rescan the archive and save both tables under today's date, or load
# the tables saved on 2022-10-13.
if (!useCachedFileInfo) {
  tFiles <- getFilesInfo()
  tFileOvw <- deriveFileOverview(tFiles)

  write_rds(tFiles, paste0('./data/tFiles_', today(), '.rds'))
  write_rds(tFileOvw, paste0('./data/tFileOvw_', today(), '.rds'))
} else {
  tFiles <- read_rds('./data/tFiles_2022-10-13.rds')
  tFileOvw <- read_rds('./data/tFileOvw_2022-10-13.rds')
}
|
|
|
|
# Read all files of one type for one day and stack their contents.
#
# tFiles : file table with the columns fileDate, fileType and filePath.
# date   : the day to read.
# prefix : file type to read (compared with column fileType).
#
# Returns the contents of the matching files, read with read_4D_file() and
# appended in table order, or NULL when no file matches.
readDailyFile <- function(tFiles, date, prefix) {
  matching <- filter(tFiles, fileDate == date, fileType == prefix)
  paths <- pull(matching, filePath)

  result <- NULL
  isFirst <- TRUE
  for (path in paths) {
    cat(silver(paste0('Reading ', path)), '\n')
    fileContent <- read_4D_file(path)

    if (isFirst) {
      # The first file seeds the result; later files are appended to it.
      result <- fileContent
      isFirst <- FALSE
    } else {
      result <- add_row(result, fileContent)
    }
  }

  return(result)
}
|
|
|
|
# Read all MS files of one day and stack their contents.
#
# tFiles : file table with the columns fileDate, fileType and filePath.
# date   : the day to read.
#
# Returns the contents of the matching files, read with read_4D_MS_file()
# and appended in table order, or NULL when there is no MS file for that day.
readDailyMSFile <- function(tFiles, date) {
  matching <- filter(tFiles, fileDate == date, fileType == 'MS')
  paths <- pull(matching, filePath)

  result <- NULL
  isFirst <- TRUE
  for (path in paths) {
    cat(silver(paste0('Reading ', path)), '\n')
    msFileContent <- read_4D_MS_file(path)

    if (isFirst) {
      # The first file seeds the result; later files are appended to it.
      result <- msFileContent
      isFirst <- FALSE
    } else {
      result <- add_row(result, msFileContent)
    }
  }

  return(result)
}
|
|
|
|
# Read the five daily files (AB, MS, ST, TM, TS) of one day.
#
# tFiles : file table (see readDailyFile).
# date   : the day to read.
#
# Returns an unnamed list in the fixed order AB, MS, ST, TM, TS; an element
# is NULL when no file of that type exists for the day. MS files go through
# their own reader.
readDailyFiles <- function(tFiles, date) {
  list(
    readDailyFile(tFiles, date, 'AB'),
    readDailyMSFile(tFiles, date),
    readDailyFile(tFiles, date, 'ST'),
    readDailyFile(tFiles, date, 'TM'),
    readDailyFile(tFiles, date, 'TS')
  )
}
|
|
|
|
# Read and merge the daily files of every date in the archive.
#
# tFileInfo       : file table with the columns fileDate, fileType and
#                   filePath; rows with a missing fileDate are ignored.
# tFileOvw        : per-date overview table; only printed for progress.
# createDebugList : when TRUE, also return the untrimmed merged table of
#                   every processed date.
#
# Returns the merged tables of all dates stacked into one tibble, with a
# leading `date` column and whitespace-trimmed Monsternummer, or NULL when no
# date produced any rows. With createDebugList = TRUE it returns
# list(<that tibble>, <list of per-date tables indexed by date position>).
#
# Changes compared with the previous implementation:
# - no dates: the loop is skipped (1:length() used to iterate over 1 and 0);
# - days whose merged table is empty are skipped, which used to end in
#   mutate() being called on NULL when the first day was empty;
# - per-day tables are collected in a list and bound once, and
#   Monsternummer is trimmed per day, instead of appending to and re-trimming
#   the growing table on every iteration;
# - the helper column `...15` is dropped only when it is present;
# - the bookkeeping of the per-day TS tables is removed: it was never
#   returned and re-appended a stale table of an earlier day whenever a day
#   had no TS file. The separate tables of a day remain available in
#   processed$(ab|st|ms|ts|tm)File if they are needed here.
readArchive <- function(tFileInfo, tFileOvw, createDebugList = FALSE) {
  tMergeDatesDebugList <- list()
  dates <- tFileInfo %>% filter(!is.na(fileDate)) %>% pull(fileDate) %>% unique()
  mergedPerDate <- vector('list', length(dates))

  for (d in seq_along(dates)) {
    currDate <- dates[d]
    cat(paste0('\nd = ', d, ', date: ', currDate, '\n'))
    print(tFileOvw %>% filter(fileDate == currDate))

    dateFiles <- readDailyFiles(tFileInfo, currDate)

    # Without an AB file there is nothing to merge for this day.
    if (is.null(dateFiles[[1]])) {
      next
    }

    processed <- ProcessAndMergeDailyFiles(
      dateFiles[[1]] %>% select(-any_of('...15')),
      dateFiles[[2]],
      dateFiles[[3]],
      dateFiles[[4]],
      dateFiles[[5]]
    )

    if (is.null(processed$merged)) {
      next
    }

    tMergeDate <- processed$merged %>% # abFile, stFile and msFile joined
      mutate(date = currDate, .before = 1)
    if (createDebugList) tMergeDatesDebugList[[d]] <- tMergeDate

    if (nrow(tMergeDate) > 0) {
      mergedPerDate[[d]] <- tMergeDate %>%
        mutate(Monsternummer = str_trim(Monsternummer))
    }
  }

  # Skipped days leave NULL slots; drop them before binding.
  mergedPerDate <- Filter(Negate(is.null), mergedPerDate)
  tMergeDates <- if (length(mergedPerDate) > 0) bind_rows(mergedPerDate) else NULL

  if (createDebugList) {
    return(list(tMergeDates, tMergeDatesDebugList))
  } else {
    return(tMergeDates)
  }
}
|