# Dependencies ----
# library() fails loudly when a package is missing, whereas require() only
# returns FALSE and lets the script break later at the first unresolved call.
library(tidyverse)
if (!requireNamespace('lubridate', quietly = TRUE)) {
  install.packages('lubridate')
}
library(lubridate)
library(crayon)
library(stringr)

# TODO: check what goes into ProcessAndMergeDailyFiles: is Monsternummer a
#       character column?
# TODO: minimum number of rows = e.g. 10, for privacy reasons
# TODO: 2 versions: MMBI & 'Algemeen' (general)

# Memory diagnostics ----

# Report the memory footprint of every object in the global environment.
#
# Returns an unnamed list of two elements:
#   [[1]] tibble(object, size [bytes], sizeMB), sorted by descending size
#   [[2]] total size in MB
memSize = function() {
  objNames = ls(envir = globalenv())
  # Look the objects up explicitly in the global environment: a plain
  # get(name) resolves from this function's frame first, so a global that
  # shares its name with one of our locals would be measured incorrectly.
  sizeBytes = vapply(
    objNames,
    function(objName) {
      as.numeric(object.size(get(objName, envir = globalenv())))
    },
    numeric(1),
    USE.NAMES = FALSE
  )
  result = tibble(
    object = objNames,
    size = sizeBytes,
    sizeMB = sizeBytes / 1024^2
  ) %>%
    arrange(desc(size))
  return(list(result, sum(result$sizeMB)))
}

# Antibiotic name dictionary ----

# Extend the antibiotic-name dictionary `a` with names found in archived
# AB files.
#
# NOTE(review): `dataFolder` and the starting dictionary `a` are not defined
# in this file; they are read from the calling/global environment and must
# exist before this function is called. `read_4D_file`, `as.ab` and `ab_name`
# are likewise defined elsewhere (the latter two presumably come from the AMR
# package -- confirm).
#
# Returns `a` with one row (name4d, nameAMR, nameAMRlong) appended for every
# non-NA AB name that was not yet present in `a$name4d`.
gatherAbIds = function() {
  archiveRoot = file.path(dataFolder, 'archive')
  archFolders = list.dirs(archiveRoot, full.names = FALSE)
  # Keep only folder names of exactly 26 characters (presumably the
  # 'ImportedFromGlimms-YYYY-MM' month folders).
  archFolders = archFolders[str_length(archFolders) == 26]

  for (archFolder in archFolders) {
    # `pattern` is a regular expression, not a glob: the former 'AB-*'
    # matched every file name that merely contained "AB".
    abFiles = list.files(file.path(archiveRoot, archFolder), pattern = '^AB')
    for (abFileName in abFiles) {
      abFile = read_4D_file(file.path(archiveRoot, archFolder, abFileName))
      names4d = abFile %>% pull(AB) %>% unique()
      newNames = names4d[!names4d %in% a$name4d & !is.na(names4d)]
      tNewNames = tibble(
        name4d = newNames,
        nameAMR = as.ab(newNames),
        nameAMRlong = ab_name(newNames)
      )
      a = a %>% add_row(tNewNames)
    }
  }
  return(a)
}

# Configuration ----

source('4D_day.R')

useCachedFileInfo = TRUE
archivePath = './data/archive'
# `pattern` is a regular expression; anchor it so that only entries whose
# name starts with the month-folder prefix are picked up.
folders = list.files(
  path = archivePath,
  pattern = '^ImportedFromGlimms-',
  full.names = TRUE
)

# File inventory ----

# Append one file to the file-info table.
#
# x         file-info tibble with (at least) the columns filePath, fileName,
#           fileType, fileDate, modified and size
# filePath  path of the file on disk (used for mtime and size)
# fileName  bare file name; characters 1-2 give the file type and characters
#           4-9 are parsed as a yymmdd date
#
# Returns `x` with the new row added, `nFilesPerDate` recomputed for every
# row, sorted by descending fileDate.
addFileInfo = function(x, filePath, fileName) {
  x %>%
    add_row(
      filePath = filePath,
      fileName = fileName,
      fileType = str_sub(fileName, 1, 2),
      fileDate = ymd(str_sub(fileName, 4, 9)),
      modified = file.mtime(filePath),
      size = file.size(filePath)
    ) %>%
    group_by(fileDate) %>%
    mutate(nFilesPerDate = n()) %>%
    ungroup() %>%
    arrange(desc(fileDate))
}

# Count, per file date, how many files of each type (AB, MS, ST, TM, TS)
# are present. The type is taken from the first two characters of fileName.
#
# Returns a tibble with one row per fileDate and the integer columns AB, MS,
# ST, TM and TS. The result is grouped by fileDate, as it always was, so that
# it stays interchangeable with previously cached overviews.
deriveFileOverview = function(tFiles) {
  # summarise() replaces the former mutate(cur_data() ...) + slice(1)
  # construction; cur_data() is deprecated as of dplyr 1.1.0.
  tFiles %>%
    mutate(filePrefix = str_sub(fileName, 1, 2)) %>%
    group_by(fileDate) %>%
    summarise(
      AB = sum(filePrefix == 'AB', na.rm = TRUE),
      MS = sum(filePrefix == 'MS', na.rm = TRUE),
      ST = sum(filePrefix == 'ST', na.rm = TRUE),
      TM = sum(filePrefix == 'TM', na.rm = TRUE),
      TS = sum(filePrefix == 'TS', na.rm = TRUE)
    ) %>%
    group_by(fileDate)
}

# Build the file-info table for every file in the (global) `folders`.
#
# Returns a tibble with one row per file, sorted by fileDate and fileName,
# including `mdate` (date part of the modification time).
getFilesInfo = function() {
  # Seed row that fixes the column types; it is filtered out again below.
  tFiles = tibble(
    filePath = 'deleteme',
    fileName = '',
    fileType = '',
    fileDate = ymd('2022-01-01'),
    modified = ymd('2022-01-01'),
    size = 0
  )

  for (folderPath in folders) {
    # No `pattern`: the former '*' was a glob, not a regular expression.
    for (fileName in list.files(folderPath)) {
      tFiles = tFiles %>%
        addFileInfo(
          filePath = file.path(folderPath, fileName),
          fileName = fileName
        )
    }
  }

  tFiles = tFiles %>%
    mutate(mdate = as.Date(ymd_hms(modified))) %>%
    filter(filePath != 'deleteme') %>%
    arrange(fileDate, fileName)

  # NOTE(review): addFileInfo() already adds an `nFilesPerDate` column (which
  # still counted the seed row), so this join appears to yield
  # `nFilesPerDate.x` / `nFilesPerDate.y` whenever at least one file was
  # added. Left unchanged to keep the schema identical to the cached RDS
  # files -- confirm which column downstream code expects before cleaning up.
  tFiles %>%
    left_join(
      tFiles %>% count(fileDate, name = 'nFilesPerDate'),
      by = 'fileDate'
    )
}

# Add the files of a single day to the file-info table.
#
# x          file-info tibble (see addFileInfo)
# folders    character vector of archive month-folder paths
# ymdDate    the day whose files should be added (Date)
# overwrite  currently unused; kept for interface compatibility
#
# Returns `x` extended with every file in the month folder of `ymdDate`
# whose name contains the date formatted as yymmdd.
addFileInfoOfDay = function(x, folders, ymdDate, overwrite = FALSE) {
  datePattern = format(ymdDate, format = '%y%m%d')
  # Select the folder of the month of the requested date (this used to be
  # the month of today(), which is wrong for any other month).
  dirPattern = format(ymdDate, format = '%Y-%m')
  monthFolders = folders[str_detect(folders, dirPattern)]

  # full.names keeps each file paired with its own folder, also when more
  # than one folder matches.
  dayFiles = list.files(
    path = monthFolders,
    pattern = datePattern,
    full.names = TRUE
  )
  for (filePath in dayFiles) {
    fileName = basename(filePath)
    cat(paste0('Processing ', fileName, '\n'))
    x = x %>% addFileInfo(filePath = filePath, fileName = fileName)
  }
  return(x)
}

# Add today's files to the file-info table.
#
# tFiles   file-info tibble (see addFileInfo)
# folders  character vector of archive month-folder paths
#
# Returns `tFiles` extended with today's files.
addFileInfoOfToday = function(tFiles, folders) {
  # Previously this passed a non-existent `folderPath` argument (and still
  # contained a browser() debugging call), so every call failed.
  addFileInfoOfDay(
    tFiles,
    folders = folders,
    ymdDate = today(),
    overwrite = FALSE
  )
}

# Box plot of file size [KB] per day of the week, one panel per file type.
#
# tFiles    file-info tibble
# fromDate  exclusive lower bound on fileDate (default 2014-10-01)
# toDate    inclusive upper bound on fileDate (default 2014-10-31)
#
# Files of type 'Ee' are left out. Returns a ggplot object.
plotFileSize = function(tFiles,
                        fromDate = ymd('2014-10-01'),
                        toDate = ymd('2014-10-31')) {
  plotData = tFiles %>%
    filter(
      fileDate > fromDate,
      fileDate <= toDate,
      fileType != 'Ee'
    ) %>%
    mutate(
      dayOfWeek = wday(fileDate),
      dayOfWeekName = weekdays(fileDate),
      size = size / 1024
    )

  ggplot(plotData, aes(x = dayOfWeek, y = size, group = dayOfWeek)) +
    geom_boxplot(aes(fill = fileType)) +
    scale_y_continuous(limits = c(0, 1250)) +
    facet_wrap(~fileType) +
    labs(y = 'size [KB]') +
    theme_light() +
    theme(plot.title = element_text(face = 'bold'))
}

# Load or (re)build the file inventory ----

if (useCachedFileInfo) {
  tFiles = read_rds('./data/tFiles_2022-10-13.rds')
  tFileOvw = read_rds('./data/tFileOvw_2022-10-13.rds')
} else {
  tFiles = getFilesInfo()
  tFileOvw = deriveFileOverview(tFiles)
  write_rds(tFiles, paste0('./data/tFiles_', today(), '.rds'))
  write_rds(tFileOvw, paste0('./data/tFileOvw_', today(), '.rds'))
}

# Reading daily files ----

# Read and row-bind all files of one type for one day.
#
# tFiles  file-info tibble
# date    the day to read (compared against fileDate)
# prefix  two-letter file type, e.g. 'AB'
# reader  function(path) that reads one file; defaults to read_4D_file
#         (defined outside this file)
#
# Returns the combined content, or NULL when no file matches.
readDailyFile = function(tFiles, date, prefix, reader = read_4D_file) {
  filePaths = tFiles %>%
    filter(fileDate == date, fileType == prefix) %>%
    pull(filePath)

  result = NULL
  for (filePath in filePaths) {
    cat(silver(paste0('Reading ', filePath)), '\n')
    fileContent = reader(filePath)
    if (is.null(result)) {
      result = fileContent
    } else {
      result = result %>% add_row(fileContent)
    }
  }
  return(result)
}

# Read and row-bind all MS files for one day, using read_4D_MS_file
# (defined outside this file) as the reader.
#
# Returns the combined content, or NULL when no MS file matches.
readDailyMSFile = function(tFiles, date) {
  readDailyFile(tFiles, date, 'MS', reader = read_4D_MS_file)
}

# Read all five file types of one day.
#
# Returns an unnamed list in the fixed order AB, MS, ST, TM, TS; an element
# is NULL when no file of that type exists for `date`.
readDailyFiles = function(tFiles, date) {
  abFile = readDailyFile(tFiles, date, 'AB')
  msFile = readDailyMSFile(tFiles, date)
  stFile = readDailyFile(tFiles, date, 'ST')
  tmFile = readDailyFile(tFiles, date, 'TM')
  tsFile = readDailyFile(tFiles, date, 'TS')
  return(list(abFile, msFile, stFile, tmFile, tsFile))
}

# Reading the whole archive ----

# Read, process and merge the daily files of every date in the inventory.
#
# tFileInfo        file-info tibble (see getFilesInfo)
# tFileOvw         per-date overview (see deriveFileOverview); only printed
#                  as progress information
# createDebugList  when TRUE, also return the per-date merged tables
#
# Dates without an AB file, and dates for which ProcessAndMergeDailyFiles
# (defined outside this file) yields no `merged` table, are skipped.
#
# Returns the row-bound merged tables with a leading `date` column and a
# whitespace-trimmed `Monsternummer`, or NULL when nothing was merged. With
# createDebugList = TRUE the return value is list(merged, debugList), where
# debugList[[d]] holds the (untrimmed) merged table of the d-th date.
readArchive = function(tFileInfo, tFileOvw, createDebugList = FALSE) {
  mergedParts = list()
  tMergeDatesDebugList = list()
  dates = tFileInfo %>%
    filter(!is.na(fileDate)) %>%
    pull(fileDate) %>%
    unique()

  # seq_along() instead of 1:length(): the latter iterates over c(1, 0) when
  # there are no dates. Indexing (rather than `for (x in dates)`) keeps the
  # Date class of the element.
  for (d in seq_along(dates)) {
    currDate = dates[d]
    cat(paste0('\nd = ', d, ', date: ', currDate, '\n'))
    print(tFileOvw %>% filter(fileDate == currDate))

    dateFiles = readDailyFiles(tFileInfo, currDate)
    if (is.null(dateFiles[[1]])) {
      next
    }

    # The separate per-day tables remain available in
    # processed$(ab|st|ms|ts|tm)File; only processed$merged is collected.
    processed = ProcessAndMergeDailyFiles(
      # any_of(): do not fail when the AB file has no spurious 15th column.
      dateFiles[[1]] %>% select(-any_of('...15')),
      dateFiles[[2]],
      dateFiles[[3]],
      dateFiles[[4]],
      dateFiles[[5]]
    )
    if (is.null(processed$merged)) {
      next
    }

    # abFile, stFile and msFile joined
    tMergeDate = processed$merged %>%
      mutate(date = currDate, .before = 1)
    if (createDebugList) {
      tMergeDatesDebugList[[d]] = tMergeDate
    }

    if (nrow(tMergeDate) > 0) {
      # Trim per day, so Monsternummer is character in every part and the
      # accumulated table does not have to be re-trimmed on each iteration.
      mergedParts[[length(mergedParts) + 1]] = tMergeDate %>%
        mutate(Monsternummer = str_trim(Monsternummer))
    }
  }

  # Bind once at the end instead of growing the table row block by row block.
  tMergeDates = NULL
  if (length(mergedParts) > 0) {
    tMergeDates = bind_rows(mergedParts)
  }

  if (createDebugList) {
    return(list(tMergeDates, tMergeDatesDebugList))
  }
  return(tMergeDates)
}