Feather and Parquet files

2026-06-18 07:30:36 +02:00 · 2022-08-26 22:25:15 +02:00
parent 4da32e3d40
commit 3864ab2fb8
48 changed files with 188 additions and 175 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,6 +0,0 @@
-*.dta filter=lfs diff=lfs merge=lfs -text
-*.sas filter=lfs diff=lfs merge=lfs -text
-*.sav filter=lfs diff=lfs merge=lfs -text
-data-raw/*.dta filter=lfs diff=lfs merge=lfs -text
-data-raw/*.sas filter=lfs diff=lfs merge=lfs -text
-data-raw/*.sav filter=lfs diff=lfs merge=lfs -text
--- a/.github/prehooks/post-checkout
+++ b/.github/prehooks/post-checkout
@@ -1,3 +0,0 @@
-#!/bin/sh
-command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting .git/hooks/post-checkout.\n"; exit 2; }
-git lfs post-checkout "$@"
--- a/.github/prehooks/post-commit
+++ b/.github/prehooks/post-commit
@@ -1,3 +0,0 @@
-#!/bin/sh
-command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting .git/hooks/post-commit.\n"; exit 2; }
-git lfs post-commit "$@"
--- a/.github/prehooks/post-merge
+++ b/.github/prehooks/post-merge
@@ -1,3 +0,0 @@
-#!/bin/sh
-command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting .git/hooks/post-merge.\n"; exit 2; }
-git lfs post-merge "$@"
--- a/.github/prehooks/pre-commit
+++ b/.github/prehooks/pre-commit
@@ -5,13 +5,13 @@ echo "Running pre-commit hook..."
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 echo ">>  Updating R documentation..."
 if command -v Rscript > /dev/null; then
-  if [ "$(Rscript -e 'cat(all(c('"'roxygen2'"', '"'pkgload'"') %in% rownames(installed.packages())))')" = "TRUE" ]; then
-    Rscript -e "suppressMessages(roxygen2::roxygenise())"
+  if [ "$(Rscript -e 'cat(all(c('"'pkgload'"', '"'devtools'"', '"'dplyr'"') %in% rownames(installed.packages())))')" = "TRUE" ]; then
+    Rscript -e "source('data-raw/pre-commit-hook.R')"
    currentpkg=`Rscript -e "cat(pkgload::pkg_name())"`
    git add man/*
-    echo ">>  done."
+    git add R/sysdata.rda
  else
-    echo ">>  R packages 'roxygen2' and 'pkgload' are not installed!"
+    echo ">>  R package 'pkgload', 'devtools', or 'dplyr' not installed!"
    currentpkg="your"
  fi
 else
@@ -30,7 +30,7 @@ currenttagfull=`git describe --tags --abbrev=0`
 currenttag=`git describe --tags --abbrev=0 | sed 's/v//'`
 if [ "$currenttag" = "" ]; then
  # there is no tag, so set tag to 0.0.1 and commit index to current count
-  echo ">>  - no git tags found, create some using v(x).(y).(z)"
+  echo ">>  - no git tags found, create one in this format: 'v(x).(y).(z)'!"
  currenttag="0.0.1"
  currentcommit=`git rev-list --count HEAD`
 else
--- a/.github/prehooks/pre-commit.save
+++ b/.github/prehooks/pre-commit.save
@@ -1,71 +0,0 @@
-#!/bin/sh
-
-echo "Running pre-commit hook..."
-
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-echo ">>  Updating R documentation..."
-if command -v Rscript > /dev/null; then
-  if [ "$(Rscript -e 'cat(all(c('"'roxygen2'"', '"'pkgload'"') %in% rownames(installed.packages())))')" = "TRUE" ]; then
-    Rscript -e "suppressMessages(roxygen2::roxygenise())"
-    currentpkg=`Rscript -e "cat(pkgload::pkg_name())"`
-    git add man/*
-    echo ">>  done."
-  else
-    echo ">>  R packages 'roxygen2' and 'pkgload' are not installed!"
-    currentpkg="your"
-  fi
-else
-  echo ">>  R is not available on your system!"
-  currentpkg="your"
-fi
-echo ">>  "
-
-
-# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-echo ">>  Updating semantic versioning and date..."
-
-# get tags from remote, and remove tags not on remote:
-git fetch origin --prune --prune-tags --quiet
-currenttagfull=`git describe --tags --abbrev=0`
-currenttag=`git describe --tags --abbrev=0 | sed 's/v//'`
-if [ "$currenttag" = "" ]; then
-  # there is no tag, so set tag to 0.0.1 and commit index to current count
-  echo ">>  - no git tags found, create some using v(x).(y).(z)"
-  currenttag="0.0.1"
-  currentcommit=`git rev-list --count HEAD`
-else
-  # there is a tag, so base version number on that
-  currentcommit=`git rev-list --count ${currenttagfull}..HEAD`
-  if (( "$currentcommit" == 0 )); then
-    # tag is new, so this must become the version number
-    currentversion="$currenttag"
-  fi
-  echo ">>  - latest tag is '${currenttagfull}', with ${currentcommit} previous commits"
-fi
-if [ "$currentversion" = "" ]; then
-  # combine tag (e.g. 1.2.3) and commit number (like 5) increased by 9000 to indicate beta version
-  currentversion="$currenttag.$((currentcommit + 9001))" # results in e.g. 1.2.3.9005
-fi
-echo ">>  - ${currentpkg} pkg version set to ${currentversion}"
-
-# set version number and date to DESCRIPTION file
-sed -i -- "s/^Version: .*/Version: ${currentversion}/" DESCRIPTION
-sed -i -- "s/^Date: .*/Date: $(date '+%Y-%m-%d')/" DESCRIPTION
-echo ">>  - updated DESCRIPTION"
-# remove leftover on macOS
-rm -f DESCRIPTION--
-# add to commit
-git add DESCRIPTION
-
-# set version number to NEWS file
-if [ -e "NEWS.md" ]; then
-  sed -i -- "1s/.*/# ${currentpkg} ${currentversion}/" NEWS.md
-  echo ">>  - updated NEWS.md"
-  # remove leftover on macOS
-  rm -f NEWS.md--
-  # add to commit
-  git add NEWS.md
-else
-  echo ">>  - no NEWS.md found!"
-fi
-echo ">>  "
--- a/.github/prehooks/pre-push
+++ b/.github/prehooks/pre-push
@@ -1,3 +0,0 @@
-#!/bin/sh
-command -v git-lfs >/dev/null 2>&1 || { echo >&2 "\nThis repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting .git/hooks/pre-push.\n"; exit 2; }
-git lfs pre-push "$@"
--- a/.gitignore
+++ b/.gitignore
@@ -5,7 +5,6 @@ doc
 .Rhistory
 .RData
 .Ruserdata
-AMR.Rproj
 tests/testthat/Rplots.pdf
 inst/doc
 /src/*.o
--- a/AMR.Rproj
+++ b/AMR.Rproj
@@ -0,0 +1,22 @@
+Version: 1.0
+
+RestoreWorkspace: No
+SaveWorkspace: Ask
+AlwaysSaveHistory: Yes
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+
+BuildType: Package
+PackageUseDevtools: Yes
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageBuildArgs: --no-build-vignettes
+PackageCheckArgs: --no-build-vignettes --as-cran
+PackageRoxygenize: rd,collate,namespace
--- a/2
+++ b/2
@@ -1,5 +1,5 @@
 Package: AMR
-Version: 1.8.1.9027
+Version: 1.8.1.9028
 Date: 2022-08-26
 Title: Antimicrobial Resistance Data Analysis
 Description: Functions to simplify and standardise antimicrobial resistance (AMR)
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,11 +1,12 @@
-# AMR 1.8.1.9027
+# AMR 1.8.1.9028

 ### New
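-* EUCAST 2022 and CLSI 2022 guidelines have been added for `as.rsi()`. EUCAST 2022 is now the new default guideline for all MIC and disks diffusion interpretations.
+* EUCAST 2022 and CLSI 2022 guidelines have been added for `as.rsi()`. EUCAST 2022 is now the new default guideline for all MIC and disk diffusion interpretations.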
 * Support for the following languages: Chinese, Greek, Japanese, Polish, Turkish and Ukrainian. The `AMR` package is now available in 16 languages.

 ### Changed
-* Fix for `as.rsi()` on certain EUCAST breakpoints for MIC values
+* Fix for using `as.rsi()` on certain EUCAST breakpoints for MIC values
+* Fix for using `as.rsi()` on `NA` values (e.g. `as.rsi(as.disk(NA), ...)`)
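-* Removed `as.integer()` for MIC values, since MIC are not integer values and running `table()` on MIC values  consequently failed for not being able to retrieve the level position (as that's how normally `as.integer()` on `factor`s work)
+* Removed `as.integer()` for MIC values, since MIC values are not integers and running `table()` on MIC values consequently failed because the level position could not be retrieved (which is how `as.integer()` normally works on `factor`s)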
 * `droplevels()` on MIC will now return a common `factor` at default and will lose the `<mic>` class. Use `droplevels(..., as.mic = TRUE)` to keep the `<mic>` class.
 * Small fix for using `ab_from_text()`
@@ -19,6 +20,9 @@
 ### Other
 * New website to make use of the new Bootstrap 5 and pkgdown v2.0. The website now contains results for all examples and will be automatically regenerated with every change to our repository, using GitHub Actions
 * Added Peter Dutey-Magni and Anton Mymrikov as contributors, to thank them for their valuable input
+* Our data sets are now also continually exported to Apache Feather and Apache Parquet formats
+* Set up Git Large File Storage (Git LFS) for the large SAS, SPSS and Stata file formats
+

 # `AMR` 1.8.1

--- a/R/aa_globals.R
+++ b/R/aa_globals.R
@@ -24,7 +24,7 @@
 # ==================================================================== #

 # add new version numbers here, and add the rules themselves to "data-raw/eucast_rules.tsv" and rsi_translation
-# (sourcing "data-raw/_internals.R" will process the TSV file)
+# (sourcing "data-raw/pre-commit-hook.R" will process the TSV file)
 EUCAST_VERSION_BREAKPOINTS <- list("11.0" = list(version_txt = "v11.0",
                                                 year = 2021, 
                                                 title = "'EUCAST Clinical Breakpoint Tables'",
--- a/R/aa_helper_functions.R
+++ b/R/aa_helper_functions.R
@@ -596,7 +596,7 @@ create_eucast_ab_documentation <- function() {
  ab <- character()
  for (val in x) {
    if (paste0("AB_", val) %in% ls(envir = asNamespace("AMR"))) {
-      # antibiotic group names, as defined in data-raw/_internals.R, such as `CARBAPENEMS`
+      # antibiotic group names, as defined in data-raw/pre-commit-hook.R, such as `CARBAPENEMS`
      val <- eval(parse(text = paste0("AB_", val)), envir = asNamespace("AMR"))
    } else if (val %in% AB_lookup$ab) {
      # separate drugs, such as `AMX`
--- a/R/ab_selectors.R
+++ b/R/ab_selectors.R
@@ -502,7 +502,7 @@ ab_select_exec <- function(function_name,
  }
  
  if (is.null(ab_class_args)) {
-    # their upper case equivalent are vectors with class <ab>, created in data-raw/_internals.R
+    # their upper case equivalent are vectors with class <ab>, created in data-raw/pre-commit-hook.R
    # carbapenems() gets its codes from AMR:::AB_CARBAPENEMS
    abx <- get(paste0("AB_", toupper(function_name)), envir = asNamespace("AMR"))  
    ab_group <- function_name
--- a/R/eucast_rules.R
+++ b/R/eucast_rules.R
@@ -543,7 +543,7 @@ eucast_rules <- function(x,
    # this allows: eucast_rules(x, eucast_rules_df = AMR:::EUCAST_RULES_DF %>% filter(is.na(have_these_values)))
    eucast_rules_df <- list(...)$eucast_rules_df
  } else {
-    # otherwise internal data file, created in data-raw/_internals.R
+    # otherwise internal data file, created in data-raw/pre-commit-hook.R
    eucast_rules_df <- EUCAST_RULES_DF
  }
  
--- a/R/guess_ab_col.R
+++ b/R/guess_ab_col.R
@@ -311,7 +311,7 @@ get_ab_from_namespace <- function(x, cols_ab) {
  x_new <- character()
  for (val in x) {
    if (paste0("AB_", val) %in% ls(envir = asNamespace("AMR"))) {
-      # antibiotic group names, as defined in data-raw/_internals.R, such as `AB_CARBAPENEMS`
+      # antibiotic group names, as defined in data-raw/pre-commit-hook.R, such as `AB_CARBAPENEMS`
      val <- eval(parse(text = paste0("AB_", val)), envir = asNamespace("AMR"))
    } else if (val %in% AB_lookup$ab) {
      # separate drugs, such as `AMX`
--- a/R/rsi.R
+++ b/R/rsi.R
@@ -597,18 +597,18 @@ get_guideline <- function(guideline, reference_data) {
  guideline_param
 }

-as_rsi_method <- function(method_short = "mic",
-                          method_long = "MIC values",
-                          x = x,
-                          mo = NULL, 
-                          ab = deparse(substitute(x)), 
-                          guideline = "EUCAST", 
-                          uti = FALSE,
-                          conserve_capped_values = FALSE,
-                          add_intrinsic_resistance = FALSE,
-                          reference_data = AMR::rsi_translation,
+as_rsi_method <- function(method_short,
+                          method_long,
+                          x,
+                          mo,
+                          ab,
+                          guideline,
+                          uti,
+                          conserve_capped_values,
+                          add_intrinsic_resistance,
+                          reference_data,
                          ...) {
-  meet_criteria(x)
+  meet_criteria(x, allow_NA = TRUE)
  meet_criteria(mo, allow_class = c("mo", "character"), allow_NULL = TRUE)
  meet_criteria(ab, allow_class = c("ab", "character"))
  meet_criteria(guideline, allow_class = "character", has_length = 1)
--- a/R/sysdata.rda
+++ b/R/sysdata.rda
--- a/R/zzz.R
+++ b/R/zzz.R
@@ -133,7 +133,7 @@ create_MO_lookup <- function() {
    MO_lookup$fullname_lower <- MO_FULLNAME_LOWER
  } else {
    MO_lookup$fullname_lower <- ""
-    warning("MO table updated - Run: source(\"data-raw/_internals.R\")", call. = FALSE)
+    warning("MO table updated - Run: source(\"data-raw/pre-commit-hook.R\")", call. = FALSE)
  }
  
  # add a column with only "e coli" like combinations
--- a/data-raw/antibiotics.feather
+++ b/data-raw/antibiotics.feather
--- a/data-raw/antibiotics.parquet
+++ b/data-raw/antibiotics.parquet
--- a/data-raw/antibiotics.rds
+++ b/data-raw/antibiotics.rds
--- a/data-raw/antivirals.feather
+++ b/data-raw/antivirals.feather
--- a/data-raw/antivirals.parquet
+++ b/data-raw/antivirals.parquet
--- a/data-raw/antivirals.rds
+++ b/data-raw/antivirals.rds
--- a/data-raw/dosage.feather
+++ b/data-raw/dosage.feather
--- a/data-raw/dosage.parquet
+++ b/data-raw/dosage.parquet
--- a/data-raw/intrinsic_resistant.feather
+++ b/data-raw/intrinsic_resistant.feather
--- a/data-raw/intrinsic_resistant.parquet
+++ b/data-raw/intrinsic_resistant.parquet
--- a/data-raw/microorganisms.dta
+++ b/data-raw/microorganisms.dta
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3b737ed331dd70a51aabf8203faadaa3f61e67c2f2cdbfce9c1b4aca7b61df93
-size 28881867
+oid sha256:0d69888efa84f05de1de460039fbd137439f76fba0e1a98f605df77a0e3b0ea4
+size 65184439
--- a/data-raw/microorganisms.feather
+++ b/data-raw/microorganisms.feather
--- a/data-raw/microorganisms.md5
+++ b/data-raw/microorganisms.md5
@@ -0,0 +1 @@
+ec28bed91f4b254e2b33f30b77198325
--- a/data-raw/microorganisms.old.feather
+++ b/data-raw/microorganisms.old.feather
--- a/data-raw/microorganisms.old.parquet
+++ b/data-raw/microorganisms.old.parquet
--- a/data-raw/microorganisms.parquet
+++ b/data-raw/microorganisms.parquet
--- a/data-raw/microorganisms.rds
+++ b/data-raw/microorganisms.rds
--- a/data-raw/microorganisms.sas
+++ b/data-raw/microorganisms.sas
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c3c78b0121a7adc97218825b701ab157e2d0c01400d797fa5fd40b7abf27d79f
-size 32219136
+oid sha256:2253a2f9b918972e77af08eec81565219510c10dba4bd957bca1580e4392033e
+size 72474624
--- a/data-raw/microorganisms.sav
+++ b/data-raw/microorganisms.sav
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4970b36edc301a65f2a2494da93419e2e116302d029ba5a49a4fac82cef8e068
-size 17100983
+oid sha256:cbe379d131f50308af69d73f5cf74a14b92d6cf892a9b11fd02eaa48bf5b5657
+size 21775629
--- a/data-raw/microorganisms.xlsx
+++ b/data-raw/microorganisms.xlsx
--- a/data-raw/pre-commit-hook.R
+++ b/data-raw/pre-commit-hook.R
@@ -9,7 +9,7 @@
 # (c) 2018-2022 Berends MS, Luz CF et al.                              #
 # Developed at the University of Groningen, the Netherlands, in        #
 # collaboration with non-profit organisations Certe Medical            #
-# Diagnostics & Advice, and University Medical Center Groningen.       # 
+# Diagnostics & Advice, and University Medical Center Groningen.       #
 #                                                                      #
 # This R package is free software; you can freely use and distribute   #
 # it for both personal and commercial purposes under the terms of the  #
@@ -24,7 +24,7 @@
 # ==================================================================== #

 # Run this file to update the package using:
-# source("data-raw/_internals.R")
+# source("data-raw/pre-commit-hook.R")

 library(dplyr, warn.conflicts = FALSE)
 devtools::load_all(quiet = TRUE)
@@ -42,19 +42,38 @@ EUCAST_RULES_DF <- utils::read.delim(file = "data-raw/eucast_rules.tsv",
                                     stringsAsFactors = FALSE,
                                     header = TRUE,
                                     strip.white = TRUE,
-                                     na = c(NA, "", NULL)) %>% 
+                                     na = c(NA, "", NULL)) %>%
  # take the order of the reference.rule_group column in the original data file
  mutate(reference.rule_group = factor(reference.rule_group,
                                       levels = unique(reference.rule_group),
                                       ordered = TRUE),
-         sorting_rule = ifelse(grepl("^Table", reference.rule, ignore.case = TRUE), 1, 2)) %>% 
+         sorting_rule = ifelse(grepl("^Table", reference.rule, ignore.case = TRUE), 1, 2)) %>%
  arrange(reference.rule_group,
          reference.version,
          sorting_rule,
-          reference.rule) %>% 
-  mutate(reference.rule_group = as.character(reference.rule_group)) %>% 
+          reference.rule) %>%
+  mutate(reference.rule_group = as.character(reference.rule_group)) %>%
  select(-sorting_rule)

+TRANSLATIONS <- utils::read.delim(file = "data-raw/translations.tsv",
+                                  sep = "\t",
+                                  stringsAsFactors = FALSE,
+                                  header = TRUE,
+                                  blank.lines.skip = TRUE,
+                                  fill = TRUE,
+                                  strip.white = TRUE,
+                                  encoding = "UTF-8",
+                                  fileEncoding = "UTF-8",
+                                  na.strings = c(NA, "", NULL),
+                                  allowEscapes = TRUE, # else "\\1" will be imported as "\\\\1"
+                                  quote = "")
+
+LANGUAGES_SUPPORTED_NAMES <- c(list(en = list(exonym = "English", endonym = "English")),
+                               lapply(TRANSLATIONS[, which(nchar(colnames(TRANSLATIONS)) == 2)],
+                                      function(x) list(exonym = x[1], endonym = x[2])))
+
+LANGUAGES_SUPPORTED <- names(LANGUAGES_SUPPORTED_NAMES)
+
 # vectors of CoNS and CoPS, improves speed in as.mo()
 create_species_cons_cops <- function(type = c("CoNS", "CoPS")) {
  # Determination of which staphylococcal species are CoNS/CoPS according to:
@@ -66,7 +85,7 @@ create_species_cons_cops <- function(type = c("CoNS", "CoPS")) {
  MO_staph <- MO_staph[which(MO_staph$genus == "Staphylococcus"), , drop = FALSE]
  if (type == "CoNS") {
    MO_staph[which(MO_staph$species %in% c("coagulase-negative", "argensis", "arlettae",
-                                           "auricularis", "borealis", "caeli", "capitis", "caprae", 
+                                           "auricularis", "borealis", "caeli", "capitis", "caprae",
                                           "carnosus", "casei", "chromogenes", "cohnii", "condimenti",
                                           "croceilyticus",
                                           "debuckii", "devriesei", "edaphicus", "epidermidis",
@@ -99,7 +118,7 @@ create_species_cons_cops <- function(type = c("CoNS", "CoPS")) {
 create_MO_fullname_lower <- function() {
  MO_lookup <- AMR::microorganisms
  # use this paste instead of `fullname` to work with Viridans Group Streptococci, etc.
-  MO_lookup$fullname_lower <- tolower(trimws(paste(MO_lookup$genus, 
+  MO_lookup$fullname_lower <- tolower(trimws(paste(MO_lookup$genus,
                                                   MO_lookup$species,
                                                   MO_lookup$subspecies)))
  ind <- MO_lookup$genus == "" | grepl("^[(]unknown ", MO_lookup$fullname, perl = TRUE)
@@ -175,7 +194,7 @@ create_AB_lookup <- function() {
  AB_lookup$generalised_synonyms <- lapply(AB_lookup$synonyms, generalise_antibiotic_name)
  AB_lookup$generalised_abbreviations <- lapply(AB_lookup$abbreviations, generalise_antibiotic_name)
  AB_lookup$generalised_loinc <- lapply(AB_lookup$loinc, generalise_antibiotic_name)
-  AB_lookup$generalised_all <- unname(lapply(as.list(as.data.frame(t(AB_lookup[, 
+  AB_lookup$generalised_all <- unname(lapply(as.list(as.data.frame(t(AB_lookup[,
                                                                               c("ab", "atc", "cid", "name",
                                                                                 colnames(AB_lookup)[colnames(AB_lookup) %like% "generalised"]),
                                                                               drop = FALSE]),
@@ -189,7 +208,10 @@ create_AB_lookup <- function() {
 AB_LOOKUP <- create_AB_lookup()

 # Export to package as internal data ----
-usethis::use_data(EUCAST_RULES_DF, 
+usethis::use_data(EUCAST_RULES_DF,
+                  TRANSLATIONS,
+                  LANGUAGES_SUPPORTED_NAMES,
+                  LANGUAGES_SUPPORTED,
                  MO_CONS,
                  MO_COPS,
                  MO_STREP_ABCG,
@@ -232,23 +254,35 @@ usethis::use_data(EUCAST_RULES_DF,

 # Export data sets to the repository in different formats -----------------

+for (pkg in c("haven", "openxlsx", "arrow")) {
+  if (!pkg %in% rownames(utils::installed.packages())) {
+    message("NOTE: package '", pkg, "' not installed! Ignoring export where this package is required.")
+  }
+}
+if ("digest" %in% rownames(utils::installed.packages())) {
+  md5 <- function(object) digest::digest(object, "md5")
+} else {
+  # will write all files anyway, since MD5 hash cannot be determined
+  md5 <- function(object) "unknown-md5-hash"
+}
+
 write_md5 <- function(object) {
  conn <- file(paste0("data-raw/", deparse(substitute(object)), ".md5"))
-  writeLines(digest::digest(object, "md5"), conn)
+  writeLines(md5(object), conn)
  close(conn)
 }
 changed_md5 <- function(object) {
  tryCatch({
    conn <- file(paste0("data-raw/", deparse(substitute(object)), ".md5"))
-    compared <- digest::digest(object, "md5") != readLines(con = conn)
+    compared <- md5(object) != readLines(con = conn)
    close(conn)
    compared
  }, error = function(e) TRUE)
 }

 # give official names to ABs and MOs
-rsi <- AMR::rsi_translation %>% 
-  mutate(mo_name = mo_name(mo, language = NULL), .after = mo) %>% 
+rsi <- rsi_translation %>%
+  mutate(mo_name = mo_name(mo, language = NULL), .after = mo) %>%
  mutate(ab_name = ab_name(ab, language = NULL), .after = ab)
 if (changed_md5(rsi)) {
  usethis::ui_info(paste0("Saving {usethis::ui_value('rsi_translation')} to {usethis::ui_value('/data-raw/')}"))
@@ -259,18 +293,25 @@ if (changed_md5(rsi)) {
  try(haven::write_sav(rsi, "data-raw/rsi_translation.sav"), silent = TRUE)
  try(haven::write_dta(rsi, "data-raw/rsi_translation.dta"), silent = TRUE)
  try(openxlsx::write.xlsx(rsi, "data-raw/rsi_translation.xlsx"), silent = TRUE)
+  try(arrow::write_feather(rsi, "data-raw/rsi_translation.feather"), silent = TRUE)
+  try(arrow::write_parquet(rsi, "data-raw/rsi_translation.parquet"), silent = TRUE)
 }

-mo <- dplyr::mutate_if(microorganisms, ~!is.numeric(.), as.character)
-if (changed_md5(mo)) {
+if (changed_md5(microorganisms)) {
  usethis::ui_info(paste0("Saving {usethis::ui_value('microorganisms')} to {usethis::ui_value('/data-raw/')}"))
-  write_md5(mo)
-  try(saveRDS(mo, "data-raw/microorganisms.rds", version = 2, compress = "xz"), silent = TRUE)
+  write_md5(microorganisms)
+  try(saveRDS(microorganisms, "data-raw/microorganisms.rds", version = 2, compress = "xz"), silent = TRUE)
  try(write.table(mo, "data-raw/microorganisms.txt", sep = "\t", na = "", row.names = FALSE), silent = TRUE)
-  try(haven::write_sas(dplyr::select(mo, -snomed), "data-raw/microorganisms.sas"), silent = TRUE)
-  try(haven::write_sav(dplyr::select(mo, -snomed), "data-raw/microorganisms.sav"), silent = TRUE)
-  try(haven::write_dta(dplyr::select(mo, -snomed), "data-raw/microorganisms.dta"), silent = TRUE)
-  try(openxlsx::write.xlsx(dplyr::select(mo, -snomed), "data-raw/microorganisms.xlsx"), silent = TRUE)
+  max_50_snomed <- sapply(microorganisms$snomed, function(x) paste(x[seq_len(min(50, length(x), na.rm = TRUE))], collapse = " "))
+  mo <- microorganisms
+  mo$snomed <- max_50_snomed
+  mo <- dplyr::mutate_if(mo, ~!is.numeric(.), as.character)
+  try(haven::write_sas(mo, "data-raw/microorganisms.sas"), silent = TRUE)
+  try(haven::write_sav(mo, "data-raw/microorganisms.sav"), silent = TRUE)
+  try(haven::write_dta(mo, "data-raw/microorganisms.dta"), silent = TRUE)
+  try(openxlsx::write.xlsx(mo, "data-raw/microorganisms.xlsx"), silent = TRUE)
+  try(arrow::write_feather(microorganisms, "data-raw/microorganisms.feather"), silent = TRUE)
+  try(arrow::write_parquet(microorganisms, "data-raw/microorganisms.parquet"), silent = TRUE)
 }

 if (changed_md5(microorganisms.old)) {
@@ -282,30 +323,36 @@ if (changed_md5(microorganisms.old)) {
  try(haven::write_sav(microorganisms.old, "data-raw/microorganisms.old.sav"), silent = TRUE)
  try(haven::write_dta(microorganisms.old, "data-raw/microorganisms.old.dta"), silent = TRUE)
  try(openxlsx::write.xlsx(microorganisms.old, "data-raw/microorganisms.old.xlsx"), silent = TRUE)
+  try(arrow::write_feather(microorganisms.old, "data-raw/microorganisms.old.feather"), silent = TRUE)
+  try(arrow::write_parquet(microorganisms.old, "data-raw/microorganisms.old.parquet"), silent = TRUE)
 }

 ab <- dplyr::mutate_if(antibiotics, ~!is.numeric(.), as.character)
 if (changed_md5(ab)) {
  usethis::ui_info(paste0("Saving {usethis::ui_value('antibiotics')} to {usethis::ui_value('/data-raw/')}"))
  write_md5(ab)
-  try(saveRDS(ab, "data-raw/antibiotics.rds", version = 2, compress = "xz"), silent = TRUE)
-  try(write.table(ab, "data-raw/antibiotics.txt", sep = "\t", na = "", row.names = FALSE), silent = TRUE)
+  try(saveRDS(antibiotics, "data-raw/antibiotics.rds", version = 2, compress = "xz"), silent = TRUE)
+  try(write.table(antibiotics, "data-raw/antibiotics.txt", sep = "\t", na = "", row.names = FALSE), silent = TRUE)
  try(haven::write_sas(ab, "data-raw/antibiotics.sas"), silent = TRUE)
  try(haven::write_sav(ab, "data-raw/antibiotics.sav"), silent = TRUE)
  try(haven::write_dta(ab, "data-raw/antibiotics.dta"), silent = TRUE)
  try(openxlsx::write.xlsx(ab, "data-raw/antibiotics.xlsx"), silent = TRUE)
+  try(arrow::write_feather(antibiotics, "data-raw/antibiotics.feather"), silent = TRUE)
+  try(arrow::write_parquet(antibiotics, "data-raw/antibiotics.parquet"), silent = TRUE)
 }

 av <- dplyr::mutate_if(antivirals, ~!is.numeric(.), as.character)
 if (changed_md5(av)) {
  usethis::ui_info(paste0("Saving {usethis::ui_value('antivirals')} to {usethis::ui_value('/data-raw/')}"))
  write_md5(av)
-  try(saveRDS(av, "data-raw/antivirals.rds", version = 2, compress = "xz"), silent = TRUE)
+  try(saveRDS(antivirals, "data-raw/antivirals.rds", version = 2, compress = "xz"), silent = TRUE)
  try(write.table(av, "data-raw/antivirals.txt", sep = "\t", na = "", row.names = FALSE), silent = TRUE)
  try(haven::write_sas(av, "data-raw/antivirals.sas"), silent = TRUE)
  try(haven::write_sav(av, "data-raw/antivirals.sav"), silent = TRUE)
  try(haven::write_dta(av, "data-raw/antivirals.dta"), silent = TRUE)
  try(openxlsx::write.xlsx(av, "data-raw/antivirals.xlsx"), silent = TRUE)
+  try(arrow::write_feather(antivirals, "data-raw/antivirals.feather"), silent = TRUE)
+  try(arrow::write_parquet(antivirals, "data-raw/antivirals.parquet"), silent = TRUE)
 }

 # give official names to ABs and MOs
@@ -321,6 +368,8 @@ if (changed_md5(intrinsicR)) {
  try(haven::write_sav(intrinsicR, "data-raw/intrinsic_resistant.sav"), silent = TRUE)
  try(haven::write_dta(intrinsicR, "data-raw/intrinsic_resistant.dta"), silent = TRUE)
  try(openxlsx::write.xlsx(intrinsicR, "data-raw/intrinsic_resistant.xlsx"), silent = TRUE)
+  try(arrow::write_feather(intrinsicR, "data-raw/intrinsic_resistant.feather"), silent = TRUE)
+  try(arrow::write_parquet(intrinsicR, "data-raw/intrinsic_resistant.parquet"), silent = TRUE)
 }

 if (changed_md5(dosage)) {
@@ -332,6 +381,8 @@ if (changed_md5(dosage)) {
  try(haven::write_sav(dosage, "data-raw/dosage.sav"), silent = TRUE)
  try(haven::write_dta(dosage, "data-raw/dosage.dta"), silent = TRUE)
  try(openxlsx::write.xlsx(dosage, "data-raw/dosage.xlsx"), silent = TRUE)
+  try(arrow::write_feather(dosage, "data-raw/dosage.feather"), silent = TRUE)
+  try(arrow::write_parquet(dosage, "data-raw/dosage.parquet"), silent = TRUE)
 }

 reset_AMR_locale()
@@ -340,3 +391,6 @@ reset_AMR_locale()
 current_globalenv <- ls(envir = globalenv())
 rm(list = current_globalenv[!current_globalenv %in% old_globalenv])
 rm(current_globalenv)
+
+devtools::load_all(quiet = TRUE)
+devtools::document()
--- a/data-raw/reproduction_of_microorganisms.R
+++ b/data-raw/reproduction_of_microorganisms.R
@@ -901,7 +901,7 @@ usethis::use_data(rsi_translation, overwrite = TRUE, version = 2)
 usethis::use_data(microorganisms.codes, overwrite = TRUE, version = 2)
 # saveRDS(microorganisms.translation, file = "data-raw/microorganisms.translation.rds", version = 2)
 # to save microorganisms.translation internally to the package
-# source("data-raw/_internals.R")
+# source("data-raw/pre-commit-hook.R")

 # load new data sets again
 devtools::load_all(".")
--- a/data-raw/reproduction_of_microorganisms_update.R
+++ b/data-raw/reproduction_of_microorganisms_update.R
@@ -444,7 +444,7 @@ rm(intrinsic_resistant)

 # load new data sets again
 devtools::load_all(".")
-source("data-raw/_internals.R")
+source("data-raw/pre-commit-hook.R")
 devtools::load_all(".")


--- a/data-raw/rsi_translation.feather
+++ b/data-raw/rsi_translation.feather
--- a/data-raw/rsi_translation.parquet
+++ b/data-raw/rsi_translation.parquet
--- a/data-raw/rsi_translation.rds
+++ b/data-raw/rsi_translation.rds
--- a/vignettes/AMR_intro.png
+++ b/vignettes/AMR_intro.png
--- a/vignettes/datasets.Rmd
+++ b/vignettes/datasets.Rmd
@@ -13,7 +13,7 @@ editor_options:
  chunk_output_type: console
 ---

-```{r setup, include = FALSE, results = 'markup'}
+```{r setup, include = FALSE, results = "markup"}
 knitr::opts_chunk$set(
  warning = FALSE,
  collapse = TRUE,
@@ -40,30 +40,41 @@ download_txt <- function(filename) {
                ". Find more info about the structure of this data set [here](https://msberends.github.io/AMR/reference/", ifelse(filename == "antivirals", "antibiotics", filename), ".html).\n")
  github_base <- "https://github.com/msberends/AMR/raw/main/data-raw/"
  filename <- paste0("../data-raw/", filename)
-  txt <- paste0(filename, ".txt")
  rds <- paste0(filename, ".rds")
+  txt <- paste0(filename, ".txt")
+  excel <- paste0(filename, ".xlsx")
+  feather <- paste0(filename, ".feather")
+  parquet <- paste0(filename, ".parquet")
+  sas <- paste0(filename, ".sas")
  spss <- paste0(filename, ".sav")
  stata <- paste0(filename, ".dta")
-  sas <- paste0(filename, ".sas")
-  excel <- paste0(filename, ".xlsx")
-  create_txt <- function(filename, type, software) {
-    paste0("* Download as [", software, " file](", github_base, filename, ") (", AMR:::formatted_filesize(filename), ")  \n")
+  create_txt <- function(filename, type, software, exists) {
+    if (isTRUE(exists)) {
+      paste0("* Download as [", software, "](", github_base, filename, ") (",
+             AMR:::formatted_filesize(filename), ")  \n")
+    } else {
+      paste0("* *(unavailable as ", software, ")*\n")
+    }
  }
-
+  
  if (any(file.exists(rds),
-          file.exists(excel),
          file.exists(txt),
+          file.exists(excel),
+          file.exists(feather),
+          file.exists(parquet),
          file.exists(sas),
          file.exists(spss),
          file.exists(stata))) {
-    msg <- c(msg, "\n**Direct download links:**\n\n")
+    msg <- c(msg, "\n**Direct download links:**\n\n",
+             create_txt(rds, "rds", "original R Data Structure (RDS) file", file.exists(rds)),
+             create_txt(txt, "txt", "tab-separated text file", file.exists(txt)),
+             create_txt(excel, "xlsx", "Microsoft Excel workbook", file.exists(excel)),
+             create_txt(feather, "feather", "Apache Feather file", file.exists(feather)),
+             create_txt(parquet, "parquet", "Apache Parquet file", file.exists(parquet)),
+             create_txt(sas, "sas", "SAS data file", file.exists(sas)),
+             create_txt(spss, "sav", "IBM SPSS Statistics data file", file.exists(spss)),
+             create_txt(stata, "dta", "Stata DTA file", file.exists(stata)))
  }
-  if (file.exists(rds)) msg <- c(msg, create_txt(rds, "rds", "R"))
-  if (file.exists(excel)) msg <- c(msg, create_txt(excel, "xlsx", "Excel"))
-  if (file.exists(txt)) msg <- c(msg, create_txt(txt, "txt", "plain text"))
-  if (file.exists(sas)) msg <- c(msg, create_txt(sas, "sas", "SAS"))
-  if (file.exists(spss)) msg <- c(msg, create_txt(spss, "sav", "SPSS"))
-  if (file.exists(stata)) msg <- c(msg, create_txt(stata, "dta", "Stata"))
  paste0(msg, collapse = "")
 }

@@ -87,14 +98,13 @@ print_df <- function(x, rows = 6) {
    }) %>%
    knitr::kable(align = "c")
 }
-
 ```

-All reference data (about microorganisms, antibiotics, R/SI interpretation, EUCAST rules, etc.) in this `AMR` package are reliable, up-to-date and freely available. We continually export our data sets to formats for use in R, SPSS, SAS, Stata and Excel. We also supply  tab separated files that are machine-readable and suitable for input in any software program, such as laboratory information systems. 
+All reference data (about microorganisms, antibiotics, R/SI interpretation, EUCAST rules, etc.) in this `AMR` package are reliable, up-to-date and freely available. We continually export our data sets to formats for use in R, MS Excel, Apache Feather, Apache Parquet, SPSS, SAS, and Stata. We also provide tab-separated text files that are machine-readable and suitable for input in any software program, such as laboratory information systems. 

 On this page, we explain how to download them and how the structure of the data sets look like. 

-## Microorganisms (currently accepted names)
+## `microorganisms`: Microbial Taxonomy (currently accepted names)

 `r structure_txt(microorganisms)`

@@ -102,6 +112,8 @@ This data set is in R available as `microorganisms`, after you load the `AMR` pa

 `r download_txt("microorganisms")`

+**NOTE: The exported files for Excel, SAS, SPSS and Stata contain only the first 50 SNOMED codes per record, as their file size would otherwise exceed 100 MB, the file size limit of GitHub.** Advice? Use R instead.
+
 ### Source

 Our full taxonomy of microorganisms is based on the authoritative and comprehensive:
@@ -130,7 +142,7 @@ microorganisms %>%
  print_df()
 ```

-## Microorganisms (previously accepted names)
+## `microorganisms.old`: Microbial Taxonomy (previously accepted names)

 `r structure_txt(microorganisms.old)`

@@ -158,7 +170,7 @@ microorganisms.old %>%
 ```


-## Antibiotic agents
+## `antibiotics`: Antibiotic Agents

 `r structure_txt(antibiotics)`

@@ -183,7 +195,7 @@ antibiotics %>%
 ```


-## Antiviral agents
+## `antivirals`: Antiviral Agents

 `r structure_txt(antivirals)`

@@ -205,7 +217,7 @@ antivirals %>%
  print_df()
 ```

-## Interpretation from MIC values / disk diameters to R/SI
+## `rsi_translation`: Interpretation from MIC values / disk diameters to R/SI

 `r structure_txt(rsi_translation)`

@@ -227,7 +239,7 @@ rsi_translation %>%
 ```


-## Intrinsic bacterial resistance
+## `intrinsic_resistant`: Intrinsic Bacterial Resistance

 `r structure_txt(intrinsic_resistant)`

@@ -253,7 +265,7 @@ intrinsic_resistant %>%
 ```


-## Dosage guidelines from EUCAST
+## `dosage`: Dosage Guidelines from EUCAST

 `r structure_txt(dosage)`

--- a/vignettes/welcome_to_AMR.Rmd
+++ b/vignettes/welcome_to_AMR.Rmd
@@ -22,15 +22,19 @@ knitr::opts_chunk$set(
 )
 ```

-Note: to keep the package size as small as possible, we only included this vignette on CRAN. You can read more vignettes on our website about how to conduct AMR data analysis, determine MDRO's, find explanation of EUCAST rules, and much more: <https://msberends.github.io/AMR/articles/>.
+Note: to keep the package size as small as possible, we only included this vignette on CRAN. You can read more vignettes on our website about how to conduct AMR data analysis, determine MDROs, find explanations of EUCAST rules, and much more: <https://msberends.github.io/AMR/articles/>.

 ----

-`AMR` is a free, open-source and independent R package (see [Copyright](https://msberends.github.io/AMR/#copyright)) to simplify the analysis and prediction of Antimicrobial Resistance (AMR) and to work with microbial and antimicrobial data and properties, by using evidence-based methods. **Our aim is to provide a standard** for clean and reproducible antimicrobial resistance data analysis, that can therefore empower epidemiological analyses to continuously enable surveillance and treatment evaluation in any setting.
+The `AMR` package is a [free and open-source](https://msberends.github.io/AMR/#copyright) R package with [zero dependencies](https://en.wikipedia.org/wiki/Dependency_hell) to simplify the analysis and prediction of Antimicrobial Resistance (AMR) and to work with microbial and antimicrobial data and properties, by using evidence-based methods. **Our aim is to provide a standard** for clean and reproducible AMR data analysis, that can therefore empower epidemiological analyses to continuously enable surveillance and treatment evaluation in any setting.
+
+```{r, echo = FALSE, out.width = "555px"}
+knitr::include_graphics("AMR_intro.png")
+```

 After installing this package, R knows `r AMR:::format_included_data_number(AMR::microorganisms)` distinct microbial species and all `r AMR:::format_included_data_number(rbind(AMR::antibiotics[, "atc", drop = FALSE], AMR::antivirals[, "atc", drop = FALSE]))` antibiotic, antimycotic and antiviral drugs by name and code (including ATC, EARS-Net, PubChem, LOINC and SNOMED CT), and knows all about valid R/SI and MIC values. It supports any data format, including WHONET/EARS-Net data.

-The `AMR` package is available in Danish, Dutch, English, French, German, Italian, Portuguese, Russian, Spanish and Swedish. Antimicrobial drug (group) names and colloquial microorganism names are provided in these languages.
+The `AMR` package is available in English, Chinese, Danish, Dutch, French, German, Greek, Italian, Japanese, Polish, Portuguese, Russian, Spanish, Swedish, Turkish and Ukrainian. Antimicrobial drug (group) names and colloquial microorganism names are provided in these languages.

 This package is fully independent of any other R package and works on Windows, macOS and Linux with all versions of R since R-3.0 (April 2013). **It was designed to work in any setting, including those with very limited resources**. Since its first public release in early 2018, this package has been downloaded from more than 175 countries.

@@ -56,3 +60,9 @@ This package can be used for:
 All reference data sets (about microorganisms, antibiotics, R/SI interpretation, EUCAST rules, etc.) in this `AMR` package are publicly and freely available. We continually export our data sets to formats for use in R, SPSS, SAS, Stata and Excel. We also supply flat files that are machine-readable and suitable for input in any software program, such as laboratory information systems. Please find [all download links on our website](https://msberends.github.io/AMR/articles/datasets.html), which is automatically updated with every code change.

 This R package was created for both routine data analysis and academic research at the Faculty of Medical Sciences of the [University of Groningen](https://www.rug.nl), in collaboration with non-profit organisations [Certe Medical Diagnostics and Advice Foundation](https://www.certe.nl) and [University Medical Center Groningen](https://www.umcg.nl). This R package formed the basis of two PhD theses ([DOI 10.33612/diss.177417131](https://doi.org/10.33612/diss.177417131) and [DOI 10.33612/diss.192486375](https://doi.org/10.33612/diss.192486375)) but is actively and durably maintained (see [changelog)](https://msberends.github.io/AMR/news/index.html)) by two public healthcare organisations in the Netherlands.
+
+----
+
+<small>
+This AMR package for R is free, open-source software and licensed under the [GNU General Public License v2.0 (GPL-2)](https://msberends.github.io/AMR/LICENSE-text.html). These requirements are consequently legally binding: modifications must be released under the same license when distributing the package, changes made to the code must be documented, source code must be made available when the package is distributed, and a copy of the license and copyright notice must be included with the package.
+</small>