improvement for forecasting resistance

2026-07-13 05:50:55 +02:00 · 2018-07-26 16:30:42 +02:00
parent d9e204031d
commit 498e88b5cf
13 changed files with 438 additions and 145 deletions
--- a/R/atc.R
+++ b/R/atc.R
@@ -267,33 +267,33 @@ abname <- function(abcode, from = c("guess", "atc", "molis", "umcg"), to = 'offi
  }

  abcode <- as.character(abcode)
+  abcode.bak <- abcode

  for (i in 1:length(abcode)) {
-    drug <- abcode[i]
-    if (!grepl('+', drug, fixed = TRUE) & !grepl(' en ', drug, fixed = TRUE)) {
+    abcode[i] <- abcode[i]
+    if (!grepl('+', abcode[i], fixed = TRUE) & !grepl(' en ', abcode[i], fixed = TRUE)) {
      # only 1 drug
-      if (drug %in% (antibiotics %>% pull(from))) {
+      if (abcode[i] %in% (antibiotics %>% pull(from))) {
        abcode[i] <-
          antibiotics %>%
-          filter(.[, from] == drug) %>%
+          filter(.[, from] == abcode[i]) %>%
          select(to) %>%
          slice(1) %>%
          as.character()
      } else {
        # not found
-        warning('Code "', drug, '" not found in antibiotics list.', call. = FALSE)
        abcode[i] <- NA
      }
    } else {
      # more than 1 drug
-      if (grepl('+', drug, fixed = TRUE)) {
-        drug.group <-
-          strsplit(drug, '+', fixed = TRUE) %>%
+      if (grepl('+', abcode[i], fixed = TRUE)) {
+        abcode.group <-
+          strsplit(abcode[i], '+', fixed = TRUE) %>%
          unlist() %>%
          trimws('both')
-      } else if (grepl(' en ', drug, fixed = TRUE)) {
-        drug.group <-
-          strsplit(drug, ' en ', fixed = TRUE) %>%
+      } else if (grepl(' en ', abcode[i], fixed = TRUE)) {
+        abcode.group <-
+          strsplit(abcode[i], ' en ', fixed = TRUE) %>%
          unlist() %>%
          trimws('both')
      } else {
@@ -302,18 +302,29 @@ abname <- function(abcode, from = c("guess", "atc", "molis", "umcg"), to = 'offi
        next
      }

-      for (j in 1:length(drug.group)) {
-        drug.group[j] <-
+      for (j in 1:length(abcode.group)) {
+        abcode.group[j] <-
          antibiotics %>%
-          filter(.[, from] == drug.group[j]) %>%
+          filter(.[, from] == abcode.group[j]) %>%
          select(to) %>%
          slice(1) %>%
          as.character()
        if (j > 1 & to %in% c('official', 'trivial_nl')) {
-          drug.group[j] <- drug.group[j] %>% tolower()
+          abcode.group[j] <- abcode.group[j] %>% tolower()
        }
      }
-      abcode[i] <- paste(drug.group, collapse = textbetween)
+      abcode[i] <- paste(abcode.group, collapse = textbetween)
+    }
+
+    # when nothing found, try first chars of official name
+    if (is.na(abcode[i])) {
+      abcode[i] <- antibiotics %>%
+        filter(official %like% paste0('^', abcode.bak[i])) %>%
+        pull(to) %>%
+        .[1]
+    }
+    if (is.na(abcode[i])) {
+      warning('Code "', abcode.bak[i], '" not found in antibiotics list.', call. = FALSE)
    }
  }

--- a/R/eucast.R
+++ b/R/eucast.R
@@ -22,8 +22,73 @@
 #' @param tbl table with antibiotic columns, like e.g. \code{amox} and \code{amcl}
 #' @param col_bactid column name of the bacteria ID in \code{tbl} - values of this column should be present in \code{microorganisms$bactid}, see \code{\link{microorganisms}}
 #' @param info print progress
-#' @param amcl,amik,amox,ampi,azit,aztr,cefa,cfra,cfep,cfot,cfox,cfta,cftr,cfur,chlo,cipr,clar,clin,clox,coli,czol,dapt,doxy,erta,eryt,fosf,fusi,gent,imip,kana,levo,linc,line,mero,mino,moxi,nali,neom,neti,nitr,novo,norf,oflo,peni,pita,poly,qida,rifa,roxi,siso,teic,tetr,tica,tige,tobr,trim,trsu,vanc column names of antibiotics. Use \code{NA} to skip a column, like \code{tica = NA}. Non-existing column will be skipped.
+#' @param amcl,amik,amox,ampi,azit,azlo,aztr,cefa,cfep,cfot,cfox,cfra,cfta,cftr,cfur,chlo,cipr,clar,clin,clox,coli,czol,dapt,doxy,erta,eryt,fosf,fusi,gent,imip,kana,levo,linc,line,mero,mezl,mino,moxi,nali,neom,neti,nitr,norf,novo,oflo,peni,pita,poly,pris,qida,rifa,roxi,siso,teic,tetr,tica,tige,tobr,trim,trsu,vanc column names of antibiotics. Use \code{NA} to skip a column, like \code{tica = NA}. Non-existing columns will anyway be skipped. See the Antibiotics section for an explanation of the abbreviations.
 #' @param ... parameters that are passed on to \code{EUCAST_rules}
+#' @section Abbrevations of antibiotics:
+#' Abbrevations of the column containing antibiotics:
+#'
+#'  \strong{amcl}: amoxicillin and beta-lactamase inhibitor (\emph{J01CR02}),
+#'  \strong{amik}: amikacin (\emph{J01GB06}),
+#'  \strong{amox}: amoxicillin (\emph{J01CA04}),
+#'  \strong{ampi}: ampicillin (\emph{J01CA01}),
+#'  \strong{azit}: azithromycin (\emph{J01FA10}),
+#'  \strong{azlo}: azlocillin (\emph{J01CA09}),
+#'  \strong{aztr}: aztreonam (\emph{J01DF01}),
+#'  \strong{cefa}: cefaloridine (\emph{J01DB02}),
+#'  \strong{cfep}: cefepime (\emph{J01DE01}),
+#'  \strong{cfot}: cefotaxime (\emph{J01DD01}),
+#'  \strong{cfox}: cefoxitin (\emph{J01DC01}),
+#'  \strong{cfra}: cefradine (\emph{J01DB09}),
+#'  \strong{cfta}: ceftazidime (\emph{J01DD02}),
+#'  \strong{cftr}: ceftriaxone (\emph{J01DD04}),
+#'  \strong{cfur}: cefuroxime (\emph{J01DC02}),
+#'  \strong{chlo}: chloramphenicol (\emph{J01BA01}),
+#'  \strong{cipr}: ciprofloxacin (\emph{J01MA02}),
+#'  \strong{clar}: clarithromycin (\emph{J01FA09}),
+#'  \strong{clin}: clindamycin (\emph{J01FF01}),
+#'  \strong{clox}: flucloxacillin (\emph{J01CF05}),
+#'  \strong{coli}: colistin (\emph{J01XB01}),
+#'  \strong{czol}: cefazolin (\emph{J01DB04}),
+#'  \strong{dapt}: daptomycin (\emph{J01XX09}),
+#'  \strong{doxy}: doxycycline (\emph{J01AA02}),
+#'  \strong{erta}: ertapenem (\emph{J01DH03}),
+#'  \strong{eryt}: erythromycin (\emph{J01FA01}),
+#'  \strong{fosf}: fosfomycin (\emph{J01XX01}),
+#'  \strong{fusi}: fusidic acid (\emph{J01XC01}),
+#'  \strong{gent}: gentamicin (\emph{J01GB03}),
+#'  \strong{imip}: imipenem and cilastatin (\emph{J01DH51}),
+#'  \strong{kana}: kanamycin (\emph{J01GB04}),
+#'  \strong{levo}: levofloxacin (\emph{J01MA12}),
+#'  \strong{linc}: lincomycin (\emph{J01FF02}),
+#'  \strong{line}: linezolid (\emph{J01XX08}),
+#'  \strong{mero}: meropenem (\emph{J01DH02}),
+#'  \strong{mezl}: mezlocillin (\emph{J01CA10}),
+#'  \strong{mino}: minocycline (\emph{J01AA08}),
+#'  \strong{moxi}: moxifloxacin (\emph{J01MA14}),
+#'  \strong{nali}: nalidixic acid (\emph{J01MB02}),
+#'  \strong{neom}: neomycin (\emph{J01GB05}),
+#'  \strong{neti}: netilmicin (\emph{J01GB07}),
+#'  \strong{nitr}: nitrofurantoin (\emph{J01XE01}),
+#'  \strong{norf}: norfloxacin (\emph{J01MA06}),
+#'  \strong{novo}: novobiocin (an ATCvet code: \emph{QJ01XX95}),
+#'  \strong{oflo}: ofloxacin (\emph{J01MA01}),
+#'  \strong{peni}: penicillins, combinations with other antibacterials (\emph{J01RA01}),
+#'  \strong{pita}: piperacillin and beta-lactamase inhibitor (\emph{J01CR05}),
+#'  \strong{poly}: polymyxin B (\emph{J01XB02}),
+#'  \strong{pris}: pristinamycin (\emph{J01FG01}),
+#'  \strong{qida}: quinupristin/dalfopristin (\emph{J01FG02}),
+#'  \strong{rifa}: rifampicin (\emph{J04AB02}),
+#'  \strong{roxi}: roxithromycin (\emph{J01FA06}),
+#'  \strong{siso}: sisomicin (\emph{J01GB08}),
+#'  \strong{teic}: teicoplanin (\emph{J01XA02}),
+#'  \strong{tetr}: tetracycline (\emph{J01AA07}),
+#'  \strong{tica}: ticarcillin (\emph{J01CA13}),
+#'  \strong{tige}: tigecycline (\emph{J01AA12}),
+#'  \strong{tobr}: tobramycin (\emph{J01GB01}),
+#'  \strong{trim}: trimethoprim (\emph{J01EA01}),
+#'  \strong{trsu}: sulfamethoxazole and trimethoprim (\emph{J01EE01}),
+#'  \strong{vanc}: vancomycin (\emph{J01XA01}).
+#' @keywords interpretive eucast reading resistance
 #' @rdname EUCAST
 #' @export
 #' @importFrom dplyr %>% left_join select
@@ -60,12 +125,13 @@ EUCAST_rules <- function(tbl,
                         amox = 'amox',
                         ampi = 'ampi',
                         azit = 'azit',
+                         azlo = 'azlo',
                         aztr = 'aztr',
                         cefa = 'cefa',
-                         cfra = 'cfra',
                         cfep = 'cfep',
                         cfot = 'cfot',
                         cfox = 'cfox',
+                         cfra = 'cfra',
                         cfta = 'cfta',
                         cftr = 'cftr',
                         cfur = 'cfur',
@@ -89,18 +155,20 @@ EUCAST_rules <- function(tbl,
                         linc = 'linc',
                         line = 'line',
                         mero = 'mero',
+                         mezl = 'mezl',
                         mino = 'mino',
                         moxi = 'moxi',
                         nali = 'nali',
                         neom = 'neom',
                         neti = 'neti',
                         nitr = 'nitr',
-                         novo = 'novo',
                         norf = 'norf',
+                         novo = 'novo',
                         oflo = 'oflo',
                         peni = 'peni',
                         pita = 'pita',
                         poly = 'poly',
+                         pris = 'pris',
                         qida = 'qida',
                         rifa = 'rifa',
                         roxi = 'roxi',
@@ -121,11 +189,11 @@ EUCAST_rules <- function(tbl,
  }

  # check columns
-  col.list <- c(amcl, amik, amox, ampi, azit, aztr, cefa, cfra, cfep, cfot,
+  col.list <- c(amcl, amik, amox, ampi, azit, azlo, aztr, cefa, cfra, cfep, cfot,
                cfox, cfta, cftr, cfur, chlo, cipr, clar, clin, clox, coli,
                czol, dapt, doxy, erta, eryt, fosf, fusi, gent, imip, kana,
-                levo, linc, line, mero, mino, moxi, nali, neom, neti, nitr,
-                novo, norf, oflo, peni, pita, poly, qida, rifa, roxi, siso,
+                levo, linc, line, mero, mezl, mino, moxi, nali, neom, neti, nitr,
+                novo, norf, oflo, peni, pita, poly, pris,  qida, rifa, roxi, siso,
                teic, tetr, tica, tige, tobr, trim, trsu, vanc)
  col.list <- check_available_columns(tbl = tbl, col.list = col.list, info = info)
  amcl <- col.list[amcl]
@@ -133,12 +201,13 @@ EUCAST_rules <- function(tbl,
  amox <- col.list[amox]
  ampi <- col.list[ampi]
  azit <- col.list[azit]
+  azlo <- col.list[azlo]
  aztr <- col.list[aztr]
  cefa <- col.list[cefa]
-  cfra <- col.list[cfra]
  cfep <- col.list[cfep]
  cfot <- col.list[cfot]
  cfox <- col.list[cfox]
+  cfra <- col.list[cfra]
  cfta <- col.list[cfta]
  cftr <- col.list[cftr]
  cfur <- col.list[cfur]
@@ -162,18 +231,20 @@ EUCAST_rules <- function(tbl,
  linc <- col.list[linc]
  line <- col.list[line]
  mero <- col.list[mero]
+  mezl <- col.list[mezl]
  mino <- col.list[mino]
  moxi <- col.list[moxi]
  nali <- col.list[nali]
  neom <- col.list[neom]
  neti <- col.list[neti]
  nitr <- col.list[nitr]
-  novo <- col.list[novo]
  norf <- col.list[norf]
+  novo <- col.list[novo]
  oflo <- col.list[oflo]
  peni <- col.list[peni]
  pita <- col.list[pita]
  poly <- col.list[poly]
+  pris <- col.list[pris]
  qida <- col.list[qida]
  rifa <- col.list[rifa]
  roxi <- col.list[roxi]
@@ -191,7 +262,9 @@ EUCAST_rules <- function(tbl,
  total_rows <- integer(0)

  # helper function for editing the table
-  edit_rsi <- function(to, rows, cols) {
+  edit_rsi <- function(to, rows, cols, EUCAST_rule = "") {
+    # later: use this as attribute for the edited observations
+    EUCAST_rule <- trimws(paste("EUCAST rule", EUCAST_rule))
    cols <- cols[!is.na(cols)]
    if (length(rows) > 0 & length(cols) > 0) {
      tbl[rows, cols] <<- to
@@ -200,9 +273,9 @@ EUCAST_rules <- function(tbl,
    }
  }

-  # join to microorganisms table
+  # join to microorganisms data set
  if (!tbl %>% pull(col_bactid) %>% is.bactid()) {
-    tbl[, col_bactid] <- tbl %>% pull(col_bactid) %>% as.bactid()
+    # warning("Improve integrity of the `", col_bactid, "` column by transforming it with 'as.bactid'.")
  }
  tbl <- tbl %>% left_join_microorganisms(by = col_bactid, suffix = c("_tempmicroorganisms", ""))

@@ -212,11 +285,11 @@ EUCAST_rules <- function(tbl,
  polymyxins <- c(poly, coli)
  macrolides <- c(eryt, azit, roxi, clar) # since EUCAST v3.1 clinda is set apart
  glycopeptides <- c(vanc, teic)
-  streptogramins <- qida # should officially also be pristinamycin and quinupristin/dalfopristin
+  streptogramins <- c(qida, pris) # should officially also be quinupristin/dalfopristin
  cephalosporins <- c(cfep, cfot, cfox, cfra, cfta, cftr, cfur, czol)
  carbapenems <- c(erta, imip, mero)
  aminopenicillins <- c(ampi, amox)
-  ureidopenicillins <- pita # should officially also be azlo and mezlo
+  ureidopenicillins <- c(pita, azlo, mezl)
  fluoroquinolones <- c(oflo, cipr, norf, levo, moxi)

  if (info == TRUE) {
@@ -230,7 +303,7 @@ EUCAST_rules <- function(tbl,

  # Table 1: Intrinsic resistance in Enterobacteriaceae ----
  if (info == TRUE) {
-    cat('...Table 1: Intrinsic resistance in Enterobacteriaceae\n')
+    cat('- Table 1:  Intrinsic resistance in Enterobacteriaceae\n')
  }
  # Intrisiek R for this group
  edit_rsi(to = 'R',
@@ -301,7 +374,7 @@ EUCAST_rules <- function(tbl,

  # Table 2: Intrinsic resistance in non-fermentative Gram-negative bacteria ----
  if (info == TRUE) {
-    cat('...Table 2: Intrinsic resistance in non-fermentative Gram-negative bacteria\n')
+    cat('- Table 2:  Intrinsic resistance in non-fermentative Gram-negative bacteria\n')
  }
  # Intrisiek R for this group
  edit_rsi(to = 'R',
@@ -349,7 +422,7 @@ EUCAST_rules <- function(tbl,

  # Table 3: Intrinsic resistance in other Gram-negative bacteria ----
  if (info == TRUE) {
-    cat('...Table 3: Intrinsic resistance in other Gram-negative bacteria\n')
+    cat('- Table 3:  Intrinsic resistance in other Gram-negative bacteria\n')
  }
  # Intrisiek R for this group
  edit_rsi(to = 'R',
@@ -381,7 +454,7 @@ EUCAST_rules <- function(tbl,

  # Table 4: Intrinsic resistance in Gram-positive bacteria ----
  if (info == TRUE) {
-    cat('...Table 4: Intrinsic resistance in Gram-positive bacteria\n')
+    cat('- Table 4:  Intrinsic resistance in Gram-positive bacteria\n')
  }
  # Intrisiek R for this group
  edit_rsi(to = 'R',
@@ -435,7 +508,7 @@ EUCAST_rules <- function(tbl,

  # Table 8: Interpretive rules for B-lactam agents and Gram-positive cocci ----
  if (info == TRUE) {
-    cat('...Table 8: Interpretive rules for B-lactam agents and Gram-positive cocci\n')
+    cat('- Table 8:  Interpretive rules for B-lactam agents and Gram-positive cocci\n')
  }
  # rule 8.3
  if (!is.na(peni)) {
@@ -460,7 +533,7 @@ EUCAST_rules <- function(tbl,

  # Table 9: Interpretive rules for B-lactam agents and Gram-negative rods ----
  if (info == TRUE) {
-    cat('...Table 9: Interpretive rules for B-lactam agents and Gram-negative rods\n')
+    cat('- Table 9:  Interpretive rules for B-lactam agents and Gram-negative rods\n')
  }
  # rule 9.3
  if (!is.na(tica) & !is.na(pita)) {
@@ -473,7 +546,7 @@ EUCAST_rules <- function(tbl,

  # Table 10: Interpretive rules for B-lactam agents and other Gram-negative bacteria ----
  if (info == TRUE) {
-    cat('...Table 10: Interpretive rules for B-lactam agents and other Gram-negative bacteria\n')
+    cat('- Table 10: Interpretive rules for B-lactam agents and other Gram-negative bacteria\n')
  }
  # rule 10.2
  if (!is.na(ampi)) {
@@ -486,7 +559,7 @@ EUCAST_rules <- function(tbl,

  # Table 11: Interpretive rules for macrolides, lincosamides, and streptogramins ----
  if (info == TRUE) {
-    cat('...Table 11: Interpretive rules for macrolides, lincosamides, and streptogramins\n')
+    cat('- Table 11: Interpretive rules for macrolides, lincosamides, and streptogramins\n')
  }
  # rule 11.1
  if (!is.na(eryt)) {
@@ -500,7 +573,7 @@ EUCAST_rules <- function(tbl,

  # Table 12: Interpretive rules for aminoglycosides ----
  if (info == TRUE) {
-    cat('...Table 12: Interpretive rules for aminoglycosides\n')
+    cat('- Table 12: Interpretive rules for aminoglycosides\n')
  }
  # rule 12.2
  if (!is.na(tobr)) {
@@ -536,7 +609,7 @@ EUCAST_rules <- function(tbl,

  # Table 13: Interpretive rules for quinolones ----
  if (info == TRUE) {
-    cat('...Table 13: Interpretive rules for quinolones\n')
+    cat('- Table 13: Interpretive rules for quinolones\n')
  }
  # rule 13.2
  if (!is.na(moxi)) {
@@ -570,7 +643,7 @@ EUCAST_rules <- function(tbl,

  # Other ----
  if (info == TRUE) {
-    cat('...Non-EUCAST: trim = R where trsu = R and ampi = R where amcl = R\n')
+    cat('- Non-EUCAST: trim = R where trsu = R and ampi = R where amcl = R\n')
  }
  if (!is.na(amcl)) {
    edit_rsi(to = 'R',
@@ -582,6 +655,20 @@ EUCAST_rules <- function(tbl,
             rows = which(tbl[, trsu] == 'R'),
             cols = trim)
  }
+  if (info == TRUE) {
+    cat('- Non-EUCAST: trsu = S where trim = S and amcl = S where ampi = S\n')
+  }
+  if (!is.na(amcl)) {
+    edit_rsi(to = 'S',
+             rows = which(tbl[, ampi] == 'S'),
+             cols = amcl)
+  }
+  if (!is.na(trsu)) {
+    edit_rsi(to = 'S',
+             rows = which(tbl[, trim] == 'S'),
+             cols = trsu)
+  }
+  # amox = ampi
  if (!is.na(ampi) & !is.na(amox)) {
    tbl[, amox] <- tbl %>% pull(ampi)
  }
@@ -596,7 +683,7 @@ EUCAST_rules <- function(tbl,
  if (info == TRUE) {
    cat('Done.\n\nEUCAST Expert rules applied to',
        total_rows %>% unique() %>% length() %>% format(big.mark = ","),
-        'different rows (isolates); edited a total of',
+        'different rows; overwritten a total of',
        total %>% format(big.mark = ","), 'test results.\n\n')
  }

--- a/R/first_isolate.R
+++ b/R/first_isolate.R
@@ -72,13 +72,13 @@
 #'             resistance = resistance(gent))
 #'
 #' B <- my_patients %>%
-#'   filter(first_isolate == TRUE) %>%
+#'   filter(first_isolate == TRUE) %>% # the 1st isolate filter
 #'   group_by(hospital_id) %>%
-#'   summarise(count = n_rsi(gent), # gentamicin
+#'   summarise(count = n_rsi(gent),
 #'             resistance = resistance(gent))
 #'
 #' # Have a look at A and B. B is more reliable because every isolate is
-#' # counted once. Gentamicin resitance in hospital D seems to be 5%
+#' # counted once. Gentamicin resitance in hospital D appears to be 5%
 #' # higher than originally thought.
 #'
 #' ## OTHER EXAMPLES:
@@ -171,8 +171,9 @@ first_isolate <- function(tbl,

  if (!is.na(col_bactid)) {
    if (!tbl %>% pull(col_bactid) %>% is.bactid()) {
-      warning("Improve integrity of the `", col_bactid, "` column by transforming it with 'as.bactid'.")
+      # warning("Improve integrity of the `", col_bactid, "` column by transforming it with 'as.bactid'.")
    }
+    # join to microorganisms data set
    tbl <- tbl %>% left_join_microorganisms(by = col_bactid)
    col_genus <- "genus"
    col_species <- "species"
--- a/R/globals.R
+++ b/R/globals.R
@@ -17,6 +17,7 @@
 # ==================================================================== #

 globalVariables(c('abname',
+                  'antibiotic',
                  'antibiotics',
                  'atc',
                  'bactid',
@@ -47,11 +48,15 @@ globalVariables(c('abname',
                  'molis',
                  'n',
                  'na.omit',
+                  'observations',
+                  'official',
                  'other_pat_or_mo',
                  'Pasted',
                  'patient_id',
                  'quantile',
+                  'R',
                  'real_first_isolate',
+                  'S',
                  'septic_patients',
                  'species',
                  'umcg',
--- a/R/resistance.R
+++ b/R/resistance.R
@@ -415,33 +415,45 @@ rsi_df <- function(tbl,
 #' Predict antimicrobial resistance
 #'
 #' Create a prediction model to predict antimicrobial resistance for the next years on statistical solid ground. Standard errors (SE) will be returned as columns \code{se_min} and \code{se_max}. See Examples for a real live example.
-#' @param tbl table that contains columns \code{col_ab} and \code{col_date}
-#' @param col_ab column name of \code{tbl} with antimicrobial interpretations (\code{R}, \code{I} and \code{S}), supports tidyverse-like quotation
-#' @param col_date column name of the date, will be used to calculate years if this column doesn't consist of years already, supports tidyverse-like quotation
-#' @param year_max highest year to use in the prediction model, deafults to 15 years after today
+#' @inheritParams first_isolate
+#' @param col_ab column name of \code{tbl} with antimicrobial interpretations (\code{R}, \code{I} and \code{S})
+#' @param col_date column name of the date, will be used to calculate years if this column doesn't consist of years already
+#' @param year_min lowest year to use in the prediction model, dafaults the lowest year in \code{col_date}
+#' @param year_max highest year to use in the prediction model, defaults to 15 years after today
 #' @param year_every unit of sequence between lowest year found in the data and \code{year_max}
+#' @param minimum minimal amount of available isolates per year to include. Years containing less observations will be estimated by the model.
 #' @param model the statistical model of choice. Valid values are \code{"binomial"} (or \code{"binom"} or \code{"logit"}) or \code{"loglin"} or \code{"linear"} (or \code{"lin"}).
 #' @param I_as_R treat \code{I} as \code{R}
-#' @param preserve_measurements overwrite predictions of years that are actually available in the data, with the original data. The standard errors of those years will be \code{NA}.
+#' @param preserve_measurements logical to indicate whether predictions of years that are actually available in the data should be overwritten with the original data. The standard errors of those years will be \code{NA}.
 #' @param info print textual analysis with the name and \code{\link{summary}} of the model.
-#' @return \code{data.frame} with columns \code{year}, \code{probR}, \code{se_min} and \code{se_max}.
+#' @return \code{data.frame} with columns:
+#' \itemize{
+#'   \item{\code{year}}
+#'   \item{\code{resistance}, the same as \code{estimated} when \code{preserve_measurements = FALSE}, and a combination of \code{observed} and \code{estimated} otherwise}
+#'   \item{\code{se_min}, the lower bound of the standard error with a minimum of \code{0}}
+#'   \item{\code{se_max} the upper bound of the standard error with a maximum of \code{1}}
+#'   \item{\code{observations}, the total number of observations, i.e. S + I + R}
+#'   \item{\code{observed}, the original observed values}
+#'   \item{\code{estimated}, the estimated values, calculated by the model}
+#' }
 #' @seealso \code{\link{resistance}} \cr \code{\link{lm}} \code{\link{glm}}
 #' @rdname resistance_predict
 #' @export
-#' @importFrom dplyr %>% pull mutate group_by_at summarise filter
-#' @importFrom reshape2 dcast
+#' @importFrom stats predict glm lm
+#' @importFrom dplyr %>% pull mutate group_by_at summarise filter n_distinct arrange
+# @importFrom tidyr spread
 #' @examples
 #' \dontrun{
-#' # use it directly:
-#' rsi_predict(tbl = tbl[which(first_isolate == TRUE & genus == "Haemophilus"),],
-#'             col_ab = "amcl", col_date = "date")
+#' # use it with base R:
+#' resistance_predict(tbl = tbl[which(first_isolate == TRUE & genus == "Haemophilus"),],
+#'                    col_ab = "amcl", col_date = "date")
 #'
-#' # or with dplyr so you can actually read it:
+#' # or use dplyr so you can actually read it:
 #' library(dplyr)
 #' tbl %>%
 #'   filter(first_isolate == TRUE,
 #'          genus == "Haemophilus") %>%
-#'   rsi_predict(amcl, date)
+#'   resistance_predict(amcl, date)
 #' }
 #'
 #'
@@ -463,20 +475,49 @@ rsi_df <- function(tbl,
 #'          species == "coli",
 #'          first_isolate == TRUE) %>%
 #'   # predict resistance of cefotaxime for next years
-#'   rsi_predict(col_ab = "cfot",
-#'               col_date = "date",
-#'               year_max = 2025,
-#'               preserve_measurements = FALSE)
+#'   resistance_predict(col_ab = "cfot",
+#'                      col_date = "date",
+#'                      year_max = 2025,
+#'                      preserve_measurements = TRUE,
+#'                      minimum = 0)
 #'
+#' # create nice plots with ggplot
+#' if (!require(ggplot2)) {
+#'
+#'   data <- septic_patients %>%
+#'     filter(bactid == "ESCCOL") %>%
+#'     resistance_predict(col_ab = "amox",
+#'                       col_date = "date",
+#'                       info = FALSE,
+#'                       minimum = 15)
+#'
+#'   ggplot(data,
+#'          aes(x = year)) +
+#'     geom_col(aes(y = resistance),
+#'              fill = "grey75") +
+#'     geom_errorbar(aes(ymin = se_min,
+#'                       ymax = se_max),
+#'                   colour = "grey50") +
+#'     scale_y_continuous(limits = c(0, 1),
+#'                        breaks = seq(0, 1, 0.1),
+#'                        labels = paste0(seq(0, 100, 10), "%")) +
+#'     labs(title = expression(paste("Forecast of amoxicillin resistance in ",
+#'                                   italic("E. coli"))),
+#'          y = "%IR",
+#'          x = "Year") +
+#'     theme_minimal(base_size = 13)
+#' }
 resistance_predict <- function(tbl,
-                        col_ab,
-                        col_date,
-                        year_max = as.integer(format(as.Date(Sys.Date()), '%Y')) + 15,
-                        year_every = 1,
-                        model = 'binomial',
-                        I_as_R = TRUE,
-                        preserve_measurements = TRUE,
-                        info = TRUE) {
+                               col_ab,
+                               col_date,
+                               year_min = NULL,
+                               year_max = NULL,
+                               year_every = 1,
+                               minimum = 30,
+                               model = 'binomial',
+                               I_as_R = TRUE,
+                               preserve_measurements = TRUE,
+                               info = TRUE) {

  if (nrow(tbl) == 0) {
    stop('This table does not contain any observations.')
@@ -498,8 +539,8 @@ resistance_predict <- function(tbl,
    tbl[, col_ab] <- gsub('I', 'R', tbl %>% pull(col_ab))
  }

-  if (!all(tbl %>% pull(col_ab) %>% as.rsi() %in% c(NA, 'S', 'I', 'R'))) {
-    stop('Column ', col_ab, ' must contain antimicrobial interpretations (S, I, R).')
+  if (!tbl %>% pull(col_ab) %>% is.rsi()) {
+    tbl[, col_ab] <- tbl %>% pull(col_ab) %>% as.rsi()
  }

  year <- function(x) {
@@ -510,15 +551,39 @@ resistance_predict <- function(tbl,
    }
  }

-  years_predict <- seq(from = min(year(tbl %>% pull(col_date))), to = year_max, by = year_every)
-
  df <- tbl %>%
-    mutate(year = year(tbl %>% pull(col_date))) %>%
+    mutate(year = tbl %>% pull(col_date) %>% year()) %>%
    group_by_at(c('year', col_ab)) %>%
    summarise(n())
-  colnames(df) <- c('year', 'antibiotic', 'count')
+
+  if (df %>% pull(col_ab) %>% n_distinct(na.rm = TRUE) < 2) {
+    stop("No variety in antimicrobial interpretations - all isolates are '",
+         df %>% pull(col_ab) %>% unique() %>% .[!is.na(.)], "'.",
+         call. = FALSE)
+  }
+
+  colnames(df) <- c('year', 'antibiotic', 'observations')
  df <- df %>%
-    reshape2::dcast(year ~ antibiotic, value.var = 'count')
+    filter(!is.na(antibiotic)) %>%
+    tidyr::spread(antibiotic, observations, fill = 0) %>%
+    mutate(total = R + S) %>%
+    filter(total >= minimum)
+
+  if (NROW(df) == 0) {
+    stop('There are no observations.')
+  }
+
+  year_lowest <- min(df$year)
+  if (is.null(year_min)) {
+    year_min <- year_lowest
+  } else {
+    year_min <- max(year_min, year_lowest, na.rm = TRUE)
+  }
+  if (is.null(year_max)) {
+    year_max <- year(Sys.Date()) + 15
+  }
+
+  years_predict <- seq(from = year_min, to = year_max, by = year_every)

  if (model %in% c('binomial', 'binom', 'logit')) {
    logitmodel <- with(df, glm(cbind(R, S) ~ year, family = binomial))
@@ -528,7 +593,7 @@ resistance_predict <- function(tbl,
      print(summary(logitmodel))
    }

-    predictmodel <- stats::predict(logitmodel, newdata = with(df, list(year = years_predict)), type = "response", se.fit = TRUE)
+    predictmodel <- predict(logitmodel, newdata = with(df, list(year = years_predict)), type = "response", se.fit = TRUE)
    prediction <- predictmodel$fit
    se <- predictmodel$se.fit

@@ -540,7 +605,7 @@ resistance_predict <- function(tbl,
      print(summary(loglinmodel))
    }

-    predictmodel <- stats::predict(loglinmodel, newdata = with(df, list(year = years_predict)), type = "response", se.fit = TRUE)
+    predictmodel <- predict(loglinmodel, newdata = with(df, list(year = years_predict)), type = "response", se.fit = TRUE)
    prediction <- predictmodel$fit
    se <- predictmodel$se.fit

@@ -552,7 +617,7 @@ resistance_predict <- function(tbl,
      print(summary(linmodel))
    }

-    predictmodel <- stats::predict(linmodel, newdata = with(df, list(year = years_predict)), se.fit = TRUE)
+    predictmodel <- predict(linmodel, newdata = with(df, list(year = years_predict)), se.fit = TRUE)
    prediction <- predictmodel$fit
    se <- predictmodel$se.fit

@@ -561,13 +626,13 @@ resistance_predict <- function(tbl,
  }

  # prepare the output dataframe
-  prediction <- data.frame(year = years_predict, probR = prediction, stringsAsFactors = FALSE)
+  prediction <- data.frame(year = years_predict, resistance = prediction, stringsAsFactors = FALSE)

-  prediction$se_min <- prediction$probR - se
-  prediction$se_max <- prediction$probR + se
+  prediction$se_min <- prediction$resistance - se
+  prediction$se_max <- prediction$resistance + se

  if (model == 'loglin') {
-    prediction$probR <- prediction$probR %>%
+    prediction$resistance <- prediction$resistance %>%
      format(scientific = FALSE) %>%
      as.integer()
    prediction$se_min <- prediction$se_min %>% as.integer()
@@ -578,31 +643,42 @@ resistance_predict <- function(tbl,
    prediction$se_max[which(prediction$se_max > 1)] <- 1
  }
  prediction$se_min[which(prediction$se_min < 0)] <- 0
+  prediction$observations = NA

  total <- prediction

  if (preserve_measurements == TRUE) {
-    # geschatte data vervangen door gemeten data
+    # replace estimated data by observed data
    if (I_as_R == TRUE) {
      if (!'I' %in% colnames(df)) {
        df$I <- 0
      }
-      df$probR <- df$R / rowSums(df[, c('R', 'S', 'I')])
+      df$resistance <- df$R / rowSums(df[, c('R', 'S', 'I')])
    } else {
-      df$probR <- df$R / rowSums(df[, c('R', 'S')])
+      df$resistance <- df$R / rowSums(df[, c('R', 'S')])
    }
    measurements <- data.frame(year = df$year,
-                           probR = df$probR,
-                           se_min = NA,
-                           se_max = NA,
-                           stringsAsFactors = FALSE)
+                               resistance = df$resistance,
+                               se_min = NA,
+                               se_max = NA,
+                               observations = df$total,
+                               stringsAsFactors = FALSE)
    colnames(measurements) <- colnames(prediction)
-    prediction <- prediction %>% filter(!year %in% df$year)

-    total <- rbind(measurements, prediction)
+    total <- rbind(measurements,
+                   prediction %>% filter(!year %in% df$year))
+    if (model %in% c('binomial', 'binom', 'logit')) {
+      total <- total %>% mutate(observed = ifelse(is.na(observations), NA, resistance),
+                                estimated = prediction$resistance)
+    }
  }

-  total
+  try(
+    total$resistance[which(total$resistance > 1)] <- 1,
+    total$resistance[which(total$resistance < 0)] <- 0,
+    silent = TRUE
+  )
+  total %>% arrange(year)

 }