mirror of https://github.com/msberends/AMR.git synced 2025-07-09 00:02:38 +02:00

(v1.3.0.9023) optimisation

2020-09-19 11:54:01 +02:00
parent 4e40e42011
commit d049cce69b
30 changed files with 104 additions and 578 deletions

View File

@ -26,7 +26,7 @@ pm_left_join <- function(x, y, by = NULL, suffix = c(".x", ".y")) {
if (is.na(by)) {
stop_("no common column found for pm_left_join()")
}
join_message(by)
pm_join_message(by)
} else if (!is.null(names(by))) {
by <- unname(c(names(by), by))
}

View File

@ -36,7 +36,7 @@
# distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
# is furnished to do so', given that a copyright notice is given in the software.
#
# Copyright notice on 18 September 2020, the day this code was downloaded, as found on
# Copyright notice on 19 September 2020, the day this code was downloaded, as found on
# https://github.com/nathaneastwood/poorman/blob/52eb6947e0b4430cd588976ed8820013eddf955f/LICENSE:
# YEAR: 2020
# COPYRIGHT HOLDER: Nathan Eastwood
@ -67,28 +67,6 @@ pm_between <- function(x, left, right) {
if (!is.double(x)) x <- as.numeric(x)
x >= as.numeric(left) & x <= as.numeric(right)
}
pm_coalesce <- function(...) {
if (missing(..1)) stop("At least one argument must be supplied.")
vectors <- list(...)
vectors_lens <- unique(lengths(vectors))
if (length(vectors_lens) > 2L || (length(vectors_lens) == 2L && !1 %in% vectors_lens)) {
stop("Vectors must all be of length 1 and/or n")
}
max_len <- max(vectors_lens)
len_one <- lengths(vectors) == 1L
vectors[len_one] <- lapply(vectors[len_one], function(x) rep(x, max_len))
x <- vectors[[1]]
vectors <- vectors[-1]
for (i in seq_along(vectors)) {
x_miss <- is.na(x)
x[x_miss] <- vectors[[i]][x_miss]
}
x
}
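A quick illustrative sketch of pm_coalesce(), with made-up values:
# take the first non-missing value, elementwise; length-1 inputs are recycled to the common length
pm_coalesce(c(1, NA, 3), 0)
# 1 0 3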
pm_context <- new.env()
# Data
@ -288,23 +266,6 @@ pm_filter.grouped_data <- function(.data, ...) {
res <- pm_apply_grouped_function("pm_filter", .data, drop = TRUE, ...)
res[rows[rows %in% rownames(res)], ]
}
pm_glimpse <- function(x, width = getOption("width"), ...) {
  if (is.data.frame(x)) {
    pm_glimpse.data.frame(x, width = width, ...)
  } else {
    pm_glimpse.default(x, width = width, ...)
  }
}
pm_glimpse.default <- function (x, width = getOption("width"), max.level = 3, ...) {
utils::str(x, width = width, max.level = max.level, ...)
invisible(x)
}
pm_glimpse.data.frame <- function(x, width = getOption("width"), ...) {
utils::str(x, width = width, ...)
invisible(x)
}
pm_group_by <- function(.data, ..., .add = FALSE) {
pm_check_is_dataframe(.data)
pre_groups <- pm_get_groups(.data)
@ -481,6 +442,27 @@ pm_if_else <- function(condition, true, false, missing = NULL) {
res
}
pm_anti_join <- function(x, y, by = NULL) {
pm_filter_join_worker(x, y, by, type = "anti")
}
pm_semi_join <- function(x, y, by = NULL) {
pm_filter_join_worker(x, y, by, type = "semi")
}
pm_filter_join_worker <- function(x, y, by = NULL, type = c("anti", "semi")) {
type <- match.arg(type, choices = c("anti", "semi"), several.ok = FALSE)
if (is.null(by)) {
by <- intersect(names(x), names(y))
pm_join_message(by)
}
rows <- interaction(x[, by]) %in% interaction(y[, by])
if (type == "anti") rows <- !rows
res <- x[rows, , drop = FALSE]
rownames(res) <- NULL
res
}
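A brief usage sketch of these filtering joins, with made-up data frames (`isolates` and `resistant` are hypothetical names):
# illustrative only: semi keeps matching rows of x, anti keeps the non-matching ones
isolates  <- data.frame(patient = c("A", "B", "C"),
                        mo = c("B_ESCHR_COLI", "B_STPHY_AURS", "B_ESCHR_COLI"))
resistant <- data.frame(mo = "B_ESCHR_COLI")
pm_semi_join(isolates, resistant)  # rows of `isolates` with a matching `mo`: patients A and C
pm_anti_join(isolates, resistant)  # rows of `isolates` without a match: patient B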
pm_inner_join <- function(x, y, by = NULL, suffix = c(".x", ".y")) {
pm_join_worker(x = x, y = y, by = by, suffix = suffix, sort = FALSE)
}
@ -520,27 +502,6 @@ pm_join_message <- function(by) {
message("Joining, by = \"", by, "\"\n", sep = "")
}
}
pm_anti_join <- function(x, y, by = NULL) {
pm_filter_join_worker(x, y, by, type = "anti")
}
pm_semi_join <- function(x, y, by = NULL) {
pm_filter_join_worker(x, y, by, type = "semi")
}
pm_filter_join_worker <- function(x, y, by = NULL, type = c("anti", "semi")) {
type <- match.arg(type, choices = c("anti", "semi"), several.ok = FALSE)
if (is.null(by)) {
by <- intersect(names(x), names(y))
pm_join_message(by)
}
rows <- interaction(x[, by]) %in% interaction(y[, by])
if (type == "anti") rows <- !rows
res <- x[rows,, drop = FALSE]
rownames(res) <- NULL
res
}
pm_lag <- function(x, pm_n = 1L, default = NA) {
if (inherits(x, "ts")) stop("`x` must be a vector, not a `ts` object, do you want `stats::lag()`?")
if (length(pm_n) != 1L || !is.numeric(pm_n) || pm_n < 0L) stop("`pm_n` must be a nonnegative integer scalar")
@ -630,163 +591,6 @@ pm_pull <- function(.data, var = -1) {
}
.data[, var]
}
pm_recode <- function(.x, ..., .default = NULL, .missing = NULL) {
  if (is.factor(.x)) {
    pm_recode.factor(.x, ..., .default = .default, .missing = .missing)
  } else if (is.character(.x)) {
    pm_recode.character(.x, ..., .default = .default, .missing = .missing)
  } else {
    pm_recode.numeric(.x, ..., .default = .default, .missing = .missing)
  }
}
pm_recode.numeric <- function(.x, ..., .default = NULL, .missing = NULL) {
values <- pm_dotdotdot(...)
nms <- pm_have_name(values)
if (all(nms)) {
vals <- as.double(names(values))
} else if (all(!nms)) {
vals <- seq_along(values)
} else {
stop("Either all values must be named, or none must be named.")
}
pm_n <- length(.x)
template <- pm_find_template(values, .default, .missing)
res <- template[rep(NA_integer_, pm_n)]
replaced <- rep(FALSE, pm_n)
for (i in seq_along(values)) {
res <- pm_replace_with(res, .x == vals[i], values[[i]], paste0("Vector ", i))
replaced[.x == vals[i]] <- TRUE
}
.default <- pm_validate_recode_default(.default, .x, res, replaced)
res <- pm_replace_with(res, !replaced & !is.na(.x), .default, "`.default`")
res <- pm_replace_with(res, is.na(.x), .missing, "`.missing`")
res
}
pm_recode.character <- function(.x, ..., .default = NULL, .missing = NULL) {
.x <- as.character(.x)
values <- pm_dotdotdot(...)
val_names <- names(values)
have_names <- pm_have_name(values)
if (!all(have_names)) {
bad <- which(!have_names) + 1L
stop("Argument", if (length(bad) > 1L) "s", " ", paste(bad, collapse = ", "), " must be named, not unnamed.")
}
pm_n <- length(.x)
template <- pm_find_template(values, .default, .missing)
res <- template[rep(NA_integer_, pm_n)]
replaced <- rep(FALSE, pm_n)
for (nm in val_names) {
res <- pm_replace_with(res, .x == nm, values[[nm]], paste0("`", nm, "`"))
replaced[.x == nm] <- TRUE
}
.default <- pm_validate_recode_default(.default, .x, res, replaced)
res <- pm_replace_with(res, !replaced & !is.na(.x), .default, "`.default`")
res <- pm_replace_with(res, is.na(.x), .missing, "`.missing`")
res
}
pm_recode.factor <- function(.x, ..., .default = NULL, .missing = NULL) {
values <- pm_dotdotdot(...)
if (length(values) == 0) stop("No replacements provided.")
have_names <- pm_have_name(values)
if (!all(have_names)) {
bad <- which(!have_names) + 1
stop("Argument", if (length(bad) > 1L) "s", " ", paste(bad, collapse = ", "), " must be named, not unnamed.")
}
if (!is.null(.missing)) {
stop("`.missing` is not supported for factors.")
}
pm_n <- length(levels(.x))
template <- pm_find_template(values, .default, .missing)
res <- template[rep(NA_integer_, pm_n)]
replaced <- rep(FALSE, pm_n)
for (nm in names(values)) {
res <- pm_replace_with(res, levels(.x) == nm, values[[nm]], paste0("`", nm, "`"))
replaced[levels(.x) == nm] <- TRUE
}
.default <- pm_validate_recode_default(.default, .x, res, replaced)
res <- pm_replace_with(res, !replaced, .default, "`.default`")
if (is.character(res)) {
levels(.x) <- res
.x
} else {
res[as.integer(.x)]
}
}
pm_have_name <- function(x) {
nms <- names(x)
if (is.null(nms)) rep(FALSE, length(x)) else !(nms == "" | is.na(nms))
}
pm_compact <- function(.x) Filter(length, .x)
pm_find_template <- function(values, .default = NULL, .missing = NULL) {
x <- pm_compact(c(values, .default, .missing))
if (length(x) == 0L) {
stop("No replacements provided.")
}
x[[1]]
}
pm_validate_recode_default <- function(default, x, res, replaced) {
default <- pm_recode_default(x, default, res)
if (is.null(default) && sum(replaced & !is.na(x)) < length(res[!is.na(x)])) {
warning(
"Unreplaced values treated as NA as .x is not compatible. ",
"Please specify replacements exhaustively or supply .default",
call. = FALSE
)
}
default
}
pm_recode_default <- function(x, default, res) {
  if (is.factor(x)) {
    pm_recode_default.factor(x, default, res)
  } else {
    pm_recode_default.default(x, default, res)
  }
}
pm_recode_default.default <- function(x, default, res) {
same_type <- identical(typeof(x), typeof(res))
if (is.null(default) && same_type) x else default
}
pm_recode_default.factor <- function(x, default, res) {
if (is.null(default)) {
if ((is.character(res) || is.factor(res)) && is.factor(x)) {
levels(x)
} else {
res[NA_integer_]
}
} else {
default
}
}
pm_recode_factor <- function(.x, ..., .default = NULL, .missing = NULL, .ordered = FALSE) {
recoded <- pm_recode(.x, ..., .default = .default, .missing = .missing)
values <- pm_dotdotdot(...)
all_levels <- unique(c(values, pm_recode_default(.x, .default, recoded), .missing))
recoded_levels <- if (is.factor(recoded)) levels(recoded) else unique(recoded)
levels <- intersect(all_levels, recoded_levels)
factor(recoded, levels, ordered = .ordered)
}
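An illustrative usage sketch of pm_recode(); like dplyr::recode(), values without a named replacement fall through unchanged:
pm_recode(c("a", "b", "c"), a = "Apple")
# "Apple" "b" "c"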
pm_set_names <- function(object = nm, nm) {
names(object) <- nm
object
@ -860,35 +664,6 @@ pm_rename_with <- function(.data, .fn, .cols = pm_everything(), ...) {
if (grouped) .data <- pm_set_groups(.data, colnames(.data)[grp_pos])
.data
}
pm_replace_na <- function(data, replace, ...) {
  if (is.data.frame(data)) {
    if (missing(replace)) replace <- list()
    pm_replace_na.data.frame(data, replace = replace, ...)
  } else {
    if (missing(replace)) replace <- NA
    pm_replace_na.default(data, replace = replace, ...)
  }
}
pm_replace_na.default <- function(data, replace = NA, ...) {
pm_check_replacement(replace, deparse(substitute(data)))
data[is.na(data)] <- replace
data
}
pm_replace_na.data.frame <- function(data, replace = list(), ...) {
stopifnot(is.list(replace))
replace_vars <- intersect(names(replace), names(data))
for (var in replace_vars) {
pm_check_replacement(replace[[var]], var)
data[[var]][is.na(data[[var]])] <- replace[[var]]
}
data
}
pm_check_replacement <- function(x, var) {
pm_n <- length(x)
if (pm_n == 1L) return()
stop("Replacement for `", var, "` is length ", pm_n, ", not length 1")
}
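A short illustrative sketch of pm_replace_na() on a vector and on a data.frame column:
pm_replace_na(c(1, NA, 3), replace = 0)
# 1 0 3
pm_replace_na(data.frame(mic = c(2, NA)), replace = list(mic = 0.5))
#   mic
# 1 2.0
# 2 0.5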
pm_replace_with <- function(x, i, val, arg_name) {
if (is.null(val)) return(x)
pm_check_length(val, x, arg_name)
@ -937,20 +712,6 @@ pm_rownames_to_column <- function(.data, var = "rowname") {
rownames(.data) <- NULL
.data[, c(var, setdiff(col_names, var))]
}
pm_select <- function(.data, ...) {
col_pos <- pm_select_positions(.data, ..., .group_pos = TRUE)
map_names <- names(col_pos)
map_names_length <- nchar(map_names)
if (any(map_names_length == 0L)) {
no_new_names <- which(map_names_length == 0L)
map_names[no_new_names] <- colnames(.data)[no_new_names]
}
res <- .data[, col_pos, drop = FALSE]
if (!is.null(map_names) && all(col_pos > 0L)) colnames(res) <- map_names
if (pm_has_groups(.data)) res <- pm_set_groups(res, pm_get_groups(.data))
res
}
pm_starts_with <- function(match, ignore.case = TRUE, vars = pm_peek_vars()) {
grep(pattern = paste0("^", paste0(match, collapse = "|^")), x = vars, ignore.case = ignore.case)
}
@ -1161,243 +922,18 @@ pm_select_env$get_colnames <- function() colnames(pm_select_env$.data)
pm_select_env$get_nrow <- function() nrow(pm_select_env$.data)
pm_select_env$get_ncol <- function() ncol(pm_select_env$.data)
pm_slice <- function(.data, ...) {
  pm_check_is_dataframe(.data)
  if ("grouped_data" %in% class(.data)) {
    pm_slice.grouped_data(.data, ...)
  } else {
    pm_slice.data.frame(.data, ...)
  }
}
pm_select <- function(.data, ...) {
  col_pos <- pm_select_positions(.data, ..., .group_pos = TRUE)
  map_names <- names(col_pos)
  map_names_length <- nchar(map_names)
  if (any(map_names_length == 0L)) {
    no_new_names <- which(map_names_length == 0L)
    map_names[no_new_names] <- colnames(.data)[no_new_names]
  }
  res <- .data[, col_pos, drop = FALSE]
  if (!is.null(map_names) && all(col_pos > 0L)) colnames(res) <- map_names
  if (pm_has_groups(.data)) res <- pm_set_groups(res, pm_get_groups(.data))
  res
}
pm_slice.data.frame <- function(.data, ...) {
if (nrow(.data) == 0L) return(.data)
pos <- pm_slice_positions(.data, ...)
.data[pos, , drop = FALSE]
}
pm_slice.grouped_data <- function(.data, ...) {
pm_apply_grouped_function("pm_slice", .data, drop = TRUE, ...)
}
pm_slice_head <- function(.data, ..., pm_n, prop) {
  if ("grouped_data" %in% class(.data)) {
    pm_slice_head.grouped_data(.data, ..., pm_n = pm_n, prop = prop)
  } else {
    pm_slice_head.data.frame(.data, ..., pm_n = pm_n, prop = prop)
  }
}
pm_slice_head.data.frame <- function(.data, ..., pm_n, prop) {
size <- pm_check_slice_size(pm_n, prop)
idx <- switch(
size$type,
pm_n = function(pm_n) seq2(1, min(size$pm_n, pm_n)),
prop = function(pm_n) seq2(1, min(size$prop * pm_n, pm_n))
)
pm_slice(.data, idx(AMR:::pm_n()))
}
pm_slice_head.grouped_data <- function(.data, ..., pm_n, prop) {
pm_apply_grouped_function("pm_slice_head", .data, drop = TRUE, pm_n = pm_n, prop = prop, ...)
}
pm_slice_tail <- function(.data, ..., pm_n, prop) {
  if ("grouped_data" %in% class(.data)) {
    pm_slice_tail.grouped_data(.data, ..., pm_n = pm_n, prop = prop)
  } else {
    pm_slice_tail.data.frame(.data, ..., pm_n = pm_n, prop = prop)
  }
}
pm_slice_tail.data.frame <- function(.data, ..., pm_n, prop) {
size <- pm_check_slice_size(pm_n, prop)
idx <- switch(
size$type,
pm_n = function(pm_n) seq2(max(pm_n - size$pm_n + 1, 1), pm_n),
prop = function(pm_n) seq2(max(ceiling(pm_n - size$prop * pm_n) + 1, 1), pm_n)
)
pm_slice(.data, idx(AMR:::pm_n()))
}
pm_slice_tail.grouped_data <- function(.data, ..., pm_n, prop) {
pm_apply_grouped_function("pm_slice_tail", .data, drop = TRUE, pm_n = pm_n, prop = prop, ...)
}
pm_slice_min <- function(.data, order_by, ..., pm_n, prop, with_ties = TRUE) {
  if ("grouped_data" %in% class(.data)) {
    pm_slice_min.grouped_data(.data, order_by = order_by, ..., pm_n = pm_n, prop = prop, with_ties = with_ties)
  } else {
    pm_slice_min.data.frame(.data, order_by = order_by, ..., pm_n = pm_n, prop = prop, with_ties = with_ties)
  }
}
pm_slice_min.data.frame <- function(.data, order_by, ..., pm_n, prop, with_ties = TRUE) {
if (missing(order_by)) stop("argument `order_by` is missing, with no default.")
size <- pm_check_slice_size(pm_n, prop)
idx <- if (isTRUE(with_ties)) {
switch(
size$type,
pm_n = function(x, pm_n) pm_vec_head(order(x), pm_smaller_ranks(x, size$pm_n)),
prop = function(x, pm_n) pm_vec_head(order(x), pm_smaller_ranks(x, size$prop * pm_n))
)
} else {
switch(
size$type,
pm_n = function(x, pm_n) pm_vec_head(order(x), size$pm_n),
prop = function(x, pm_n) pm_vec_head(order(x), size$prop * pm_n)
)
}
order_by <- .data[, pm_deparse_var(order_by)]
pm_slice(.data, idx(order_by, AMR:::pm_n()))
}
pm_slice_min.grouped_data <- function(.data, order_by, ..., pm_n, prop, with_ties = TRUE) {
pm_eval_env$env <- environment()
on.exit(rm(list = "env", envir = pm_eval_env), add = TRUE)
pm_apply_grouped_function(
"pm_slice_min", .data, drop = TRUE, order_by = order_by, pm_n = pm_n, prop = prop, with_ties = with_ties, ...
)
}
pm_slice_max <- function(.data, order_by, ..., pm_n, prop, with_ties = TRUE) {
  if ("grouped_data" %in% class(.data)) {
    pm_slice_max.grouped_data(.data, order_by = order_by, ..., pm_n = pm_n, prop = prop, with_ties = with_ties)
  } else {
    pm_slice_max.data.frame(.data, order_by = order_by, ..., pm_n = pm_n, prop = prop, with_ties = with_ties)
  }
}
pm_slice_max.data.frame <- function(.data, order_by, ..., pm_n, prop, with_ties = TRUE) {
if (missing(order_by)) stop("argument `order_by` is missing, with no default.")
size <- pm_check_slice_size(pm_n, prop)
idx <- if (isTRUE(with_ties)) {
switch(
size$type,
pm_n = function(x, pm_n) pm_vec_head(order(x, decreasing = TRUE), pm_smaller_ranks(pm_desc(x), size$pm_n)),
prop = function(x, pm_n) pm_vec_head(order(x, decreasing = TRUE), pm_smaller_ranks(pm_desc(x), size$prop * pm_n))
)
} else {
switch(
size$type,
pm_n = function(x, pm_n) pm_vec_head(order(x, decreasing = TRUE), size$pm_n),
prop = function(x, pm_n) pm_vec_head(order(x, decreasing = TRUE), size$prop * pm_n)
)
}
order_by <- .data[, pm_deparse_var(order_by)]
pm_slice(.data, idx(order_by, AMR:::pm_n()))
}
pm_slice_max.grouped_data <- function(.data, order_by, ..., pm_n, prop, with_ties = TRUE) {
pm_eval_env$env <- environment()
on.exit(rm(list = "env", envir = pm_eval_env), add = TRUE)
pm_apply_grouped_function(
"pm_slice_max", .data, drop = TRUE, order_by = order_by, pm_n = pm_n, prop = prop, with_ties = with_ties, ...
)
}
pm_slice_sample <- function(.data, ..., pm_n, prop, weight_by = NULL, replace = FALSE) {
  if ("grouped_data" %in% class(.data)) {
    pm_slice_sample.grouped_data(.data, ..., pm_n = pm_n, prop = prop, weight_by = weight_by, replace = replace)
  } else {
    pm_slice_sample.data.frame(.data, ..., pm_n = pm_n, prop = prop, weight_by = weight_by, replace = replace)
  }
}
pm_slice_sample.data.frame <- function(.data, ..., pm_n, prop, weight_by = NULL, replace = FALSE) {
size <- pm_check_slice_size(pm_n, prop)
idx <- switch(
size$type,
pm_n = function(x, pm_n) pm_sample_int(pm_n, size$pm_n, replace = replace, wt = x),
prop = function(x, pm_n) pm_sample_int(pm_n, size$prop * pm_n, replace = replace, wt = x),
)
weight_by <- pm_deparse_var(weight_by)
if (!is.null(weight_by)) weight_by <- .data[, weight_by]
pm_slice(.data, idx(weight_by, AMR:::pm_n()))
}
pm_slice_sample.grouped_data <- function(.data, ..., pm_n, prop, weight_by = NULL, replace = FALSE) {
pm_eval_env$env <- environment()
on.exit(rm(list = "env", envir = pm_eval_env), add = TRUE)
pm_apply_grouped_function(
"pm_slice_sample", .data, drop = TRUE, pm_n = pm_n, prop = prop, weight_by = weight_by, replace = replace, ...
)
}
# helpers ----------------------------------------------------------------------
pm_slice_positions <- function(.data, ...) {
conditions <- pm_dotdotdot(...)
pm_context$setup(.data)
on.exit(pm_context$clean(), add = TRUE)
if (length(conditions) == 0L) return(seq_len(pm_n()))
frame <- parent.frame(2L)
rows <- lapply(
conditions,
function(cond, frame) {
res <- eval(cond, pm_context$.data, frame)
if (is.logical(res) && all(is.na(res))) {
res <- integer()
} else if (is.numeric(res)) {
res <- as.integer(res)
} else if (!is.integer(res)) {
stop("`pm_slice()` expressions should return indices (positive or negative integers).")
}
},
frame = frame
)
rows <- do.call(c, rows)
if (length(rows) == 0L) {
# do nothing
} else if (all(rows >= 0, na.rm = TRUE)) {
rows <- rows[!is.na(rows) & rows <= pm_n() & rows > 0]
} else if (all(rows <= 0, na.rm = TRUE)) {
rows <- setdiff(seq_len(pm_n()), -rows)
} else {
stop("`pm_slice()` expressions should return either all positive or all negative.")
}
rows
}
pm_check_slice_size <- function(pm_n, prop) {
if (missing(pm_n) && missing(prop)) {
list(type = "pm_n", pm_n = 1L)
} else if (!missing(pm_n) && missing(prop)) {
if (!is.numeric(pm_n) || length(pm_n) != 1) {
stop("`pm_n` must be a single number.")
}
if (is.na(pm_n) || pm_n < 0) {
stop("`pm_n` must be a non-missing positive number.")
}
list(type = "pm_n", pm_n = pm_n)
} else if (!missing(prop) && missing(pm_n)) {
if (!is.numeric(prop) || length(prop) != 1) {
stop("`prop` must be a single number.")
}
if (is.na(prop) || prop < 0) {
stop("`prop` must be a non-missing positive number.")
}
list(type = "prop", prop = prop)
} else {
stop("Must supply exactly one of `pm_n` and `prop` arguments.")
}
}
pm_sample_int <- function(pm_n, size, replace = FALSE, wt = NULL) {
if (isTRUE(replace)) {
sample.int(pm_n, size, prob = wt, replace = TRUE)
} else {
sample.int(pm_n, min(size, pm_n), prob = wt)
}
}
pm_smaller_ranks <- function(x, y) {
  sum(pm_min_rank(x) <= y, na.rm = TRUE)
}
pm_summarise <- function(.data, ...) {
pm_check_is_dataframe(.data)

R/ab.R
View File

@ -104,7 +104,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = TRUE, ...) {
x_bak_clean <- generalise_antibiotic_name(x_bak_clean)
}
x <- unique(x_bak_clean)
x <- unique(x_bak_clean) # every element of x has already been through generalise_antibiotic_name(), so it needs no re-generalising below
x_new <- rep(NA_character_, length(x))
x_unknown <- character(0)
@ -174,7 +174,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = TRUE, ...) {
# exact LOINC code
loinc_found <- unlist(lapply(AB_lookup$generalised_loinc,
function(s) generalise_antibiotic_name(x[i]) %in% s))
function(s) x[i] %in% s))
found <- antibiotics$ab[loinc_found == TRUE]
if (length(found) > 0) {
x_new[i] <- note_if_more_than_one_found(found, i, from_text)
@ -183,7 +183,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = TRUE, ...) {
# exact synonym
synonym_found <- unlist(lapply(AB_lookup$generalised_synonyms,
function(s) generalise_antibiotic_name(x[i]) %in% s))
function(s) x[i] %in% s))
found <- antibiotics$ab[synonym_found == TRUE]
if (length(found) > 0) {
x_new[i] <- note_if_more_than_one_found(found, i, from_text)
@ -192,7 +192,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = TRUE, ...) {
# exact abbreviation
abbr_found <- unlist(lapply(AB_lookup$generalised_abbreviations,
function(s) generalise_antibiotic_name(x[i]) %in% s))
function(s) x[i] %in% s))
found <- antibiotics$ab[abbr_found == TRUE]
if (length(found) > 0) {
x_new[i] <- note_if_more_than_one_found(found, i, from_text)
@ -244,7 +244,7 @@ as.ab <- function(x, flag_multiple_results = TRUE, info = TRUE, ...) {
# and try if any synonym starts with it
synonym_found <- unlist(lapply(AB_lookup$generalised_synonyms,
function(s) any(generalise_antibiotic_name(s) %like% paste0("^", x_spelling))))
function(s) any(s %like% paste0("^", x_spelling))))
found <- antibiotics$ab[synonym_found == TRUE]
if (length(found) > 0) {
x_new[i] <- note_if_more_than_one_found(found, i, from_text)
@ -532,7 +532,7 @@ generalise_antibiotic_name <- function(x) {
# remove part between brackets if that's followed by another string
x <- gsub("(.*)+ [(].*[)]", "\\1", x)
# keep only max 1 space
x <- trimws(gsub(" +", " ", x))
x <- trimws2(gsub(" +", " ", x))
# non-character, space or number should be a slash
x <- gsub("[^A-Z0-9 -]", "/", x)
# spaces around non-characters must be removed: amox + clav -> amox/clav

View File

@ -199,7 +199,7 @@ format.bug_drug_combinations <- function(x,
select_ab_vars() %pm>%
pm_arrange(ab_group, ab_txt)
y <- y %pm>%
create_var(ab_group = ifelse(y$ab_group != lag(y$ab_group) | is.na(pm_lag(y$ab_group)), y$ab_group, ""))
create_var(ab_group = ifelse(y$ab_group != pm_lag(y$ab_group) | is.na(pm_lag(y$ab_group)), y$ab_group, ""))
if (add_ab_group == FALSE) {
y <- y %pm>%

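The corrected pm_lag() call above blanks an antibiotic group label when it repeats the previous row's label; a minimal standalone sketch of that pattern, with made-up group names and base R standing in for pm_lag():
ab_group <- c("Penicillins", "Penicillins", "Cephalosporins", "Cephalosporins", "Macrolides")
lagged <- c(NA, ab_group[-length(ab_group)])  # mimics pm_lag(ab_group)
ifelse(ab_group != lagged | is.na(lagged), ab_group, "")
# "Penicillins" "" "Cephalosporins" "" "Macrolides"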
R/mo.R
View File

@ -109,10 +109,10 @@
#' 3. The level of uncertainty \eqn{U} needed to get to the result, as stated above (1 to 3);
#' 4. The [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) \eqn{L} is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \eqn{L'} based on the text length of the full name \eqn{F} is calculated as:
#'
#' \deqn{L' = F - \frac{0.5 \times L}{F}}{L' = F - (0.5 * L) / F}
#' \deqn{L' = \frac{F - 0.5L}{F}}{L' = (F - 0.5L) / F}
#'
#' The final matching score \eqn{M} is calculated as:
#' \deqn{M = L' \times \frac{1}{P \times K} * \frac{1}{U}}{M = L' * (1 / (P * K)) * (1 / U)}
#' \deqn{M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}}{M = L' * (1 / (P * K * U)) = (F - 0.5L) / (F * P * K * U)}
#'
#' All matches are sorted descending on their matching score and for all user input values, the top match will be returned.
#' @inheritSection catalogue_of_life Catalogue of Life
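As a worked illustration of this formula, the prevalence, kingdom index and uncertainty values below are assumed, since the real ones come from the package's MO_lookup data and from as.mo():
# worked example of M = (F - 0.5 L) / (F * P * K * U), with assumed P, K and U
input    <- "E. coli"
fullname <- "Escherichia coli"
var_F <- nchar(fullname)                                       # F = 16
var_L <- min(as.double(utils::adist(input, fullname)), var_F)  # Levenshtein distance, capped at F
var_P <- 1  # assumed prevalence (1 to 3)
var_K <- 1  # assumed kingdom index (Bacteria = 1)
var_U <- 2  # assumed uncertainty level (1 to 3)
(var_F - 0.5 * var_L) / (var_F * var_P * var_K * var_U)
# (16 - 0.5 * 10) / (16 * 1 * 1 * 2) = 11 / 32, about 0.34, under these assumptions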

View File

@ -32,10 +32,10 @@
#' 3. The level of uncertainty \eqn{U} that is needed to get to a result (1 to 3, see [as.mo()]);
#' 4. The [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) \eqn{L} is the distance between the user input and all taxonomic full names, with the text length of the user input being the maximum distance. A modified version of the Levenshtein distance \eqn{L'} based on the text length of the full name \eqn{F} is calculated as:
#'
#' \deqn{L' = F - \frac{0.5 \times L}{F}}{L' = F - (0.5 * L) / F}
#' \deqn{L' = \frac{F - 0.5L}{F}}{L' = (F - 0.5L) / F}
#'
#' The final matching score \eqn{M} is calculated as:
#' \deqn{M = L' \times \frac{1}{P \times K} * \frac{1}{U}}{M = L' * (1 / (P * K)) * (1 / U)}
#' \deqn{M = L' \times \frac{1}{P K U} = \frac{F - 0.5L}{F P K U}}{M = L' * (1 / (P * K * U)) = (F - 0.5L) / (F * P * K * U)}
#'
#' @export
#' @examples
@ -55,9 +55,18 @@ mo_matching_score <- function(x, fullname, uncertainty = 1) {
levenshtein[i] <- min(as.double(utils::adist(x[i], fullname[i], ignore.case = FALSE)),
nchar(fullname[i]))
}
# self-made score between 0 and 1 (for % certainty, so 0 means huge distance, 1 means no distance)
dist <- (nchar(fullname) - 0.5 * levenshtein) / nchar(fullname)
prevalence_kingdom_index <- tryCatch(MO_lookup[match(fullname, MO_lookup$fullname), "prevalence_kingdom_index", drop = TRUE],
error = function(e) rep(1, length(fullname)))
dist * (1 / prevalence_kingdom_index) * (1 / uncertainty)
# F = length of fullname
var_F <- nchar(fullname)
# L = modified Levenshtein distance
var_L <- levenshtein
# P = Prevalence (1 to 3)
var_P <- MO_lookup[match(fullname, MO_lookup$fullname), "prevalence", drop = TRUE]
# K = kingdom index (Bacteria = 1, Fungi = 2, Protozoa = 3, Archaea = 4, others = 5)
var_K <- MO_lookup[match(fullname, MO_lookup$fullname), "kingdom_index", drop = TRUE]
# U = uncertainty level (1 to 3), as per as.mo()
var_U <- uncertainty
# matching score:
(var_F - 0.5 * var_L) / (var_F * var_P * var_K * var_U)
}
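A brief usage sketch of the rewritten function; the exact score depends on the prevalence and kingdom index stored in MO_lookup, so no value is asserted here:
mo_matching_score(x = "S. aureus", fullname = "Staphylococcus aureus", uncertainty = 1)
# as.mo() sorts all candidate full names by this score and returns the top match per input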

View File

@ -268,7 +268,7 @@ rsi_calc_df <- function(type, # "proportion", "count" or "both"
res <- do.call(rbind, unname(lapply(grouped, fn, ...)))
if (any(groups %in% colnames(res))) {
class(res) <- c("grouped_data", class(res))
res <- set_groups(res, groups[groups %in% colnames(res)])
res <- pm_set_groups(res, groups[groups %in% colnames(res)])
}
res
}

View File

@ -84,8 +84,6 @@ create_MO_lookup <- function() {
# all the rest
MO_lookup[which(is.na(MO_lookup$kingdom_index)), "kingdom_index"] <- 5
MO_lookup$prevalence_kingdom_index <- MO_lookup$prevalence * MO_lookup$kingdom_index
# use this paste instead of `fullname` to work with Viridans Group Streptococci, etc.
MO_lookup$fullname_lower <- tolower(trimws(paste(MO_lookup$genus,
MO_lookup$species,