AMR/R/join_microorganisms.R

257 lines
9.3 KiB
R
Raw Normal View History

# ==================================================================== #
# TITLE #
# Antimicrobial Resistance (AMR) Analysis #
# #
2019-01-02 23:24:07 +01:00
# SOURCE #
2020-07-08 14:48:06 +02:00
# https://github.com/msberends/AMR #
# #
# LICENCE #
# (c) 2018-2020 Berends MS, Luz CF et al. #
# #
2019-01-02 23:24:07 +01:00
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
# #
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-07-08 14:48:06 +02:00
# Visit our website for more info: https://msberends.github.io/AMR. #
# ==================================================================== #
2020-05-16 13:05:47 +02:00
#' Join [microorganisms] to a data set
2018-02-21 11:52:31 +01:00
#'
#' Join the data set [microorganisms] easily to an existing table or character vector.
#' @inheritSection lifecycle Stable lifecycle
2018-02-21 11:52:31 +01:00
#' @rdname join
#' @name join
#' @aliases join inner_join
2018-10-12 16:35:18 +02:00
#' @param x existing table to join, or character vector
2020-09-26 16:26:01 +02:00
#' @param by a variable to join by - if left empty will search for a column with class [`mo`] (created with [as.mo()]) or will be `"mo"` if that column name exists in `x`, could otherwise be a column name of `x` with values that exist in `microorganisms$mo` (like `by = "bacteria_id"`), or another column in [microorganisms] (but then it should be named, like `by = c("bacteria_id" = "fullname")`)
#' @param suffix if there are non-joined duplicate variables in `x` and `y`, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2.
2020-06-17 21:35:10 +02:00
#' @param ... ignored
#' @details **Note:** As opposed to the `join()` functions of `dplyr`, [character] vectors are supported and at default existing columns will get a suffix `"2"` and the newly joined columns will not get a suffix.
2020-05-16 13:05:47 +02:00
#'
2020-09-26 16:26:01 +02:00
#' If the `dplyr` package is installed, their join functions will be used. Otherwise, the much slower [merge()] function from base R will be used.
2019-01-02 23:24:07 +01:00
#' @inheritSection AMR Read more on our website!
2018-02-21 11:52:31 +01:00
#' @export
2018-04-02 11:11:21 +02:00
#' @examples
2018-10-12 16:35:18 +02:00
#' left_join_microorganisms(as.mo("K. pneumoniae"))
#' left_join_microorganisms("B_KLBSL_PNE")
2018-04-02 11:11:21 +02:00
#'
#' \donttest{
#' if (require("dplyr")) {
#' example_isolates %>%
#' left_join_microorganisms() %>%
#' colnames()
#'
#' df <- data.frame(date = seq(from = as.Date("2018-01-01"),
#' to = as.Date("2018-01-07"),
#' by = 1),
#' bacteria = as.mo(c("S. aureus", "MRSA", "MSSA", "STAAUR",
#' "E. coli", "E. coli", "E. coli")),
#' stringsAsFactors = FALSE)
#' colnames(df)
#' df_joined <- left_join_microorganisms(df, "bacteria")
#' colnames(df_joined)
#' }
2020-05-16 13:05:47 +02:00
#' }
2018-10-12 16:35:18 +02:00
inner_join_microorganisms <- function(x, by = NULL, suffix = c("2", ""), ...) {
2020-03-14 14:05:43 +01:00
check_dataset_integrity()
2020-06-03 11:48:00 +02:00
check_groups_before_join(x, "inner_join_microorganisms")
2018-10-12 16:35:18 +02:00
checked <- joins_check_df(x, by)
2020-06-03 14:33:55 +02:00
x_class <- get_prejoined_class(x)
2018-10-12 16:35:18 +02:00
x <- checked$x
by <- checked$by
2020-09-26 16:26:01 +02:00
# use dplyr if available - it's much faster
dplyr_inner <- import_fn("inner_join", "dplyr", error_on_fail = FALSE)
if (!is.null(dplyr_inner)) {
join <- suppressWarnings(
dplyr_inner(x = x, y = microorganisms, by = by, suffix = suffix, ...)
)
} else {
join <- suppressWarnings(
pm_inner_join(x = x, y = microorganisms, by = by, suffix = suffix, ...)
)
}
2020-03-14 14:05:43 +01:00
if (NROW(join) > NROW(x)) {
2019-10-11 17:21:02 +02:00
warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.")
2018-02-21 11:52:31 +01:00
}
2020-06-03 11:48:00 +02:00
class(join) <- x_class
2018-02-21 11:52:31 +01:00
join
}
#' @rdname join
#' @export
2018-10-12 16:35:18 +02:00
left_join_microorganisms <- function(x, by = NULL, suffix = c("2", ""), ...) {
2020-03-14 14:05:43 +01:00
check_dataset_integrity()
2020-06-03 11:48:00 +02:00
check_groups_before_join(x, "left_join_microorganisms")
2018-10-12 16:35:18 +02:00
checked <- joins_check_df(x, by)
2020-06-03 14:33:55 +02:00
x_class <- get_prejoined_class(x)
2018-10-12 16:35:18 +02:00
x <- checked$x
by <- checked$by
2020-09-26 16:26:01 +02:00
# use dplyr if available - it's much faster
dplyr_left <- import_fn("left_join", "dplyr", error_on_fail = FALSE)
if (!is.null(dplyr_left)) {
join <- suppressWarnings(
dplyr_left(x = x, y = microorganisms, by = by, suffix = suffix, ...)
)
} else {
join <- suppressWarnings(
pm_left_join(x = x, y = microorganisms, by = by, suffix = suffix, ...)
)
}
2020-03-14 14:05:43 +01:00
if (NROW(join) > NROW(x)) {
2019-10-11 17:21:02 +02:00
warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.")
2018-02-21 11:52:31 +01:00
}
2020-06-03 11:48:00 +02:00
class(join) <- x_class
2018-02-21 11:52:31 +01:00
join
}
#' @rdname join
#' @export
2018-10-12 16:35:18 +02:00
right_join_microorganisms <- function(x, by = NULL, suffix = c("2", ""), ...) {
2020-03-14 14:05:43 +01:00
check_dataset_integrity()
2020-06-03 11:48:00 +02:00
check_groups_before_join(x, "right_join_microorganisms")
2018-10-12 16:35:18 +02:00
checked <- joins_check_df(x, by)
2020-06-03 14:33:55 +02:00
x_class <- get_prejoined_class(x)
2018-10-12 16:35:18 +02:00
x <- checked$x
by <- checked$by
2020-09-26 16:26:01 +02:00
# use dplyr if available - it's much faster
dplyr_right <- import_fn("right_join", "dplyr", error_on_fail = FALSE)
if (!is.null(dplyr_right)) {
join <- suppressWarnings(
dplyr_right(x = x, y = microorganisms, by = by, suffix = suffix, ...)
)
} else {
join <- suppressWarnings(
pm_right_join(x = x, y = microorganisms, by = by, suffix = suffix, ...)
)
}
2020-03-14 14:05:43 +01:00
if (NROW(join) > NROW(x)) {
2019-10-11 17:21:02 +02:00
warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.")
2018-02-21 11:52:31 +01:00
}
2020-06-03 11:48:00 +02:00
class(join) <- x_class
2018-02-21 11:52:31 +01:00
join
}
#' @rdname join
#' @export
2018-10-12 16:35:18 +02:00
full_join_microorganisms <- function(x, by = NULL, suffix = c("2", ""), ...) {
2020-03-14 14:05:43 +01:00
check_dataset_integrity()
2020-06-03 11:48:00 +02:00
check_groups_before_join(x, "full_join_microorganisms")
2018-10-12 16:35:18 +02:00
checked <- joins_check_df(x, by)
2020-06-03 14:33:55 +02:00
x_class <- get_prejoined_class(x)
2018-10-12 16:35:18 +02:00
x <- checked$x
by <- checked$by
2020-09-26 16:26:01 +02:00
# use dplyr if available - it's much faster
dplyr_full <- import_fn("full_join", "dplyr", error_on_fail = FALSE)
if (!is.null(dplyr_full)) {
join <- suppressWarnings(
dplyr_full(x = x, y = microorganisms, by = by, suffix = suffix, ...)
)
} else {
join <- suppressWarnings(
pm_full_join(x = x, y = microorganisms, by = by, suffix = suffix, ...)
)
}
2020-03-14 14:05:43 +01:00
if (NROW(join) > NROW(x)) {
2019-10-11 17:21:02 +02:00
warning("The newly joined tbl contains ", nrow(join) - nrow(x), " rows more that its original.")
2018-03-27 17:43:42 +02:00
}
2020-06-03 11:48:00 +02:00
class(join) <- x_class
2018-03-27 17:43:42 +02:00
join
2018-02-21 11:52:31 +01:00
}
#' @rdname join
#' @export
2018-10-12 16:35:18 +02:00
semi_join_microorganisms <- function(x, by = NULL, ...) {
2020-03-14 14:05:43 +01:00
check_dataset_integrity()
2020-06-03 11:48:00 +02:00
check_groups_before_join(x, "semi_join_microorganisms")
2020-06-03 14:33:55 +02:00
x_class <- get_prejoined_class(x)
2018-10-12 16:35:18 +02:00
checked <- joins_check_df(x, by)
x <- checked$x
by <- checked$by
2020-09-26 16:26:01 +02:00
# use dplyr if available - it's much faster
dplyr_semi <- import_fn("semi_join", "dplyr", error_on_fail = FALSE)
if (!is.null(dplyr_semi)) {
join <- suppressWarnings(
2020-09-28 11:00:59 +02:00
dplyr_semi(x = x, y = microorganisms, by = by, ...)
2020-09-26 16:26:01 +02:00
)
} else {
join <- suppressWarnings(
2020-09-28 11:00:59 +02:00
pm_semi_join(x = x, y = microorganisms, by = by, ...)
2020-09-26 16:26:01 +02:00
)
}
2020-06-03 11:48:00 +02:00
class(join) <- x_class
join
2018-02-21 11:52:31 +01:00
}
#' @rdname join
#' @export
2018-10-12 16:35:18 +02:00
anti_join_microorganisms <- function(x, by = NULL, ...) {
2020-03-14 14:05:43 +01:00
check_dataset_integrity()
2020-06-03 11:48:00 +02:00
check_groups_before_join(x, "anti_join_microorganisms")
2018-10-12 16:35:18 +02:00
checked <- joins_check_df(x, by)
2020-06-03 14:33:55 +02:00
x_class <- get_prejoined_class(x)
2018-10-12 16:35:18 +02:00
x <- checked$x
by <- checked$by
2020-09-26 16:26:01 +02:00
# use dplyr if available - it's much faster
dplyr_anti <- import_fn("anti_join", "dplyr", error_on_fail = FALSE)
if (!is.null(dplyr_anti)) {
join <- suppressWarnings(
2020-09-28 11:00:59 +02:00
dplyr_anti(x = x, y = microorganisms, by = by, ...)
2020-09-26 16:26:01 +02:00
)
} else {
join <- suppressWarnings(
2020-09-28 11:00:59 +02:00
pm_anti_join(x = x, y = microorganisms, by = by, ...)
2020-09-26 16:26:01 +02:00
)
}
2020-06-03 11:48:00 +02:00
class(join) <- x_class
join
2018-10-12 16:35:18 +02:00
}
joins_check_df <- function(x, by) {
if (!any(class(x) %in% c("data.frame", "matrix"))) {
2020-03-14 14:05:43 +01:00
x <- data.frame(mo = as.mo(x), stringsAsFactors = FALSE)
2018-10-12 16:35:18 +02:00
if (is.null(by)) {
by <- "mo"
}
}
2020-06-03 11:48:00 +02:00
x <- as.data.frame(x, stringsAsFactors = FALSE)
2018-10-12 16:35:18 +02:00
if (is.null(by)) {
# search for column with class `mo` and return first one found
by <- colnames(x)[lapply(x, is.mo) == TRUE][1]
if (is.na(by)) {
if ("mo" %in% colnames(x)) {
by <- "mo"
2020-03-14 14:05:43 +01:00
x[, "mo"] <- as.mo(x[, "mo"])
2018-10-12 16:35:18 +02:00
} else {
stop("Cannot join - no column found with name 'mo' or with class <mo>.", call. = FALSE)
2018-10-12 16:35:18 +02:00
}
}
message('Joining, by = "', by, '"') # message same as dplyr::join functions
}
2018-02-21 11:52:31 +01:00
if (is.null(names(by))) {
2020-02-14 19:54:13 +01:00
joinby <- colnames(microorganisms)[1]
2018-02-21 11:52:31 +01:00
names(joinby) <- by
} else {
joinby <- by
}
2018-10-12 16:35:18 +02:00
list(x = x,
by = joinby)
2018-02-21 11:52:31 +01:00
}
2020-06-03 11:48:00 +02:00
2020-06-03 14:33:55 +02:00
get_prejoined_class <- function(x) {
if (is.data.frame(x)) {
class(x)
} else {
"data.frame"
}
}
2020-06-03 11:48:00 +02:00
check_groups_before_join <- function(x, fn) {
if (is.data.frame(x) && !is.null(attributes(x)$groups)) {
warning("Groups are dropped, since the ", fn, "() function relies on merge() from base R, not on join() from dplyr.", call. = FALSE)
}
}