2018-12-16 22:45:12 +01:00
# ==================================================================== #
# TITLE #
2021-02-02 23:57:35 +01:00
# Antimicrobial Resistance (AMR) Data Analysis for R #
2018-12-16 22:45:12 +01:00
# #
2019-01-02 23:24:07 +01:00
# SOURCE #
2020-07-08 14:48:06 +02:00
# https://github.com/msberends/AMR #
2018-12-16 22:45:12 +01:00
# #
# LICENCE #
2021-12-23 18:56:28 +01:00
# (c) 2018-2022 Berends MS, Luz CF et al. #
2020-10-08 11:16:03 +02:00
# Developed at the University of Groningen, the Netherlands, in #
# collaboration with non-profit organisations Certe Medical #
# Diagnostics & Advice, and University Medical Center Groningen. #
2018-12-16 22:45:12 +01:00
# #
2019-01-02 23:24:07 +01:00
# This R package is free software; you can freely use and distribute #
# it for both personal and commercial purposes under the terms of the #
# GNU General Public License version 2.0 (GNU GPL-2), as published by #
# the Free Software Foundation. #
2020-01-05 17:22:09 +01:00
# We created this package for both routine data analysis and academic #
# research and it was publicly released in the hope that it will be #
# useful, but it comes WITHOUT ANY WARRANTY OR LIABILITY. #
2020-10-08 11:16:03 +02:00
# #
# Visit our website for the full manual and a complete tutorial about #
2021-02-02 23:57:35 +01:00
# how to conduct AMR data analysis: https://msberends.github.io/AMR/ #
2018-12-16 22:45:12 +01:00
# ==================================================================== #
2021-01-18 16:57:56 +01:00
#' Join [microorganisms] to a Data Set
2018-02-21 11:52:31 +01:00
#'
2021-05-12 18:15:03 +02:00
#' Join the data set [microorganisms] easily to an existing data set or to a [character] vector.
2018-02-21 11:52:31 +01:00
#' @rdname join
#' @name join
#' @aliases join inner_join
2021-05-12 18:15:03 +02:00
#' @param x existing data set to join, or [character] vector. In case of a [character] vector, the resulting [data.frame] will contain a column 'x' with these values.
2020-12-17 16:22:25 +01:00
#' @param by a variable to join by - if left empty will search for a column with class [`mo`] (created with [as.mo()]) or will be `"mo"` if that column name exists in `x`, could otherwise be a column name of `x` with values that exist in `microorganisms$mo` (such as `by = "bacteria_id"`), or another column in [microorganisms] (but then it should be named, like `by = c("bacteria_id" = "fullname")`)
2021-05-12 18:15:03 +02:00
#' @param suffix if there are non-joined duplicate variables in `x` and `y`, these suffixes will be added to the output to disambiguate them. Should be a [character] vector of length 2.
2021-04-29 17:16:30 +02:00
#' @param ... ignored, only in place to allow future extensions
2020-09-18 16:05:53 +02:00
#' @details **Note:** As opposed to the `join()` functions of `dplyr`, [character] vectors are supported and at default existing columns will get a suffix `"2"` and the newly joined columns will not get a suffix.
2020-05-16 13:05:47 +02:00
#'
2021-06-22 12:16:42 +02:00
#' If the `dplyr` package is installed, their join functions will be used. Otherwise, the much slower [merge()] and [interaction()] functions from base \R will be used.
2021-05-12 18:15:03 +02:00
#' @return a [data.frame]
2018-02-21 11:52:31 +01:00
#' @export
2018-04-02 11:11:21 +02:00
#' @examples
2018-10-12 16:35:18 +02:00
#' left_join_microorganisms(as.mo("K. pneumoniae"))
2021-05-12 18:15:03 +02:00
#' left_join_microorganisms("B_KLBSL_PNMN")
2018-04-02 11:11:21 +02:00
#'
2020-09-29 23:35:46 +02:00
#' \donttest{
#' if (require("dplyr")) {
#' example_isolates %>%
#' left_join_microorganisms() %>%
#' colnames()
#'
#' df <- data.frame(date = seq(from = as.Date("2018-01-01"),
#' to = as.Date("2018-01-07"),
#' by = 1),
#' bacteria = as.mo(c("S. aureus", "MRSA", "MSSA", "STAAUR",
#' "E. coli", "E. coli", "E. coli")),
#' stringsAsFactors = FALSE)
#' colnames(df)
#' df_joined <- left_join_microorganisms(df, "bacteria")
#' colnames(df_joined)
#' }
2020-05-16 13:05:47 +02:00
#' }
2018-10-12 16:35:18 +02:00
inner_join_microorganisms <- function ( x , by = NULL , suffix = c ( " 2" , " " ) , ... ) {
2020-10-19 17:09:19 +02:00
meet_criteria ( x , allow_class = c ( " data.frame" , " character" ) )
meet_criteria ( by , allow_class = " character" , allow_NULL = TRUE )
meet_criteria ( suffix , allow_class = " character" , has_length = 2 )
2021-05-12 18:15:03 +02:00
join_microorganisms ( type = " inner_join" , x = x , by = by , suffix = suffix , ... )
2018-02-21 11:52:31 +01:00
}
#' @rdname join
#' @export
2018-10-12 16:35:18 +02:00
left_join_microorganisms <- function ( x , by = NULL , suffix = c ( " 2" , " " ) , ... ) {
2020-10-19 17:09:19 +02:00
meet_criteria ( x , allow_class = c ( " data.frame" , " character" ) )
meet_criteria ( by , allow_class = " character" , allow_NULL = TRUE )
meet_criteria ( suffix , allow_class = " character" , has_length = 2 )
2021-05-12 18:15:03 +02:00
join_microorganisms ( type = " left_join" , x = x , by = by , suffix = suffix , ... )
2018-02-21 11:52:31 +01:00
}
#' @rdname join
#' @export
2018-10-12 16:35:18 +02:00
right_join_microorganisms <- function ( x , by = NULL , suffix = c ( " 2" , " " ) , ... ) {
2020-10-19 17:09:19 +02:00
meet_criteria ( x , allow_class = c ( " data.frame" , " character" ) )
meet_criteria ( by , allow_class = " character" , allow_NULL = TRUE )
meet_criteria ( suffix , allow_class = " character" , has_length = 2 )
2021-05-12 18:15:03 +02:00
join_microorganisms ( type = " right_join" , x = x , by = by , suffix = suffix , ... )
2018-02-21 11:52:31 +01:00
}
#' @rdname join
#' @export
2018-10-12 16:35:18 +02:00
full_join_microorganisms <- function ( x , by = NULL , suffix = c ( " 2" , " " ) , ... ) {
2020-10-19 17:09:19 +02:00
meet_criteria ( x , allow_class = c ( " data.frame" , " character" ) )
meet_criteria ( by , allow_class = " character" , allow_NULL = TRUE )
meet_criteria ( suffix , allow_class = " character" , has_length = 2 )
2021-05-12 18:15:03 +02:00
join_microorganisms ( type = " full_join" , x = x , by = by , suffix = suffix , ... )
2018-02-21 11:52:31 +01:00
}
#' @rdname join
#' @export
2018-10-12 16:35:18 +02:00
semi_join_microorganisms <- function ( x , by = NULL , ... ) {
2020-10-19 17:09:19 +02:00
meet_criteria ( x , allow_class = c ( " data.frame" , " character" ) )
meet_criteria ( by , allow_class = " character" , allow_NULL = TRUE )
2021-05-13 15:56:12 +02:00
join_microorganisms ( type = " semi_join" , x = x , by = by , ... )
2018-02-21 11:52:31 +01:00
}
#' @rdname join
#' @export
2018-10-12 16:35:18 +02:00
anti_join_microorganisms <- function ( x , by = NULL , ... ) {
2020-10-19 17:09:19 +02:00
meet_criteria ( x , allow_class = c ( " data.frame" , " character" ) )
meet_criteria ( by , allow_class = " character" , allow_NULL = TRUE )
2021-05-13 15:56:12 +02:00
join_microorganisms ( type = " anti_join" , x = x , by = by , ... )
2018-10-12 16:35:18 +02:00
}
2021-05-12 18:15:03 +02:00
join_microorganisms <- function ( type , x , by , suffix , ... ) {
check_dataset_integrity ( )
if ( ! is.data.frame ( x ) ) {
x <- data.frame ( mo = x , stringsAsFactors = FALSE )
by <- " mo"
2018-10-12 16:35:18 +02:00
}
if ( is.null ( by ) ) {
2021-05-12 18:15:03 +02:00
by <- search_type_in_df ( x , " mo" , info = FALSE )
2021-05-13 15:56:12 +02:00
if ( is.null ( by ) && NCOL ( x ) == 1 ) {
by <- colnames ( x ) [1L ]
} else {
stop_if ( is.null ( by ) , " no column with microorganism names or codes found, set this column with `by`" , call = -2 )
}
message_ ( ' Joining, by = "' , by , ' "' , add_fn = font_black , as_note = FALSE ) # message same as dplyr::join functions
2021-05-12 18:15:03 +02:00
}
if ( ! all ( x [ , by , drop = TRUE ] %in% MO_lookup $ mo , na.rm = TRUE ) ) {
x $ join.mo <- as.mo ( x [ , by , drop = TRUE ] )
by <- c ( " join.mo" = " mo" )
} else {
x [ , by ] <- as.mo ( x [ , by , drop = TRUE ] )
2018-03-19 12:43:22 +01:00
}
2021-05-12 18:15:03 +02:00
2018-02-21 11:52:31 +01:00
if ( is.null ( names ( by ) ) ) {
2021-05-12 18:15:03 +02:00
# will always be joined to microorganisms$mo, so add name to that
by <- stats :: setNames ( " mo" , by )
}
# use dplyr if available - it's much faster than poorman alternatives
dplyr_join <- import_fn ( name = type , pkg = " dplyr" , error_on_fail = FALSE )
if ( ! is.null ( dplyr_join ) ) {
join_fn <- dplyr_join
2018-02-21 11:52:31 +01:00
} else {
2021-05-12 18:15:03 +02:00
# otherwise use poorman, see R/aa_helper_pm_functions.R
join_fn <- get ( paste0 ( " pm_" , type ) , envir = asNamespace ( " AMR" ) )
2018-02-21 11:52:31 +01:00
}
2021-05-12 18:15:03 +02:00
if ( type %like% " full|left|right|inner" ) {
joined <- join_fn ( x = x , y = AMR :: microorganisms , by = by , suffix = suffix , ... )
2020-06-03 14:33:55 +02:00
} else {
2021-05-12 18:15:03 +02:00
joined <- join_fn ( x = x , y = AMR :: microorganisms , by = by , ... )
2020-06-03 14:33:55 +02:00
}
2021-05-12 18:15:03 +02:00
if ( " join.mo" %in% colnames ( joined ) ) {
if ( " mo" %in% colnames ( joined ) ) {
ind_mo <- which ( colnames ( joined ) %in% c ( " mo" , " join.mo" ) )
colnames ( joined ) [ind_mo [1L ] ] <- paste0 ( " mo" , suffix [1L ] )
colnames ( joined ) [ind_mo [2L ] ] <- paste0 ( " mo" , suffix [2L ] )
} else {
colnames ( joined ) [colnames ( joined ) == " join.mo" ] <- " mo"
}
2020-06-03 11:48:00 +02:00
}
2021-05-12 18:15:03 +02:00
if ( type %like% " full|left|right|inner" && NROW ( joined ) > NROW ( x ) ) {
2022-03-02 15:38:55 +01:00
warning_ ( " in `" , type , " _join()`: the newly joined data set contains " , nrow ( joined ) - nrow ( x ) , " rows more than the number of rows of `x`." )
2021-05-12 18:15:03 +02:00
}
joined
2020-06-03 11:48:00 +02:00
}