1
0
mirror of https://github.com/msberends/AMR.git synced 2025-07-13 02:32:07 +02:00

fix missing R breakpoints

This commit is contained in:
2022-10-29 14:15:23 +02:00
parent c2801ba7a1
commit 6ad7857d39
34 changed files with 959 additions and 865 deletions

View File

@ -592,18 +592,61 @@ for (i in 2:6) {
status = "accepted",
source = "manually added"
) %>%
filter(!paste(kingdom, .[[ncol(.) - 4]], rank) %in% paste(taxonomy$kingdom, taxonomy[[i + 1]], taxonomy$rank)) %>%
filter(!paste(kingdom, .[[ncol(.) - 4]], rank) %in% paste(taxonomy$kingdom, taxonomy[[i + 1]], taxonomy$rank))# %>%
# get GBIF identifier where available
left_join(current_gbif %>%
select(kingdom, all_of(i_name), rank = taxonRank, ref = scientificNameAuthorship, gbif = taxonID, gbif_parent = parentNameUsageID),
by = c("kingdom", "rank", i_name)
) %>%
mutate(source = ifelse(!is.na(gbif), "GBIF", source))
# left_join(current_gbif %>%
# select(kingdom, all_of(i_name), rank = taxonRank, ref = scientificNameAuthorship, gbif = taxonID, gbif_parent = parentNameUsageID),
# by = c("kingdom", "rank", i_name)
# ) %>%
# mutate(source = ifelse(!is.na(gbif), "GBIF", source))
message("n = ", nrow(to_add))
taxonomy <- taxonomy %>%
bind_rows(to_add)
}
# FIX LATER: the block below adds entries that were found to be still missing for some taxonomic levels.
# This should not be needed - it was the only part that was required after the last update.
# Can it now be removed? Check with the next update!
# Collect, per taxonomic level (phylum through genus), the names that occur as a
# value in that column of AMR::microorganisms but have no row of their own with
# that rank, and build one new row for each of them in new_df.
# Start from a zero-row slice so new_df has the same columns as the source table.
new_df <- AMR::microorganisms[0, ]
for (tax in c("phylum", "class", "order", "family", "genus")) {
print(tax)
# all distinct values found in this taxonomic column
out <- AMR::microorganisms %>% pull(tax) %>% unique()
# named logical vector: TRUE where no row exists with this value in `tax` AND rank == tax
missing <- vapply(FUN.VALUE = logical(1), out, function(x) length(which(AMR::microorganisms[[tax]] == x & AMR::microorganisms$rank == tax)) == 0)
# keep only the names flagged as missing, dropping empty names and names that are %unlike% "unknown"
missing <- names(missing)[which(missing == TRUE & names(missing) != "" & names(missing) %unlike% "unknown")]
# NOTE(review): this reads the unqualified `microorganisms`, while the lines above
# use AMR::microorganisms - confirm both refer to the same table
out <- microorganisms %>%
filter(.[[tax]] %in% missing) %>%
# one row per missing name; the check after bind_rows() suggests this can leave
# a helper column literally named ".[[tax]]" behind
distinct(.[[tax]], .keep_all = TRUE) %>%
# set every column after `tax`, up to and including subspecies, to an empty string
mutate_at(vars((which(colnames(.) == tax) + 1):subspecies), ~"") %>%
# set the columns lpsn through gbif_renamed_to to NA
mutate_at(vars(lpsn:gbif_renamed_to), ~NA_character_) %>%
# the new row represents the taxonomic level itself: its fullname is the value of `tax`
mutate(rank = tax,
ref = NA_character_,
status = "accepted",
fullname = .[[tax]],
source = "manually added",
snomed = rep(list(character(0)), nrow(.)))
new_df <- bind_rows(new_df, out)
# drop the helper column if it was carried along into new_df
if (".[[tax]]" %in% colnames(new_df)) {
new_df <- new_df %>% select(-`.[[tax]]`)
}
}
# Convert the mo column to plain character before new codes are written into it.
new_df <- new_df %>%
  mutate(mo = as.character(mo))

# Build new mo codes for the rows of rank "family": the leading part of the
# existing mo (up to and including the first underscore) followed by an
# abbreviation of the family name created with the "[FAM]_" prefix.
new_mo <- new_df %>%
  filter(rank == "family") %>%
  mutate(
    mo_rank_new8 = abbreviate_mo(family, minlength = 8, prefix = "[FAM]_"),
    mo_rank_new9 = abbreviate_mo(family, minlength = 9, prefix = "[FAM]_"),
    mo_rank_new = mo_rank_new8,
    mo_duplicated = duplicated(mo_rank_new),
    # where the minlength = 8 abbreviation repeats an earlier one, use the
    # minlength = 9 abbreviation instead
    mo_rank_new = ifelse(mo_duplicated, mo_rank_new9, mo_rank_new),
    # NOTE(review): recomputed but dropped by transmute() below, so any
    # remaining duplicates are not acted upon here
    mo_duplicated = duplicated(mo_rank_new)
  ) %>%
  transmute(fullname, mo_rank_new = paste0(gsub("_.*", "_", as.character(mo)), mo_rank_new))

# Interactive check whether a generated code already exists in microorganisms$mo
# (presumably this should be FALSE - confirm before continuing).
any(new_mo$mo_rank_new %in% microorganisms$mo)

# Write the new codes back by looking each fullname up in new_mo, instead of
# relying on both tables listing the families in exactly the same order.
rows_to_update <- which(new_df$fullname %in% new_mo$fullname)
new_df$mo[rows_to_update] <- new_mo$mo_rank_new[
  match(new_df$fullname[rows_to_update], new_mo$fullname)
]
# species (requires combination with genus)
taxonomy <- taxonomy %>%
bind_rows(taxonomy %>%
@ -998,9 +1041,16 @@ taxonomy <- taxonomy %>%
.before = 1
) %>%
select(!starts_with("mo_")) %>%
arrange(fullname) %>%
arrange(fullname)
# now check these - e.g. Nitrospira is the name of a genus AND its class
# (shows every row whose fullname occurs more than once)
taxonomy %>% filter(fullname %in% .[duplicated(fullname), "fullname", drop = TRUE])
# keep only the first row for each fullname, retaining all columns
taxonomy <- taxonomy %>%
distinct(fullname, .keep_all = TRUE)
# This must not exist:
# (shows rows whose mo matches the pattern "__"; the result is expected to be empty)
taxonomy %>% filter(mo %like% "__")
# Remove unwanted taxonomic entries from Protozoa/Fungi -------------------
@ -1027,7 +1077,7 @@ message("\nCongratulations! The new taxonomic table will contain ", format(nrow(
# we will use Public Health Information Network Vocabulary Access and Distribution System (PHIN VADS)
# as a source, which copies directly from the latest US SNOMED CT version
# - go to https://phinvads.cdc.gov/vads/ViewValueSet.action?oid=2.16.840.1.114222.4.11.1009
# - check that current online version is higher than SNOMED_VERSION$current_version
# - check that current online version is higher than TAXONOMY_VERSION$SNOMED
# - if so, click on 'Download Value Set', choose 'TXT'
snomed <- vroom("data-raw/SNOMED_PHVS_Microorganism_CDC_V12.txt", skip = 3) %>%
select(1:2) %>%