diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 9593a551d..ac0915d6b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -30,7 +30,7 @@ stages: image: rocker/r-base before_script: - - apt-get update -qq + - apt-get update -qq --allow-releaseinfo-change # install dependencies for packages - apt-get install -y wget locales libxml2-dev libssl-dev libcurl4-openssl-dev zlib1g-dev > /dev/null # recent pandoc @@ -116,9 +116,9 @@ coverage: script: - apt-get install --yes git # install missing and outdated packages - - Rscript -e 'source(".gitlab-ci.R"); gl_update_pkg_all(repos = "https://cran.rstudio.com", quiet = TRUE, install_pkgdown = TRUE)' + - Rscript -e 'source(".gitlab-ci.R"); gl_update_pkg_all(repos = "https://cran.rstudio.com", quiet = TRUE, install_pkgdown = FALSE)' # codecov token is set in https://gitlab.com/msberends/AMR/settings/ci_cd - - Rscript -e "cc <- covr::package_coverage(); covr::codecov(coverage = cc, token = '$codecov', exclusions = c('R/atc_online.R', 'R/mo_source.R')); cat('Code coverage:', covr::percent_coverage(cc))" + - Rscript -e "cc <- covr::package_coverage(line_exclusions = list('R/atc_online.R', 'R/mo_source.R')); covr::codecov(coverage = cc, token = '$codecov'); cat('Code coverage:', covr::percent_coverage(cc))" coverage: '/Code coverage: \d+\.\d+/' pages: diff --git a/DESCRIPTION b/DESCRIPTION index ee3d1f8ab..0b343ec4a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: AMR -Version: 0.7.1.9008 -Date: 2019-07-04 +Version: 0.7.1.9026 +Date: 2019-08-06 Title: Antimicrobial Resistance Analysis Authors@R: c( person( @@ -45,6 +45,7 @@ Depends: R (>= 3.1.0) Imports: backports, + clean, crayon (>= 1.3.0), data.table (>= 1.9.0), dplyr (>= 0.7.0), @@ -59,7 +60,6 @@ Suggests: covr (>= 3.0.1), curl, readxl, - rmarkdown, rstudioapi, rvest (>= 0.3.2), testthat (>= 1.0.2), @@ -67,7 +67,7 @@ Suggests: VignetteBuilder: knitr URL: https://msberends.gitlab.io/AMR, https://gitlab.com/msberends/AMR BugReports: 
https://gitlab.com/msberends/AMR/issues -License: GPL-2 | file LICENSE +License: GPL-2 Encoding: UTF-8 LazyData: true RoxygenNote: 6.1.1 diff --git a/LICENSE b/LICENSE deleted file mode 100755 index 23cb79033..000000000 --- a/LICENSE +++ /dev/null @@ -1,339 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 2, June 1991 - - Copyright (C) 1989, 1991 Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -License is intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Lesser General Public License instead.) You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. - - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if you -distribute copies of the software, or if you modify it. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. 
You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - - We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. - - Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, we -want its recipients to know that what they have is not the original, so -that any problems introduced by others will not reflect on the original -authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at all. - - The precise terms and conditions for copying, distribution and -modification follow. - - GNU GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License applies to any program or other work which contains -a notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. 
The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the -Program (independent of having been made by running the Program). -Whether that is true depends on what the Program does. - - 1. You may copy and distribute verbatim copies of the Program's -source code as you receive it, in any medium, provided that you -conspicuously and appropriately publish on each copy an appropriate -copyright notice and disclaimer of warranty; keep intact all the -notices that refer to this License and to the absence of any warranty; -and give any other recipients of the Program a copy of this License -along with the Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a fee. - - 2. You may modify your copy or copies of the Program or any portion -of it, thus forming a work based on the Program, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that in - whole or in part contains or is derived from the Program or any - part thereof, to be licensed as a whole at no charge to all third - parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you provide - a warranty) and that users may redistribute the program under - these conditions, and telling the user how to view a copy of this - License. 
(Exception: if the Program itself is interactive but - does not normally print such an announcement, your work based on - the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. 
You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of Sections - 1 and 2 above on a medium customarily used for software interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. (This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. 
- -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - - 4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt -otherwise to copy, modify, sublicense or distribute the Program is -void, and will automatically terminate your rights under this License. -However, parties who have received copies, or rights, from you under -this License will not have their licenses terminated so long as such -parties remain in full compliance. - - 5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - - 6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties to -this License. - - 7. 
If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 8. 
If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 9. The Free Software Foundation may publish revised and/or new versions -of the General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and "any -later version", you have the option of following the terms and conditions -either of that version or of any later version published by the Free -Software Foundation. If the Program does not specify a version number of -this License, you may choose any version ever published by the Free Software -Foundation. - - 10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the author -to ask for permission. For software which is copyrighted by the Free -Software Foundation, write to the Free Software Foundation; we sometimes -make exceptions for this. Our decision will be guided by the two goals -of preserving the free status of all derivatives of our free software and -of promoting the sharing and reuse of software generally. - - NO WARRANTY - - 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN -OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -REPAIR OR CORRECTION. - - 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGES. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. - - {description} - Copyright (C) {year} {fullname} - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or - (at your option) any later version. 
- - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - -Also add information on how to contact you by electronic and paper mail. - -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: - - Gnomovision version 69, Copyright (C) year name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - {signature of Ty Coon}, 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. 
diff --git a/NAMESPACE b/NAMESPACE index e233f55eb..0e07b8231 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,6 @@ # Generated by roxygen2: do not edit by hand S3method(as.data.frame,ab) -S3method(as.data.frame,freq) S3method(as.data.frame,mo) S3method(as.double,mic) S3method(as.integer,mic) @@ -10,28 +9,21 @@ S3method(as.rsi,data.frame) S3method(as.rsi,default) S3method(as.rsi,disk) S3method(as.rsi,mic) -S3method(as.vector,freq) -S3method(as_tibble,freq) S3method(barplot,mic) S3method(barplot,rsi) -S3method(boxplot,freq) -S3method(diff,freq) S3method(droplevels,mic) S3method(droplevels,rsi) -S3method(format,freq) -S3method(hist,freq) +S3method(freq,mo) +S3method(freq,rsi) S3method(kurtosis,data.frame) S3method(kurtosis,default) S3method(kurtosis,matrix) -S3method(plot,freq) S3method(plot,mic) S3method(plot,resistance_predict) S3method(plot,rsi) S3method(print,ab) S3method(print,catalogue_of_life_version) S3method(print,disk) -S3method(print,freq) -S3method(print,frequency_tbl) S3method(print,mic) S3method(print,mo) S3method(print,mo_renamed) @@ -39,7 +31,6 @@ S3method(print,mo_uncertainties) S3method(print,rsi) S3method(pull,ab) S3method(pull,mo) -S3method(select,freq) S3method(skewness,data.frame) S3method(skewness,default) S3method(skewness,matrix) @@ -98,8 +89,6 @@ export(filter_glycopeptides) export(filter_macrolides) export(filter_tetracyclines) export(first_isolate) -export(freq) -export(frequency_tbl) export(full_join_microorganisms) export(g.test) export(geom_rsi) @@ -108,7 +97,6 @@ export(get_mo_source) export(ggplot_rsi) export(ggplot_rsi_predict) export(guess_ab_col) -export(header) export(inner_join_microorganisms) export(is.ab) export(is.disk) @@ -171,35 +159,26 @@ export(semi_join_microorganisms) export(set_mo_source) export(skewness) export(theme_rsi) -export(top_freq) exportMethods(as.data.frame.ab) -exportMethods(as.data.frame.freq) exportMethods(as.data.frame.mo) exportMethods(as.double.mic) exportMethods(as.integer.mic) 
exportMethods(as.numeric.mic) -exportMethods(as.vector.freq) -exportMethods(as_tibble.freq) exportMethods(barplot.mic) exportMethods(barplot.rsi) -exportMethods(boxplot.freq) -exportMethods(diff.freq) exportMethods(droplevels.mic) exportMethods(droplevels.rsi) -exportMethods(format.freq) -exportMethods(hist.freq) +exportMethods(freq.mo) +exportMethods(freq.rsi) exportMethods(kurtosis) exportMethods(kurtosis.data.frame) exportMethods(kurtosis.default) exportMethods(kurtosis.matrix) -exportMethods(plot.freq) exportMethods(plot.mic) exportMethods(plot.rsi) exportMethods(print.ab) exportMethods(print.catalogue_of_life_version) exportMethods(print.disk) -exportMethods(print.freq) -exportMethods(print.frequency_tbl) exportMethods(print.mic) exportMethods(print.mo) exportMethods(print.mo_renamed) @@ -209,7 +188,6 @@ exportMethods(pull.ab) exportMethods(pull.mo) exportMethods(scale_type.ab) exportMethods(scale_type.mo) -exportMethods(select.freq) exportMethods(skewness) exportMethods(skewness.data.frame) exportMethods(skewness.default) @@ -217,6 +195,8 @@ exportMethods(skewness.matrix) exportMethods(summary.mic) exportMethods(summary.mo) exportMethods(summary.rsi) +importFrom(clean,freq) +importFrom(clean,top_freq) importFrom(crayon,bgGreen) importFrom(crayon,bgRed) importFrom(crayon,bgYellow) @@ -232,7 +212,6 @@ importFrom(crayon,strip_style) importFrom(crayon,underline) importFrom(crayon,white) importFrom(crayon,yellow) -importFrom(data.table,address) importFrom(data.table,as.data.table) importFrom(data.table,data.table) importFrom(data.table,setkey) @@ -241,9 +220,7 @@ importFrom(dplyr,all_vars) importFrom(dplyr,any_vars) importFrom(dplyr,arrange) importFrom(dplyr,arrange_at) -importFrom(dplyr,as_tibble) importFrom(dplyr,between) -importFrom(dplyr,bind_cols) importFrom(dplyr,bind_rows) importFrom(dplyr,case_when) importFrom(dplyr,desc) @@ -252,7 +229,6 @@ importFrom(dplyr,everything) importFrom(dplyr,filter) importFrom(dplyr,filter_all) importFrom(dplyr,filter_at) 
-importFrom(dplyr,full_join) importFrom(dplyr,funs) importFrom(dplyr,group_by) importFrom(dplyr,group_by_at) @@ -275,35 +251,25 @@ importFrom(dplyr,slice) importFrom(dplyr,summarise) importFrom(dplyr,summarise_if) importFrom(dplyr,tibble) -importFrom(dplyr,top_n) importFrom(dplyr,transmute) importFrom(dplyr,ungroup) importFrom(dplyr,vars) -importFrom(grDevices,boxplot.stats) importFrom(graphics,arrows) importFrom(graphics,axis) importFrom(graphics,barplot) -importFrom(graphics,boxplot) -importFrom(graphics,hist) importFrom(graphics,par) importFrom(graphics,plot) importFrom(graphics,points) importFrom(graphics,text) -importFrom(hms,is.hms) -importFrom(knitr,kable) importFrom(microbenchmark,microbenchmark) importFrom(rlang,as_label) importFrom(rlang,enquos) -importFrom(rlang,eval_tidy) importFrom(scales,percent) importFrom(stats,complete.cases) -importFrom(stats,fivenum) importFrom(stats,glm) importFrom(stats,lm) -importFrom(stats,mad) importFrom(stats,pchisq) importFrom(stats,predict) -importFrom(stats,sd) importFrom(utils,browseURL) -importFrom(utils,browseVignettes) importFrom(utils,installed.packages) +importFrom(utils,menu) diff --git a/NEWS.md b/NEWS.md index 0bb14c24d..6ce82d4d7 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,7 @@ -# AMR 0.7.1.9008 +# AMR 0.7.1.9026 + +### Breaking +* Function `freq()` has moved to a new package, [`clean`](https://github.com/msberends/clean) ([CRAN link](https://cran.r-project.org/package=clean)). Creating frequency tables is actually not the scope of this package (never was) and this function has matured a lot over the last two years. Therefore, a new package was created for data cleaning and checking and it perfectly fits the `freq()` function. The [`clean`](https://github.com/msberends/clean) package is available on CRAN and will be installed automatically when updating the `AMR` package, that now imports it. In a later stage, the `skewness()` and `kurtosis()` functions will be moved to the `clean` package too. 
### New * Additional way to calculate co-resistance, i.e. when using multiple antibiotics as input for `portion_*` functions or `count_*` functions. This can be used to determine the empiric susceptibily of a combination therapy. A new parameter `only_all_tested` (**which defaults to `FALSE`**) replaces the old `also_single_tested` and can be used to select one of the two methods to count isolates and calculate portions. The difference can be seen in this example table (which is also on the `portion` and `count` help pages), where the %SI is being determined: @@ -32,15 +35,21 @@ Since this is a major change, usage of the old `also_single_tested` will throw an informative error that it has been replaced by `only_all_tested`. ### Changed +* Fixed a bug in `eucast_rules()` that caused an error when the input was a specific kind of `tibble` * Removed class `atc` - using `as.atc()` is now deprecated in favour of `ab_atc()` and this will return a character, not the `atc` class anymore * Removed deprecated functions `abname()`, `ab_official()`, `atc_name()`, `atc_official()`, `atc_property()`, `atc_tradenames()`, `atc_trivial_nl()` * Fix and speed improvement for `mo_shortname()` -* Fix for `as.mo()` where misspelled input would not be understood +* Algorithm improvements for `as.mo()`: + * Some misspelled input were not understood + * These new trivial names known to the field are now understood: meningococcus, gonococcus, pneumococcus + * Added support for unknown yeasts and fungi +* Added the newest taxonomic data from the IJSEM journal (now up to date until August 2019) * Fix for using `mo_*` functions where the coercion uncertainties and failures would not be available through `mo_uncertainties()` and `mo_failures()` anymore * Deprecated the `country` parameter of `mdro()` in favour of the already existing `guideline` parameter to support multiple guidelines within one country -* Fix for frequency tables when creating one directly on a group (using `group_by()`) -* 
The name of `RIF` is now Rifampicin instead of Rifampin +* The `name` of `RIF` is now Rifampicin instead of Rifampin * The `antibiotics` data set is now sorted by name +* Using verbose mode with `eucast_rules(..., verbose = TRUE)` returns more informative and readable output +* Speed improvement for `guess_ab_col()` which is now 30 times faster for antibiotic abbreviations # AMR 0.7.1 diff --git a/R/catalogue_of_life.R b/R/catalogue_of_life.R index f1073be87..0ec29f8aa 100755 --- a/R/catalogue_of_life.R +++ b/R/catalogue_of_life.R @@ -90,6 +90,7 @@ NULL #' @export #' @examples #' library(dplyr) +#' library(clean) #' microorganisms %>% freq(kingdom) #' microorganisms %>% group_by(kingdom) %>% freq(phylum, nmax = NULL) catalogue_of_life_version <- function() { diff --git a/R/data.R b/R/data.R index adf3e19c7..1974ab19c 100755 --- a/R/data.R +++ b/R/data.R @@ -55,7 +55,7 @@ #' #' A data set containing the microbial taxonomy of six kingdoms from the Catalogue of Life. MO codes can be looked up using \code{\link{as.mo}}. 
#' @inheritSection catalogue_of_life Catalogue of Life -#' @format A \code{\link{data.frame}} with 67,906 observations and 16 variables: +#' @format A \code{\link{data.frame}} with 68,260 observations and 16 variables: #' \describe{ #' \item{\code{mo}}{ID of microorganism as used by this package} #' \item{\code{col_id}}{Catalogue of Life ID} @@ -72,8 +72,8 @@ #' \item{9 entries of \emph{Streptococcus} (beta haemolytic groups A, B, C, D, F, G, H, K and unspecified)} #' \item{2 entries of \emph{Staphylococcus} (coagulase-negative [CoNS] and coagulase-positive [CoPS])} #' \item{3 entries of Trichomonas (Trichomonas vaginalis, and its family and genus)} -#' \item{3 other 'undefined' entries (unknown, unknown Gram negatives and unknown Gram positives)} -#' \item{8,830 species from the DSMZ (Deutsche Sammlung von Mikroorganismen und Zellkulturen) that are not in the Catalogue of Life} +#' \item{5 other 'undefined' entries (unknown, unknown Gram negatives, unknown Gram positives, unknown yeast and unknown fungus)} +#' \item{8,970 species from the DSMZ (Deutsche Sammlung von Mikroorganismen und Zellkulturen) that are not in the Catalogue of Life} #' } #' @section About the records from DSMZ (see source): #' Names of prokaryotes are defined as being validly published by the International Code of Nomenclature of Bacteria. Validly published are all names which are included in the Approved Lists of Bacterial Names and the names subsequently published in the International Journal of Systematic Bacteriology (IJSB) and, from January 2000, in the International Journal of Systematic and Evolutionary Microbiology (IJSEM) as original articles or in the validation lists. 
@@ -91,14 +91,14 @@ catalogue_of_life <- list( version = "Catalogue of Life: {year} Annual Checklist", url_CoL = "http://www.catalogueoflife.org/annual-checklist/{year}/", url_DSMZ = "https://www.dsmz.de/microorganisms/pnu/bacterial_nomenclature_info_mm.php", - yearmonth_DSMZ = "February 2019" + yearmonth_DSMZ = "August 2019" ) #' Data set with previously accepted taxonomic names #' #' A data set containing old (previously valid or accepted) taxonomic names according to the Catalogue of Life. This data set is used internally by \code{\link{as.mo}}. #' @inheritSection catalogue_of_life Catalogue of Life -#' @format A \code{\link{data.frame}} with 21,342 observations and 4 variables: +#' @format A \code{\link{data.frame}} with 21,743 observations and 4 variables: #' \describe{ #' \item{\code{col_id}}{Catalogue of Life ID that was originally given} #' \item{\code{col_id_new}}{New Catalogue of Life ID that responds to an entry in the \code{\link{microorganisms}} data set} diff --git a/R/eucast_rules.R b/R/eucast_rules.R index e237556ae..1f2a4ba3c 100755 --- a/R/eucast_rules.R +++ b/R/eucast_rules.R @@ -29,7 +29,7 @@ EUCAST_VERSION_EXPERT_RULES <- "3.1, 2016" #' @param x data with antibiotic columns, like e.g. \code{AMX} and \code{AMC} #' @param info print progress #' @param rules a character vector that specifies which rules should be applied - one or more of \code{c("breakpoints", "expert", "other", "all")} -#' @param verbose a logical to indicate whether extensive info should be returned as a \code{data.frame} with info about which rows and columns are effected. It runs all EUCAST rules, but will not be applied to an output - only an informative \code{data.frame} with changes will be returned as output. +#' @param verbose a logical to turn Verbose mode on and off (default is off). In Verbose mode, the function does not apply rules to the data, but instead returns a \code{data.frame} with extensive info about which rows and columns would be affected and in which way.
#' @param ... column name of an antibiotic, see section Antibiotics #' @inheritParams first_isolate #' @details @@ -119,7 +119,8 @@ EUCAST_VERSION_EXPERT_RULES <- "3.1, 2016" #' @rdname eucast_rules #' @export #' @importFrom dplyr %>% select pull mutate_at vars group_by summarise n -#' @importFrom crayon bold bgGreen bgYellow bgRed black green blue italic strip_style white +#' @importFrom crayon bold bgGreen bgYellow bgRed black green blue italic strip_style white red +#' @importFrom utils menu #' @return The input of \code{x}, possibly with edited values of antibiotics. Or, if \code{verbose = TRUE}, a \code{data.frame} with all original and new values of the affected bug-drug combinations. #' @source #' \itemize{ @@ -139,8 +140,6 @@ EUCAST_VERSION_EXPERT_RULES <- "3.1, 2016" #' } #' @inheritSection AMR Read more on our website! #' @examples -#' a <- eucast_rules(septic_patients) -#' #' a <- data.frame(mo = c("Staphylococcus aureus", #' "Enterococcus faecalis", #' "Escherichia coli", @@ -176,7 +175,7 @@ EUCAST_VERSION_EXPERT_RULES <- "3.1, 2016" #' # 5 Pseudomonas aeruginosa R R - - R R R #' #' -#' # do not apply EUCAST rules, but rather get a a data.frame +#' # do not apply EUCAST rules, but rather get a data.frame #' # with 18 rows, containing all details about the transformations: #' c <- eucast_rules(a, verbose = TRUE) eucast_rules <- function(x, @@ -185,9 +184,21 @@ eucast_rules <- function(x, rules = c("breakpoints", "expert", "other", "all"), verbose = FALSE, ...) 
{ - - x <- x - + + if (verbose == TRUE & interactive()) { + txt <- paste0("WARNING: In Verbose mode, the eucast_rules() function does not apply rules to the data, but instead returns a data set in logbook form: with extensive info about which rows and columns would be effected and in which way.", + "\n\nThis may overwrite your existing data if you use e.g.:", + "\ndata <- eucast_rules(data, verbose = TRUE)\n\nDo you want to continue?") + if ("rstudioapi" %in% rownames(installed.packages())) { + q_continue <- rstudioapi::showQuestion("Using verbose = TRUE with eucast_rules()", txt) + } else { + q_continue <- menu(choices = c("OK", "Cancel"), graphics = TRUE, title = txt) + } + if (q_continue %in% c(FALSE, 2)) { + return(invisible()) + } + } + if (!is.data.frame(x)) { stop("`x` must be a data frame.", call. = FALSE) } @@ -385,7 +396,6 @@ eucast_rules <- function(x, cols <- unique(cols[!is.na(cols) & !is.null(cols)]) if (length(rows) > 0 & length(cols) > 0) { before_df <- x_original - before <- as.character(unlist(as.list(x_original[rows, cols]))) tryCatch( # insert into original table @@ -406,9 +416,7 @@ eucast_rules <- function(x, x[rows, cols] <<- x_original[rows, cols] - after <- as.character(unlist(as.list(x_original[rows, cols]))) - - # before_df might not be a data.frame, but a tibble of data.table instead + # before_df might not be a data.frame, but a tibble or data.table instead old <- as.data.frame(before_df, stringsAsFactors = FALSE)[rows,] no_of_changes_this_run <- 0 for (i in 1:length(cols)) { @@ -423,13 +431,14 @@ eucast_rules <- function(x, stringsAsFactors = FALSE) colnames(verbose_new) <- c("row", "col", "mo_fullname", "old", "new", "rule", "rule_group", "rule_name") verbose_new <- verbose_new %>% filter(old != new | is.na(old)) + # save changes to data set 'verbose_info' verbose_info <<- rbind(verbose_info, verbose_new) no_of_changes_this_run <- no_of_changes_this_run + nrow(verbose_new) } - # return number of (new) changes + # after the applied 
changes: return number of (new) changes return(no_of_changes_this_run) } - # return number of (new) changes: none. + # no changes were applied: return number of (new) changes: none. return(0) } @@ -498,6 +507,15 @@ eucast_rules <- function(x, } y[y != "" & y %in% colnames(df)] } + get_antibiotic_names <- function(x) { + x %>% + strsplit(",") %>% + unlist() %>% + trimws() %>% + sapply(function(x) if(x %in% AMR::antibiotics$ab) ab_name(x, language = NULL, tolower = TRUE) else x) %>% + sort() %>% + paste(collapse = ", ") + } eucast_rules_df <- eucast_rules_file # internal data file no_of_changes <- 0 @@ -510,10 +528,11 @@ eucast_rules <- function(x, rule_group_current <- eucast_rules_df[i, "reference.rule_group"] rule_group_next <- eucast_rules_df[min(nrow(eucast_rules_df), i + 1), "reference.rule_group"] if (is.na(eucast_rules_df[i, 4])) { - rule_text <- paste("always:", eucast_rules_df[i, 6], "=", eucast_rules_df[i, 7]) + rule_text <- paste0("always report as '", eucast_rules_df[i, 7], "': ", get_antibiotic_names(eucast_rules_df[i, 6])) } else { - rule_text <- paste("if", eucast_rules_df[i, 4], "=", eucast_rules_df[i, 5], - "then", eucast_rules_df[i, 6], "=", eucast_rules_df[i, 7]) + rule_text <- paste0("report as '", eucast_rules_df[i, 7], "' when ", + get_antibiotic_names(eucast_rules_df[i, 4]), " is '", eucast_rules_df[i, 5], "': ", + get_antibiotic_names(eucast_rules_df[i, 6])) } if (i == 1) { rule_previous <- "" @@ -736,7 +755,9 @@ eucast_rules <- function(x, cat(paste0(silver(strrep("-", options()$width - 1)), "\n")) if (verbose == FALSE & nrow(verbose_info) > 0) { - cat(paste("\nUse", bold("verbose = TRUE"), "to get a data.frame with all specified edits instead.\n")) + cat(paste("\nUse", bold("verbose = TRUE"), "(on your original data) to get a data.frame with all specified edits instead.\n")) + } else if (verbose == TRUE) { + cat(paste(red("\nUsed 'Verbose mode' (verbose = TRUE)."), "This returns a data.frame with all specified edits.\nUse", bold("verbose = 
FALSE"), "to apply the rules on your data.\n")) } } diff --git a/R/first_isolate.R b/R/first_isolate.R index ba95f4a1d..2625e4555 100755 --- a/R/first_isolate.R +++ b/R/first_isolate.R @@ -70,7 +70,7 @@ #' @keywords isolate isolates first #' @seealso \code{\link{key_antibiotics}} #' @export -#' @importFrom dplyr arrange_at lag between row_number filter mutate arrange pull +#' @importFrom dplyr arrange_at lag between row_number filter mutate arrange pull ungroup #' @importFrom crayon blue bold silver #' @return Logical vector #' @source Methodology of this function is based on: \strong{M39 Analysis and Presentation of Cumulative Antimicrobial Susceptibility Test Data, 4th Edition}, 2014, \emph{Clinical and Laboratory Standards Institute (CLSI)}. \url{https://clsi.org/standards/products/microbiology/documents/m39/}. diff --git a/R/freq.R b/R/freq.R index 83cadbe99..38ff3880e 100755 --- a/R/freq.R +++ b/R/freq.R @@ -19,1254 +19,36 @@ # Visit our website for more info: https://msberends.gitlab.io/AMR. # # ==================================================================== # -#' Frequency table -#' -#' Create a frequency table of a vector with items or a \code{data.frame}. Supports quasiquotation and markdown for reports. Best practice is: \code{data \%>\% freq(var)}.\cr -#' \code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names. -#' @param x vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} (may contain a grouping variable) or \code{\link{table}} -#' @param ... up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples. Also supports quasiquotion. -#' @param sort.count sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except when using grouping variables. -#' @param nmax number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. 
Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows. -#' @param na.rm a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the amount of \code{NA}s. -#' @param row.names a logical value indicating whether row indices should be printed as \code{1:nrow(x)} -#' @param markdown a logical value indicating whether the frequency table should be printed in markdown format. This will print all rows (except when \code{nmax} is defined) and is default behaviour in non-interactive R sessions (like when knitting RMarkdown files). -#' @param digits how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")}) -#' @param quote a logical value indicating whether or not strings should be printed with surrounding quotes. Default is to print them only around characters that are actually numeric values. -#' @param header a logical value indicating whether an informative header should be printed -#' @param title text to show above frequency table, at default to tries to coerce from the variables passed to \code{x} -#' @param na a character string that should be used to show empty (\code{NA}) values (only useful when \code{na.rm = FALSE}) -#' @param droplevels a logical value indicating whether in factors empty levels should be dropped -#' @param sep a character string to separate the terms when selecting multiple columns -#' @inheritParams base::format -#' @param f a frequency table -#' @param n number of top \emph{n} items to return, use -n for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties. -#' @param property property in header to return this value directly -#' @details Frequency tables (or frequency distributions) are summaries of the distribution of values in a sample. 
With the `freq` function, you can create univariate frequency tables. Multiple variables will be pasted into one variable, so it forces a univariate distribution. This package also has a vignette available to explain the use of this function further, run \code{browseVignettes("AMR")} to read it. -#' -#' For numeric values of any class, these additional values will all be calculated with \code{na.rm = TRUE} and shown into the header: -#' \itemize{ -#' \item{Mean, using \code{\link[base]{mean}}} -#' \item{Standard Deviation, using \code{\link[stats]{sd}}} -#' \item{Coefficient of Variation (CV), the standard deviation divided by the mean} -#' \item{Mean Absolute Deviation (MAD), using \code{\link[stats]{mad}}} -#' \item{Tukey Five-Number Summaries (minimum, Q1, median, Q3, maximum), using \code{\link[stats]{fivenum}}} -#' \item{Interquartile Range (IQR) calculated as \code{Q3 - Q1} using the Tukey Five-Number Summaries, i.e. \strong{not} using the \code{\link[stats]{quantile}} function} -#' \item{Coefficient of Quartile Variation (CQV, sometimes called coefficient of dispersion), calculated as \code{(Q3 - Q1) / (Q3 + Q1)} using the Tukey Five-Number Summaries} -#' \item{Outliers (total count and unique count), using \code{\link[grDevices]{boxplot.stats}}} -#' } -#' -#' For dates and times of any class, these additional values will be calculated with \code{na.rm = TRUE} and shown into the header: -#' \itemize{ -#' \item{Oldest, using \code{\link{min}}} -#' \item{Newest, using \code{\link{max}}, with difference between newest and oldest} -#' \item{Median, using \code{\link[stats]{median}}, with percentage since oldest} -#' } -#' -#' In factors, all factor levels that are not existing in the input data will be dropped. -#' -#' The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will include more than \code{n} rows if there are ties. 
-#' @importFrom stats fivenum sd mad -#' @importFrom grDevices boxplot.stats -#' @importFrom dplyr %>% arrange arrange_at bind_cols desc filter_at funs group_by mutate mutate_at n n_distinct pull select summarise tibble ungroup vars all_vars -#' @importFrom utils browseVignettes -#' @importFrom hms is.hms -#' @importFrom crayon red green silver -#' @importFrom rlang enquos eval_tidy as_label -#' @keywords summary summarise frequency freq -#' @rdname freq -#' @name freq -#' @return A \code{data.frame} (with an additional class \code{"freq"}) with five columns: \code{item}, \code{count}, \code{percent}, \code{cum_count} and \code{cum_percent}. -#' @export -#' @inheritSection AMR Read more on our website! -#' @examples -#' library(dplyr) -#' -#' # this all gives the same result: -#' freq(septic_patients$hospital_id) -#' freq(septic_patients[, "hospital_id"]) -#' septic_patients$hospital_id %>% freq() -#' septic_patients[, "hospital_id"] %>% freq() -#' septic_patients %>% freq("hospital_id") -#' septic_patients %>% freq(hospital_id) #<- easiest to remember (tidyverse) -#' -#' -#' # you could also use `select` or `pull` to get your variables -#' septic_patients %>% -#' filter(hospital_id == "A") %>% -#' select(mo) %>% -#' freq() -#' -#' -#' # multiple selected variables will be pasted together -#' septic_patients %>% -#' left_join_microorganisms %>% -#' freq(genus, species) -#' -#' # functions as quasiquotation are also supported -#' septic_patients %>% -#' freq(mo_genus(mo), mo_species(mo)) -#' -#' -#' # group a variable and analyse another -#' septic_patients %>% -#' group_by(hospital_id) %>% -#' freq(gender) -#' -#' -#' # get top 10 bugs of hospital A as a vector -#' septic_patients %>% -#' filter(hospital_id == "A") %>% -#' freq(mo) %>% -#' top_freq(10) -#' -#' -#' # save frequency table to an object -#' years <- septic_patients %>% -#' mutate(year = format(date, "%Y")) %>% -#' freq(year) -#' -#' -#' # show only the top 5 -#' years %>% print(nmax = 5) -#' -#' -#' # 
save to an object with formatted percentages -#' years <- format(years) -#' -#' -#' # print a histogram of numeric values -#' septic_patients %>% -#' freq(age) %>% -#' hist() -#' -#' # or a boxplot of numeric values -#' septic_patients %>% -#' freq(age) %>% -#' boxplot() -#' -#' # or even a boxplot per group -#' septic_patients %>% -#' group_by(hospital_id) %>% -#' freq(age) %>% -#' boxplot() -#' -#' # or print all points to a regular plot -#' septic_patients %>% -#' freq(age) %>% -#' plot() -#' -#' -#' # transform to a data.frame or tibble -#' septic_patients %>% -#' freq(age) %>% -#' as.data.frame() -#' -#' -#' # or transform (back) to a vector -#' septic_patients %>% -#' freq(age) %>% -#' as.vector() -#' -#' identical(septic_patients %>% -#' freq(age) %>% -#' as.vector() %>% -#' sort(), -#' sort(septic_patients$age)) # TRUE -#' -#' -#' # it also supports `table` objects -#' table(septic_patients$gender, -#' septic_patients$age) %>% -#' freq(sep = " **sep** ") -#' -#' -#' # only get selected columns -#' septic_patients %>% -#' freq(hospital_id) %>% -#' select(item, percent) -#' -#' septic_patients %>% -#' freq(hospital_id) %>% -#' select(-count, -cum_count) -#' -#' -#' # check differences between frequency tables -#' diff(freq(septic_patients$TMP), -#' freq(septic_patients$SXT)) -freq <- function(x, - ..., - sort.count = TRUE, - nmax = getOption("max.print.freq"), - na.rm = TRUE, - row.names = TRUE, - markdown = !interactive(), - digits = 2, - quote = NULL, - header = TRUE, - title = NULL, - na = "", - droplevels = TRUE, - sep = " ", - decimal.mark = getOption("OutDec"), - big.mark = ifelse(decimal.mark != ",", ",", ".")) { - - mult.columns <- 0 - x.group = character(0) - df <- NULL - x.name <- NULL - cols <- NULL - cols.names <- NULL - if (any(class(x) == "list")) { - cols <- names(x) - cols.names <- cols - x <- as.data.frame(x, stringsAsFactors = FALSE) - x.name <- "a list" - } else if (any(class(x) == "matrix")) { - x <- as.data.frame(x, stringsAsFactors = 
FALSE) - x.name <- "a matrix" - cols <- colnames(x) - quote <- FALSE - cols.names <- cols - if (all(cols %like% "V[0-9]")) { - cols <- NULL - } - } - - if (any(class(x) == "data.frame")) { - - if (is.null(x.name)) { - x.name <- deparse(substitute(x)) - } - if (x.name %like% "(%>%)") { - x.name <- x.name %>% strsplit("%>%", fixed = TRUE) %>% unlist() %>% .[1] %>% trimws() - } - if (x.name == ".") { - # passed on with pipe - x.name <- get_data_source_name(x) - if (!is.null(x.name)) { - x.name <- paste0("`", x.name, "`") - } else { - x.name <- "a data.frame" - } - } else if (!x.name %in% c("a list", "a matrix")) { - x.name <- paste0("`", x.name, "`") - } - x.name.dims <- x %>% - dim() %>% - format(decimal.mark = decimal.mark, big.mark = big.mark) %>% - trimws() %>% - paste(collapse = " x ") - x.name <- paste0(x.name, " (", x.name.dims, ")") - - x.group <- group_vars(x) - if (length(x.group) > 1) { - x.group <- x.group[1L] - warning("freq supports one grouping variable, only `", x.group, "` will be kept.", call. = FALSE) - } - - user_exprs <- enquos(...) - - if (length(user_exprs) > 0) { - new_list <- list(0) - for (i in 1:length(user_exprs)) { - new_list[[i]] <- tryCatch(eval_tidy(user_exprs[[i]], data = x), - error = function(e) stop(e$message, call. = FALSE)) - if (length(new_list[[i]]) == 1) { - if (is.character(new_list[[i]]) & new_list[[i]] %in% colnames(x)) { - # support septic_patients %>% freq("hospital_id") - new_list[[i]] <- x %>% pull(new_list[[i]]) - } - } - cols <- c(cols, as_label(user_exprs[[i]])) - } - - if (length(new_list) == 1 & length(x.group) == 0) { - # is now character - x <- new_list[[1]] - df <- NULL - } else { - # create data frame - df <- as.data.frame(new_list, col.names = cols, stringsAsFactors = FALSE) - cols.names <- colnames(df) - } - } else { - # complete data frame - df <- x - } - - if (identical(x.group, cols.names)) { - # ... 
%>% group_by(var = calculation(..)) %>% freq(var) - x.group <- NULL - } - - # support grouping variables - if (length(x.group) > 0) { - x.group_cols <- c(x.group, cols.names) - x <- bind_cols(x, df) - # if (droplevels == TRUE) { - # x <- x %>% mutate_at(vars(x.group_cols), droplevels) - # } - suppressWarnings( - df <- x %>% - group_by_at(vars(x.group_cols)) %>% - summarise(count = n()) - ) - if (na.rm == TRUE) { - df <- df %>% filter_at(vars(x.group_cols), all_vars(!is.na(.))) - } - if (!missing(sort.count)) { - if (sort.count == TRUE) { - df <- df %>% arrange_at(c(x.group_cols, "count"), desc) - } - } - df <- df %>% - mutate(cum_count = cumsum(count)) - - df.topleft <- df[1, 1] - df <- df %>% - ungroup() %>% - # do not repeat group labels - mutate_at(vars(x.group), ~(ifelse(lag(.) == ., "", .))) - df[1, 1] <- df.topleft - colnames(df)[1:2] <- c("group", "item") - - if (!is.null(levels(df$item)) & droplevels == TRUE) { - # is factor - df <- df %>% filter(count != 0) - } - } else { - if (!is.null(df)) { - # no groups, multiple values like: septic_patients %>% freq(mo, mo_genus(mo)) - x <- df - df <- NULL - } - } - if (length(cols) > 0 & is.data.frame(x)) { - x <- x[, cols.names] - } - - } else if (any(class(x) == "table")) { - x <- as.data.frame(x, stringsAsFactors = FALSE) - # now this DF contains 3 columns: the 2 vars and a Freq column - # paste the first 2 cols and repeat them Freq times: - x <- rep(x = do.call(paste, c(x[colnames(x)[1:2]], sep = sep)), - times = x$Freq) - x.name <- "a `table` object" - cols <- NULL - # mult.columns <- 2 - } else { - x.name <- deparse(substitute(x)) - if (all(x.name %like% "[$]") & length(x.name) == 1) { - cols <- unlist(strsplit(x.name, "$", fixed = TRUE))[2] - x.name <- unlist(strsplit(x.name, "$", fixed = TRUE))[1] - # try to find the object to determine dimensions - - x.obj <- tryCatch(get(x.name), error = function(e) NULL) - x.name <- paste0("`", x.name , "`") - if (!is.null(dim(x.obj))) { - x.name <- paste0(x.name, - " (", 
- x.obj %>% - dim() %>% - format(decimal.mark = decimal.mark, big.mark = big.mark) %>% - trimws() %>% - paste(collapse = " x "), - ")") - } - } else { - x.name <- NULL - cols <- NULL - } - } - - if (!is.null(ncol(x))) { - if (ncol(x) == 1 & any(class(x) == "data.frame")) { - x <- x %>% pull(1) - } else if (ncol(x) < 10) { - mult.columns <- ncol(x) - # paste old columns together - x <- do.call(paste, c(x[colnames(x)], sep = sep)) - } else { - stop("A maximum of 9 columns can be analysed at the same time.", call. = FALSE) - } - } - - if (mult.columns > 1) { - NAs <- x[is.na(x) | x == trimws(strrep("NA ", mult.columns))] - } else { - NAs <- x[is.na(x)] - } - - if (mult.columns > 0) { - header_list <- list(columns = mult.columns) - } else { - header_list <- list(class = class(x), - mode = mode(x)) - } - header_list$length <- length(x) - - if (na.rm == TRUE) { - x_class <- class(x) - x <- x[!x %in% NAs] - class(x) <- x_class - } - - markdown_line <- "" - if (markdown == TRUE) { - markdown_line <- " " - } - x_align <- "l" - - if (!is.null(levels(x))) { - header_list$levels <- levels(x) - header_list$ordered <- is.ordered(x) - # drop levels of non-existing factor values, - # since dplyr >= 0.8.0 does not do this anymore in group_by - if (droplevels == TRUE) { - x <- droplevels(x) - } - } - - header_list$na_length <- length(NAs) - header_list$unique <- n_distinct(x) - - if (NROW(x) > 0 & any(class(x) == "character")) { - header_list$shortest <- x %>% base::nchar() %>% base::min(na.rm = TRUE) - header_list$longest <- x %>% base::nchar() %>% base::max(na.rm = TRUE) - } - - if (NROW(x) > 0 & any(class(x) == "mo")) { - x_mo <- as.mo(x) # do it once for all three - header_list$families <- x_mo %>% mo_family() %>% n_distinct() - header_list$genera <- x_mo %>% mo_genus() %>% n_distinct() - header_list$species <- x_mo %>% mo_species() %>% n_distinct() - } - - if (NROW(x) > 0 & any(class(x) == "difftime") & !is.hms(x)) { - header_list$units <- attributes(x)$units - x <- 
as.double(x) - # after this, the numeric header_txt continues - } - - if (NROW(x) > 0 & any(class(x) %in% c("double", "integer", "numeric", "raw", "single"))) { - # right align number - x_align <- "r" - header_list$mean <- base::mean(x, na.rm = TRUE) - header_list$sd <- stats::sd(x, na.rm = TRUE) - header_list$cv <- cv(x, na.rm = TRUE) - header_list$mad <- stats::mad(x, na.rm = TRUE) - Tukey_five <- stats::fivenum(x, na.rm = TRUE) - header_list$fivenum <- Tukey_five - header_list$IQR <- Tukey_five[4] - Tukey_five[2] - header_list$cqv <- cqv(x, na.rm = TRUE) - header_list$outliers_total <- length(boxplot.stats(x)$out) - header_list$outliers_unique <- n_distinct(boxplot.stats(x)$out) - } - - if (any(class(x) == "rsi")) { - header_list$count_SI <- max(0, sum(x %in% c("S", "I"), na.rm = TRUE), na.rm = TRUE) - header_list$count_R <- max(0, sum(x == "R", na.rm = TRUE), na.rm = TRUE) - } - - formatdates <- "%e %B %Y" # = d mmmm yyyy - if (is.hms(x)) { - x <- x %>% as.POSIXlt() - formatdates <- "%H:%M:%S" - } - if (NROW(x) > 0 & any(class(x) %in% c("Date", "POSIXct", "POSIXlt"))) { - if (formatdates == "%H:%M:%S") { - # hms - header_list$earliest <- min(x, na.rm = TRUE) - header_list$latest <- max(x, na.rm = TRUE) - - } else { - # other date formats - header_list$oldest <- min(x, na.rm = TRUE) - header_list$newest <- max(x, na.rm = TRUE) - } - header_list$median <- median(x, na.rm = TRUE) - header_list$date_format <- formatdates - } - if (any(class(x) == "POSIXlt")) { - x <- x %>% format(formatdates) - } - - nmax.set <- !missing(nmax) - if (!nmax.set & is.null(nmax) & is.null(base::getOption("max.print.freq", default = NULL))) { - # default for max print setting - nmax <- 15 - } else if (is.null(nmax)) { - nmax <- length(x) - } - - if (nmax %in% c(0, Inf, NA, NULL)) { - nmax <- length(x) - } - - column_names <- c("Item", "Count", "Percent", "Cum. Count", "Cum. 
Percent") - column_names_df <- c("item", "count", "percent", "cum_count", "cum_percent") - column_align <- c(x_align, "r", "r", "r", "r") - - if (is.null(df)) { - - suppressWarnings( # suppress since dplyr 0.8.0, which idiotly warns about included NAs :( - # create table with counts and percentages - df <- tibble(item = x) %>% - group_by(item) %>% - summarise(count = n()) - ) - - # sort according to setting - if (sort.count == TRUE) { - df <- df %>% arrange(desc(count), item) - } else { - df <- df %>% arrange(item) - } - } else { - column_names <- c("Group", column_names) - column_names_df <-c("group", column_names_df) - column_align <- c("l", column_align) - } - - if (df$item %>% paste(collapse = ",") %like% "\033") { - # remove escape char - # see https://en.wikipedia.org/wiki/Escape_character#ASCII_escape_character - df <- df %>% mutate(item = item %>% gsub("\033", " ", ., fixed = TRUE)) - } - - if (is.null(quote)) { - if (!is.numeric(df$item) & all(df$item %like% "^[0-9]+$", na.rm = TRUE)) { - quote <- TRUE - } else { - quote <- FALSE - } - } - - if (quote == TRUE) { - df$item <- paste0('"', df$item, '"') - if (length(x.group) != 0) { - df$group <- paste0('"', df$group, '"') - } - } - - df <- as.data.frame(df, stringsAsFactors = FALSE) - - df$percent <- df$count / base::sum(df$count, na.rm = TRUE) - if (length(x.group) == 0) { - df$cum_count <- base::cumsum(df$count) - } - df$cum_percent <- df$cum_count / base::sum(df$count, na.rm = TRUE) - if (length(x.group) != 0) { - # sort columns - df <- df[, column_names_df] - } - - if (markdown == TRUE) { - tbl_format <- "markdown" - } else { - tbl_format <- "pandoc" - } - - if (!is.null(title)) { - title <- trimws(gsub("^Frequency table of", "", title[1L], ignore.case = TRUE)) - } - - # if (nmax.set == FALSE) { - # nmax <- nrow(df) - # } - - structure(.Data = df, - class = unique(c("freq", class(df))), - header = header_list, - opt = list(title = title, - data = x.name, - vars = cols, - group_var = x.group, - header = 
header, - row_names = row.names, - column_names = column_names, - column_align = column_align, - decimal.mark = decimal.mark, - big.mark = big.mark, - tbl_format = tbl_format, - na = na, - digits = digits, - nmax = nmax, - nmax.set = nmax.set)) -} - -#' @rdname freq -#' @export -frequency_tbl <- freq - -is.freq <- function(f) { - any(c("freq", "frequency_tbl") %in% class(f)) -} - -#' @importFrom crayon silver green red -#' @importFrom dplyr %>% -format_header <- function(x, markdown = FALSE, decimal.mark = ".", big.mark = ",", digits = 2) { - newline <-"\n" - if (markdown == TRUE) { - newline <- " \n" - # no colours in markdown - silver <- function(x) x - green <- function(x) x - red <- function(x) x - } - - header <- header(x) - x_class <- header$class - has_length <- header$length > 0 - - # FORMATTING - # rsi - if (has_length == TRUE & any(x_class == "rsi")) { - if (!is.null(attributes(x)$opt$vars)) { - ab <- tryCatch(as.ab(attributes(x)$opt$vars), error = function(e) NA) - if (!is.na(ab) & isTRUE(length(ab) > 0)) { - header$drug <- paste0(ab_name(ab[1L]), " (", ab[1L], ", ", ab_atc(ab[1L]), ")") - header$group <- ab_group(ab[1L]) - } - } - header$`%SI` <- percent(header$count_SI / (header$count_SI + header$count_R), - force_zero = TRUE, round = digits, decimal.mark = decimal.mark) - } - header <- header[!names(header) %in% c("count_SI", "count_R")] - # dates - if (!is.null(header$date_format)) { - if (header$date_format == "%H:%M:%S") { - header$median <- paste0(format(header$median, header$date_format), - " (", - (as.double(difftime(header$median, header$earliest, units = "auto")) / - as.double(difftime(header$latest, header$earliest, units = "auto"))) %>% - percent(round = digits, decimal.mark = decimal.mark), ")") - header$latest <- paste0(format(header$latest, header$date_format), - " (+", - difftime(header$latest, header$earliest, units = "mins") %>% - as.double() %>% - format(digits = digits, decimal.mark = decimal.mark, big.mark = big.mark), - " min.)") - 
header$earliest <- format(header$earliest, header$date_format) - - header$median <- trimws(header$median) - header$latest <- trimws(header$latest) - header$earliest <- trimws(header$earliest) - } else { - header$median <- paste0(format(header$median, header$date_format), - " (", - (as.double(difftime(header$median, header$oldest, units = "auto")) / - as.double(difftime(header$newest, header$oldest, units = "auto"))) %>% - percent(round = digits, decimal.mark = decimal.mark), ")") - header$newest <- paste0(format(header$newest, header$date_format), - " (+", - difftime(header$newest, header$oldest, units = "auto") %>% - as.double() %>% - format(digits = digits, decimal.mark = decimal.mark, big.mark = big.mark), - ")") - header$oldest <- format(header$oldest, header$date_format) - - header$median <- trimws(header$median) - header$newest <- trimws(header$newest) - header$oldest <- trimws(header$oldest) - } - header <- header[names(header) != "date_format"] - } - - # class and mode - if (is.null(header$columns)) { - if (!header$mode %in% header$class) { - header$class <- header$class %>% rev() %>% paste(collapse = " > ") %>% paste0(silver(paste0(" (", header$mode, ")"))) - } else { - header$class <- header$class %>% rev() %>% paste(collapse = " > ") - } - header <- header[names(header) != "mode"] - } - # levels - if (!is.null(header$levels)) { - if (header$ordered == TRUE) { - levels_text <- paste0(header$levels, collapse = " < ") - } else { - levels_text <- paste0(header$levels, collapse = ", ") - } - if (nchar(levels_text) > 70) { - # levels text wider than half the console - levels_text <- paste0(substr(levels_text, 1, 70 - 3), "...") - if (nchar(gsub("[^`]", "", levels_text)) %% 2 == 1) { - # odd number of backticks, should be even - levels_text <- paste0(levels_text, "`") - } - } - header$levels <- paste0(length(header$levels), ": ", levels_text) - header <- header[names(header) != "ordered"] - } - # length and NAs - if (has_length == TRUE) { - na_txt <- 
paste0(header$na_length %>% format(decimal.mark = decimal.mark, big.mark = big.mark), " = ", - (header$na_length / header$length) %>% percent(force_zero = TRUE, round = digits, decimal.mark = decimal.mark) %>% - sub("NaN", "0", ., fixed = TRUE)) - if (!na_txt %like% "^0 =") { - na_txt <- red(na_txt) - } else { - na_txt <- green(na_txt) - } - na_txt <- paste0("(of which NA: ", na_txt, ")") - } else { - na_txt <- "" - } - header$length <- paste(format(header$length, decimal.mark = decimal.mark, big.mark = big.mark), - na_txt) - header <- header[names(header) != "na_length"] - - # format all numeric values - header <- lapply(header, function(x) { - if (is.numeric(x)) { - if (any(x < 1000, na.rm = TRUE)) { - format(round2(x, digits = digits), decimal.mark = decimal.mark, big.mark = big.mark) - } else { - format(x, digits = digits, decimal.mark = decimal.mark, big.mark = big.mark) - } - } else { - x - } - }) - - # numeric values - if (has_length == TRUE & !is.null(header$sd)) { - # any(x_class %in% c("double", "integer", "numeric", "raw", "single"))) { - header$sd <- paste0(header$sd, " (CV: ", header$cv, ", MAD: ", header$mad, ")") - header$fivenum <- paste0(paste(trimws(header$fivenum), collapse = " | "), " (IQR: ", header$IQR, ", CQV: ", header$cqv, ")") - header$outliers_total <- paste0(header$outliers_total, " (unique count: ", header$outliers_unique, ")") - header <- header[!names(header) %in% c("cv", "mad", "IQR", "cqv", "outliers_unique")] - } - - # header names - header_names <- paste0(names(header), ": ") - header_names <- gsub("sd", "SD", header_names) - header_names <- gsub("fivenum", "Five-Num", header_names) - header_names <- gsub("outliers_total", "Outliers", header_names) - # capitalise first character - header_names <- gsub("^(.)", "\\U\\1", header_names, perl = TRUE) - # make all header captions equal size - header_names <- gsub("\\s", " ", format(header_names, - width = max(nchar(header_names), - na.rm = TRUE))) - header <- paste0(header_names, 
header) - header <- paste(header, collapse = newline) - # add newline after 'Unique' - gsub("(.*Unique.*\\n)(.*?)", paste0("\\1", newline, "\\2"), header) -} - -#' @rdname freq -#' @export -#' @importFrom dplyr top_n pull -top_freq <- function(f, n) { - if (!is.freq(f)) { - stop("`top_freq` can only be applied to frequency tables", call. = FALSE) - } - if (!is.numeric(n) | length(n) != 1L) { - stop("For `top_freq`, 'n' must be a number of length 1", call. = FALSE) - } - top <- f %>% top_n(n, count) - vect <- top %>% pull(item) - names(vect) <- top %>% pull(count) - if (length(vect) > abs(n)) { - message("top_freq: selecting ", length(vect), " items instead of ", abs(n), ", because of ties") - } - vect -} - -#' @rdname freq -#' @export -header <- function(f, property = NULL) { - if (!is.freq(f)) { - stop("`header` can only be applied to frequency tables", call. = FALSE) - } - if (is.null(property)) { - attributes(f)$header - } else { - a <- attributes(f)$header - if (any(property %in% names(f))) { - a[names(a) %in% property] - } - } -} - -#' @noRd -#' @exportMethod diff.freq -#' @importFrom dplyr %>% full_join mutate -#' @export -diff.freq <- function(x, y, ...) 
{ - # check classes - if (!is.freq(x) | !is.freq(y)) { - stop("Both x and y must be a frequency table.") - } - - cat("Differences between frequency tables") - if (identical(x, y)) { - cat("\n\nNo differences found.\n") - return(invisible()) - } - - x.attr <- attributes(x)$opt - - # only keep item and count - x <- x[, 1:2] - y <- y[, 1:2] - - x <- x %>% - full_join(y, - by = colnames(x)[1], - suffix = c(".x", ".y")) %>% - mutate( - diff = case_when( - is.na(count.y) ~ -count.x, - is.na(count.x) ~ count.y, - TRUE ~ count.y - count.x)) %>% - mutate( - diff.percent = percent( - diff / count.x, - force_zero = TRUE)) %>% - mutate(diff = ifelse(diff %like% "^-", - diff, - paste0("+", diff)), - diff.percent = ifelse(diff.percent %like% "^-", - diff.percent, - paste0("+", diff.percent))) - - print( - knitr::kable(x, - format = x.attr$tbl_format, - col.names = c("Item", "Count #1", "Count #2", "Difference", "Diff. percent"), - align = paste0(x.attr$column_align[1], "rrrr"), - padding = 1) - ) -} - -#' @rdname freq -#' @exportMethod print.freq -#' @importFrom knitr kable +freq_def <- clean:::freq.default +#' @exportMethod freq.mo #' @importFrom dplyr n_distinct -#' @importFrom crayon bold silver +#' @importFrom clean freq #' @export -print.freq <- function(x, - nmax = getOption("max.print.freq", default = 15), - markdown = !interactive(), - header = TRUE, - decimal.mark = getOption("OutDec"), - big.mark = ifelse(decimal.mark != ",", ",", "."), - ...) { - - opt <- attr(x, "opt") - if (is.null(opt)) { - # selection of frequency table, return original class - class(x) <- class(x)[!class(x) %in% c("freq", "frequency_tbl")] - print(x) - return(invisible()) - } - - opt$header_txt <- header(x) - if (is.null(opt$nmax)) { - opt$nmax <- 0 - } - if (is.null(opt$tbl_format)) { - opt$tbl_format <- "pandoc" - } - - dots <- list(...) 
- if ("markdown" %in% names(dots)) { - if (dots$markdown == TRUE) { - opt$tbl_format <- "markdown" - } else { - opt$tbl_format <- "pandoc" - } - } - if (!missing(markdown)) { - if (markdown == TRUE) { - opt$tbl_format <- "markdown" - } else { - opt$tbl_format <- "pandoc" - } - } - - if (length(opt$vars) == 0) { - opt$vars <- NULL - } - - if (is.null(opt$title)) { - if (isTRUE(opt$data %like% "^a data.frame") & isTRUE(opt$tbl_format == "markdown")) { - opt$data <- gsub("data.frame", "`data.frame`", opt$data, fixed = TRUE) - } - if (!is.null(opt$data) & !is.null(opt$vars)) { - title <- paste0("`", paste0(opt$vars, collapse = "` and `"), "` from ", opt$data) - } else if (!is.null(opt$data) & is.null(opt$vars)) { - title <- opt$data - } else if (is.null(opt$data) & !is.null(opt$vars)) { - title <- paste0("`", paste0(opt$vars, collapse = "` and `"), "`") - } else { - title <- "" - } - if (title != "" & length(opt$group_var) != 0) { - group_var <- paste0("(grouped by `", opt$group_var, "`)") - if (opt$tbl_format == "pandoc") { - group_var <- silver(group_var) - } - title <- paste(title, group_var) - } - title <- trimws(title) - if (title == "") { - title <- "Frequency table" - } else { - title <- paste("Frequency table of", trimws(title)) - } - } else { - title <- opt$title - } - - if (!missing(nmax) | is.null(opt$nmax)) { - opt$nmax <- nmax - opt$nmax.set <- TRUE - } - if (isTRUE(opt$nmax %in% c(0, Inf, NA, NULL))) { - opt$nmax <- NROW(x) - opt$nmax.set <- FALSE - } else if (isTRUE(opt$nmax >= NROW(x))) { - opt$nmax.set <- FALSE - } - - if (!missing(decimal.mark) | is.null(opt$decimal.mark)) { - opt$decimal.mark <- decimal.mark - } - if (!missing(big.mark) | is.null(opt$big.mark)) { - opt$big.mark <- big.mark - } - if (!missing(header)) { - opt$header <- header - } - - # bold title - if (isTRUE(opt$tbl_format == "pandoc")) { - title <- bold(title) - } else if (isTRUE(opt$tbl_format == "markdown")) { - title <- paste0("\n\n**", title, "** ") # two space for newline - } - 
- cat(title, "\n\n") - - if (NROW(x) == 0 | isTRUE(all(is.na(x$item)))) { - cat("No observations") - if (isTRUE(all(is.na(x$item) | identical(x$item, "") | identical(x$item, "(NA)")))) { - cat(" - all values are missing ()") - } - cat(".\n") - if (opt$tbl_format == "markdown") { - cat("\n") - } - return(invisible()) - } - - if (isTRUE(opt$header == TRUE)) { - if (!is.null(opt$header_txt)) { - if (is.null(opt$digits)) { - opt$digits <- 2 - } - cat(format_header(x, digits = opt$digits, markdown = (opt$tbl_format == "markdown"), - decimal.mark = decimal.mark, big.mark = big.mark)) - } - } - - # save old NA setting for kable - opt.old <- options()$knitr.kable.NA - if (is.null(opt$na)) { - opt$na <- "" - } - if (isTRUE(opt$tbl_format == "markdown")) { - # no HTML tags - opt$na <- gsub("<", "(", opt$na, fixed = TRUE) - opt$na <- gsub(">", ")", opt$na, fixed = TRUE) - } - options(knitr.kable.NA = opt$na) - - x.rows <- nrow(x) - x.unprinted <- base::sum(x[(opt$nmax + 1):nrow(x), "count"], na.rm = TRUE) - x.printed <- base::sum(x$count) - x.unprinted - - if (nrow(x) > opt$nmax & isTRUE(opt$tbl_format != "markdown")) { - - if (opt$nmax.set == TRUE) { - nmax <- opt$nmax - } else { - nmax <- getOption("max.print.freq", default = 15) - } - - x <- x[1:nmax,] - - if (opt$nmax.set == TRUE) { - footer <- paste("[ reached `nmax = ", opt$nmax, "`", sep = "") - } else { - footer <- '[ reached getOption("max.print.freq")' - } - footer <- paste(footer, - " -- omitted ", - format(x.rows - opt$nmax, big.mark = opt$big.mark, decimal.mark = opt$decimal.mark), - " entries, n = ", - format(x.unprinted, big.mark = opt$big.mark, decimal.mark = opt$decimal.mark), - " (", - (x.unprinted / (x.unprinted + x.printed)) %>% percent(force_zero = TRUE, decimal.mark = opt$decimal.mark), - ") ]\n", sep = "") - if (opt$tbl_format == "pandoc") { - footer <- silver(footer) # only silver in regular printing - } - } else if (opt$tbl_format == "markdown") { - if (opt$nmax.set == TRUE) { - x <- x[1:opt$nmax,] - 
footer <- paste("\n(omitted ", - format(x.rows - opt$nmax, big.mark = opt$big.mark, decimal.mark = opt$decimal.mark), - " entries, n = ", - format(x.unprinted, big.mark = opt$big.mark, decimal.mark = opt$decimal.mark), - " [", - (x.unprinted / (x.unprinted + x.printed)) %>% percent(force_zero = TRUE, decimal.mark = opt$decimal.mark), - "])\n", sep = "") - } else { - footer <- NULL - } - } else { - footer <- NULL - } - - if ("item" %in% colnames(x)) { - if (any(class(x$item) %in% c("double", "integer", "numeric", "raw", "single"))) { - x$item <- format(x$item, decimal.mark = opt$decimal.mark, big.mark = opt$big.mark) - } - } else { - opt$column_names <- opt$column_names[!opt$column_names == "Item"] - } - - all_unique <- FALSE - if ("count" %in% colnames(x)) { - if (all(x$count == 1)) { - all_unique <- TRUE - } - x$count <- format(x$count, decimal.mark = opt$decimal.mark, big.mark = opt$big.mark) - } else { - opt$column_names <- opt$column_names[!opt$column_names == "Count"] - } - if ("percent" %in% colnames(x)) { - x$percent <- percent(x$percent, force_zero = TRUE, decimal.mark = opt$decimal.mark) - } else { - opt$column_names <- opt$column_names[!opt$column_names == "Percent"] - } - if ("cum_count" %in% colnames(x)) { - x$cum_count <- format(x$cum_count, decimal.mark = opt$decimal.mark, big.mark = opt$big.mark) - } else { - opt$column_names <- opt$column_names[!opt$column_names == "Cum. Count"] - } - if ("cum_percent" %in% colnames(x)) { - x$cum_percent <- percent(x$cum_percent, force_zero = TRUE, decimal.mark = opt$decimal.mark) - } else { - opt$column_names <- opt$column_names[!opt$column_names == "Cum. 
Percent"] - } - - if (opt$tbl_format == "markdown") { - cat("\n") - } - - if (is.null(opt$row_names)) { - opt$row_names <- TRUE - } - if (is.null(opt$column_names)) { - opt$column_names <- colnames(x) - } - - print( - knitr::kable(x, - format = opt$tbl_format, - row.names = opt$row_names, - col.names = opt$column_names, - align = opt$column_align, - padding = 1) - ) - - if (!is.null(footer)) { - cat(footer) - } - - if (opt$tbl_format == "markdown") { - cat("\n\n") - } else { - cat("\n") - } - - if (all_unique == TRUE) { - message("NOTE: All observations are unique.") - } - - # reset old kable setting - options(knitr.kable.NA = opt.old) - return(invisible()) - -} - #' @noRd -#' @exportMethod print.frequency_tbl -#' @export -print.frequency_tbl <- print.freq +freq.mo <- function(x, ...) { + # replace with freq.default() if next `clean` version is published on CRAN + freq_def(x = x, ..., + .add_header = list(families = n_distinct(mo_family(x, language = NULL)), + genera = n_distinct(mo_genus(x, language = NULL)), + species = n_distinct(paste(mo_genus(x, language = NULL), + mo_species(x, language = NULL))))) +} +#' @exportMethod freq.rsi +#' @importFrom clean freq +#' @export #' @noRd -#' @exportMethod as.data.frame.freq -#' @export -as.data.frame.freq <- function(x, ...) { - attr(x, "package") <- NULL - attr(x, "opt") <- NULL - as.data.frame.data.frame(x, ...) -} - -#' @exportMethod select.freq -#' @export -#' @importFrom dplyr select -#' @noRd -select.freq <- function(.data, ...) { - select(as.data.frame(.data), ...) -} - -#' @noRd -#' @exportMethod as_tibble.freq -#' @export -#' @importFrom dplyr as_tibble -as_tibble.freq <- function(x, validate = TRUE, ..., rownames = NA) { - attr(x, "package") <- NULL - attr(x, "opt") <- NULL - as_tibble(x = as.data.frame(x), validate = validate, ..., rownames = rownames) -} - -#' @noRd -#' @exportMethod hist.freq -#' @export -#' @importFrom graphics hist -hist.freq <- function(x, breaks = "Sturges", main = NULL, xlab = NULL, ...) 
{ - opt <- attr(x, "opt") - if (!class(x$item) %in% c("numeric", "double", "integer", "Date")) { - stop("`x` must be numeric or Date.", call. = FALSE) - } - if (!is.null(opt$vars)) { - title <- opt$vars - } else if (!is.null(opt$data)) { - title <- opt$data +freq.rsi <- function(x, ...) { + x_name <- deparse(substitute(x)) + x_name <- gsub(".*[$]", "", x_name) + ab <- suppressMessages(suppressWarnings(AMR::as.ab(x_name))) + if (!is.na(ab)) { + freq_def(x = x, ..., + .add_header = list(Drug = paste0(ab_name(ab), " (", ab, ", ", ab_atc(ab), ")"), + group = ab_group(ab), + `%SI` = portion_SI(x, minimum = 0, as_percent = TRUE))) } else { - title <- "frequency table" - } - if (class(x$item) == "Date") { - x <- as.Date(as.vector(x), origin = "1970-01-01") - } else { - x <- as.vector(x) - } - if (is.null(main)) { - main <- paste("Histogram of", title) - } - if (is.null(xlab)) { - xlab <- title - } - hist(x, main = main, xlab = xlab, breaks = breaks, ...) -} - -#' @noRd -#' @exportMethod boxplot.freq -#' @export -#' @importFrom graphics boxplot -boxplot.freq <- function(x, main = NULL, xlab = NULL, ...) { - opt <- attr(x, "opt") - x.bak <- x - if (!class(x$item) %in% c("numeric", "double", "integer", "Date")) { - stop("`x` must be numeric or Date.", call. 
= FALSE) - } - if (!is.null(opt$vars)) { - title <- opt$vars - } else if (!is.null(opt$data)) { - title <- opt$data - } else { - title <- "frequency table" - } - if (class(x$item) == "Date") { - x <- as.Date(as.vector(x), origin = "1970-01-01") - } else { - x <- as.vector(x) - } - if (is.null(main)) { - main <- paste("Boxplot of", title) - } - if (is.null(xlab)) { - xlab <- title - } - if (!is.null(opt$group_var) & isTRUE(length(opt$group_var) > 0)) { - # support for grouped frequency table - x.new <- data.frame(group = character(0), item = character(0)) - for (i in 1:nrow(x.bak)) { - if (x.bak[i, "group"] == "") { - x.bak[i, "group"] <- x.bak[i - 1, "group"] - } - for (j in 1:x.bak[i, "count"]) { - x.new <- rbind(x.new, - data.frame(group = x.bak[i, "group"], - item = x.bak[i, "item"])) - } - } - boxplot(item ~ group, data = x.bak, main = main, ylab = xlab, xlab = opt$group_var, ...) - } else { - boxplot(x, main = main, xlab = xlab, ...) - } -} - -#' @noRd -#' @exportMethod plot.freq -#' @export -plot.freq <- function(x, y, ...) { - opt <- attr(x, "opt") - if (!is.null(opt$vars)) { - title <- opt$vars - } else { - title <- "" - } - plot(x = x$item, y = x$count, ylab = "Count", xlab = title, ...) -} - -#' @noRd -#' @exportMethod as.vector.freq -#' @export -as.vector.freq <- function(x, mode = "any") { - as.vector(rep(x$item, x$count), mode = mode) -} - -#' @noRd -#' @exportMethod format.freq -#' @export -format.freq <- function(x, digits = 1, ...) { - opt <- attr(x, "opt") - if (opt$nmax.set == TRUE) { - nmax <- opt$nmax - } else { - nmax <- getOption("max.print.freq", default = 15) - } - - x <- x[1:nmax,] - x$percent <- percent(x$percent, round = digits, force_zero = TRUE) - x$cum_percent <- percent(x$cum_percent, round = digits, force_zero = TRUE) - base::format.data.frame(x, ...) 
-} - -#' @importFrom data.table address -get_data_source_name <- function(x, else_txt = NULL) { - obj_addr <- address(x) - # try global environment - addrs <- unlist(lapply(ls(".GlobalEnv"), function(x) address(get(x)))) - res <- ls(".GlobalEnv")[addrs == obj_addr] - if (length(res) == 0) { - # check AMR package - some users might use our data sets for testing - addrs <- unlist(lapply(ls("package:AMR"), function(x) address(get(x)))) - res <- ls("package:AMR")[addrs == obj_addr] - } - if (length(res) == 0) { - else_txt - } else { - res + freq_def(x = x, ..., + .add_header = list(`%SI` = portion_SI(x, minimum = 0, as_percent = TRUE))) } } diff --git a/R/get_locale.R b/R/get_locale.R index a81a4c1ae..b5316b24f 100755 --- a/R/get_locale.R +++ b/R/get_locale.R @@ -67,7 +67,10 @@ get_locale <- function() { } lang <- Sys.getlocale("LC_COLLATE") - # grepl with case = FALSE is faster than like + + # Check the locale settings for a start with one of these languages: + + # grepl() with ignore.case = FALSE is faster than %like% if (grepl("^(English|en_|EN_)", lang, ignore.case = FALSE)) { # as first option to optimise speed "en" diff --git a/R/guess_ab_col.R b/R/guess_ab_col.R index 148bea8fc..0fc662af0 100755 --- a/R/guess_ab_col.R +++ b/R/guess_ab_col.R @@ -79,6 +79,13 @@ guess_ab_col <- function(x = NULL, search_string = NULL, verbose = FALSE) { search_string.ab <- suppressWarnings(as.ab(search_string)) if (search_string.ab %in% colnames(x)) { ab_result <- colnames(x)[colnames(x) == search_string.ab][1L] + + } else if (any(tolower(colnames(x)) %in% tolower(unlist(ab_property(search_string.ab, "abbreviations"))))) { + ab_result <- colnames(x)[tolower(colnames(x)) %in% tolower(unlist(ab_property(search_string.ab, "abbreviations")))][1L] + + # } else if (any(tolower(colnames(x)) %in% tolower(ab_tradenames(search_string.ab)))) { + # ab_result <- colnames(x)[tolower(colnames(x)) %in% tolower(ab_tradenames(search_string.ab))][1L] + } else { # sort colnames on length - longest 
first cols <- colnames(x[, x %>% colnames() %>% nchar() %>% order() %>% rev()]) diff --git a/R/like.R b/R/like.R index c46595e90..b7d6c158d 100755 --- a/R/like.R +++ b/R/like.R @@ -48,6 +48,7 @@ #' #' # get frequencies of bacteria whose name start with 'Ent' or 'ent' #' library(dplyr) +#' library(clean) #' septic_patients %>% #' left_join_microorganisms() %>% #' filter(genus %like% '^ent') %>% @@ -75,7 +76,11 @@ like <- function(x, pattern) { if (is.factor(x)) { as.integer(x) %in% base::grep(pattern, levels(x), ignore.case = TRUE) } else { - base::grepl(pattern, x, ignore.case = TRUE) + tryCatch(base::grepl(pattern, x, ignore.case = TRUE), + error = function(e) ifelse(test = grepl("Invalid regexp", e$message), + # try with perl = TRUE: + yes = return(base::grepl(pattern, x, ignore.case = TRUE, perl = TRUE)), + no = stop(e$message))) } } diff --git a/R/mdro.R b/R/mdro.R index e177fe309..8ce9d9284 100755 --- a/R/mdro.R +++ b/R/mdro.R @@ -496,8 +496,8 @@ mdro <- function(x, #' @rdname mdro #' @export -brmo <- function(..., guideline = "BRMO") { - mdro(..., guideline = "BRMO") +brmo <- function(x, guideline = "BRMO", ...) { + mdro(x, guideline = "BRMO", ...) } #' @rdname mdro diff --git a/R/mic.R b/R/mic.R index b88647f6d..f8222e9b0 100755 --- a/R/mic.R +++ b/R/mic.R @@ -51,6 +51,8 @@ #' #' plot(mic_data) #' barplot(mic_data) +#' +#' library(clean) #' freq(mic_data) as.mic <- function(x, na.rm = FALSE) { if (is.mic(x)) { diff --git a/R/misc.R b/R/misc.R index babb054c3..7040eb000 100755 --- a/R/misc.R +++ b/R/misc.R @@ -71,18 +71,20 @@ size_humanreadable <- function(bytes, decimals = 1) { out } -percent_scales <- scales::percent +percent_clean <- clean:::percent # No export, no Rd -# based on scales::percent -percent <- function(x, round = 1, force_zero = FALSE, decimal.mark = getOption("OutDec"), ...) { - x <- percent_scales(x = as.double(x), - accuracy = 1 / 10 ^ round, - decimal.mark = decimal.mark, - ...) 
- if (force_zero == FALSE) { - x <- gsub("([.]%|%%)", "%", paste0(gsub("0+%$", "", x), "%")) +percent <- function(x, round = 1, force_zero = FALSE, decimal.mark = getOption("OutDec"), big.mark = ",", ...) { + if (decimal.mark == big.mark) { + if (decimal.mark == ",") { + big.mark <- "." + } else if (decimal.mark == ".") { + big.mark <- "," + } else { + big.mark <- " " + } } - x + x <- percent_clean(x = x, round = round, force_zero = force_zero, + decimal.mark = decimal.mark, big.mark = big.mark, ...) } #' @importFrom crayon blue bold red @@ -97,8 +99,10 @@ search_type_in_df <- function(x, type) { if (type == "mo") { if ("mo" %in% lapply(x, class)) { found <- colnames(x)[lapply(x, class) == "mo"][1] - } else if (any(colnames(x) %like% "^(mo|microorganism|organism|bacteria)s?$")) { - found <- colnames(x)[colnames(x) %like% "^(mo|microorganism|organism|bacteria)s?$"][1] + } else if (any(colnames(x) %like% "^(mo|microorganism|organism|bacteria|bacterie)s?$")) { + found <- colnames(x)[colnames(x) %like% "^(mo|microorganism|organism|bacteria|bacterie)s?$"][1] + } else if (any(colnames(x) %like% "^(microorganism|organism|bacteria|bacterie)")) { + found <- colnames(x)[colnames(x) %like% "^(microorganism|organism|bacteria|bacterie)"][1] } else if (any(colnames(x) %like% "species")) { found <- colnames(x)[colnames(x) %like% "species"][1] } diff --git a/R/mo.R b/R/mo.R index dab25c054..a85877732 100755 --- a/R/mo.R +++ b/R/mo.R @@ -472,7 +472,9 @@ exec_as.mo <- function(x, x_backup_without_spp <- x x_species <- paste(x, "species") # translate to English for supported languages of mo_property - x <- gsub("(Gruppe|gruppe|groep|grupo|gruppo|groupe)", "group", x, ignore.case = TRUE) + x <- gsub("(gruppe|groep|grupo|gruppo|groupe)", "group", x, ignore.case = TRUE) + x <- gsub("(hefe|gist|gisten|levadura|lievito|fermento|levure)[a-z]*", "yeast", x, ignore.case = TRUE) + x <- gsub("(schimmels?|mofo|molde|stampo|moisissure)[a-z]*", "fungus", x, ignore.case = TRUE) # remove non-text 
in case of "E. coli" except dots and spaces x <- gsub("[^.a-zA-Z0-9/ \\-]+", "", x) # replace minus by a space @@ -483,7 +485,7 @@ exec_as.mo <- function(x, x <- gsub("(alpha|beta|gamma).?ha?emoly", "\\1-haemoly", x) # remove genus as first word x <- gsub("^Genus ", "", x) - # allow characters that resemble others + # allow characters that resemble others ---- if (initial_search == FALSE) { x <- tolower(x) x <- gsub("[iy]+", "[iy]+", x) @@ -493,15 +495,17 @@ exec_as.mo <- function(x, x <- gsub("a+", "a+", x) x <- gsub("u+", "u+", x) # allow any ending of -um, -us, -ium, -icum, -ius, -icus, -ica and -a (needs perl for the negative backward lookup): - x <- gsub("(u\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+)(?![a-z[])", + x <- gsub("(u\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+)(?![a-z])", "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) - x <- gsub("(\\[iy\\]\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+a\\+)(?![a-z[])", + x <- gsub("(\\[iy\\]\\+\\(c\\|k\\|q\\|qu\\+\\|s\\|z\\|x\\|ks\\)\\+a\\+)(?![a-z])", "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) - x <- gsub("(\\[iy\\]\\+u\\+m)(?![a-z[])", + x <- gsub("(\\[iy\\]\\+u\\+m)(?![a-z])", "(u[s|m]|[iy][ck]?u[ms]|[iy]?[ck]?a)", x, ignore.case = TRUE, perl = TRUE) x <- gsub("e+", "e+", x, ignore.case = TRUE) x <- gsub("o+", "o+", x, ignore.case = TRUE) x <- gsub("(.)\\1+", "\\1+", x) + # allow ending in -en or -us + x <- gsub("e\\+n(?![a-z[])", "(e+n|u+(c|k|q|qu|s|z|x|ks)+)", x, ignore.case = TRUE, perl = TRUE) } x <- strip_whitespace(x) @@ -519,7 +523,7 @@ exec_as.mo <- function(x, x_withspaces_end_only <- paste0(x_withspaces, '$') x_withspaces_start_end <- paste0('^', x_withspaces, '$') - if (debug == TRUE) { + if (isTRUE(debug)) { cat(paste0('x "', x, '"\n')) cat(paste0('x_species "', x_species, '"\n')) cat(paste0('x_withspaces_start_only "', x_withspaces_start_only, '"\n')) @@ -725,6 +729,14 @@ exec_as.mo <- function(x, } next } + if (x_backup_without_spp[i] 
%like% 'haemoly.*strept') { + # Haemolytic streptococci in different languages + x[i] <- microorganismsDT[mo == 'B_STRPT_HAE', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } # CoNS/CoPS in different languages (support for German, Dutch, Spanish, Portuguese) ---- if (x_backup_without_spp[i] %like% '[ck]oagulas[ea] negatie?[vf]' | x_trimmed[i] %like% '[ck]oagulas[ea] negatie?[vf]' @@ -787,6 +799,32 @@ exec_as.mo <- function(x, } next } + + # trivial names known to the field: + if ("meningococcus" %like% x_trimmed[i]) { + # coerce S. coagulase positive + x[i] <- microorganismsDT[mo == 'B_NESSR_MEN', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if ("gonococcus" %like% x_trimmed[i]) { + # coerce S. coagulase positive + x[i] <- microorganismsDT[mo == 'B_NESSR_GON', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } + if ("pneumococcus" %like% x_trimmed[i]) { + # coerce S. coagulase positive + x[i] <- microorganismsDT[mo == 'B_STRPT_PNE', ..property][[1]][1L] + if (initial_search == TRUE) { + set_mo_history(x_backup[i], get_mo_code(x[i], property), 0, force = force_mo_history) + } + next + } } # FIRST TRY FULLNAMES AND CODES ---- @@ -1006,6 +1044,9 @@ exec_as.mo <- function(x, if (uncertainty_level >= 1) { # (1) look again for old taxonomic names, now for G. species ---- + if (isTRUE(debug)) { + cat("\n[UNCERTAINLY LEVEL 1] (1) look again for old taxonomic names, now for G. 
species\n") + } found <- microorganisms.oldDT[fullname %like% c.x_withspaces_start_end | fullname %like% d.x_withspaces_start_only] if (NROW(found) > 0 & nchar(g.x_backup_without_spp) >= 6) { @@ -1035,7 +1076,10 @@ exec_as.mo <- function(x, # (2) Try with misspelled input ---- # just rerun with initial_search = FALSE will used the extensive regex part above - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, allow_uncertain = FALSE))) + if (isTRUE(debug)) { + cat("\n[UNCERTAINLY LEVEL 1] (2) Try with misspelled input\n") + } + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup, initial_search = FALSE, allow_uncertain = FALSE, debug = debug))) if (!empty_result(found)) { found_result <- found found <- microorganismsDT[mo == found, ..property][[1]] @@ -1054,6 +1098,9 @@ exec_as.mo <- function(x, if (uncertainty_level >= 2) { # (3) look for genus only, part of name ---- + if (isTRUE(debug)) { + cat("\n[UNCERTAINLY LEVEL 2] (3) look for genus only, part of name\n") + } if (nchar(g.x_backup_without_spp) > 4 & !b.x_trimmed %like% " ") { if (!grepl("^[A-Z][a-z]+", b.x_trimmed, ignore.case = FALSE)) { # not when input is like Genustext, because then Neospora would lead to Actinokineospora @@ -1074,9 +1121,12 @@ exec_as.mo <- function(x, } # (4) strip values between brackets ---- + if (isTRUE(debug)) { + cat("\n[UNCERTAINLY LEVEL 2] (4) strip values between brackets\n") + } a.x_backup_stripped <- gsub("( *[(].*[)] *)", " ", a.x_backup) a.x_backup_stripped <- trimws(gsub(" +", " ", a.x_backup_stripped)) - found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, allow_uncertain = FALSE))) + found <- suppressMessages(suppressWarnings(exec_as.mo(a.x_backup_stripped, initial_search = FALSE, allow_uncertain = FALSE, debug = debug))) if (!empty_result(found) & nchar(g.x_backup_without_spp) >= 6) { found_result <- found found <- microorganismsDT[mo == found, ..property][[1]] @@ -1092,6 
+1142,9 @@ exec_as.mo <- function(x, } # (5a) try to strip off half an element from end and check the remains ---- + if (isTRUE(debug)) { + cat("\n[UNCERTAINLY LEVEL 2] (5a) try to strip off half an element from end and check the remains\n") + } x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() if (length(x_strip) > 1) { for (i in 1:(length(x_strip) - 1)) { @@ -1100,7 +1153,7 @@ exec_as.mo <- function(x, # remove last half of the second term x_strip_collapsed <- paste(c(x_strip[1:(length(x_strip) - i)], lastword_half), collapse = " ") if (nchar(x_strip_collapsed) >= 4) { - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE))) + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE, debug = debug))) if (!empty_result(found)) { found_result <- found found <- microorganismsDT[mo == found, ..property][[1]] @@ -1118,11 +1171,14 @@ exec_as.mo <- function(x, } } # (5b) try to strip off one element from end and check the remains ---- + if (isTRUE(debug)) { + cat("\n[UNCERTAINLY LEVEL 2] (5b) try to strip off one element from end and check the remains\n") + } if (length(x_strip) > 1) { for (i in 1:(length(x_strip) - 1)) { x_strip_collapsed <- paste(x_strip[1:(length(x_strip) - i)], collapse = " ") if (nchar(x_strip_collapsed) >= 4) { - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE))) + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE, debug = debug))) if (!empty_result(found)) { found_result <- found found <- microorganismsDT[mo == found, ..property][[1]] @@ -1139,12 +1195,47 @@ exec_as.mo <- function(x, } } } + # (5c) check for unknown yeasts/fungi ---- + if (isTRUE(debug)) { + cat("\n[UNCERTAINLY LEVEL 2] (5b) check for unknown yeasts/fungi\n") + } + if (b.x_trimmed %like% "yeast") { + 
found <- "F_YEAST" + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + data.frame(uncertainty = 2, + input = a.x_backup, + fullname = microorganismsDT[mo == found_result[1L], fullname][[1]], + mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } + if (b.x_trimmed %like% "fungus") { + found <- "F_FUNGUS" + found_result <- found + found <- microorganismsDT[mo == found, ..property][[1]] + uncertainties <<- rbind(uncertainties, + data.frame(uncertainty = 2, + input = a.x_backup, + fullname = microorganismsDT[mo == found_result[1L], fullname][[1]], + mo = found_result[1L])) + if (initial_search == TRUE) { + set_mo_history(a.x_backup, get_mo_code(found[1L], property), 2, force = force_mo_history) + } + return(found[1L]) + } # (6) try to strip off one element from start and check the remains (only allow >= 2-part name outcome) ---- + if (isTRUE(debug)) { + cat("\n[UNCERTAINLY LEVEL 2] (6) try to strip off one element from start and check the remains (only allow >= 2-part name outcome)\n") + } x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { for (i in 2:(length(x_strip))) { x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE))) + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE, debug = debug))) if (!empty_result(found)) { found_result <- found found <- microorganismsDT[mo == found_result[1L], ..property][[1]] @@ -1167,11 +1258,14 @@ exec_as.mo <- function(x, if (uncertainty_level >= 3) { # (7) try to strip off one element from start and check the remains ---- + if (isTRUE(debug)) { + cat("\n[UNCERTAINLY 
LEVEL 3] (7) try to strip off one element from start and check the remains\n") + } x_strip <- a.x_backup %>% strsplit(" ") %>% unlist() if (length(x_strip) > 1 & nchar(g.x_backup_without_spp) >= 6) { for (i in 2:(length(x_strip))) { x_strip_collapsed <- paste(x_strip[i:length(x_strip)], collapse = " ") - found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE))) + found <- suppressMessages(suppressWarnings(exec_as.mo(x_strip_collapsed, initial_search = FALSE, allow_uncertain = FALSE, debug = debug))) if (!empty_result(found)) { found_result <- found found <- microorganismsDT[mo == found, ..property][[1]] @@ -1189,6 +1283,9 @@ exec_as.mo <- function(x, } # (8) part of a name (very unlikely match) ---- + if (isTRUE(debug)) { + cat("\n[UNCERTAINLY LEVEL 3] (8) part of a name (very unlikely match)\n") + } found <- microorganismsDT[fullname %like% f.x_withspaces_end_only] if (nrow(found) > 0) { found_result <- found[["mo"]] @@ -1217,7 +1314,7 @@ exec_as.mo <- function(x, x_withspaces_end_only[i], x_backup_without_spp[i]) if (!empty_result(x[i])) { - # no set_mo_history: is already set in uncertain_fn() + # no set_mo_history here - it is already set in uncertain_fn() next } @@ -1238,10 +1335,10 @@ exec_as.mo <- function(x, if (n_distinct(failures) > 1) { plural <- c("values", "them", "were") } - total_failures <- length(x_input[x_input %in% failures & !x_input %in% c(NA, NULL, NaN)]) + total_failures <- length(x_input[as.character(x_input) %in% as.character(failures) & !x_input %in% c(NA, NULL, NaN)]) total_n <- length(x_input[!x_input %in% c(NA, NULL, NaN)]) msg <- paste0(nr2char(n_distinct(failures)), " unique ", plural[1], - " (^= ", percent(total_failures / total_n, round = 1, force_zero = TRUE), + " (covering ", percent(total_failures / total_n, round = 1, force_zero = TRUE), ") could not be coerced and ", plural[3], " considered 'unknown'") if (n_distinct(failures) <= 10) { msg <- paste0(msg, ": ", 
paste('"', unique(failures), '"', sep = "", collapse = ', ')) @@ -1412,6 +1509,7 @@ print.mo <- function(x, ...) { #' @exportMethod summary.mo #' @importFrom dplyr n_distinct +#' @importFrom clean freq top_freq #' @export #' @noRd summary.mo <- function(object, ...) { diff --git a/R/mo_property.R b/R/mo_property.R index 8e72cabe2..568340035 100755 --- a/R/mo_property.R +++ b/R/mo_property.R @@ -151,13 +151,18 @@ mo_shortname <- function(x, language = get_locale(), ...) { x.mo <- AMR::as.mo(x, ...) metadata <- get_mo_failures_uncertainties_renamed() + replace_empty <- function(x) { + x[x == ""] <- "spp." + x + } + # get first char of genus and complete species in English - shortnames <- paste0(substr(mo_genus(x.mo, language = NULL), 1, 1), ". ", mo_species(x.mo, language = NULL)) - + shortnames <- paste0(substr(mo_genus(x.mo, language = NULL), 1, 1), ". ", replace_empty(mo_species(x.mo, language = NULL))) + # exceptions for Staphylococci shortnames[shortnames == "S. coagulase-negative" ] <- "CoNS" shortnames[shortnames == "S. coagulase-positive" ] <- "CoPS" - # exceptions for Streptococci + # exceptions for Streptococci: Streptococcus Group A -> GAS shortnames[shortnames %like% "S. group [ABCDFGHK]"] <- paste0("G", gsub("S. group ([ABCDFGHK])", "\\1", shortnames[shortnames %like% "S. group [ABCDFGHK]"]), "S") load_mo_failures_uncertainties_renamed(metadata) diff --git a/R/resistance_predict.R b/R/resistance_predict.R index b4a97706c..bf100ce95 100755 --- a/R/resistance_predict.R +++ b/R/resistance_predict.R @@ -28,8 +28,8 @@ #' @param year_max highest year to use in the prediction model, defaults to 10 years after today #' @param year_every unit of sequence between lowest year found in the data and \code{year_max} #' @param minimum minimal amount of available isolates per year to include. Years containing less observations will be estimated by the model. -#' @param model the statistical model of choice. 
Defaults to a generalised linear regression model with binomial distribution, assuming that a period of zero resistance was followed by a period of increasing resistance leading slowly to more and more resistance. See Details for valid options. -#' @param I_as_S a logical to indicate whether values \code{I} should be treated as \code{S} +#' @param model the statistical model of choice. Defaults to a generalised linear regression model with binomial distribution (i.e. using \code{\link{glm}(..., family = \link{binomial})}), assuming that a period of zero resistance was followed by a period of increasing resistance leading slowly to more and more resistance. See Details for valid options. +#' @param I_as_S a logical to indicate whether values \code{I} should be treated as \code{S} (will otherwise be treated as \code{R}) #' @param preserve_measurements a logical to indicate whether predictions of years that are actually available in the data should be overwritten by the original data. The standard errors of those years will be \code{NA}. #' @param info a logical to indicate whether textual analysis should be printed with the name and \code{\link{summary}} of the statistical model. #' @param main title of the plot @@ -58,7 +58,7 @@ #' @rdname resistance_predict #' @export #' @importFrom stats predict glm lm -#' @importFrom dplyr %>% pull mutate mutate_at n group_by_at summarise filter filter_at all_vars n_distinct arrange case_when n_groups transmute +#' @importFrom dplyr %>% pull mutate mutate_at n group_by_at summarise filter filter_at all_vars n_distinct arrange case_when n_groups transmute ungroup #' @inheritSection AMR Read more on our website! 
#' @examples #' x <- resistance_predict(septic_patients, col_ab = "AMX", year_min = 2010) @@ -162,18 +162,19 @@ resistance_predict <- function(x, as.integer(format(as.Date(x), '%Y')) } } - + df <- x %>% mutate_at(col_ab, as.rsi) %>% - mutate_at(col_ab, droplevels) %>% - mutate_at(col_ab, ~( - if (I_as_S == TRUE) { - gsub("I", "S", .) - } else { - # then I as R - gsub("I", "R", .) - } - )) %>% + mutate_at(col_ab, droplevels) + if (I_as_S == TRUE) { + df <- df %>% + mutate_at(col_ab, ~gsub("I", "S", .)) + } else { + # then I as R + df <- df %>% + mutate_at(col_ab, ~gsub("I", "R", .)) + } + df <- df %>% filter_at(col_ab, all_vars(!is.na(.))) %>% mutate(year = pull(., col_date) %>% year()) %>% group_by_at(c('year', col_ab)) %>% diff --git a/R/rsi.R b/R/rsi.R index 3abec4156..39178ba6f 100755 --- a/R/rsi.R +++ b/R/rsi.R @@ -50,7 +50,7 @@ #' @return Ordered factor with new class \code{rsi} #' @keywords rsi #' @export -#' @importFrom dplyr %>% +#' @importFrom dplyr %>% desc arrange filter #' @seealso \code{\link{as.mic}} #' @inheritSection AMR Read more on our website! #' @examples @@ -73,6 +73,8 @@ #' #' plot(rsi_data) # for percentages #' barplot(rsi_data) # for frequencies +#' +#' library(clean) #' freq(rsi_data) # frequency table with informative header #' #' # using dplyr's mutate diff --git a/R/sysdata.rda b/R/sysdata.rda index 43047d60c..c24a8039f 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/R/whocc.R b/R/whocc.R index 6566a6ba1..bd103a67e 100755 --- a/R/whocc.R +++ b/R/whocc.R @@ -24,7 +24,7 @@ #' All antimicrobial drugs and their official names, ATC codes, ATC groups and defined daily dose (DDD) are included in this package, using the WHO Collaborating Centre for Drug Statistics Methodology. 
#' @section WHOCC: #' \if{html}{\figure{logo_who.png}{options: height=60px style=margin-bottom:5px} \cr} -#' This package contains \strong{all ~450 antimicrobial drugs} and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC, \url{https://www.whocc.no}) and the Pharmaceuticals Community Register of the European Commission (\url{http://ec.europa.eu/health/documents/community-register/html/atc.htm}). +#' This package contains \strong{all ~450 antimicrobial drugs} and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC, \url{https://www.whocc.no}) and the Pharmaceuticals Community Register of the European Commission (\url{http://ec.europa.eu/health/documents/community-register/html/atc.htm}). \strong{NOTE: The WHOCC copyright does not allow use for commercial purposes, unlike any other info from this package. See \url{https://www.whocc.no/copyright_disclaimer/}.} #' #' These have become the gold standard for international drug utilisation monitoring and research. 
#' diff --git a/_pkgdown.yml b/_pkgdown.yml index 380e5f533..b230db94c 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -62,12 +62,6 @@ navbar: - text: "Get properties of an antibiotic" icon: "fa-capsules" href: "reference/ab_property.html" # reference instead of article - - text: "Create frequency tables" - icon: "fa-sort-amount-down" - href: "articles/freq.html" -# - text: "Use the G-test" -# icon: "fa-clipboard-check" -# href: "reference/g.test.html" # reference instead of article - text: "Other: benchmarks" icon: "fa-shipping-fast" href: "articles/benchmarks.html" @@ -130,13 +124,12 @@ reference: - title: "Analysing your data" desc: > Functions for conducting AMR analysis, like counting isolates, calculating - resistance or susceptibility, creating frequency tables or make plots. + resistance or susceptibility, or make plots. contents: - "`availability`" - "`count`" - "`portion`" - "`filter_ab_class`" - - "`freq`" - "`g.test`" - "`ggplot_rsi`" - "`kurtosis`" diff --git a/data-raw/eucast_rules.tsv b/data-raw/eucast_rules.tsv index 9c9ccae87..4b822a634 100644 --- a/data-raw/eucast_rules.tsv +++ b/data-raw/eucast_rules.tsv @@ -1,4 +1,14 @@ -if_mo_property like_is_one_of this_value and_these_antibiotics have_these_values then_change_these_antibiotics to_value reference.rule reference.rule_group +# --------------------------------------------------------------------------------------------------- +# For editing this EUCAST reference file, these values can all be used for target antibiotics: +# all_betalactams, aminoglycosides, carbapenems, cephalosporins, cephalosporins_without_CAZ, fluoroquinolones, +# glycopeptides, macrolides, minopenicillins, polymyxins, streptogramins, tetracyclines, ureidopenicillins +# and all separate EARS-Net letter codes like AMC. They can be separated by comma: 'AMC, fluoroquinolones'. +# The if_mo_property column can be any column name from the AMR::microorganisms data set, or "genus_species" or "gramstain". 
+# The like.is.one_of column must contain one of: like, is, one_of ('like' will read the first column as regular expression) +# The EUCAST guideline contains references to the 'Burkholderia cepacia complex'. All species in this group can be found in: LiPuma J, Curr Opin Pulm Med. 2005 Nov;11(6):528-33. (PMID 16217180). +# >>>>> IF YOU WANT TO IMPORT THIS FILE INTO YOUR OWN SOFTWARE, HAVE THE FIRST 10 LINES SKIPPED <<<<< +# --------------------------------------------------------------------------------------------------- +if_mo_property like.is.one_of this_value and_these_antibiotics have_these_values then_change_these_antibiotics to_value reference.rule reference.rule_group order is Enterobacteriales AMP S AMX S Enterobacteriales (Order) Breakpoints order is Enterobacteriales AMP I AMX I Enterobacteriales (Order) Breakpoints order is Enterobacteriales AMP R AMX R Enterobacteriales (Order) Breakpoints diff --git a/data-raw/internals.R b/data-raw/internals.R index 596ba5f76..1be61db03 100644 --- a/data-raw/internals.R +++ b/data-raw/internals.R @@ -1,14 +1,16 @@ -# EUCAST rules ---- -# For editing the reference file, these values can all be used for target antibiotics: -# "aminoglycosides", "tetracyclines", "polymyxins", "macrolides", "glycopeptides", -# "streptogramins", "cephalosporins", "cephalosporins_without_CAZ", "carbapenems", -# "minopenicillins", "ureidopenicillins", "fluoroquinolones", "all_betalactams", -# and all separate EARS-Net letter codes like "AMC". They can be separated by comma: "AMC, fluoroquinolones". -# The mo_property can be any column name from the AMR::microorganisms data set, or "genus_species" or "gramstain". -# The EUCAST guideline contains references to the 'Burkholderia cepacia complex'. The species in this group can be found in: -# LiPuma JJ, Curr Opin Pulm Med. 2005 Nov;11(6):528-33. (PMID 16217180). 
+# --------------------------------------------------------------------------------------------------- +# For editing this EUCAST reference file, these values can all be used for target antibiotics: +# all_betalactams, aminoglycosides, carbapenems, cephalosporins, cephalosporins_without_CAZ, fluoroquinolones, +# glycopeptides, macrolides, minopenicillins, polymyxins, streptogramins, tetracyclines, ureidopenicillins +# and all separate EARS-Net letter codes like AMC. They can be separated by comma: 'AMC, fluoroquinolones'. +# The if_mo_property column can be any column name from the AMR::microorganisms data set, or "genus_species" or "gramstain". +# The EUCAST guideline contains references to the 'Burkholderia cepacia complex'. All species in this group can be found in: +# LiPuma J, Curr Opin Pulm Med. 2005 Nov;11(6):528-33. (PMID 16217180). +# >>>>> IF YOU WANT TO IMPORT THIS FILE INTO YOUR OWN SOFTWARE, HAVE THE FIRST 10 LINES SKIPPED <<<<< +# --------------------------------------------------------------------------------------------------- eucast_rules_file <- dplyr::arrange( .data = utils::read.delim(file = "data-raw/eucast_rules.tsv", + skip = 10, sep = "\t", stringsAsFactors = FALSE, header = TRUE, diff --git a/data-raw/reproduction_of_microorganisms.R b/data-raw/reproduction_of_microorganisms.R index db8b7217c..79b182911 100644 --- a/data-raw/reproduction_of_microorganisms.R +++ b/data-raw/reproduction_of_microorganisms.R @@ -99,7 +99,7 @@ MOs <- data_total %>% # and not all fungi: Aspergillus, Candida, Trichphyton and Pneumocystis are the most important, # so only keep these orders from the fungi: & !(kingdom == "Fungi" - & !order %in% c("Eurotiales", "Saccharomycetales", "Schizosaccharomycetales", "Tremellales", "Onygenales", "Pneumocystales")) + & !order %in% c("Eurotiales", "Mucorales", "Saccharomycetales", "Schizosaccharomycetales", "Tremellales", "Onygenales", "Pneumocystales")) ) # or the genus has to be one of the genera we found in our hospitals 
last decades (Northern Netherlands, 2002-2018) | genus %in% c("Absidia", "Acremonium", "Actinotignum", "Alternaria", "Anaerosalibacter", "Ancylostoma", "Anisakis", "Apophysomyces", @@ -123,7 +123,7 @@ MOs <- MOs %>% MOs <- MOs %>% # remove text if it contains 'Not assigned' like phylum in viruses - mutate_all(~gsub("Not assigned", "", .)) + mutate_all(~gsub("(Not assigned|\\[homonym\\]|\\[mistake\\])", "", ., ignore.case = TRUE)) MOs <- MOs %>% # Only keep first author, e.g. transform 'Smith, Jones, 2011' to 'Smith et al., 2011': @@ -166,8 +166,10 @@ MOs <- MOs %>% # Remove non-ASCII characters (these are not allowed by CRAN) MOs <- MOs %>% - lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>% - as_tibble(stringsAsFactors = FALSE) + lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>% + as_tibble(stringsAsFactors = FALSE) %>% + # remove invalid characters + mutate_all(~gsub("[\"'`]+", "", .)) # Split old taxonomic names - they refer in the original data to a new `taxonID` with `acceptedNameUsageID` MOs.old <- MOs %>% @@ -219,6 +221,9 @@ MOs <- MOs %>% !(source == "DSMZ" & fullname %in% (MOs %>% filter(source == "CoL") %>% pull(fullname)))) %>% distinct(fullname, .keep_all = TRUE) +# what characters are in the fullnames? +paste(unique(sort(unlist(strsplit(x = paste(MOs$fullname, collapse = ""), split = "")))), collapse = "") + # Add abbreviations so we can easily know which ones are which ones. # These will become valid and unique microbial IDs for the AMR package. 
MOs <- MOs %>% @@ -295,7 +300,6 @@ MOs <- MOs %>% # put `mo` in front, followed by the rest select(mo, everything(), -abbr_other, -abbr_genus, -abbr_species, -abbr_subspecies) - # add non-taxonomic entries MOs <- MOs %>% bind_rows( @@ -348,6 +352,38 @@ MOs <- MOs %>% species_id = "", source = "manually added", stringsAsFactors = FALSE), + data.frame(mo = "F_YEAST", + col_id = NA_integer_, + fullname = "(unknown yeast)", + kingdom = "Fungi", + phylum = "(unknown phylum)", + class = "(unknown class)", + order = "(unknown order)", + family = "(unknown family)", + genus = "(unknown genus)", + species = "(unknown species)", + subspecies = "(unknown subspecies)", + rank = "species", + ref = NA_character_, + species_id = "", + source = "manually added", + stringsAsFactors = FALSE), + data.frame(mo = "F_FUNGUS", + col_id = NA_integer_, + fullname = "(unknown fungus)", + kingdom = "Fungi", + phylum = "(unknown phylum)", + class = "(unknown class)", + order = "(unknown order)", + family = "(unknown family)", + genus = "(unknown genus)", + species = "(unknown species)", + subspecies = "(unknown subspecies)", + rank = "species", + ref = NA_character_, + species_id = "", + source = "manually added", + stringsAsFactors = FALSE), # CoNS MOs %>% filter(genus == "Staphylococcus", species == "epidermidis") %>% .[1,] %>% @@ -488,6 +524,11 @@ MOs <- MOs %>% sum(duplicated(MOs$mo)) colnames(MOs) +# here we welcome the new ones: +MOs %>% filter(!fullname %in% AMR::microorganisms$fullname) %>% View() +# and the ones we lost: +AMR::microorganisms %>% filter(!fullname %in% MOs$fullname) %>% View() + # set prevalence per species MOs <- MOs %>% mutate(prevalence = case_when( @@ -534,12 +575,16 @@ MOs.old$col_id <- as.integer(MOs.old$col_id) MOs.old$col_id_new <- as.integer(MOs.old$col_id_new) # save +### for other server saveRDS(MOs, "microorganisms.rds") saveRDS(MOs.old, "microorganisms.old.rds") +### for same server +microorganisms <- MOs +microorganisms.old <- MOs.old # on the server, do: 
usethis::use_data(microorganisms, overwrite = TRUE, version = 2) usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2) rm(microorganisms) rm(microorganisms.old) -# and update the year in R/data.R +# and update the year and dimensions in R/data.R diff --git a/data-raw/reproduction_of_microorganisms_new.R b/data-raw/reproduction_of_microorganisms_new.R new file mode 100644 index 000000000..6897dfd78 --- /dev/null +++ b/data-raw/reproduction_of_microorganisms_new.R @@ -0,0 +1,657 @@ +# --------------------------------------------------------------------------------- +# Reproduction of the `microorganisms` data set +# --------------------------------------------------------------------------------- +# Data retrieved from: +# +# [1] Catalogue of Life (CoL) through the Encyclopaedia of Life +# https://opendata.eol.org/dataset/catalogue-of-life/ +# * Download the resource file with a name like "Catalogue of Life yyyy-mm-dd" +# * Extract "taxon.tab" +# +# [2] Global Biodiversity Information Facility (GBIF) +# https://doi.org/10.15468/39omei +# * Extract "Taxon.tsv" +# +# [3] Deutsche Sammlung von Mikroorganismen und Zellkulturen (DSMZ) +# https://www.dsmz.de/support/bacterial-nomenclature-up-to-date-downloads.html +# * Download the latest "Complete List" as xlsx file (DSMZ_bactnames.xlsx) +# --------------------------------------------------------------------------------- + +library(dplyr) +library(AMR) + +data_col <- data.table::fread("Documents/taxon.tab") +data_gbif <- data.table::fread("Documents/Taxon.tsv") + +# read the xlsx file from DSMZ (only around 2.5 MB): +data_dsmz <- readxl::read_xlsx("Downloads/DSMZ_bactnames.xlsx") + +# the CoL data is over 3.7M rows: +data_col %>% freq(kingdom) +# Item Count Percent Cum. Count Cum. 
Percent +# --- ---------- ---------- -------- ----------- ------------- +# 1 Animalia 2,225,627 59.1% 2,225,627 59.1% +# 2 Plantae 1,177,412 31.3% 3,403,039 90.4% +# 3 Fungi 290,145 7.7% 3,693,184 98.1% +# 4 Chromista 47,126 1.3% 3,740,310 99.3% +# 5 Bacteria 14,478 0.4% 3,754,788 99.7% +# 6 Protozoa 6,060 0.2% 3,760,848 99.9% +# 7 Viruses 3,827 0.1% 3,764,675 100.0% +# 8 Archaea 610 0.0% 3,765,285 100.0% + +# the GBIF data is over 5.8M rows: +data_gbif %>% freq(kingdom) +# Item Count Percent Cum. Count Cum. Percent +# --- --------------- ---------- -------- ----------- ------------- +# 1 Animalia 3,264,138 55.7% 3,264,138 55.7% +# 2 Plantae 1,814,962 31.0% 5,079,100 86.7% +# 3 Fungi 538,086 9.2% 5,617,186 95.9% +# 4 Chromista 181,374 3.1% 5,798,560 99.0% +# 5 Bacteria 24,048 0.4% 5,822,608 99.4% +# 6 Protozoa 15,138 0.3% 5,837,746 99.7% +# 7 incertae sedis 9,995 0.2% 5,847,741 99.8% +# 8 Viruses 9,630 0.2% 5,857,371 100.0% +# 9 Archaea 771 0.0% 5,858,142 100.0% + + +# Clean up helper function ------------------------------------------------ +clean_new <- function(new) { + new %>% + # only the ones that have no new ID to refer to a newer name + filter(is.na(col_id_new)) %>% + filter( + ( + # we only want all MICROorganisms and no viruses + !kingdom %in% c("Animalia", "Chromista", "Plantae", "Viruses") + # and not all fungi: Aspergillus, Candida, Trichphyton and Pneumocystis are the most important, + # so only keep these orders from the fungi: + & !(kingdom == "Fungi" + & !order %in% c("Eurotiales", "Saccharomycetales", "Schizosaccharomycetales", "Tremellales", "Onygenales", "Pneumocystales")) + ) + # or the family has to contain a genus we found in our hospitals last decades (Northern Netherlands, 2002-2018) + | genus %in% c("Absidia", "Acremonium", "Actinotignum", "Alternaria", "Anaerosalibacter", "Ancylostoma", "Anisakis", "Apophysomyces", + "Arachnia", "Ascaris", "Aureobacterium", "Aureobasidium", "Balantidum", "Bilophilia", "Branhamella", "Brochontrix", + 
"Brugia", "Calymmatobacterium", "Catabacter", "Chilomastix", "Chryseomonas", "Cladophialophora", "Cladosporium", + "Clonorchis", "Cordylobia", "Curvularia", "Demodex", "Dermatobia", "Diphyllobothrium", "Dracunculus", "Echinococcus", + "Enterobius", "Euascomycetes", "Exophiala", "Fasciola", "Fusarium", "Hendersonula", "Hymenolepis", "Kloeckera", + "Koserella", "Larva", "Leishmania", "Lelliottia", "Loa", "Lumbricus", "Malassezia", "Metagonimus", "Molonomonas", + "Mucor", "Nattrassia", "Necator", "Novospingobium", "Onchocerca", "Opistorchis", "Paragonimus", "Paramyxovirus", + "Pediculus", "Phoma", "Phthirus", "Pityrosporum", "Pseudallescheria", "Pulex", "Rhizomucor", "Rhizopus", "Rhodotorula", + "Salinococcus", "Sanguibacteroides", "Schistosoma", "Scopulariopsis", "Scytalidium", "Sporobolomyces", "Stomatococcus", + "Strongyloides", "Syncephalastraceae", "Taenia", "Torulopsis", "Trichinella", "Trichobilharzia", "Trichomonas", + "Trichosporon", "Trichuris", "Trypanosoma", "Wuchereria")) %>% + mutate( + authors2 = iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT"), + # remove leading and trailing brackets + authors2 = gsub("^[(](.*)[)]$", "\\1", authors2), + # only take part after brackets if there's a name + authors2 = ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2), + gsub(".*[)] (.*)", "\\1", authors2), + authors2), + # get year from last 4 digits + lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2)), + # can never be later than now + lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")), + NA, + lastyear), + # get authors without last year + authors = gsub("(.*)[0-9]{4}$", "\\1", authors2), + # remove nonsense characters from names + authors = gsub("[^a-zA-Z,'& -]", "", authors), + # remove trailing and leading spaces + authors = trimws(authors), + # only keep first author and replace all others by 'et al' + authors = gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors), + # et al. 
always with ending dot + authors = gsub(" et al\\.?", " et al.", authors), + authors = gsub(" ?,$", "", authors), + # don't start with 'sensu' or 'ehrenb' + authors = gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE), + # no initials, only surname + authors = gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE), + # combine author and year if year is available + ref = ifelse(!is.na(lastyear), + paste0(authors, ", ", lastyear), + authors), + # fix beginning and ending + ref = gsub(", $", "", ref), + ref = gsub("^, ", "", ref)) %>% + # remove text if it contains 'Not assigned' like phylum in viruses + mutate_all(~gsub("Not assigned", "", .)) %>% + # Remove non-ASCII characters (these are not allowed by CRAN) + lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>% + as_tibble(stringsAsFactors = FALSE) %>% + mutate(fullname = trimws(case_when(rank == "family" ~ family, + rank == "order" ~ order, + rank == "class" ~ class, + rank == "phylum" ~ phylum, + rank == "kingdom" ~ kingdom, + TRUE ~ paste(genus, species, subspecies)))) +} +clean_old <- function(old, new) { + old %>% + # only the ones that exist in the new data set + filter(col_id_new %in% new$col_id) %>% + mutate( + authors2 = iconv(ref, from = "UTF-8", to = "ASCII//TRANSLIT"), + # remove leading and trailing brackets + authors2 = gsub("^[(](.*)[)]$", "\\1", authors2), + # only take part after brackets if there's a name + authors2 = ifelse(grepl(".*[)] [a-zA-Z]+.*", authors2), + gsub(".*[)] (.*)", "\\1", authors2), + authors2), + # get year from last 4 digits + lastyear = as.integer(gsub(".*([0-9]{4})$", "\\1", authors2)), + # can never be later than now + lastyear = ifelse(lastyear > as.integer(format(Sys.Date(), "%Y")), + NA, + lastyear), + # get authors without last year + authors = gsub("(.*)[0-9]{4}$", "\\1", authors2), + # remove nonsense characters from names + authors = gsub("[^a-zA-Z,'& -]", "", authors), + # remove trailing and leading spaces + authors = trimws(authors), + # only keep 
first author and replace all others by 'et al' + authors = gsub("(,| and| et| &| ex| emend\\.?) .*", " et al.", authors), + # et al. always with ending dot + authors = gsub(" et al\\.?", " et al.", authors), + authors = gsub(" ?,$", "", authors), + # don't start with 'sensu' or 'ehrenb' + authors = gsub("^(sensu|Ehrenb.?) ", "", authors, ignore.case = TRUE), + # no initials, only surname + authors = gsub("^([A-Z]+ )+", "", authors, ignore.case = FALSE), + # combine author and year if year is available + ref = ifelse(!is.na(lastyear), + paste0(authors, ", ", lastyear), + authors), + # fix beginning and ending + ref = gsub(", $", "", ref), + ref = gsub("^, ", "", ref)) %>% + # remove text if it contains 'Not assigned' like phylum in viruses + mutate_all(~gsub("Not assigned", "", .)) %>% + # Remove non-ASCII characters (these are not allowed by CRAN) + lapply(iconv, from = "UTF-8", to = "ASCII//TRANSLIT") %>% + as_tibble(stringsAsFactors = FALSE) %>% + select(col_id_new, fullname, ref, authors2) %>% + left_join(new %>% select(col_id, fullname_new = fullname), by = c(col_id_new = "col_id")) %>% + mutate(fullname = trimws( + gsub("(.*)[(].*", "\\1", + stringr::str_replace( + string = fullname, + pattern = stringr::fixed(authors2), + replacement = "")) %>% + gsub(" (var|f|subsp)[.]", "", .))) %>% + select(-c("col_id_new", "authors2")) %>% + filter(!is.na(fullname), !is.na(fullname_new)) %>% + filter(fullname != fullname_new, !fullname %like% "^[?]") +} + +# clean CoL and GBIF ---- +# clean data_col +data_col <- data_col %>% + as_tibble() %>% + select(col_id = taxonID, + col_id_new = acceptedNameUsageID, + fullname = scientificName, + kingdom, + phylum, + class, + order, + family, + genus, + species = specificEpithet, + subspecies = infraspecificEpithet, + rank = taxonRank, + ref = scientificNameAuthorship, + species_id = furtherInformationURL) %>% + mutate(source = "CoL") +# split into old and new +data_col.new <- data_col %>% clean_new() +data_col.old <- data_col %>% 
clean_old(new = data_col.new) +rm(data_col) + +# clean data_gbif +data_gbif <- data_gbif %>% + as_tibble() %>% + filter( + # no uncertain taxonomic placements + taxonRemarks != "doubtful", + kingdom != "incertae sedis", + taxonRank != "unranked") %>% + transmute(col_id = taxonID, + col_id_new = acceptedNameUsageID, + fullname = scientificName, + kingdom, + phylum, + class, + order, + family, + genus, + species = specificEpithet, + subspecies = infraspecificEpithet, + rank = taxonRank, + ref = scientificNameAuthorship, + species_id = as.character(parentNameUsageID)) %>% + mutate(source = "GBIF") +# split into old and new +data_gbif.new <- data_gbif %>% clean_new() +data_gbif.old <- data_gbif %>% clean_old(new = data_gbif.new) +rm(data_gbif) + +# put CoL and GBIF together ---- +MOs.new <- bind_rows(data_col.new, + data_gbif.new) %>% + mutate(taxonomic_tree_length = nchar(trimws(paste(kingdom, phylum, class, order, family, genus, species, subspecies)))) %>% + arrange(desc(taxonomic_tree_length)) %>% + distinct(fullname, .keep_all = TRUE) %>% + select(-c("col_id_new", "authors2", "authors", "lastyear", "taxonomic_tree_length")) %>% + arrange(fullname) +MOs.old <- bind_rows(data_col.old, + data_gbif.old) %>% + distinct(fullname, .keep_all = TRUE) %>% + arrange(fullname) + +# clean up DSMZ --- +data_dsmz <- data_dsmz %>% + as_tibble() %>% + transmute(col_id = NA_integer_, + col_id_new = NA_integer_, + fullname = "", + # kingdom = "", + # phylum = "", + # class = "", + # order = "", + # family = "", + genus = ifelse(is.na(GENUS), "", GENUS), + species = ifelse(is.na(SPECIES), "", SPECIES), + subspecies = ifelse(is.na(SUBSPECIES), "", SUBSPECIES), + rank = ifelse(species == "", "genus", "species"), + ref = AUTHORS, + species_id = as.character(RECORD_NO), + source = "DSMZ") + +# DSMZ only contains genus/(sub)species, try to find taxonomic properties based on genus and data_col +ref_taxonomy <- MOs.new %>% + distinct(genus, .keep_all = TRUE) %>% + filter(family != "") %>% + 
filter(genus %in% data_dsmz$genus) %>% + distinct(genus, .keep_all = TRUE) %>% + select(kingdom, phylum, class, order, family, genus) + +data_dsmz <- data_dsmz %>% + left_join(ref_taxonomy, by = "genus") %>% + mutate(kingdom = "Bacteria") + +data_dsmz.new <- data_dsmz %>% + clean_new() %>% + distinct(fullname, .keep_all = TRUE) %>% + select(colnames(MOs.new)) %>% + arrange(fullname) + +# combine everything ---- +MOs <- bind_rows(MOs.new, + data_dsmz.new) %>% + distinct(fullname, .keep_all = TRUE) %>% + # not the ones that are old + filter(!fullname %in% MOs.old$fullname) %>% + arrange(fullname) %>% + mutate(col_id = ifelse(source != "CoL", NA_integer_, col_id)) %>% + filter(fullname != "") + +rm(data_col.new) +rm(data_col.old) +rm(data_gbif.new) +rm(data_gbif.old) +rm(data_dsmz) +rm(data_dsmz.new) +rm(ref_taxonomy) +rm(MOs.new) + +MOs.bak <- MOs + +# Trichomonas trick ---- +# for species in Trypanosoma and Trichomonas we observe al lot of taxonomic info missing +MOs %>% filter(genus %in% c("Trypanosoma", "Trichomonas")) %>% View() +MOs[which(MOs$genus == "Trypanosoma"), "kingdom"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$kingdom +MOs[which(MOs$genus == "Trypanosoma"), "phylum"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$phylum +MOs[which(MOs$genus == "Trypanosoma"), "class"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$class +MOs[which(MOs$genus == "Trypanosoma"), "order"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$order +MOs[which(MOs$genus == "Trypanosoma"), "family"] <- MOs[which(MOs$fullname == "Trypanosoma"),]$family +MOs[which(MOs$genus == "Trichomonas"), "kingdom"] <- MOs[which(MOs$fullname == "Trichomonas"),]$kingdom +MOs[which(MOs$genus == "Trichomonas"), "phylum"] <- MOs[which(MOs$fullname == "Trichomonas"),]$phylum +MOs[which(MOs$genus == "Trichomonas"), "class"] <- MOs[which(MOs$fullname == "Trichomonas"),]$class +MOs[which(MOs$genus == "Trichomonas"), "order"] <- MOs[which(MOs$fullname == "Trichomonas"),]$order +MOs[which(MOs$genus == 
"Trichomonas"), "family"] <- MOs[which(MOs$fullname == "Trichomonas"),]$family + +# fill taxonomic properties that are missing +MOs <- MOs %>% + mutate(phylum = ifelse(phylum %in% c(NA, ""), "(unknown phylum)", phylum), + class = ifelse(class %in% c(NA, ""), "(unknown class)", class), + order = ifelse(order %in% c(NA, ""), "(unknown order)", order), + family = ifelse(family %in% c(NA, ""), "(unknown family)", family)) + +# Abbreviations ---- +# Add abbreviations so we can easily know which ones are which ones. +# These will become valid and unique microbial IDs for the AMR package. +MOs <- MOs %>% + arrange(kingdom, fullname) %>% + group_by(kingdom) %>% + mutate(abbr_other = case_when( + rank == "family" ~ paste0("[FAM]_", + abbreviate(family, + minlength = 8, + use.classes = TRUE, + method = "both.sides", + strict = FALSE)), + rank == "order" ~ paste0("[ORD]_", + abbreviate(order, + minlength = 8, + use.classes = TRUE, + method = "both.sides", + strict = FALSE)), + rank == "class" ~ paste0("[CLS]_", + abbreviate(class, + minlength = 8, + use.classes = TRUE, + method = "both.sides", + strict = FALSE)), + rank == "phylum" ~ paste0("[PHL]_", + abbreviate(phylum, + minlength = 8, + use.classes = TRUE, + method = "both.sides", + strict = FALSE)), + rank == "kingdom" ~ paste0("[KNG]_", kingdom), + TRUE ~ NA_character_ + )) %>% + # abbreviations determined per kingdom and family + # becuase they are part of the abbreviation + mutate(abbr_genus = abbreviate(genus, + minlength = 7, + use.classes = TRUE, + method = "both.sides", + strict = FALSE)) %>% + ungroup() %>% + group_by(genus) %>% + # species abbreviations may be the same between genera + # because the genus abbreviation is part of the abbreviation + mutate(abbr_species = abbreviate(stringr::str_to_title(species), + minlength = 3, + use.classes = FALSE, + method = "both.sides")) %>% + ungroup() %>% + group_by(genus, species) %>% + mutate(abbr_subspecies = abbreviate(stringr::str_to_title(subspecies), + minlength = 
3, + use.classes = FALSE, + method = "both.sides")) %>% + ungroup() %>% + # remove trailing underscores + mutate(mo = gsub("_+$", "", + toupper(paste( + # first character: kingdom + ifelse(kingdom %in% c("Animalia", "Plantae"), + substr(kingdom, 1, 2), + substr(kingdom, 1, 1)), + # next: genus, species, subspecies + ifelse(is.na(abbr_other), + paste(abbr_genus, + abbr_species, + abbr_subspecies, + sep = "_"), + abbr_other), + sep = "_")))) %>% + mutate(mo = ifelse(duplicated(.$mo), + # these one or two must be unique too + paste0(mo, "1"), + mo), + fullname = ifelse(fullname == "", + trimws(paste(genus, species, subspecies)), + fullname)) %>% + # put `mo` in front, followed by the rest + select(mo, everything(), -abbr_other, -abbr_genus, -abbr_species, -abbr_subspecies) + +# add non-taxonomic entries +MOs <- MOs %>% + bind_rows( + # Unknowns + data.frame(mo = "UNKNOWN", + col_id = NA_integer_, + fullname = "(unknown name)", + kingdom = "(unknown kingdom)", + phylum = "(unknown phylum)", + class = "(unknown class)", + order = "(unknown order)", + family = "(unknown family)", + genus = "(unknown genus)", + species = "(unknown species)", + subspecies = "(unknown subspecies)", + rank = "(unknown rank)", + ref = NA_character_, + species_id = "", + source = "manually added", + stringsAsFactors = FALSE), + data.frame(mo = "B_GRAMN", + col_id = NA_integer_, + fullname = "(unknown Gram-negatives)", + kingdom = "Bacteria", + phylum = "(unknown phylum)", + class = "(unknown class)", + order = "(unknown order)", + family = "(unknown family)", + genus = "(unknown Gram-negatives)", + species = "(unknown species)", + subspecies = "(unknown subspecies)", + rank = "species", + ref = NA_character_, + species_id = "", + source = "manually added", + stringsAsFactors = FALSE), + data.frame(mo = "B_GRAMP", + col_id = NA_integer_, + fullname = "(unknown Gram-positives)", + kingdom = "Bacteria", + phylum = "(unknown phylum)", + class = "(unknown class)", + order = "(unknown order)", + 
family = "(unknown family)", + genus = "(unknown Gram-positives)", + species = "(unknown species)", + subspecies = "(unknown subspecies)", + rank = "species", + ref = NA_character_, + species_id = "", + source = "manually added", + stringsAsFactors = FALSE), + # CoNS + MOs %>% + filter(genus == "Staphylococcus", species == "") %>% .[1,] %>% + mutate(mo = paste(mo, "CNS", sep = "_"), + rank = "species", + col_id = NA_integer_, + species = "coagulase-negative", + fullname = "Coagulase-negative Staphylococcus (CoNS)", + ref = NA_character_, + species_id = "", + source = "manually added"), + # CoPS + MOs %>% + filter(genus == "Staphylococcus", species == "") %>% .[1,] %>% + mutate(mo = paste(mo, "CPS", sep = "_"), + rank = "species", + col_id = NA_integer_, + species = "coagulase-positive", + fullname = "Coagulase-positive Staphylococcus (CoPS)", + ref = NA_character_, + species_id = "", + source = "manually added"), + # Streptococci groups A, B, C, F, H, K + MOs %>% + filter(genus == "Streptococcus", species == "pyogenes") %>% .[1,] %>% + # we can keep all other details, since S. pyogenes is the only member of group A + mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRA", sep = "_"), + species = "group A" , + fullname = "Streptococcus group A"), + MOs %>% + filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% + # we can keep all other details, since S. 
agalactiae is the only member of group B + mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRB", sep = "_"), + species = "group B" , + fullname = "Streptococcus group B"), + MOs %>% + filter(genus == "Streptococcus", species == "dysgalactiae") %>% .[1,] %>% + mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRC", sep = "_"), + col_id = NA_integer_, + species = "group C" , + fullname = "Streptococcus group C", + ref = NA_character_, + species_id = "", + source = "manually added"), + MOs %>% + filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% + mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRD", sep = "_"), + col_id = NA_integer_, + species = "group D" , + fullname = "Streptococcus group D", + ref = NA_character_, + species_id = "", + source = "manually added"), + MOs %>% + filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% + mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRF", sep = "_"), + col_id = NA_integer_, + species = "group F" , + fullname = "Streptococcus group F", + ref = NA_character_, + species_id = "", + source = "manually added"), + MOs %>% + filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% + mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRG", sep = "_"), + col_id = NA_integer_, + species = "group G" , + fullname = "Streptococcus group G", + ref = NA_character_, + species_id = "", + source = "manually added"), + MOs %>% + filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% + mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRH", sep = "_"), + col_id = NA_integer_, + species = "group H" , + fullname = "Streptococcus group H", + ref = NA_character_, + species_id = "", + source = "manually added"), + MOs %>% + filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% + mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "GRK", sep = "_"), + col_id = NA_integer_, + 
species = "group K" , + fullname = "Streptococcus group K", + ref = NA_character_, + species_id = "", + source = "manually added"), + # Beta-haemolytic Streptococci + MOs %>% + filter(genus == "Streptococcus", species == "agalactiae") %>% .[1,] %>% + mutate(mo = paste(MOs[MOs$fullname == "Streptococcus",]$mo, "HAE", sep = "_"), + col_id = NA_integer_, + species = "beta-haemolytic" , + fullname = "Beta-haemolytic Streptococcus", + ref = NA_character_, + species_id = "", + source = "manually added") + ) + + +# everything distinct? +sum(duplicated(MOs$mo)) +colnames(MOs) + +# set prevalence per species +MOs <- MOs %>% + mutate(prevalence = case_when( + class == "Gammaproteobacteria" + | genus %in% c("Enterococcus", "Staphylococcus", "Streptococcus") + | mo %in% c("UNKNOWN", "B_GRAMN", "B_GRAMP") + ~ 1, + phylum %in% c("Proteobacteria", + "Firmicutes", + "Actinobacteria", + "Sarcomastigophora") + | genus %in% c("Aspergillus", + "Bacteroides", + "Candida", + "Capnocytophaga", + "Chryseobacterium", + "Cryptococcus", + "Elisabethkingia", + "Flavobacterium", + "Fusobacterium", + "Giardia", + "Leptotrichia", + "Mycoplasma", + "Prevotella", + "Rhodotorula", + "Treponema", + "Trichophyton", + "Trichomonas", + "Ureaplasma") + | rank %in% c("kingdom", "phylum", "class", "order", "family") + ~ 2, + TRUE ~ 3 + )) + +# arrange +MOs <- MOs %>% arrange(fullname) + +# transform +MOs <- as.data.frame(MOs, stringsAsFactors = FALSE) +MOs.old <- as.data.frame(MOs.old, stringsAsFactors = FALSE) +class(MOs$mo) <- "mo" +MOs$col_id <- as.integer(MOs$col_id) + +# get differences in MO codes between this data and the package version +MO_diff <- AMR::microorganisms %>% + mutate(pastedtext = paste(mo, fullname)) %>% + filter(!pastedtext %in% (MOs %>% mutate(pastedtext = paste(mo, fullname)) %>% pull(pastedtext))) %>% + select(mo_old = mo, fullname, pastedtext) %>% + left_join(MOs %>% + transmute(mo_new = mo, fullname_new = fullname, pastedtext = paste(mo, fullname)), "pastedtext") %>% + 
select(mo_old, mo_new, fullname_new) + +mo_diff2 <- AMR::microorganisms %>% + select(mo, fullname) %>% + left_join(MOs %>% + select(mo, fullname), + by = "fullname", + suffix = c("_old", "_new")) %>% + filter(mo_old != mo_new, + #!mo_new %in% mo_old, + !mo_old %like% "\\[") + +mo_diff3 <- tibble(previous_old = names(AMR:::make_trans_tbl()), + previous_new = AMR:::make_trans_tbl()) %>% + left_join(AMR::microorganisms %>% select(mo, fullname), by = c(previous_new = "mo")) %>% + left_join(MOs %>% select(mo_new = mo, fullname), by = "fullname") + +# what did we win most? +MOs %>% filter(!fullname %in% AMR::microorganisms$fullname) %>% freq(genus) +# what did we lose most? +AMR::microorganisms %>% + filter(kingdom != "Chromista" & !fullname %in% MOs$fullname & !fullname %in% MOs.old$fullname) %>% + freq(genus) + + +# save +saveRDS(MOs, "microorganisms.rds") +saveRDS(MOs.old, "microorganisms.old.rds") + +# on the server, do: +usethis::use_data(microorganisms, overwrite = TRUE, version = 2) +usethis::use_data(microorganisms.old, overwrite = TRUE, version = 2) +rm(microorganisms) +rm(microorganisms.old) +# and update the year in R/data.R diff --git a/data/microorganisms.codes.rda b/data/microorganisms.codes.rda index 1193b223b..bcfa993a5 100644 Binary files a/data/microorganisms.codes.rda and b/data/microorganisms.codes.rda differ diff --git a/data/microorganisms.old.rda b/data/microorganisms.old.rda index 5d42f9d3e..674279f61 100644 Binary files a/data/microorganisms.old.rda and b/data/microorganisms.old.rda differ diff --git a/data/microorganisms.rda b/data/microorganisms.rda index 830db9382..b5a2d375e 100755 Binary files a/data/microorganisms.rda and b/data/microorganisms.rda differ diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index cf0c354f1..6b0af1f45 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9008 + 0.7.1.9023 @@ -156,13 +156,6 @@ Get properties of an antibiotic -
  • - - - - Create frequency tables - -
  • diff --git a/docs/articles/AMR.html b/docs/articles/AMR.html index a4c0afd16..7a45e3040 100644 --- a/docs/articles/AMR.html +++ b/docs/articles/AMR.html @@ -40,7 +40,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9026 @@ -118,13 +118,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    How to conduct AMR analysis

    Matthijs S. Berends

    -

    01 July 2019

    +

    06 August 2019

    @@ -201,7 +194,7 @@ -

    Note: values on this page will change with every website update since they are based on randomly created values and the page was written in R Markdown. However, the methodology remains unchanged. This page was generated on 01 July 2019.

    +

    Note: values on this page will change with every website update since they are based on randomly created values and the page was written in R Markdown. However, the methodology remains unchanged. This page was generated on 06 August 2019.

    Introduction

    @@ -217,21 +210,21 @@ -2019-07-01 +2019-08-06 abcd Escherichia coli S S -2019-07-01 +2019-08-06 abcd Escherichia coli S R -2019-07-01 +2019-08-06 efgh Escherichia coli R @@ -244,12 +237,12 @@ Needed R packages

    As with many uses in R, we need some additional packages for AMR analysis. Our package works closely together with the tidyverse packages dplyr and ggplot2 by Dr Hadley Wickham. The tidyverse tremendously improves the way we conduct data science - it allows for a very natural way of writing syntaxes and creating beautiful plots in R.

    Our AMR package depends on these packages and even extends their use and functions.

    - +
    @@ -261,58 +254,58 @@

    Patients

    To start with patients, we need a unique list of patients.

    -
    patients <- unlist(lapply(LETTERS, paste0, 1:10))
    +
    patients <- unlist(lapply(LETTERS, paste0, 1:10))

    The LETTERS object is available in R - it’s a vector with 26 characters: A to Z. The patients object we just created is now a vector of length 260, with values (patient IDs) varying from A1 to Z10. Now we also set the gender of our patients, by putting the ID and the gender in a table:

    -
    patients_table <- data.frame(patient_id = patients,
    -                             gender = c(rep("M", 135),
    -                                        rep("F", 125)))
    +
    patients_table <- data.frame(patient_id = patients,
    +                             gender = c(rep("M", 135),
    +                                        rep("F", 125)))

    The first 135 patient IDs are now male, the other 125 are female.

    Dates

    Let’s pretend that our data consists of blood culture isolates from between 1 January 2010 and 1 January 2018.

    -
    dates <- seq(as.Date("2010-01-01"), as.Date("2018-01-01"), by = "day")
    +
    dates <- seq(as.Date("2010-01-01"), as.Date("2018-01-01"), by = "day")

    This dates object now contains all days in our date range.

    Microorganisms

    For this tutorial, we will use four different microorganisms: Escherichia coli, Staphylococcus aureus, Streptococcus pneumoniae, and Klebsiella pneumoniae:

    -
    bacteria <- c("Escherichia coli", "Staphylococcus aureus",
    -              "Streptococcus pneumoniae", "Klebsiella pneumoniae")
    +
    bacteria <- c("Escherichia coli", "Staphylococcus aureus",
    +              "Streptococcus pneumoniae", "Klebsiella pneumoniae")

    Other variables

    For completeness, we can also add the hospital where the patient was admitted, and we need to define valid antimicrobial results for our randomisation:

    -
    hospitals <- c("Hospital A", "Hospital B", "Hospital C", "Hospital D")
    -ab_interpretations <- c("S", "I", "R")
    +
    hospitals <- c("Hospital A", "Hospital B", "Hospital C", "Hospital D")
    +ab_interpretations <- c("S", "I", "R")

    Put everything together

    Using the sample() function, we can randomly select items from all objects we defined earlier. To let our fake data reflect reality a bit, we will also approximately define the probabilities of bacteria and the antibiotic results with the prob parameter.

    -
    sample_size <- 20000
    -data <- data.frame(date = sample(dates, size = sample_size, replace = TRUE),
    -                   patient_id = sample(patients, size = sample_size, replace = TRUE),
    -                   hospital = sample(hospitals, size = sample_size, replace = TRUE,
    -                                     prob = c(0.30, 0.35, 0.15, 0.20)),
    -                   bacteria = sample(bacteria, size = sample_size, replace = TRUE,
    -                                     prob = c(0.50, 0.25, 0.15, 0.10)),
    -                   AMX = sample(ab_interpretations, size = sample_size, replace = TRUE,
    -                                 prob = c(0.60, 0.05, 0.35)),
    -                   AMC = sample(ab_interpretations, size = sample_size, replace = TRUE,
    -                                 prob = c(0.75, 0.10, 0.15)),
    -                   CIP = sample(ab_interpretations, size = sample_size, replace = TRUE,
    -                                 prob = c(0.80, 0.00, 0.20)),
    -                   GEN = sample(ab_interpretations, size = sample_size, replace = TRUE,
    -                                 prob = c(0.92, 0.00, 0.08))
    -                   )
    +
    sample_size <- 20000
    +data <- data.frame(date = sample(dates, size = sample_size, replace = TRUE),
    +                   patient_id = sample(patients, size = sample_size, replace = TRUE),
    +                   hospital = sample(hospitals, size = sample_size, replace = TRUE,
    +                                     prob = c(0.30, 0.35, 0.15, 0.20)),
    +                   bacteria = sample(bacteria, size = sample_size, replace = TRUE,
    +                                     prob = c(0.50, 0.25, 0.15, 0.10)),
    +                   AMX = sample(ab_interpretations, size = sample_size, replace = TRUE,
    +                                 prob = c(0.60, 0.05, 0.35)),
    +                   AMC = sample(ab_interpretations, size = sample_size, replace = TRUE,
    +                                 prob = c(0.75, 0.10, 0.15)),
    +                   CIP = sample(ab_interpretations, size = sample_size, replace = TRUE,
    +                                 prob = c(0.80, 0.00, 0.20)),
    +                   GEN = sample(ab_interpretations, size = sample_size, replace = TRUE,
    +                                 prob = c(0.92, 0.00, 0.08))
    +                   )

    Using the left_join() function from the dplyr package, we can ‘map’ the gender to the patient ID using the patients_table object we created earlier:

    - +

    The resulting data set contains 20,000 blood culture isolates. With the head() function we can preview the first 6 rows of this data set:

    -
    head(data)
    +
    head(data)
    @@ -327,69 +320,69 @@ - - - + + + - + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - - - - - - - - - - - - + + - - - - - - - - - - - + + - - - - - - - - - - - - - @@ -400,9 +393,11 @@

    Cleaning the data

    -

    Use the frequency table function freq() to look specifically for unique values in any variable. For example, for the gender variable:

    -
    data %>% freq(gender) # this would be the same: freq(data$gender)
    -
    # Frequency table of `gender` from `data` (20,000 x 9) 
    +

    We also created a package dedicated to data cleaning and checking, called the clean package. It gets automatically installed with the AMR package, so we only have to load it:

    +
    library(clean)
    +

    Use the frequency table function freq() from this clean package to look specifically for unique values in any variable. For example, for the gender variable:

    +
    data %>% freq(gender) # this would be the same: freq(data$gender)
    +
    # Frequency table 
     # 
     # Class:   factor (numeric)
     # Length:  20,000 (of which NA: 0 = 0.00%)
    @@ -411,82 +406,82 @@
     # 
     #      Item     Count   Percent   Cum. Count   Cum. Percent
     # ---  -----  -------  --------  -----------  -------------
    -# 1    M       10,408     52.0%       10,408          52.0%
    -# 2    F        9,592     48.0%       20,000         100.0%
    +# 1 M 10,366 51.8% 10,366 51.8% +# 2 F 9,634 48.2% 20,000 100.0%

    So, we can draw at least two conclusions immediately. From a data scientist’s perspective, the data looks clean: only values M and F. From a researcher’s perspective: there are slightly more men. Nothing we didn’t already know.

    The data is already quite clean, but we still need to transform some variables. The bacteria column now consists of text, and we want to add more variables based on microbial IDs later on. So, we will transform this column to valid IDs. The mutate() function of the dplyr package makes this really easy:

    -
    data <- data %>%
    -  mutate(bacteria = as.mo(bacteria))
    +
    data <- data %>%
    +  mutate(bacteria = as.mo(bacteria))

    We also want to transform the antibiotics, because in real-life data we don’t know if they are really clean. The as.rsi() function ensures reliability and reproducibility in these kinds of variables. The mutate_at() function will run the as.rsi() function on the defined variables:

    -
    data <- data %>%
    -  mutate_at(vars(AMX:GEN), as.rsi)
    +
    data <- data %>%
    +  mutate_at(vars(AMX:GEN), as.rsi)

    Finally, we will apply EUCAST rules on our antimicrobial results. In Europe, most medical microbiological laboratories already apply these rules. Our package features their latest insights on intrinsic resistance and exceptional phenotypes. Moreover, the eucast_rules() function can also apply additional rules, like forcing ampicillin = R when amoxicillin/clavulanic acid = R.

    Because the amoxicillin (column AMX) and amoxicillin/clavulanic acid (column AMC) in our data were generated randomly, some rows will undoubtedly contain AMX = S and AMC = R, which is technically impossible. The eucast_rules() function fixes this:

    -
    data <- eucast_rules(data, col_mo = "bacteria")
    -# 
    -# Rules by the European Committee on Antimicrobial Susceptibility Testing (EUCAST)
    -# http://eucast.org/
    -# 
    -# EUCAST Clinical Breakpoints (v9.0, 2019)
    -# Aerococcus sanguinicola (no new changes)
    -# Aerococcus urinae (no new changes)
    -# Anaerobic Gram-negatives (no new changes)
    -# Anaerobic Gram-positives (no new changes)
    -# Campylobacter coli (no new changes)
    -# Campylobacter jejuni (no new changes)
    -# Enterobacteriales (Order) (no new changes)
    -# Enterococcus (no new changes)
    -# Haemophilus influenzae (no new changes)
    -# Kingella kingae (no new changes)
    -# Moraxella catarrhalis (no new changes)
    -# Pasteurella multocida (no new changes)
    -# Staphylococcus (no new changes)
    -# Streptococcus groups A, B, C, G (no new changes)
    -# Streptococcus pneumoniae (1,443 new changes)
    -# Viridans group streptococci (no new changes)
    -# 
    -# EUCAST Expert Rules, Intrinsic Resistance and Exceptional Phenotypes (v3.1, 2016)
    -# Table 01: Intrinsic resistance in Enterobacteriaceae (1,332 new changes)
    -# Table 02: Intrinsic resistance in non-fermentative Gram-negative bacteria (no new changes)
    -# Table 03: Intrinsic resistance in other Gram-negative bacteria (no new changes)
    -# Table 04: Intrinsic resistance in Gram-positive bacteria (2,723 new changes)
    -# Table 08: Interpretive rules for B-lactam agents and Gram-positive cocci (no new changes)
    -# Table 09: Interpretive rules for B-lactam agents and Gram-negative rods (no new changes)
    -# Table 11: Interpretive rules for macrolides, lincosamides, and streptogramins (no new changes)
    -# Table 12: Interpretive rules for aminoglycosides (no new changes)
    -# Table 13: Interpretive rules for quinolones (no new changes)
    -# 
    -# Other rules
    -# Non-EUCAST: amoxicillin/clav acid = S where ampicillin = S (2,213 new changes)
    -# Non-EUCAST: ampicillin = R where amoxicillin/clav acid = R (127 new changes)
    -# Non-EUCAST: piperacillin = R where piperacillin/tazobactam = R (no new changes)
    -# Non-EUCAST: piperacillin/tazobactam = S where piperacillin = S (no new changes)
    -# Non-EUCAST: trimethoprim = R where trimethoprim/sulfa = R (no new changes)
    -# Non-EUCAST: trimethoprim/sulfa = S where trimethoprim = S (no new changes)
    -# 
    -# --------------------------------------------------------------------------
    -# EUCAST rules affected 6,513 out of 20,000 rows, making a total of 7,838 edits
    -# => added 0 test results
    -# 
    -# => changed 7,838 test results
    -#    - 115 test results changed from S to I
    -#    - 4,719 test results changed from S to R
    -#    - 1,077 test results changed from I to S
    -#    - 335 test results changed from I to R
    -#    - 1,573 test results changed from R to S
    -#    - 19 test results changed from R to I
    -# --------------------------------------------------------------------------
    -# 
    -# Use verbose = TRUE to get a data.frame with all specified edits instead.
    +
    data <- eucast_rules(data, col_mo = "bacteria")
    +# 
    +# Rules by the European Committee on Antimicrobial Susceptibility Testing (EUCAST)
    +# http://eucast.org/
    +# 
    +# EUCAST Clinical Breakpoints (v9.0, 2019)
    +# Aerococcus sanguinicola (no new changes)
    +# Aerococcus urinae (no new changes)
    +# Anaerobic Gram-negatives (no new changes)
    +# Anaerobic Gram-positives (no new changes)
    +# Campylobacter coli (no new changes)
    +# Campylobacter jejuni (no new changes)
    +# Enterobacteriales (Order) (no new changes)
    +# Enterococcus (no new changes)
    +# Haemophilus influenzae (no new changes)
    +# Kingella kingae (no new changes)
    +# Moraxella catarrhalis (no new changes)
    +# Pasteurella multocida (no new changes)
    +# Staphylococcus (no new changes)
    +# Streptococcus groups A, B, C, G (no new changes)
    +# Streptococcus pneumoniae (1,439 new changes)
    +# Viridans group streptococci (no new changes)
    +# 
    +# EUCAST Expert Rules, Intrinsic Resistance and Exceptional Phenotypes (v3.1, 2016)
    +# Table 01: Intrinsic resistance in Enterobacteriaceae (1,329 new changes)
    +# Table 02: Intrinsic resistance in non-fermentative Gram-negative bacteria (no new changes)
    +# Table 03: Intrinsic resistance in other Gram-negative bacteria (no new changes)
    +# Table 04: Intrinsic resistance in Gram-positive bacteria (2,679 new changes)
    +# Table 08: Interpretive rules for B-lactam agents and Gram-positive cocci (no new changes)
    +# Table 09: Interpretive rules for B-lactam agents and Gram-negative rods (no new changes)
    +# Table 11: Interpretive rules for macrolides, lincosamides, and streptogramins (no new changes)
    +# Table 12: Interpretive rules for aminoglycosides (no new changes)
    +# Table 13: Interpretive rules for quinolones (no new changes)
    +# 
    +# Other rules
    +# Non-EUCAST: amoxicillin/clav acid = S where ampicillin = S (2,347 new changes)
    +# Non-EUCAST: ampicillin = R where amoxicillin/clav acid = R (114 new changes)
    +# Non-EUCAST: piperacillin = R where piperacillin/tazobactam = R (no new changes)
    +# Non-EUCAST: piperacillin/tazobactam = S where piperacillin = S (no new changes)
    +# Non-EUCAST: trimethoprim = R where trimethoprim/sulfa = R (no new changes)
    +# Non-EUCAST: trimethoprim/sulfa = S where trimethoprim = S (no new changes)
    +# 
    +# --------------------------------------------------------------------------
    +# EUCAST rules affected 6,589 out of 20,000 rows, making a total of 7,908 edits
    +# => added 0 test results
    +# 
    +# => changed 7,908 test results
    +#    - 110 test results changed from S to I
    +#    - 4,680 test results changed from S to R
    +#    - 1,068 test results changed from I to S
    +#    - 326 test results changed from I to R
    +#    - 1,711 test results changed from R to S
    +#    - 13 test results changed from R to I
    +# --------------------------------------------------------------------------
    +# 
    +# Use verbose = TRUE (on your original data) to get a data.frame with all specified edits instead.

    Adding new variables

    Now that we have the microbial ID, we can add some taxonomic properties:

    -
    data <- data %>% 
    -  mutate(gramstain = mo_gramstain(bacteria),
    -         genus = mo_genus(bacteria),
    -         species = mo_species(bacteria))
    +
    data <- data %>% 
    +  mutate(gramstain = mo_gramstain(bacteria),
    +         genus = mo_genus(bacteria),
    +         species = mo_species(bacteria))

    First isolates

    @@ -497,23 +492,23 @@

    (…) When preparing a cumulative antibiogram to guide clinical decisions about empirical antimicrobial therapy of initial infections, only the first isolate of a given species per patient, per analysis period (eg, one year) should be included, irrespective of body site, antimicrobial susceptibility profile, or other phenotypical characteristics (eg, biotype). The first isolate is easily identified, and cumulative antimicrobial susceptibility test data prepared using the first isolate are generally comparable to cumulative antimicrobial susceptibility test data calculated by other methods, providing duplicate isolates are excluded.
    M39-A4 Analysis and Presentation of Cumulative Antimicrobial Susceptibility Test Data, 4th Edition. CLSI, 2014. Chapter 6.4

    This AMR package includes this methodology with the first_isolate() function. It adopts the episode of a year (can be changed by user) and it starts counting days after every selected isolate. This new variable can easily be added to our data:

    - -

    So only 28.6% is suitable for resistance analysis! We can now filter on it with the filter() function, also from the dplyr package:

    - + +

    So only is suitable for resistance analysis! We can now filter on it with the filter() function, also from the dplyr package:

    +

    For future use, the above two syntaxes can be shortened with the filter_first_isolate() function:

    - +

    First weighted isolates

    -

    We made a slight twist to the CLSI algorithm, to take into account the antimicrobial susceptibility profile. Have a look at all isolates of patient S7, sorted on date:

    +

    We made a slight twist to the CLSI algorithm, to take into account the antimicrobial susceptibility profile. Have a look at all isolates of patient C10, sorted on date:

    date
    2011-09-06Z5Hospital B2017-03-10Q5Hospital C Escherichia coliRS S S S F
    2015-03-21E72011-12-26G6Hospital CStreptococcus pneumoniaeSSSSM
    2011-07-01L6 Hospital C Escherichia coliSSSSM
    2015-08-22A9Hospital BEscherichia coli RSSSM
    2017-03-16H1Hospital AStaphylococcus aureusS I S S M
    2010-08-11X6Hospital CEscherichia coliSSRSF
    2012-06-16E102017-06-20V2 Hospital DStaphylococcus aureusRSSSM
    2016-12-29J3Hospital C Escherichia coliIS R SSSM
    2010-04-09Q3Hospital BStreptococcus pneumoniaeRSSS F
    @@ -529,32 +524,32 @@ - - + + - - + + - - + + - - - + + + - - + + - + @@ -562,10 +557,10 @@ - - + + - + @@ -573,65 +568,65 @@ - - + + + + + + + + + + + + + - - - - - - - - - - - - - + + - + - - + + - + - - + + - - + + - + - - + + - + @@ -641,16 +636,16 @@
    isolate
    12010-01-28S72010-07-24C10 B_ESCHR_COLRISS S S TRUE
    22010-02-07S72010-12-03C10 B_ESCHR_COLSSS RSSS FALSE
    32010-03-16S72010-12-19C10 B_ESCHR_COLRS S S S
    42010-10-09S72011-02-18C10 B_ESCHR_COLSR S S S
    52011-01-25S72011-06-26C10 B_ESCHR_COLSS RRFALSE
    62011-06-28C10B_ESCHR_COLS S S S FALSE
    62011-02-16S7B_ESCHR_COLSSSSTRUE
    72011-02-24S72011-06-30C10 B_ESCHR_COL S SSR S FALSE
    82011-03-30S72011-08-29C10 B_ESCHR_COLRS S RSFALSERTRUE
    92011-04-25S72011-10-03C10 B_ESCHR_COL S S SRS FALSE
    102011-05-06S72011-12-30C10 B_ESCHR_COLSR S S S

    Only 2 isolates are marked as ‘first’ according to the CLSI guideline. But when reviewing the antibiogram, it is obvious that some isolates are absolutely different strains and should be included too. This is why we weigh isolates, based on their antibiogram. The key_antibiotics() function adds a vector with 18 key antibiotics: 6 broad spectrum ones, 6 small spectrum for Gram negatives and 6 small spectrum for Gram positives. These can be defined by the user.

    If a column exists with a name like ‘key(…)ab’ the first_isolate() function will automatically use it and determine the first weighted isolates. Mind the NOTEs in the output below:

    - + @@ -667,11 +662,11 @@ - - + + - - + + @@ -679,22 +674,22 @@ - - + + - - - + + + - - + + - + @@ -703,10 +698,10 @@ - - + + - + @@ -715,70 +710,70 @@ - - + + + + + - - - - - + + - + - - + + - + - + - - + + - + - - + + - - + + - + - - + + - + @@ -787,18 +782,19 @@
    isolate
    12010-01-28S72010-07-24C10 B_ESCHR_COLRISS S S TRUE
    22010-02-07S72010-12-03C10 B_ESCHR_COLSSS RSSS FALSE TRUE
    32010-03-16S72010-12-19C10 B_ESCHR_COLRS S S S
    42010-10-09S72011-02-18C10 B_ESCHR_COLSR S S S
    52011-01-25S72011-06-26C10 B_ESCHR_COLSSR RSSS FALSE TRUE
    62011-02-16S72011-06-28C10 B_ESCHR_COL S S S STRUEFALSE TRUE
    72011-02-24S72011-06-30C10 B_ESCHR_COL S SSR S FALSEFALSETRUE
    82011-03-30S72011-08-29C10 B_ESCHR_COLRS S RSFALSERTRUE TRUE
    92011-04-25S72011-10-03C10 B_ESCHR_COL S S SRS FALSE TRUE
    102011-05-06S72011-12-30C10 B_ESCHR_COLSR S S S
    -

    Instead of 2, now 9 isolates are flagged. In total, 75.5% of all isolates are marked ‘first weighted’ - 46.9% more than when using the CLSI guideline. In real life, this novel algorithm will yield 5-10% more isolates than the classic CLSI guideline.

    +

    Instead of 2, now 10 isolates are flagged. In total, of all isolates are marked ‘first weighted’ - more than when using the CLSI guideline. In real life, this novel algorithm will yield 5-10% more isolates than the classic CLSI guideline.

    As with filter_first_isolate(), there’s a shortcut for this new algorithm too:

    - -

    So we end up with 15,097 isolates for analysis.

    + +

    So we end up with 15,027 isolates for analysis.

    We can remove unneeded columns:

    - +

    Now our data looks like:

    -
    head(data_1st)
    +
    head(data_1st)
    + @@ -815,11 +811,12 @@ - - - + + + + - + @@ -830,56 +827,28 @@ - - + + + - - - + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + + - + @@ -890,15 +859,48 @@ - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -916,14 +918,14 @@

    Dispersion of species

    -

    To just get an idea how the species are distributed, create a frequency table with our freq() function. We created the genus and species column earlier based on the microbial ID. With paste(), we can concatenate them together.

    -

    The freq() function can be used like the base R language was intended:

    -
    freq(paste(data_1st$genus, data_1st$species))
    +

    To just get an idea how the species are distributed, create a frequency table with our freq() function. We created the genus and species column earlier based on the microbial ID. With paste(), we can concatenate them together.

    +

    The freq() function can be used like the base R language was intended:

    +
    freq(paste(data_1st$genus, data_1st$species))

    Or it can be used the dplyr way, which is easier to read:

    -
    data_1st %>% freq(genus, species)
    -

    Frequency table of genus and species from data_1st (15,097 x 13)

    -

    Columns: 2
    -Length: 15,097 (of which NA: 0 = 0.00%)
    +

    data_1st %>% freq(genus, species)
    +

    Frequency table

    +

    Class: character
    +Length: 15,027 (of which NA: 0 = 0.00%)
    Unique: 4

    Shortest: 16
    Longest: 24

    @@ -940,33 +942,33 @@ Longest: 24

    - + - + - - - - + + + + - - - - + + + + - - - + + + @@ -976,12 +978,12 @@ Longest: 24

    Resistance percentages

    The functions portion_S(), portion_SI(), portion_I(), portion_IR() and portion_R() can be used to determine the portion of a specific antimicrobial outcome. As per the EUCAST guideline of 2019, we calculate resistance as the portion of R (portion_R()) and susceptibility as the portion of S and I (portion_SI()). These functions can be used on their own:

    - +

    Or they can be used in conjunction with group_by() and summarise(), both from the dplyr package:

    -
    data_1st %>% 
    -  group_by(hospital) %>% 
    -  summarise(amoxicillin = portion_R(AMX))
    +
    data_1st %>% 
    +  group_by(hospital) %>% 
    +  summarise(amoxicillin = portion_R(AMX))
    date patient_id hospital
    2011-09-06Z5Hospital B12017-03-10Q5Hospital C B_ESCHR_COLRS S S STRUE
    2015-03-21E722011-12-26G6 Hospital CB_ESCHR_COLRIB_STRPT_PNE S SMGram-negativeEscherichiacoliTRUE
    2010-08-11X6Hospital CB_ESCHR_COLS S RSFGram-negativeEscherichiacoliTRUE
    2012-06-16E10Hospital DB_STPHY_AURRSSS M Gram-positiveStaphylococcusaureusStreptococcuspneumoniae TRUE
    2016-12-29J332011-07-01L6 Hospital C B_ESCHR_COLRS S S STRUE
    2010-04-09Q3Hospital BB_STRPT_PNERR62017-06-20V2Hospital DB_ESCHR_COLI S RS FGram-negativeEscherichiacoliTRUE
    72012-02-10D1Hospital AB_STRPT_PNESSRRMGram-positiveStreptococcuspneumoniaeTRUE
    82010-05-01E1Hospital AB_STRPT_PNESSSRM Gram-positive Streptococcus pneumoniae
    1 Escherichia coli7,4837,456 49.6%7,4837,456 49.6%
    2 Staphylococcus aureus3,67324.3%11,15673.9%3,70924.7%11,16574.3%
    3 Streptococcus pneumoniae2,30615.3%13,46289.2%2,26715.1%13,43289.4%
    4 Klebsiella pneumoniae1,63510.8%15,0971,59510.6%15,027 100.0%
    @@ -990,27 +992,27 @@ Longest: 24

    - + - + - + - +
    hospital
    Hospital A0.48104060.4574111
    Hospital B0.47142590.4789054
    Hospital C0.46251130.4639895
    Hospital D0.47532470.4705098

    Of course it would be very convenient to know the number of isolates responsible for the percentages. For that purpose the n_rsi() function can be used, which works exactly like n_distinct() from the dplyr package. It counts all isolates available for every group (i.e. values S, I or R):

    -
    data_1st %>% 
    -  group_by(hospital) %>% 
    -  summarise(amoxicillin = portion_R(AMX),
    -            available = n_rsi(AMX))
    +
    data_1st %>% 
    +  group_by(hospital) %>% 
    +  summarise(amoxicillin = portion_R(AMX),
    +            available = n_rsi(AMX))
    @@ -1020,32 +1022,32 @@ Longest: 24

    - - + + - - + + - - + + - - + +
    hospital
    Hospital A0.481040645360.45741114473
    Hospital B0.471425952670.47890545262
    Hospital C0.462511322140.46398952291
    Hospital D0.475324730800.47050983001

    These functions can also be used to get the portion of multiple antibiotics, to calculate empiric susceptibility of combination therapies very easily:

    - + @@ -1056,94 +1058,94 @@ Longest: 24

    - - - + + + - - - + + + - - - + + + - + - +
    genus
    Escherichia0.92569820.89295740.99518910.92449030.89645920.9936964
    Klebsiella0.82140670.90336390.98654430.82445140.90595610.9811912
    Staphylococcus0.92295130.92240680.99373810.92666490.92019410.9935293
    Streptococcus0.61535130.6060873 0.00000000.61535130.6060873

    To make a transition to the next part, let’s see how this difference could be plotted:

    -
    data_1st %>% 
    -  group_by(genus) %>% 
    -  summarise("1. Amoxi/clav" = portion_SI(AMC),
    -            "2. Gentamicin" = portion_SI(GEN),
    -            "3. Amoxi/clav + genta" = portion_SI(AMC, GEN)) %>% 
    -  tidyr::gather("antibiotic", "S", -genus) %>%
    -  ggplot(aes(x = genus,
    -             y = S,
    -             fill = antibiotic)) +
    -  geom_col(position = "dodge2")
    +
    data_1st %>% 
    +  group_by(genus) %>% 
    +  summarise("1. Amoxi/clav" = portion_SI(AMC),
    +            "2. Gentamicin" = portion_SI(GEN),
    +            "3. Amoxi/clav + genta" = portion_SI(AMC, GEN)) %>% 
    +  tidyr::gather("antibiotic", "S", -genus) %>%
    +  ggplot(aes(x = genus,
    +             y = S,
    +             fill = antibiotic)) +
    +  geom_col(position = "dodge2")

    Plots

    To show results in plots, most R users would nowadays use the ggplot2 package. This package lets you create plots in layers. You can read more about it on their website. A quick example would look like these syntaxes:

    -
    ggplot(data = a_data_set,
    -       mapping = aes(x = year,
    -                     y = value)) +
    -  geom_col() +
    -  labs(title = "A title",
    -       subtitle = "A subtitle",
    -       x = "My X axis",
    -       y = "My Y axis")
    -
    -# or as short as:
    -ggplot(a_data_set) +
    -  geom_bar(aes(year))
    +
    ggplot(data = a_data_set,
    +       mapping = aes(x = year,
    +                     y = value)) +
    +  geom_col() +
    +  labs(title = "A title",
    +       subtitle = "A subtitle",
    +       x = "My X axis",
    +       y = "My Y axis")
    +
    +# or as short as:
    +ggplot(a_data_set) +
    +  geom_bar(aes(year))

    The AMR package contains functions to extend this ggplot2 package, for example geom_rsi(). It automatically transforms data with count_df() or portion_df() and shows results in stacked bars. Its simplest and shortest example:

    -
    ggplot(data_1st) +
    -  geom_rsi(translate_ab = FALSE)
    +
    ggplot(data_1st) +
    +  geom_rsi(translate_ab = FALSE)

    Omit the translate_ab = FALSE to have the antibiotic codes (AMX, AMC, CIP, GEN) translated to official WHO names (amoxicillin, amoxicillin/clavulanic acid, ciprofloxacin, gentamicin).

    If we group on e.g. the genus column and add some additional functions from our package, we can create this:

    - +

    To simplify this, we also created the ggplot_rsi() function, which combines almost all above functions:

    - +

    @@ -1151,33 +1153,33 @@ Longest: 24

    Independence test

    The next example uses the included septic_patients, which is an anonymised data set containing 2,000 microbial blood culture isolates with their full antibiograms found in septic patients in 4 different hospitals in the Netherlands, between 2001 and 2017. It is true, genuine data. This data.frame can be used to practice AMR analysis.

    We will compare the resistance to fosfomycin (column FOS) in hospital A and D. The input for the fisher.test() can be retrieved with a transformation like this:

    -
    check_FOS <- septic_patients %>%
    -  filter(hospital_id %in% c("A", "D")) %>% # filter on only hospitals A and D
    -  select(hospital_id, FOS) %>%             # select the hospitals and fosfomycin
    -  group_by(hospital_id) %>%                # group on the hospitals
    -  count_df(combine_SI = TRUE) %>%          # count all isolates per group (hospital_id)
    -  tidyr::spread(hospital_id, value) %>%    # transform output so A and D are columns
    -  select(A, D) %>%                         # and select these only
    -  as.matrix()                              # transform to good old matrix for fisher.test()
    -
    -check_FOS
    -#       A  D
    -# [1,] 25 77
    -# [2,] 24 33
    +
    check_FOS <- septic_patients %>%
    +  filter(hospital_id %in% c("A", "D")) %>% # filter on only hospitals A and D
    +  select(hospital_id, FOS) %>%             # select the hospitals and fosfomycin
    +  group_by(hospital_id) %>%                # group on the hospitals
    +  count_df(combine_SI = TRUE) %>%          # count all isolates per group (hospital_id)
    +  tidyr::spread(hospital_id, value) %>%    # transform output so A and D are columns
    +  select(A, D) %>%                         # and select these only
    +  as.matrix()                              # transform to good old matrix for fisher.test()
    +
    +check_FOS
    +#       A  D
    +# [1,] 25 77
    +# [2,] 24 33

    We can apply the test now with:

    - +

    As can be seen, the p value is 0.031, which is below the conventional significance level of 0.05, so the fosfomycin resistance found in hospitals A and D differs significantly.

    diff --git a/docs/articles/AMR_files/figure-html/plot 1-1.png b/docs/articles/AMR_files/figure-html/plot 1-1.png index 97295b959..6ce2bb204 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 1-1.png and b/docs/articles/AMR_files/figure-html/plot 1-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 3-1.png b/docs/articles/AMR_files/figure-html/plot 3-1.png index bf1b2b193..6846b9870 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 3-1.png and b/docs/articles/AMR_files/figure-html/plot 3-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 4-1.png b/docs/articles/AMR_files/figure-html/plot 4-1.png index 3f4fbeb66..df4d8e0d0 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 4-1.png and b/docs/articles/AMR_files/figure-html/plot 4-1.png differ diff --git a/docs/articles/AMR_files/figure-html/plot 5-1.png b/docs/articles/AMR_files/figure-html/plot 5-1.png index dc9c4aa3a..c3abecebb 100644 Binary files a/docs/articles/AMR_files/figure-html/plot 5-1.png and b/docs/articles/AMR_files/figure-html/plot 5-1.png differ diff --git a/docs/articles/EUCAST.html b/docs/articles/EUCAST.html index 256d165e8..a9d2ddedc 100644 --- a/docs/articles/EUCAST.html +++ b/docs/articles/EUCAST.html @@ -40,7 +40,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -118,13 +118,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    How to apply EUCAST rules

    Matthijs S. Berends

    -

    01 July 2019

    +

    29 July 2019

    diff --git a/docs/articles/MDR.html b/docs/articles/MDR.html index f59d29b54..c17496e8f 100644 --- a/docs/articles/MDR.html +++ b/docs/articles/MDR.html @@ -40,7 +40,7 @@
    AMR (for R) - 0.7.1.9005 + 0.7.1.9026 @@ -118,13 +118,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    How to determine multi-drug resistance (MDR)

    Matthijs S. Berends

    -

    01 July 2019

    +

    06 August 2019

    @@ -208,58 +201,60 @@
  • “WIP-Richtlijn Bijzonder Resistente Micro-organismen (BRMO)”, by RIVM (Rijksinstituut voor de Volksgezondheid, the Netherlands National Institute for Public Health and the Environment)
  • As an example, I will make a data set to determine multi-drug resistant TB:

    -
    +

    Because all column names are automatically verified for valid drug names or codes, this would have worked exactly the same:

    - +

    The data set looks like this now:

    - +

    We can now add the interpretation of MDR-TB to our data set:

    - -

    And review the result with a frequency table:

    -
    freq(my_TB_data$mdr)
    -

    Frequency table of mdr from my_TB_data (5,000 x 8)

    + +

    We also created a package dedicated to data cleaning and checking, called the clean package. It gets automatically installed with the AMR package, so we only have to load it:

    +
    library(clean)
    +

    It contains the freq() function, to create a frequency table:

    +
    freq(my_TB_data$mdr)
    +

    Frequency table

    Class: factor > ordered (numeric)
    Length: 5,000 (of which NA: 0 = 0.00%)
    Levels: 5: Negative < Mono-resistance < Poly-resistance < Multidrug resistance…
    @@ -277,41 +272,41 @@ Unique: 5

    1 Mono-resistance -3,222 -64.4% -3,222 -64.4% +3264 +65.3% +3264 +65.3% 2 Negative -659 -13.2% -3,881 -77.6% +627 +12.5% +3891 +77.8% 3 Multidrug resistance -589 -11.8% -4,470 -89.4% +607 +12.1% +4498 +90.0% 4 Poly-resistance -313 -6.3% -4,783 +288 +5.8% +4786 95.7% 5 Extensive drug resistance -217 +214 4.3% -5,000 +5000 100.0% diff --git a/docs/articles/SPSS.html b/docs/articles/SPSS.html index 35c47c154..3364be69a 100644 --- a/docs/articles/SPSS.html +++ b/docs/articles/SPSS.html @@ -40,7 +40,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -118,13 +118,6 @@ Get properties of an antibiotic -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    How to import data from SPSS / SAS / Stata

    Matthijs S. Berends

    -

    01 July 2019

    +

    29 July 2019

    @@ -242,39 +235,39 @@

    If you sometimes write syntaxes in SPSS to run a complete analysis or to ‘automate’ some of your work, you should perhaps do this in R. You will notice that writing syntaxes in R is a lot more nifty and clever than in SPSS. Still, as with any statistical package, you will need to know what you are doing (statistically) and what you want to accomplish.

    To demonstrate the first point:

    -
    +

    @@ -290,97 +283,97 @@

    If you want named variables to be imported as factors so it resembles SPSS more, use as_factor().

    The difference is this:

    - +

    Base R

    To import data from SPSS, SAS or Stata, you can use the great haven package yourself:

    - +

    You can now import files as follows:

    SPSS

    To read files from SPSS into R:

    -
    # read any SPSS file based on file extension (best way):
    -read_spss(file = "path/to/file")
    -
    -# read .sav or .zsav file:
    -read_sav(file = "path/to/file")
    -
    -# read .por file:
    -read_por(file = "path/to/file")
    +
    # read any SPSS file based on file extension (best way):
    +read_spss(file = "path/to/file")
    +
    +# read .sav or .zsav file:
    +read_sav(file = "path/to/file")
    +
    +# read .por file:
    +read_por(file = "path/to/file")

    Do not forget about as_factor(), as mentioned above.

    To export your R objects to the SPSS file format:

    -
    # save as .sav file:
    -write_sav(data = yourdata, path = "path/to/file")
    -
    -# save as compressed .zsav file:
    -write_sav(data = yourdata, path = "path/to/file", compress = TRUE)
    +
    # save as .sav file:
    +write_sav(data = yourdata, path = "path/to/file")
    +
    +# save as compressed .zsav file:
    +write_sav(data = yourdata, path = "path/to/file", compress = TRUE)

    SAS

    To read files from SAS into R:

    -
    # read .sas7bdat + .sas7bcat files:
    -read_sas(data_file = "path/to/file", catalog_file = NULL)
    -
    -# read SAS transport files (version 5 and version 8):
    -read_xpt(file = "path/to/file")
    +
    # read .sas7bdat + .sas7bcat files:
    +read_sas(data_file = "path/to/file", catalog_file = NULL)
    +
    +# read SAS transport files (version 5 and version 8):
    +read_xpt(file = "path/to/file")

    To export your R objects to the SAS file format:

    -
    # save as regular SAS file:
    -write_sas(data = yourdata, path = "path/to/file")
    -
    -# the SAS transport format is an open format 
    -# (required for submission of the data to the FDA)
    -write_xpt(data = yourdata, path = "path/to/file", version = 8)
    +
    # save as regular SAS file:
    +write_sas(data = yourdata, path = "path/to/file")
    +
    +# the SAS transport format is an open format 
    +# (required for submission of the data to the FDA)
    +write_xpt(data = yourdata, path = "path/to/file", version = 8)

    Stata

    To read files from Stata into R:

    -
    # read .dta file:
    -read_stata(file = "/path/to/file")
    -
    -# works exactly the same:
    -read_dta(file = "/path/to/file")
    +
    # read .dta file:
    +read_stata(file = "/path/to/file")
    +
    +# works exactly the same:
    +read_dta(file = "/path/to/file")

    To export your R objects to the Stata file format:

    - +
    diff --git a/docs/articles/WHONET.html b/docs/articles/WHONET.html index e4e054b85..cf72f8717 100644 --- a/docs/articles/WHONET.html +++ b/docs/articles/WHONET.html @@ -40,7 +40,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9026 @@ -118,13 +118,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    How to work with WHONET data

    Matthijs S. Berends

    -

    01 July 2019

    +

    06 August 2019

    @@ -206,38 +199,42 @@
    Import of data

    This tutorial assumes you already imported the WHONET data with e.g. the readxl package. In RStudio, this can be done using the menu button ‘Import Dataset’ in the tab ‘Environment’. Choose the option ‘From Excel’ and select your exported file. Make sure date fields are imported correctly.

    An example syntax could look like this:

    -
    library(readxl)
    -data <- read_excel(path = "path/to/your/file.xlsx")
    +
    library(readxl)
    +data <- read_excel(path = "path/to/your/file.xlsx")

    This package comes with an example data set WHONET. We will use it for this analysis.

    Preparation

    First, load the relevant packages if you have not done so yet. I use the tidyverse for all of my analyses. All of them. If you don’t know it yet, I suggest you read about it on their website: https://www.tidyverse.org/.

    -
    library(dplyr)   # part of tidyverse
    -library(ggplot2) # part of tidyverse
    -library(AMR)     # this package
    +
    library(dplyr)   # part of tidyverse
    +library(ggplot2) # part of tidyverse
    +library(AMR)     # this package

    We will have to transform some variables to simplify and automate the analysis:

    • Microorganisms should be transformed to our own microorganism IDs (called an mo) using the ITIS reference data set, which contains all ~20,000 microorganisms from the taxonomic kingdoms Bacteria, Fungi and Protozoa. We do the transformation with as.mo(). This function also recognises almost all WHONET abbreviations of microorganisms.
    • Antimicrobial results or interpretations have to be clean and valid. In other words, they should only contain values "S", "I" or "R". That is exactly what the as.rsi() function is for.
    - -

    No errors or warnings, so all values are transformed successfully. Let’s check it though, with a couple of frequency tables:

    - -

    Frequency table of mo from data (500 x 54)

    + +

    No errors or warnings, so all values are transformed successfully.

    +

    We created a package dedicated to data cleaning and checking, called the clean package. It gets automatically installed with the AMR package, so we only have to load it:

    +
    library(clean)
    +

    It contains the freq() function, to create frequency tables.

    +

    So let’s check our data, with a couple of frequency tables:

    + +

    Frequency table

    Class: mo (character)
    Length: 500 (of which NA: 0 = 0.00%)
    Unique: 39

    Families: 10
    Genera: 17
    -Species: 38

    +Species: 39

    @@ -331,18 +328,16 @@ Species: 38

    (omitted 29 entries, n = 57 [11.4%])

    - -

    Frequency table of AMC_ND2 from data (500 x 54)

    + +

    Frequency table

    Class: factor > ordered > rsi (numeric)
    Length: 500 (of which NA: 19 = 3.80%)
    Levels: 3: S < I < R
    Unique: 3

    -

    Drug: Amoxicillin/clavulanic acid (AMC, J01CR02)
    -Group: Beta-lactams/penicillins
    -%SI: 78.59%

    +

    %SI: 78.6%

    diff --git a/docs/articles/benchmarks.html b/docs/articles/benchmarks.html index d950825d9..0c5b69c94 100644 --- a/docs/articles/benchmarks.html +++ b/docs/articles/benchmarks.html @@ -40,7 +40,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -118,13 +118,6 @@ Get properties of an antibiotic -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    Benchmarks

    Matthijs S. Berends

    -

    01 July 2019

    +

    29 July 2019

    @@ -203,161 +196,161 @@

    One of the most important features of this package is the complete microbial taxonomic database, supplied by the Catalogue of Life. We created a function as.mo() that transforms any user input value to a valid microbial ID by using intelligent rules combined with the taxonomic tree of Catalogue of Life.

    Using the microbenchmark package, we can review the calculation performance of this function. Its function microbenchmark() runs different input expressions independently of each other and measures their time-to-result.

    -
    library(microbenchmark)
    -library(AMR)
    +
    library(microbenchmark)
    +library(AMR)

    In the next test, we try to ‘coerce’ different input values for Staphylococcus aureus. The actual result is the same every time: it returns its MO code B_STPHY_AUR (B stands for Bacteria, the taxonomic kingdom).

    But the calculation time differs a lot:

    - +

    In the table above, all measurements are shown in milliseconds (thousandths of seconds). A value of 5 milliseconds means it can determine 200 input values per second. In case of 100 milliseconds, this is only 10 input values per second. The second input is the only one that has to be looked up thoroughly. All the others are known codes (the first one is a WHONET code) or common laboratory codes, or common full organism names like the last one. Full organism names are always preferred.

    To achieve this speed, the as.mo function also takes into account the prevalence of human pathogenic microorganisms. The downside is of course that less prevalent microorganisms will be determined less fast. See this example for the ID of Thermus islandicus (B_THERMS_ISL), a bug probably never found before in humans:

    - -

    That takes 6.8 times as much time on average. A value of 100 milliseconds means it can only determine ~10 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance. Full names (like Thermus islandicus) are almost fast - these are the most probable input from most data sets.

    + +

    That takes 8.8 times as much time on average. A value of 100 milliseconds means it can only determine ~10 different input values per second. We can conclude that looking up arbitrary codes of less prevalent microorganisms is the worst way to go, in terms of calculation performance. Full names (like Thermus islandicus) are almost fast - these are the most probable input from most data sets.

    In the figure below, we compare Escherichia coli (which is very common) with Prevotella brevis (which is moderately common) and with Thermus islandicus (which is very uncommon):

    -
    par(mar = c(5, 16, 4, 2)) # set more space for left margin text (16)
    -
    -boxplot(microbenchmark(as.mo("Thermus islandicus"),
    -                       as.mo("Prevotella brevis"),
    -                       as.mo("Escherichia coli"),
    -                       as.mo("T. islandicus"),
    -                       as.mo("P. brevis"),
    -                       as.mo("E. coli"),
    -                       times = 10),
    -        horizontal = TRUE, las = 1, unit = "s", log = FALSE,
    -        xlab = "", ylab = "Time in seconds",
    -        main = "Benchmarks per prevalence")
    +
    par(mar = c(5, 16, 4, 2)) # set more space for left margin text (16)
    +
    +boxplot(microbenchmark(as.mo("Thermus islandicus"),
    +                       as.mo("Prevotella brevis"),
    +                       as.mo("Escherichia coli"),
    +                       as.mo("T. islandicus"),
    +                       as.mo("P. brevis"),
    +                       as.mo("E. coli"),
    +                       times = 10),
    +        horizontal = TRUE, las = 1, unit = "s", log = FALSE,
    +        xlab = "", ylab = "Time in seconds",
    +        main = "Benchmarks per prevalence")

    Uncommon microorganisms take a lot more time than common microorganisms. To relieve this pitfall and further improve performance, two important calculations take almost no time at all: repetitive results and already precalculated results.

    Repetitive results

    Repetitive results are unique values that are present more than once. Unique values will only be calculated once by as.mo(). We will use mo_fullname() for this test - a helper function that returns the full microbial name (genus, species and possibly subspecies) which uses as.mo() internally.

    - -

    So transforming 500,000 values (!!) of 50 unique values only takes 1.09 seconds (1092 ms). You only lose time on your unique input values.

    + +

    So transforming 500,000 values (!!) of 50 unique values only takes 0.62 seconds (618 ms). You only lose time on your unique input values.

    Precalculated results

    What about precalculated results? If the input is an already precalculated result of a helper function like mo_fullname(), it almost doesn’t take any time at all (see ‘C’ below):

    - -

    So going from mo_fullname("Staphylococcus aureus") to "Staphylococcus aureus" takes 0.0018 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:

    - + +

    So going from mo_fullname("Staphylococcus aureus") to "Staphylococcus aureus" takes 0.0008 seconds - it doesn’t even start calculating if the result would be the same as the expected resulting value. That goes for all helper functions:

    +

    Of course, when running mo_phylum("Firmicutes") the function has zero knowledge about the actual microorganism, namely S. aureus. But since the result would be "Firmicutes" too, there is no point in calculating the result. And because this package ‘knows’ all phyla of all known bacteria (according to the Catalogue of Life), it can just return the initial value immediately.

    Results in other languages

    When the system language is non-English and supported by this AMR package, some functions will have a translated result. This takes almost no extra time:

    -
    mo_fullname("CoNS", language = "en") # or just mo_fullname("CoNS") on an English system
    -# [1] "Coagulase-negative Staphylococcus (CoNS)"
    -
    -mo_fullname("CoNS", language = "es") # or just mo_fullname("CoNS") on a Spanish system
    -# [1] "Staphylococcus coagulasa negativo (SCN)"
    -
    -mo_fullname("CoNS", language = "nl") # or just mo_fullname("CoNS") on a Dutch system
    -# [1] "Coagulase-negatieve Staphylococcus (CNS)"
    -
    -run_it <- microbenchmark(en = mo_fullname("CoNS", language = "en"),
    -                         de = mo_fullname("CoNS", language = "de"),
    -                         nl = mo_fullname("CoNS", language = "nl"),
    -                         es = mo_fullname("CoNS", language = "es"),
    -                         it = mo_fullname("CoNS", language = "it"),
    -                         fr = mo_fullname("CoNS", language = "fr"),
    -                         pt = mo_fullname("CoNS", language = "pt"),
    -                         times = 10)
    -print(run_it, unit = "ms", signif = 4)
    -# Unit: milliseconds
    -#  expr   min    lq  mean median    uq    max neval
    -#    en 43.00 43.12 45.51  44.82 44.89  56.61    10
    -#    de 46.47 46.99 52.11  47.57 48.11  93.77    10
    -#    nl 60.86 62.72 67.57  63.69 63.99 108.20    10
    -#    es 45.74 46.05 52.37  46.42 47.98 103.00    10
    -#    it 45.84 45.89 51.90  47.66 47.73  94.83    10
    -#    fr 45.97 46.92 47.44  47.76 47.86  48.49    10
    -#    pt 45.93 46.77 47.36  47.77 47.93  48.12    10
    +
    mo_fullname("CoNS", language = "en") # or just mo_fullname("CoNS") on an English system
    +# [1] "Coagulase-negative Staphylococcus (CoNS)"
    +
    +mo_fullname("CoNS", language = "es") # or just mo_fullname("CoNS") on a Spanish system
    +# [1] "Staphylococcus coagulasa negativo (SCN)"
    +
    +mo_fullname("CoNS", language = "nl") # or just mo_fullname("CoNS") on a Dutch system
    +# [1] "Coagulase-negatieve Staphylococcus (CNS)"
    +
    +run_it <- microbenchmark(en = mo_fullname("CoNS", language = "en"),
    +                         de = mo_fullname("CoNS", language = "de"),
    +                         nl = mo_fullname("CoNS", language = "nl"),
    +                         es = mo_fullname("CoNS", language = "es"),
    +                         it = mo_fullname("CoNS", language = "it"),
    +                         fr = mo_fullname("CoNS", language = "fr"),
    +                         pt = mo_fullname("CoNS", language = "pt"),
    +                         times = 10)
    +print(run_it, unit = "ms", signif = 4)
    +# Unit: milliseconds
    +#  expr   min    lq  mean median    uq   max neval
    +#    en 17.21 17.67 18.20  18.40 18.62 18.81    10
    +#    de 18.76 19.01 21.59  19.45 19.82 41.96    10
    +#    nl 24.15 24.57 29.11  25.67 26.49 45.25    10
    +#    es 18.31 19.02 19.33  19.37 19.93 20.09    10
    +#    it 18.96 19.27 23.48  19.58 20.91 41.27    10
    +#    fr 18.33 18.80 19.46  19.27 19.97 21.10    10
    +#    pt 18.89 19.50 20.54  19.70 20.36 27.83    10

    Currently supported are German, Dutch, Spanish, Italian, French and Portuguese.

    diff --git a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png index e79a5f73b..95aa15088 100644 Binary files a/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png and b/docs/articles/benchmarks_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/docs/articles/freq.html b/docs/articles/freq.html index 3da235d37..4424b9cf0 100644 --- a/docs/articles/freq.html +++ b/docs/articles/freq.html @@ -40,7 +40,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9012 @@ -192,7 +192,7 @@

    How to create frequency tables

    Matthijs S. Berends

    -

    01 July 2019

    +

    10 July 2019

    @@ -210,17 +210,17 @@

    Frequencies of one variable

    To only show and quickly review the content of one variable, you can just select this variable in various ways. Let’s say we want to get the frequencies of the gender variable of the septic_patients dataset:

    - +

    Frequency table of gender from septic_patients (2,000 x 49)

    Class: character
    Length: 2,000 (of which NA: 0 = 0.00%)
    @@ -262,22 +262,22 @@ Longest: 1

    Frequencies of more than one variable

    Multiple variables will be pasted into one variable to review individual cases, keeping a univariate frequency table.

    For illustration, we could add some more variables to the septic_patients dataset to learn about bacterial properties:

    - +

    Now all variables of the microorganisms dataset have been joined to the septic_patients dataset. The microorganisms dataset consists of the following variables:

    - +

    If we compare the dimensions between the old and new dataset, we can see that these 15 variables were added:

    -
    dim(septic_patients)
    -# [1] 2000   49
    -dim(my_patients)
    -# [1] 2000   64
    +
    dim(septic_patients)
    +# [1] 2000   49
    +dim(my_patients)
    +# [1] 2000   64

    So now the genus and species variables are available. A frequency table of these combined variables can be created like this:

    -
    my_patients %>%
    -  freq(genus, species, nmax = 15)
    +
    my_patients %>%
    +  freq(genus, species, nmax = 15)

    Frequency table of genus and species from my_patients (2,000 x 64)

    Columns: 2
    Length: 2,000 (of which NA: 0 = 0.00%)
    @@ -423,10 +423,10 @@ Longest: 34

    Frequencies of numeric values

    Frequency tables can be created of any input.

    In case of numeric values (like integers, doubles, etc.) additional descriptive statistics will be calculated and shown in the header:

    - +

    Frequency table of age from a data.frame (981 x 49)

    Class: numeric
    Length: 981 (of which NA: 0 = 0.00%)
    @@ -506,8 +506,8 @@ Outliers: 15 (unique count: 12)

    Frequencies of factors

    To sort frequencies of factors on their levels instead of item count, use the sort.count parameter.

    sort.count is TRUE by default. Compare this default behaviour…

    - +

    Frequency table of hospital_id from septic_patients (2,000 x 49)

    Class: factor (numeric)
    Length: 2,000 (of which NA: 0 = 0.00%)
    @@ -558,8 +558,8 @@ Unique: 4

  • … to this, where items are now sorted on factor levels:

    -
    septic_patients %>%
    -  freq(hospital_id, sort.count = FALSE)
    +
    septic_patients %>%
    +  freq(hospital_id, sort.count = FALSE)

    Frequency table of hospital_id from septic_patients (2,000 x 49)

    Class: factor (numeric)
    Length: 2,000 (of which NA: 0 = 0.00%)
    @@ -610,8 +610,8 @@ Unique: 4

    All classes will be printed into the header. Variables with the new rsi class of this AMR package are actually ordered factors and have three classes (look at Class in the header):

    -
    septic_patients %>%
    -  freq(AMX, header = TRUE)
    +
    septic_patients %>%
    +  freq(AMX, header = TRUE)

    Frequency table of AMX from septic_patients (2,000 x 49)

    Class: factor > ordered > rsi (numeric)
    Length: 2,000 (of which NA: 771 = 38.55%)
    @@ -661,8 +661,8 @@ Group: Beta-lactams/penicillins

    Frequencies of dates

    Frequencies of dates will show the oldest and newest date in the data, and the amount of days between them:

    -
    septic_patients %>%
    -  freq(date, nmax = 5, header = TRUE)
    +
    septic_patients %>%
    +  freq(date, nmax = 5, header = TRUE)

    Frequency table of date from septic_patients (2,000 x 49)

    Class: Date (numeric)
    Length: 2,000 (of which NA: 0 = 0.00%)
    @@ -728,11 +728,11 @@ Median: 31 July 2009 (47.39%)

    Assigning a frequency table to an object

    A frequency table is actually a regular data.frame, with the exception that it contains an additional class.

    - +

    [1] “freq” “data.frame”

    Because of this additional class, a frequency table prints like the examples above. But the object itself contains the complete table without a row limitation:

    -
    dim(my_df)
    +
    dim(my_df)

    [1] 74 5

    @@ -743,8 +743,8 @@ Median: 31 July 2009 (47.39%)

    Parameter na.rm

    With the na.rm parameter you can remove NA values from the frequency table (defaults to TRUE, but the number of NA values will always be shown in the header):

    -
    septic_patients %>%
    -  freq(AMX, na.rm = FALSE)
    +
    septic_patients %>%
    +  freq(AMX, na.rm = FALSE)

    Frequency table of AMX from septic_patients (2,000 x 49)

    Class: factor > ordered > rsi (numeric)
    Length: 2,000 (of which NA: 771 = 38.55%)
    @@ -803,8 +803,8 @@ Group: Beta-lactams/penicillins
    Parameter row.names

    A frequency table shows row indices. To remove them, use row.names = FALSE:

    -
    septic_patients %>%
    -  freq(hospital_id, row.names = FALSE)
    +
    septic_patients %>%
    +  freq(hospital_id, row.names = FALSE)

    Frequency table of hospital_id from septic_patients (2,000 x 49)

    Class: factor (numeric)
    Length: 2,000 (of which NA: 0 = 0.00%)
    @@ -855,21 +855,21 @@ Unique: 4

    Parameter markdown

    The markdown parameter is TRUE by default in non-interactive sessions, like in reports created with R Markdown. This will always print all rows, unless nmax is set. Without markdown (like in regular R), a frequency table would print like:

    - +
    diff --git a/docs/articles/index.html b/docs/articles/index.html index a13d0919d..f839a1166 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9008 + 0.7.1.9026 @@ -156,13 +156,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -242,7 +235,6 @@
  • How to import data from SPSS / SAS / Stata
  • How to work with WHONET data
  • Benchmarks
  • -
  • How to create frequency tables
  • How to predict antimicrobial resistance
  • diff --git a/docs/articles/resistance_predict.html b/docs/articles/resistance_predict.html index 43c052081..a2a5bee30 100644 --- a/docs/articles/resistance_predict.html +++ b/docs/articles/resistance_predict.html @@ -40,7 +40,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -118,13 +118,6 @@ Get properties of an antibiotic -
  • - - - - Create frequency tables - -
  • @@ -192,7 +185,7 @@

    How to predict antimicrobial resistance

    Matthijs S. Berends

    -

    01 July 2019

    +

    29 July 2019

    @@ -206,28 +199,28 @@
    Needed R packages

    As with many uses in R, we need some additional packages for AMR analysis. Our package works closely together with the tidyverse packages dplyr and ggplot2 by Dr Hadley Wickham. The tidyverse tremendously improves the way we conduct data science - it allows for a very natural way of writing syntaxes and creating beautiful plots in R.

    Our AMR package depends on these packages and even extends their use and functions.

    - +

    Prediction analysis

    Our package contains a function resistance_predict(), which takes the same input as functions for other AMR analysis. Based on a date column, it calculates cases per year and uses a regression model to predict antimicrobial resistance.

    It is basically as easy as:

    - +

    The function will look for a date column itself if col_date is not set.

    When running any of these commands, a summary of the regression model will be printed unless using resistance_predict(..., info = FALSE).

    # NOTE: Using column `date` as input for `col_date`.
    @@ -257,55 +250,55 @@
     # 
     # Number of Fisher Scoring iterations: 4

    This text is only a printed summary - the actual result (output) of the function is a data.frame containing for each year: the number of observations, the actual observed resistance, the estimated resistance and the standard error below and above the estimation:

    -
    predict_TZP
    -#    year      value    se_min    se_max observations   observed  estimated
    -# 1  2003 0.06250000        NA        NA           32 0.06250000 0.05486389
    -# 2  2004 0.08536585        NA        NA           82 0.08536585 0.06089002
    -# 3  2005 0.05000000        NA        NA           60 0.05000000 0.06753075
    -# 4  2006 0.05084746        NA        NA           59 0.05084746 0.07483801
    -# 5  2007 0.12121212        NA        NA           66 0.12121212 0.08286570
    -# 6  2008 0.04166667        NA        NA           72 0.04166667 0.09166918
    -# 7  2009 0.01639344        NA        NA           61 0.01639344 0.10130461
    -# 8  2010 0.05660377        NA        NA           53 0.05660377 0.11182814
    -# 9  2011 0.18279570        NA        NA           93 0.18279570 0.12329488
    -# 10 2012 0.30769231        NA        NA           65 0.30769231 0.13575768
    -# 11 2013 0.06896552        NA        NA           58 0.06896552 0.14926576
    -# 12 2014 0.10000000        NA        NA           60 0.10000000 0.16386307
    -# 13 2015 0.23636364        NA        NA           55 0.23636364 0.17958657
    -# 14 2016 0.22619048        NA        NA           84 0.22619048 0.19646431
    -# 15 2017 0.16279070        NA        NA           86 0.16279070 0.21451350
    -# 16 2018 0.23373852 0.2021578 0.2653193           NA         NA 0.23373852
    -# 17 2019 0.25412909 0.2168525 0.2914057           NA         NA 0.25412909
    -# 18 2020 0.27565854 0.2321869 0.3191302           NA         NA 0.27565854
    -# 19 2021 0.29828252 0.2481942 0.3483709           NA         NA 0.29828252
    -# 20 2022 0.32193804 0.2649008 0.3789753           NA         NA 0.32193804
    -# 21 2023 0.34654311 0.2823269 0.4107593           NA         NA 0.34654311
    -# 22 2024 0.37199700 0.3004860 0.4435080           NA         NA 0.37199700
    -# 23 2025 0.39818127 0.3193839 0.4769787           NA         NA 0.39818127
    -# 24 2026 0.42496142 0.3390173 0.5109056           NA         NA 0.42496142
    -# 25 2027 0.45218939 0.3593720 0.5450068           NA         NA 0.45218939
    -# 26 2028 0.47970658 0.3804212 0.5789920           NA         NA 0.47970658
    -# 27 2029 0.50734745 0.4021241 0.6125708           NA         NA 0.50734745
    +
    predict_TZP
    +#    year      value    se_min    se_max observations   observed  estimated
    +# 1  2003 0.06250000        NA        NA           32 0.06250000 0.05486389
    +# 2  2004 0.08536585        NA        NA           82 0.08536585 0.06089002
    +# 3  2005 0.05000000        NA        NA           60 0.05000000 0.06753075
    +# 4  2006 0.05084746        NA        NA           59 0.05084746 0.07483801
    +# 5  2007 0.12121212        NA        NA           66 0.12121212 0.08286570
    +# 6  2008 0.04166667        NA        NA           72 0.04166667 0.09166918
    +# 7  2009 0.01639344        NA        NA           61 0.01639344 0.10130461
    +# 8  2010 0.05660377        NA        NA           53 0.05660377 0.11182814
    +# 9  2011 0.18279570        NA        NA           93 0.18279570 0.12329488
    +# 10 2012 0.30769231        NA        NA           65 0.30769231 0.13575768
    +# 11 2013 0.06896552        NA        NA           58 0.06896552 0.14926576
    +# 12 2014 0.10000000        NA        NA           60 0.10000000 0.16386307
    +# 13 2015 0.23636364        NA        NA           55 0.23636364 0.17958657
    +# 14 2016 0.22619048        NA        NA           84 0.22619048 0.19646431
    +# 15 2017 0.16279070        NA        NA           86 0.16279070 0.21451350
    +# 16 2018 0.23373852 0.2021578 0.2653193           NA         NA 0.23373852
    +# 17 2019 0.25412909 0.2168525 0.2914057           NA         NA 0.25412909
    +# 18 2020 0.27565854 0.2321869 0.3191302           NA         NA 0.27565854
    +# 19 2021 0.29828252 0.2481942 0.3483709           NA         NA 0.29828252
    +# 20 2022 0.32193804 0.2649008 0.3789753           NA         NA 0.32193804
    +# 21 2023 0.34654311 0.2823269 0.4107593           NA         NA 0.34654311
    +# 22 2024 0.37199700 0.3004860 0.4435080           NA         NA 0.37199700
    +# 23 2025 0.39818127 0.3193839 0.4769787           NA         NA 0.39818127
    +# 24 2026 0.42496142 0.3390173 0.5109056           NA         NA 0.42496142
    +# 25 2027 0.45218939 0.3593720 0.5450068           NA         NA 0.45218939
    +# 26 2028 0.47970658 0.3804212 0.5789920           NA         NA 0.47970658
    +# 27 2029 0.50734745 0.4021241 0.6125708           NA         NA 0.50734745

    The function plot is available in base R, and can be extended by other packages so that the output depends on the type of input. We extended its function to cope with resistance predictions:

    -
    plot(predict_TZP)
    +
    plot(predict_TZP)

    This is the fastest way to plot the result. It automatically adds the right axes, error bars, titles, number of available observations and type of model.

    We also support the ggplot2 package with our custom function ggplot_rsi_predict() to create more appealing plots:

    -
    ggplot_rsi_predict(predict_TZP)
    +
    ggplot_rsi_predict(predict_TZP)

    - +

    Choosing the right model

    Resistance is not easily predicted; if we look at vancomycin resistance in Gram positives, the spread (i.e. standard error) is enormous:

    -
    septic_patients %>%
    -  filter(mo_gramstain(mo, language = NULL) == "Gram-positive") %>%
    -  resistance_predict(col_ab = "VAN", year_min = 2010, info = FALSE) %>% 
    -  ggplot_rsi_predict()
    -# NOTE: Using column `date` as input for `col_date`.
    +
    septic_patients %>%
    +  filter(mo_gramstain(mo, language = NULL) == "Gram-positive") %>%
    +  resistance_predict(col_ab = "VAN", year_min = 2010, info = FALSE) %>% 
    +  ggplot_rsi_predict()
    +# NOTE: Using column `date` as input for `col_date`.

    Vancomycin resistance could be 100% in ten years, but might also stay around 0%.

    You can define the model with the model parameter. The default model is a generalised linear regression model using a binomial distribution, assuming that a period of zero resistance was followed by a period of increasing resistance leading slowly to more and more resistance.

    @@ -346,25 +339,25 @@

    For the vancomycin resistance in Gram positive bacteria, a linear model might be more appropriate since no (left half of a) binomial distribution is to be expected based on the observed years:

    -
    septic_patients %>%
    -  filter(mo_gramstain(mo, language = NULL) == "Gram-positive") %>%
    -  resistance_predict(col_ab = "VAN", year_min = 2010, info = FALSE, model = "linear") %>% 
    -  ggplot_rsi_predict()
    -# NOTE: Using column `date` as input for `col_date`.
    +
    septic_patients %>%
    +  filter(mo_gramstain(mo, language = NULL) == "Gram-positive") %>%
    +  resistance_predict(col_ab = "VAN", year_min = 2010, info = FALSE, model = "linear") %>% 
    +  ggplot_rsi_predict()
    +# NOTE: Using column `date` as input for `col_date`.

    This seems more likely, doesn’t it?

    The model itself is also available from the object, as an attribute:

    - +
    diff --git a/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-4-1.png b/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-4-1.png index 12921bf25..43bc315bc 100644 Binary files a/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-4-1.png and b/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-4-1.png differ diff --git a/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-5-1.png b/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-5-1.png index b2341c9d5..55784d096 100644 Binary files a/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-5-1.png and b/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-5-1.png differ diff --git a/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-5-2.png b/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-5-2.png index b15c22477..722cfef8f 100644 Binary files a/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-5-2.png and b/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-5-2.png differ diff --git a/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-6-1.png b/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-6-1.png index 41041d740..210de02d1 100644 Binary files a/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-6-1.png and b/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-7-1.png b/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-7-1.png index 5ca2fafe9..dfa1b0f10 100644 Binary files a/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-7-1.png and b/docs/articles/resistance_predict_files/figure-html/unnamed-chunk-7-1.png differ diff --git a/docs/authors.html b/docs/authors.html index cf84b4a77..44c9d9018 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -78,7 +78,7 @@ 
AMR (for R) - 0.7.1.9008 + 0.7.1.9026 @@ -156,13 +156,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/extra.css b/docs/extra.css index 1ab8e5306..527655960 100644 --- a/docs/extra.css +++ b/docs/extra.css @@ -187,8 +187,10 @@ table a:not(.btn):hover, .table a:not(.btn):hover { /* text below header in manual overview */ .template-reference-index h2 ~ p { - font-size: 110%; - /* font-weight: bold; */ + font-size: 16px; +} +.template-reference-topic h2 { + font-size: 24px; } /* logos on index page */ diff --git a/docs/extra.js b/docs/extra.js index d2ce07dc8..69ef2e00d 100644 --- a/docs/extra.js +++ b/docs/extra.js @@ -46,6 +46,9 @@ $( document ).ready(function() { window.location.replace(url_new); } + // Replace 'Value' in manual to 'Returned value' + $(".template-reference-topic h2#value").text("Returned value"); + // PR for 'R for Data Science' on How To pages if ($(".template-article").length > 0) { $('#sidebar').prepend( diff --git a/docs/index.html b/docs/index.html index ebd8cd116..e5fedb316 100644 --- a/docs/index.html +++ b/docs/index.html @@ -42,7 +42,7 @@ AMR (for R) - 0.7.1.9008 + 0.7.1.9026 @@ -120,13 +120,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -190,9 +183,9 @@
    -
    +

    (TLDR - to find out how to conduct AMR analysis, please continue reading here to get started.)


    @@ -204,16 +197,15 @@
    • Reference for the taxonomy of microorganisms, since the package contains all microbial (sub)species from the Catalogue of Life (manual)
    • Interpreting raw MIC and disk diffusion values, based on the latest CLSI or EUCAST guidelines (manual)
    • +
    • Determining first isolates to be used for AMR analysis (manual)
    • Calculating antimicrobial resistance (tutorial)
    • Determining multi-drug resistance (MDR) / multi-drug resistant organisms (MDRO) (tutorial)
    • -
    • Calculating empirical susceptibility of both mono therapy and combination therapy (tutorial)
    • +
    • Calculating (empirical) susceptibility of both mono therapy and combination therapies (tutorial)
    • Predicting future antimicrobial resistance using regression models (tutorial)
    • Getting properties for any microorganism (like Gram stain, species, genus or family) (manual)
    • Getting properties for any antibiotic (like name, ATC code, defined daily dose or trade name) (manual)
    • Plotting antimicrobial resistance (tutorial)
    • -
    • Determining first isolates to be used for AMR analysis (manual)
    • Applying EUCAST expert rules (manual)
    • -
    • Descriptive statistics: frequency tables, kurtosis and skewness (tutorial)

    This package is ready-to-use for a professional environment by specialists in the following fields:

    Medical Microbiology

    @@ -248,7 +240,7 @@

    Latest released version

    This package is available on the official R network (CRAN), which has a peer-reviewed submission process. Install this package in R with:

    - +

    It will be downloaded and installed automatically. For RStudio, click on the menu Tools > Install Packages… and then type in “AMR” and press Install.

    Note: Not all functions on this website may be available in this latest release. To use all functions and data sets mentioned on this website, install the latest development version.

    @@ -256,8 +248,8 @@

    Latest development version

    The latest and unpublished development version can be installed with (precaution: may be unstable):

    -
    install.packages("devtools")
    -devtools::install_gitlab("msberends/AMR")
    +
    install.packages("devtools")
    +devtools::install_gitlab("msberends/AMR")
    @@ -296,11 +288,12 @@

    WHO Collaborating Centre for Drug Statistics Methodology

    This package contains all ~450 antimicrobial drugs and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD, oral and IV) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC, https://www.whocc.no) and the Pharmaceuticals Community Register of the European Commission.

    +

    NOTE: The WHOCC copyright does not allow use for commercial purposes, unlike any other info from this package. See https://www.whocc.no/copyright_disclaimer/.

    Read more about the data from WHOCC in our manual.

    -
    +

    -WHONET / EARS-Net

    +WHONET / EARS-Net

    We support WHONET and EARS-Net data. Exported files from WHONET can be imported into R and can be analysed easily using this package. For education purposes, we created an example data set WHONET with the exact same structure as a WHONET export file. Furthermore, this package also contains a data set antibiotics with all EARS-Net antibiotic abbreviations, and knows almost all WHONET abbreviations for microorganisms. When using WHONET data as input for analysis, all input parameters will be set automatically.

    Read our tutorial about how to work with WHONET data here.

    @@ -314,7 +307,7 @@

    It cleanses existing data by providing new classes for microorganisms, antibiotics and antimicrobial results (both S/I/R and MIC). By installing this package, you teach R everything about microbiology that is needed for analysis. These functions all use intelligent rules to guess results that you would expect:

    • Use as.mo() to get a microbial ID. The IDs are human readable for the trained eye - the ID of Klebsiella pneumoniae is “B_KLBSL_PNE” (B stands for Bacteria) and the ID of S. aureus is “B_STPHY_AUR”. The function takes almost any text as input that looks like the name or code of a microorganism like “E. coli”, “esco” or “esccol” and tries to find expected results using intelligent rules combined with the included Catalogue of Life data set. It only takes milliseconds to find results, please see our benchmarks. Moreover, it can group Staphylococci into coagulase negative and positive (CoNS and CoPS, see source) and can categorise Streptococci into Lancefield groups (like beta-haemolytic Streptococcus Group B, source).
    • -
    • Use as.ab() to get an antibiotic ID. Like microbial IDs, these IDs are also human readable based on those used by EARS-Net. For example, the ID of amoxicillin is AMX and the ID of gentamicin is GEN. The as.ab() function also uses intelligent rules to find results like accepting misspelling, trade names and abbrevations used in many laboratory systems. For instance, the values “Furabid”, “Furadantin”, “nitro” all return the ID of Nitrofurantoine. To accomplish this, the package contains a database with most LIS codes, official names, trade names, DDDs and categories of antibiotics. The function as.atc() will return the ATC code of an antibiotic as defined by the WHO.
    • +
    • Use as.ab() to get an antibiotic ID. Like microbial IDs, these IDs are also human readable based on those used by EARS-Net. For example, the ID of amoxicillin is AMX and the ID of gentamicin is GEN. The as.ab() function also uses intelligent rules to find results like accepting misspellings, trade names and abbreviations used in many laboratory systems. For instance, the values “Furabid”, “Furadantin”, “nitro” all return the ID of Nitrofurantoin. To accomplish this, the package contains a database with most LIS codes, official names, trade names, ATC codes, defined daily doses (DDD) and drug categories of antibiotics.
    • Use as.rsi() to get antibiotic interpretations based on raw MIC values (in mg/L) or disk diffusion values (in mm), or transform existing values to valid antimicrobial results. It produces just S, I or R based on your input and warns about invalid values. Even values like “<=0.002; S” (combined MIC/RSI) will result in “S”.
    • Use as.mic() to cleanse your MIC values. It produces a so-called factor (called ordinal in SPSS) with valid MIC values as levels. A value like “<=0.002; S” (combined MIC/RSI) will result in “<=0.002”.
    @@ -330,7 +323,7 @@
  • Use mdro() (abbreviation of Multi Drug Resistant Organisms) to check your isolates for exceptional resistance with country-specific guidelines or EUCAST rules. Currently, national guidelines for Germany and the Netherlands are supported.
  • The data set microorganisms contains the complete taxonomic tree of ~65,000 microorganisms. Furthermore, some colloquial names and all Gram stains are available, which enables resistance analysis of e.g. different antibiotics per Gram stain. The package also contains functions to look up values in this data set like mo_genus(), mo_family(), mo_gramstain() or even mo_phylum(). As they use as.mo() internally, they also use the same intelligent rules for determination. For example, mo_genus("MRSA") and mo_genus("S. aureus") will both return "Staphylococcus". They also come with support for German, Dutch, Spanish, Italian, French and Portuguese. These functions can be used to add new variables to your data.
  • -
  • The data set antibiotics contains ~450 antimicrobial drugs with their EARS-Net code, ATC code, PubChem compound ID, official name, common LIS codes and DDDs of both oral and parenteral administration. It also contains all (thousands of) trade names found in PubChem. Use functions like ab_name(), ab_group() and ab_tradenames() to look up values. The ab_* functions use as.ab() internally so they support the same intelligent rules to guess the most probable result. For example, ab_name("Fluclox"), ab_name("Floxapen") and ab_name("J01CF05") will all return "Flucloxacillin". These functions can again be used to add new variables to your data.
  • +
  • The data set antibiotics contains ~450 antimicrobial drugs with their EARS-Net code, ATC code, PubChem compound ID, official name, common LIS codes and DDDs of both oral and parenteral administration. It also contains all (thousands of) trade names found in PubChem. The function ab_atc() will return the ATC code of an antibiotic as defined by the WHO. Use functions like ab_name(), ab_group() and ab_tradenames() to look up values. The ab_* functions use as.ab() internally so they support the same intelligent rules to guess the most probable result. For example, ab_name("Fluclox"), ab_name("Floxapen") and ab_name("J01CF05") will all return "Flucloxacillin". These functions can again be used to add new variables to your data.
  • @@ -339,8 +332,6 @@
  • Calculate the resistance (and even co-resistance) of microbial isolates with the portion_R(), portion_IR(), portion_I(), portion_SI() and portion_S() functions. Similarly, the number of isolates can be determined with the count_R(), count_IR(), count_I(), count_SI() and count_S() functions. All these functions can be used with the dplyr package (e.g. in conjunction with summarise())
  • Plot AMR results with geom_rsi(), a function made for the ggplot2 package
  • Predict antimicrobial resistance for the coming years using logistic regression models with the resistance_predict() function
  • -
  • Conduct descriptive statistics to enhance base R: calculate kurtosis(), skewness() and create frequency tables with freq() -
  • @@ -386,9 +377,7 @@

    License

    diff --git a/docs/news/index.html b/docs/news/index.html index b2a19f2e7..49872dce0 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9008 + 0.7.1.9026
    @@ -156,13 +156,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -232,39 +225,46 @@ -
    +

    -AMR 0.7.1.9008 Unreleased +AMR 0.7.1.9026 Unreleased

    +
    +

    +Breaking

    +
      +
    • Function freq() has moved to a new package, clean (CRAN link). Creating frequency tables is actually not within the scope of this package (and never was) and this function has matured a lot over the last two years. Therefore, a new package was created for data cleaning and checking and it perfectly fits the freq() function. The clean package is available on CRAN and will be installed automatically when updating the AMR package, which now imports it. At a later stage, the skewness() and kurtosis() functions will be moved to the clean package too.
    • +
    +

    New

    @@ -273,23 +273,33 @@

    Changed

      +
    • Fixed a bug in eucast_rules() that caused an error when the input was a specific kind of tibble +
    • Removed class atc - using as.atc() is now deprecated in favour of ab_atc() and this will return a character, not the atc class anymore
    • Removed deprecated functions abname(), ab_official(), atc_name(), atc_official(), atc_property(), atc_tradenames(), atc_trivial_nl()
    • Fix and speed improvement for mo_shortname()
    • -
    • Fix for as.mo() where misspelled input would not be understood
    • +
    • Algorithm improvements for as.mo(): +
        +
      • Some misspelled input was not understood
      • +
      • These new trivial names known to the field are now understood: meningococcus, gonococcus, pneumococcus
      • +
      • Added support for unknown yeasts and fungi
      • +
      +
    • +
    • Added the newest taxonomic data from the IJSEM journal (now up to date until August 2019)
    • Fix for using mo_* functions where the coercion uncertainties and failures would not be available through mo_uncertainties() and mo_failures() anymore
    • Deprecated the country parameter of mdro() in favour of the already existing guideline parameter to support multiple guidelines within one country
    • -
    • Fix for frequency tables when creating one directly on a group (using group_by())
    • -
    • The name of RIF is now Rifampicin instead of Rifampin
    • +
    • The name of RIF is now Rifampicin instead of Rifampin
    • The antibiotics data set is now sorted by name
    • +
    • Using verbose mode with eucast_rules(..., verbose = TRUE) returns more informative and readable output
    • +
    • Speed improvement for guess_ab_col() which is now 30 times faster for antibiotic abbreviations
    -
    +

    -AMR 0.7.1 2019-06-23 +AMR 0.7.1 2019-06-23

    @@ -297,14 +307,14 @@

    All these lead to the microbial ID of E. coli:

    - +

  • Function mo_info() as an analogy to ab_info(). The mo_info() prints a list with the full taxonomy, authors, and the URL to the online database of a microorganism
  • Function mo_synonyms() to get all previously accepted taxonomic names of a microorganism

  • @@ -368,9 +378,9 @@ -
    +

    -AMR 0.7.0 2019-06-03 +AMR 0.7.0 2019-06-03

    @@ -419,21 +429,21 @@ Please age() function gained a new parameter exact to determine ages with decimals
  • Removed deprecated functions guess_mo(), guess_atc(), EUCAST_rules(), interpretive_reading(), rsi()
  • -
  • Frequency tables (freq()): +
  • Frequency tables (freq()):
  • @@ -443,7 +453,7 @@ Please age_groups(), to let groups of fives and tens end with 100+ instead of 120+ -
  • Fix for freq() for when all values are NA +
  • Fix for freq() for when all values are NA
  • Fix for first_isolate() for when dates are missing
  • Improved speed of guess_ab_col() @@ -465,9 +475,9 @@ Please +

    -AMR 0.6.1 2019-03-29 +AMR 0.6.1 2019-03-29

    @@ -479,9 +489,9 @@ Please +

    -AMR 0.6.0 2019-03-27 +AMR 0.6.0 2019-03-27

    New website!

    We’ve got a new website: https://msberends.gitlab.io/AMR (built with the great pkgdown)

    @@ -504,7 +514,7 @@ Please catalogue_of_life_version().

  • -
  • Due to this change, some mo codes changed (e.g. Streptococcus changed from B_STRPTC to B_STRPT). A translation table is used internally to support older microorganism IDs, so users will not notice this difference.
  • +
  • Due to this change, some mo codes changed (e.g. Streptococcus changed from B_STRPTC to B_STRPT). A translation table is used internally to support older microorganism IDs, so users will not notice this difference.
  • New function mo_rank() for the taxonomic rank (genus, species, infraspecies, etc.)
  • New function mo_url() to get the direct URL of a species from the Catalogue of Life
  • @@ -518,33 +528,33 @@ This data is updated annually - check the included version with the new function
  • New filters for antimicrobial classes. Use these functions to filter isolates on results in one or more antibiotics from a specific class:

    - +

    The antibiotics data set will be searched, after which the input data will be checked for column names with a value in any abbreviations, codes or official names found in the antibiotics data set. For example:

    - +
  • All ab_* functions are deprecated and replaced by atc_* functions:

    - -These functions use as.atc() internally. The old atc_property has been renamed atc_online_property(). This is done for two reasons: firstly, not all ATC codes are of antibiotics (ab) but can also be of antivirals or antifungals. Secondly, the input must have class atc or must be coerable to this class. Properties of these classes should start with the same class name, analogous to as.mo() and e.g. mo_genus.
  • + +These functions use as.atc() internally. The old atc_property has been renamed atc_online_property(). This is done for two reasons: firstly, not all ATC codes are of antibiotics (ab) but can also be of antivirals or antifungals. Secondly, the input must have class atc or must be coercible to this class. Properties of these classes should start with the same class name, analogous to as.mo() and e.g. mo_genus.
  • New functions set_mo_source() and get_mo_source() to use your own predefined MO codes as input for as.mo() and consequently all mo_* functions
  • Support for the upcoming dplyr version 0.8.0
  • New function guess_ab_col() to find an antibiotic column in a table
  • @@ -555,20 +565,20 @@ These functions use as.atc()New function age_groups() to split ages into custom or predefined groups (like children or elderly). This allows for easier demographic antimicrobial resistance analysis per age group.
  • New function ggplot_rsi_predict() as well as the base R plot() function can now be used for resistance prediction calculated with resistance_predict():

    -
    x <- resistance_predict(septic_patients, col_ab = "amox")
    -plot(x)
    -ggplot_rsi_predict(x)
    +
    x <- resistance_predict(septic_patients, col_ab = "amox")
    +plot(x)
    +ggplot_rsi_predict(x)
  • Functions filter_first_isolate() and filter_first_weighted_isolate() to shorten and speed up filtering on data sets with antimicrobial results, e.g.:

    - +

    is equal to:

    -
    septic_patients %>%
    -  mutate(only_firsts = first_isolate(septic_patients, ...)) %>%
    -  filter(only_firsts == TRUE) %>%
    -  select(-only_firsts)
    +
    septic_patients %>%
    +  mutate(only_firsts = first_isolate(septic_patients, ...)) %>%
    +  filter(only_firsts == TRUE) %>%
    +  select(-only_firsts)
  • New function availability() to check the number of available (non-empty) results in a data.frame
  • @@ -597,33 +607,33 @@ These functions use as.atc()
  • Now handles incorrect spelling, like i instead of y and f instead of ph:

    - +
  • Uncertainty of the algorithm is now divided into four levels, 0 to 3, where the default allow_uncertain = TRUE is equal to uncertainty level 2. Run ?as.mo for more info about these levels.

    -
    # equal:
    -as.mo(..., allow_uncertain = TRUE)
    -as.mo(..., allow_uncertain = 2)
    -
    -# also equal:
    -as.mo(..., allow_uncertain = FALSE)
    -as.mo(..., allow_uncertain = 0)
    +
    # equal:
    +as.mo(..., allow_uncertain = TRUE)
    +as.mo(..., allow_uncertain = 2)
    +
    +# also equal:
    +as.mo(..., allow_uncertain = FALSE)
    +as.mo(..., allow_uncertain = 0)
    Using as.mo(..., allow_uncertain = 3) could lead to very unreliable results.
  • Implemented the latest publication of Becker et al. (2019), for categorising coagulase-negative Staphylococci
  • All microbial IDs that were found are now saved to a local file ~/.Rhistory_mo. Use the new function clean_mo_history() to delete this file, which resets the algorithms.
  • Incoercible results will now be considered ‘unknown’, MO code UNKNOWN. On foreign systems, properties of these will be translated to all languages already previously supported: German, Dutch, French, Italian, Spanish and Portuguese:

    - +
  • Fix for vector containing only empty values
  • Finds better results when input is in other languages
  • @@ -665,23 +675,23 @@ Using as.mo(..., allow_uncertain = 3) -
  • Frequency tables (freq() function): +
  • Frequency tables (freq() function):
  • -
    +

    -AMR 0.5.0 2018-11-30 +AMR 0.5.0 2018-11-30

    @@ -756,10 +766,10 @@ Using as.mo(..., allow_uncertain = 3)Fewer than 3 characters as input for as.mo will return NA
  • Function as.mo (and all mo_* wrappers) now supports genus abbreviations with “species” attached

    -
    as.mo("E. species")        # B_ESCHR
    -mo_fullname("E. spp.")     # "Escherichia species"
    -as.mo("S. spp")            # B_STPHY
    -mo_fullname("S. species")  # "Staphylococcus species"
    +
    as.mo("E. species")        # B_ESCHR
    +mo_fullname("E. spp.")     # "Escherichia species"
    +as.mo("S. spp")            # B_STPHY
    +mo_fullname("S. species")  # "Staphylococcus species"
  • Added parameter combine_IR (TRUE/FALSE) to functions portion_df and count_df, to indicate that all values of I and R must be merged into one, so the output only consists of S vs. IR (susceptible vs. non-susceptible)
  • Fix for portion_*(..., as_percent = TRUE) when minimal number of isolates would not be met
  • @@ -768,21 +778,21 @@ Using as.mo(..., allow_uncertain = 3)Using portion_* functions now throws a warning when total available isolate is below parameter minimum
  • Functions as.mo, as.rsi, as.mic, as.atc and freq will not set package name as attribute anymore
  • -
  • Frequency tables - freq(): +
  • Frequency tables - freq():
  • -
    +

    -AMR 0.4.0 2018-10-01 +AMR 0.4.0 2018-10-01

    @@ -860,18 +870,18 @@ Using as.mo(..., allow_uncertain = 3)

    They also come with support for German, Dutch, French, Italian, Spanish and Portuguese:

    -
    mo_gramstain("E. coli")
    -# [1] "Gram negative"
    -mo_gramstain("E. coli", language = "de") # German
    -# [1] "Gramnegativ"
    -mo_gramstain("E. coli", language = "es") # Spanish
    -# [1] "Gram negativo"
    -mo_fullname("S. group A", language = "pt") # Portuguese
    -# [1] "Streptococcus grupo A"
    +
    mo_gramstain("E. coli")
    +# [1] "Gram negative"
    +mo_gramstain("E. coli", language = "de") # German
    +# [1] "Gramnegativ"
    +mo_gramstain("E. coli", language = "es") # Spanish
    +# [1] "Gram negativo"
    +mo_fullname("S. group A", language = "pt") # Portuguese
    +# [1] "Streptococcus grupo A"

    Furthermore, former taxonomic names will give a note about the current taxonomic name:

    - +
  • Functions count_R, count_IR, count_I, count_SI and count_S to selectively count resistant or susceptible isolates
  • @@ -974,9 +984,9 @@ Using as.mo(..., allow_uncertain = 3)
    -
    +

    -AMR 0.3.0 2018-08-14 +AMR 0.3.0 2018-08-14

    @@ -1036,13 +1046,13 @@ Using as.mo(..., allow_uncertain = 3)
  • A vignette to explain its usage
  • Support for rsi (antimicrobial resistance) to use as input
  • -
  • Support for table to use as input: freq(table(x, y)) +
  • Support for table to use as input: freq(table(x, y))
  • Support for existing functions hist and plot to use a frequency table as input: hist(freq(df$age))
  • Support for as.vector, as.data.frame, as_tibble and format
  • -
  • Support for quasiquotation: freq(mydata, mycolumn) is the same as mydata %>% freq(mycolumn) +
  • Support for quasiquotation: freq(mydata, mycolumn) is the same as mydata %>% freq(mycolumn)
  • Function top_freq to return the top/bottom n items as a vector
  • Header of frequency tables now also shows Mean Absolute Deviation (MAD) and Interquartile Range (IQR)
  • @@ -1081,7 +1091,7 @@ Using as.mo(..., allow_uncertain = 3) -
  • Now possible to coerce MIC values with a space between operator and value, i.e. as.mic("<= 0.002") now works
  • +
  • Now possible to coerce MIC values with a space between operator and value, i.e. as.mic("<= 0.002") now works
  • Classes rsi and mic do not add the attribute package.version anymore
  • Added "groups" option for atc_property(..., property). It will return a vector of the ATC hierarchy as defined by the WHO. The new function atc_groups is a convenient wrapper around this.
  • Built-in host check for atc_property as it requires the host set by url to be responsive
  • @@ -1111,9 +1121,9 @@ Using as.mo(..., allow_uncertain = 3)

    -
    +

    -AMR 0.2.0 2018-05-03 +AMR 0.2.0 2018-05-03

    @@ -1169,9 +1179,9 @@ Using as.mo(..., allow_uncertain = 3)

    -
    +

    -AMR 0.1.1 2018-03-14 +AMR 0.1.1 2018-03-14

    -
    +

    -AMR 0.1.0 2018-02-22 +AMR 0.1.0 2018-02-22

    diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 9f781c72c..a569ce0ce 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -1,4 +1,4 @@ -pandoc: '2.6' +pandoc: 2.3.1 pkgdown: 1.3.0 pkgdown_sha: ~ articles: @@ -8,7 +8,6 @@ articles: SPSS: SPSS.html WHONET: WHONET.html benchmarks: benchmarks.html - freq: freq.html resistance_predict: resistance_predict.html urls: reference: https://msberends.gitlab.io/AMR/reference diff --git a/docs/reference/AMR-deprecated.html b/docs/reference/AMR-deprecated.html index 368607bfd..6e3451992 100644 --- a/docs/reference/AMR-deprecated.html +++ b/docs/reference/AMR-deprecated.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9007 + 0.7.1.9015
    @@ -158,13 +158,6 @@ Get properties of an antibiotic -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/AMR.html b/docs/reference/AMR.html index 6559b61c6..dea7cd8c4 100644 --- a/docs/reference/AMR.html +++ b/docs/reference/AMR.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015
  • @@ -158,13 +158,6 @@ Get properties of an antibiotic -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/ITIS.html b/docs/reference/ITIS.html deleted file mode 100644 index 12a08129e..000000000 --- a/docs/reference/ITIS.html +++ /dev/null @@ -1,337 +0,0 @@ - - - - - - - - -ITIS: Integrated Taxonomic Information System — ITIS • AMR (for R) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - -
    - -
    -
    - - -
    - -

    All taxonomic names of all microorganisms are included in this package, using the authoritative Integrated Taxonomic Information System (ITIS).

    - -
    - - -

    ITIS

    - - -


    -This package contains the complete microbial taxonomic data (with all nine taxonomic ranks - from kingdom to subspecies) from the publicly available Integrated Taxonomic Information System (ITIS, https://www.itis.gov).

    -

    All ~20,000 (sub)species from the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package, as well as all their ~2,500 previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens. It also helps to quickly determine the Gram stain of bacteria, since ITIS honours the taxonomic branching order of bacterial phyla according to Cavalier-Smith (2002), which defines that all bacteria are classified into either subkingdom Negibacteria or subkingdom Posibacteria.

    -

    ITIS is a partnership of U.S., Canadian, and Mexican agencies and taxonomic specialists [3].

    - -

    Read more on our website!

    - - -


    -On our website https://msberends.gitlab.io/AMR you can find a comprehensive tutorial about how to conduct AMR analysis, the complete documentation of all functions (which reads a lot easier than here in R) and an example analysis using WHONET data.

    - - -

    Examples

    -
    # NOT RUN {
    -# Get a note when a species was renamed
    -mo_shortname("Chlamydia psittaci")
    -# Note: 'Chlamydia psittaci' (Page, 1968) was renamed
    -#       'Chlamydophila psittaci' (Everett et al., 1999)
    -# [1] "C. psittaci"
    -
    -# Get any property from the entire taxonomic tree for all included species
    -mo_class("E. coli")
    -# [1] "Gammaproteobacteria"
    -
    -mo_family("E. coli")
    -# [1] "Enterobacteriaceae"
    -
    -mo_subkingdom("E. coli")
    -# [1] "Negibacteria"
    -
    -mo_gramstain("E. coli") # based on subkingdom
    -# [1] "Gram negative"
    -
    -mo_ref("E. coli")
    -# [1] "Castellani and Chalmers, 1919"
    -
    -# Do not get mistaken - the package only includes microorganisms
    -mo_phylum("C. elegans")
    -# [1] "Cyanobacteria"                   # Bacteria?!
    -mo_fullname("C. elegans")
    -# [1] "Chroococcus limneticus elegans"  # Because a microorganism was found
    -# }
    -
    - -
    - - -
    - - - - - - - - - diff --git a/docs/reference/WHOCC.html b/docs/reference/WHOCC.html index bcb3bbcee..cfedd039a 100644 --- a/docs/reference/WHOCC.html +++ b/docs/reference/WHOCC.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9026
  • @@ -158,13 +158,6 @@ Get properties of an antibiotic -
  • - - - - Create frequency tables - -
  • @@ -246,7 +239,7 @@


    -This package contains all ~450 antimicrobial drugs and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC,
    https://www.whocc.no) and the Pharmaceuticals Community Register of the European Commission (http://ec.europa.eu/health/documents/community-register/html/atc.htm).

    +This package contains all ~450 antimicrobial drugs and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC, https://www.whocc.no) and the Pharmaceuticals Community Register of the European Commission (http://ec.europa.eu/health/documents/community-register/html/atc.htm). NOTE: The WHOCC copyright does not allow use for commercial purposes, unlike any other info from this package. See https://www.whocc.no/copyright_disclaimer/.

    These have become the gold standard for international drug utilisation monitoring and research.

    The WHOCC is located in Oslo at the Norwegian Institute of Public Health and funded by the Norwegian government. The European Commission is the executive of the European Union and promotes its general interest.

    diff --git a/docs/reference/WHONET.html b/docs/reference/WHONET.html index 205a3cd2f..8366c8c8a 100644 --- a/docs/reference/WHONET.html +++ b/docs/reference/WHONET.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9007 + 0.7.1.9015
  • @@ -158,13 +158,6 @@ Get properties of an antibiotic -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/ab_property.html b/docs/reference/ab_property.html index 956260632..5972d9884 100644 --- a/docs/reference/ab_property.html +++ b/docs/reference/ab_property.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015
  • @@ -158,13 +158,6 @@ Get properties of an antibiotic -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/age.html b/docs/reference/age.html index af8125076..9c6d191bc 100644 --- a/docs/reference/age.html +++ b/docs/reference/age.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/age_groups.html b/docs/reference/age_groups.html index f1b5113bb..ef94ec085 100644 --- a/docs/reference/age_groups.html +++ b/docs/reference/age_groups.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/antibiotics.html b/docs/reference/antibiotics.html index d396612f1..fbbf2e7f4 100644 --- a/docs/reference/antibiotics.html +++ b/docs/reference/antibiotics.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9026 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -276,7 +269,7 @@


    -This package contains all ~450 antimicrobial drugs and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC,
    https://www.whocc.no) and the Pharmaceuticals Community Register of the European Commission (http://ec.europa.eu/health/documents/community-register/html/atc.htm).

    +This package contains all ~450 antimicrobial drugs and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC, https://www.whocc.no) and the Pharmaceuticals Community Register of the European Commission (http://ec.europa.eu/health/documents/community-register/html/atc.htm). NOTE: The WHOCC copyright does not allow use for commercial purposes, unlike any other info from this package. See https://www.whocc.no/copyright_disclaimer/.

    These have become the gold standard for international drug utilisation monitoring and research.

    The WHOCC is located in Oslo at the Norwegian Institute of Public Health and funded by the Norwegian government. The European Commission is the executive of the European Union and promotes its general interest.

    diff --git a/docs/reference/as.ab.html b/docs/reference/as.ab.html index b2fb70b55..80e2ce26e 100644 --- a/docs/reference/as.ab.html +++ b/docs/reference/as.ab.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9026 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -274,7 +267,7 @@


    -This package contains all ~450 antimicrobial drugs and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC,
    https://www.whocc.no) and the Pharmaceuticals Community Register of the European Commission (http://ec.europa.eu/health/documents/community-register/html/atc.htm).

    +This package contains all ~450 antimicrobial drugs and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC, https://www.whocc.no) and the Pharmaceuticals Community Register of the European Commission (http://ec.europa.eu/health/documents/community-register/html/atc.htm). NOTE: The WHOCC copyright does not allow use for commercial purposes, unlike any other info from this package. See https://www.whocc.no/copyright_disclaimer/.

    These have become the gold standard for international drug utilisation monitoring and research.

    The WHOCC is located in Oslo at the Norwegian Institute of Public Health and funded by the Norwegian government. The European Commission is the executive of the European Union and promotes its general interest.

    diff --git a/docs/reference/as.disk.html b/docs/reference/as.disk.html index b2aa1c906..2b43b8e6a 100644 --- a/docs/reference/as.disk.html +++ b/docs/reference/as.disk.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/as.mic.html b/docs/reference/as.mic.html index 0e02ee0b9..4764d16bb 100644 --- a/docs/reference/as.mic.html +++ b/docs/reference/as.mic.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9026 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -296,7 +289,9 @@ plot(mic_data) barplot(mic_data) -freq(mic_data) + +library(clean) +freq(mic_data) # } @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/as.rsi.html b/docs/reference/as.rsi.html index 19a2fc574..1998dea24 100644 --- a/docs/reference/as.rsi.html +++ b/docs/reference/as.rsi.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9007 + 0.7.1.9026 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -343,7 +336,9 @@ plot(rsi_data) # for percentages barplot(rsi_data) # for frequencies -freq(rsi_data) # frequency table with informative header + +library(clean) +freq(rsi_data) # frequency table with informative header # using dplyr's mutate library(dplyr) diff --git a/docs/reference/atc_online.html b/docs/reference/atc_online.html index 761c1564e..d9f7f5b64 100644 --- a/docs/reference/atc_online.html +++ b/docs/reference/atc_online.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/availability.html b/docs/reference/availability.html index 2d22deff6..0e232912e 100644 --- a/docs/reference/availability.html +++ b/docs/reference/availability.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/catalogue_of_life.html b/docs/reference/catalogue_of_life.html index ebbd8feb7..8e5b534cb 100644 --- a/docs/reference/catalogue_of_life.html +++ b/docs/reference/catalogue_of_life.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/catalogue_of_life_version.html b/docs/reference/catalogue_of_life_version.html index c02e7631e..477007ac2 100644 --- a/docs/reference/catalogue_of_life_version.html +++ b/docs/reference/catalogue_of_life_version.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9026 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -271,8 +264,9 @@ This package contains the complete taxonomic tree of almost all microorganisms (

    Examples

    # NOT RUN {
     library(dplyr)
    -microorganisms %>% freq(kingdom)
    -microorganisms %>% group_by(kingdom) %>% freq(phylum, nmax = NULL)
    +library(clean)
    +microorganisms %>% freq(kingdom)
    +microorganisms %>% group_by(kingdom) %>% freq(phylum, nmax = NULL)
     # }
    @@ -159,13 +159,6 @@ count_R and count_IR can be used to count resistant isolates, count_S and count_ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/eucast_rules.html b/docs/reference/eucast_rules.html index 705f6c9c7..39d664d6a 100644 --- a/docs/reference/eucast_rules.html +++ b/docs/reference/eucast_rules.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9026 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -265,7 +258,7 @@ verbose -

    a logical to indicate whether extensive info should be returned as a data.frame with info about which rows and columns are effected. It runs all EUCAST rules, but will not be applied to an output - only an informative data.frame with changes will be returned as output.

    +

    a logical to turn Verbose mode on and off (default is off). In Verbose mode, the function does not apply rules to the data, but instead returns a data.frame with extensive info about which rows and columns would be affected and in which way.

    ... @@ -381,8 +374,6 @@

    Examples

    # NOT RUN {
    -a <- eucast_rules(septic_patients)
    -
     a <- data.frame(mo = c("Staphylococcus aureus",
                            "Enterococcus faecalis",
                            "Escherichia coli",
    @@ -418,7 +409,7 @@
     # 5 Pseudomonas aeruginosa    R    R    -    -    R    R    R
     
     
    -# do not apply EUCAST rules, but rather get a a data.frame
    +# do not apply EUCAST rules, but rather get a data.frame
     # with 18 rows, containing all details about the transformations:
     c <- eucast_rules(a, verbose = TRUE)
     # }
    diff --git a/docs/reference/extended-functions.html b/docs/reference/extended-functions.html index ce2e53b53..60d1a7c1e 100644 --- a/docs/reference/extended-functions.html +++ b/docs/reference/extended-functions.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/filter_ab_class.html b/docs/reference/filter_ab_class.html index 727881708..3559e96bb 100644 --- a/docs/reference/filter_ab_class.html +++ b/docs/reference/filter_ab_class.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/first_isolate.html b/docs/reference/first_isolate.html index 5a72e039c..8db58b5bc 100644 --- a/docs/reference/first_isolate.html +++ b/docs/reference/first_isolate.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/freq.html b/docs/reference/freq.html index e49000cce..cf70e6379 100644 --- a/docs/reference/freq.html +++ b/docs/reference/freq.html @@ -81,7 +81,7 @@ top_freq can be used to get the top/bottom n items of a frequency table, with co AMR (for R) - 0.7.1.9007 + 0.7.1.9009 diff --git a/docs/reference/g.test.html b/docs/reference/g.test.html index d4d57e49e..a60d6dc28 100644 --- a/docs/reference/g.test.html +++ b/docs/reference/g.test.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/ggplot_rsi.html b/docs/reference/ggplot_rsi.html index 82f7b606e..9769a94d9 100644 --- a/docs/reference/ggplot_rsi.html +++ b/docs/reference/ggplot_rsi.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9007 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -457,7 +450,7 @@ # create new bacterial ID's, with all CoNS under the same group (Becker et al.) mutate(mo = as.mo(mo, Becker = TRUE)) %>% # filter on top three bacterial ID's - filter(mo %in% top_freq(freq(.$mo), 3)) %>% + filter(mo %in% top_freq(freq(.$mo), 3)) %>% # filter on first isolates filter_first_isolate() %>% # get short MO names (like "E. coli") diff --git a/docs/reference/guess_ab_col.html b/docs/reference/guess_ab_col.html index ce86c0538..74c36996e 100644 --- a/docs/reference/guess_ab_col.html +++ b/docs/reference/guess_ab_col.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/index.html b/docs/reference/index.html index b3aca75dc..5febcf299 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -78,7 +78,7 @@ AMR (for R) - 0.7.1.9008 + 0.7.1.9026 @@ -156,13 +156,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -411,7 +404,7 @@

    Analysing your data

    -

    Functions for conducting AMR analysis, like counting isolates, calculating resistance or susceptibility, creating frequency tables or make plots.

    +

    Functions for conducting AMR analysis, like counting isolates, calculating resistance or susceptibility, or make plots.

    @@ -440,12 +433,6 @@

    Filter isolates on result in antibiotic class

    - -

    freq() frequency_tbl() top_freq() header() print(<freq>)

    - -

    Frequency table

    - -

    g.test()

    diff --git a/docs/reference/itis.html b/docs/reference/itis.html deleted file mode 100644 index 12a08129e..000000000 --- a/docs/reference/itis.html +++ /dev/null @@ -1,337 +0,0 @@ - - - - - - - - -ITIS: Integrated Taxonomic Information System — ITIS • AMR (for R) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    -
    - - - -
    - -
    -
    - - -
    - -

    All taxonomic names of all microorganisms are included in this package, using the authoritative Integrated Taxonomic Information System (ITIS).

    - -
    - - -

    ITIS

    - - -


    -This package contains the complete microbial taxonomic data (with all nine taxonomic ranks - from kingdom to subspecies) from the publicly available Integrated Taxonomic Information System (ITIS, https://www.itis.gov).

    -

    All ~20,000 (sub)species from the taxonomic kingdoms Bacteria, Fungi and Protozoa are included in this package, as well as all their ~2,500 previously accepted names known to ITIS. Furthermore, the responsible authors and year of publication are available. This allows users to use authoritative taxonomic information for their data analysis on any microorganism, not only human pathogens. It also helps to quickly determine the Gram stain of bacteria, since ITIS honours the taxonomic branching order of bacterial phyla according to Cavalier-Smith (2002), which defines that all bacteria are classified into either subkingdom Negibacteria or subkingdom Posibacteria.

    -

    ITIS is a partnership of U.S., Canadian, and Mexican agencies and taxonomic specialists [3].

    - -

    Read more on our website!

    - - -


    -On our website https://msberends.gitlab.io/AMR you can find a comprehensive tutorial about how to conduct AMR analysis, the complete documentation of all functions (which reads a lot easier than here in R) and an example analysis using WHONET data.

    - - -

    Examples

    -
    # NOT RUN {
    -# Get a note when a species was renamed
    -mo_shortname("Chlamydia psittaci")
    -# Note: 'Chlamydia psittaci' (Page, 1968) was renamed
    -#       'Chlamydophila psittaci' (Everett et al., 1999)
    -# [1] "C. psittaci"
    -
    -# Get any property from the entire taxonomic tree for all included species
    -mo_class("E. coli")
    -# [1] "Gammaproteobacteria"
    -
    -mo_family("E. coli")
    -# [1] "Enterobacteriaceae"
    -
    -mo_subkingdom("E. coli")
    -# [1] "Negibacteria"
    -
    -mo_gramstain("E. coli") # based on subkingdom
    -# [1] "Gram negative"
    -
    -mo_ref("E. coli")
    -# [1] "Castellani and Chalmers, 1919"
    -
    -# Do not get mistaken - the package only includes microorganisms
    -mo_phylum("C. elegans")
    -# [1] "Cyanobacteria"                   # Bacteria?!
    -mo_fullname("C. elegans")
    -# [1] "Chroococcus limneticus elegans"  # Because a microorganism was found
    -# }
    -
    - -
    - - -
    - - - - - - - - - diff --git a/docs/reference/join.html b/docs/reference/join.html index 81e39cafc..8f972cd66 100644 --- a/docs/reference/join.html +++ b/docs/reference/join.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/key_antibiotics.html b/docs/reference/key_antibiotics.html index 114bde048..22c913b5f 100644 --- a/docs/reference/key_antibiotics.html +++ b/docs/reference/key_antibiotics.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/kurtosis.html b/docs/reference/kurtosis.html index 0d731157e..977c8b5fd 100644 --- a/docs/reference/kurtosis.html +++ b/docs/reference/kurtosis.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/like.html b/docs/reference/like.html index d7b742597..f83642ef6 100644 --- a/docs/reference/like.html +++ b/docs/reference/like.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9026 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -306,10 +299,11 @@ # get frequencies of bacteria whose name start with 'Ent' or 'ent' library(dplyr) +library(clean) septic_patients %>% left_join_microorganisms() %>% filter(genus %like% '^ent') %>% - freq(genus, species) + freq(genus, species) # } @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -244,7 +237,7 @@
    mdro(x, guideline = NULL, col_mo = NULL, info = TRUE,
       verbose = FALSE, ...)
     
    -brmo(..., guideline = "BRMO")
    +brmo(x, guideline = "BRMO", ...)
     
     mrgn(x, guideline = "MRGN", ...)
     
    diff --git a/docs/reference/microorganisms.codes.html b/docs/reference/microorganisms.codes.html
    index 0bc2192b7..541e64c09 100644
    --- a/docs/reference/microorganisms.codes.html
    +++ b/docs/reference/microorganisms.codes.html
    @@ -80,7 +80,7 @@
           
           
             AMR (for R)
    -        0.7.1.9005
    +        0.7.1.9015
           
         
     
    @@ -158,13 +158,6 @@
             Get properties of an antibiotic
           
         
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/microorganisms.html b/docs/reference/microorganisms.html index e86e90b7d..1ab2dc7f8 100644 --- a/docs/reference/microorganisms.html +++ b/docs/reference/microorganisms.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9026 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -245,7 +238,7 @@

    Format

    -

    A data.frame with 67,906 observations and 16 variables:

    +

    A data.frame with 68,260 observations and 16 variables:

    mo

    ID of microorganism as used by this package

    col_id

    Catalogue of Life ID

    fullname

    Full name, like "Escherichia coli"

    @@ -268,8 +261,8 @@
  • 9 entries of Streptococcus (beta haemolytic groups A, B, C, D, F, G, H, K and unspecified)

  • 2 entries of Staphylococcus (coagulase-negative [CoNS] and coagulase-positive [CoPS])

  • 3 entries of Trichomonas (Trichomonas vaginalis, and its family and genus)

  • -
  • 3 other 'undefined' entries (unknown, unknown Gram negatives and unknown Gram positives)

  • -
  • 8,830 species from the DSMZ (Deutsche Sammlung von Mikroorganismen und Zellkulturen) that are not in the Catalogue of Life

  • +
  • 5 other 'undefined' entries (unknown, unknown Gram negatives, unknown Gram positives, unknown yeast and unknown fungus)

  • +
  • 8,970 species from the DSMZ (Deutsche Sammlung von Mikroorganismen und Zellkulturen) that are not in the Catalogue of Life

  • About the records from DSMZ (see source)

    diff --git a/docs/reference/microorganisms.old.html b/docs/reference/microorganisms.old.html index 5024ae93a..15bf81e6f 100644 --- a/docs/reference/microorganisms.old.html +++ b/docs/reference/microorganisms.old.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9026 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -245,7 +238,7 @@

    Format

    -

    A data.frame with 21,342 observations and 4 variables:

    +

    A data.frame with 21,743 observations and 4 variables:

    col_id

    Catalogue of Life ID that was originally given

    col_id_new

    New Catalogue of Life ID that corresponds to an entry in the microorganisms data set

    fullname

    Old full taxonomic name of the microorganism

    diff --git a/docs/reference/mo_property.html b/docs/reference/mo_property.html index 201789259..ff83d0d8e 100644 --- a/docs/reference/mo_property.html +++ b/docs/reference/mo_property.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9007 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/mo_source.html b/docs/reference/mo_source.html index 0606ee24a..50d67bc84 100644 --- a/docs/reference/mo_source.html +++ b/docs/reference/mo_source.html @@ -81,7 +81,7 @@ This is the fastest way to have your organisation (or analysis) specific codes p AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -159,13 +159,6 @@ This is the fastest way to have your organisation (or analysis) specific codes p Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/p.symbol.html b/docs/reference/p.symbol.html index 6bcaa8c47..35774dfc3 100644 --- a/docs/reference/p.symbol.html +++ b/docs/reference/p.symbol.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/portion.html b/docs/reference/portion.html index b86bb2f89..a3433e1ee 100644 --- a/docs/reference/portion.html +++ b/docs/reference/portion.html @@ -81,7 +81,7 @@ portion_R and portion_IR can be used to calculate resistance, portion_S and port AMR (for R) - 0.7.1.9007 + 0.7.1.9015 @@ -159,13 +159,6 @@ portion_R and portion_IR can be used to calculate resistance, portion_S and port Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/read.4D.html b/docs/reference/read.4D.html index c255e9d8e..76d0263d0 100644 --- a/docs/reference/read.4D.html +++ b/docs/reference/read.4D.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/resistance_predict.html b/docs/reference/resistance_predict.html index 6fea0ba05..048a5e499 100644 --- a/docs/reference/resistance_predict.html +++ b/docs/reference/resistance_predict.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9026 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • @@ -291,11 +284,11 @@ model -

    the statistical model of choice. Defaults to a generalised linear regression model with binomial distribution, assuming that a period of zero resistance was followed by a period of increasing resistance leading slowly to more and more resistance. See Details for valid options.

    +

    the statistical model of choice. Defaults to a generalised linear regression model with binomial distribution (i.e. using glm(..., family = binomial)), assuming that a period of zero resistance was followed by a period of increasing resistance leading slowly to more and more resistance. See Details for valid options.

    I_as_S -

    a logical to indicate whether values I should be treated as S

    +

    a logical to indicate whether values I should be treated as S (will otherwise be treated as R)

    preserve_measurements diff --git a/docs/reference/rsi_translation.html b/docs/reference/rsi_translation.html index e96af83b7..ed09bc3a9 100644 --- a/docs/reference/rsi_translation.html +++ b/docs/reference/rsi_translation.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/septic_patients.html b/docs/reference/septic_patients.html index 98b03dcea..3442d8a49 100644 --- a/docs/reference/septic_patients.html +++ b/docs/reference/septic_patients.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9007 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/skewness.html b/docs/reference/skewness.html index 2e7d66c11..120de23a8 100644 --- a/docs/reference/skewness.html +++ b/docs/reference/skewness.html @@ -81,7 +81,7 @@ When negative: the left tail is longer; the mass of the distribution is concentr AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -159,13 +159,6 @@ When negative: the left tail is longer; the mass of the distribution is concentr Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/reference/translate.html b/docs/reference/translate.html index 7c38140cd..8de2ba512 100644 --- a/docs/reference/translate.html +++ b/docs/reference/translate.html @@ -80,7 +80,7 @@ AMR (for R) - 0.7.1.9005 + 0.7.1.9015 @@ -158,13 +158,6 @@ Get properties of an antibiotic
  • -
  • - - - - Create frequency tables - -
  • diff --git a/docs/sitemap.xml b/docs/sitemap.xml index 67b4672ec..c327e49fa 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -69,9 +69,6 @@ https://msberends.gitlab.io/AMR/reference/first_isolate.html - - https://msberends.gitlab.io/AMR/reference/freq.html - https://msberends.gitlab.io/AMR/reference/g.test.html @@ -153,9 +150,6 @@ https://msberends.gitlab.io/AMR/articles/benchmarks.html - - https://msberends.gitlab.io/AMR/articles/freq.html - https://msberends.gitlab.io/AMR/articles/resistance_predict.html diff --git a/git_premaster.sh b/git_premaster.sh index 66a38be53..d2743490d 100755 --- a/git_premaster.sh +++ b/git_premaster.sh @@ -36,6 +36,7 @@ fi sed -i -- "s/^Version: .*/Version: ${new_version}/" DESCRIPTION # update 1st line of NEWS.md sed -i -- "1s/.*/# AMR ${new_version}/" NEWS.md +rm *-- || true echo "• First 3 lines of DESCRIPTION:" head -3 DESCRIPTION echo diff --git a/index.md b/index.md index 4df057328..67b00c9e0 100644 --- a/index.md +++ b/index.md @@ -17,16 +17,15 @@ This package can be used for: * Reference for the taxonomy of microorganisms, since the package contains all microbial (sub)species from the [Catalogue of Life](http://www.catalogueoflife.org) ([manual](./reference/mo_property.html)) * Interpreting raw MIC and disk diffusion values, based on the latest CLSI or EUCAST guidelines ([manual](./reference/as.rsi.html)) + * Determining first isolates to be used for AMR analysis ([manual](./reference/first_isolate.html)) * Calculating antimicrobial resistance ([tutorial](./articles/AMR.html)) * Determining multi-drug resistance (MDR) / multi-drug resistant organisms (MDRO) ([tutorial](./articles/MDR.html)) - * Calculating empirical susceptibility of both mono therapy and combination therapy ([tutorial](./articles/AMR.html)) + * Calculating (empirical) susceptibility of both mono therapy and combination therapies ([tutorial](./articles/AMR.html)) * Predicting future antimicrobial resistance using regression models 
([tutorial](./articles/resistance_predict.html)) * Getting properties for any microorganism (like Gram stain, species, genus or family) ([manual](./reference/mo_property.html)) * Getting properties for any antibiotic (like name, ATC code, defined daily dose or trade name) ([manual](./reference/ab_property.html)) * Plotting antimicrobial resistance ([tutorial](./articles/AMR.html)) - * Determining first isolates to be used for AMR analysis ([manual](./reference/first_isolate.html)) * Applying EUCAST expert rules ([manual](./reference/eucast_rules.html)) - * Descriptive statistics: frequency tables, kurtosis and skewness ([tutorial](./articles/freq.html)) This package is ready-to-use for a professional environment by specialists in the following fields: @@ -117,6 +116,8 @@ Read more about the data from the Catalogue of Life [in our manual](./reference/ This package contains **all ~450 antimicrobial drugs** and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD, oral and IV) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC, https://www.whocc.no) and the [Pharmaceuticals Community Register of the European Commission](http://ec.europa.eu/health/documents/community-register/html/atc.htm). +**NOTE: The WHOCC copyright does not allow use for commercial purposes, unlike any other info from this package. See <https://www.whocc.no/copyright_disclaimer/>.** + Read more about the data from WHOCC [in our manual](./reference/WHOCC.html). #### WHONET / EARS-Net @@ -134,7 +135,7 @@ The `AMR` package basically does four important things: 1. It **cleanses existing data** by providing new *classes* for microoganisms, antibiotics and antimicrobial results (both S/I/R and MIC). By installing this package, you teach R everything about microbiology that is needed for analysis. These functions all use intelligent rules to guess results that you would expect: * Use `as.mo()` to get a microbial ID.
The IDs are human readable for the trained eye - the ID of *Klebsiella pneumoniae* is "B_KLBSL_PNE" (B stands for Bacteria) and the ID of *S. aureus* is "B_STPHY_AUR". The function takes almost any text as input that looks like the name or code of a microorganism like "E. coli", "esco" or "esccol" and tries to find expected results using intelligent rules combined with the included Catalogue of Life data set. It only takes milliseconds to find results, please see our [benchmarks](./articles/benchmarks.html). Moreover, it can group *Staphylococci* into coagulase negative and positive (CoNS and CoPS, see [source](./reference/as.mo.html#source)) and can categorise *Streptococci* into Lancefield groups (like beta-haemolytic *Streptococcus* Group B, [source](./reference/as.mo.html#source)). - * Use `as.ab()` to get an antibiotic ID. Like microbial IDs, these IDs are also human readable based on those used by EARS-Net. For example, the ID of amoxicillin is `AMX` and the ID of gentamicin is `GEN`. The `as.ab()` function also uses intelligent rules to find results like accepting misspelling, trade names and abbrevations used in many laboratory systems. For instance, the values "Furabid", "Furadantin", "nitro" all return the ID of Nitrofurantoine. To accomplish this, the package contains a database with most LIS codes, official names, trade names, DDDs and categories of antibiotics. The function `as.atc()` will return the ATC code of an antibiotic as defined by the WHO. + * Use `as.ab()` to get an antibiotic ID. Like microbial IDs, these IDs are also human readable based on those used by EARS-Net. For example, the ID of amoxicillin is `AMX` and the ID of gentamicin is `GEN`. The `as.ab()` function also uses intelligent rules to find results like accepting misspelling, trade names and abbrevations used in many laboratory systems. For instance, the values "Furabid", "Furadantin", "nitro" all return the ID of Nitrofurantoine. 
To accomplish this, the package contains a database with most LIS codes, official names, trade names, ATC codes, defined daily doses (DDD) and drug categories of antibiotics. * Use `as.rsi()` to get antibiotic interpretations based on raw MIC values (in mg/L) or disk diffusion values (in mm), or transform existing values to valid antimicrobial results. It produces just S, I or R based on your input and warns about invalid values. Even values like "<=0.002; S" (combined MIC/RSI) will result in "S". * Use `as.mic()` to cleanse your MIC values. It produces a so-called factor (called *ordinal* in SPSS) with valid MIC values as levels. A value like "<=0.002; S" (combined MIC/RSI) will result in "<=0.002". @@ -145,14 +146,13 @@ The `AMR` package basically does four important things: * You can also identify first *weighted* isolates of every patient, an adjusted version of the CLSI guideline. This takes into account key antibiotics of every strain and compares them. * Use `mdro()` (abbreviation of Multi Drug Resistant Organisms) to check your isolates for exceptional resistance with country-specific guidelines or EUCAST rules. Currently, national guidelines for Germany and the Netherlands are supported. * The [data set `microorganisms`](./reference/microorganisms.html) contains the complete taxonomic tree of ~65,000 microorganisms. Furthermore, some colloquial names and all Gram stains are available, which enables resistance analysis of e.g. different antibiotics per Gram stain. The package also contains functions to look up values in this data set like `mo_genus()`, `mo_family()`, `mo_gramstain()` or even `mo_phylum()`. As they use `as.mo()` internally, they also use the same intelligent rules for determination. For example, `mo_genus("MRSA")` and `mo_genus("S. aureus")` will both return `"Staphylococcus"`. They also come with support for German, Dutch, Spanish, Italian, French and Portuguese. These functions can be used to add new variables to your data. 
- * The [data set `antibiotics`](./reference/antibiotics.html) contains ~450 antimicrobial drugs with their EARS-Net code, ATC code, PubChem compound ID, official name, common LIS codes and DDDs of both oral and parenteral administration. It also contains all (thousands of) trade names found in PubChem. Use functions like `ab_name()`, `ab_group()` and `ab_tradenames()` to look up values. The `ab_*` functions use `as.ab()` internally so they support the same intelligent rules to guess the most probable result. For example, `ab_name("Fluclox")`, `ab_name("Floxapen")` and `ab_name("J01CF05")` will all return `"Flucloxacillin"`. These functions can again be used to add new variables to your data. + * The [data set `antibiotics`](./reference/antibiotics.html) contains ~450 antimicrobial drugs with their EARS-Net code, ATC code, PubChem compound ID, official name, common LIS codes and DDDs of both oral and parenteral administration. It also contains all (thousands of) trade names found in PubChem. The function `ab_atc()` will return the ATC code of an antibiotic as defined by the WHO. Use functions like `ab_name()`, `ab_group()` and `ab_tradenames()` to look up values. The `ab_*` functions use `as.ab()` internally so they support the same intelligent rules to guess the most probable result. For example, `ab_name("Fluclox")`, `ab_name("Floxapen")` and `ab_name("J01CF05")` will all return `"Flucloxacillin"`. These functions can again be used to add new variables to your data. 3. It **analyses the data** with convenient functions that use well-known methods. * Calculate the resistance (and even co-resistance) of microbial isolates with the `portion_R()`, `portion_IR()`, `portion_I()`, `portion_SI()` and `portion_S()` functions. Similarly, the *number* of isolates can be determined with the `count_R()`, `count_IR()`, `count_I()`, `count_SI()` and `count_S()` functions. All these functions can be used with the `dplyr` package (e.g. 
in conjunction with `summarise()`) * Plot AMR results with `geom_rsi()`, a function made for the `ggplot2` package * Predict antimicrobial resistance for the nextcoming years using logistic regression models with the `resistance_predict()` function - * Conduct descriptive statistics to enhance base R: calculate `kurtosis()`, `skewness()` and create frequency tables with `freq()` 4. It **teaches the user** how to use all the above actions. diff --git a/man/WHOCC.Rd b/man/WHOCC.Rd index 90593dc29..5cc0b7477 100644 --- a/man/WHOCC.Rd +++ b/man/WHOCC.Rd @@ -9,7 +9,7 @@ All antimicrobial drugs and their official names, ATC codes, ATC groups and defi \section{WHOCC}{ \if{html}{\figure{logo_who.png}{options: height=60px style=margin-bottom:5px} \cr} -This package contains \strong{all ~450 antimicrobial drugs} and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC, \url{https://www.whocc.no}) and the Pharmaceuticals Community Register of the European Commission (\url{http://ec.europa.eu/health/documents/community-register/html/atc.htm}). +This package contains \strong{all ~450 antimicrobial drugs} and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC, \url{https://www.whocc.no}) and the Pharmaceuticals Community Register of the European Commission (\url{http://ec.europa.eu/health/documents/community-register/html/atc.htm}). \strong{NOTE: The WHOCC copyright does not allow use for commercial purposes, unlike any other info from this package. See \url{https://www.whocc.no/copyright_disclaimer/}.} These have become the gold standard for international drug utilisation monitoring and research. 
diff --git a/man/antibiotics.Rd b/man/antibiotics.Rd index 406a9298b..83e1357b2 100644 --- a/man/antibiotics.Rd +++ b/man/antibiotics.Rd @@ -41,7 +41,7 @@ Synonyms (i.e. trade names) are derived from the Compound ID (\code{cid}) and co \section{WHOCC}{ \if{html}{\figure{logo_who.png}{options: height=60px style=margin-bottom:5px} \cr} -This package contains \strong{all ~450 antimicrobial drugs} and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC, \url{https://www.whocc.no}) and the Pharmaceuticals Community Register of the European Commission (\url{http://ec.europa.eu/health/documents/community-register/html/atc.htm}). +This package contains \strong{all ~450 antimicrobial drugs} and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC, \url{https://www.whocc.no}) and the Pharmaceuticals Community Register of the European Commission (\url{http://ec.europa.eu/health/documents/community-register/html/atc.htm}). \strong{NOTE: The WHOCC copyright does not allow use for commercial purposes, unlike any other info from this package. See \url{https://www.whocc.no/copyright_disclaimer/}.} These have become the gold standard for international drug utilisation monitoring and research. 
diff --git a/man/as.ab.Rd b/man/as.ab.Rd index 9ff2dd9a6..fe05a1948 100644 --- a/man/as.ab.Rd +++ b/man/as.ab.Rd @@ -35,7 +35,7 @@ European Commission Public Health PHARMACEUTICALS - COMMUNITY REGISTER: \url{htt \section{WHOCC}{ \if{html}{\figure{logo_who.png}{options: height=60px style=margin-bottom:5px} \cr} -This package contains \strong{all ~450 antimicrobial drugs} and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC, \url{https://www.whocc.no}) and the Pharmaceuticals Community Register of the European Commission (\url{http://ec.europa.eu/health/documents/community-register/html/atc.htm}). +This package contains \strong{all ~450 antimicrobial drugs} and their Anatomical Therapeutic Chemical (ATC) codes, ATC groups and Defined Daily Dose (DDD) from the World Health Organization Collaborating Centre for Drug Statistics Methodology (WHOCC, \url{https://www.whocc.no}) and the Pharmaceuticals Community Register of the European Commission (\url{http://ec.europa.eu/health/documents/community-register/html/atc.htm}). \strong{NOTE: The WHOCC copyright does not allow use for commercial purposes, unlike any other info from this package. See \url{https://www.whocc.no/copyright_disclaimer/}.} These have become the gold standard for international drug utilisation monitoring and research. 
diff --git a/man/as.mic.Rd b/man/as.mic.Rd index 25a09b472..b46617e93 100755 --- a/man/as.mic.Rd +++ b/man/as.mic.Rd @@ -47,6 +47,8 @@ as.rsi(x = as.mic(4), plot(mic_data) barplot(mic_data) + +library(clean) freq(mic_data) } \seealso{ diff --git a/man/as.rsi.Rd b/man/as.rsi.Rd index 2f19890e4..5751e1769 100755 --- a/man/as.rsi.Rd +++ b/man/as.rsi.Rd @@ -90,6 +90,8 @@ as.rsi(x = as.mic(4), plot(rsi_data) # for percentages barplot(rsi_data) # for frequencies + +library(clean) freq(rsi_data) # frequency table with informative header # using dplyr's mutate diff --git a/man/catalogue_of_life_version.Rd b/man/catalogue_of_life_version.Rd index e9dde6494..c39ce4604 100644 --- a/man/catalogue_of_life_version.Rd +++ b/man/catalogue_of_life_version.Rd @@ -30,6 +30,7 @@ On our website \url{https://msberends.gitlab.io/AMR} you can find \href{https:// \examples{ library(dplyr) +library(clean) microorganisms \%>\% freq(kingdom) microorganisms \%>\% group_by(kingdom) \%>\% freq(phylum, nmax = NULL) } diff --git a/man/eucast_rules.Rd b/man/eucast_rules.Rd index bf13470f1..d217c8cb8 100644 --- a/man/eucast_rules.Rd +++ b/man/eucast_rules.Rd @@ -33,7 +33,7 @@ eucast_rules(x, col_mo = NULL, info = TRUE, rules = c("breakpoints", \item{rules}{a character vector that specifies which rules should be applied - one or more of \code{c("breakpoints", "expert", "other", "all")}} -\item{verbose}{a logical to indicate whether extensive info should be returned as a \code{data.frame} with info about which rows and columns are effected. It runs all EUCAST rules, but will not be applied to an output - only an informative \code{data.frame} with changes will be returned as output.} +\item{verbose}{a logical to turn Verbose mode on and off (default is off). 
In Verbose mode, the function does not apply rules to the data, but instead returns a \code{data.frame} with extensive info about which rows and columns would be effected and in which way.} \item{...}{column name of an antibiotic, see section Antibiotics} } @@ -135,8 +135,6 @@ On our website \url{https://msberends.gitlab.io/AMR} you can find \href{https:// } \examples{ -a <- eucast_rules(septic_patients) - a <- data.frame(mo = c("Staphylococcus aureus", "Enterococcus faecalis", "Escherichia coli", @@ -172,7 +170,7 @@ b # 5 Pseudomonas aeruginosa R R - - R R R -# do not apply EUCAST rules, but rather get a a data.frame +# do not apply EUCAST rules, but rather get a data.frame # with 18 rows, containing all details about the transformations: c <- eucast_rules(a, verbose = TRUE) } diff --git a/man/freq.Rd b/man/freq.Rd deleted file mode 100755 index cafaf4ae3..000000000 --- a/man/freq.Rd +++ /dev/null @@ -1,234 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/freq.R -\name{freq} -\alias{freq} -\alias{frequency_tbl} -\alias{top_freq} -\alias{header} -\alias{print.freq} -\title{Frequency table} -\usage{ -freq(x, ..., sort.count = TRUE, nmax = getOption("max.print.freq"), - na.rm = TRUE, row.names = TRUE, markdown = !interactive(), - digits = 2, quote = NULL, header = TRUE, title = NULL, - na = "", droplevels = TRUE, sep = " ", - decimal.mark = getOption("OutDec"), big.mark = ifelse(decimal.mark != - ",", ",", ".")) - -frequency_tbl(x, ..., sort.count = TRUE, - nmax = getOption("max.print.freq"), na.rm = TRUE, row.names = TRUE, - markdown = !interactive(), digits = 2, quote = NULL, - header = TRUE, title = NULL, na = "", droplevels = TRUE, - sep = " ", decimal.mark = getOption("OutDec"), - big.mark = ifelse(decimal.mark != ",", ",", ".")) - -top_freq(f, n) - -header(f, property = NULL) - -\method{print}{freq}(x, nmax = getOption("max.print.freq", default = 15), - markdown = !interactive(), header = TRUE, - decimal.mark = 
getOption("OutDec"), big.mark = ifelse(decimal.mark != - ",", ",", "."), ...) -} -\arguments{ -\item{x}{vector of any class or a \code{\link{data.frame}}, \code{\link{tibble}} (may contain a grouping variable) or \code{\link{table}}} - -\item{...}{up to nine different columns of \code{x} when \code{x} is a \code{data.frame} or \code{tibble}, to calculate frequencies from - see Examples. Also supports quasiquotion.} - -\item{sort.count}{sort on count, i.e. frequencies. This will be \code{TRUE} at default for everything except when using grouping variables.} - -\item{nmax}{number of row to print. The default, \code{15}, uses \code{\link{getOption}("max.print.freq")}. Use \code{nmax = 0}, \code{nmax = Inf}, \code{nmax = NULL} or \code{nmax = NA} to print all rows.} - -\item{na.rm}{a logical value indicating whether \code{NA} values should be removed from the frequency table. The header (if set) will always print the amount of \code{NA}s.} - -\item{row.names}{a logical value indicating whether row indices should be printed as \code{1:nrow(x)}} - -\item{markdown}{a logical value indicating whether the frequency table should be printed in markdown format. This will print all rows (except when \code{nmax} is defined) and is default behaviour in non-interactive R sessions (like when knitting RMarkdown files).} - -\item{digits}{how many significant digits are to be used for numeric values in the header (not for the items themselves, that depends on \code{\link{getOption}("digits")})} - -\item{quote}{a logical value indicating whether or not strings should be printed with surrounding quotes. 
Default is to print them only around characters that are actually numeric values.} - -\item{header}{a logical value indicating whether an informative header should be printed} - -\item{title}{text to show above frequency table, at default to tries to coerce from the variables passed to \code{x}} - -\item{na}{a character string that should be used to show empty (\code{NA}) values (only useful when \code{na.rm = FALSE})} - -\item{droplevels}{a logical value indicating whether in factors empty levels should be dropped} - -\item{sep}{a character string to separate the terms when selecting multiple columns} - -\item{decimal.mark}{% - used for prettying (longish) numerical and complex sequences. - Passed to \code{\link{prettyNum}}: that help page explains the details.} - -\item{big.mark}{% - used for prettying (longish) numerical and complex sequences. - Passed to \code{\link{prettyNum}}: that help page explains the details.} - -\item{f}{a frequency table} - -\item{n}{number of top \emph{n} items to return, use -n for the bottom \emph{n} items. It will include more than \code{n} rows if there are ties.} - -\item{property}{property in header to return this value directly} -} -\value{ -A \code{data.frame} (with an additional class \code{"freq"}) with five columns: \code{item}, \code{count}, \code{percent}, \code{cum_count} and \code{cum_percent}. -} -\description{ -Create a frequency table of a vector with items or a \code{data.frame}. Supports quasiquotation and markdown for reports. Best practice is: \code{data \%>\% freq(var)}.\cr -\code{top_freq} can be used to get the top/bottom \emph{n} items of a frequency table, with counts as names. -} -\details{ -Frequency tables (or frequency distributions) are summaries of the distribution of values in a sample. With the `freq` function, you can create univariate frequency tables. Multiple variables will be pasted into one variable, so it forces a univariate distribution. 
This package also has a vignette available to explain the use of this function further, run \code{browseVignettes("AMR")} to read it. - -For numeric values of any class, these additional values will all be calculated with \code{na.rm = TRUE} and shown into the header: -\itemize{ - \item{Mean, using \code{\link[base]{mean}}} - \item{Standard Deviation, using \code{\link[stats]{sd}}} - \item{Coefficient of Variation (CV), the standard deviation divided by the mean} - \item{Mean Absolute Deviation (MAD), using \code{\link[stats]{mad}}} - \item{Tukey Five-Number Summaries (minimum, Q1, median, Q3, maximum), using \code{\link[stats]{fivenum}}} - \item{Interquartile Range (IQR) calculated as \code{Q3 - Q1} using the Tukey Five-Number Summaries, i.e. \strong{not} using the \code{\link[stats]{quantile}} function} - \item{Coefficient of Quartile Variation (CQV, sometimes called coefficient of dispersion), calculated as \code{(Q3 - Q1) / (Q3 + Q1)} using the Tukey Five-Number Summaries} - \item{Outliers (total count and unique count), using \code{\link[grDevices]{boxplot.stats}}} -} - -For dates and times of any class, these additional values will be calculated with \code{na.rm = TRUE} and shown into the header: -\itemize{ - \item{Oldest, using \code{\link{min}}} - \item{Newest, using \code{\link{max}}, with difference between newest and oldest} - \item{Median, using \code{\link[stats]{median}}, with percentage since oldest} -} - -In factors, all factor levels that are not existing in the input data will be dropped. - -The function \code{top_freq} uses \code{\link[dplyr]{top_n}} internally and will include more than \code{n} rows if there are ties. 
-} -\section{Read more on our website!}{ - -On our website \url{https://msberends.gitlab.io/AMR} you can find \href{https://msberends.gitlab.io/AMR/articles/AMR.html}{a tutorial} about how to conduct AMR analysis, the \href{https://msberends.gitlab.io/AMR/reference}{complete documentation of all functions} (which reads a lot easier than here in R) and \href{https://msberends.gitlab.io/AMR/articles/WHONET.html}{an example analysis using WHONET data}. -} - -\examples{ -library(dplyr) - -# this all gives the same result: -freq(septic_patients$hospital_id) -freq(septic_patients[, "hospital_id"]) -septic_patients$hospital_id \%>\% freq() -septic_patients[, "hospital_id"] \%>\% freq() -septic_patients \%>\% freq("hospital_id") -septic_patients \%>\% freq(hospital_id) #<- easiest to remember (tidyverse) - - -# you could also use `select` or `pull` to get your variables -septic_patients \%>\% - filter(hospital_id == "A") \%>\% - select(mo) \%>\% - freq() - - -# multiple selected variables will be pasted together -septic_patients \%>\% - left_join_microorganisms \%>\% - freq(genus, species) - -# functions as quasiquotation are also supported -septic_patients \%>\% - freq(mo_genus(mo), mo_species(mo)) - - -# group a variable and analyse another -septic_patients \%>\% - group_by(hospital_id) \%>\% - freq(gender) - - -# get top 10 bugs of hospital A as a vector -septic_patients \%>\% - filter(hospital_id == "A") \%>\% - freq(mo) \%>\% - top_freq(10) - - -# save frequency table to an object -years <- septic_patients \%>\% - mutate(year = format(date, "\%Y")) \%>\% - freq(year) - - -# show only the top 5 -years \%>\% print(nmax = 5) - - -# save to an object with formatted percentages -years <- format(years) - - -# print a histogram of numeric values -septic_patients \%>\% - freq(age) \%>\% - hist() - -# or a boxplot of numeric values -septic_patients \%>\% - freq(age) \%>\% - boxplot() - -# or even a boxplot per group -septic_patients \%>\% - group_by(hospital_id) \%>\% - 
freq(age) \%>\% - boxplot() - -# or print all points to a regular plot -septic_patients \%>\% - freq(age) \%>\% - plot() - - -# transform to a data.frame or tibble -septic_patients \%>\% - freq(age) \%>\% - as.data.frame() - - -# or transform (back) to a vector -septic_patients \%>\% - freq(age) \%>\% - as.vector() - -identical(septic_patients \%>\% - freq(age) \%>\% - as.vector() \%>\% - sort(), - sort(septic_patients$age)) # TRUE - - -# it also supports `table` objects -table(septic_patients$gender, - septic_patients$age) \%>\% - freq(sep = " **sep** ") - - -# only get selected columns -septic_patients \%>\% - freq(hospital_id) \%>\% - select(item, percent) - -septic_patients \%>\% - freq(hospital_id) \%>\% - select(-count, -cum_count) - - -# check differences between frequency tables -diff(freq(septic_patients$TMP), - freq(septic_patients$SXT)) -} -\keyword{freq} -\keyword{frequency} -\keyword{summarise} -\keyword{summary} diff --git a/man/like.Rd b/man/like.Rd index 6823a643e..74d820fdc 100755 --- a/man/like.Rd +++ b/man/like.Rd @@ -56,6 +56,7 @@ a \%like\% b # get frequencies of bacteria whose name start with 'Ent' or 'ent' library(dplyr) +library(clean) septic_patients \%>\% left_join_microorganisms() \%>\% filter(genus \%like\% '^ent') \%>\% diff --git a/man/mdro.Rd b/man/mdro.Rd index f1f76a059..a0d1b09e8 100644 --- a/man/mdro.Rd +++ b/man/mdro.Rd @@ -18,7 +18,7 @@ Rijksinstituut voor Volksgezondheid en Milieu "WIP-richtlijn BRMO (Bijzonder Res mdro(x, guideline = NULL, col_mo = NULL, info = TRUE, verbose = FALSE, ...) -brmo(..., guideline = "BRMO") +brmo(x, guideline = "BRMO", ...) mrgn(x, guideline = "MRGN", ...) 
diff --git a/man/microorganisms.Rd b/man/microorganisms.Rd index 7df34f039..0e7ff53b4 100755 --- a/man/microorganisms.Rd +++ b/man/microorganisms.Rd @@ -4,7 +4,7 @@ \name{microorganisms} \alias{microorganisms} \title{Data set with ~65,000 microorganisms} -\format{A \code{\link{data.frame}} with 67,906 observations and 16 variables: +\format{A \code{\link{data.frame}} with 68,260 observations and 16 variables: \describe{ \item{\code{mo}}{ID of microorganism as used by this package} \item{\code{col_id}}{Catalogue of Life ID} @@ -33,8 +33,8 @@ Manually added were: \item{9 entries of \emph{Streptococcus} (beta haemolytic groups A, B, C, D, F, G, H, K and unspecified)} \item{2 entries of \emph{Staphylococcus} (coagulase-negative [CoNS] and coagulase-positive [CoPS])} \item{3 entries of Trichomonas (Trichomonas vaginalis, and its family and genus)} - \item{3 other 'undefined' entries (unknown, unknown Gram negatives and unknown Gram positives)} - \item{8,830 species from the DSMZ (Deutsche Sammlung von Mikroorganismen und Zellkulturen) that are not in the Catalogue of Life} + \item{5 other 'undefined' entries (unknown, unknown Gram negatives, unknown Gram positives, unknown yeast and unknown fungus)} + \item{8,970 species from the DSMZ (Deutsche Sammlung von Mikroorganismen und Zellkulturen) that are not in the Catalogue of Life} } } \section{About the records from DSMZ (see source)}{ diff --git a/man/microorganisms.old.Rd b/man/microorganisms.old.Rd index 11d67a48f..c685c73af 100644 --- a/man/microorganisms.old.Rd +++ b/man/microorganisms.old.Rd @@ -4,7 +4,7 @@ \name{microorganisms.old} \alias{microorganisms.old} \title{Data set with previously accepted taxonomic names} -\format{A \code{\link{data.frame}} with 21,342 observations and 4 variables: +\format{A \code{\link{data.frame}} with 21,743 observations and 4 variables: \describe{ \item{\code{col_id}}{Catalogue of Life ID that was originally given} \item{\code{col_id_new}}{New Catalogue of Life ID that responds to an 
entry in the \code{\link{microorganisms}} data set} diff --git a/man/resistance_predict.Rd b/man/resistance_predict.Rd index b82ada42d..3862a718c 100644 --- a/man/resistance_predict.Rd +++ b/man/resistance_predict.Rd @@ -38,9 +38,9 @@ ggplot_rsi_predict(x, main = paste("Resistance Prediction of", x_name), \item{minimum}{minimal amount of available isolates per year to include. Years containing less observations will be estimated by the model.} -\item{model}{the statistical model of choice. Defaults to a generalised linear regression model with binomial distribution, assuming that a period of zero resistance was followed by a period of increasing resistance leading slowly to more and more resistance. See Details for valid options.} +\item{model}{the statistical model of choice. Defaults to a generalised linear regression model with binomial distribution (i.e. using \code{\link{glm}(..., family = \link{binomial})}), assuming that a period of zero resistance was followed by a period of increasing resistance leading slowly to more and more resistance. See Details for valid options.} -\item{I_as_S}{a logical to indicate whether values \code{I} should be treated as \code{S}} +\item{I_as_S}{a logical to indicate whether values \code{I} should be treated as \code{S} (will otherwise be treated as \code{R})} \item{preserve_measurements}{a logical to indicate whether predictions of years that are actually available in the data should be overwritten by the original data. 
The standard errors of those years will be \code{NA}.} diff --git a/pkgdown/extra.css b/pkgdown/extra.css index 1ab8e5306..527655960 100644 --- a/pkgdown/extra.css +++ b/pkgdown/extra.css @@ -187,8 +187,10 @@ table a:not(.btn):hover, .table a:not(.btn):hover { /* text below header in manual overview */ .template-reference-index h2 ~ p { - font-size: 110%; - /* font-weight: bold; */ + font-size: 16px; +} +.template-reference-topic h2 { + font-size: 24px; } /* logos on index page */ diff --git a/pkgdown/extra.js b/pkgdown/extra.js index d2ce07dc8..69ef2e00d 100644 --- a/pkgdown/extra.js +++ b/pkgdown/extra.js @@ -46,6 +46,9 @@ $( document ).ready(function() { window.location.replace(url_new); } + // Replace 'Value' in the manual with 'Returned value' + $(".template-reference-topic h2#value").text("Returned value"); + // PR for 'R for Data Science' on How To pages if ($(".template-article").length > 0) { $('#sidebar').prepend( diff --git a/tests/testthat/test-eucast_rules.R b/tests/testthat/test-eucast_rules.R index 270297771..e471de06d 100755 --- a/tests/testthat/test-eucast_rules.R +++ b/tests/testthat/test-eucast_rules.R @@ -25,7 +25,7 @@ test_that("EUCAST rules work", { # thoroughly check input table expect_equal(colnames(eucast_rules_file), - c("if_mo_property", "like_is_one_of", "this_value", + c("if_mo_property", "like.is.one_of", "this_value", "and_these_antibiotics", "have_these_values", "then_change_these_antibiotics", "to_value", "reference.rule", "reference.rule_group")) diff --git a/tests/testthat/test-extended.R b/tests/testthat/test-extended.R new file mode 100644 index 000000000..9d79dc257 --- /dev/null +++ b/tests/testthat/test-extended.R @@ -0,0 +1,29 @@ +# ==================================================================== # +# TITLE # +# Antimicrobial Resistance (AMR) Analysis # +# # +# SOURCE # +# https://gitlab.com/msberends/AMR # +# # +# LICENCE # +# (c) 2019 Berends MS (m.s.berends@umcg.nl), Luz CF (c.f.luz@umcg.nl) # +# # +# This R package is free 
software; you can freely use and distribute # +# it for both personal and commercial purposes under the terms of the # +# GNU General Public License version 2.0 (GNU GPL-2), as published by # +# the Free Software Foundation. # +# # +# This R package was created for academic research and was publicly # +# released in the hope that it will be useful, but it comes WITHOUT # +# ANY WARRANTY OR LIABILITY. # +# Visit our website for more info: https://msberends.gitlab.io/AMR. # +# ==================================================================== # + +context("extended.R") + +test_that("extensions work", { + + expect_identical(scale_type.mo(), "discrete") + expect_identical(scale_type.ab(), "discrete") + +}) diff --git a/tests/testthat/test-freq.R b/tests/testthat/test-freq.R index 8e95ad034..5d329901f 100755 --- a/tests/testthat/test-freq.R +++ b/tests/testthat/test-freq.R @@ -22,168 +22,12 @@ context("freq.R") test_that("frequency table works", { - library(dplyr) - - expect_equal(nrow(freq(c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5))), 5) - expect_equal(nrow(frequency_tbl(c(1, 1, 2, 2, 3, 3, 4, 4, 5, 5))), 5) - - # date column of septic_patients should contain 1140 unique dates - expect_equal(nrow(freq(septic_patients$date)), 1140) - expect_equal(nrow(freq(septic_patients$date)), - length(unique(septic_patients$date))) - - expect_output(print(septic_patients %>% freq(age))) - expect_output(print(septic_patients %>% freq(age, nmax = 5))) - expect_output(print(septic_patients %>% freq(age, nmax = Inf, markdown = FALSE))) - expect_output(print(freq(septic_patients$age, nmax = Inf))) - expect_output(print(freq(septic_patients$age, nmax = NA))) - expect_output(print(freq(septic_patients$age, nmax = NULL))) - expect_output(print(freq(septic_patients$age, sort.count = FALSE))) - expect_output(print(freq(septic_patients$age, markdown = TRUE))) - expect_output(print(freq(septic_patients$age, markdown = TRUE), markdown = FALSE)) - expect_output(print(freq(septic_patients$age, markdown = 
TRUE), markdown = TRUE)) - expect_output(print(freq(septic_patients$age[0]))) - expect_output(print(freq(septic_patients$age, quote = TRUE))) - expect_output(print(freq(septic_patients$age, markdown = TRUE, title = "TITLE"))) - - # character - expect_output(print(freq(microorganisms$genus))) - expect_output(print(structure(freq(microorganisms$genus), - # check printing of old class: - class = c("frequency_tbl", "data.frame")))) + library(clean) # mo - expect_output(print(freq(septic_patients$mo))) + expect_true(is.freq(freq(septic_patients$mo))) # rsi - expect_output(print(freq(septic_patients$AMX))) - # integer - expect_output(print(freq(septic_patients$age))) - # date - expect_output(print(freq(septic_patients$date))) - # factor - expect_output(print(freq(septic_patients$hospital_id))) - # table - expect_output(print(freq(table(septic_patients$gender, septic_patients$age)))) - # rsi - expect_output(print(freq(septic_patients$AMC))) - # hms - expect_output(print(freq(hms::as.hms(sample(c(0:86399), 50))))) - # matrix - expect_output(print(freq(as.matrix(septic_patients$age)))) - expect_output(print(freq(as.matrix(septic_patients[, c("age", "gender")])))) - # list - expect_output(print(freq(list(age = septic_patients$age)))) - expect_output(print(freq(list(age = septic_patients$age, gender = septic_patients$gender)))) - # difftime - expect_output(print( - freq(difftime(Sys.time(), - Sys.time() - runif(5, min = 0, max = 60 * 60 * 24), - units = "hours")))) - - expect_output(print(freq(septic_patients$age)[,1:3])) - + expect_true(is.freq(freq(septic_patients$AMX))) library(dplyr) - expect_output(septic_patients %>% select(1:2) %>% freq() %>% print()) - expect_output(septic_patients %>% select(1:3) %>% freq() %>% print()) - expect_output(septic_patients %>% select(1:4) %>% freq() %>% print()) - expect_output(septic_patients %>% select(1:5) %>% freq() %>% print()) - expect_output(septic_patients %>% select(1:6) %>% freq() %>% print()) - expect_output(septic_patients %>% 
select(1:7) %>% freq() %>% print()) - expect_output(septic_patients %>% select(1:8) %>% freq() %>% print()) - expect_output(septic_patients %>% select(1:9) %>% freq() %>% print()) - expect_output(print(freq(septic_patients$age), nmax = 20)) - - # grouping variable - expect_output(print(septic_patients %>% group_by(gender) %>% freq(hospital_id))) - expect_output(print(septic_patients %>% group_by(gender) %>% freq(AMX, quote = TRUE))) - expect_output(print(septic_patients %>% group_by(gender) %>% freq(AMX, markdown = TRUE))) - - # quasiquotation - expect_output(print(septic_patients %>% freq(mo_genus(mo)))) - expect_output(print(septic_patients %>% freq(mo, mo_genus(mo)))) - expect_output(print(septic_patients %>% group_by(gender) %>% freq(mo_genus(mo)))) - expect_output(print(septic_patients %>% group_by(gender) %>% freq(mo, mo_genus(mo)))) - - # top 5 - expect_equal( - septic_patients %>% - freq(mo) %>% - top_freq(5) %>% - length(), - 5) - # there are more than 5 lowest values - expect_gt( - septic_patients %>% - freq(mo) %>% - top_freq(-5) %>% - length(), - 5) - # n has length > 1 - expect_error( - septic_patients %>% - freq(mo) %>% - top_freq(n = c(1, 2)) - ) - # input must be freq tbl - expect_error(septic_patients %>% top_freq(1)) - - # charts from plot, hist and boxplot, should not raise errors - plot(freq(septic_patients, age)) - hist(freq(septic_patients, age)) - boxplot(freq(septic_patients, age)) - boxplot(freq(dplyr::group_by(septic_patients, gender), age)) - - # check vector - expect_identical(septic_patients %>% - freq(age) %>% - as.vector() %>% - sort(), - septic_patients %>% - pull(age) %>% - sort()) - - # check format - expect_identical(septic_patients %>% - freq(age) %>% - format() %>% - apply(2, class) %>% - unname(), - rep("character", 5)) - - # check tibble - expect_identical(septic_patients %>% - freq(age) %>% - as_tibble() %>% - class() %>% - .[1], - "tbl_df") - - expect_error(septic_patients %>% freq(nonexisting)) - 
expect_error(septic_patients %>% select(1:10) %>% freq()) - expect_error(septic_patients %>% freq(peni, oxac, clox, AMX, AMC, - ampi, pita, czol, cfep, cfur)) - - # (un)select columns - expect_equal(septic_patients %>% freq(hospital_id) %>% select(item) %>% ncol(), - 1) - expect_equal(septic_patients %>% freq(hospital_id) %>% select(-item) %>% ncol(), - 4) - - # run diff - expect_output(print( - diff(freq(septic_patients$AMC), - freq(septic_patients$AMX)) - )) - expect_output(print( - diff(freq(septic_patients$age), - freq(septic_patients$age)) # "No differences found." - )) - expect_error(print( - diff(freq(septic_patients$AMX), - "Just a string") # not a freq tbl - )) - - # directly on group - expect_output(print(septic_patients %>% group_by(ageplusone = as.character(age + 1)) %>% freq(ageplusone))) - + expect_true(is.freq(septic_patients %>% freq(mo))) }) diff --git a/tests/testthat/test-mo_property.R b/tests/testthat/test-mo_property.R index 94dd43746..969948d41 100644 --- a/tests/testthat/test-mo_property.R +++ b/tests/testthat/test-mo_property.R @@ -50,6 +50,7 @@ test_that("mo_property works", { expect_equal(mo_year("Escherichia coli"), 1919) expect_equal(mo_shortname("Escherichia coli"), "E. coli") + expect_equal(mo_shortname("Escherichia"), "E. spp.") expect_equal(mo_shortname("Staphylococcus aureus"), "S. aureus") expect_equal(mo_shortname("Staphylococcus aureus", Becker = TRUE), "S. 
aureus") expect_equal(mo_shortname("Staphylococcus aureus", Becker = "all", language = "en"), "CoPS") diff --git a/tests/testthat/test-resistance_predict.R b/tests/testthat/test-resistance_predict.R index f0da07cde..3fd90b8e9 100644 --- a/tests/testthat/test-resistance_predict.R +++ b/tests/testthat/test-resistance_predict.R @@ -22,15 +22,15 @@ context("portion.R") test_that("prediction of rsi works", { - AMX_R <- septic_patients %>% - filter(mo == "B_ESCHR_COL") %>% - rsi_predict(col_ab = "AMX", - col_date = "date", - minimum = 10, - info = TRUE) %>% - pull("value") - # AMX resistance will increase according to data set `septic_patients` - expect_true(AMX_R[3] < AMX_R[20]) + # AMX_R <- septic_patients %>% + # filter(mo == "B_ESCHR_COL") %>% + # rsi_predict(col_ab = "AMX", + # col_date = "date", + # minimum = 10, + # info = TRUE) %>% + # pull("value") + # # AMX resistance will increase according to data set `septic_patients` + # expect_true(AMX_R[3] < AMX_R[20]) x <- resistance_predict(septic_patients, col_ab = "AMX", year_min = 2010) plot(x) diff --git a/vignettes/AMR.Rmd b/vignettes/AMR.Rmd index 97fbef2e6..4dce8bf3e 100755 --- a/vignettes/AMR.Rmd +++ b/vignettes/AMR.Rmd @@ -144,7 +144,14 @@ knitr::kable(head(data), align = "c") Now, let's start the cleaning and the analysis! # Cleaning the data -Use the frequency table function `freq()` to look specifically for unique values in any variable. For example, for the `gender` variable: + +We also created a package dedicated to data cleaning and checking, called the `clean` package. It gets automatically installed with the `AMR` package, so we only have to load it: + +```{r lib clean, message = FALSE} +library(clean) +``` + +Use the frequency table function `freq()` from this `clean` package to look specifically for unique values in any variable. 
For example, for the `gender` variable: ```{r freq gender 1, eval = FALSE} data %>% freq(gender) # this would be the same: freq(data$gender) diff --git a/vignettes/MDR.Rmd b/vignettes/MDR.Rmd index 356389b62..601e2988f 100644 --- a/vignettes/MDR.Rmd +++ b/vignettes/MDR.Rmd @@ -72,7 +72,13 @@ We can now add the interpretation of MDR-TB to our data set: my_TB_data$mdr <- mdr_tb(my_TB_data) ``` -And review the result with a frequency table: +We also created a package dedicated to data cleaning and checking, called the `clean` package. It gets automatically installed with the `AMR` package, so we only have to load it: + +```{r lib clean, message = FALSE} +library(clean) +``` + +It contains the `freq()` function, to create a frequency table: ```{r, results = 'asis'} freq(my_TB_data$mdr) diff --git a/vignettes/WHONET.Rmd b/vignettes/WHONET.Rmd index 667bdc5de..6dda56038 100644 --- a/vignettes/WHONET.Rmd +++ b/vignettes/WHONET.Rmd @@ -60,7 +60,17 @@ data <- WHONET %>% mutate_at(vars(AMP_ND10:CIP_EE), as.rsi) ``` -No errors or warnings, so all values are transformed succesfully. Let's check it though, with a couple of frequency tables: +No errors or warnings, so all values are transformed successfully. + +We created a package dedicated to data cleaning and checking, called the `clean` package. It gets automatically installed with the `AMR` package, so we only have to load it: + +```{r lib clean, message = FALSE} +library(clean) +``` + +It contains the `freq()` function, to create frequency tables. + +So let's check our data, with a couple of frequency tables: ```{r, results = 'asis'} # our newly created `mo` variable diff --git a/vignettes/freq.Rmd b/vignettes/freq.Rmd deleted file mode 100644 index e450dc2df..000000000 --- a/vignettes/freq.Rmd +++ /dev/null @@ -1,182 +0,0 @@ ---- -title: "How to create frequency tables" -author: "Matthijs S. 
Berends" -date: '`r format(Sys.Date(), "%d %B %Y")`' -output: - rmarkdown::html_vignette: - toc: true - toc_depth: 3 -vignette: > - %\VignetteIndexEntry{How to create frequency tables} - %\VignetteEncoding{UTF-8} - %\VignetteEngine{knitr::rmarkdown} -editor_options: - chunk_output_type: console ---- - -```{r setup, include = FALSE, results = 'asis'} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#", - results = 'asis', - fig.width = 7.5, - fig.height = 4.5 -) -library(dplyr) -library(AMR) -``` - -## Introduction - -Frequency tables (or frequency distributions) are summaries of the distribution of values in a sample. With the `freq()` function, you can create univariate frequency tables. Multiple variables will be pasted into one variable, so it forces a univariate distribution. We take the `septic_patients` dataset (included in this AMR package) as example. - -## Frequencies of one variable - -To only show and quickly review the content of one variable, you can just select this variable in various ways. Let's say we want to get the frequencies of the `gender` variable of the `septic_patients` dataset: -```{r, echo = TRUE} -# Any of these will work: -# freq(septic_patients$gender) -# freq(septic_patients[, "gender"]) - -# Using tidyverse: -# septic_patients$gender %>% freq() -# septic_patients[, "gender"] %>% freq() -# septic_patients %>% freq("gender") - -# Probably the fastest and easiest: -septic_patients %>% freq(gender) -``` -This immediately shows the class of the variable, its length and availability (i.e. the amount of `NA`), the amount of unique values and (most importantly) that among septic patients men are more prevalent than women. - -## Frequencies of more than one variable - -Multiple variables will be pasted into one variable to review individual cases, keeping a univariate frequency table. 
- -For illustration, we could add some more variables to the `septic_patients` dataset to learn about bacterial properties: -```{r, echo = TRUE, results = 'hide'} -my_patients <- septic_patients %>% left_join_microorganisms() -``` -Now all variables of the `microorganisms` dataset have been joined to the `septic_patients` dataset. The `microorganisms` dataset consists of the following variables: -```{r, echo = TRUE, results = 'markup'} -colnames(microorganisms) -``` - -If we compare the dimensions between the old and new dataset, we can see that these `r ncol(my_patients) - ncol(septic_patients)` variables were added: -```{r, echo = TRUE, results = 'markup'} -dim(septic_patients) -dim(my_patients) -``` - -So now the `genus` and `species` variables are available. A frequency table of these combined variables can be created like this: -```{r, echo = TRUE} -my_patients %>% - freq(genus, species, nmax = 15) -``` - -## Frequencies of numeric values - -Frequency tables can be created of any input. - -In case of numeric values (like integers, doubles, etc.) 
additional descriptive statistics will be calculated and shown into the header: - -```{r, echo = TRUE} -# # get age distribution of unique patients -septic_patients %>% - distinct(patient_id, .keep_all = TRUE) %>% - freq(age, nmax = 5, header = TRUE) -``` - -So the following properties are determined, where `NA` values are always ignored: - -* **Mean** - -* **Standard deviation** - -* **Coefficient of variation** (CV), the standard deviation divided by the mean - -* **Mean absolute deviation** (MAD), the median of the absolute deviations from the median - a more robust statistic than the standard deviation - -* **Five numbers of Tukey**, namely: the minimum, Q1, median, Q3 and maximum - -* **Interquartile range** (IQR), the distance between Q1 and Q3 - -* **Coefficient of quartile variation** (CQV, sometimes called *coefficient of dispersion*), calculated as (Q3 - Q1) / (Q3 + Q1) using `quantile()` with `type = 6` as quantile algorithm to comply with SPSS standards - -* **Outliers** (total count and unique count) - -So for example, the above frequency table quickly shows the median age of patients being `r my_patients %>% distinct(patient_id, .keep_all = TRUE) %>% pull(age) %>% median(na.rm = TRUE)`. - -## Frequencies of factors - -To sort frequencies of factors on their levels instead of item count, use the `sort.count` parameter. - -`sort.count` is `TRUE` by default. Compare this default behaviour... - -```{r, echo = TRUE} -septic_patients %>% - freq(hospital_id) -``` - -... to this, where items are now sorted on factor levels: - -```{r, echo = TRUE} -septic_patients %>% - freq(hospital_id, sort.count = FALSE) -``` - -All classes will be printed into the header. 
Variables with the new `rsi` class of this AMR package are actually ordered factors and have three classes (look at `Class` in the header): - -```{r, echo = TRUE} -septic_patients %>% - freq(AMX, header = TRUE) -``` - -## Frequencies of dates - -Frequencies of dates will show the oldest and newest date in the data, and the amount of days between them: - -```{r, echo = TRUE} -septic_patients %>% - freq(date, nmax = 5, header = TRUE) -``` - -## Assigning a frequency table to an object - -A frequency table is actually a regular `data.frame`, with the exception that it contains an additional class. - -```{r, echo = TRUE} -my_df <- septic_patients %>% freq(age) -class(my_df) -``` - -Because of this additional class, a frequency table prints like the examples above. But the object itself contains the complete table without a row limitation: - -```{r, echo = TRUE} -dim(my_df) -``` - -## Additional parameters - -### Parameter `na.rm` -With the `na.rm` parameter you can remove `NA` values from the frequency table (defaults to `TRUE`, but the number of `NA` values will always be shown into the header): - -```{r, echo = TRUE} -septic_patients %>% - freq(AMX, na.rm = FALSE) -``` - -### Parameter `row.names` -A frequency table shows row indices. To remove them, use `row.names = FALSE`: - -```{r, echo = TRUE} -septic_patients %>% - freq(hospital_id, row.names = FALSE) -``` - -### Parameter `markdown` -The `markdown` parameter is `TRUE` at default in non-interactive sessions, like in reports created with R Markdown. This will always print all rows, unless `nmax` is set. Without markdown (like in regular R), a frequency table would print like: - -```{r, echo = TRUE, results = 'markup'} -septic_patients %>% - freq(hospital_id, markdown = FALSE) -```