Merge pull request #164 from taxonomicallyinformedannotation/dev

WIP #140
taxonomicallyinformedannotation · Jul 24, 2024 · c330d1d · c330d1d
2 parents 9c1b09a + 7954343
commit c330d1d
Show file tree

Hide file tree

Showing 25 changed files with 610 additions and 599 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -85,6 +85,7 @@ Collate:
     'parse_adduct.R'
     'calculate_mass_of_m.R'
     'clean_bio.R'
+    'filter_high_confidence_only.R'
     'columns_model.R'
     'clean_collapse.R'
     'clean_chemo.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -31,6 +31,7 @@ export(fake_hmdb)
 export(fake_lotus)
 export(fake_sop_columns)
 export(filter_annotations)
+export(filter_high_confidence_only)
 export(get_example_sirius)
 export(get_file)
 export(get_gnps_tables)

diff --git a/NEWS.md b/NEWS.md
@@ -2,161 +2,162 @@
 
 # timaR 2.9.7
 
-* Adding possibility to add internal libraries through the GUI (#159)
-* Adding number of peaks in spectrum
+* Added possibility to add internal libraries through the GUI (#159)
+* Added number of peaks in spectrum
 * Clearer handling of SIRIUS scores (#146, #147)
-* Expose more parameters to the GUI (#159)
-* Fix adducts and remove nitrogen rule
-* Fix number of matched peaks
+* Exposed more parameters to the GUI (#159)
+* Fixed adducts and removed nitrogen rule
+* Fixed number of matched peaks
 * Improved imports
+* Reduced warnings
 * Updated benchmarking steps
 
 # timaR 2.9.6
 
-* Adding light-switch thanks to `pkgdown 2.1.0`.
+* Added light-switch thanks to `pkgdown 2.1.0`.
 * Attempt to simplify installation
-* Fix library/adducts confusion (#123)
-* Fix some incorrect adduct differences annotations
+* Fixed library/adducts confusion (#123)
+* Fixed some incorrect adduct differences annotations
 * Refactored adducts / neutral losses / dimers annotation to allow for more flexibility (#141, #144)
 
 # timaR 2.9.5
 
 * Do not re-package if already the latest version
 * SIRIUS 6 default and compatible (keeping SIRIUS 5 backward compatibility)
-* Update to Massbank version `2024.06`
+* Updated to Massbank version `2024.06`
 
 # timaR 2.9.4
 
 * Automated update
 * Added an option to remove ties (#134)
 * Added some details for SIRIUS, add manual workspace addition (#132)
 * Additional preprocessing (reduction) of noisy spectra
+* Dependencies update
 * Docker updates (#131)
 * Handle cases when same (feature_id, mslevel) pairs are present within an MGF (#133)
 * Improved documentation
 * New working directory at `$HOME/.tima`
-* Dependencies update
-* Update R and Bioconductor versions
+* Updated R and Bioconductor versions
 
 # timaR 2.9.3
 
-* Allow for SIRIUS jobs containing only summaries
-* Allow for underscores in job pattern
-* Change some default values (less stringent)
+* Allowed for SIRIUS jobs containing only summaries
+* Allowed for underscores in job pattern
+* Changed some default values (less stringent)
 * Dependencies update
-* Migrate app testing to `shinytest2`
-* Remove further some inconsistent MS1 annotations
-* Remove tests dependencies by default
+* Migrated app testing to `shinytest2`
+* Further removed some inconsistent MS1 annotations
+* Removed tests dependencies by default
 
 # timaR 2.9.2
 
-* Add Nitrogen rule to filter out some annotations
+* Added Nitrogen rule to filter out some annotations
 * Better handling of partial downloads (#118)
 * Dependencies update (mainly `targets 1.5.1`, will invalidate previous targets)
-* Fix some port issues in Shiny (#122)
-* Remove completely empty columns from final output to avoid confusion (#120)
+* Fixed some port issues in Shiny (#122)
+* Removed completely empty columns from final output to avoid confusion (#120)
 
 # timaR 2.9.1
 
-* Add [Waystation](https://caltechlibrary.github.io/waystation/) action
-* Add structures from spectral libraries to SOP library (#113)
-* Expose all parameters (#107, #108)
-* Fix for Zenodo API
+* Added [Waystation](https://caltechlibrary.github.io/waystation/) action
+* Added structures from spectral libraries to SOP library (#113)
+* Exposed all parameters (#107, #108)
+* Fixed Zenodo API issue
 * HMDB structures support
-* Optimize grep/gsub by adding `perl=TRUE` or `fixed=TRUE`
-* Update to Massbank version `2023.11`
-* Update SIRIUS preparation (#74, #115)
+* Optimized grep/gsub by adding `perl=TRUE` or `fixed=TRUE`
+* Updated to Massbank version `2023.11`
+* Updated SIRIUS preparation (#74, #115)
 
 # timaR 2.9.0
 
 * Added compounds names as parameter
 * Added MassBank spectral library (#77)
-* Allow files outside `data/source` (#89)
+* Allowed files outside `data/source` (#89)
 * Added RT library as annotation library (#86)
-* Be less dependent of GNPS by default
 * Better handling of download errors
 * Fixed Docker mount path
 * Improved naming (#91)
 * Internal variables refactoring
+* Multiple Shiny fixes and tests addition (#60)
+* Multiple fixes (#71, #81, #82)
 * New adducts (#79, #80)
 * Refactored adducts, clusters and neutral losses
 * Refactored biological and chemical score
 * Refactored RT matching (#76)
 * Refactored Sirius scores (#92)
-* Multiple Shiny fixes and tests addition (#60)
-* Multiple fixes (#71, #81, #82)
+* Removed GNPS dependency by default
 
 # timaR 2.8.2
 
-* Change from pbmclapply to pblapply
 * Added spectral entropy
 * Added MS1 only possibility
 * Added Fluorine adduct
-* Fix empty chemical classes
-* Fix not classified taxa
+* Changed from pbmclapply to pblapply
+* Documentation improvement
+* Fixed empty chemical classes
+* Fixed not classified taxa
+* GitHub Actions improvement
 * [renv](https://rstudio.github.io/renv/index.html) removal
 * Performance improvement by replacing the [tidyverse](https://www.tidyverse.org) by the [fastverse](https://fastverse.github.io/fastverse) (in progress)
-* Github Actions improvement
-* Documentation improvement
 * Reduced warnings (CRAN and jscpd)
 
 # timaR 2.8.1
 
+* Adapted tests
 * Added `retry` parameter to `get_organism_taxonomy_ott`
 * Dependencies update
-* Replace `extdata` loading
+* Minor fixes
 * Moved `/params` and `paths.yaml` to `/inst` as more standard. (see <https://r-pkgs.org/misc.html#other-directories>)
-* Adapted tests
 * Performance improvement by replacing the [tidyverse](https://www.tidyverse.org) by the [fastverse](https://fastverse.github.io/fastverse) (in progress)
-* Minor fixes
+* Replaced `extdata` loading
 
 # timaR 2.8.0
 
-* Adding GUI prototype
+* Added GUI prototype
 * Started using [renv](https://rstudio.github.io/renv/index.html)
 
 # timaR 2.7.4
 
+* Clearer vocabulary
 * ECMDB support
 * Edges (mass and spectra-based) and components are generated if not present.
-* Fix case when no GNPS job ID
-* Re-introducing Classyfire support.
-* Retention time matching additionally to MS2 if RT present in library
-* Parameters refactoring
-* Steps refactoring
+* Fixed case when no GNPS job ID
 * Further [Targets](https://books.ropensci.org/targets/) improvements
-* Clearer vocabulary
 * Lot of fixes
+* Parameters refactoring
+* Re-introduced Classyfire support.
+* Retention time matching additionally to MS2 if RT present in library
+* Steps refactoring
 
 # timaR 2.7.3
 
-* [Targets](https://books.ropensci.org/targets/) implementation
-* Parameters refactoring
 * Improved calculations over redundant formulas
-* Spectral matching update (see <https://github.com/rformassspectrometry/MetaboAnnotation/issues/93>)
 * Minor fixes
+* Parameters refactoring
+* Spectral matching update (see <https://github.com/rformassspectrometry/MetaboAnnotation/issues/93>)
+* [Targets](https://books.ropensci.org/targets/) implementation
 
 # timaR 2.7.2
 
 * Benchmark update (including negative mode)
-* Spectral comparison + intensity filtering update
-* Switched r-base Docker image to bioconductor with ARM support
 * Improved parameters documentation
 * Minor fixes
+* Spectral comparison + intensity filtering update
+* Switched r-base Docker image to bioconductor with ARM support
 
 # timaR 2.7.1
 
+* Added MONA helpers
 * Added parallelization on process_spectra
 * Added sqlite storing for spectra
-* Added MONA helpers
-* Improved testing time
 * Improved code documentation
+* Improved testing time
 * Minor fixes
 
 # timaR 2.7.0
 
-* Added MS2 annotation capability (kudos @jorainer for the awesome *Spectra* suite)
 * Added HMDB helpers for both taxo and ISDB
+* Added MS2 annotation capability (kudos @jorainer for the awesome *Spectra* suite)
 * Minor fixes
 
 # timaR 2.6.0
@@ -167,14 +168,14 @@
 
 # timaR 2.5.6
 
+* Dependencies removal (e.g. metabo-store)
 * Minor fixes
 * Partial functions cleanup
-* Dependencies removal (e.g. metabo-store)
 
 # timaR 2.5.5
 
-* Minor fixes
 * Automation and parameters improvement
+* Minor fixes
 
 # timaR 2.5.4
 
@@ -197,41 +198,40 @@
 
 # timaR 2.5.0
 
-* Minor fixes
 * LOTUS update
+* Minor fixes
 
 # timaR 2.4.0
 
-* Minor fixes
-* Improved output (#34)
 * Added chemical names and xlogp to output (#33)
 * Added support for case when no consensus is found (#30)
+* Improved output (#34)
+* Minor fixes
 
 # timaR 2.3.0
 
-* Minor fixes
 * Added support for annotation without MN (#28)
 * Added support for multi tool annotations (#27)
 * Added support for classical MN GNPS jobs (#25)
 * Added support for new version of LOTUS
 * General improvements for manual inputs
-* Updated adducts
 * Improved tests code coverage
+* Minor fixes
+* Updated adducts
 
 # timaR 2.2.2
 
+* Additional benchmark figure ([Candidates distribution](https://taxonomicallyinformedannotation.github.io/tima-r/articles/V-actual-performance.html#candidates-distribution))
 * Minor fixes
-* Additional benchmark
-  figure ([Candidates distribution](https://taxonomicallyinformedannotation.github.io/tima-r/articles/V-actual-performance.html#candidates-distribution))
 
 # timaR 2.2.1
 
 * Minor version name fixes
 
 # timaR 2.2.0
 
-* Various fixes
 * Added benchmark ([here](https://taxonomicallyinformedannotation.github.io/tima-r/articles/V-actual-performance.html))
+* Various fixes
 
 # timaR 2.1.0
 

diff --git a/R/clean_chemo.R b/R/clean_chemo.R
@@ -45,6 +45,7 @@ import::from(tidytable, where, .into = environment())
 #'
 #' @include clean_collapse.R
 #' @include columns_model.R
+#' @include filter_high_confidence_only.R
 #'
 #' @param annot_table_wei_chemo Table containing your
 #'    chemically weighted annotation
@@ -56,6 +57,7 @@ import::from(tidytable, where, .into = environment())
 #' @param minimal_ms1_bio Minimal biological score to keep MS1 based annotation
 #' @param minimal_ms1_chemo Minimal chemical score to keep MS1 based annotation
 #' @param minimal_ms1_condition Condition to be used. Must be "OR" or "AND".
+#' @param high_confidence Report high confidence candidates only. BOOLEAN
 #' @param remove_ties Remove ties. BOOLEAN
 #' @param summarise Boolean. summarise results (1 row per feature)
 #'
@@ -76,6 +78,7 @@ clean_chemo <-
            minimal_ms1_bio = get("minimal_ms1_bio", envir = parent.frame()),
            minimal_ms1_chemo = get("minimal_ms1_chemo", envir = parent.frame()),
            minimal_ms1_condition = get("minimal_ms1_condition", envir = parent.frame()),
+           high_confidence = get("high_confidence", envir = parent.frame()),
            remove_ties = get("remove_ties", envir = parent.frame()),
            summarise = get("summarise", envir = parent.frame())) {
     model <- columns_model()
@@ -121,6 +124,11 @@ clean_chemo <-
         )
     }
 
+    if (high_confidence) {
+      df1 <- df1 |>
+        filter_high_confidence_only()
+    }
+
     df1 <- df1 |>
       arrange(desc(score_pondered_chemo)) |>
       distinct(feature_id,

diff --git a/R/filter_high_confidence_only.R b/R/filter_high_confidence_only.R
@@ -0,0 +1,36 @@
+import::from(crayon, red, .into = environment())
+import::from(tidytable, filter, .into = environment())
+
+#' @title Filter high confidence only
+#' `r lifecycle::badge("experimental")`
+#'
+#' @description This function keeps high confidence candidates only, i.e. rows
+#'    whose biological OR pseudo initial score reaches its threshold.
+#'
+#' @importFrom crayon red
+#' @importFrom tidytable filter
+#'
+#' @param df Dataframe with `score_biological` and
+#'    `candidate_score_pseudo_initial` columns
+#' @param score_bio_min Minimal biological score. Current default to 0.85.
+#' @param score_ini_min Minimal initial score. Current default to 0.75.
+#'
+#' @return The filtered dataframe
+#'
+#' @export
+#'
+#' @examples NULL
+filter_high_confidence_only <-
+  function(df,
+           score_bio_min = 0.85,
+           score_ini_min = 0.75) {
+    log_debug("Keeping high confidence candidates only...")
+    # TODO this is very basic for now but already massively filters.
+    # TODO Later implement SIRIUS/internal library filters.
+    df_filtered <- df |>
+      filter(score_biological >= score_bio_min |
+        candidate_score_pseudo_initial >= score_ini_min)
+    n_removed <- nrow(df) - nrow(df_filtered)
+    log_debug("Removed", red(n_removed), "low confidence candidates")
+    df_filtered
+  }