From 48d4a720fb66957c4853b0083202226870ac6bcf Mon Sep 17 00:00:00 2001
From: Adriano Rutz <adriano.rutz@hotmail.com>
Date: Sun, 23 Jul 2023 10:55:18 +0200
Subject: [PATCH] benchmark update

---
 DESCRIPTION                       |   1 -
 NAMESPACE                         |   1 -
 R/sanitize_spectra_benchmark.R    | 199 ------------------------------
 inst/pipelines/_targets.R         | 172 +++++++++++++++++++++++++-
 man/sanitize_spectra_benchmark.Rd |  34 -----
 5 files changed, 169 insertions(+), 238 deletions(-)
 delete mode 100644 R/sanitize_spectra_benchmark.R
 delete mode 100644 man/sanitize_spectra_benchmark.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index 0dfb9414e..d62466273 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -129,7 +129,6 @@ Collate:
     'prepare_params.R'
     'prepare_taxa.R'
     'replace_id.R'
-    'sanitize_spectra_benchmark.R'
     'select_sop_columns.R'
     'taxize_spectra_benchmark.R'
     'weight_chemo.R'
diff --git a/NAMESPACE b/NAMESPACE
index e8a331e62..24955b0c6 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -61,7 +61,6 @@ export(remove_above_precursor)
 export(replace_id)
 export(round_reals)
 export(sanitize_spectra)
-export(sanitize_spectra_benchmark)
 export(select_annotations_columns)
 export(select_sirius_columns)
 export(select_sirius_columns_2)
diff --git a/R/sanitize_spectra_benchmark.R b/R/sanitize_spectra_benchmark.R
deleted file mode 100644
index 6835f2cdc..000000000
--- a/R/sanitize_spectra_benchmark.R
+++ /dev/null
@@ -1,199 +0,0 @@
-#' @title Sanitize spectra benchmark
-#'
-#' @description This function sanitizes the benchmark spectra
-#'
-#' @details Because they are still quite dirty
-#'
-#' @include extract_spectra.R
-#' @include harmonize_spectra.R
-#' @include normalize_peaks.R
-#' @include remove_above_precursor.R
-#'
-#' @param sp Spectra
-#' @param mgf_pos_path Path to store the positive spectra
-#' @param mgf_neg_path Path to store the negative spectra
-#' @param meta_pos_path Path to store the positive metadata
-#' @param meta_neg_path Path to store the negative metadata
-#'
-#' @return NULL
-#'
-#' @export
-#'
-#' @examples NULL
-sanitize_spectra_benchmark <-
-  function(sp,
-           mgf_pos_path,
-           mgf_neg_path,
-           meta_pos_path = "data/interim/benchmark/benchmark_meta_pos.tsv",
-           meta_neg_path = "data/interim/benchmark/benchmark_meta_neg.tsv") {
-    sp$precursorMz <- as.numeric(sp$PRECURSOR_MZ)
-    sp$precursorCharge <- as.integer(sp$CHARGE)
-
-    sp_clean <- sp |>
-      Spectra::addProcessing(remove_above_precursor(),
-        spectraVariables = c("precursorMz")
-      ) |>
-      Spectra::addProcessing(normalize_peaks()) |>
-      Spectra::applyProcessing()
-
-    adduct <- sp_clean$ADDUCT
-    inchikey <- sp_clean$inchikey
-    instrument <- sp_clean$SOURCE_INSTRUMENT
-    # fragments <- sp_clean$NUM.PEAKS
-    fragments <- lapply(sp_clean@backend@peaksData, length) |>
-      as.character() |>
-      as.numeric() / 2
-    # pepmass <- gsub("\\[|\\]", "", sp_clean$PARENT_MASS)
-    pepmass <- sp_clean$PEPMASS
-    smiles <- sp_clean$smiles
-    ccmslib <- sp_clean$SPECTRUMID
-    charge <- sp_clean$precursorCharge
-
-    df_meta <- tidytable::tidytable(
-      adduct,
-      inchikey,
-      instrument,
-      fragments,
-      pepmass,
-      smiles,
-      ccmslib,
-      charge
-    ) |>
-      tidyft::mutate_vars(
-        is.character,
-        .func = function(x) {
-          tidytable::na_if(x, "")
-        }
-      )
-
-    df_clean <- df_meta |>
-      dplyr::filter(!is.na(inchikey)) |>
-      dplyr::filter(fragments > 5) |>
-      dplyr::filter(fragments <= 1000) |>
-      dplyr::filter(!grepl(
-        pattern = "QQQ",
-        x = instrument,
-        fixed = TRUE
-      )) |>
-      dplyr::mutate(mass = pepmass) |>
-      tidyr::separate(
-        col = mass,
-        sep = "\\.",
-        into = c("a", "b")
-      ) |>
-      dplyr::filter(!is.na(b)) |>
-      dplyr::filter(stringr::str_length(b) > 1) |>
-      dplyr::select(-a, -b) |>
-      dplyr::mutate(inchikey_2D = gsub(
-        pattern = "-.*",
-        replacement = "",
-        x = inchikey
-      )) |>
-      dplyr::distinct(inchikey_2D, adduct, .keep_all = TRUE) |>
-      dplyr::mutate(mz = pepmass) |>
-      dplyr::group_by(inchikey_2D) |>
-      ## Weird way to have some kind of retention time
-      dplyr::mutate(rt = dplyr::cur_group_id()) |>
-      dplyr::ungroup()
-
-    df_clean_neg <- df_clean |>
-      dplyr::filter(grepl(
-        pattern = "-",
-        x = charge,
-        fixed = TRUE
-      )) |>
-      dplyr::mutate(feature_id = dplyr::row_number())
-
-    df_clean_pos <- df_clean |>
-      dplyr::anti_join(df_clean_neg) |>
-      dplyr::mutate(feature_id = dplyr::row_number())
-
-    sp_pos <- sp_clean[sp_clean$SPECTRUMID %in% df_clean_pos$ccmslib]
-    sp_neg <- sp_clean[sp_clean$SPECTRUMID %in% df_clean_neg$ccmslib]
-    sp_pos$feature_id <- df_clean_pos$feature_id
-    sp_neg$feature_id <- df_clean_neg$feature_id
-    sp_pos$spectrum_id <- df_clean_pos$feature_id
-    sp_neg$spectrum_id <- df_clean_neg$feature_id
-
-    spectra_harmonized_pos <- sp_pos |>
-      extract_spectra() |>
-      dplyr::mutate(polarity = "pos") |>
-      harmonize_spectra(
-        col_ce = NA,
-        col_ci = NA,
-        col_em = "PARENT_MASS",
-        col_in = "inchi",
-        col_io = NA,
-        col_ik = "inchikey",
-        col_il = NA,
-        # col_mf = "formula",
-        col_mf = NA,
-        col_na = "name",
-        col_po = "polarity",
-        col_sm = "smiles",
-        col_sn = NA,
-        col_si = "spectrum_id",
-        col_sp = NA,
-        col_sy = NA,
-        col_xl = NA,
-        mode = "pos"
-      )
-
-    spectra_harmonized_neg <- sp_neg |>
-      extract_spectra() |>
-      dplyr::mutate(polarity = "neg") |>
-      harmonize_spectra(
-        col_ce = NA,
-        col_ci = NA,
-        col_em = "PARENT_MASS",
-        col_in = "inchi",
-        col_io = NA,
-        col_ik = "inchikey",
-        col_il = NA,
-        # col_mf = "formula",
-        col_mf = NA,
-        col_na = "name",
-        col_po = "polarity",
-        col_sm = "smiles",
-        col_sn = NA,
-        col_si = "spectrum_id",
-        col_sp = NA,
-        col_sy = NA,
-        col_xl = NA,
-        mode = "neg"
-      )
-
-    spectra_harmonized_pos$acquisitionNum <-
-      spectra_harmonized_pos$spectrum_id |>
-      as.integer()
-    spectra_harmonized_neg$acquisitionNum <-
-      spectra_harmonized_neg$spectrum_id |>
-      as.integer()
-
-    log_debug("Exporting")
-    spectra_harmonized_pos |>
-      Spectra::Spectra() |>
-      Spectra::export(
-        backend = MsBackendMgf::MsBackendMgf(),
-        file = mgf_pos_path
-      )
-    spectra_harmonized_neg |>
-      Spectra::Spectra() |>
-      Spectra::export(
-        backend = MsBackendMgf::MsBackendMgf(),
-        file = mgf_neg_path
-      )
-    df_clean_pos |>
-      export_output(meta_pos_path)
-    df_clean_neg |>
-      export_output(meta_neg_path)
-
-    return(
-      c(
-        "spectra_pos" = mgf_pos_path,
-        "spectra_neg" = mgf_neg_path,
-        "meta_pos" = meta_pos_path,
-        "meta_neg" = meta_neg_path
-      )
-    )
-  }
diff --git a/inst/pipelines/_targets.R b/inst/pipelines/_targets.R
index ca98aec58..eb70d42ce 100644
--- a/inst/pipelines/_targets.R
+++ b/inst/pipelines/_targets.R
@@ -1867,9 +1867,175 @@ list(
           Spectra::Spectra(source = MsBackendMsp::MsBackendMsp()) |>
           Spectra::setBackend(Spectra::MsBackendMemory())
 
-        sp |> sanitize_spectra_benchmark(
-          mgf_pos_path = benchmark_path_mgf_pos,
-          mgf_neg_path = benchmark_path_mgf_neg
+        sp$precursorMz <- as.numeric(sp$PRECURSOR_MZ)
+        sp$precursorCharge <- as.integer(sp$CHARGE)
+
+        sp_clean <- sp |>
+          Spectra::addProcessing(remove_above_precursor(),
+            spectraVariables = c("precursorMz")
+          ) |>
+          Spectra::addProcessing(normalize_peaks()) |>
+          Spectra::applyProcessing()
+
+        adduct <- sp_clean$ADDUCT
+        inchikey <- sp_clean$inchikey
+        instrument <- sp_clean$SOURCE_INSTRUMENT
+        # fragments <- sp_clean$NUM.PEAKS
+        fragments <- lapply(sp_clean@backend@peaksData, length) |>
+          as.character() |>
+          as.numeric() / 2
+        # pepmass <- gsub("\\[|\\]", "", sp_clean$PARENT_MASS)
+        pepmass <- sp_clean$PEPMASS
+        smiles <- sp_clean$smiles
+        ccmslib <- sp_clean$SPECTRUMID
+        charge <- sp_clean$precursorCharge
+
+        df_meta <- tidytable::tidytable(
+          adduct,
+          inchikey,
+          instrument,
+          fragments,
+          pepmass,
+          smiles,
+          ccmslib,
+          charge
+        ) |>
+          tidyft::mutate_vars(
+            is.character,
+            .func = function(x) {
+              tidytable::na_if(x, "")
+            }
+          )
+
+        df_clean <- df_meta |>
+          dplyr::filter(!is.na(inchikey)) |>
+          dplyr::filter(fragments > 5) |>
+          dplyr::filter(fragments <= 1000) |>
+          dplyr::filter(!grepl(
+            pattern = "QQQ",
+            x = instrument,
+            fixed = TRUE
+          )) |>
+          dplyr::mutate(mass = pepmass) |>
+          tidyr::separate(
+            col = mass,
+            sep = "\\.",
+            into = c("a", "b")
+          ) |>
+          dplyr::filter(!is.na(b)) |>
+          dplyr::filter(stringr::str_length(b) > 1) |>
+          dplyr::select(-a, -b) |>
+          dplyr::mutate(inchikey_2D = gsub(
+            pattern = "-.*",
+            replacement = "",
+            x = inchikey
+          )) |>
+          dplyr::distinct(inchikey_2D, adduct, .keep_all = TRUE) |>
+          dplyr::mutate(mz = pepmass) |>
+          dplyr::group_by(inchikey_2D) |>
+          ## Surrogate retention time: the group id of each inchikey_2D
+          dplyr::mutate(rt = dplyr::cur_group_id()) |>
+          dplyr::ungroup()
+
+        df_clean_neg <- df_clean |>
+          dplyr::filter(grepl(
+            pattern = "-",
+            x = charge,
+            fixed = TRUE
+          )) |>
+          dplyr::mutate(feature_id = dplyr::row_number())
+
+        df_clean_pos <- df_clean |>
+          dplyr::anti_join(df_clean_neg) |>
+          dplyr::mutate(feature_id = dplyr::row_number())
+
+        sp_pos <- sp_clean[sp_clean$SPECTRUMID %in% df_clean_pos$ccmslib]
+        sp_neg <- sp_clean[sp_clean$SPECTRUMID %in% df_clean_neg$ccmslib]
+        sp_pos$feature_id <- df_clean_pos$feature_id
+        sp_neg$feature_id <- df_clean_neg$feature_id
+        sp_pos$spectrum_id <- df_clean_pos$feature_id
+        sp_neg$spectrum_id <- df_clean_neg$feature_id
+
+        spectra_harmonized_pos <- sp_pos |>
+          extract_spectra() |>
+          dplyr::mutate(polarity = "pos") |>
+          harmonize_spectra(
+            col_ce = NA,
+            col_ci = NA,
+            col_em = "PARENT_MASS",
+            col_in = "inchi",
+            col_io = NA,
+            col_ik = "inchikey",
+            col_il = NA,
+            # col_mf = "formula",
+            col_mf = NA,
+            col_na = "name",
+            col_po = "polarity",
+            col_sm = "smiles",
+            col_sn = NA,
+            col_si = "spectrum_id",
+            col_sp = NA,
+            col_sy = NA,
+            col_xl = NA,
+            mode = "pos"
+          )
+
+        spectra_harmonized_neg <- sp_neg |>
+          extract_spectra() |>
+          dplyr::mutate(polarity = "neg") |>
+          harmonize_spectra(
+            col_ce = NA,
+            col_ci = NA,
+            col_em = "PARENT_MASS",
+            col_in = "inchi",
+            col_io = NA,
+            col_ik = "inchikey",
+            col_il = NA,
+            # col_mf = "formula",
+            col_mf = NA,
+            col_na = "name",
+            col_po = "polarity",
+            col_sm = "smiles",
+            col_sn = NA,
+            col_si = "spectrum_id",
+            col_sp = NA,
+            col_sy = NA,
+            col_xl = NA,
+            mode = "neg"
+          )
+
+        spectra_harmonized_pos$acquisitionNum <-
+          spectra_harmonized_pos$spectrum_id |>
+          as.integer()
+        spectra_harmonized_neg$acquisitionNum <-
+          spectra_harmonized_neg$spectrum_id |>
+          as.integer()
+
+        log_debug("Exporting")
+        spectra_harmonized_pos |>
+          Spectra::Spectra() |>
+          Spectra::export(
+            backend = MsBackendMgf::MsBackendMgf(),
+            file = benchmark_path_mgf_pos # same path as returned below
+          )
+        spectra_harmonized_neg |>
+          Spectra::Spectra() |>
+          Spectra::export(
+            backend = MsBackendMgf::MsBackendMgf(),
+            file = benchmark_path_mgf_neg # same path as returned below
+          )
+        df_clean_pos |>
+          export_output("data/interim/benchmark/benchmark_meta_pos.tsv")
+        df_clean_neg |>
+          export_output("data/interim/benchmark/benchmark_meta_neg.tsv")
+
+        return(
+          c(
+            "spectra_pos" = benchmark_path_mgf_pos,
+            "spectra_neg" = benchmark_path_mgf_neg,
+            "meta_pos" = "data/interim/benchmark/benchmark_meta_pos.tsv",
+            "meta_neg" = "data/interim/benchmark/benchmark_meta_neg.tsv"
+          )
         )
       }
     ),
diff --git a/man/sanitize_spectra_benchmark.Rd b/man/sanitize_spectra_benchmark.Rd
deleted file mode 100644
index e616caf9e..000000000
--- a/man/sanitize_spectra_benchmark.Rd
+++ /dev/null
@@ -1,34 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/sanitize_spectra_benchmark.R
-\name{sanitize_spectra_benchmark}
-\alias{sanitize_spectra_benchmark}
-\title{Sanitize spectra benchmark}
-\usage{
-sanitize_spectra_benchmark(
-  sp,
-  mgf_pos_path,
-  mgf_neg_path,
-  meta_pos_path = "data/interim/benchmark/benchmark_meta_pos.tsv",
-  meta_neg_path = "data/interim/benchmark/benchmark_meta_neg.tsv"
-)
-}
-\arguments{
-\item{sp}{Spectra}
-
-\item{mgf_pos_path}{Path to store the positive spectra}
-
-\item{mgf_neg_path}{Path to store the negative spectra}
-
-\item{meta_pos_path}{Path to store the positive metadata}
-
-\item{meta_neg_path}{Path to store the negative metadata}
-}
-\description{
-This function sanitizes the benchmark spectra
-}
-\details{
-Because they are still quite dirty
-}
-\examples{
-NULL
-}