From 48d4a720fb66957c4853b0083202226870ac6bcf Mon Sep 17 00:00:00 2001 From: Adriano Rutz Date: Sun, 23 Jul 2023 10:55:18 +0200 Subject: [PATCH] benchmark update --- DESCRIPTION | 1 - NAMESPACE | 1 - R/sanitize_spectra_benchmark.R | 199 ------------------------------ inst/pipelines/_targets.R | 172 +++++++++++++++++++++++++- man/sanitize_spectra_benchmark.Rd | 34 ----- 5 files changed, 169 insertions(+), 238 deletions(-) delete mode 100644 R/sanitize_spectra_benchmark.R delete mode 100644 man/sanitize_spectra_benchmark.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 0dfb9414e..d62466273 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -129,7 +129,6 @@ Collate: 'prepare_params.R' 'prepare_taxa.R' 'replace_id.R' - 'sanitize_spectra_benchmark.R' 'select_sop_columns.R' 'taxize_spectra_benchmark.R' 'weight_chemo.R' diff --git a/NAMESPACE b/NAMESPACE index e8a331e62..24955b0c6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,7 +61,6 @@ export(remove_above_precursor) export(replace_id) export(round_reals) export(sanitize_spectra) -export(sanitize_spectra_benchmark) export(select_annotations_columns) export(select_sirius_columns) export(select_sirius_columns_2) diff --git a/R/sanitize_spectra_benchmark.R b/R/sanitize_spectra_benchmark.R deleted file mode 100644 index 6835f2cdc..000000000 --- a/R/sanitize_spectra_benchmark.R +++ /dev/null @@ -1,199 +0,0 @@ -#' @title Sanitize spectra benchmark -#' -#' @description This function sanitizes the benchmark spectra -#' -#' @details Because they are still quite dirty -#' -#' @include extract_spectra.R -#' @include harmonize_spectra.R -#' @include normalize_peaks.R -#' @include remove_above_precursor.R -#' -#' @param sp Spectra -#' @param mgf_pos_path Path to store the positive spectra -#' @param mgf_neg_path Path to store the negative spectra -#' @param meta_pos_path Path to store the positive metadata -#' @param meta_neg_path Path to store the negative metadata -#' -#' @return NULL -#' -#' @export -#' -#' @examples NULL 
-sanitize_spectra_benchmark <- - function(sp, - mgf_pos_path, - mgf_neg_path, - meta_pos_path = "data/interim/benchmark/benchmark_meta_pos.tsv", - meta_neg_path = "data/interim/benchmark/benchmark_meta_neg.tsv") { - sp$precursorMz <- as.numeric(sp$PRECURSOR_MZ) - sp$precursorCharge <- as.integer(sp$CHARGE) - - sp_clean <- sp |> - Spectra::addProcessing(remove_above_precursor(), - spectraVariables = c("precursorMz") - ) |> - Spectra::addProcessing(normalize_peaks()) |> - Spectra::applyProcessing() - - adduct <- sp_clean$ADDUCT - inchikey <- sp_clean$inchikey - instrument <- sp_clean$SOURCE_INSTRUMENT - # fragments <- sp_clean$NUM.PEAKS - fragments <- lapply(sp_clean@backend@peaksData, length) |> - as.character() |> - as.numeric() / 2 - # pepmass <- gsub("\\[|\\]", "", sp_clean$PARENT_MASS) - pepmass <- sp_clean$PEPMASS - smiles <- sp_clean$smiles - ccmslib <- sp_clean$SPECTRUMID - charge <- sp_clean$precursorCharge - - df_meta <- tidytable::tidytable( - adduct, - inchikey, - instrument, - fragments, - pepmass, - smiles, - ccmslib, - charge - ) |> - tidyft::mutate_vars( - is.character, - .func = function(x) { - tidytable::na_if(x, "") - } - ) - - df_clean <- df_meta |> - dplyr::filter(!is.na(inchikey)) |> - dplyr::filter(fragments > 5) |> - dplyr::filter(fragments <= 1000) |> - dplyr::filter(!grepl( - pattern = "QQQ", - x = instrument, - fixed = TRUE - )) |> - dplyr::mutate(mass = pepmass) |> - tidyr::separate( - col = mass, - sep = "\\.", - into = c("a", "b") - ) |> - dplyr::filter(!is.na(b)) |> - dplyr::filter(stringr::str_length(b) > 1) |> - dplyr::select(-a, -b) |> - dplyr::mutate(inchikey_2D = gsub( - pattern = "-.*", - replacement = "", - x = inchikey - )) |> - dplyr::distinct(inchikey_2D, adduct, .keep_all = TRUE) |> - dplyr::mutate(mz = pepmass) |> - dplyr::group_by(inchikey_2D) |> - ## Weird way to have some kind of retention time - dplyr::mutate(rt = dplyr::cur_group_id()) |> - dplyr::ungroup() - - df_clean_neg <- df_clean |> - dplyr::filter(grepl( - 
pattern = "-", - x = charge, - fixed = TRUE - )) |> - dplyr::mutate(feature_id = dplyr::row_number()) - - df_clean_pos <- df_clean |> - dplyr::anti_join(df_clean_neg) |> - dplyr::mutate(feature_id = dplyr::row_number()) - - sp_pos <- sp_clean[sp_clean$SPECTRUMID %in% df_clean_pos$ccmslib] - sp_neg <- sp_clean[sp_clean$SPECTRUMID %in% df_clean_neg$ccmslib] - sp_pos$feature_id <- df_clean_pos$feature_id - sp_neg$feature_id <- df_clean_neg$feature_id - sp_pos$spectrum_id <- df_clean_pos$feature_id - sp_neg$spectrum_id <- df_clean_neg$feature_id - - spectra_harmonized_pos <- sp_pos |> - extract_spectra() |> - dplyr::mutate(polarity = "pos") |> - harmonize_spectra( - col_ce = NA, - col_ci = NA, - col_em = "PARENT_MASS", - col_in = "inchi", - col_io = NA, - col_ik = "inchikey", - col_il = NA, - # col_mf = "formula", - col_mf = NA, - col_na = "name", - col_po = "polarity", - col_sm = "smiles", - col_sn = NA, - col_si = "spectrum_id", - col_sp = NA, - col_sy = NA, - col_xl = NA, - mode = "pos" - ) - - spectra_harmonized_neg <- sp_neg |> - extract_spectra() |> - dplyr::mutate(polarity = "neg") |> - harmonize_spectra( - col_ce = NA, - col_ci = NA, - col_em = "PARENT_MASS", - col_in = "inchi", - col_io = NA, - col_ik = "inchikey", - col_il = NA, - # col_mf = "formula", - col_mf = NA, - col_na = "name", - col_po = "polarity", - col_sm = "smiles", - col_sn = NA, - col_si = "spectrum_id", - col_sp = NA, - col_sy = NA, - col_xl = NA, - mode = "neg" - ) - - spectra_harmonized_pos$acquisitionNum <- - spectra_harmonized_pos$spectrum_id |> - as.integer() - spectra_harmonized_neg$acquisitionNum <- - spectra_harmonized_neg$spectrum_id |> - as.integer() - - log_debug("Exporting") - spectra_harmonized_pos |> - Spectra::Spectra() |> - Spectra::export( - backend = MsBackendMgf::MsBackendMgf(), - file = mgf_pos_path - ) - spectra_harmonized_neg |> - Spectra::Spectra() |> - Spectra::export( - backend = MsBackendMgf::MsBackendMgf(), - file = mgf_neg_path - ) - df_clean_pos |> - 
export_output(meta_pos_path) - df_clean_neg |> - export_output(meta_neg_path) - - return( - c( - "spectra_pos" = mgf_pos_path, - "spectra_neg" = mgf_neg_path, - "meta_pos" = meta_pos_path, - "meta_neg" = meta_neg_path - ) - ) - } diff --git a/inst/pipelines/_targets.R b/inst/pipelines/_targets.R index ca98aec58..eb70d42ce 100644 --- a/inst/pipelines/_targets.R +++ b/inst/pipelines/_targets.R @@ -1867,9 +1867,175 @@ list( Spectra::Spectra(source = MsBackendMsp::MsBackendMsp()) |> Spectra::setBackend(Spectra::MsBackendMemory()) - sp |> sanitize_spectra_benchmark( - mgf_pos_path = benchmark_path_mgf_pos, - mgf_neg_path = benchmark_path_mgf_neg + sp$precursorMz <- as.numeric(sp$PRECURSOR_MZ) + sp$precursorCharge <- as.integer(sp$CHARGE) + + sp_clean <- sp |> + Spectra::addProcessing(remove_above_precursor(), + spectraVariables = c("precursorMz") + ) |> + Spectra::addProcessing(normalize_peaks()) |> + Spectra::applyProcessing() + + adduct <- sp_clean$ADDUCT + inchikey <- sp_clean$inchikey + instrument <- sp_clean$SOURCE_INSTRUMENT + # fragments <- sp_clean$NUM.PEAKS + fragments <- lapply(sp_clean@backend@peaksData, length) |> + as.character() |> + as.numeric() / 2 + # pepmass <- gsub("\\[|\\]", "", sp_clean$PARENT_MASS) + pepmass <- sp_clean$PEPMASS + smiles <- sp_clean$smiles + ccmslib <- sp_clean$SPECTRUMID + charge <- sp_clean$precursorCharge + + df_meta <- tidytable::tidytable( + adduct, + inchikey, + instrument, + fragments, + pepmass, + smiles, + ccmslib, + charge + ) |> + tidyft::mutate_vars( + is.character, + .func = function(x) { + tidytable::na_if(x, "") + } + ) + + df_clean <- df_meta |> + dplyr::filter(!is.na(inchikey)) |> + dplyr::filter(fragments > 5) |> + dplyr::filter(fragments <= 1000) |> + dplyr::filter(!grepl( + pattern = "QQQ", + x = instrument, + fixed = TRUE + )) |> + dplyr::mutate(mass = pepmass) |> + tidyr::separate( + col = mass, + sep = "\\.", + into = c("a", "b") + ) |> + dplyr::filter(!is.na(b)) |> + dplyr::filter(stringr::str_length(b) > 1) 
|> + dplyr::select(-a, -b) |> + dplyr::mutate(inchikey_2D = gsub( + pattern = "-.*", + replacement = "", + x = inchikey + )) |> + dplyr::distinct(inchikey_2D, adduct, .keep_all = TRUE) |> + dplyr::mutate(mz = pepmass) |> + dplyr::group_by(inchikey_2D) |> + ## Weird way to have some kind of retention time + dplyr::mutate(rt = dplyr::cur_group_id()) |> + dplyr::ungroup() + + df_clean_neg <- df_clean |> + dplyr::filter(grepl( + pattern = "-", + x = charge, + fixed = TRUE + )) |> + dplyr::mutate(feature_id = dplyr::row_number()) + + df_clean_pos <- df_clean |> + dplyr::anti_join(df_clean_neg) |> + dplyr::mutate(feature_id = dplyr::row_number()) + + sp_pos <- sp_clean[sp_clean$SPECTRUMID %in% df_clean_pos$ccmslib] + sp_neg <- sp_clean[sp_clean$SPECTRUMID %in% df_clean_neg$ccmslib] + sp_pos$feature_id <- df_clean_pos$feature_id + sp_neg$feature_id <- df_clean_neg$feature_id + sp_pos$spectrum_id <- df_clean_pos$feature_id + sp_neg$spectrum_id <- df_clean_neg$feature_id + + spectra_harmonized_pos <- sp_pos |> + extract_spectra() |> + dplyr::mutate(polarity = "pos") |> + harmonize_spectra( + col_ce = NA, + col_ci = NA, + col_em = "PARENT_MASS", + col_in = "inchi", + col_io = NA, + col_ik = "inchikey", + col_il = NA, + # col_mf = "formula", + col_mf = NA, + col_na = "name", + col_po = "polarity", + col_sm = "smiles", + col_sn = NA, + col_si = "spectrum_id", + col_sp = NA, + col_sy = NA, + col_xl = NA, + mode = "pos" + ) + + spectra_harmonized_neg <- sp_neg |> + extract_spectra() |> + dplyr::mutate(polarity = "neg") |> + harmonize_spectra( + col_ce = NA, + col_ci = NA, + col_em = "PARENT_MASS", + col_in = "inchi", + col_io = NA, + col_ik = "inchikey", + col_il = NA, + # col_mf = "formula", + col_mf = NA, + col_na = "name", + col_po = "polarity", + col_sm = "smiles", + col_sn = NA, + col_si = "spectrum_id", + col_sp = NA, + col_sy = NA, + col_xl = NA, + mode = "neg" + ) + + spectra_harmonized_pos$acquisitionNum <- + spectra_harmonized_pos$spectrum_id |> + as.integer() + 
spectra_harmonized_neg$acquisitionNum <- + spectra_harmonized_neg$spectrum_id |> + as.integer() + + log_debug("Exporting") + spectra_harmonized_pos |> + Spectra::Spectra() |> + Spectra::export( + backend = MsBackendMgf::MsBackendMgf(), + file = benchmark_path_mgf_pos + ) + spectra_harmonized_neg |> + Spectra::Spectra() |> + Spectra::export( + backend = MsBackendMgf::MsBackendMgf(), + file = benchmark_path_mgf_neg + ) + df_clean_pos |> + export_output("data/interim/benchmark/benchmark_meta_pos.tsv") + df_clean_neg |> + export_output("data/interim/benchmark/benchmark_meta_neg.tsv") + + return( + c( + "spectra_pos" = benchmark_path_mgf_pos, + "spectra_neg" = benchmark_path_mgf_neg, + "meta_pos" = "data/interim/benchmark/benchmark_meta_pos.tsv", + "meta_neg" = "data/interim/benchmark/benchmark_meta_neg.tsv" + ) ) } ), diff --git a/man/sanitize_spectra_benchmark.Rd b/man/sanitize_spectra_benchmark.Rd deleted file mode 100644 index e616caf9e..000000000 --- a/man/sanitize_spectra_benchmark.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/sanitize_spectra_benchmark.R -\name{sanitize_spectra_benchmark} -\alias{sanitize_spectra_benchmark} -\title{Sanitize spectra benchmark} -\usage{ -sanitize_spectra_benchmark( - sp, - mgf_pos_path, - mgf_neg_path, - meta_pos_path = "data/interim/benchmark/benchmark_meta_pos.tsv", - meta_neg_path = "data/interim/benchmark/benchmark_meta_neg.tsv" -) -} -\arguments{ -\item{sp}{Spectra} - -\item{mgf_pos_path}{Path to store the positive spectra} - -\item{mgf_neg_path}{Path to store the negative spectra} - -\item{meta_pos_path}{Path to store the positive metadata} - -\item{meta_neg_path}{Path to store the negative metadata} -} -\description{ -This function sanitizes the benchmark spectra -} -\details{ -Because they are still quite dirty -} -\examples{ -NULL -}