From 8563a8af84ccff8febc42df1da8a7545ce3ed4e5 Mon Sep 17 00:00:00 2001 From: oliviaAB Date: Wed, 24 Jul 2024 15:11:42 +1200 Subject: [PATCH] Added seed argument to perf_splsda and run_splsda --- DESCRIPTION | 2 +- NEWS.md | 2 + R/prefiltering.R | 282 ++++++++++++++------- man/dot-clean_seed.Rd | 19 ++ man/feature_preselection_splsda_factory.Rd | 78 ++++-- man/perf_splsda.Rd | 73 ++++-- man/plot_feature_preselection_splsda.Rd | 24 +- man/run_splsda.Rd | 43 ++-- tests/testthat/test-prefiltering.R | 49 +++- 9 files changed, 395 insertions(+), 177 deletions(-) create mode 100644 man/dot-clean_seed.Rd diff --git a/DESCRIPTION b/DESCRIPTION index b8a6a7c..5187ad8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,7 +17,7 @@ BugReports: https://github.com/Plant-Food-Research-Open/moiraine/issues License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Imports: Biobase, circlize, diff --git a/NEWS.md b/NEWS.md index 63a75eb..df478c7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,3 +7,5 @@ - `where()` function now imported from tidyselect instead of dplyr (as it required a newer version of dplyr). - Fixed typo in samples metadata file, samples with no value for "rnaseq_batch" variable now have `NA` rather than `"BNA"` values. + +- `perf_splsda()` and `run_splsda()` now have a `seed` argument (hopefully self-explanatory :)). Accordingly, `feature_preselection_splsda_factory` now has arguments `seed_perf` and `seed_run` to pass on seeds to `perf_splsda()` and `run_splsda`. 
diff --git a/R/prefiltering.R b/R/prefiltering.R index 15fdb8c..1559085 100644 --- a/R/prefiltering.R +++ b/R/prefiltering.R @@ -500,41 +500,54 @@ get_input_splsda <- function(mo_data, dataset_name, group, multilevel = NULL) { } -#' Assess optimal number of components for sPLS-DA on omics dataset from MultiDataSet object +#' Assess optimal number of components for sPLS-DA on omics dataset from +#' MultiDataSet object #' -#' Performs cross-validation for a PLS-DA run (implemented in the `mixOmics` package) on an omics dataset from a -#' `MultiDataSet` object. This allows to estimate the optimal number of latent components to construct. -#' This is intended for feature preselection in the omics dataset (see examples below). +#' Performs cross-validation for a PLS-DA run (implemented in the `mixOmics` +#' package) on an omics dataset from a `MultiDataSet` object. This allows to +#' estimate the optimal number of latent components to construct. This is +#' intended for feature preselection in the omics dataset (see examples below). #' -#' This function uses the \code{\link[mixOmics]{plsda}} and \code{\link[mixOmics]{perf}} -#' function from the `mixOmics` package. +#' This function uses the [mixOmics::plsda()] and [mixOmics::perf()] function +#' from the `mixOmics` package. #' -#' @param splsda_input Input for the sPLS-DA functions from mixOmics, created with [get_input_splsda()]. -#' @param ncomp_max Integer, the maximum number of latent components to test when estimating the number of -#' latent components to use. Default value is `5`. -#' @param validation Character, which cross-validation method to use, can be one of `"Mfold"` or `"loo"` -#' (see [mixOmics::perf()]). Default value is `"Mfold"`. -#' @param folds Integer, number of folds to use in the M-fold cross-validation (see [mixOmics::perf()]). -#' Default value is 5. -#' @param nrepeat Integer, number of times the cross-validation is repeated (see [mixOmics::perf()]). 
-#' @param measure Performance measure used to select the optimal value of `ncomp`, can be one of `"BER"` or `"overall"` -#' (see [mixOmics::perf()]). -#' Default value is `"BER"`. -#' @param distance Distance metric used to select the optimal value of `ncomp`, can be one of `"max.dist"`, -#' `"centroids.dist"` or `"mahalanobis.dist"` (see [mixOmics::perf()]). Default value is `"centroids.dist"`. +#' @param splsda_input Input for the sPLS-DA functions from mixOmics, created +#' with [get_input_splsda()]. +#' @param ncomp_max Integer, the maximum number of latent components to test +#' when estimating the number of latent components to use. Default value is +#' `5`. +#' @param validation Character, which cross-validation method to use, can be one +#' of `"Mfold"` or `"loo"` (see [mixOmics::perf()]). Default value is +#' `"Mfold"`. +#' @param folds Integer, number of folds to use in the M-fold cross-validation +#' (see [mixOmics::perf()]). Default value is 5. +#' @param nrepeat Integer, number of times the cross-validation is repeated (see +#' [mixOmics::perf()]). +#' @param measure Performance measure used to select the optimal value of +#' `ncomp`, can be one of `"BER"` or `"overall"` (see [mixOmics::perf()]). +#' Default value is `"BER"`. +#' @param distance Distance metric used to select the optimal value of `ncomp`, +#' can be one of `"max.dist"`, `"centroids.dist"` or `"mahalanobis.dist"` (see +#' [mixOmics::perf()]). Default value is `"centroids.dist"`. #' @param cpus Integer, number of cpus to use. -#' @param progressBar Logical, whether to display a progress bar during the optimisation of `ncomp`. Default -#' value is `TRUE`. 
-#' @return A list as per the output of the [mixOmics::perf()] function, with the following additional elements: -#' \itemize{ -#' \item `dataset_name`: the name of the dataset analysed; -#' \item `group`: column name in the samples information data-frame used as samples group; -#' \item `optim_ncomp`: the optimal number of latent components as per the `measure` and `distance` specified; -#' \item `optim_measure`: the measure used to select the optimal number of latent components; -#' \item `optim_distance`: the distance metric used to select the optimal number of latent components. -#' } -#' In addition, the name of the dataset analysed and the column name in the samples information data-frame -#' used as samples group as stored as attributes `dataset_name` and `group`, respectively. +#' @param progressBar Logical, whether to display a progress bar during the +#' optimisation of `ncomp`. Default value is `TRUE`. +#' @param seed Integer, seed to use. Default is `NULL`, i.e. no seed is set +#' inside the function. +#' @returns A list as per the output of the [mixOmics::perf()] function, with +#' the following additional elements: +#' * `dataset_name`: the name of the dataset analysed; +#' * `group`: column name in the samples information data-frame used as samples +#' group; +#' * `optim_ncomp`: the optimal number of latent components as per the `measure` +#' and `distance` specified; +#' * `optim_measure`: the measure used to select the optimal number of latent +#' components; +#' * `optim_distance`: the distance metric used to select the optimal number of +#' latent components. +#' In addition, the name of the dataset analysed and the column name in the +#' samples information data-frame used as samples group are stored as attributes +#' `dataset_name` and `group`, respectively. 
#' @export perf_splsda <- function(splsda_input, ncomp_max = 5, @@ -544,11 +557,14 @@ perf_splsda <- function(splsda_input, measure = "BER", distance = "centroids.dist", cpus = 1, - progressBar = TRUE) { + progressBar = TRUE, + seed = NULL) { dataset_name <- setdiff(names(splsda_input), "Y") multilevel <- attr(splsda_input, "multilevel") + if (!is.null(seed)) set.seed(seed) + ## Run the PLS-DA with several latent components plsda_res <- mixOmics::plsda( splsda_input[[dataset_name]], @@ -587,27 +603,40 @@ perf_splsda <- function(splsda_input, #' Performs sPLS-DA on omics dataset from MultiDataSet object #' -#' Performs a sPLS-DA (implemented in the `mixOmics`) package on a omics dataset from a -#' MultiDataSet object. This is intended for feature preselection in the omics dataset -#' (see \code{\link{get_filtered_dataset_splsda}}). +#' Performs a sPLS-DA (implemented in the `mixOmics` package) on an omics dataset +#' from a MultiDataSet object. This is intended for feature preselection in the +#' omics dataset (see [get_filtered_dataset_splsda()]). #' -#' This function uses the \code{\link[mixOmics]{plsda}} function from the `mixOmics` package. -#' Note that the sPLS-DA method can select the same feature for several latent components, so the number of -#' features retained for a dataset might be less than the number specified in the `to_keep_n` argument. +#' This function uses the [mixOmics::plsda()] function from the `mixOmics` +#' package. Note that the sPLS-DA method can select the same feature for several +#' latent components, so the number of features retained for a dataset might be +#' less than the number specified in the `to_keep_n` argument. #' -#' @param splsda_input Input for the sPLS-DA functions from mixOmics, created with [get_input_splsda()]. -#' @param perf_res Result of the \code{\link{perf_splsda}} function. 
If not supplied, sPLS-DA will be run on -#' dataset specified by argument `dataset_name` with number of latent components specified by argument `comp`. -#' @param to_keep_n Integer, the number of features to retain in the dataset. Should be less than the number of -#' features in the dataset. If `NULL` or `NA`, `to_keep_prop` will be used instead. -#' @param to_keep_prop Numeric, the proportion of features to retain in the dataset. Will be ignored if `to_keep_n` -#' is supplied. Value should be > 0 and < 1. -#' @param ncomp Integer, number of latent components to construct. Ignored if `perf_res` is supplied. -#' Default value is `NULL`. -#' @return A list as per the output of the \code{\link[mixOmics]{splsda}} function. +#' @param splsda_input Input for the sPLS-DA functions from mixOmics, created +#' with [get_input_splsda()]. +#' @param perf_res Result of the [perf_splsda()] function. If not supplied, +#' sPLS-DA will be run on the dataset in `splsda_input` with the number of +#' latent components specified by argument `ncomp`. +#' @param to_keep_n Integer, the number of features to retain in the dataset. +#' Should be less than the number of features in the dataset. If `NULL` or +#' `NA`, `to_keep_prop` will be used instead. +#' @param to_keep_prop Numeric, the proportion of features to retain in the +#' dataset. Will be ignored if `to_keep_n` is supplied. Value should be > 0 +#' and < 1. +#' @param ncomp Integer, number of latent components to construct. Ignored if +#' `perf_res` is supplied. Default value is `NULL`. +#' @param seed Integer, seed to use. Default is `NULL`, i.e. no seed is set +#' inside the function. +#' @returns A list as per the output of the [mixOmics::splsda()] function. 
#' @export -run_splsda <- function(splsda_input, perf_res, to_keep_n = NULL, to_keep_prop = NULL, ncomp = NULL) { - +run_splsda <- function(splsda_input, + perf_res, + to_keep_n = NULL, + to_keep_prop = NULL, + ncomp = NULL, + seed = NULL) { + + if (!is.null(seed)) set.seed(seed) if (!missing(perf_res)) ncomp <- perf_res$optim_ncomp dataset_name <- setdiff(names(splsda_input), "Y") @@ -703,33 +732,56 @@ get_filtered_dataset_splsda <- function(mo_data, #' Target factory for feature preselection based on sPLS-DA #' -#' Creates a list of targets to perform feature preselection on datasets from a `MultiDataSet` -#' object with sPLS-DA (from the `mixOmics` package). +#' Creates a list of targets to perform feature preselection on datasets from a +#' `MultiDataSet` object with sPLS-DA (from the `mixOmics` package). #' -#' @param mo_data_target Symbol, the name of the target containing the `MultiDataSet` object. -#' @param group Character, the column name in the samples information data-frame to use as samples group. -#' @param to_keep_ns Named integer vector, the number of feature to retain in each dataset to be prefiltered -#' (names should correspond to a dataset name). Value should be less than the number of features in the -#' corresponding dataset. Set to `NULL` in order to use `to_keep_props` instead. -#' @param to_keep_props Named numeric vector, the proportion of features to retain in each dataset -#' to be prefiltered (names should correspond to a dataset name). Value should be > 0 and < 1. -#' Will be ignored if `to_keep_ns` is not `NULL`. -#' @param target_name_prefix Character, a prefix to add to the name of the targets created by this target factory. -#' Default value is `""`. -#' @param filtered_set_target_name Character, the name of the final target containing the filtered `MultiDataSet` object. -#' If NULL, a name will automatically be supplied. Default value is `NULL`. 
-#' @param multilevel Character vector of length 1 or 3 to be used as information about repeated measurements. -#' See [get_input_splsda()] for details. Default value is `NULL` (no repeated measurements). -#' @param ... Further arguments passed to the \code{\link{perf_splsda}} function. -#' @return A list of target objects. With `target_name_prefix = ""` and `filtered_set_target_name = NULL`, -#' the following targets are created: -#' * `splsda_spec`: generates a grouped tibble where each row corresponds to one dataset to be filtered, -#' with the columns specifying each dataset name, and associated values from `to_keep_ns` and `to_keep_props`. -#' * `individual_splsda_input`: a dynamic branching target that runs the [get_input_splsda()] function for each dataset. -#' * `individual_splsda_perf`: a dynamic branching target that runs the [perf_splsda()] function for each dataset. -#' * `individual_splsda_run`: a dynamic branching target that runs the [run_splsda()] function for each dataset, -#' using the results from `individual_splsda_perf` to guide the number of latent components to construct. -#' * `filtered_set_slpsda`: a target to retain from the original `MultiDataSet` object only features selected in each sPLS-DA run. +#' @param mo_data_target Symbol, the name of the target containing the +#' `MultiDataSet` object. +#' @param group Character, the column name in the samples information data-frame +#' to use as samples group. +#' @param to_keep_ns Named integer vector, the number of feature to retain in +#' each dataset to be prefiltered (names should correspond to a dataset name). +#' Value should be less than the number of features in the corresponding +#' dataset. Set to `NULL` in order to use `to_keep_props` instead. +#' @param to_keep_props Named numeric vector, the proportion of features to +#' retain in each dataset to be prefiltered (names should correspond to a +#' dataset name). Value should be > 0 and < 1. 
Will be ignored if `to_keep_ns` +#' is not `NULL`. +#' @param target_name_prefix Character, a prefix to add to the name of the +#' targets created by this target factory. Default value is `""`. +#' @param filtered_set_target_name Character, the name of the final target +#' containing the filtered `MultiDataSet` object. If NULL, a name will +#' automatically be supplied. Default value is `NULL`. +#' @param multilevel Character vector of length 1 or 3 to be used as information +#' about repeated measurements. See [get_input_splsda()] for details. Default +#' value is `NULL` (no repeated measurements). +#' @param seed_perf Named integer vector, the seed to use for the +#' [perf_splsda()] function for each dataset. The length and names should +#' match those of `to_keep_ns` or `to_keep_props`. If not named, the values +#' will be used in order of the datasets in `to_keep_ns` or `to_keep_props`. +#' Default value is `NULL`, i.e. no seed is set. +#' @param seed_run Named integer vector, the seed to use for the +#' [run_splsda()] function for each dataset. The length and names should +#' match those of `to_keep_ns` or `to_keep_props`. If not named, the values +#' will be used in order of the datasets in `to_keep_ns` or `to_keep_props`. +#' Default value is `NULL`, i.e. no seed is set. +#' @param ... Further arguments passed to the [perf_splsda] +#' function. +#' @returns A list of target objects. With `target_name_prefix = ""` and +#' `filtered_set_target_name = NULL`, the following targets are created: +#' * `splsda_spec`: generates a grouped tibble where each row corresponds to one +#' dataset to be filtered, with the columns specifying each dataset name, and +#' associated values from `to_keep_ns` and `to_keep_props`. +#' * `individual_splsda_input`: a dynamic branching target that runs the +#' [get_input_splsda()] function for each dataset. +#' * `individual_splsda_perf`: a dynamic branching target that runs the +#' [perf_splsda()] function for each dataset. 
+#' * `individual_splsda_run`: a dynamic branching target that runs the +#' [run_splsda()] function for each dataset, using the results from +#' `individual_splsda_perf` to guide the number of latent components to +#' construct. +#' * `filtered_set_slpsda`: a target to retain from the original `MultiDataSet` +#' object only features selected in each sPLS-DA run. #' @examples #' \dontrun{ #' ## in the _targets.R @@ -771,6 +823,8 @@ feature_preselection_splsda_factory <- function(mo_data_target, target_name_prefix = "", filtered_set_target_name = NULL, multilevel = NULL, + seed_perf = NULL, + seed_run = NULL, ...) { splsda_spec_name <- paste0(target_name_prefix, "splsda_spec") splsda_input_name <- paste0(target_name_prefix, "individual_splsda_input") @@ -795,13 +849,25 @@ feature_preselection_splsda_factory <- function(mo_data_target, stop("'to_keep_ns' or 'to_keep_props' argument should be named.") } + ## Checking that number of seeds match number of datasets, and names match + ## dataset names + seed_perf <- .clean_seed(seed_perf, dataset_names) + seed_run <- .clean_seed(seed_run, dataset_names) + + list( ## store the splsda specifications (arguments) as a tibble (one row per dataset to prefilter) ## and group it by dataset name so that following targets will be applied to each row in turn targets::tar_target_raw( splsda_spec_name, substitute( - tibble::tibble(dsn = dataset_names, tkn = to_keep_ns, tkp = to_keep_props) |> + tibble::tibble( + dsn = dataset_names, + tkn = to_keep_ns, + tkp = to_keep_props, + sp = seed_perf, + sr = seed_run + ) |> dplyr::group_by(dsn) |> tar_group()), iteration = "group" @@ -818,8 +884,8 @@ feature_preselection_splsda_factory <- function(mo_data_target, ## run the perf function for each row of the specfications dataframe targets::tar_target_raw( splsda_perf_name, - substitute(perf_splsda(splsda_input_target, ...)), - pattern = substitute(map(splsda_input_target)), + substitute(perf_splsda(splsda_input_target, seed = 
splsda_spec_target$sp, ...)), + pattern = substitute(map(splsda_input_target, splsda_spec_target)), iteration = "list" ), @@ -831,7 +897,8 @@ feature_preselection_splsda_factory <- function(mo_data_target, splsda_input_target, perf_res = splsda_perf_target, to_keep_n = splsda_spec_target$tkn, - to_keep_prop = splsda_spec_target$tkp + to_keep_prop = splsda_spec_target$tkp, + seed = splsda_spec_target$sr ) ), pattern = substitute(map(splsda_input_target, splsda_perf_target, splsda_spec_target)), @@ -936,17 +1003,21 @@ plot_feature_preselection_cov <- function(cov_list) { #' Diagnostics plots for sPLS-DA-based feature preselection #' #' Displays the PLS-DA classification performance across different number of -#' latent components for each prefiltered dataset. The classification error rates are computed with different measures -#' (column facets) and different distance metrics (colours). A vertical grey bar represents for each dataset the number -#' of latent components selected for the feature preselection step. In addition, a circle highlights the measure and -#' distance metric used to select the number of latent component. +#' latent components for each prefiltered dataset. The classification error +#' rates are computed with different measures (column facets) and different +#' distance metrics (colours). A vertical grey bar represents for each dataset +#' the number of latent components selected for the feature preselection step. +#' In addition, a circle highlights the measure and distance metric used to +#' select the number of latent component. #' -#' @param perf_splsda_res A list with the result from the \code{\link{perf_splsda}} for each dataset -#' to be filtered. -#' @param measure Which measure(s) should be displayed? Can be one of `"BER"` -#' or `"overall"`. If NULL, all measures will be displayed. Default value is `NULL`. -#' @param distance Which measure(s) should be displayed? Can be one of `"max.dist"`, -#' `"centroids.dist"` or `"mahalanobis.dist"`. 
If NULL, all measures will be displayed. Default value is `NULL`. +#' @param perf_splsda_res A list with the result from the +#' [perf_splsda()] function for each dataset to be filtered. +#' @param measure Which measure(s) should be displayed? Can be one of `"BER"` or +#' `"overall"`. If NULL, all measures will be displayed. Default value is +#' `NULL`. +#' @param distance Which distance metric(s) should be displayed? Can be one of +#' `"max.dist"`, `"centroids.dist"` or `"mahalanobis.dist"`. If NULL, all +#' distance metrics will be displayed. Default value is `NULL`. #' @return A ggplot. #' @export plot_feature_preselection_splsda <- function(perf_splsda_res, @@ -1026,3 +1097,34 @@ plot_feature_preselection_splsda <- function(perf_splsda_res, return(res_plot) } + +#' Clean seed for sPLS-DA preselection factory +#' +#' Checks seed input arguments for the [feature_preselection_splsda_factory()] function. +#' +#' @param x Integer vector of seeds to use. +#' @param ds Character vector of dataset names. +#' @returns `x` with names if it didn't have them or an error. +.clean_seed <- function(x, ds) { + x_name <- deparse(substitute(x)) + x_name <- paste0("`", x_name, "`") + if (!is.null(x)) { + if (length(x) != length(ds)) { + stop( + x_name, + " should be an integer vector with same length as `to_keep_ns` or `to_keep_props`.", + call. 
= FALSE + ) + } + if (is.null(names(x))) names(x) <- ds + .check_names( + names(x), + ds, + paste0(x_name, " names do not match `to_keep_ns` or `to_keep_props` names.") + ) + + x <- x[ds] + } + + x +} diff --git a/man/dot-clean_seed.Rd b/man/dot-clean_seed.Rd new file mode 100644 index 0000000..01972c3 --- /dev/null +++ b/man/dot-clean_seed.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/prefiltering.R +\name{.clean_seed} +\alias{.clean_seed} +\title{Clean seed for sPLS-DA preselection factory} +\usage{ +.clean_seed(x, ds) +} +\arguments{ +\item{x}{Integer vector of seeds to use.} + +\item{ds}{Character vector of dataset names.} +} +\value{ +\code{x} with names if it didn't have them or an error. +} +\description{ +Checks seed input arguments for the \code{\link[=feature_preselection_splsda_factory]{feature_preselection_splsda_factory()}} function. +} diff --git a/man/feature_preselection_splsda_factory.Rd b/man/feature_preselection_splsda_factory.Rd index 65e3a31..1dbf0aa 100644 --- a/man/feature_preselection_splsda_factory.Rd +++ b/man/feature_preselection_splsda_factory.Rd @@ -12,51 +12,77 @@ feature_preselection_splsda_factory( target_name_prefix = "", filtered_set_target_name = NULL, multilevel = NULL, + seed_perf = NULL, + seed_run = NULL, ... ) } \arguments{ -\item{mo_data_target}{Symbol, the name of the target containing the \code{MultiDataSet} object.} +\item{mo_data_target}{Symbol, the name of the target containing the +\code{MultiDataSet} object.} -\item{group}{Character, the column name in the samples information data-frame to use as samples group.} +\item{group}{Character, the column name in the samples information data-frame +to use as samples group.} -\item{to_keep_ns}{Named integer vector, the number of feature to retain in each dataset to be prefiltered -(names should correspond to a dataset name). Value should be less than the number of features in the -corresponding dataset. 
Set to \code{NULL} in order to use \code{to_keep_props} instead.} +\item{to_keep_ns}{Named integer vector, the number of feature to retain in +each dataset to be prefiltered (names should correspond to a dataset name). +Value should be less than the number of features in the corresponding +dataset. Set to \code{NULL} in order to use \code{to_keep_props} instead.} -\item{to_keep_props}{Named numeric vector, the proportion of features to retain in each dataset -to be prefiltered (names should correspond to a dataset name). Value should be > 0 and < 1. -Will be ignored if \code{to_keep_ns} is not \code{NULL}.} +\item{to_keep_props}{Named numeric vector, the proportion of features to +retain in each dataset to be prefiltered (names should correspond to a +dataset name). Value should be > 0 and < 1. Will be ignored if \code{to_keep_ns} +is not \code{NULL}.} -\item{target_name_prefix}{Character, a prefix to add to the name of the targets created by this target factory. -Default value is \code{""}.} +\item{target_name_prefix}{Character, a prefix to add to the name of the +targets created by this target factory. Default value is \code{""}.} -\item{filtered_set_target_name}{Character, the name of the final target containing the filtered \code{MultiDataSet} object. -If NULL, a name will automatically be supplied. Default value is \code{NULL}.} +\item{filtered_set_target_name}{Character, the name of the final target +containing the filtered \code{MultiDataSet} object. If NULL, a name will +automatically be supplied. Default value is \code{NULL}.} -\item{multilevel}{Character vector of length 1 or 3 to be used as information about repeated measurements. -See \code{\link[=get_input_splsda]{get_input_splsda()}} for details. Default value is \code{NULL} (no repeated measurements).} +\item{multilevel}{Character vector of length 1 or 3 to be used as information +about repeated measurements. See \code{\link[=get_input_splsda]{get_input_splsda()}} for details. 
Default +value is \code{NULL} (no repeated measurements).} -\item{...}{Further arguments passed to the \code{\link{perf_splsda}} function.} +\item{seed_perf}{Named integer vector, the seed to use for the +\code{\link[=perf_splsda]{perf_splsda()}} function for each dataset. The length and names should +match those of \code{to_keep_ns} or \code{to_keep_props}. If not named, the values +will be used in order of the datasets in \code{to_keep_ns} or \code{to_keep_props}. +Default value is \code{NULL}, i.e. no seed is set.} +\item{seed_run}{Named integer vector, the seed to use for the +\code{\link[=run_splsda]{run_splsda()}} function for each dataset. The length and names should +match those of \code{to_keep_ns} or \code{to_keep_props}. If not named, the values +will be used in order of the datasets in \code{to_keep_ns} or \code{to_keep_props}. +Default value is \code{NULL}, i.e. no seed is set.} + +\item{...}{Further arguments passed to the \link{perf_splsda} +function.} } \value{ -A list of target objects. With \code{target_name_prefix = ""} and \code{filtered_set_target_name = NULL}, -the following targets are created: +A list of target objects. With \code{target_name_prefix = ""} and +\code{filtered_set_target_name = NULL}, the following targets are created: \itemize{ -\item \code{splsda_spec}: generates a grouped tibble where each row corresponds to one dataset to be filtered, -with the columns specifying each dataset name, and associated values from \code{to_keep_ns} and \code{to_keep_props}. +\item \code{splsda_spec}: generates a grouped tibble where each row corresponds to one +dataset to be filtered, with the columns specifying each dataset name, and +associated values from \code{to_keep_ns} and \code{to_keep_props}. \itemize{ -\item \code{individual_splsda_input}: a dynamic branching target that runs the \code{\link[=get_input_splsda]{get_input_splsda()}} function for each dataset. 
+\item \code{individual_splsda_input}: a dynamic branching target that runs the +\code{\link[=get_input_splsda]{get_input_splsda()}} function for each dataset. } -\item \code{individual_splsda_perf}: a dynamic branching target that runs the \code{\link[=perf_splsda]{perf_splsda()}} function for each dataset. -\item \code{individual_splsda_run}: a dynamic branching target that runs the \code{\link[=run_splsda]{run_splsda()}} function for each dataset, -using the results from \code{individual_splsda_perf} to guide the number of latent components to construct. -\item \code{filtered_set_slpsda}: a target to retain from the original \code{MultiDataSet} object only features selected in each sPLS-DA run. +\item \code{individual_splsda_perf}: a dynamic branching target that runs the +\code{\link[=perf_splsda]{perf_splsda()}} function for each dataset. +\item \code{individual_splsda_run}: a dynamic branching target that runs the +\code{\link[=run_splsda]{run_splsda()}} function for each dataset, using the results from +\code{individual_splsda_perf} to guide the number of latent components to +construct. +\item \code{filtered_set_slpsda}: a target to retain from the original \code{MultiDataSet} +object only features selected in each sPLS-DA run. } } \description{ -Creates a list of targets to perform feature preselection on datasets from a \code{MultiDataSet} -object with sPLS-DA (from the \code{mixOmics} package). +Creates a list of targets to perform feature preselection on datasets from a +\code{MultiDataSet} object with sPLS-DA (from the \code{mixOmics} package). 
} \examples{ \dontrun{ diff --git a/man/perf_splsda.Rd b/man/perf_splsda.Rd index 9169b09..fe9de11 100644 --- a/man/perf_splsda.Rd +++ b/man/perf_splsda.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/prefiltering.R \name{perf_splsda} \alias{perf_splsda} -\title{Assess optimal number of components for sPLS-DA on omics dataset from MultiDataSet object} +\title{Assess optimal number of components for sPLS-DA on omics dataset from +MultiDataSet object} \usage{ perf_splsda( splsda_input, @@ -13,53 +14,69 @@ perf_splsda( measure = "BER", distance = "centroids.dist", cpus = 1, - progressBar = TRUE + progressBar = TRUE, + seed = NULL ) } \arguments{ -\item{splsda_input}{Input for the sPLS-DA functions from mixOmics, created with \code{\link[=get_input_splsda]{get_input_splsda()}}.} +\item{splsda_input}{Input for the sPLS-DA functions from mixOmics, created +with \code{\link[=get_input_splsda]{get_input_splsda()}}.} -\item{ncomp_max}{Integer, the maximum number of latent components to test when estimating the number of -latent components to use. Default value is \code{5}.} +\item{ncomp_max}{Integer, the maximum number of latent components to test +when estimating the number of latent components to use. Default value is +\code{5}.} -\item{validation}{Character, which cross-validation method to use, can be one of \code{"Mfold"} or \code{"loo"} -(see \code{\link[mixOmics:perf]{mixOmics::perf()}}). Default value is \code{"Mfold"}.} +\item{validation}{Character, which cross-validation method to use, can be one +of \code{"Mfold"} or \code{"loo"} (see \code{\link[mixOmics:perf]{mixOmics::perf()}}). Default value is +\code{"Mfold"}.} -\item{folds}{Integer, number of folds to use in the M-fold cross-validation (see \code{\link[mixOmics:perf]{mixOmics::perf()}}). -Default value is 5.} +\item{folds}{Integer, number of folds to use in the M-fold cross-validation +(see \code{\link[mixOmics:perf]{mixOmics::perf()}}). 
Default value is 5.} -\item{nrepeat}{Integer, number of times the cross-validation is repeated (see \code{\link[mixOmics:perf]{mixOmics::perf()}}).} +\item{nrepeat}{Integer, number of times the cross-validation is repeated (see +\code{\link[mixOmics:perf]{mixOmics::perf()}}).} -\item{measure}{Performance measure used to select the optimal value of \code{ncomp}, can be one of \code{"BER"} or \code{"overall"} -(see \code{\link[mixOmics:perf]{mixOmics::perf()}}). +\item{measure}{Performance measure used to select the optimal value of +\code{ncomp}, can be one of \code{"BER"} or \code{"overall"} (see \code{\link[mixOmics:perf]{mixOmics::perf()}}). Default value is \code{"BER"}.} -\item{distance}{Distance metric used to select the optimal value of \code{ncomp}, can be one of \code{"max.dist"}, -\code{"centroids.dist"} or \code{"mahalanobis.dist"} (see \code{\link[mixOmics:perf]{mixOmics::perf()}}). Default value is \code{"centroids.dist"}.} +\item{distance}{Distance metric used to select the optimal value of \code{ncomp}, +can be one of \code{"max.dist"}, \code{"centroids.dist"} or \code{"mahalanobis.dist"} (see +\code{\link[mixOmics:perf]{mixOmics::perf()}}). Default value is \code{"centroids.dist"}.} \item{cpus}{Integer, number of cpus to use.} -\item{progressBar}{Logical, whether to display a progress bar during the optimisation of \code{ncomp}. Default -value is \code{TRUE}.} +\item{progressBar}{Logical, whether to display a progress bar during the +optimisation of \code{ncomp}. Default value is \code{TRUE}.} + +\item{seed}{Integer, seed to use. Default is \code{NULL}, i.e. 
no seed is set +inside the function.} } \value{ -A list as per the output of the \code{\link[mixOmics:perf]{mixOmics::perf()}} function, with the following additional elements: +A list as per the output of the \code{\link[mixOmics:perf]{mixOmics::perf()}} function, with +the following additional elements: \itemize{ \item \code{dataset_name}: the name of the dataset analysed; -\item \code{group}: column name in the samples information data-frame used as samples group; -\item \code{optim_ncomp}: the optimal number of latent components as per the \code{measure} and \code{distance} specified; -\item \code{optim_measure}: the measure used to select the optimal number of latent components; -\item \code{optim_distance}: the distance metric used to select the optimal number of latent components. +\item \code{group}: column name in the samples information data-frame used as samples +group; +\item \code{optim_ncomp}: the optimal number of latent components as per the \code{measure} +and \code{distance} specified; +\item \code{optim_measure}: the measure used to select the optimal number of latent +components; +\item \code{optim_distance}: the distance metric used to select the optimal number of +latent components. +In addition, the name of the dataset analysed and the column name in the +samples information data-frame used as samples group are stored as attributes +\code{dataset_name} and \code{group}, respectively. } -In addition, the name of the dataset analysed and the column name in the samples information data-frame -used as samples group as stored as attributes \code{dataset_name} and \code{group}, respectively. } \description{ -Performs cross-validation for a PLS-DA run (implemented in the \code{mixOmics} package) on an omics dataset from a -\code{MultiDataSet} object. This allows to estimate the optimal number of latent components to construct. -This is intended for feature preselection in the omics dataset (see examples below). 
+Performs cross-validation for a PLS-DA run (implemented in the \code{mixOmics} +package) on an omics dataset from a \code{MultiDataSet} object. This allows to +estimate the optimal number of latent components to construct. This is +intended for feature preselection in the omics dataset (see examples below). } \details{ -This function uses the \code{\link[mixOmics]{plsda}} and \code{\link[mixOmics]{perf}} -function from the \code{mixOmics} package. +This function uses the \code{\link[mixOmics:plsda]{mixOmics::plsda()}} and \code{\link[mixOmics:perf]{mixOmics::perf()}} functions +from the \code{mixOmics} package. } diff --git a/man/plot_feature_preselection_splsda.Rd b/man/plot_feature_preselection_splsda.Rd index ab4528c..a3ae92e 100644 --- a/man/plot_feature_preselection_splsda.Rd +++ b/man/plot_feature_preselection_splsda.Rd @@ -11,22 +11,26 @@ plot_feature_preselection_splsda( ) } \arguments{ -\item{perf_splsda_res}{A list with the result from the \code{\link{perf_splsda}} for each dataset -to be filtered.} +\item{perf_splsda_res}{A list with the result from the +\link{perf_splsda} for each dataset to be filtered.} -\item{measure}{Which measure(s) should be displayed? Can be one of \code{"BER"} -or \code{"overall"}. If NULL, all measures will be displayed. Default value is \code{NULL}.} +\item{measure}{Which measure(s) should be displayed? Can be one of \code{"BER"} or +\code{"overall"}. If NULL, all measures will be displayed. Default value is +\code{NULL}.} -\item{distance}{Which measure(s) should be displayed? Can be one of \code{"max.dist"}, -\code{"centroids.dist"} or \code{"mahalanobis.dist"}. If NULL, all measures will be displayed. Default value is \code{NULL}.} +\item{distance}{Which distance metric(s) should be displayed? Can be one of +\code{"max.dist"}, \code{"centroids.dist"} or \code{"mahalanobis.dist"}. If NULL, all +distance metrics will be displayed. Default value is \code{NULL}.} } \value{ A ggplot. 
} \description{ Displays the PLS-DA classification performance across different number of -latent components for each prefiltered dataset. The classification error rates are computed with different measures -(column facets) and different distance metrics (colours). A vertical grey bar represents for each dataset the number -of latent components selected for the feature preselection step. In addition, a circle highlights the measure and -distance metric used to select the number of latent component. +latent components for each prefiltered dataset. The classification error +rates are computed with different measures (column facets) and different +distance metrics (colours). A vertical grey bar represents for each dataset +the number of latent components selected for the feature preselection step. +In addition, a circle highlights the measure and distance metric used to +select the number of latent components. } diff --git a/man/run_splsda.Rd b/man/run_splsda.Rd index acf460e..9cd6e6b 100644 --- a/man/run_splsda.Rd +++ b/man/run_splsda.Rd @@ -9,34 +9,43 @@ run_splsda( perf_res, to_keep_n = NULL, to_keep_prop = NULL, - ncomp = NULL + ncomp = NULL, + seed = NULL ) } \arguments{ -\item{splsda_input}{Input for the sPLS-DA functions from mixOmics, created with \code{\link[=get_input_splsda]{get_input_splsda()}}.} +\item{splsda_input}{Input for the sPLS-DA functions from mixOmics, created +with \code{\link[=get_input_splsda]{get_input_splsda()}}.} -\item{perf_res}{Result of the \code{\link{perf_splsda}} function. If not supplied, sPLS-DA will be run on -dataset specified by argument \code{dataset_name} with number of latent components specified by argument \code{comp}.} +\item{perf_res}{Result of the \code{\link[=perf_splsda]{perf_splsda()}} function. 
If not supplied, +sPLS-DA will be run on dataset specified by argument \code{dataset_name} with +number of latent components specified by argument \code{ncomp}.} -\item{to_keep_n}{Integer, the number of features to retain in the dataset. Should be less than the number of -features in the dataset. If \code{NULL} or \code{NA}, \code{to_keep_prop} will be used instead.} +\item{to_keep_n}{Integer, the number of features to retain in the dataset. +Should be less than the number of features in the dataset. If \code{NULL} or +\code{NA}, \code{to_keep_prop} will be used instead.} -\item{to_keep_prop}{Numeric, the proportion of features to retain in the dataset. Will be ignored if \code{to_keep_n} -is supplied. Value should be > 0 and < 1.} +\item{to_keep_prop}{Numeric, the proportion of features to retain in the +dataset. Will be ignored if \code{to_keep_n} is supplied. Value should be > 0 +and < 1.} -\item{ncomp}{Integer, number of latent components to construct. Ignored if \code{perf_res} is supplied. -Default value is \code{NULL}.} +\item{ncomp}{Integer, number of latent components to construct. Ignored if +\code{perf_res} is supplied. Default value is \code{NULL}.} + +\item{seed}{Integer, seed to use. Default is \code{NULL}, i.e. no seed is set +inside the function.} } \value{ -A list as per the output of the \code{\link[mixOmics]{splsda}} function. +A list as per the output of the \code{\link[mixOmics:splsda]{mixOmics::splsda()}} function. } \description{ -Performs a sPLS-DA (implemented in the \code{mixOmics}) package on a omics dataset from a -MultiDataSet object. This is intended for feature preselection in the omics dataset -(see \code{\link{get_filtered_dataset_splsda}}). +Performs a sPLS-DA (implemented in the \code{mixOmics} package) on an omics dataset +from a MultiDataSet object. This is intended for feature preselection in the +omics dataset (see \code{\link[=get_filtered_dataset_splsda]{get_filtered_dataset_splsda()}}). 
} \details{ -This function uses the \code{\link[mixOmics]{plsda}} function from the \code{mixOmics} package. -Note that the sPLS-DA method can select the same feature for several latent components, so the number of -features retained for a dataset might be less than the number specified in the \code{to_keep_n} argument. +This function uses the \code{\link[mixOmics:plsda]{mixOmics::plsda()}} function from the \code{mixOmics} +package. Note that the sPLS-DA method can select the same feature for several +latent components, so the number of features retained for a dataset might be +less than the number specified in the \code{to_keep_n} argument. } diff --git a/tests/testthat/test-prefiltering.R b/tests/testthat/test-prefiltering.R index e89bf26..1524bab 100644 --- a/tests/testthat/test-prefiltering.R +++ b/tests/testthat/test-prefiltering.R @@ -21,7 +21,7 @@ test_that("feature_preselection_splsda_factory works", { ## whitespace error I cannot fix expect_equal( tar_res[[1]]$command$expr |> deparse() |> paste0(collapse = ""), - "expression(tar_group(dplyr::group_by(tibble::tibble(dsn = c(\"snps+A\", \"metabolome\"), tkn = list(50, 30), tkp = NULL), dsn)))" + "expression(tar_group(dplyr::group_by(tibble::tibble(dsn = c(\"snps+A\", \"metabolome\"), tkn = list(50, 30), tkp = NULL, sp = NULL, sr = NULL), dsn)))" ) # Testing targets command @@ -31,9 +31,10 @@ test_that("feature_preselection_splsda_factory works", { }), list( expression(get_input_splsda(mo_data, splsda_spec$dsn, "pheno_group", NULL)), - expression(perf_splsda(individual_splsda_input)), + expression(perf_splsda(individual_splsda_input, seed = splsda_spec$sp)), expression(run_splsda(individual_splsda_input, perf_res = individual_splsda_perf, - to_keep_n = splsda_spec$tkn, to_keep_prop = splsda_spec$tkp)), + to_keep_n = splsda_spec$tkn, to_keep_prop = splsda_spec$tkp, + seed = splsda_spec$sr)), expression(get_filtered_dataset_splsda(mo_data, individual_splsda_run)) ) ) @@ -46,7 +47,7 @@ 
test_that("feature_preselection_splsda_factory works", { list( NULL, expression(map(splsda_spec)), - expression(map(individual_splsda_input)), + expression(map(individual_splsda_input, splsda_spec)), expression(map(individual_splsda_input, individual_splsda_perf, splsda_spec)), NULL ) @@ -106,7 +107,7 @@ test_that("feature_preselection_splsda_factory works", { ) expect_equal( tar_res3[[3]]$command$expr, - expression(perf_splsda(individual_splsda_input, ncomp_max = 10, folds = 3)) + expression(perf_splsda(individual_splsda_input, seed = splsda_spec$sp, ncomp_max = 10, folds = 3)) ) ## Testing custom complete target name + custom prefix for target names @@ -125,3 +126,41 @@ test_that("feature_preselection_splsda_factory works", { "test_individual_splsda_run", "final_test") ) }) + +test_that("feature_preselection_splsda_factory works (seed)", { + tar_res <- feature_preselection_splsda_factory( + mo_data, + group = "pheno_group", + to_keep_ns = list("snps+A" = 50, "metabolome" = 30) + ) + + expect_error( + feature_preselection_splsda_factory( + mo_data, + group = "pheno_group", + to_keep_ns = list("snps+A" = 50, "metabolome" = 30), + seed_perf = 1 + ), + "`seed_perf` should be an integer vector with same length as `to_keep_ns` or `to_keep_props`." + ) + + expect_error( + feature_preselection_splsda_factory( + mo_data, + group = "pheno_group", + to_keep_ns = list("snps+A" = 50, "metabolome" = 30), + seed_perf = c("test" = 1, "rnaseq" = 2) + ), + "`seed_perf` names do not match `to_keep_ns` or `to_keep_props` names." + ) + + expect_no_error( + feature_preselection_splsda_factory( + mo_data, + group = "pheno_group", + to_keep_ns = list("snps+A" = 50, "metabolome" = 30), + seed_perf = c(1, 2) + ) + ) + +})