From 8563a8af84ccff8febc42df1da8a7545ce3ed4e5 Mon Sep 17 00:00:00 2001
From: oliviaAB <olivia.angelinbonnet@gmail.com>
Date: Wed, 24 Jul 2024 15:11:42 +1200
Subject: [PATCH] Added seed argument to perf_splsda and run_splsda

---
 DESCRIPTION                                |   2 +-
 NEWS.md                                    |   2 +
 R/prefiltering.R                           | 282 ++++++++++++++-------
 man/dot-clean_seed.Rd                      |  19 ++
 man/feature_preselection_splsda_factory.Rd |  78 ++++--
 man/perf_splsda.Rd                         |  73 ++++--
 man/plot_feature_preselection_splsda.Rd    |  24 +-
 man/run_splsda.Rd                          |  43 ++--
 tests/testthat/test-prefiltering.R         |  49 +++-
 9 files changed, 395 insertions(+), 177 deletions(-)
 create mode 100644 man/dot-clean_seed.Rd

diff --git a/DESCRIPTION b/DESCRIPTION
index b8a6a7c..5187ad8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -17,7 +17,7 @@ BugReports: https://github.com/Plant-Food-Research-Open/moiraine/issues
 License: MIT + file LICENSE
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Imports: 
     Biobase,
     circlize,
diff --git a/NEWS.md b/NEWS.md
index 63a75eb..df478c7 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -7,3 +7,5 @@
 - `where()` function now imported from tidyselect instead of dplyr (as it required a newer version of dplyr).
 
 - Fixed typo in samples metadata file, samples with no value for "rnaseq_batch" variable now have `NA` rather than `"BNA"` values. 
+
+- `perf_splsda()` and `run_splsda()` now have a `seed` argument to set the random seed inside the function, for reproducible results. Accordingly, `feature_preselection_splsda_factory()` now has arguments `seed_perf` and `seed_run` to pass on seeds to `perf_splsda()` and `run_splsda()`.
diff --git a/R/prefiltering.R b/R/prefiltering.R
index 15fdb8c..1559085 100644
--- a/R/prefiltering.R
+++ b/R/prefiltering.R
@@ -500,41 +500,54 @@ get_input_splsda <- function(mo_data, dataset_name, group, multilevel = NULL) {
 }
 
 
-#' Assess optimal number of components for sPLS-DA on omics dataset from MultiDataSet object
+#' Assess optimal number of components for sPLS-DA on omics dataset from
+#' MultiDataSet object
 #'
-#' Performs cross-validation for a PLS-DA run (implemented in the `mixOmics` package) on an omics dataset from a
-#' `MultiDataSet` object. This allows to estimate the optimal number of latent components to construct.
-#' This is intended for feature preselection in the omics dataset (see examples below).
+#' Performs cross-validation for a PLS-DA run (implemented in the `mixOmics`
+#' package) on an omics dataset from a `MultiDataSet` object, in order to
+#' estimate the optimal number of latent components to construct. This is
+#' intended for feature preselection in the omics dataset (see examples below).
 #'
-#' This function uses the \code{\link[mixOmics]{plsda}} and  \code{\link[mixOmics]{perf}}
-#' function from the `mixOmics` package.
+#' This function uses the [mixOmics::plsda()] and [mixOmics::perf()] functions
+#' from the `mixOmics` package.
 #'
-#' @param splsda_input Input for the sPLS-DA functions from mixOmics, created with [get_input_splsda()].
-#' @param ncomp_max Integer, the maximum number of latent components to test when estimating the number of
-#' latent components to use. Default value is `5`.
-#' @param validation Character, which cross-validation method to use, can be one of `"Mfold"` or `"loo"`
-#' (see [mixOmics::perf()]). Default value is `"Mfold"`.
-#' @param folds Integer, number of folds to use in the M-fold cross-validation (see [mixOmics::perf()]).
-#' Default value is 5.
-#' @param nrepeat Integer, number of times the cross-validation is repeated (see [mixOmics::perf()]).
-#' @param measure Performance measure used to select the optimal value of `ncomp`, can be one of `"BER"` or `"overall"`
-#' (see [mixOmics::perf()]).
-#' Default value is `"BER"`.
-#' @param distance Distance metric used to select the optimal value of `ncomp`, can be one of `"max.dist"`,
-#' `"centroids.dist"` or `"mahalanobis.dist"` (see [mixOmics::perf()]). Default value is `"centroids.dist"`.
+#' @param splsda_input Input for the sPLS-DA functions from mixOmics, created
+#'   with [get_input_splsda()].
+#' @param ncomp_max Integer, the maximum number of latent components to test
+#'   when estimating the number of latent components to use. Default value is
+#'   `5`.
+#' @param validation Character, which cross-validation method to use, can be one
+#'   of `"Mfold"` or `"loo"` (see [mixOmics::perf()]). Default value is
+#'   `"Mfold"`.
+#' @param folds Integer, number of folds to use in the M-fold cross-validation
+#'   (see [mixOmics::perf()]). Default value is 5.
+#' @param nrepeat Integer, number of times the cross-validation is repeated (see
+#'   [mixOmics::perf()]).
+#' @param measure Performance measure used to select the optimal value of
+#'   `ncomp`, can be one of `"BER"` or `"overall"` (see [mixOmics::perf()]).
+#'   Default value is `"BER"`.
+#' @param distance Distance metric used to select the optimal value of `ncomp`,
+#'   can be one of `"max.dist"`, `"centroids.dist"` or `"mahalanobis.dist"` (see
+#'   [mixOmics::perf()]). Default value is `"centroids.dist"`.
 #' @param cpus Integer, number of cpus to use.
-#' @param progressBar Logical, whether to display a progress bar during the optimisation of `ncomp`. Default
-#' value is `TRUE`.
-#' @return A list as per the output of the [mixOmics::perf()] function, with the following additional elements:
-#' \itemize{
-#' \item `dataset_name`: the name of the dataset analysed;
-#' \item `group`: column name in the samples information data-frame used as samples group;
-#' \item `optim_ncomp`: the optimal number of latent components as per the `measure` and `distance` specified;
-#' \item `optim_measure`: the measure used to select the optimal number of latent components;
-#' \item `optim_distance`: the distance metric used to select the optimal number of latent components.
-#' }
-#' In addition, the name of the dataset analysed and the column name in the samples information data-frame
-#' used as samples group as stored as attributes `dataset_name` and `group`, respectively.
+#' @param progressBar Logical, whether to display a progress bar during the
+#'   optimisation of `ncomp`. Default value is `TRUE`.
+#' @param seed Integer, seed to use. Default is `NULL`, i.e. no seed is set
+#'   inside the function.
+#' @returns A list as per the output of the [mixOmics::perf()] function, with
+#'   the following additional elements:
+#' * `dataset_name`: the name of the dataset analysed;
+#' * `group`: column name in the samples information data-frame used as samples
+#'    group;
+#' * `optim_ncomp`: the optimal number of latent components as per the `measure`
+#'    and `distance` specified;
+#' * `optim_measure`: the measure used to select the optimal number of latent
+#'    components;
+#' * `optim_distance`: the distance metric used to select the optimal number of
+#'    latent components.
+#' In addition, the name of the dataset analysed and the column name in the
+#' samples information data-frame used as samples group are stored as attributes
+#' `dataset_name` and `group`, respectively.
 #' @export
 perf_splsda <- function(splsda_input,
                         ncomp_max = 5,
@@ -544,11 +557,14 @@ perf_splsda <- function(splsda_input,
                         measure = "BER",
                         distance = "centroids.dist",
                         cpus = 1,
-                        progressBar = TRUE) {
+                        progressBar = TRUE,
+                        seed = NULL) {
 
   dataset_name <- setdiff(names(splsda_input), "Y")
   multilevel <- attr(splsda_input, "multilevel")
 
+  if (!is.null(seed)) set.seed(seed)
+
   ## Run the PLS-DA with several latent components
   plsda_res <- mixOmics::plsda(
     splsda_input[[dataset_name]],
@@ -587,27 +603,40 @@ perf_splsda <- function(splsda_input,
 
 #' Performs sPLS-DA on omics dataset from MultiDataSet object
 #'
-#' Performs a sPLS-DA (implemented in the `mixOmics`) package on a omics dataset from a
-#' MultiDataSet object. This is intended for feature preselection in the omics dataset
-#' (see \code{\link{get_filtered_dataset_splsda}}).
+#' Performs a sPLS-DA (implemented in the `mixOmics` package) on an omics
+#' dataset from a MultiDataSet object. This is intended for feature preselection
+#' in the omics dataset (see [get_filtered_dataset_splsda()]).
 #'
-#' This function uses the \code{\link[mixOmics]{plsda}} function from the `mixOmics` package.
-#' Note that the sPLS-DA method can select the same feature for several latent components, so the number of
-#' features retained for a dataset might be less than the number specified in the `to_keep_n` argument.
+#' This function uses the [mixOmics::plsda()] function from the `mixOmics`
+#' package. Note that the sPLS-DA method can select the same feature for several
+#' latent components, so the number of features retained for a dataset might be
+#' less than the number specified in the `to_keep_n` argument.
 #'
-#' @param splsda_input Input for the sPLS-DA functions from mixOmics, created with [get_input_splsda()].
-#' @param perf_res Result of the \code{\link{perf_splsda}} function. If not supplied, sPLS-DA will be run on
-#' dataset specified by argument `dataset_name` with number of latent components specified by argument `comp`.
-#' @param to_keep_n Integer, the number of features to retain in the dataset. Should be less than the number of
-#' features in the dataset. If `NULL` or `NA`, `to_keep_prop` will be used instead.
-#' @param to_keep_prop Numeric, the proportion of features to retain in the dataset. Will be ignored if `to_keep_n`
-#' is supplied. Value should be > 0 and < 1.
-#' @param ncomp Integer, number of latent components to construct. Ignored if `perf_res` is supplied.
-#' Default value is `NULL`.
-#' @return A list as per the output of the \code{\link[mixOmics]{splsda}} function.
+#' @param splsda_input Input for the sPLS-DA functions from mixOmics, created
+#'   with [get_input_splsda()].
+#' @param perf_res Result of the [perf_splsda()] function. If not supplied,
+#'   sPLS-DA will be run on the dataset in `splsda_input` with the number of
+#'   latent components specified by argument `ncomp`.
+#' @param to_keep_n Integer, the number of features to retain in the dataset.
+#'   Should be less than the number of features in the dataset. If `NULL` or
+#'   `NA`, `to_keep_prop` will be used instead.
+#' @param to_keep_prop Numeric, the proportion of features to retain in the
+#'   dataset. Will be ignored if `to_keep_n` is supplied. Value should be > 0
+#'   and < 1.
+#' @param ncomp Integer, number of latent components to construct. Ignored if
+#'   `perf_res` is supplied. Default value is `NULL`.
+#' @param seed Integer, seed to use. Default is `NULL`, i.e. no seed is set
+#'   inside the function.
+#' @returns A list as per the output of the [mixOmics::splsda()] function.
 #' @export
-run_splsda <- function(splsda_input, perf_res, to_keep_n = NULL, to_keep_prop = NULL, ncomp = NULL) {
-
+run_splsda <- function(splsda_input,
+                       perf_res,
+                       to_keep_n = NULL,
+                       to_keep_prop = NULL,
+                       ncomp = NULL,
+                       seed = NULL) {
+
+  if (!is.null(seed)) set.seed(seed)
   if (!missing(perf_res)) ncomp <- perf_res$optim_ncomp
 
   dataset_name <- setdiff(names(splsda_input), "Y")
@@ -703,33 +732,56 @@ get_filtered_dataset_splsda <- function(mo_data,
 
 #' Target factory for feature preselection based on sPLS-DA
 #'
-#' Creates a list of targets to perform feature preselection on datasets from a `MultiDataSet`
-#' object with sPLS-DA (from the `mixOmics` package).
+#' Creates a list of targets to perform feature preselection on datasets from a
+#' `MultiDataSet` object with sPLS-DA (from the `mixOmics` package).
 #'
-#' @param mo_data_target Symbol, the name of the target containing the `MultiDataSet` object.
-#' @param group Character, the column name in the samples information data-frame to use as samples group.
-#' @param to_keep_ns Named integer vector, the number of feature to retain in each dataset to be prefiltered
-#' (names should correspond to a dataset name). Value should be less than the number of features in the
-#' corresponding dataset. Set to `NULL` in order to use `to_keep_props` instead.
-#' @param to_keep_props Named numeric vector, the proportion of features to retain in each dataset
-#' to be prefiltered (names should correspond to a dataset name). Value should be > 0 and < 1.
-#' Will be ignored if `to_keep_ns` is not `NULL`.
-#' @param target_name_prefix Character, a prefix to add to the name of the targets created by this target factory.
-#' Default value is `""`.
-#' @param filtered_set_target_name Character, the name of the final target containing the filtered `MultiDataSet` object.
-#' If NULL, a name will automatically be supplied. Default value is `NULL`.
-#' @param multilevel Character vector of length 1 or 3 to be used as information about repeated measurements.
-#' See [get_input_splsda()] for details. Default value is `NULL` (no repeated measurements).
-#' @param ... Further arguments passed to the \code{\link{perf_splsda}} function.
-#' @return A list of target objects. With `target_name_prefix = ""` and `filtered_set_target_name = NULL`,
-#' the following targets are created:
-#' * `splsda_spec`: generates a grouped tibble where each row corresponds to one dataset to be filtered,
-#'   with the columns specifying each dataset name, and associated values from `to_keep_ns` and `to_keep_props`.
-#'   * `individual_splsda_input`: a dynamic branching target that runs the [get_input_splsda()] function for each dataset.
-#' * `individual_splsda_perf`: a dynamic branching target that runs the [perf_splsda()] function for each dataset.
-#' * `individual_splsda_run`: a dynamic branching target that runs the [run_splsda()] function for each dataset,
-#'   using the results from `individual_splsda_perf` to guide the number of latent components to construct.
-#' * `filtered_set_slpsda`: a target to retain from the original `MultiDataSet` object only features selected in each sPLS-DA run.
+#' @param mo_data_target Symbol, the name of the target containing the
+#'   `MultiDataSet` object.
+#' @param group Character, the column name in the samples information data-frame
+#'   to use as samples group.
+#' @param to_keep_ns Named integer vector, the number of features to retain in
+#'   each dataset to be prefiltered (names should correspond to a dataset name).
+#'   Value should be less than the number of features in the corresponding
+#'   dataset. Set to `NULL` in order to use `to_keep_props` instead.
+#' @param to_keep_props Named numeric vector, the proportion of features to
+#'   retain in each dataset to be prefiltered (names should correspond to a
+#'   dataset name). Value should be > 0 and < 1. Will be ignored if `to_keep_ns`
+#'   is not `NULL`.
+#' @param target_name_prefix Character, a prefix to add to the name of the
+#'   targets created by this target factory. Default value is `""`.
+#' @param filtered_set_target_name Character, the name of the final target
+#'   containing the filtered `MultiDataSet` object. If NULL, a name will
+#'   automatically be supplied. Default value is `NULL`.
+#' @param multilevel Character vector of length 1 or 3 to be used as information
+#'   about repeated measurements. See [get_input_splsda()] for details. Default
+#'   value is `NULL` (no repeated measurements).
+#' @param seed_perf Named integer vector, the seed to use for the
+#'   [perf_splsda()] function for each dataset. The length and names should
+#'   match those of `to_keep_ns` or `to_keep_props`. If not named, the values
+#'   will be used in order of the datasets in `to_keep_ns` or `to_keep_props`.
+#'   Default value is `NULL`, i.e. no seed is set.
+#' @param seed_run Named integer vector, the seed to use for the
+#'   [run_splsda()] function for each dataset. The length and names should
+#'   match those of `to_keep_ns` or `to_keep_props`. If not named, the values
+#'   will be used in order of the datasets in `to_keep_ns` or `to_keep_props`.
+#'   Default value is `NULL`, i.e. no seed is set.
+#' @param ... Further arguments passed to the [perf_splsda()]
+#'   function.
+#' @returns A list of target objects. With `target_name_prefix = ""` and
+#'   `filtered_set_target_name = NULL`, the following targets are created:
+#' * `splsda_spec`: generates a grouped tibble where each row corresponds to one
+#'   dataset to be filtered, with the columns specifying each dataset name, and
+#'   associated values from `to_keep_ns` and `to_keep_props`.
+#' * `individual_splsda_input`: a dynamic branching target that runs the
+#'   [get_input_splsda()] function for each dataset.
+#' * `individual_splsda_perf`: a dynamic branching target that runs the
+#'   [perf_splsda()] function for each dataset.
+#' * `individual_splsda_run`: a dynamic branching target that runs the
+#'   [run_splsda()] function for each dataset, using the results from
+#'   `individual_splsda_perf` to guide the number of latent components to
+#'   construct.
+#' * `filtered_set_slpsda`: a target to retain from the original `MultiDataSet`
+#'   object only features selected in each sPLS-DA run.
 #' @examples
 #' \dontrun{
 #' ## in the _targets.R
@@ -771,6 +823,8 @@ feature_preselection_splsda_factory <- function(mo_data_target,
                                                 target_name_prefix = "",
                                                 filtered_set_target_name = NULL,
                                                 multilevel = NULL,
+                                                seed_perf = NULL,
+                                                seed_run = NULL,
                                                 ...) {
   splsda_spec_name <- paste0(target_name_prefix, "splsda_spec")
   splsda_input_name <- paste0(target_name_prefix, "individual_splsda_input")
@@ -795,13 +849,25 @@ feature_preselection_splsda_factory <- function(mo_data_target,
     stop("'to_keep_ns' or 'to_keep_props' argument should be named.")
   }
 
+  ## Checking that number of seeds match number of datasets, and names match
+  ## dataset names
+  seed_perf <- .clean_seed(seed_perf, dataset_names)
+  seed_run <- .clean_seed(seed_run, dataset_names)
+
+
   list(
     ## store the splsda specifications (arguments) as a tibble (one row per dataset to prefilter)
     ## and group it by dataset name so that following targets will be applied to each row in turn
     targets::tar_target_raw(
       splsda_spec_name,
       substitute(
-        tibble::tibble(dsn = dataset_names, tkn = to_keep_ns, tkp = to_keep_props) |>
+        tibble::tibble(
+          dsn = dataset_names,
+          tkn = to_keep_ns,
+          tkp = to_keep_props,
+          sp = seed_perf,
+          sr = seed_run
+        ) |>
           dplyr::group_by(dsn) |>
           tar_group()),
       iteration = "group"
@@ -818,8 +884,8 @@ feature_preselection_splsda_factory <- function(mo_data_target,
     ## run the perf function for each row of the specfications dataframe
     targets::tar_target_raw(
       splsda_perf_name,
-      substitute(perf_splsda(splsda_input_target, ...)),
-      pattern = substitute(map(splsda_input_target)),
+      substitute(perf_splsda(splsda_input_target, seed = splsda_spec_target$sp, ...)),
+      pattern = substitute(map(splsda_input_target, splsda_spec_target)),
       iteration = "list"
     ),
 
@@ -831,7 +897,8 @@ feature_preselection_splsda_factory <- function(mo_data_target,
           splsda_input_target,
           perf_res = splsda_perf_target,
           to_keep_n = splsda_spec_target$tkn,
-          to_keep_prop = splsda_spec_target$tkp
+          to_keep_prop = splsda_spec_target$tkp,
+          seed = splsda_spec_target$sr
         )
       ),
       pattern = substitute(map(splsda_input_target, splsda_perf_target, splsda_spec_target)),
@@ -936,17 +1003,21 @@ plot_feature_preselection_cov <- function(cov_list) {
 #' Diagnostics plots for sPLS-DA-based feature preselection
 #'
 #' Displays the PLS-DA classification performance across different number of
-#' latent components for each prefiltered dataset. The classification error rates are computed with different measures
-#' (column facets) and different distance metrics (colours). A vertical grey bar represents for each dataset the number
-#' of latent components selected for the feature preselection step. In addition, a circle highlights the measure and
-#' distance metric used to select the number of latent component.
+#' latent components for each prefiltered dataset. The classification error
+#' rates are computed with different measures (column facets) and different
+#' distance metrics (colours). A vertical grey bar represents for each dataset
+#' the number of latent components selected for the feature preselection step.
+#' In addition, a circle highlights the measure and distance metric used to
+#' select the number of latent components.
 #'
-#' @param perf_splsda_res A list with the result from the \code{\link{perf_splsda}} for each dataset
-#' to be filtered.
-#' @param measure Which measure(s) should be displayed? Can be one of `"BER"`
-#' or `"overall"`. If NULL, all measures will be displayed. Default value is `NULL`.
-#' @param distance Which measure(s) should be displayed? Can be one of `"max.dist"`,
-#' `"centroids.dist"` or `"mahalanobis.dist"`. If NULL, all measures will be displayed. Default value is `NULL`.
+#' @param perf_splsda_res A list with the result from the
+#'   [perf_splsda()] function for each dataset to be filtered.
+#' @param measure Which measure(s) should be displayed? Can be one of `"BER"` or
+#'   `"overall"`. If NULL, all measures will be displayed. Default value is
+#'   `NULL`.
+#' @param distance Which distance metric(s) should be displayed? Can be one of
+#'   `"max.dist"`, `"centroids.dist"` or `"mahalanobis.dist"`. If NULL, all
+#'   distance metrics will be displayed. Default value is `NULL`.
 #' @return A ggplot.
 #' @export
 plot_feature_preselection_splsda <- function(perf_splsda_res,
@@ -1026,3 +1097,34 @@ plot_feature_preselection_splsda <- function(perf_splsda_res,
 
   return(res_plot)
 }
+
+#' Clean seed for sPLS-DA preselection factory
+#'
+#' Checks seed input arguments for the [feature_preselection_splsda_factory()] function.
+#'
+#' @param x Integer vector of seeds to use.
+#' @param ds Character vector of dataset names.
+#' @returns `x`, named after and ordered as `ds` (or `NULL` if `x` is `NULL`). Throws an error if the length of `x` doesn't match `ds`.
+.clean_seed <- function(x, ds) {
+  x_name <- deparse(substitute(x))
+  x_name <- paste0("`", x_name, "`")
+  if (!is.null(x)) {
+    if (length(x) != length(ds)) {
+      stop(
+        x_name,
+        " should be an integer vector with same length as `to_keep_ns` or `to_keep_props`.",
+        call. = FALSE
+      )
+    }
+    if (is.null(names(x))) names(x) <- ds
+    .check_names(
+      names(x),
+      ds,
+      paste0(x_name, " names do not match `to_keep_ns` or `to_keep_props` names.")
+    )
+
+    x <- x[ds]
+  }
+
+  x
+}
diff --git a/man/dot-clean_seed.Rd b/man/dot-clean_seed.Rd
new file mode 100644
index 0000000..01972c3
--- /dev/null
+++ b/man/dot-clean_seed.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/prefiltering.R
+\name{.clean_seed}
+\alias{.clean_seed}
+\title{Clean seed for sPLS-DA preselection factory}
+\usage{
+.clean_seed(x, ds)
+}
+\arguments{
+\item{x}{Integer vector of seeds to use.}
+
+\item{ds}{Character vector of dataset names.}
+}
+\value{
+\code{x} with names if it didn't have them or an error.
+}
+\description{
+Checks seed input arguments for the \code{\link[=feature_preselection_splsda_factory]{feature_preselection_splsda_factory()}} function.
+}
diff --git a/man/feature_preselection_splsda_factory.Rd b/man/feature_preselection_splsda_factory.Rd
index 65e3a31..1dbf0aa 100644
--- a/man/feature_preselection_splsda_factory.Rd
+++ b/man/feature_preselection_splsda_factory.Rd
@@ -12,51 +12,77 @@ feature_preselection_splsda_factory(
   target_name_prefix = "",
   filtered_set_target_name = NULL,
   multilevel = NULL,
+  seed_perf = NULL,
+  seed_run = NULL,
   ...
 )
 }
 \arguments{
-\item{mo_data_target}{Symbol, the name of the target containing the \code{MultiDataSet} object.}
+\item{mo_data_target}{Symbol, the name of the target containing the
+\code{MultiDataSet} object.}
 
-\item{group}{Character, the column name in the samples information data-frame to use as samples group.}
+\item{group}{Character, the column name in the samples information data-frame
+to use as samples group.}
 
-\item{to_keep_ns}{Named integer vector, the number of feature to retain in each dataset to be prefiltered
-(names should correspond to a dataset name). Value should be less than the number of features in the
-corresponding dataset. Set to \code{NULL} in order to use \code{to_keep_props} instead.}
+\item{to_keep_ns}{Named integer vector, the number of feature to retain in
+each dataset to be prefiltered (names should correspond to a dataset name).
+Value should be less than the number of features in the corresponding
+dataset. Set to \code{NULL} in order to use \code{to_keep_props} instead.}
 
-\item{to_keep_props}{Named numeric vector, the proportion of features to retain in each dataset
-to be prefiltered (names should correspond to a dataset name). Value should be > 0 and < 1.
-Will be ignored if \code{to_keep_ns} is not \code{NULL}.}
+\item{to_keep_props}{Named numeric vector, the proportion of features to
+retain in each dataset to be prefiltered (names should correspond to a
+dataset name). Value should be > 0 and < 1. Will be ignored if \code{to_keep_ns}
+is not \code{NULL}.}
 
-\item{target_name_prefix}{Character, a prefix to add to the name of the targets created by this target factory.
-Default value is \code{""}.}
+\item{target_name_prefix}{Character, a prefix to add to the name of the
+targets created by this target factory. Default value is \code{""}.}
 
-\item{filtered_set_target_name}{Character, the name of the final target containing the filtered \code{MultiDataSet} object.
-If NULL, a name will automatically be supplied. Default value is \code{NULL}.}
+\item{filtered_set_target_name}{Character, the name of the final target
+containing the filtered \code{MultiDataSet} object. If NULL, a name will
+automatically be supplied. Default value is \code{NULL}.}
 
-\item{multilevel}{Character vector of length 1 or 3 to be used as information about repeated measurements.
-See \code{\link[=get_input_splsda]{get_input_splsda()}} for details. Default value is \code{NULL} (no repeated measurements).}
+\item{multilevel}{Character vector of length 1 or 3 to be used as information
+about repeated measurements. See \code{\link[=get_input_splsda]{get_input_splsda()}} for details. Default
+value is \code{NULL} (no repeated measurements).}
 
-\item{...}{Further arguments passed to the \code{\link{perf_splsda}} function.}
+\item{seed_perf}{Named integer vector, the seed to use for the
+\code{\link[=perf_splsda]{perf_splsda()}} function for each dataset. The length and names should
+match those of \code{to_keep_ns} or \code{to_keep_props}. If not named, the values
+will be used in order of the datasets in \code{to_keep_ns} or \code{to_keep_props}.
+Default value is \code{NULL}, i.e. no seed is set.}
+\item{seed_run}{Named integer vector, the seed to use for the
+\code{\link[=run_splsda]{run_splsda()}} function for each dataset. The length and names should
+match those of \code{to_keep_ns} or \code{to_keep_props}. If not named, the values
+will be used in order of the datasets in \code{to_keep_ns} or \code{to_keep_props}.
+Default value is \code{NULL}, i.e. no seed is set.}
+
+\item{...}{Further arguments passed to the \link{perf_splsda}
+function.}
 }
 \value{
-A list of target objects. With \code{target_name_prefix = ""} and \code{filtered_set_target_name = NULL},
-the following targets are created:
+A list of target objects. With \code{target_name_prefix = ""} and
+\code{filtered_set_target_name = NULL}, the following targets are created:
 \itemize{
-\item \code{splsda_spec}: generates a grouped tibble where each row corresponds to one dataset to be filtered,
-with the columns specifying each dataset name, and associated values from \code{to_keep_ns} and \code{to_keep_props}.
+\item \code{splsda_spec}: generates a grouped tibble where each row corresponds to one
+dataset to be filtered, with the columns specifying each dataset name, and
+associated values from \code{to_keep_ns} and \code{to_keep_props}.
 \itemize{
-\item \code{individual_splsda_input}: a dynamic branching target that runs the \code{\link[=get_input_splsda]{get_input_splsda()}} function for each dataset.
+\item \code{individual_splsda_input}: a dynamic branching target that runs the
+\code{\link[=get_input_splsda]{get_input_splsda()}} function for each dataset.
 }
-\item \code{individual_splsda_perf}: a dynamic branching target that runs the \code{\link[=perf_splsda]{perf_splsda()}} function for each dataset.
-\item \code{individual_splsda_run}: a dynamic branching target that runs the \code{\link[=run_splsda]{run_splsda()}} function for each dataset,
-using the results from \code{individual_splsda_perf} to guide the number of latent components to construct.
-\item \code{filtered_set_slpsda}: a target to retain from the original \code{MultiDataSet} object only features selected in each sPLS-DA run.
+\item \code{individual_splsda_perf}: a dynamic branching target that runs the
+\code{\link[=perf_splsda]{perf_splsda()}} function for each dataset.
+\item \code{individual_splsda_run}: a dynamic branching target that runs the
+\code{\link[=run_splsda]{run_splsda()}} function for each dataset, using the results from
+\code{individual_splsda_perf} to guide the number of latent components to
+construct.
+\item \code{filtered_set_slpsda}: a target to retain from the original \code{MultiDataSet}
+object only features selected in each sPLS-DA run.
 }
 }
 \description{
-Creates a list of targets to perform feature preselection on datasets from a \code{MultiDataSet}
-object with sPLS-DA (from the \code{mixOmics} package).
+Creates a list of targets to perform feature preselection on datasets from a
+\code{MultiDataSet} object with sPLS-DA (from the \code{mixOmics} package).
 }
 \examples{
 \dontrun{
diff --git a/man/perf_splsda.Rd b/man/perf_splsda.Rd
index 9169b09..fe9de11 100644
--- a/man/perf_splsda.Rd
+++ b/man/perf_splsda.Rd
@@ -2,7 +2,8 @@
 % Please edit documentation in R/prefiltering.R
 \name{perf_splsda}
 \alias{perf_splsda}
-\title{Assess optimal number of components for sPLS-DA on omics dataset from MultiDataSet object}
+\title{Assess optimal number of components for sPLS-DA on omics dataset from
+MultiDataSet object}
 \usage{
 perf_splsda(
   splsda_input,
@@ -13,53 +14,69 @@ perf_splsda(
   measure = "BER",
   distance = "centroids.dist",
   cpus = 1,
-  progressBar = TRUE
+  progressBar = TRUE,
+  seed = NULL
 )
 }
 \arguments{
-\item{splsda_input}{Input for the sPLS-DA functions from mixOmics, created with \code{\link[=get_input_splsda]{get_input_splsda()}}.}
+\item{splsda_input}{Input for the sPLS-DA functions from mixOmics, created
+with \code{\link[=get_input_splsda]{get_input_splsda()}}.}
 
-\item{ncomp_max}{Integer, the maximum number of latent components to test when estimating the number of
-latent components to use. Default value is \code{5}.}
+\item{ncomp_max}{Integer, the maximum number of latent components to test
+when estimating the number of latent components to use. Default value is
+\code{5}.}
 
-\item{validation}{Character, which cross-validation method to use, can be one of \code{"Mfold"} or \code{"loo"}
-(see \code{\link[mixOmics:perf]{mixOmics::perf()}}). Default value is \code{"Mfold"}.}
+\item{validation}{Character, which cross-validation method to use, can be one
+of \code{"Mfold"} or \code{"loo"} (see \code{\link[mixOmics:perf]{mixOmics::perf()}}). Default value is
+\code{"Mfold"}.}
 
-\item{folds}{Integer, number of folds to use in the M-fold cross-validation (see \code{\link[mixOmics:perf]{mixOmics::perf()}}).
-Default value is 5.}
+\item{folds}{Integer, number of folds to use in the M-fold cross-validation
+(see \code{\link[mixOmics:perf]{mixOmics::perf()}}). Default value is 5.}
 
-\item{nrepeat}{Integer, number of times the cross-validation is repeated (see \code{\link[mixOmics:perf]{mixOmics::perf()}}).}
+\item{nrepeat}{Integer, number of times the cross-validation is repeated (see
+\code{\link[mixOmics:perf]{mixOmics::perf()}}).}
 
-\item{measure}{Performance measure used to select the optimal value of \code{ncomp}, can be one of \code{"BER"} or \code{"overall"}
-(see \code{\link[mixOmics:perf]{mixOmics::perf()}}).
+\item{measure}{Performance measure used to select the optimal value of
+\code{ncomp}, can be one of \code{"BER"} or \code{"overall"} (see \code{\link[mixOmics:perf]{mixOmics::perf()}}).
 Default value is \code{"BER"}.}
 
-\item{distance}{Distance metric used to select the optimal value of \code{ncomp}, can be one of \code{"max.dist"},
-\code{"centroids.dist"} or \code{"mahalanobis.dist"} (see \code{\link[mixOmics:perf]{mixOmics::perf()}}). Default value is \code{"centroids.dist"}.}
+\item{distance}{Distance metric used to select the optimal value of \code{ncomp},
+can be one of \code{"max.dist"}, \code{"centroids.dist"} or \code{"mahalanobis.dist"} (see
+\code{\link[mixOmics:perf]{mixOmics::perf()}}). Default value is \code{"centroids.dist"}.}
 
 \item{cpus}{Integer, number of cpus to use.}
 
-\item{progressBar}{Logical, whether to display a progress bar during the optimisation of \code{ncomp}. Default
-value is \code{TRUE}.}
+\item{progressBar}{Logical, whether to display a progress bar during the
+optimisation of \code{ncomp}. Default value is \code{TRUE}.}
+
+\item{seed}{Integer, seed to use. Default is \code{NULL}, i.e. no seed is set
+inside the function.}
 }
 \value{
-A list as per the output of the \code{\link[mixOmics:perf]{mixOmics::perf()}} function, with the following additional elements:
+A list as per the output of the \code{\link[mixOmics:perf]{mixOmics::perf()}} function, with
+the following additional elements:
 \itemize{
 \item \code{dataset_name}: the name of the dataset analysed;
-\item \code{group}: column name in the samples information data-frame used as samples group;
-\item \code{optim_ncomp}: the optimal number of latent components as per the \code{measure} and \code{distance} specified;
-\item \code{optim_measure}: the measure used to select the optimal number of latent components;
-\item \code{optim_distance}: the distance metric used to select the optimal number of latent components.
+\item \code{group}: column name in the samples information data-frame used as samples
+group;
+\item \code{optim_ncomp}: the optimal number of latent components as per the \code{measure}
+and \code{distance} specified;
+\item \code{optim_measure}: the measure used to select the optimal number of latent
+components;
+\item \code{optim_distance}: the distance metric used to select the optimal number of
+latent components.
+In addition, the name of the dataset analysed and the column name in the
+samples information data-frame used as samples group are stored as attributes
+\code{dataset_name} and \code{group}, respectively.
 }
-In addition, the name of the dataset analysed and the column name in the samples information data-frame
-used as samples group as stored as attributes \code{dataset_name} and \code{group}, respectively.
 }
 \description{
-Performs cross-validation for a PLS-DA run (implemented in the \code{mixOmics} package) on an omics dataset from a
-\code{MultiDataSet} object. This allows to estimate the optimal number of latent components to construct.
-This is intended for feature preselection in the omics dataset (see examples below).
+Performs cross-validation for a PLS-DA run (implemented in the \code{mixOmics}
+package) on an omics dataset from a \code{MultiDataSet} object. This makes it
+possible to estimate the optimal number of latent components to construct. This is
+intended for feature preselection in the omics dataset (see examples below).
 }
 \details{
-This function uses the \code{\link[mixOmics]{plsda}} and  \code{\link[mixOmics]{perf}}
-function from the \code{mixOmics} package.
+This function uses the \code{\link[mixOmics:plsda]{mixOmics::plsda()}} and \code{\link[mixOmics:perf]{mixOmics::perf()}} functions
+from the \code{mixOmics} package.
 }
diff --git a/man/plot_feature_preselection_splsda.Rd b/man/plot_feature_preselection_splsda.Rd
index ab4528c..a3ae92e 100644
--- a/man/plot_feature_preselection_splsda.Rd
+++ b/man/plot_feature_preselection_splsda.Rd
@@ -11,22 +11,26 @@ plot_feature_preselection_splsda(
 )
 }
 \arguments{
-\item{perf_splsda_res}{A list with the result from the \code{\link{perf_splsda}} for each dataset
-to be filtered.}
+\item{perf_splsda_res}{A list with the result from the
+\link{perf_splsda} function for each dataset to be filtered.}
 
-\item{measure}{Which measure(s) should be displayed? Can be one of \code{"BER"}
-or \code{"overall"}. If NULL, all measures will be displayed. Default value is \code{NULL}.}
+\item{measure}{Which measure(s) should be displayed? Can be one of \code{"BER"} or
+\code{"overall"}. If NULL, all measures will be displayed. Default value is
+\code{NULL}.}
 
-\item{distance}{Which measure(s) should be displayed? Can be one of \code{"max.dist"},
-\code{"centroids.dist"} or \code{"mahalanobis.dist"}. If NULL, all measures will be displayed. Default value is \code{NULL}.}
+\item{distance}{Which distance metric(s) should be displayed? Can be one of
+\code{"max.dist"}, \code{"centroids.dist"} or \code{"mahalanobis.dist"}. If NULL, all
+distance metrics will be displayed. Default value is \code{NULL}.}
 }
 \value{
 A ggplot.
 }
 \description{
 Displays the PLS-DA classification performance across different number of
-latent components for each prefiltered dataset. The classification error rates are computed with different measures
-(column facets) and different distance metrics (colours). A vertical grey bar represents for each dataset the number
-of latent components selected for the feature preselection step. In addition, a circle highlights the measure and
-distance metric used to select the number of latent component.
+latent components for each prefiltered dataset. The classification error
+rates are computed with different measures (column facets) and different
+distance metrics (colours). A vertical grey bar represents for each dataset
+the number of latent components selected for the feature preselection step.
+In addition, a circle highlights the measure and distance metric used to
+select the number of latent components.
 }
diff --git a/man/run_splsda.Rd b/man/run_splsda.Rd
index acf460e..9cd6e6b 100644
--- a/man/run_splsda.Rd
+++ b/man/run_splsda.Rd
@@ -9,34 +9,43 @@ run_splsda(
   perf_res,
   to_keep_n = NULL,
   to_keep_prop = NULL,
-  ncomp = NULL
+  ncomp = NULL,
+  seed = NULL
 )
 }
 \arguments{
-\item{splsda_input}{Input for the sPLS-DA functions from mixOmics, created with \code{\link[=get_input_splsda]{get_input_splsda()}}.}
+\item{splsda_input}{Input for the sPLS-DA functions from mixOmics, created
+with \code{\link[=get_input_splsda]{get_input_splsda()}}.}
 
-\item{perf_res}{Result of the \code{\link{perf_splsda}} function. If not supplied, sPLS-DA will be run on
-dataset specified by argument \code{dataset_name} with number of latent components specified by argument \code{comp}.}
+\item{perf_res}{Result of the \code{\link[=perf_splsda]{perf_splsda()}} function. If not supplied,
+sPLS-DA will be run on dataset specified by argument \code{dataset_name} with
+number of latent components specified by argument \code{ncomp}.}
 
-\item{to_keep_n}{Integer, the number of features to retain in the dataset. Should be less than the number of
-features in the dataset. If \code{NULL} or \code{NA}, \code{to_keep_prop} will be used instead.}
+\item{to_keep_n}{Integer, the number of features to retain in the dataset.
+Should be less than the number of features in the dataset. If \code{NULL} or
+\code{NA}, \code{to_keep_prop} will be used instead.}
 
-\item{to_keep_prop}{Numeric, the proportion of features to retain in the dataset. Will be ignored if \code{to_keep_n}
-is supplied. Value should be > 0 and < 1.}
+\item{to_keep_prop}{Numeric, the proportion of features to retain in the
+dataset. Will be ignored if \code{to_keep_n} is supplied. Value should be > 0
+and < 1.}
 
-\item{ncomp}{Integer, number of latent components to construct. Ignored if \code{perf_res} is supplied.
-Default value is \code{NULL}.}
+\item{ncomp}{Integer, number of latent components to construct. Ignored if
+\code{perf_res} is supplied. Default value is \code{NULL}.}
+
+\item{seed}{Integer, seed to use. Default is \code{NULL}, i.e. no seed is set
+inside the function.}
 }
 \value{
-A list as per the output of the \code{\link[mixOmics]{splsda}} function.
+A list as per the output of the \code{\link[mixOmics:splsda]{mixOmics::splsda()}} function.
 }
 \description{
-Performs a sPLS-DA (implemented in the \code{mixOmics}) package on a omics dataset from a
-MultiDataSet object. This is intended for feature preselection in the omics dataset
-(see \code{\link{get_filtered_dataset_splsda}}).
+Performs a sPLS-DA (implemented in the \code{mixOmics} package) on an omics dataset
+from a MultiDataSet object. This is intended for feature preselection in the
+omics dataset (see \code{\link[=get_filtered_dataset_splsda]{get_filtered_dataset_splsda()}}).
 }
 \details{
-This function uses the \code{\link[mixOmics]{plsda}} function from the \code{mixOmics} package.
-Note that the sPLS-DA method can select the same feature for several latent components, so the number of
-features retained for a dataset might be less than the number specified in the \code{to_keep_n} argument.
+This function uses the \code{\link[mixOmics:splsda]{mixOmics::splsda()}} function from the \code{mixOmics}
+package. Note that the sPLS-DA method can select the same feature for several
+latent components, so the number of features retained for a dataset might be
+less than the number specified in the \code{to_keep_n} argument.
 }
diff --git a/tests/testthat/test-prefiltering.R b/tests/testthat/test-prefiltering.R
index e89bf26..1524bab 100644
--- a/tests/testthat/test-prefiltering.R
+++ b/tests/testthat/test-prefiltering.R
@@ -21,7 +21,7 @@ test_that("feature_preselection_splsda_factory works", {
   ## whitespace error I cannot fix
   expect_equal(
     tar_res[[1]]$command$expr |> deparse() |> paste0(collapse = ""),
-    "expression(tar_group(dplyr::group_by(tibble::tibble(dsn = c(\"snps+A\", \"metabolome\"), tkn = list(50, 30), tkp = NULL), dsn)))"
+    "expression(tar_group(dplyr::group_by(tibble::tibble(dsn = c(\"snps+A\", \"metabolome\"), tkn = list(50, 30), tkp = NULL, sp = NULL, sr = NULL),     dsn)))"
   )
 
   # Testing targets command
@@ -31,9 +31,10 @@ test_that("feature_preselection_splsda_factory works", {
     }),
     list(
       expression(get_input_splsda(mo_data, splsda_spec$dsn, "pheno_group", NULL)),
-      expression(perf_splsda(individual_splsda_input)),
+      expression(perf_splsda(individual_splsda_input, seed = splsda_spec$sp)),
       expression(run_splsda(individual_splsda_input, perf_res = individual_splsda_perf,
-                            to_keep_n = splsda_spec$tkn, to_keep_prop = splsda_spec$tkp)),
+                            to_keep_n = splsda_spec$tkn, to_keep_prop = splsda_spec$tkp,
+                            seed = splsda_spec$sr)),
       expression(get_filtered_dataset_splsda(mo_data, individual_splsda_run))
     )
   )
@@ -46,7 +47,7 @@ test_that("feature_preselection_splsda_factory works", {
     list(
       NULL,
       expression(map(splsda_spec)),
-      expression(map(individual_splsda_input)),
+      expression(map(individual_splsda_input, splsda_spec)),
       expression(map(individual_splsda_input, individual_splsda_perf, splsda_spec)),
       NULL
     )
@@ -106,7 +107,7 @@ test_that("feature_preselection_splsda_factory works", {
   )
   expect_equal(
     tar_res3[[3]]$command$expr,
-    expression(perf_splsda(individual_splsda_input, ncomp_max = 10, folds = 3))
+    expression(perf_splsda(individual_splsda_input, seed = splsda_spec$sp, ncomp_max = 10, folds = 3))
   )
 
   ## Testing custom complete target name + custom prefix for target names
@@ -125,3 +126,41 @@ test_that("feature_preselection_splsda_factory works", {
       "test_individual_splsda_run", "final_test")
   )
 })
+
+test_that("feature_preselection_splsda_factory works (seed)", {
+  tar_res <- feature_preselection_splsda_factory(
+    mo_data,
+    group = "pheno_group",
+    to_keep_ns = list("snps+A" = 50, "metabolome" = 30)
+  )
+
+  expect_error(
+    feature_preselection_splsda_factory(
+      mo_data,
+      group = "pheno_group",
+      to_keep_ns = list("snps+A" = 50, "metabolome" = 30),
+      seed_perf = 1
+    ),
+    "`seed_perf` should be an integer vector with same length as `to_keep_ns` or `to_keep_props`."
+  )
+
+  expect_error(
+    feature_preselection_splsda_factory(
+      mo_data,
+      group = "pheno_group",
+      to_keep_ns = list("snps+A" = 50, "metabolome" = 30),
+      seed_perf = c("test" = 1, "rnaseq" = 2)
+    ),
+    "`seed_perf` names do not match `to_keep_ns` or `to_keep_props` names."
+  )
+
+  expect_no_error(
+    feature_preselection_splsda_factory(
+      mo_data,
+      group = "pheno_group",
+      to_keep_ns = list("snps+A" = 50, "metabolome" = 30),
+      seed_perf = c(1, 2)
+    )
+  )
+
+})