Add test configurations #28

Merged: 7 commits, Jun 13, 2024
10 changes: 5 additions & 5 deletions README.md
@@ -25,15 +25,15 @@

Not all of the steps shown in the metro map have been implemented yet, as this pipeline is still under development. However, the following steps are already functional:

-1. Per-dataset preprocessing
+1. Per-sample preprocessing
1. Convert all RDS files to h5ad format
2. Present QC for raw counts ([`MultiQC`](http://multiqc.info/))
3. Apply user-defined QC filters
4. Remove ambient RNA
- [decontX](https://bioconductor.org/packages/release/bioc/html/decontX.html)
5. Doublet detection
- [SOLO](https://docs.scvi-tools.org/en/stable/user_guide/models/solo.html)
-2. Dataset aggregation
+2. Sample aggregation
1. Merge into a single h5ad file
2. Present QC for merged counts ([`MultiQC`](http://multiqc.info/))
3. Integration
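
To illustrate step 2 ("Sample aggregation") above, merging per-sample h5ad files into a single object can be sketched as follows (illustrative only; the file names are hypothetical and this is not the pipeline's own merge module):

```python
# Sketch: concatenate per-sample AnnData objects into one h5ad
# (hypothetical file names; the pipeline's merge module may differ).
import anndata as ad

sample_files = {"sample1": "sample1.h5ad", "sample2": "sample2.h5ad"}
adatas = {name: ad.read_h5ad(path) for name, path in sample_files.items()}
merged = ad.concat(adatas, label="sample", join="outer")
merged.write_h5ad("merged.h5ad")
```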
@@ -53,9 +53,9 @@ First, prepare a samplesheet with your input data that looks as follows:
`samplesheet.csv`:

```csv
-dataset,file
-dataset1,/absolute/path/to/dataset1.h5ad
-dataset2,relative/path/to/dataset2.rds
+sample,file
+sample1,/absolute/path/to/sample1.h5ad
+sample2,relative/path/to/sample2.rds
```

Each row represents an h5ad or RDS file. RDS files may contain any object that can be converted to a SingleCellExperiment using the [Seurat `as.SingleCellExperiment`](https://satijalab.org/seurat/reference/as.singlecellexperiment) function.
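
As a rough illustration (not part of the pipeline; the file names and toy matrix are made up), an h5ad input and a matching samplesheet row can be produced like this:

```python
# Sketch: write a minimal h5ad input plus a matching samplesheet row.
# Assumes anndata is installed; "sample1.h5ad" and "samplesheet.csv" are hypothetical names.
import csv

import anndata as ad
import numpy as np

adata = ad.AnnData(X=np.random.poisson(1.0, size=(100, 50)).astype(np.float32))
adata.write_h5ad("sample1.h5ad")

with open("samplesheet.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["sample", "file"])           # header expected by the pipeline
    writer.writerow(["sample1", "sample1.h5ad"])  # one row per input file
```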
Binary file modified assets/nf-core-scdownstream_logo_light.png
6 changes: 3 additions & 3 deletions assets/schema_input.json
@@ -7,10 +7,10 @@
"items": {
"type": "object",
"properties": {
-"dataset": {
+"sample": {
"type": "string",
"pattern": "^\\S+$",
-"errorMessage": "Dataset name must be provided and cannot contain spaces",
+"errorMessage": "Sample name must be provided and cannot contain spaces",
"meta": ["id"]
},
"file": {
@@ -41,6 +41,6 @@
"meta": ["unknown_label"]
}
},
-"required": ["dataset", "file"]
+"required": ["sample", "file"]
}
}
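
For reference, the constraints introduced by the updated schema amount to roughly the following per-row check (a plain-Python sketch, not the nf-core schema validation itself):

```python
# Sketch of the row-level rules from schema_input.json:
# "sample" is required and must contain no whitespace; "file" is required.
import re

def check_row(row: dict) -> None:
    sample = row.get("sample", "")
    if not re.fullmatch(r"\S+", sample):
        raise ValueError("Sample name must be provided and cannot contain spaces")
    if not row.get("file"):
        raise ValueError("A file path must be provided")

check_row({"sample": "sample1", "file": "/absolute/path/to/sample1.h5ad"})  # passes
```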
7 changes: 1 addition & 6 deletions conf/test.config
@@ -20,10 +20,5 @@ params {
max_time = '6.h'

// Input data
-// TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
-// TODO nf-core: Give any required params for the test so that command line flags are not needed
-input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv'
-
-// Genome references
-genome = 'R64-1-1'
+input = params.pipelines_testdata_base_path + 'scdownstream/samplesheet.csv'
}
7 changes: 1 addition & 6 deletions conf/test_full.config
@@ -15,10 +15,5 @@ params {
config_profile_description = 'Full test dataset to check pipeline function'

// Input data for full size test
-// TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA)
-// TODO nf-core: Give any required params for the test so that command line flags are not needed
-input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv'
-
-// Genome references
-genome = 'R64-1-1'
+input = params.pipelines_testdata_base_path + 'scdownstream/samplesheet.csv'
}
2 changes: 1 addition & 1 deletion modules/local/adata/unify/environment.yml
@@ -3,4 +3,4 @@ channels:
- conda-forge
- bioconda
dependencies:
-  - conda-forge::anndata=0.10.7
+  - conda-forge::scanpy=1.10.1
4 changes: 2 additions & 2 deletions modules/local/adata/unify/main.nf
@@ -4,8 +4,8 @@ process ADATA_UNIFY {

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-'oras://community.wave.seqera.io/library/anndata:0.10.7--e9840a94592528c8':
-'community.wave.seqera.io/library/anndata:0.10.7--336c6c1921a0632b' }"
+'oras://community.wave.seqera.io/library/scanpy:1.10.1--ea08051addf267ac':
+'community.wave.seqera.io/library/scanpy:1.10.1--0c8c97148fc05558' }"

input:
tuple val(meta), path(h5ad)
18 changes: 11 additions & 7 deletions modules/local/adata/unify/templates/unify.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

-import anndata as ad
+import scanpy as sc
import scipy
from scipy.sparse import csr_matrix
import platform
@@ -24,7 +24,7 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
yaml_str += f"{spaces}{key}: {value}\\n"
return yaml_str

-adata = ad.read_h5ad("$h5ad")
+adata = sc.read_h5ad("$h5ad")

# Unify batches
batch_col = "${meta.batch_col}"
@@ -57,23 +57,27 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
raise ValueError("The label column already exists.")
adata.obs["label"] = "unknown"

-# Add "dataset" column
-if "dataset" in adata.obs:
-    raise ValueError("The dataset column already exists.")
-adata.obs["dataset"] = "${meta.id}"
+# Add "sample" column
+if "sample" in adata.obs and not (adata.obs["sample"] == "${meta.id}").all():
+    adata.obs["sample_original"] = adata.obs["sample"]
+adata.obs["sample"] = "${meta.id}"

# Convert to CSR matrix
adata.X = csr_matrix(adata.X)
adata.layers["counts"] = adata.X

+# Perform basic filtering
+sc.pp.filter_cells(adata, min_genes=1)
+sc.pp.filter_genes(adata, min_cells=1)

adata.write_h5ad("${prefix}.h5ad")

# Versions

versions = {
"${task.process}": {
"python": platform.python_version(),
-"anndata": ad.__version__,
+"scanpy": sc.__version__,
"scipy": scipy.__version__
}
}
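
The filtering calls added in this template (`sc.pp.filter_cells` / `sc.pp.filter_genes`) drop empty cells and undetected genes. A small standalone demonstration on toy data (not part of the template):

```python
# Toy demonstration of sc.pp.filter_cells / sc.pp.filter_genes as used above:
# a cell with zero detected genes and an all-zero gene column are removed.
import numpy as np
import scanpy as sc
from anndata import AnnData
from scipy.sparse import csr_matrix

X = csr_matrix(np.array([
    [1, 0, 2],
    [0, 0, 0],  # cell with no detected genes -> removed by filter_cells
    [3, 0, 1],
], dtype=np.float32))
adata = AnnData(X=X)

sc.pp.filter_cells(adata, min_genes=1)
sc.pp.filter_genes(adata, min_cells=1)  # the all-zero gene column is removed
print(adata.shape)  # (2, 2)
```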
2 changes: 1 addition & 1 deletion modules/local/adata/upsetgenes/main.nf
@@ -20,6 +20,6 @@ process ADATA_UPSETGENES {

script:
prefix = task.ext.prefix ?: "${meta.id}"
-split_col = task.ext.split_col ?: 'dataset'
+split_col = task.ext.split_col ?: 'sample'
template 'upsetplot.py'
}
16 changes: 8 additions & 8 deletions modules/local/adata/upsetgenes/templates/upsetplot.py
@@ -30,17 +30,17 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
adata = sc.read_h5ad("${h5ad}")
split_col = "${split_col}"
prefix = "${prefix}"
-dataset_genes = {}
+sample_genes = {}

-# Split into multiple adatas, based on dataset
-datasets = adata.obs[split_col].unique()
-for dataset in datasets:
-    adata_dataset = adata[adata.obs[split_col] == dataset].copy()
+# Split into multiple adatas, based on sample
+samples = adata.obs[split_col].unique()
+for sample in samples:
+    adata_sample = adata[adata.obs[split_col] == sample].copy()
    # Keep only genes with at least 1 count in at least 1 cell
-    sc.pp.filter_genes(adata_dataset, min_cells=1)
-    dataset_genes[dataset] = set(adata_dataset.var_names)
+    sc.pp.filter_genes(adata_sample, min_cells=1)
+    sample_genes[sample] = set(adata_sample.var_names)

-plot_data = upsetplot.from_contents(dataset_genes)
+plot_data = upsetplot.from_contents(sample_genes)

upsetplot.plot(plot_data, sort_by="cardinality", show_counts=True, min_subset_size=10)
plot_path = f"{prefix}_{split_col}_genes.png"
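
For context, the `upsetplot` calls above can be exercised standalone with made-up per-sample gene sets (a sketch; the gene names are hypothetical):

```python
# Minimal standalone sketch of the per-sample gene UpSet plot used above.
import matplotlib
matplotlib.use("Agg")  # headless backend, e.g. inside a container
import matplotlib.pyplot as plt
import upsetplot

sample_genes = {
    "sample1": {"GeneA", "GeneB", "GeneC"},
    "sample2": {"GeneB", "GeneC", "GeneD"},
}
plot_data = upsetplot.from_contents(sample_genes)
upsetplot.plot(plot_data, sort_by="cardinality", show_counts=True)
plt.savefig("samples_genes.png")
```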
8 changes: 4 additions & 4 deletions subworkflows/local/preprocess.nf
@@ -8,23 +8,23 @@ include { SCANPY_PLOTQC as QC_FILTERED } from '../../modules/local/scanpy/plotqc
workflow PREPROCESS {

take:
-ch_datasets // channel: [ val(meta), file ]
+ch_samples // channel: [ val(meta), file ]

main:

ch_versions = Channel.empty()
ch_multiqc_files = Channel.empty()

-ch_datasets = ch_datasets.map { meta, file -> [meta, file, file.extension.toLowerCase()] }
+ch_samples = ch_samples.map { meta, file -> [meta, file, file.extension.toLowerCase()] }
.branch { meta, file, ext ->
h5ad: ext == "h5ad"
return [meta, file]
rds: ext == "rds"
return [meta, file]
}

-ADATA_READRDS(ch_datasets.rds)
-ch_h5ad = ch_datasets.h5ad.mix(ADATA_READRDS.out.h5ad)
+ADATA_READRDS(ch_samples.rds)
+ch_h5ad = ch_samples.h5ad.mix(ADATA_READRDS.out.h5ad)
ch_versions = ch_versions.mix(ADATA_READRDS.out.versions)

ADATA_UNIFY(ch_h5ad)
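
The `.branch` above routes each input by file extension: h5ad files go straight to unification, while RDS files are converted first. The same dispatch expressed as a plain-Python sketch (outside the pipeline, for illustration only):

```python
# Sketch of the extension-based routing performed by the .branch operator above.
from pathlib import Path

def route(path: str) -> str:
    ext = Path(path).suffix.lower().lstrip(".")
    if ext == "h5ad":
        return "use directly"
    if ext == "rds":
        return "convert to h5ad with ADATA_READRDS first"
    raise ValueError(f"Unsupported input extension: {ext}")

print(route("relative/path/to/sample2.rds"))  # convert to h5ad with ADATA_READRDS first
```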
4 changes: 2 additions & 2 deletions workflows/scdownstream.nf
@@ -33,15 +33,15 @@ workflow SCDOWNSTREAM {
ch_multiqc_files = Channel.empty()

//
-// Per-dataset preprocessing
+// Per-sample preprocessing
//

PREPROCESS(ch_samplesheet)
ch_versions = ch_versions.mix(PREPROCESS.out.versions)
ch_multiqc_files = ch_multiqc_files.mix(PREPROCESS.out.multiqc_files)

//
-// Combine datasets and perform integration
+// Combine samples and perform integration
//

COMBINE(PREPROCESS.out.h5ad)