diff --git a/assets/schema_input.json b/assets/schema_input.json index 7a91546..56498f9 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -88,6 +88,14 @@ "errorMessage": "Minimum number of counts per gene must be an integer greater than 0.", "meta": ["min_counts_gene"] }, + "max_mito_fraction": { + "type": "integer", + "minimum": 0, + "maximum": 100, + "default": 100, + "errorMessage": "Max mitochondrial fraction must be an integer between 0 and 100.", + "meta": ["max_mito_fraction"] + }, "expected_cells": { "type": "integer", "minimum": 1, diff --git a/docs/usage.md b/docs/usage.md index 5812cf2..47e349c 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -46,20 +46,21 @@ sample3,/absolute/path/to/sample3_filtered.csv,/absolute/path/to/sample3.csv,,,, For CSV input files, specifying the `batch_col`, `label_col`, and `unknown_label` columns will not have any effect, as no additional metadata is available in the CSV file. -| Column | Description | -| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Unique sample identifier. Will be added to the pipeline output objects as `sample` column. | -| `filtered` | May contain paths to `h5ad`, `h5`, `rds`, or `csv` files. `rds` files may contain any object that can be converted to a `SingleCellExperiment` using the [Seurat `as.SingleCellExperiment`](https://satijalab.org/seurat/reference/as.singlecellexperiment) function. `csv` files should contain a matrix with genes as columns and cells as rows. | -| `unfiltered` | Same as `file`, but for the unfiltered cellranger or nf-core/scrnaseq output. 
If not provided, only `decontX` can be used for ambient RNA removal. | -| `batch_col` | Column in the input file containing batch information. Defaults to `batch`. If the column does not exist in the input object, the pipeline will create a new column and put the sample identifier in it. If the `batch_col` is something else than `batch`, it will be renamed to `batch` during pipeline execution. | -| `symbol_col` | Column in the input file containing gene symbol information. Defaults to `index`. There are two special values that can be used: `index` and `none`. `index` will use the row names of the matrix as gene symbols. `none` will trigger the pipeline to perform gene symbol conversion (this is not supported yet). The values from `symbol_col` will be copied to a column `gene_symbols` during pipeline execution. | -| `label_col` | Column in the input file containing cell type information. Defaults to `label`. If the column does not exist in the input object, the pipeline will create a new column and put `unknown` in it. If the `label_col` is something else than `label`, it will be renamed to `label` during pipeline execution. | -| `unknown_label` | Value in the `label_col` column that should be considered as unknown. Defaults to `unknown`. If the `unknown_label` is something else than `unknown`, it will be renamed to `unknown` during pipeline execution. If trying to perform integration with scANVI, more than one unique label other than `unknown` must exist in the input data. | -| `min_genes` | Minimum number of genes required for a cell to be considered. Defaults to `1`. | -| `min_cells` | Minimum number of cells required for a gene to be considered. Defaults to `1`. | -| `min_counts_cell` | Minimum number of counts required for a cell to be considered. Defaults to `1`. | -| `min_counts_gene` | Minimum number of counts required for a gene to be considered. Defaults to `1`. | -| `expected_cells` | Number of expected cells, used as input to Cellbender. 
| +| Column | Description | +| ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Unique sample identifier. Will be added to the pipeline output objects as `sample` column. | +| `filtered` | May contain paths to `h5ad`, `h5`, `rds`, or `csv` files. `rds` files may contain any object that can be converted to a `SingleCellExperiment` using the [Seurat `as.SingleCellExperiment`](https://satijalab.org/seurat/reference/as.singlecellexperiment) function. `csv` files should contain a matrix with genes as columns and cells as rows. | +| `unfiltered` | Same as `file`, but for the unfiltered cellranger or nf-core/scrnaseq output. If not provided, only `decontX` can be used for ambient RNA removal. | +| `batch_col` | Column in the input file containing batch information. Defaults to `batch`. If the column does not exist in the input object, the pipeline will create a new column and put the sample identifier in it. If the `batch_col` is something else than `batch`, it will be renamed to `batch` during pipeline execution. | +| `symbol_col` | Column in the input file containing gene symbol information. Defaults to `index`. There are two special values that can be used: `index` and `none`. `index` will use the row names of the matrix as gene symbols. `none` will trigger the pipeline to perform gene symbol conversion (this is not supported yet). The values from `symbol_col` will be copied to a column `gene_symbols` during pipeline execution. | +| `label_col` | Column in the input file containing cell type information. Defaults to `label`. 
If the column does not exist in the input object, the pipeline will create a new column and put `unknown` in it. If the `label_col` is something other than `label`, it will be renamed to `label` during pipeline execution. | +| `unknown_label` | Value in the `label_col` column that should be considered as unknown. Defaults to `unknown`. If the `unknown_label` is something other than `unknown`, it will be renamed to `unknown` during pipeline execution. If trying to perform integration with scANVI, more than one unique label other than `unknown` must exist in the input data. | +| `min_genes` | Minimum number of genes required for a cell to be considered. Defaults to `1`. | +| `min_cells` | Minimum number of cells required for a gene to be considered. Defaults to `1`. | +| `min_counts_cell` | Minimum number of counts required for a cell to be considered. Defaults to `1`. | +| `min_counts_gene` | Minimum number of counts required for a gene to be considered. Defaults to `1`. | +| `expected_cells` | Number of expected cells, used as input to Cellbender. | +| `max_mito_fraction` | Maximum percentage of mitochondrial counts (an integer between `0` and `100`) for a cell to be considered. Defaults to `100`. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. @@ -107,6 +108,10 @@ genome: 'GRCh37' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). +### Cell type annotation + +Automated cell type annotation using [Celltypist](https://github.com/Teichlab/celltypist) is supported. You can specify the models to use with the [`celltypist_model` parameter](https://nf-co.re/scdownstream/dev/parameters/#celltypist_model). If no models are specified, no cell type annotation will be performed. + ### Reference mapping The pipeline supports mapping new samples into the latent space of an existing scVI/scANVI model. 
diff --git a/modules/local/scanpy/filter/main.nf b/modules/local/scanpy/filter/main.nf
index 3ac4abe..0fcb7e5 100644
--- a/modules/local/scanpy/filter/main.nf
+++ b/modules/local/scanpy/filter/main.nf
@@ -22,6 +22,9 @@ process SCANPY_FILTER {
     min_cells = meta.min_cells ?: 1
     min_counts_gene = meta.min_counts_gene ?: 1
     min_counts_cell = meta.min_counts_cell ?: 1
+    // 0 is a valid threshold per the samplesheet schema, but Groovy's elvis
+    // operator treats 0 as false, so it is checked explicitly before defaulting.
+    max_mito_fraction = meta.max_mito_fraction == 0 ? 0 : (meta.max_mito_fraction ?: 100)
     prefix = task.ext.prefix ?: "${meta.id}"
     template 'filter.py'
 }
diff --git a/modules/local/scanpy/filter/templates/filter.py b/modules/local/scanpy/filter/templates/filter.py
index 4bcbc05..847cd11 100644
--- a/modules/local/scanpy/filter/templates/filter.py
+++ b/modules/local/scanpy/filter/templates/filter.py
@@ -29,5 +29,17 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
 adata = sc.read_h5ad("${h5ad}")
 prefix = "${prefix}"
 
+# Flag mitochondrial genes by their symbol prefix ("MT-" or "mt-").
+# NOTE(review): this matches on var_names, so it presumably flags nothing when
+# var_names hold gene IDs instead of symbols -- confirm against the inputs.
+adata.var["mt"] = adata.var_names.str.startswith(("MT-", "mt-"))
+sc.pp.calculate_qc_metrics(
+    adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True
+)
+# The threshold is a percentage (0-100) and an inclusive maximum, so the
+# default of 100 keeps every cell with a defined mitochondrial percentage.
+max_mito_pct = int("${max_mito_fraction}")
+adata = adata[adata.obs["pct_counts_mt"] <= max_mito_pct, :].copy()
+
 sc.pp.filter_cells(adata, min_counts=int("${min_counts_cell}"))
 sc.pp.filter_genes(adata, min_counts=int("${min_counts_gene}"))