From c1f4acade1bc4e991ef54f4e17c19a02ee86c500 Mon Sep 17 00:00:00 2001 From: Ramprasad Neethiraj <20065894+ramprasadn@users.noreply.github.com> Date: Fri, 21 Jul 2023 10:50:30 +0200 Subject: [PATCH] review suggestions --- CHANGELOG.md | 1 - docs/output.md | 23 +++++++++++----------- lib/WorkflowRaredisease.groovy | 4 ---- nextflow_schema.json | 31 ++++++++++++++++++++++++++++- workflows/raredisease.nf | 36 ---------------------------------- 5 files changed, 42 insertions(+), 53 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 204d7a6c..65e05ecf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` - Add GATK's cnv calling pipeline [#362](https://github.com/nf-core/raredisease/pull/362) -- Add `public_aws_ecr` profile for using AWS ECR public gallery images [#360](https://github.com/nf-core/raredisease/pull/360) - GATK's ShiftFasta to generate all the files required for mitochondrial analysis [#354](https://github.com/nf-core/raredisease/pull/354) - Feature to calculate CADD scores for indels [#325](https://github.com/nf-core/raredisease/pull/325) - HmtNote to annotate mitochondria [#355](https://github.com/nf-core/raredisease/pull/355) diff --git a/docs/output.md b/docs/output.md index 66fe854a..6d07a3bd 100644 --- a/docs/output.md +++ b/docs/output.md @@ -54,6 +54,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Annotation:](#annotation-) - [HaploGrep2](#haplogrep2) - [vcfanno](#vcfanno-1) + - [CADD](#cadd-1) - [VEP](#vep-2) - [HmtNote](#hmtnote) - [Rank variants and filtering](#rank-variants-and-filtering) @@ -247,15 +248,15 @@ The pipeline performs variant calling using [Sentieon DNAscope](https://support. #### Manta -[Manta](https://github.com/Illumina/manta) calls structural variants (SVs) and indels from mapped paired-end sequencing reads. 
It combines paired and split-read evidence during SV discovery and scoring to improve accuracy, but does not require split-reads or successful breakpoint assemblies to report a variant in cases where there is strong evidence otherwise. Output vcf files are treated as intermediates and are not placed in the output folder by default. +[Manta](https://github.com/Illumina/manta) calls structural variants (SVs) and indels from mapped paired-end sequencing reads. It combines paired and split-read evidence during SV discovery and scoring to improve accuracy, but does not require split-reads or successful breakpoint assemblies to report a variant in cases where there is strong evidence otherwise. Output vcf files are treated as intermediates and are not placed in the output folder. #### TIDDIT sv -[TIDDIT's sv](https://github.com/SciLifeLab/TIDDIT) is used to identify chromosomal rearrangements using sequencing data. TIDDIT identifies intra and inter-chromosomal translocations, deletions, tandem-duplications and inversions, using supplementary alignments as well as discordant pairs. TIDDIT searches for discordant reads and split reads (supplementary alignments). Output vcf files are treated as intermediates and are not placed in the output folder by default. +[TIDDIT's sv](https://github.com/SciLifeLab/TIDDIT) is used to identify chromosomal rearrangements using sequencing data. TIDDIT identifies intra and inter-chromosomal translocations, deletions, tandem-duplications and inversions, using supplementary alignments as well as discordant pairs. TIDDIT searches for discordant reads and split reads (supplementary alignments). Output vcf files are treated as intermediates and are not placed in the output folder. #### GATK GermlineCNVCaller - CNV calling -[GATK GermlineCNVCaller](https://github.com/broadinstitute/gatk) is used to identify copy number variants in germline samples given their read counts and a model describing a sample's ploidy. 
Output vcf files are treated as intermediates and are not placed in the output folder by default. +[GATK GermlineCNVCaller](https://github.com/broadinstitute/gatk) is used to identify copy number variants in germline samples given their read counts and a model describing a sample's ploidy. Output vcf files are treated as intermediates and are not placed in the output folder. #### SVDB merge @@ -302,17 +303,17 @@ The pipeline performs variant calling using [Sentieon DNAscope](https://support. #### bcftools roh -[bcftools roh](https://samtools.github.io/bcftools/bcftools.html#roh) is a program for detecting runs of homo/autozygosity.from only bi-allelic sites. The output files are not published in the output folder by default, and is passed to vcfanno for further annotation. +[bcftools roh](https://samtools.github.io/bcftools/bcftools.html#roh) is a program for detecting runs of homo/autozygosity using only bi-allelic sites. The output files are not published in the output folder, and are passed to vcfanno for further annotation. #### vcfanno -[vcfanno](https://github.com/brentp/vcfanno) allows you to quickly annotate your VCF with any number of INFO fields from any number of VCFs. It uses a simple conf file to allow the user to specify the source annotation files and fields and how they will be added to the info of the query VCF. Values are pulled by name from the INFO field with special-cases of ID and FILTER to pull from those VCF columns. The output files are not published in the output folder by default, and is passed to CADD and/or VEP for further annotation. +[vcfanno](https://github.com/brentp/vcfanno) allows you to quickly annotate your VCF with any number of INFO fields from any number of VCFs. It uses a simple configuration file to allow the user to specify the source annotation files and fields and how they will be added to the info of the query VCF. 
Values are pulled by name from the INFO field with special-cases of ID and FILTER to pull from those VCF columns. The output files are not published in the output folder, and are passed to CADD and/or VEP for further annotation. We recommend using vcfanno to annotate SNVs with precomputed CADD scores (files can be downloaded from [here](https://cadd.gs.washington.edu/download)). #### CADD -[CADD](https://cadd.gs.washington.edu/) is a tool for scoring the deleteriousness of single nucleotide variants as well as insertion/deletions variants in the human genome. In nf-core/raredisease, SNVs can be annotated with precomputed CADD scores using vcfanno. However, for small indels they will be calculated on the fly by CADD. The output files are not published in the output folder by default, and is passed to VEP for further annotation. +[CADD](https://cadd.gs.washington.edu/) is a tool for scoring the deleteriousness of single nucleotide variants as well as insertion/deletion variants in the human genome. In nf-core/raredisease, SNVs can be annotated with precomputed CADD scores using vcfanno. However, for small indels they will be calculated on the fly by CADD. The output files are not published in the output folder, and are passed to VEP for further annotation. #### VEP @@ -360,7 +361,7 @@ Based on VEP annotations, custom scripts used by the pipeline further annotate e #### SVDB query -[SVDB query](https://github.com/J35P312/SVDB#Query) allows you to quickly annotate your VCF with data from one or more structural variant databases. The output files are not published in the output folder by default, and is passed to vep for further annotation. +[SVDB query](https://github.com/J35P312/SVDB#Query) allows you to quickly annotate your VCF with data from one or more structural variant databases. The output files are not published in the output folder, and are passed to vep for further annotation. 
#### VEP @@ -407,17 +408,17 @@ The pipeline for mitochondrial variant discovery, using Mutect2, uses a high sen ##### vcfanno -[vcfanno](https://github.com/brentp/vcfanno) allows you to quickly annotate your VCF with any number of INFO fields from any number of VCFs. It uses a simple conf file to allow the user to specify the source annotation files and fields and how they will be added to the info of the query VCF. Values are pulled by name from the INFO field with special-cases of ID and FILTER to pull from those VCF columns. The output files are not published in the output folder by default, and is passed to vep for further annotation. +[vcfanno](https://github.com/brentp/vcfanno) allows you to quickly annotate your VCF with any number of INFO fields from any number of VCFs. It uses a simple configuration file to allow the user to specify the source annotation files and fields and how they will be added to the info of the query VCF. Values are pulled by name from the INFO field with special-cases of ID and FILTER to pull from those VCF columns. The output files are not published in the output folder, and are passed to vep for further annotation. We recommend using vcfanno to annotate SNVs with precomputed CADD scores (files can be downloaded from [here](https://cadd.gs.washington.edu/download)). -#### CADD +##### CADD -[CADD](https://cadd.gs.washington.edu/) is a tool for scoring the deleteriousness of single nucleotide variants as well as insertion/deletions variants in the human genome. In nf-core/raredisease, SNVs can be annotated with precomputed CADD scores using vcfanno. However, for small indels they will be calculated on the fly by CADD. The output files are not published in the output folder by default, and is passed to VEP for further annotation. +[CADD](https://cadd.gs.washington.edu/) is a tool for scoring the deleteriousness of single nucleotide variants as well as insertion/deletion variants in the human genome. 
In nf-core/raredisease, SNVs can be annotated with precomputed CADD scores using vcfanno. However, for small indels they will be calculated on the fly by CADD. The output files are not published in the output folder, and are passed to VEP for further annotation. ##### Hmtnote -[HmtNote](https://github.com/robertopreste/HmtNote) annotates vcf containing human mitochondrial variants with HmtVar. It will run offline by default wiht a database within the container. +[HmtNote](https://github.com/robertopreste/HmtNote) annotates vcf containing human mitochondrial variants with HmtVar. It will run offline by default with a database within the container. ##### VEP diff --git a/lib/WorkflowRaredisease.groovy b/lib/WorkflowRaredisease.groovy index 48f8bb4d..99e5f500 100755 --- a/lib/WorkflowRaredisease.groovy +++ b/lib/WorkflowRaredisease.groovy @@ -14,10 +14,6 @@ class WorkflowRaredisease { genomeExistsError(params, log) - - if (!params.fasta) { - Nextflow.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." 
- } } // diff --git a/nextflow_schema.json b/nextflow_schema.json index dce94330..5367ada4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -59,6 +59,7 @@ }, "bwa": { "type": "string", + "exists": true, "format": "directory-path", "description": "Directory for pre-built bwa index.", "help_text": "If none provided, will be generated automatically from the FASTA reference.", @@ -66,6 +67,7 @@ }, "bwamem2": { "type": "string", + "exists": true, "format": "directory-path", "description": "Directory for pre-built bwamem2 index.", "help_text": "If none provided, will be generated automatically from the FASTA reference.", @@ -73,6 +75,7 @@ }, "cadd_resources": { "type": "string", + "exists": true, "format": "directory-path", "fa_icon": "fas fa-file", "description": "Path to the directory containing cadd annotations.", @@ -80,6 +83,7 @@ }, "fai": { "type": "string", + "exists": true, "format": "file-path", "help_text": "If none provided, will be generated automatically from the FASTA reference", "description": "Path to FASTA genome index file.", @@ -98,6 +102,7 @@ }, "gcnvcaller_model": { "type": "string", + "exists": true, "fa_icon": "fas fa-file", "description": "A file containing the path to models produced by GATK4 GermlineCNVCaller cohort.", "format": "file-path", @@ -112,6 +117,7 @@ }, "gens_gnomad_pos": { "type": "string", + "exists": true, "format": "file-path", "fa_icon": "fas fa-file", "description": "Path to a list of common SNP locations for Gens.", @@ -120,6 +126,7 @@ }, "gens_interval_list": { "type": "string", + "exists": true, "format": "file-path", "fa_icon": "fas fa-file", "description": "Path to interval list for Gens.", @@ -128,6 +135,7 @@ }, "gens_pon": { "type": "string", + "exists": true, "format": "file-path", "fa_icon": "fas fa-file", "description": "Path to panel of normals for Gens.", @@ -136,6 +144,7 @@ }, "gnomad_af": { "type": "string", + "exists": true, "format": "path", "fa_icon": "fas fa-file", "pattern": 
"^\\S+\\.tab(\\.gz)?$", @@ -144,6 +153,7 @@ }, "gnomad_af_idx": { "type": "string", + "exists": true, "format": "path", "fa_icon": "fas fa-file", "pattern": "^\\S+\\.bed(\\.gz)?\\.idx$", @@ -165,6 +175,7 @@ }, "intervals_wgs": { "type": "string", + "exists": true, "format": "path", "fa_icon": "fas fa-file", "pattern": "^\\S+\\.intervals?(_list)?$", @@ -173,6 +184,7 @@ }, "intervals_y": { "type": "string", + "exists": true, "format": "path", "fa_icon": "fas fa-file", "pattern": "^\\S+\\.intervals?(_list)?$", @@ -181,6 +193,7 @@ }, "known_dbsnp": { "type": "string", + "exists": true, "format": "path", "fa_icon": "fas fa-file", "pattern": "^\\S+\\.vcf(\\.gz)?$", @@ -188,6 +201,7 @@ }, "known_dbsnp_tbi": { "type": "string", + "exists": true, "format": "path", "fa_icon": "fas fa-file", "pattern": "^\\S+\\.vcf(\\.gz)?\\.tbi$", @@ -202,19 +216,20 @@ }, "mito_name": { "type": "string", - "format": "path", "description": "Name of the mitochondrial contig in the reference fasta file", "help_text": "Used to extract relevant information from the references to analyse mitochondria", "fa_icon": "fas fa-align-center" }, "ml_model": { "type": "string", + "exists": true, "format": "path", "fa_icon": "fas fa-file", "description": "Path to sentieon machine learning model file." 
}, "mt_fasta": { "type": "string", + "exists": true, "format": "file-path", "mimetype": "text/plain", "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", @@ -223,6 +238,7 @@ }, "ploidy_model": { "type": "string", + "exists": true, "fa_icon": "fas fa-folder-open", "description": "Directory containing the ploidy model files", "format": "directory-path", @@ -230,6 +246,7 @@ }, "readcount_intervals": { "type": "string", + "exists": true, "fa_icon": "fas fa-file", "description": "Interval list file containing the intervals over which read counts are tabulated for CNV calling", "format": "file-path", @@ -237,6 +254,7 @@ }, "reduced_penetrance": { "type": "string", + "exists": true, "format": "path", "fa_icon": "fas fa-file-csv", "description": "File with gene ids that have reduced penetrance. For use with genmod" @@ -249,18 +267,21 @@ }, "score_config_snv": { "type": "string", + "exists": true, "format": "path", "fa_icon": "fas fa-file", "description": "SNV rank model config file for genmod." }, "score_config_sv": { "type": "string", + "exists": true, "format": "path", "fa_icon": "fas fa-file", "description": "SV rank model config file for genmod." 
}, "sequence_dictionary": { "type": "string", + "exists": true, "format": "path", "fa_icon": "fas fa-file", "pattern": "^\\S+\\.dict$", @@ -268,6 +289,7 @@ }, "svdb_query_dbs": { "type": "string", + "exists": true, "format": "file-path", "description": "Databases used for structural variant annotation.", "fa_icon": "fas fa-file-csv", @@ -275,6 +297,7 @@ }, "target_bed": { "type": "string", + "exists": true, "format": "path", "fa_icon": "fas fa-file", "pattern": "^\\S+\\.bed(\\.gz)?$", @@ -283,6 +306,7 @@ }, "variant_catalog": { "type": "string", + "exists": true, "format": "file-path", "description": "Path to variant catalog file", "help_text": "Used with ExpansionHunter and if no catalogue is passed, then a default will be used.", @@ -290,12 +314,14 @@ }, "vcfanno_resources": { "type": "string", + "exists": true, "description": "Path to a file containing the absolute paths to resources defined within the vcfanno toml file. One line per resource.", "help_text": "If no file is passed, default configurations will be used according to genome build within the context of the pipeline.", "fa_icon": "fas fa-file" }, "vcfanno_toml": { "type": "string", + "exists": true, "description": "Path to the vcfanno toml file.", "pattern": "^\\S+\\.toml$", "help_text": "If no toml is passed, default configurations will be used according to genome build within the context of the pipeline.", @@ -303,6 +329,7 @@ }, "vcfanno_lua": { "type": "string", + "exists": true, "description": "Path to the vcfanno lua file.", "pattern": "^\\S+\\.lua$", "help_text": "Custom operations file (lua). 
For use when the built-in ops don't supply the needed reduction.", @@ -310,12 +337,14 @@ }, "vep_cache": { "type": "string", + "exists": true, "description": "Path to vep's cache directory.", "help_text": "If no directory path is passed, vcf files will not be annotated by vep.", "fa_icon": "fas fa-folder-open" }, "vep_filters": { "type": "string", + "exists": true, "format": "path", "fa_icon": "fas fa-file-csv", "description": "Path to the file containing HGNC_IDs of interest on separate lines." diff --git a/workflows/raredisease.nf b/workflows/raredisease.nf index 50d3eaed..e5497222 100644 --- a/workflows/raredisease.nf +++ b/workflows/raredisease.nf @@ -10,42 +10,6 @@ def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) def citation = '\n' + WorkflowMain.citation(workflow) + '\n' def summary_params = paramsSummaryMap(workflow) -// Check input path parameters to see if they exist -def checkPathParamList = [ - params.bwa, - params.bwamem2, - params.call_interval, - params.cadd_resources, - params.fasta, - params.fai, - params.gens_gnomad_pos, - params.gens_interval_list, - params.gens_pon, - params.gnomad_af, - params.gnomad_af_idx, - params.input, - params.intervals_wgs, - params.intervals_y, - params.known_dbsnp, - params.known_dbsnp_tbi, - params.ml_model, - params.mt_fasta, - params.multiqc_config, - params.reduced_penetrance, - params.score_config_snv, - params.score_config_sv, - params.sequence_dictionary, - params.svdb_query_dbs, - params.target_bed, - params.variant_catalog, - params.vcfanno_lua, - params.vcfanno_resources, - params.vcfanno_toml, - params.vep_cache, - params.vep_filters -] - -for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } // Print parameter summary log to screen log.info logo + paramsSummaryLog(workflow) + citation