From 74d9ea7560df793902df99b6d459aaf4c897cee2 Mon Sep 17 00:00:00 2001 From: Usman Rashid Date: Tue, 20 Feb 2024 17:23:27 +1300 Subject: [PATCH] Updated pipeline upto gt/stat --- README.md | 14 +- conf/modules.config | 221 ++++++++++++++++++ modules.json | 35 +++ .../environment.yml | 9 + .../checkgff3fastacorrespondence/main.nf | 25 ++ .../checkgff3fastacorrespondence/meta.yml | 56 +++++ .../check_gff3_fasta_correspondence.sh | 70 ++++++ .../tests/main.nf.test | 70 ++++++ .../tests/main.nf.test.snap | 72 ++++++ .../tests/tags.yml | 2 + modules/pfr/gt/gff3/environment.yml | 9 + modules/pfr/gt/gff3/main.nf | 51 ++++ modules/pfr/gt/gff3/meta.yml | 48 ++++ modules/pfr/gt/gff3/tests/main.nf.test | 61 +++++ modules/pfr/gt/gff3/tests/main.nf.test.snap | 72 ++++++ modules/pfr/gt/gff3/tests/nextflow.config | 3 + modules/pfr/gt/gff3/tests/tags.yml | 2 + modules/pfr/gt/gff3validator/environment.yml | 9 + modules/pfr/gt/gff3validator/main.nf | 50 ++++ modules/pfr/gt/gff3validator/meta.yml | 49 ++++ .../pfr/gt/gff3validator/tests/main.nf.test | 63 +++++ .../gt/gff3validator/tests/main.nf.test.snap | 72 ++++++ modules/pfr/gt/gff3validator/tests/tags.yml | 2 + modules/pfr/gt/stat/environment.yml | 9 + modules/pfr/gt/stat/main.nf | 35 +++ modules/pfr/gt/stat/meta.yml | 46 ++++ modules/pfr/gt/stat/tests/main.nf.test | 37 +++ modules/pfr/gt/stat/tests/main.nf.test.snap | 31 +++ modules/pfr/gt/stat/tests/tags.yml | 2 + subworkflows/pfr/gff3_validate/main.nf | 61 +++++ subworkflows/pfr/gff3_validate/meta.yml | 50 ++++ .../pfr/gff3_validate/tests/main.nf.test | 86 +++++++ .../pfr/gff3_validate/tests/main.nf.test.snap | 115 +++++++++ .../pfr/gff3_validate/tests/nextflow.config | 6 + subworkflows/pfr/gff3_validate/tests/tags.yml | 2 + workflows/assemblyqc.nf | 33 ++- 36 files changed, 1566 insertions(+), 12 deletions(-) create mode 100644 modules/pfr/custom/checkgff3fastacorrespondence/environment.yml create mode 100644 modules/pfr/custom/checkgff3fastacorrespondence/main.nf create mode 
100644 modules/pfr/custom/checkgff3fastacorrespondence/meta.yml create mode 100755 modules/pfr/custom/checkgff3fastacorrespondence/templates/check_gff3_fasta_correspondence.sh create mode 100644 modules/pfr/custom/checkgff3fastacorrespondence/tests/main.nf.test create mode 100644 modules/pfr/custom/checkgff3fastacorrespondence/tests/main.nf.test.snap create mode 100644 modules/pfr/custom/checkgff3fastacorrespondence/tests/tags.yml create mode 100644 modules/pfr/gt/gff3/environment.yml create mode 100644 modules/pfr/gt/gff3/main.nf create mode 100644 modules/pfr/gt/gff3/meta.yml create mode 100644 modules/pfr/gt/gff3/tests/main.nf.test create mode 100644 modules/pfr/gt/gff3/tests/main.nf.test.snap create mode 100644 modules/pfr/gt/gff3/tests/nextflow.config create mode 100644 modules/pfr/gt/gff3/tests/tags.yml create mode 100644 modules/pfr/gt/gff3validator/environment.yml create mode 100644 modules/pfr/gt/gff3validator/main.nf create mode 100644 modules/pfr/gt/gff3validator/meta.yml create mode 100644 modules/pfr/gt/gff3validator/tests/main.nf.test create mode 100644 modules/pfr/gt/gff3validator/tests/main.nf.test.snap create mode 100644 modules/pfr/gt/gff3validator/tests/tags.yml create mode 100644 modules/pfr/gt/stat/environment.yml create mode 100644 modules/pfr/gt/stat/main.nf create mode 100644 modules/pfr/gt/stat/meta.yml create mode 100644 modules/pfr/gt/stat/tests/main.nf.test create mode 100644 modules/pfr/gt/stat/tests/main.nf.test.snap create mode 100644 modules/pfr/gt/stat/tests/tags.yml create mode 100644 subworkflows/pfr/gff3_validate/main.nf create mode 100644 subworkflows/pfr/gff3_validate/meta.yml create mode 100644 subworkflows/pfr/gff3_validate/tests/main.nf.test create mode 100644 subworkflows/pfr/gff3_validate/tests/main.nf.test.snap create mode 100644 subworkflows/pfr/gff3_validate/tests/nextflow.config create mode 100644 subworkflows/pfr/gff3_validate/tests/tags.yml diff --git a/README.md b/README.md index ee85285..7e9252f 100644 --- 
a/README.md +++ b/README.md @@ -69,12 +69,14 @@ flowchart LR Prepare an `assemblysheet.csv` file with following columns representing target assemblies and associated meta-data. -- tag: A unique tag which represents the target assembly throughout the pipeline and in the final report -- fasta: FASTA file -- gff3 [Optional]: GFF3 annotation file if available -- monoploid_ids [Optional]: A txt file listing the IDs used to calculate LAI in monoploid mode if necessary -- hic_reads [Optional] A SRA id such as 'SRR8238190' or path to paired reads such as 'PG_PETUNIA_HiC_CGYCF_CACTCA_L001_R{1,2}.fastq.gz' -- synteny_labels [Optional]: A two column tsv file listing fasta sequence ids (first column) and labels for the synteny plots (second column) when performing synteny analysis +- `tag:` A unique tag which represents the target assembly throughout the pipeline and in the final report +- `fasta:` FASTA file +- `gff3 [Optional]:` GFF3 annotation file if available +- `monoploid_ids [Optional]:` A txt file listing the IDs used to calculate LAI in monoploid mode if necessary +- `hic_reads [Optional]:` A SRA id such as 'SRR8238190' or path to paired reads such as 'PG_PETUNIA_HiC_CGYCF_CACTCA_L001_R{1,2}.fastq.gz' +- `synteny_labels [Optional]:` A two column tsv file listing fasta sequence ids (first column) and labels for the synteny plots (second column) when performing synteny analysis + +See a minimal example [assemblysheet.csv](./assets/assemblysheet.csv) Now, you can run the pipeline using: diff --git a/conf/modules.config b/conf/modules.config index bb1fc48..26190d9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -21,3 +21,224 @@ process { } } + +process { + withName: ASSEMBLATHON_STATS { + publishDir = [ + path: { "${params.outdir}/assemblathon_stats" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? 
null : filename } + ] + } + + withName: NCBI_FCS_ADAPTOR { + publishDir = [ + path: { "${params.outdir}/ncbi_fcs_adaptor" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: RUN_BUSCO { + publishDir = [ + path: { "${params.outdir}/busco" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: RUN_KRAKEN2 { + publishDir = [ + path: { "${params.outdir}/kraken2" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: NCBI_FCS_GX_SCREEN_SAMPLES { + publishDir = [ + path: { "${params.outdir}/ncbi_fcs_gx" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: CIRCOS { + publishDir = [ + path: { "${params.outdir}/synteny/${target_on_ref_seq}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } +} + + +process { + withName: '.*:GFF3_VALIDATE:GT_GFF3' { + ext.args = '-tidy -retainids' + } + + withName: GT_STAT { + ext.args = [ + '-genelengthdistri', + '-genescoredistri', + '-exonlengthdistri', + '-exonnumberdistri', + '-intronlengthdistri', + '-cdslengthdistri', + '-addintrons' + ].join(' ').trim() + + publishDir = [ + path: { "${params.outdir}/genometools_gt_stat" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } +} + +process { + + withName: FILTER_BY_LENGTH { + ext.args = params.tidk_filter_by_size ? 
"-m ${params.tidk_filter_size_bp}" : '' + ext.prefix = { "${meta.id}.filtered" } + } + + withName: SORT_BY_LENGTH { + ext.args = '--quiet --reverse --by-length' + ext.prefix = { "${meta.id}.sorted" } + } + + withName: TIDK_EXPLORE { + ext.args = '--minimum 5 --maximum 30' + publishDir = [ + path: { "${params.outdir}/tidk" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: TIDK_SEARCH_APRIORI { + ext.prefix = { "${meta.id}.apriori" } + ext.args = '--extension tsv' + publishDir = [ + path: { "${params.outdir}/tidk" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: TIDK_SEARCH_APOSTERIORI { + ext.prefix = { "${meta.id}.aposteriori" } + ext.args = '--extension tsv' + publishDir = [ + path: { "${params.outdir}/tidk" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: TIDK_PLOT_APRIORI { + ext.prefix = { "${meta.id}.apriori" } + publishDir = [ + path: { "${params.outdir}/tidk" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: TIDK_PLOT_APOSTERIORI { + ext.prefix = { "${meta.id}.aposteriori" } + publishDir = [ + path: { "${params.outdir}/tidk" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? 
null : filename } + ] + } +} + + +process { + + withName: CUSTOM_SHORTENFASTAIDS { + publishDir = [ + path: { "${params.outdir}/lai" }, + mode: params.publish_dir_mode, + pattern: '*.short.ids.tsv' + ] + } + + withName: EDTA_LTRHARVEST { + ext.prefix = { "${meta.id}_edta_ltrharvest" } + } + + withName: LTRFINDER { + ext.args = '-harvest_out -size 1000000 -time 300' + } + + withName: CAT_CAT { + ext.prefix = { "${meta.id}_ltrharvest_ltrfinder.tabout" } + } + + withName: LTRRETRIEVER { + publishDir = [ + path: { "${params.outdir}/lai" }, + mode: params.publish_dir_mode, + pattern: '*.LTRlib.fa' + ] + } + + withName: CUSTOM_RESTOREGFFIDS { + publishDir = [ + path: { "${params.outdir}/lai" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: LAI { + publishDir = [ + path: { "${params.outdir}/lai" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } +} + +process { + + withName: FASTQC_RAW { + publishDir = [ + path: { "${params.outdir}/hic/fastqc_raw" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: FASTQC_TRIM { + publishDir = [ + path: { "${params.outdir}/hic/fastqc_trim" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? null : filename } + ] + } + + withName: FASTP { + ext.args = params.hic_fastp_ext_args + publishDir = [ + path: { "${params.outdir}/hic/fastp" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals("versions.yml") ? 
null : filename } + ] + } + + withName: BWA_MEM { + ext.prefix = { "${meta.id}.on.${meta.ref_id}.bwa.mem" } + ext.args = '-5SP' + } + + withName: SAMBLASTER { + ext.prefix = { "${meta.id}.on.${meta.ref_id}.samblaster" } + ext.args3 = '-h -F 2316' + } +} diff --git a/modules.json b/modules.json index f01155f..ef57949 100644 --- a/modules.json +++ b/modules.json @@ -2,6 +2,41 @@ "name": "plant-food-research-open/assemblyqc", "homePage": "https://github.com/plant-food-research-open/assemblyqc", "repos": { + "git@github.com:PlantandFoodResearch/nxf-modules.git": { + "modules": { + "pfr": { + "custom/checkgff3fastacorrespondence": { + "branch": "main", + "git_sha": "1a76f884082c786760559c462063a5d1de94ca83", + "installed_by": ["gff3_validate"] + }, + "gt/gff3": { + "branch": "main", + "git_sha": "bfa4874d3942bdff70cb8df17322834125cafb28", + "installed_by": ["gff3_validate"] + }, + "gt/gff3validator": { + "branch": "main", + "git_sha": "889b9b57b611dcb063594608c2a222c928327cba", + "installed_by": ["gff3_validate"] + }, + "gt/stat": { + "branch": "main", + "git_sha": "cb5fb0be78a98fd1e32b7c90d6adf8c3bf44133e", + "installed_by": ["modules"] + } + } + }, + "subworkflows": { + "pfr": { + "gff3_validate": { + "branch": "main", + "git_sha": "f9b96bf8142a01f0649ff90570fb10aa973504b9", + "installed_by": ["subworkflows"] + } + } + } + }, "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { diff --git a/modules/pfr/custom/checkgff3fastacorrespondence/environment.yml b/modules/pfr/custom/checkgff3fastacorrespondence/environment.yml new file mode 100644 index 0000000..ec0e86d --- /dev/null +++ b/modules/pfr/custom/checkgff3fastacorrespondence/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "custom_checkgff3fastacorrespondence" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::samtools=1.18" diff --git 
a/modules/pfr/custom/checkgff3fastacorrespondence/main.nf b/modules/pfr/custom/checkgff3fastacorrespondence/main.nf new file mode 100644 index 0000000..c1abb6f --- /dev/null +++ b/modules/pfr/custom/checkgff3fastacorrespondence/main.nf @@ -0,0 +1,25 @@ +process CUSTOM_CHECKGFF3FASTACORRESPONDENCE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1': + 'biocontainers/samtools:1.18--h50ea8bc_1' }" + + input: + tuple val(meta), path(gff3) + path(fasta) + + output: + tuple val(meta), path('*.success.log') , emit: success_log , optional: true + tuple val(meta), path('*.error.log') , emit: error_log , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + prefix = task.ext.prefix ?: "${meta.id}" + template 'check_gff3_fasta_correspondence.sh' +} diff --git a/modules/pfr/custom/checkgff3fastacorrespondence/meta.yml b/modules/pfr/custom/checkgff3fastacorrespondence/meta.yml new file mode 100644 index 0000000..69bbd05 --- /dev/null +++ b/modules/pfr/custom/checkgff3fastacorrespondence/meta.yml @@ -0,0 +1,56 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "custom_checkgff3fastacorrespondence" +description: "A custom bash script which checks the correspondence of a gff3 file with a fasta file" +keywords: + - genome + - gff3 + - annotation + - validation +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. 
+ homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - gff3: + type: file + description: Input gff3 file + pattern: "*.{gff,gff3}" + - fasta: + type: file + description: Input fasta file + pattern: "*.{fsa,fa,fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - success_log: + type: file + description: Log file for successful validation + pattern: "*.success.log" + - error_log: + type: file + description: Log file for failed validation + pattern: "*.error.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/pfr/custom/checkgff3fastacorrespondence/templates/check_gff3_fasta_correspondence.sh b/modules/pfr/custom/checkgff3fastacorrespondence/templates/check_gff3_fasta_correspondence.sh new file mode 100755 index 0000000..611c64b --- /dev/null +++ b/modules/pfr/custom/checkgff3fastacorrespondence/templates/check_gff3_fasta_correspondence.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +# Bump VERSION on edit +VERSION="v1" + +gff3_file="!{gff3}" +fasta_file="!{fasta}" +out_prefix="!{prefix}" +task_process="!{task.process}" + +# Record versions +cat <<-END_VERSIONS > versions.yml +"${task_process}": + samtools: $(echo $(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*$//' ) +END_VERSIONS + +# Requires +# samtools faidx + +## STEP 1 +# Check that gff3 has no identifiers that are not in fasta (fasta can +# have ids that are not in gff3 since not all assembly units have gff3 records) + +# Extract identifiers from the GFF3 file +gff3_identifiers=$(grep -v '^#' "$gff3_file" | awk '{print $1}' | sort -u) + +# Extract identifiers from the FASTA file 
+fasta_identifiers=$(grep '^>' "$fasta_file" | awk '{print substr($1, 2)}' | sort -u) + +# Compare identifiers and find any that are present in the GFF3 but not in the FASTA +missing_identifiers=$(comm -23 <(echo "$gff3_identifiers") <(echo "$fasta_identifiers")) + +# Check if any missing identifiers were found +if [[ -n "$missing_identifiers" ]]; then + touch "${out_prefix}.error.log" + echo "Failed to validate gff3 file for: $tag_label" >> "${out_prefix}.error.log" + echo "Fasta file: $fasta_file" >> "${out_prefix}.error.log" + echo "Gff3 file: $gff3_file" >> "${out_prefix}.error.log" + echo "GFF3 file contains identifiers not present in FASTA:" >> "${out_prefix}.error.log" + echo "$missing_identifiers" >> "${out_prefix}.error.log" + exit 0 +fi + +## STEP 2 +# check that there are no coordinates in gff3 for any seqid that are +# greater than the seq length of the parent fasta entry + +# Compute sequence lengths using samtools faidx +samtools faidx "$fasta_file" | cut -f 1,2 > sequence_lengths.txt + +# Check GFF3 file for coordinates exceeding sequence lengths +while IFS=$'\t' read -r seqname source feature start end score strand frame attributes && \ + read -r seq seq_length <&3; do + if [[ $start -gt $seq_length || $end -gt $seq_length ]]; then + touch "${out_prefix}.error.log" + echo "Failed to validate gff3 file for: $tag_label" >> "${out_prefix}.error.log" + echo "Fasta file: $fasta_file" >> "${out_prefix}.error.log" + echo "Gff3 file: $gff3_file" >> "${out_prefix}.error.log" + echo "Coordinates exceed sequence length in GFF3 file:" >> "${out_prefix}.error.log" + echo "Sequence: $seqname" >> "${out_prefix}.error.log" + echo "Sequence length: $seq_length" >> "${out_prefix}.error.log" + echo "Start: $start" >> "${out_prefix}.error.log" + echo "End: $end" >> "${out_prefix}.error.log" + exit 0 + fi +done < "$gff3_file" 3< "sequence_lengths.txt" + +touch "${out_prefix}.success.log" +echo "All tests passed..." 
>> "${out_prefix}.success.log" +exit 0 diff --git a/modules/pfr/custom/checkgff3fastacorrespondence/tests/main.nf.test b/modules/pfr/custom/checkgff3fastacorrespondence/tests/main.nf.test new file mode 100644 index 0000000..91578e5 --- /dev/null +++ b/modules/pfr/custom/checkgff3fastacorrespondence/tests/main.nf.test @@ -0,0 +1,70 @@ +nextflow_process { + + name "Test Process CUSTOM_CHECKGFF3FASTACORRESPONDENCE" + script "../main.nf" + process "CUSTOM_CHECKGFF3FASTACORRESPONDENCE" + + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "custom/checkgff3fastacorrespondence" + + test("sarscov2-fasta-gff3-success") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + ] + + input[1] = [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.error_log == [] }, + { assert process.out.success_log != null }, + { assert path(process.out.success_log.get(0).get(1)).getText().contains("All tests passed...")}, + ) + } + + } + + test("sarscov2-gff3-homo_sapiens-fasta-error") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + ] + + input[1] = [ + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.success_log == [] }, + { assert process.out.error_log != null }, + { assert path(process.out.error_log.get(0).get(1)).getText().contains("GFF3 file contains identifiers not present in FASTA")}, + ) + } + + } + +} diff --git a/modules/pfr/custom/checkgff3fastacorrespondence/tests/main.nf.test.snap 
b/modules/pfr/custom/checkgff3fastacorrespondence/tests/main.nf.test.snap new file mode 100644 index 0000000..261e0dc --- /dev/null +++ b/modules/pfr/custom/checkgff3fastacorrespondence/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "sarscov2-gff3-homo_sapiens-fasta-error": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.error.log:md5,8a119170625dc95fb2faa6843fad2c3f" + ] + ], + "2": [ + "versions.yml:md5,c8e0bb60f7422aa6c15db35013620802" + ], + "error_log": [ + [ + { + "id": "test" + }, + "test.error.log:md5,8a119170625dc95fb2faa6843fad2c3f" + ] + ], + "success_log": [ + + ], + "versions": [ + "versions.yml:md5,c8e0bb60f7422aa6c15db35013620802" + ] + } + ], + "timestamp": "2023-11-29T12:24:08.677505" + }, + "sarscov2-fasta-gff3-success": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.success.log:md5,5cad27984e6af4889f7dcf12264fe47b" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,c8e0bb60f7422aa6c15db35013620802" + ], + "error_log": [ + + ], + "success_log": [ + [ + { + "id": "test" + }, + "test.success.log:md5,5cad27984e6af4889f7dcf12264fe47b" + ] + ], + "versions": [ + "versions.yml:md5,c8e0bb60f7422aa6c15db35013620802" + ] + } + ], + "timestamp": "2023-11-29T12:24:04.530428" + } +} \ No newline at end of file diff --git a/modules/pfr/custom/checkgff3fastacorrespondence/tests/tags.yml b/modules/pfr/custom/checkgff3fastacorrespondence/tests/tags.yml new file mode 100644 index 0000000..708130d --- /dev/null +++ b/modules/pfr/custom/checkgff3fastacorrespondence/tests/tags.yml @@ -0,0 +1,2 @@ +custom/checkgff3fastacorrespondence: + - "modules/pfr/custom/checkgff3fastacorrespondence/**" diff --git a/modules/pfr/gt/gff3/environment.yml b/modules/pfr/gt/gff3/environment.yml new file mode 100644 index 0000000..8289fb3 --- /dev/null +++ b/modules/pfr/gt/gff3/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json 
+name: "gt_gff3" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::genometools-genometools=1.6.5" diff --git a/modules/pfr/gt/gff3/main.nf b/modules/pfr/gt/gff3/main.nf new file mode 100644 index 0000000..d27e2bb --- /dev/null +++ b/modules/pfr/gt/gff3/main.nf @@ -0,0 +1,51 @@ +process GT_GFF3 { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/genometools-genometools:1.6.5--py310h3db02ab_0': + 'biocontainers/genometools-genometools:1.6.5--py310h3db02ab_0' }" + + input: + tuple val(meta), path(gff3) + + output: + tuple val(meta), path("*.gt.gff3") , emit: gt_gff3 , optional: true + tuple val(meta), path("*.error.log"), emit: error_log , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + gt \\ + gff3 \\ + $args \\ + "$gff3" \\ + > "${prefix}.gt.gff3" \\ + 2> "${prefix}.error.log" \\ + || echo "Errors from gt-gff3 printed to ${prefix}.error.log" + + if grep -q "gt gff3: error:" "${prefix}.error.log"; then + echo "gt-gff3 failed to parse $gff3" + + rm \\ + "${prefix}.gt.gff3" + else + echo "gt-gff3 successfully parsed $gff3" + + mv \\ + "${prefix}.error.log" \\ + gt_gff3.stderr + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + genometools: \$(gt --version | head -1 | sed 's/gt (GenomeTools) //') + END_VERSIONS + """ +} diff --git a/modules/pfr/gt/gff3/meta.yml b/modules/pfr/gt/gff3/meta.yml new file mode 100644 index 0000000..5cecd8d --- /dev/null +++ b/modules/pfr/gt/gff3/meta.yml @@ -0,0 +1,48 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "gt_gff3" +description: "GenomeTools gt-gff3 utility to parse, 
possibly transform, and output GFF3 files" +keywords: + - genome + - gff3 + - annotation +tools: + - "gt": + description: "The GenomeTools genome analysis system" + homepage: "https://genometools.org/index.html" + documentation: "https://genometools.org/documentation.html" + tool_dev_url: "https://github.com/genometools/genometools" + doi: "10.1109/TCBB.2013.68" + licence: ["ISC"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - gff3: + type: file + description: Input gff3 file + pattern: "*.{gff,gff3}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - gt_gff3: + type: file + description: Parsed gff3 file produced only if there is no parsing error + pattern: "*.gt.gff3" + - error_log: + type: file + description: Error log if gt-gff3 failed to parse the input gff3 file + pattern: "*.error.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@gallvp" +maintainers: + - "@gallvp" diff --git a/modules/pfr/gt/gff3/tests/main.nf.test b/modules/pfr/gt/gff3/tests/main.nf.test new file mode 100644 index 0000000..cb44bc8 --- /dev/null +++ b/modules/pfr/gt/gff3/tests/main.nf.test @@ -0,0 +1,61 @@ +nextflow_process { + + name "Test Process GT_GFF3" + script "../main.nf" + process "GT_GFF3" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "gt" + tag "gt/gff3" + + test("sarscov2-gff3-valid") { + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gt_gff3 != null }, + { assert process.out.error_log == [] } + ) + } + + } + + test("sarscov2-gff3-invalid") { + when { + process { + """ + input[0] = Channel.of( + 
'##gff-version 3', + 'chr22\tID=gene:ENSG00000233995;Name=AP000547.1' + ) + .collectFile(name: 'sample.gff3', newLine: true) + .map { file -> [ [ id:'test' ], file ] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.gt_gff3 == [] }, + { assert process.out.error_log != null }, + { assert path(process.out.error_log.get(0).get(1)).getText().contains("gt gff3: error:") } + ) + } + } + +} diff --git a/modules/pfr/gt/gff3/tests/main.nf.test.snap b/modules/pfr/gt/gff3/tests/main.nf.test.snap new file mode 100644 index 0000000..f31e8d1 --- /dev/null +++ b/modules/pfr/gt/gff3/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "sarscov2-gff3-invalid": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.error.log:md5,31e6117c516f936ec403f792c732bc76" + ] + ], + "2": [ + "versions.yml:md5,9753770dd19a2a306dcf16d4aaf049eb" + ], + "error_log": [ + [ + { + "id": "test" + }, + "test.error.log:md5,31e6117c516f936ec403f792c732bc76" + ] + ], + "gt_gff3": [ + + ], + "versions": [ + "versions.yml:md5,9753770dd19a2a306dcf16d4aaf049eb" + ] + } + ], + "timestamp": "2023-11-28T13:43:34.620429" + }, + "sarscov2-gff3-valid": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gt.gff3:md5,2ae900237ace415557b8735fac088b85" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,9753770dd19a2a306dcf16d4aaf049eb" + ], + "error_log": [ + + ], + "gt_gff3": [ + [ + { + "id": "test" + }, + "test.gt.gff3:md5,2ae900237ace415557b8735fac088b85" + ] + ], + "versions": [ + "versions.yml:md5,9753770dd19a2a306dcf16d4aaf049eb" + ] + } + ], + "timestamp": "2023-11-28T13:43:31.065832" + } +} \ No newline at end of file diff --git a/modules/pfr/gt/gff3/tests/nextflow.config b/modules/pfr/gt/gff3/tests/nextflow.config new file mode 100644 index 0000000..af56226 --- /dev/null +++ b/modules/pfr/gt/gff3/tests/nextflow.config @@ -0,0 +1,3 @@ +process { + ext.args = '-tidy -retainids' +} diff 
--git a/modules/pfr/gt/gff3/tests/tags.yml b/modules/pfr/gt/gff3/tests/tags.yml new file mode 100644 index 0000000..ae04030 --- /dev/null +++ b/modules/pfr/gt/gff3/tests/tags.yml @@ -0,0 +1,2 @@ +gt/gff3: + - "modules/pfr/gt/gff3/**" diff --git a/modules/pfr/gt/gff3validator/environment.yml b/modules/pfr/gt/gff3validator/environment.yml new file mode 100644 index 0000000..ea57ebe --- /dev/null +++ b/modules/pfr/gt/gff3validator/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "gt_gff3validator" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "bioconda::genometools-genometools=1.6.5" diff --git a/modules/pfr/gt/gff3validator/main.nf b/modules/pfr/gt/gff3validator/main.nf new file mode 100644 index 0000000..ae7ec9e --- /dev/null +++ b/modules/pfr/gt/gff3validator/main.nf @@ -0,0 +1,50 @@ +process GT_GFF3VALIDATOR { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/genometools-genometools:1.6.5--py310h3db02ab_0': + 'biocontainers/genometools-genometools:1.6.5--py310h3db02ab_0' }" + + input: + tuple val(meta), path(gff3) + + output: + tuple val(meta), path('*.success.log') , emit: success_log , optional: true + tuple val(meta), path('*.error.log') , emit: error_log , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + gt \\ + gff3validator \\ + "$gff3" \\ + > "${prefix}.success.log" \\ + 2> "${prefix}.error.log" \\ + || echo "Errors from gt-gff3validator printed to ${prefix}.error.log" + + if grep -q "input is valid GFF3" "${prefix}.success.log"; then + echo "Validation successful..." 
+ + mv \\ + "${prefix}.error.log" \\ + gt_gff3validator.stderr + else + echo "Validation failed..." + + mv \\ + "${prefix}.success.log" \\ + gt_gff3validator.stdout + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + genometools: \$(gt --version | head -1 | sed 's/gt (GenomeTools) //') + END_VERSIONS + """ +} diff --git a/modules/pfr/gt/gff3validator/meta.yml b/modules/pfr/gt/gff3validator/meta.yml new file mode 100644 index 0000000..3322faf --- /dev/null +++ b/modules/pfr/gt/gff3validator/meta.yml @@ -0,0 +1,49 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "gt_gff3validator" +description: "GenomeTools gt-gff3validator utility to strictly validate a GFF3 file" +keywords: + - genome + - gff3 + - annotation + - validation +tools: + - "gt": + description: "The GenomeTools genome analysis system" + homepage: "https://genometools.org/index.html" + documentation: "https://genometools.org/documentation.html" + tool_dev_url: "https://github.com/genometools/genometools" + doi: "10.1109/TCBB.2013.68" + licence: ["ISC"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - gff3: + type: file + description: Input gff3 file + pattern: "*.{gff,gff3}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'test' ]` + - success_log: + type: file + description: Log file for successful validation + pattern: "*.success.log" + - error_log: + type: file + description: Log file for failed validation + pattern: "*.error.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/pfr/gt/gff3validator/tests/main.nf.test b/modules/pfr/gt/gff3validator/tests/main.nf.test new file mode 100644 index 0000000..1b99e55 --- /dev/null +++ b/modules/pfr/gt/gff3validator/tests/main.nf.test @@ -0,0 +1,63 @@ +nextflow_process { + + name "Test Process GT_GFF3VALIDATOR" + script "../main.nf" + process "GT_GFF3VALIDATOR" + + tag "modules" + tag "modules_nfcore" + tag "gt" + tag "gt/gff3validator" + + test("custom-gff3-valid") { + + when { + process { + """ + input[0] = Channel.of( + '##gff-version 3', + 'chr22\thavana\tpseudogene\t16572027\t16574637\t.\t+\t.\tID=gene:ENSG00000233995;Name=AP000547.1' + ) + .collectFile(name: 'sample.gff3', newLine: true) + .map { file -> [ [ id:'test' ], file ] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.error_log == [] }, + { assert process.out.success_log != null }, + { assert path(process.out.success_log.get(0).get(1)).getText().contains("input is valid GFF3") } + ) + } + + } + + test("sarscov2-gff3-invalid") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['genome']['genome_gff3'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert process.out.success_log == [] }, + { assert process.out.error_log != null }, + { assert path(process.out.error_log.get(0).get(1)).getText().contains("gt gff3validator: error:") } + ) + } + + } +} diff --git 
a/modules/pfr/gt/gff3validator/tests/main.nf.test.snap b/modules/pfr/gt/gff3validator/tests/main.nf.test.snap new file mode 100644 index 0000000..0b6f065 --- /dev/null +++ b/modules/pfr/gt/gff3validator/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "sarscov2-gff3-invalid": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.error.log:md5,c5d16b263a87072a13cca44fd811b8e2" + ] + ], + "2": [ + "versions.yml:md5,5927673eb73a8c22408643d224414215" + ], + "error_log": [ + [ + { + "id": "test" + }, + "test.error.log:md5,c5d16b263a87072a13cca44fd811b8e2" + ] + ], + "success_log": [ + + ], + "versions": [ + "versions.yml:md5,5927673eb73a8c22408643d224414215" + ] + } + ], + "timestamp": "2023-11-29T11:09:23.708792" + }, + "custom-gff3-valid": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.success.log:md5,b11ca5c18c865fc808ea0fef0b07da30" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,5927673eb73a8c22408643d224414215" + ], + "error_log": [ + + ], + "success_log": [ + [ + { + "id": "test" + }, + "test.success.log:md5,b11ca5c18c865fc808ea0fef0b07da30" + ] + ], + "versions": [ + "versions.yml:md5,5927673eb73a8c22408643d224414215" + ] + } + ], + "timestamp": "2023-11-29T11:09:19.530068" + } +} \ No newline at end of file diff --git a/modules/pfr/gt/gff3validator/tests/tags.yml b/modules/pfr/gt/gff3validator/tests/tags.yml new file mode 100644 index 0000000..e247d55 --- /dev/null +++ b/modules/pfr/gt/gff3validator/tests/tags.yml @@ -0,0 +1,2 @@ +gt/gff3validator: + - "modules/pfr/gt/gff3validator/**" diff --git a/modules/pfr/gt/stat/environment.yml b/modules/pfr/gt/stat/environment.yml new file mode 100644 index 0000000..dca959b --- /dev/null +++ b/modules/pfr/gt/stat/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "gt_stat" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - 
"bioconda::genometools-genometools=1.6.5" diff --git a/modules/pfr/gt/stat/main.nf b/modules/pfr/gt/stat/main.nf new file mode 100644 index 0000000..3308b56 --- /dev/null +++ b/modules/pfr/gt/stat/main.nf @@ -0,0 +1,35 @@ +process GT_STAT { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/genometools-genometools:1.6.5--py310h3db02ab_0': + 'biocontainers/genometools-genometools:1.6.5--py310h3db02ab_0' }" + + input: + tuple val(meta), path(gff3) + + output: + tuple val(meta), path("*.gt.stat.yml") , emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + gt \\ + stat \\ + $args \\ + "$gff3" \\ + > "${prefix}.gt.stat.yml" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + genometools: \$(gt --version | head -1 | sed 's/gt (GenomeTools) //') + END_VERSIONS + """ +} diff --git a/modules/pfr/gt/stat/meta.yml b/modules/pfr/gt/stat/meta.yml new file mode 100644 index 0000000..203059a --- /dev/null +++ b/modules/pfr/gt/stat/meta.yml @@ -0,0 +1,46 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "gt_stat" +description: "GenomeTools gt-stat utility to show statistics about features contained in GFF3 files" +keywords: + - genome + - gff3 + - annotation + - statistics + - stats +tools: + - "gt": + description: "The GenomeTools genome analysis system" + homepage: "https://genometools.org/index.html" + documentation: "https://genometools.org/documentation.html" + tool_dev_url: "https://github.com/genometools/genometools" + doi: "10.1109/TCBB.2013.68" + licence: ["ISC"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + 
e.g. `[ id:'test' ]` + - gff3: + type: file + description: Input gff3 file + pattern: "*.{gff,gff3}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test' ]` + - stats: + type: file + description: Stats file in yaml format + pattern: "*.gt.stat.yml" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/modules/pfr/gt/stat/tests/main.nf.test b/modules/pfr/gt/stat/tests/main.nf.test new file mode 100644 index 0000000..57f5992 --- /dev/null +++ b/modules/pfr/gt/stat/tests/main.nf.test @@ -0,0 +1,37 @@ +nextflow_process { + + name "Test Process GT_STAT" + script "../main.nf" + process "GT_STAT" + + tag "modules" + tag "modules_nfcore" + tag "gt" + tag "gt/stat" + + test("sarscov2-gff3") { + + when { + process { + """ + input[0] = Channel.of( + "##gff-version 3" + + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true).getText().toLowerCase() + ) + .collectFile(name: 'sample.gff3', newLine: true) + .map { file -> [ [ id:'test' ], file ] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() }, + { assert path(process.out.stats.get(0).get(1)).getText().contains("cdss: 12") } + ) + } + + } + +} diff --git a/modules/pfr/gt/stat/tests/main.nf.test.snap b/modules/pfr/gt/stat/tests/main.nf.test.snap new file mode 100644 index 0000000..2fcfb8a --- /dev/null +++ b/modules/pfr/gt/stat/tests/main.nf.test.snap @@ -0,0 +1,31 @@ +{ + "sarscov2-gff3": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gt.stat.yml:md5,ebba7831ddbf916b8bbea675ba8693b5" + ] + ], + "1": [ + "versions.yml:md5,a184b50afb2ad6dd2d3d37b0a211dd71" + ], + "stats": [ + [ + { + "id": "test" + }, + "test.gt.stat.yml:md5,ebba7831ddbf916b8bbea675ba8693b5" + ] + ], + "versions": [ + "versions.yml:md5,a184b50afb2ad6dd2d3d37b0a211dd71" + ] + } + ], + 
"timestamp": "2023-11-29T11:34:48.057277" + } +} \ No newline at end of file diff --git a/modules/pfr/gt/stat/tests/tags.yml b/modules/pfr/gt/stat/tests/tags.yml new file mode 100644 index 0000000..46be634 --- /dev/null +++ b/modules/pfr/gt/stat/tests/tags.yml @@ -0,0 +1,2 @@ +gt/stat: + - "modules/pfr/gt/stat/**" diff --git a/subworkflows/pfr/gff3_validate/main.nf b/subworkflows/pfr/gff3_validate/main.nf new file mode 100644 index 0000000..5437c5a --- /dev/null +++ b/subworkflows/pfr/gff3_validate/main.nf @@ -0,0 +1,61 @@ +include { GT_GFF3 } from '../../../modules/pfr/gt/gff3/main' +include { GT_GFF3VALIDATOR } from '../../../modules/pfr/gt/gff3validator/main' +include { CUSTOM_CHECKGFF3FASTACORRESPONDENCE } from '../../../modules/pfr/custom/checkgff3fastacorrespondence/main' + +workflow GFF3_VALIDATE { + + take: + ch_gff3 // channel: [ val(meta), gff3 ] + ch_fasta // channel: [ val(meta), fasta ] + + main: + + ch_versions = Channel.empty() + + // MODULE: GT_GFF3 + GT_GFF3 ( ch_gff3 ) + ch_versions = ch_versions.mix(GT_GFF3.out.versions.first()) + + // MODULE: GT_GFF3VALIDATOR + GT_GFF3VALIDATOR ( GT_GFF3.out.gt_gff3 ) + ch_versions = ch_versions.mix(GT_GFF3VALIDATOR.out.versions.first()) + + // MODULE: CUSTOM_CHECKGFF3FASTACORRESPONDENCE + GT_GFF3VALIDATOR.out.success_log + | join ( + GT_GFF3.out.gt_gff3 + ) + | map { meta, log, gff3 -> [ meta, gff3 ] } + | join ( + ch_fasta + ) + | set { ch_gff3_fasta } + + CUSTOM_CHECKGFF3FASTACORRESPONDENCE ( + ch_gff3_fasta.map { meta, gff3, fasta -> [ meta, gff3 ] }, + ch_gff3_fasta.map { meta, gff3, fasta -> fasta } + ) + + ch_versions = ch_versions.mix(CUSTOM_CHECKGFF3FASTACORRESPONDENCE.out.versions.first()) + + CUSTOM_CHECKGFF3FASTACORRESPONDENCE.out.success_log + | join ( + ch_gff3_fasta.map { meta, gff3, fasta -> [ meta, gff3 ] } + ) + | map { meta, log, gff3 -> [ meta, gff3 ] } + | set { ch_valid_gff3 } + + GT_GFF3.out.error_log + | mix ( + GT_GFF3VALIDATOR.out.error_log + ) + | mix ( + 
CUSTOM_CHECKGFF3FASTACORRESPONDENCE.out.error_log + ) + | set { ch_log_for_invalid_gff3 } + + emit: + valid_gff3 = ch_valid_gff3 // channel: [ val(meta), gff3 ] + log_for_invalid_gff3 = ch_log_for_invalid_gff3 // channel: [ val(meta), log ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/pfr/gff3_validate/meta.yml b/subworkflows/pfr/gff3_validate/meta.yml new file mode 100644 index 0000000..5dea12a --- /dev/null +++ b/subworkflows/pfr/gff3_validate/meta.yml @@ -0,0 +1,50 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: gff3_validate +description: | + Validates a gff3 file using GenomeTools gt-gff3, gt-gff3validator and + checks its correspondence with a fasta file +keywords: + - genome + - gff3 + - annotation + - validation +components: + - gt/gff3 + - gt/gff3validator + - custom/checkgff3fastacorrespondence +input: + - ch_gff3: + type: file + description: | + Input channel containing a gff3 file + Structure: [ val(meta), path(gff3) ] + pattern: "*.{gff,gff3}" + - ch_fasta: + type: file + description: | + Input channel containing a fasta file + Structure: [ val(meta), path(fasta) ] + pattern: "*.{fsa,fa,fasta}" +output: + - valid_gff3: + type: file + description: | + Valid gff3 file + Structure: [ val(meta), path(gff3) ] + pattern: "*.gff3" + - log_for_invalid_gff3: + type: file + description: | + Error log if the gff3 file is invalid + Structure: [ val(meta), path(log) ] + pattern: "*.log" + - versions: + type: file + description: | + File containing software versions + Structure: [ path(versions.yml) ] + pattern: "versions.yml" +authors: + - "@GallVp" +maintainers: + - "@GallVp" diff --git a/subworkflows/pfr/gff3_validate/tests/main.nf.test b/subworkflows/pfr/gff3_validate/tests/main.nf.test new file mode 100644 index 0000000..e71712b --- /dev/null +++ b/subworkflows/pfr/gff3_validate/tests/main.nf.test @@ -0,0 +1,86 @@ +nextflow_workflow { + +
name "Test Workflow GFF3_VALIDATE" + script "../main.nf" + workflow "GFF3_VALIDATE" + config "./nextflow.config" + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/gff3_validate" + tag "gff3_validate" + tag "gt" + tag "gt/gff3" + tag "gt/gff3validator" + tag "custom" + tag "custom/checkgff3fastacorrespondence" + + test("sarscov2-genome_gff3-genome_fasta-all_pass") { + + when { + workflow { + """ + input[0] = Channel.of([ [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + ]) + input[1] = Channel.of([ [ id:'test' ], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out).match()} + ) + } + } + + test("homo_sapiens-genome_bed-genome_fasta-gt_gff3_fail") { + + when { + workflow { + """ + input[0] = Channel.of([ [ id:'test' ], // meta map + file(params.test_data['homo_sapiens']['genome']['genome_bed'], checkIfExists: true) + ]) + input[1] = Channel.of([ [ id:'test' ], + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out).match()} + ) + } + } + + test("sarscov2-genome_gff3-homo_sapiens-genome_fasta-correspondence_fail") { + + when { + workflow { + """ + input[0] = Channel.of([ [ id:'test' ], // meta map + file(params.test_data['sarscov2']['genome']['genome_gff3'], checkIfExists: true) + ]) + input[1] = Channel.of([ [ id:'test' ], + file(params.test_data['homo_sapiens']['genome']['genome_fasta'], checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out).match()} + ) + } + } +} diff --git a/subworkflows/pfr/gff3_validate/tests/main.nf.test.snap b/subworkflows/pfr/gff3_validate/tests/main.nf.test.snap new file mode 100644 index 0000000..4d2a59b --- /dev/null 
+++ b/subworkflows/pfr/gff3_validate/tests/main.nf.test.snap @@ -0,0 +1,115 @@ +{ + "sarscov2-genome_gff3-genome_fasta-all_pass": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gt.gff3:md5,2ae900237ace415557b8735fac088b85" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,10fe5c201e5fcddb52c3607ab3fdfb34", + "versions.yml:md5,856745cef2fff087e50ea4c0ffa3addd", + "versions.yml:md5,a89255422a163684b0c80ebdd8ad28ae" + ], + "log_for_invalid_gff3": [ + + ], + "valid_gff3": [ + [ + { + "id": "test" + }, + "test.gt.gff3:md5,2ae900237ace415557b8735fac088b85" + ] + ], + "versions": [ + "versions.yml:md5,10fe5c201e5fcddb52c3607ab3fdfb34", + "versions.yml:md5,856745cef2fff087e50ea4c0ffa3addd", + "versions.yml:md5,a89255422a163684b0c80ebdd8ad28ae" + ] + } + ], + "timestamp": "2023-12-07T10:33:21.09887" + }, + "homo_sapiens-genome_bed-genome_fasta-gt_gff3_fail": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.error.log:md5,c096494c3cd02864eb54434c294ba382" + ] + ], + "2": [ + "versions.yml:md5,a89255422a163684b0c80ebdd8ad28ae" + ], + "log_for_invalid_gff3": [ + [ + { + "id": "test" + }, + "test.error.log:md5,c096494c3cd02864eb54434c294ba382" + ] + ], + "valid_gff3": [ + + ], + "versions": [ + "versions.yml:md5,a89255422a163684b0c80ebdd8ad28ae" + ] + } + ], + "timestamp": "2023-12-07T10:35:26.549003" + }, + "sarscov2-genome_gff3-homo_sapiens-genome_fasta-correspondence_fail": { + "content": [ + { + "0": [ + + ], + "1": [ + [ + { + "id": "test" + }, + "test.error.log:md5,67686ea1ef271821f1218a8fe0207e1f" + ] + ], + "2": [ + "versions.yml:md5,10fe5c201e5fcddb52c3607ab3fdfb34", + "versions.yml:md5,856745cef2fff087e50ea4c0ffa3addd", + "versions.yml:md5,a89255422a163684b0c80ebdd8ad28ae" + ], + "log_for_invalid_gff3": [ + [ + { + "id": "test" + }, + "test.error.log:md5,67686ea1ef271821f1218a8fe0207e1f" + ] + ], + "valid_gff3": [ + + ], + "versions": [ + "versions.yml:md5,10fe5c201e5fcddb52c3607ab3fdfb34", + 
"versions.yml:md5,856745cef2fff087e50ea4c0ffa3addd", + "versions.yml:md5,a89255422a163684b0c80ebdd8ad28ae" + ] + } + ], + "timestamp": "2023-12-07T10:35:32.53584" + } +} \ No newline at end of file diff --git a/subworkflows/pfr/gff3_validate/tests/nextflow.config b/subworkflows/pfr/gff3_validate/tests/nextflow.config new file mode 100644 index 0000000..d07a888 --- /dev/null +++ b/subworkflows/pfr/gff3_validate/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + + withName: GT_GFF3 { + ext.args = '-tidy -retainids -addintrons' + } +} diff --git a/subworkflows/pfr/gff3_validate/tests/tags.yml b/subworkflows/pfr/gff3_validate/tests/tags.yml new file mode 100644 index 0000000..60ffbf0 --- /dev/null +++ b/subworkflows/pfr/gff3_validate/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/gff3_validate: + - subworkflows/pfr/gff3_validate/** diff --git a/workflows/assemblyqc.nf b/workflows/assemblyqc.nf index 56f2143..4af2a4a 100644 --- a/workflows/assemblyqc.nf +++ b/workflows/assemblyqc.nf @@ -21,18 +21,14 @@ WorkflowAssemblyqc.initialise(params, log) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) - /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// -// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules -// -// include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { GT_STAT } from '../modules/pfr/gt/stat/main' +include { GFF3_VALIDATE } from '../subworkflows/pfr/gff3_validate/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -113,6 +109,31 @@ workflow ASSEMBLYQC { ch_versions = ch_versions.mix(FASTAVALIDATOR.out.versions.first()) + // SUBWORKFLOW: GFF3_VALIDATE + GFF3_VALIDATE ( + 
ch_assembly_gff3, + ch_valid_target_assembly + ) + + ch_valid_gff3 = GFF3_VALIDATE.out.valid_gff3 + + ch_invalid_gff3_log = GFF3_VALIDATE.out.log_for_invalid_gff3 + | map { meta, error_log -> + log.warn("GFF3 validation failed for ${meta.id}\n${error_log.text}") + + [ meta, error_log ] + } + + ch_versions = ch_versions.mix(GFF3_VALIDATE.out.versions) + + // MODULE: GT_STAT + GT_STAT ( ch_valid_gff3 ) + + ch_gt_stats = GT_STAT.out.stats + | map { meta, yml -> yml } + + ch_versions = ch_versions.mix(GT_STAT.out.versions.first()) + // MODULE: CUSTOM_DUMPSOFTWAREVERSIONS CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml')