Updated pipeline up to gt/stat

Plant-Food-Research-Open · Feb 20, 2024 · 74d9ea7 · 74d9ea7
1 parent 2f8b46a
commit 74d9ea7
Show file tree

Hide file tree

Showing 36 changed files with 1,566 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -69,12 +69,14 @@ flowchart LR
 
 Prepare an `assemblysheet.csv` file with following columns representing target assemblies and associated meta-data.
 
-- tag: A unique tag which represents the target assembly throughout the pipeline and in the final report
-- fasta: FASTA file
-- gff3 [Optional]: GFF3 annotation file if available
-- monoploid_ids [Optional]: A txt file listing the IDs used to calculate LAI in monoploid mode if necessary
-- hic_reads [Optional] A SRA id such as 'SRR8238190' or path to paired reads such as 'PG_PETUNIA_HiC_CGYCF_CACTCA_L001_R{1,2}.fastq.gz'
-- synteny_labels [Optional]: A two column tsv file listing fasta sequence ids (first column) and labels for the synteny plots (second column) when performing synteny analysis
+- `tag:` A unique tag which represents the target assembly throughout the pipeline and in the final report
+- `fasta:` FASTA file
+- `gff3 [Optional]:` GFF3 annotation file if available
+- `monoploid_ids [Optional]:` A txt file listing the IDs used to calculate LAI in monoploid mode if necessary
+- `hic_reads [Optional]:` An SRA ID such as 'SRR8238190' or a path to paired reads such as 'PG_PETUNIA_HiC_CGYCF_CACTCA_L001_R{1,2}.fastq.gz'
+- `synteny_labels [Optional]:` A two-column TSV file listing FASTA sequence IDs (first column) and labels for the synteny plots (second column) when performing synteny analysis
+
+See a minimal example [assemblysheet.csv](./assets/assemblysheet.csv)
 
 Now, you can run the pipeline using:
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -21,3 +21,224 @@ process {
     }
 
 }
+
+process { // publishDir overrides for ASSEMBLATHON_STATS, NCBI_FCS_ADAPTOR, RUN_BUSCO, RUN_KRAKEN2, NCBI_FCS_GX_SCREEN_SAMPLES and CIRCOS
+    withName: ASSEMBLATHON_STATS {
+        publishDir = [
+            path: { "${params.outdir}/assemblathon_stats" }, // closure, so params.outdir is read when the path is evaluated
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename } // null => versions.yml is not published; other files keep their name
+        ]
+    }
+
+    withName: NCBI_FCS_ADAPTOR { // same publishing pattern, under <outdir>/ncbi_fcs_adaptor
+        publishDir = [
+            path: { "${params.outdir}/ncbi_fcs_adaptor" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+
+    withName: RUN_BUSCO { // same publishing pattern, under <outdir>/busco
+        publishDir = [
+            path: { "${params.outdir}/busco" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+
+    withName: RUN_KRAKEN2 { // same publishing pattern, under <outdir>/kraken2
+        publishDir = [
+            path: { "${params.outdir}/kraken2" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+
+    withName: NCBI_FCS_GX_SCREEN_SAMPLES { // same publishing pattern, under <outdir>/ncbi_fcs_gx
+        publishDir = [
+            path: { "${params.outdir}/ncbi_fcs_gx" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+
+    withName: CIRCOS { // publishes into a per-task sub-directory of <outdir>/synteny
+        publishDir = [
+            path: { "${params.outdir}/synteny/${target_on_ref_seq}" }, // target_on_ref_seq is not defined in this config; presumably a CIRCOS task input — TODO confirm
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+}
+
+
+process { // extra arguments for the GenomeTools-named processes GT_GFF3 and GT_STAT, plus GT_STAT publishing
+    withName: '.*:GFF3_VALIDATE:GT_GFF3' { // quoted regex selector: only GT_GFF3 whose qualified name ends in :GFF3_VALIDATE:GT_GFF3
+        ext.args = '-tidy -retainids' // presumably forwarded to the tool by the module via task.ext.args — TODO confirm in the module script
+    }
+
+    withName: GT_STAT {
+        ext.args = [ // one flag per list entry; combined into a single string by the join below
+            '-genelengthdistri',
+            '-genescoredistri',
+            '-exonlengthdistri',
+            '-exonnumberdistri',
+            '-intronlengthdistri',
+            '-cdslengthdistri',
+            '-addintrons'
+        ].join(' ').trim() // => single space-separated argument string
+
+        publishDir = [
+            path: { "${params.outdir}/genometools_gt_stat" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename } // null => versions.yml is not published
+        ]
+    }
+}
+
+process { // settings for FILTER_BY_LENGTH, SORT_BY_LENGTH and the TIDK_* processes; all TIDK_* outputs publish to <outdir>/tidk
+
+    withName: FILTER_BY_LENGTH {
+        ext.args = params.tidk_filter_by_size ? "-m ${params.tidk_filter_size_bp}" : '' // "-m <tidk_filter_size_bp>" only when params.tidk_filter_by_size is truthy, else empty string
+        ext.prefix = { "${meta.id}.filtered" } // closure: prefix is built per task from meta.id
+    }
+
+    withName: SORT_BY_LENGTH {
+        ext.args = '--quiet --reverse --by-length'
+        ext.prefix = { "${meta.id}.sorted" } // closure: prefix is built per task from meta.id
+    }
+
+    withName: TIDK_EXPLORE {
+        ext.args = '--minimum 5 --maximum 30' // NOTE(review): bounds are hard-coded here rather than read from params
+        publishDir = [
+            path: { "${params.outdir}/tidk" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename } // null => versions.yml is not published
+        ]
+    }
+
+    withName: TIDK_SEARCH_APRIORI {
+        ext.prefix = { "${meta.id}.apriori" } // distinct from the ".aposteriori" prefix used below; both publish to the same directory
+        ext.args = '--extension tsv'
+        publishDir = [
+            path: { "${params.outdir}/tidk" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+
+    withName: TIDK_SEARCH_APOSTERIORI {
+        ext.prefix = { "${meta.id}.aposteriori" }
+        ext.args = '--extension tsv'
+        publishDir = [
+            path: { "${params.outdir}/tidk" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+
+    withName: TIDK_PLOT_APRIORI { // same prefix as TIDK_SEARCH_APRIORI; no ext.args set for the plot step
+        ext.prefix = { "${meta.id}.apriori" }
+        publishDir = [
+            path: { "${params.outdir}/tidk" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+
+    withName: TIDK_PLOT_APOSTERIORI { // same prefix as TIDK_SEARCH_APOSTERIORI; no ext.args set for the plot step
+        ext.prefix = { "${meta.id}.aposteriori" }
+        publishDir = [
+            path: { "${params.outdir}/tidk" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+}
+
+
+process { // settings for the processes that publish under <outdir>/lai, plus prefixes/args for intermediate steps
+
+    withName: CUSTOM_SHORTENFASTAIDS {
+        publishDir = [
+            path: { "${params.outdir}/lai" },
+            mode: params.publish_dir_mode,
+            pattern: '*.short.ids.tsv' // only files matching this glob are published
+        ]
+    }
+
+    withName: EDTA_LTRHARVEST { // no publishDir override in this block
+        ext.prefix = { "${meta.id}_edta_ltrharvest" }
+    }
+
+    withName: LTRFINDER { // no publishDir override in this block
+        ext.args = '-harvest_out -size 1000000 -time 300' // NOTE(review): values are hard-coded here rather than read from params
+    }
+
+    withName: CAT_CAT { // no publishDir override in this block
+        ext.prefix = { "${meta.id}_ltrharvest_ltrfinder.tabout" } // presumably the concatenated LTRharvest + LTR_FINDER output — TODO confirm against the workflow
+    }
+
+    withName: LTRRETRIEVER {
+        publishDir = [
+            path: { "${params.outdir}/lai" },
+            mode: params.publish_dir_mode,
+            pattern: '*.LTRlib.fa' // only files matching this glob are published
+        ]
+    }
+
+    withName: CUSTOM_RESTOREGFFIDS {
+        publishDir = [
+            path: { "${params.outdir}/lai" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename } // null => versions.yml is not published
+        ]
+    }
+
+    withName: LAI {
+        publishDir = [
+            path: { "${params.outdir}/lai" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+}
+
+process { // settings for read QC/trimming processes that publish under <outdir>/hic, plus BWA_MEM and SAMBLASTER options
+
+    withName: FASTQC_RAW {
+        publishDir = [
+            path: { "${params.outdir}/hic/fastqc_raw" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename } // null => versions.yml is not published
+        ]
+    }
+
+    withName: FASTQC_TRIM {
+        publishDir = [
+            path: { "${params.outdir}/hic/fastqc_trim" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+
+    withName: FASTP {
+        ext.args = params.hic_fastp_ext_args // user-configurable: taken directly from the pipeline parameter
+        publishDir = [
+            path: { "${params.outdir}/hic/fastp" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals("versions.yml") ? null : filename }
+        ]
+    }
+
+    withName: BWA_MEM { // no publishDir override in this block
+        ext.prefix = { "${meta.id}.on.${meta.ref_id}.bwa.mem" } // requires meta to carry both id and ref_id
+        ext.args = '-5SP'
+    }
+
+    withName: SAMBLASTER { // no publishDir override in this block
+        ext.prefix = { "${meta.id}.on.${meta.ref_id}.samblaster" } // requires meta to carry both id and ref_id
+        ext.args3 = '-h -F 2316' // args3, not args: presumably consumed by a third command inside the module — TODO confirm which one
+    }
+}
diff --git a/modules.json b/modules.json
@@ -2,6 +2,41 @@
     "name": "plant-food-research-open/assemblyqc",
     "homePage": "https://github.com/plant-food-research-open/assemblyqc",
     "repos": {
+        "[email protected]:PlantandFoodResearch/nxf-modules.git": {
+            "modules": {
+                "pfr": {
+                    "custom/checkgff3fastacorrespondence": {
+                        "branch": "main",
+                        "git_sha": "1a76f884082c786760559c462063a5d1de94ca83",
+                        "installed_by": ["gff3_validate"]
+                    },
+                    "gt/gff3": {
+                        "branch": "main",
+                        "git_sha": "bfa4874d3942bdff70cb8df17322834125cafb28",
+                        "installed_by": ["gff3_validate"]
+                    },
+                    "gt/gff3validator": {
+                        "branch": "main",
+                        "git_sha": "889b9b57b611dcb063594608c2a222c928327cba",
+                        "installed_by": ["gff3_validate"]
+                    },
+                    "gt/stat": {
+                        "branch": "main",
+                        "git_sha": "cb5fb0be78a98fd1e32b7c90d6adf8c3bf44133e",
+                        "installed_by": ["modules"]
+                    }
+                }
+            },
+            "subworkflows": {
+                "pfr": {
+                    "gff3_validate": {
+                        "branch": "main",
+                        "git_sha": "f9b96bf8142a01f0649ff90570fb10aa973504b9",
+                        "installed_by": ["subworkflows"]
+                    }
+                }
+            }
+        },
         "https://github.com/nf-core/modules.git": {
             "modules": {
                 "nf-core": {

diff --git a/modules/pfr/custom/checkgff3fastacorrespondence/environment.yml b/modules/pfr/custom/checkgff3fastacorrespondence/environment.yml
@@ -0,0 +1,9 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+name: "custom_checkgff3fastacorrespondence"  # conda environment name for this module
+channels:  # listed in the order conda-forge, bioconda, defaults
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:  # single dependency, pinned with an explicit channel prefix
+  - "bioconda::samtools=1.18"  # same samtools version (1.18) as the container tags in this module's main.nf
diff --git a/modules/pfr/custom/checkgff3fastacorrespondence/main.nf b/modules/pfr/custom/checkgff3fastacorrespondence/main.nf
@@ -0,0 +1,25 @@
+process CUSTOM_CHECKGFF3FASTACORRESPONDENCE { // runs the check_gff3_fasta_correspondence.sh template on a GFF3 file and a FASTA file
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml" // conda environment file located in this module's directory
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1':
+        'biocontainers/samtools:1.18--h50ea8bc_1' }" // Galaxy depot image only for Singularity without ext.singularity_pull_docker_container; biocontainers image otherwise
+
+    input:
+    tuple val(meta), path(gff3)
+    path(fasta) // separate input, not part of the meta tuple
+
+    output:
+    tuple val(meta), path('*.success.log')  , emit: success_log , optional: true // optional: a missing *.success.log does not fail the task
+    tuple val(meta), path('*.error.log')    , emit: error_log   , optional: true // optional: a missing *.error.log does not fail the task
+    path "versions.yml"                     , emit: versions // not optional: must always be produced
+
+    when:
+    task.ext.when == null || task.ext.when // runs unless ext.when is set to a falsy value
+
+    shell: // script body comes from the template referenced below, not from an inline string
+    prefix = task.ext.prefix ?: "${meta.id}" // falls back to meta.id when ext.prefix is unset or empty
+    template 'check_gff3_fasta_correspondence.sh' // template file is not visible in this view; presumably uses prefix for the log names — TODO confirm
+}
diff --git a/modules/pfr/custom/checkgff3fastacorrespondence/meta.yml b/modules/pfr/custom/checkgff3fastacorrespondence/meta.yml
@@ -0,0 +1,56 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "custom_checkgff3fastacorrespondence"  # same name as in this module's environment.yml
+description: "A custom bash script which checks the correspondence of a gff3 file with a fasta file"
+keywords:
+  - genome
+  - gff3
+  - annotation
+  - validation
+tools:  # samtools is the only tool listed; it is also the sole conda dependency and the container image
+  - samtools:
+      description: |
+        SAMtools is a set of utilities for interacting with and post-processing
+        short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
+        These files are generated as output by short read aligners like BWA.
+      homepage: http://www.htslib.org/
+      documentation: http://www.htslib.org/doc/samtools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+
+input:  # main.nf declares these as `tuple val(meta), path(gff3)` plus a separate `path(fasta)`
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'test' ]`
+  - gff3:
+      type: file
+      description: Input gff3 file
+      pattern: "*.{gff,gff3}"
+  - fasta:
+      type: file
+      description: Input fasta file
+      pattern: "*.{fsa,fa,fasta}"
+output:  # success_log and error_log are declared `optional: true` in main.nf; versions is not
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'test' ]`
+  - success_log:
+      type: file
+      description: Log file for successful validation
+      pattern: "*.success.log"  # same glob as the success_log output in main.nf
+  - error_log:
+      type: file
+      description: Log file for failed validation
+      pattern: "*.error.log"  # same glob as the error_log output in main.nf
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@GallVp"
+maintainers:
+  - "@GallVp"