Skip to content

Commit

Permalink
Updated pipeline upto gt/stat
Browse files Browse the repository at this point in the history
  • Loading branch information
GallVp committed Feb 20, 2024
1 parent 2f8b46a commit 74d9ea7
Show file tree
Hide file tree
Showing 36 changed files with 1,566 additions and 12 deletions.
14 changes: 8 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,12 +69,14 @@ flowchart LR
Prepare an `assemblysheet.csv` file with the following columns representing target assemblies and associated meta-data.

- tag: A unique tag which represents the target assembly throughout the pipeline and in the final report
- fasta: FASTA file
- gff3 [Optional]: GFF3 annotation file if available
- monoploid_ids [Optional]: A txt file listing the IDs used to calculate LAI in monoploid mode if necessary
- hic_reads [Optional] A SRA id such as 'SRR8238190' or path to paired reads such as 'PG_PETUNIA_HiC_CGYCF_CACTCA_L001_R{1,2}.fastq.gz'
- synteny_labels [Optional]: A two column tsv file listing fasta sequence ids (first column) and labels for the synteny plots (second column) when performing synteny analysis
- `tag:` A unique tag which represents the target assembly throughout the pipeline and in the final report
- `fasta:` FASTA file
- `gff3 [Optional]:` GFF3 annotation file if available
- `monoploid_ids [Optional]:` A txt file listing the IDs used to calculate LAI in monoploid mode if necessary
- `hic_reads [Optional]:` An SRA ID such as 'SRR8238190' or path to paired reads such as 'PG_PETUNIA_HiC_CGYCF_CACTCA_L001_R{1,2}.fastq.gz'
- `synteny_labels [Optional]:` A two column tsv file listing fasta sequence ids (first column) and labels for the synteny plots (second column) when performing synteny analysis

See a minimal example [assemblysheet.csv](./assets/assemblysheet.csv)

Now, you can run the pipeline using:

Expand Down
221 changes: 221 additions & 0 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,224 @@ process {
}

}

process {
    // Publishing rules for the assembly statistics, contamination screening
    // and synteny steps. Each selector publishes under its own sub-folder of
    // params.outdir and returns null from saveAs for versions.yml, so that
    // file is never published.

    withName: ASSEMBLATHON_STATS {
        publishDir = [
            path: { "${params.outdir}/assemblathon_stats" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }

    withName: NCBI_FCS_ADAPTOR {
        publishDir = [
            path: { "${params.outdir}/ncbi_fcs_adaptor" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }

    withName: RUN_BUSCO {
        publishDir = [
            path: { "${params.outdir}/busco" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }

    withName: RUN_KRAKEN2 {
        publishDir = [
            path: { "${params.outdir}/kraken2" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }

    withName: NCBI_FCS_GX_SCREEN_SAMPLES {
        publishDir = [
            path: { "${params.outdir}/ncbi_fcs_gx" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }

    withName: CIRCOS {
        // NOTE(review): target_on_ref_seq is not defined in this config;
        // presumably it is an input of the CIRCOS process resolved when the
        // closure runs -- confirm against the module.
        publishDir = [
            path: { "${params.outdir}/synteny/${target_on_ref_seq}" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }
}


process {
    // Command-line options for the GenomeTools-based annotation steps.

    // Applies only to GT_GFF3 when it runs inside the GFF3_VALIDATE scope.
    withName: '.*:GFF3_VALIDATE:GT_GFF3' {
        ext.args = '-tidy -retainids'
    }

    withName: GT_STAT {
        // Flags are kept one per line and joined with single spaces.
        ext.args = [
            '-genelengthdistri',
            '-genescoredistri',
            '-exonlengthdistri',
            '-exonnumberdistri',
            '-intronlengthdistri',
            '-cdslengthdistri',
            '-addintrons'
        ].join(' ')

        // Publish everything except versions.yml.
        publishDir = [
            path: { "${params.outdir}/genometools_gt_stat" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }
}

process {
    // Settings for the length filter/sort helpers and the TIDK steps.
    // All TIDK selectors publish into the same ${params.outdir}/tidk folder
    // and return null from saveAs for versions.yml.

    withName: FILTER_BY_LENGTH {
        // The -m option is passed only when params.tidk_filter_by_size is set.
        ext.args = params.tidk_filter_by_size ? "-m ${params.tidk_filter_size_bp}" : ''
        ext.prefix = { "${meta.id}.filtered" }
    }

    withName: SORT_BY_LENGTH {
        ext.args = '--quiet --reverse --by-length'
        ext.prefix = { "${meta.id}.sorted" }
    }

    withName: TIDK_EXPLORE {
        ext.args = '--minimum 5 --maximum 30'
        publishDir = [
            path: { "${params.outdir}/tidk" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }

    // The a-priori and a-posteriori variants differ only in their prefix.
    withName: TIDK_SEARCH_APRIORI {
        ext.prefix = { "${meta.id}.apriori" }
        ext.args = '--extension tsv'
        publishDir = [
            path: { "${params.outdir}/tidk" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }

    withName: TIDK_SEARCH_APOSTERIORI {
        ext.prefix = { "${meta.id}.aposteriori" }
        ext.args = '--extension tsv'
        publishDir = [
            path: { "${params.outdir}/tidk" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }

    withName: TIDK_PLOT_APRIORI {
        ext.prefix = { "${meta.id}.apriori" }
        publishDir = [
            path: { "${params.outdir}/tidk" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }

    withName: TIDK_PLOT_APOSTERIORI {
        ext.prefix = { "${meta.id}.aposteriori" }
        publishDir = [
            path: { "${params.outdir}/tidk" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }
}


process {
    // Settings for the LAI-related steps. Published files all land in
    // ${params.outdir}/lai; some selectors publish by glob pattern, others
    // publish everything except versions.yml.

    withName: CUSTOM_SHORTENFASTAIDS {
        // Only files matching the pattern below are published.
        publishDir = [
            path: { "${params.outdir}/lai" },
            mode: params.publish_dir_mode,
            pattern: '*.short.ids.tsv'
        ]
    }

    withName: EDTA_LTRHARVEST {
        ext.prefix = { "${meta.id}_edta_ltrharvest" }
    }

    withName: LTRFINDER {
        ext.args = '-harvest_out -size 1000000 -time 300'
    }

    withName: CAT_CAT {
        ext.prefix = { "${meta.id}_ltrharvest_ltrfinder.tabout" }
    }

    withName: LTRRETRIEVER {
        // Only files matching the pattern below are published.
        publishDir = [
            path: { "${params.outdir}/lai" },
            mode: params.publish_dir_mode,
            pattern: '*.LTRlib.fa'
        ]
    }

    withName: CUSTOM_RESTOREGFFIDS {
        publishDir = [
            path: { "${params.outdir}/lai" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }

    withName: LAI {
        publishDir = [
            path: { "${params.outdir}/lai" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }
}

process {
    // Settings for the Hi-C steps. The FastQC and fastp selectors publish
    // under ${params.outdir}/hic/<step> without versions.yml; the BWA_MEM and
    // SAMBLASTER selectors only set a prefix and arguments here.

    withName: FASTQC_RAW {
        publishDir = [
            path: { "${params.outdir}/hic/fastqc_raw" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }

    withName: FASTQC_TRIM {
        publishDir = [
            path: { "${params.outdir}/hic/fastqc_trim" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }

    withName: FASTP {
        // Extra fastp options are taken from params.hic_fastp_ext_args.
        ext.args = params.hic_fastp_ext_args
        publishDir = [
            path: { "${params.outdir}/hic/fastp" },
            mode: params.publish_dir_mode,
            saveAs: { fname -> fname == 'versions.yml' ? null : fname }
        ]
    }

    // Output prefixes encode both the sample id and the reference id.
    withName: BWA_MEM {
        ext.prefix = { "${meta.id}.on.${meta.ref_id}.bwa.mem" }
        ext.args = '-5SP'
    }

    withName: SAMBLASTER {
        ext.prefix = { "${meta.id}.on.${meta.ref_id}.samblaster" }
        // NOTE(review): set through ext.args3; which command in the module
        // consumes args3 is not visible here -- confirm against the module.
        ext.args3 = '-h -F 2316'
    }
}
35 changes: 35 additions & 0 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,41 @@
"name": "plant-food-research-open/assemblyqc",
"homePage": "https://github.com/plant-food-research-open/assemblyqc",
"repos": {
"[email protected]:PlantandFoodResearch/nxf-modules.git": {
"modules": {
"pfr": {
"custom/checkgff3fastacorrespondence": {
"branch": "main",
"git_sha": "1a76f884082c786760559c462063a5d1de94ca83",
"installed_by": ["gff3_validate"]
},
"gt/gff3": {
"branch": "main",
"git_sha": "bfa4874d3942bdff70cb8df17322834125cafb28",
"installed_by": ["gff3_validate"]
},
"gt/gff3validator": {
"branch": "main",
"git_sha": "889b9b57b611dcb063594608c2a222c928327cba",
"installed_by": ["gff3_validate"]
},
"gt/stat": {
"branch": "main",
"git_sha": "cb5fb0be78a98fd1e32b7c90d6adf8c3bf44133e",
"installed_by": ["modules"]
}
}
},
"subworkflows": {
"pfr": {
"gff3_validate": {
"branch": "main",
"git_sha": "f9b96bf8142a01f0649ff90570fb10aa973504b9",
"installed_by": ["subworkflows"]
}
}
}
},
"https://github.com/nf-core/modules.git": {
"modules": {
"nf-core": {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment for the custom_checkgff3fastacorrespondence module.
name: "custom_checkgff3fastacorrespondence"
channels:
- conda-forge
- bioconda
- defaults
# samtools is pinned to 1.18, the same version as the container tags used by
# the module's main.nf.
dependencies:
- "bioconda::samtools=1.18"
25 changes: 25 additions & 0 deletions modules/pfr/custom/checkgff3fastacorrespondence/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Runs the 'check_gff3_fasta_correspondence.sh' template on a GFF3 file and a
// FASTA file. Emits an optional '*.success.log', an optional '*.error.log'
// and a mandatory 'versions.yml'.
process CUSTOM_CHECKGFF3FASTACORRESPONDENCE {
tag "$meta.id"
label 'process_single'

// Software comes from the module-local conda environment or a samtools 1.18
// container: the Galaxy depot image when the engine is singularity and
// task.ext.singularity_pull_docker_container is not set, otherwise the
// biocontainers image.
conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/samtools:1.18--h50ea8bc_1':
'biocontainers/samtools:1.18--h50ea8bc_1' }"

input:
tuple val(meta), path(gff3)
path(fasta)

// Both log outputs are optional, so a task may produce either of them.
output:
tuple val(meta), path('*.success.log') , emit: success_log , optional: true
tuple val(meta), path('*.error.log') , emit: error_log , optional: true
path "versions.yml" , emit: versions

// The task runs unless task.ext.when is set to a falsy value.
when:
task.ext.when == null || task.ext.when

shell:
// The prefix falls back to the sample id when task.ext.prefix is not set.
// NOTE(review): 'prefix' is assigned without 'def', presumably so that the
// template can reference it -- confirm before changing.
prefix = task.ext.prefix ?: "${meta.id}"
template 'check_gff3_fasta_correspondence.sh'
}
56 changes: 56 additions & 0 deletions modules/pfr/custom/checkgff3fastacorrespondence/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
# Module metadata for custom_checkgff3fastacorrespondence.
name: "custom_checkgff3fastacorrespondence"
description: "A custom bash script which checks the correspondence of a gff3 file with a fasta file"
keywords:
- genome
- gff3
- annotation
- validation
# samtools is the only tool listed for this module.
tools:
- samtools:
description: |
SAMtools is a set of utilities for interacting with and post-processing
short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li.
These files are generated as output by short read aligners like BWA.
homepage: http://www.htslib.org/
documentation: http://www.htslib.org/doc/samtools.html
doi: 10.1093/bioinformatics/btp352
licence: ["MIT"]

# Inputs: a meta map, a GFF3 file and a FASTA file.
input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'test' ]`
- gff3:
type: file
description: Input gff3 file
pattern: "*.{gff,gff3}"
- fasta:
type: file
description: Input fasta file
pattern: "*.{fsa,fa,fasta}"
# Outputs: the meta map, a success log, an error log and versions.yml.
output:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'test' ]`
- success_log:
type: file
description: Log file for successful validation
pattern: "*.success.log"
- error_log:
type: file
description: Log file for failed validation
pattern: "*.error.log"
- versions:
type: file
description: File containing software versions
pattern: "versions.yml"
authors:
- "@GallVp"
maintainers:
- "@GallVp"
Loading

0 comments on commit 74d9ea7

Please sign in to comment.