diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3d8fc04a..32ca969a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,9 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Added`
+- [#504](https://github.com/nf-core/mag/pull/504) - New parameter `--busco_db`; `--busco_db`, `--kraken2_db`, and `--centrifuge_db` also accept a directory containing an already-uncompressed database, in addition to a compressed archive (by @gregorysprenger).
+
### `Changed`
- [#511](https://github.com/nf-core/mag/pull/511) - Update to nf-core 2.10 `TEMPLATE` (by @jfy133)
+- [#504](https://github.com/nf-core/mag/pull/504) - `--save_busco_reference` is now replaced by `--save_busco_db` (by @gregorysprenger).
### `Fixed`
@@ -21,6 +24,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Dependencies`
+### `Deprecated`
+
+- [#504](https://github.com/nf-core/mag/pull/504) - `--busco_reference`, `--busco_download_path`, `--save_busco_reference` parameters have been deprecated and replaced by `--busco_db` and `--save_busco_db` (by @gregorysprenger).
+
## 2.4.0 - 2023-09-26
### `Added`
diff --git a/conf/test.config b/conf/test.config
index 348b95d5..9c93278f 100644
--- a/conf/test.config
+++ b/conf/test.config
@@ -26,7 +26,7 @@ params {
skip_krona = true
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
- busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
+ busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
busco_clean = true
skip_gtdbtk = true
skip_concoct = true
diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config
index 92d51aec..d8bd581a 100644
--- a/conf/test_adapterremoval.config
+++ b/conf/test_adapterremoval.config
@@ -27,7 +27,7 @@ params {
skip_krona = true
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
- busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
+ busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
skip_gtdbtk = true
clip_tool = 'adapterremoval'
skip_concoct = true
diff --git a/conf/test_ancient_dna.config b/conf/test_ancient_dna.config
index 325362fc..ea672651 100644
--- a/conf/test_ancient_dna.config
+++ b/conf/test_ancient_dna.config
@@ -26,7 +26,7 @@ params {
skip_krona = true
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
- busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
+ busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
skip_gtdbtk = true
ancient_dna = true
binning_map_mode = 'own'
diff --git a/conf/test_bbnorm.config b/conf/test_bbnorm.config
index 5f481adf..55d48a8b 100644
--- a/conf/test_bbnorm.config
+++ b/conf/test_bbnorm.config
@@ -32,7 +32,7 @@ params {
skip_krona = true
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
- busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
+ busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
busco_clean = true
skip_gtdbtk = true
bbnorm = true
diff --git a/conf/test_binrefinement.config b/conf/test_binrefinement.config
index 85dda8db..bc1796d5 100644
--- a/conf/test_binrefinement.config
+++ b/conf/test_binrefinement.config
@@ -27,7 +27,7 @@ params {
skip_krona = true
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
- busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
+ busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
skip_gtdbtk = true
refine_bins_dastool = true
refine_bins_dastool_threshold = 0
diff --git a/conf/test_full.config b/conf/test_full.config
index 4917332e..ea694247 100644
--- a/conf/test_full.config
+++ b/conf/test_full.config
@@ -28,7 +28,7 @@ params {
spades_fix_cpus = 10
spadeshybrid_fix_cpus = 10
megahit_fix_cpu_1 = true
- // available options to enable reproducibility for BUSCO (--busco_download_path or --busco_reference) not used here
+ // available options to enable reproducibility for BUSCO (--busco_db) not used here
// to allow detection of possible problems in automated lineage selection mode using public databases
// test CAT with official taxonomic ranks only
diff --git a/conf/test_host_rm.config b/conf/test_host_rm.config
index b3487c6b..7af3bcd4 100644
--- a/conf/test_host_rm.config
+++ b/conf/test_host_rm.config
@@ -24,7 +24,7 @@ params {
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.host_rm.csv'
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
- busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
+ busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
skip_gtdbtk = true
skip_concoct = true
}
diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config
index bc22d3d2..0600c88c 100644
--- a/conf/test_hybrid.config
+++ b/conf/test_hybrid.config
@@ -23,7 +23,7 @@ params {
input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.hybrid.csv'
min_length_unbinned_contigs = 1
max_unbinned_contigs = 2
- busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
- skip_gtdbtk = true
+ busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz"
+ skip_gtdbtk = true
skip_concoct = true
}
diff --git a/docs/output.md b/docs/output.md
index 20cab1da..1061870f 100644
--- a/docs/output.md
+++ b/docs/output.md
@@ -484,7 +484,7 @@ For each bin or refined bin the median sequencing depth is computed based on the
#### BUSCO
-[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by the applied binning and/or binning refinement methods (depending on the `--postbinning_input` parameter). By default, BUSCO is run in automated lineage selection mode in which it first tries to select the domain and then a more specific lineage based on phylogenetic placement. If available, result files for both the selected domain lineage and the selected more specific lineage are placed in the output directory. If a lineage dataset is specified already with `--busco_reference`, only results for this specific lineage will be generated.
+[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by the applied binning and/or binning refinement methods (depending on the `--postbinning_input` parameter). By default, BUSCO is run in automated lineage selection mode in which it first tries to select the domain and then a more specific lineage based on phylogenetic placement. If available, result files for both the selected domain lineage and the selected more specific lineage are placed in the output directory. If a lineage dataset is specified already with `--busco_db`, only results for this specific lineage will be generated.
Output files
@@ -493,21 +493,21 @@ For each bin or refined bin the median sequencing depth is computed based on the
- `[assembler]-[bin]_busco.log`: Log file containing the standard output of BUSCO.
- `[assembler]-[bin]_busco.err`: File containing potential error messages returned from BUSCO.
- `short_summary.domain.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results for the selected domain when run in automated lineage selection mode. Not available for bins for which a viral lineage was selected.
- - `short_summary.specific_lineage.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results in case a more specific lineage than the domain could be selected or for the lineage provided via `--busco_reference`.
+ - `short_summary.specific_lineage.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results in case a more specific lineage than the domain could be selected or for the lineage provided via `--busco_db`.
- `[assembler]-[bin]_buscos.[lineage].fna.gz`: Nucleotide sequence of all identified BUSCOs for used lineages (domain or specific).
- `[assembler]-[bin]_buscos.[lineage].faa.gz`: Aminoacid sequence of all identified BUSCOs for used lineages (domain or specific).
- `[assembler]-[bin]_prodigal.gff`: Genes predicted with Prodigal.
-If the parameter `--save_busco_reference` is set, additionally the used BUSCO lineage datasets are stored in the output directory.
+If the parameter `--save_busco_db` is set, additionally the used BUSCO lineage datasets are stored in the output directory.
Output files
- `GenomeBinning/QC/BUSCO/`
- `busco_downloads/`: All files and lineage datasets downloaded by BUSCO when run in automated lineage selection mode. (Can currently not be used to reproduce analysis, see the [nf-core/mag website documentation](https://nf-co.re/mag/usage#reproducibility) how to achieve reproducible BUSCO results).
- - `reference/*.tar.gz`: BUSCO reference lineage dataset that was provided via `--busco_reference`.
+ - `reference/*.tar.gz`: BUSCO reference lineage dataset that was provided via `--busco_db`.
diff --git a/docs/usage.md b/docs/usage.md
index ed23936d..a644a49b 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -190,7 +190,7 @@ You can fix this by using the prameter `--megahit_fix_cpu_1`. In both cases, do
MetaBAT2 is run by default with a fixed seed within this pipeline, thus producing reproducible results.
-To allow also reproducible bin QC with BUSCO, run BUSCO providing already downloaded lineage datasets with `--busco_download_path` (BUSCO will be run using automated lineage selection in offline mode) or provide a specific lineage dataset via `--busco_reference` and use the parameter `--save_busco_reference`. This may be useful since BUSCO datasets are frequently updated and old versions do not always remain (easily) accessible.
+To also allow reproducible bin QC with BUSCO, either provide a local directory containing already downloaded and unpacked lineage datasets via `--busco_db` (BUSCO will be run using automated lineage selection in offline mode), or provide a specific lineage dataset via `--busco_db` and use the parameter `--save_busco_db`. This may be useful since BUSCO datasets are frequently updated and old versions do not always remain (easily) accessible.
For the taxonomic bin classification with [CAT](https://github.com/dutilh/CAT), when running the pipeline with `--cat_db_generate` the parameter `--save_cat_db` can be used to also save the generated database to allow reproducibility in future runs. Note that when specifying a pre-built database with `--cat_db`, currently the database can not be saved.
diff --git a/lib/WorkflowMag.groovy b/lib/WorkflowMag.groovy
index de66f06e..cd1a4456 100755
--- a/lib/WorkflowMag.groovy
+++ b/lib/WorkflowMag.groovy
@@ -102,22 +102,13 @@ class WorkflowMag {
Nextflow.error('Both --skip_binqc and --binqc_tool \'checkm\' are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool.')
}
if (params.skip_binqc) {
- if (params.busco_reference) {
- Nextflow.error('Both --skip_binqc and --busco_reference are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_reference.')
- }
- if (params.busco_download_path) {
- Nextflow.error('Both --skip_binqc and --busco_download_path are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_download_path.')
+ if (params.busco_db) {
+ Nextflow.error('Both --skip_binqc and --busco_db are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_db.')
}
if (params.busco_auto_lineage_prok) {
Nextflow.error('Both --skip_binqc and --busco_auto_lineage_prok are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_auto_lineage_prok.')
}
}
- if (params.busco_reference && params.busco_download_path) {
- Nextflow.error('Both --busco_reference and --busco_download_path are specified! Invalid combination, please specify either --busco_reference or --busco_download_path.')
- }
- if (params.busco_auto_lineage_prok && params.busco_reference) {
- Nextflow.error('Both --busco_auto_lineage_prok and --busco_reference are specified! Invalid combination, please specify either --busco_auto_lineage_prok or --busco_reference.')
- }
if (params.skip_binqc && !params.skip_gtdbtk) {
log.warn '--skip_binqc is specified, but --skip_gtdbtk is explictly set to run! GTDB-tk will be omitted because GTDB-tk bin classification requires bin filtering based on BUSCO or CheckM QC results to avoid GTDB-tk errors.'
diff --git a/modules/local/busco.nf b/modules/local/busco.nf
index 58e79efc..014046f0 100644
--- a/modules/local/busco.nf
+++ b/modules/local/busco.nf
@@ -8,8 +8,7 @@ process BUSCO {
input:
tuple val(meta), path(bin)
- path(db)
- path(download_folder)
+ tuple val(db_meta), path(db)
output:
tuple val(meta), path("short_summary.domain.*.${bin}.txt") , optional:true , emit: summary_domain
@@ -25,17 +24,16 @@ process BUSCO {
script:
def cp_augustus_config = workflow.profile.toString().indexOf("conda") != -1 ? "N" : "Y"
- def lineage_dataset_provided = params.busco_reference ? "Y" : "N"
+ def lineage_dataset_provided = "${db_meta.lineage}"
def busco_clean = params.busco_clean ? "Y" : "N"
- def p = "--auto-lineage"
- if (params.busco_reference){
+ def p = params.busco_auto_lineage_prok ? "--auto-lineage-prok" : "--auto-lineage"
+ if ( "${lineage_dataset_provided}" == "Y" ) {
p = "--lineage_dataset dataset/${db}"
+ } else if ( "${lineage_dataset_provided}" == "N" ) {
+ p += " --offline --download_path ${db}"
} else {
- if (params.busco_auto_lineage_prok)
- p = "--auto-lineage-prok"
- if (params.busco_download_path)
- p += " --offline --download_path ${download_folder}"
+ lineage_dataset_provided = ""
}
"""
run_busco.sh "${p}" "${cp_augustus_config}" "${db}" "${bin}" ${task.cpus} "${lineage_dataset_provided}" "${busco_clean}"
diff --git a/modules/local/busco_db_preparation.nf b/modules/local/busco_db_preparation.nf
index cddc130d..e3418cb6 100644
--- a/modules/local/busco_db_preparation.nf
+++ b/modules/local/busco_db_preparation.nf
@@ -10,9 +10,8 @@ process BUSCO_DB_PREPARATION {
path database
output:
- path "buscodb/*" , emit: db
- path database , emit: database
- path "versions.yml" , emit: versions
+ tuple val("${database.getSimpleName()}"), path("buscodb/*"), emit: db
+ path "versions.yml" , emit: versions
script:
"""
diff --git a/modules/local/busco_summary.nf b/modules/local/busco_summary.nf
index 799196d7..bafcc495 100644
--- a/modules/local/busco_summary.nf
+++ b/modules/local/busco_summary.nf
@@ -15,11 +15,12 @@ process BUSCO_SUMMARY {
path "versions.yml" , emit: versions
script:
- def auto = params.busco_reference ? "" : "-a"
+ def reference = params.busco_db.toString().contains('odb10') // heuristic: a specific lineage dataset is recognised by 'odb10' in --busco_db
+ def auto = reference ? "" : "-a" // no specific lineage dataset given -> pass -a to summary_busco.py
 def ss = summaries_specific.sort().size() > 0 ? "-ss ${summaries_specific}" : ""
 def sd = summaries_domain.sort().size() > 0 ? "-sd ${summaries_domain}" : ""
 def f = ""
- if (!params.busco_reference && failed_bins.sort().size() > 0)
+ if (!reference && failed_bins.sort().size() > 0) // use the boolean directly: "${reference}" == false compares a GString with a Boolean and is never true
 f = "-f ${failed_bins}"
"""
summary_busco.py $auto $ss $sd $f -o busco_summary.tsv
diff --git a/modules/local/centrifuge_db_preparation.nf b/modules/local/centrifuge_db_preparation.nf
index 5bd76c69..fe48512c 100644
--- a/modules/local/centrifuge_db_preparation.nf
+++ b/modules/local/centrifuge_db_preparation.nf
@@ -9,12 +9,15 @@ process CENTRIFUGE_DB_PREPARATION {
path db
output:
- tuple val("${db.toString().replace(".tar.gz", "")}"), path("*.cf"), emit: db
- path "versions.yml" , emit: versions
+ path "database/*.cf", emit: db
+ path "versions.yml" , emit: versions
script:
"""
- tar -xf "${db}"
+ mkdir db_tmp
+ tar -xf "${db}" -C db_tmp
+ mkdir database
+ mv `find db_tmp/ -name "*.cf"` database/
cat <<-END_VERSIONS > versions.yml
"${task.process}":
diff --git a/nextflow.config b/nextflow.config
index 6db8af4e..74b1878f 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -121,10 +121,9 @@ params {
// Bin QC
skip_binqc = false
binqc_tool = 'busco'
- busco_reference = null
- busco_download_path = null
+ busco_db = null
busco_auto_lineage_prok = false
- save_busco_reference = false
+ save_busco_db = false
busco_clean = false
checkm_download_url = "https://data.ace.uq.edu.au/public/CheckM_databases/checkm_data_2015_01_16.tar.gz"
checkm_db = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 5dbd2a26..837b4e20 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -481,12 +481,12 @@
"centrifuge_db": {
"type": "string",
"description": "Database for taxonomic binning with centrifuge.",
- "help_text": "E.g. ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz."
+ "help_text": "Local directory containing `*.cf` files, or a URL or local path to a downloaded compressed tar archive of a Centrifuge database. E.g. ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz."
},
"kraken2_db": {
"type": "string",
"description": "Database for taxonomic binning with kraken2.",
- "help_text": "The database file must be a compressed tar archive that contains at least the three files `hash.k2d`, `opts.k2d` and `taxo.k2d`. E.g. ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz."
+ "help_text": "Path to a local directory, archive file, or a URL to compressed tar archive that contains at least the three files `hash.k2d`, `opts.k2d` and `taxo.k2d`. E.g. ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz."
},
"krona_db": {
"type": "string",
@@ -757,23 +757,18 @@
"description": "Specify which tool for bin quality-control validation to use.",
"enum": ["busco", "checkm"]
},
- "busco_reference": {
+ "busco_db": {
"type": "string",
- "description": "Download path for BUSCO lineage dataset, instead of using automated lineage selection.",
- "help_text": "E.g. https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz. Available databases are listed here: https://busco-data.ezlab.org/v5/data/lineages/."
- },
- "busco_download_path": {
- "type": "string",
- "description": "Path to local folder containing already downloaded and unpacked lineage datasets.",
- "help_text": "If provided, BUSCO analysis will be run in offline mode. Data can be downloaded from https://busco-data.ezlab.org/v5/data/ (files still need to be unpacked manually). Run in combination with automated lineage selection."
+ "description": "Download URL for BUSCO lineage dataset, or path to a tar.gz archive, or local directory containing already downloaded and unpacked lineage datasets.",
+ "help_text": "E.g. https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz or '/path/to/buscodb' (files still need to be unpacked manually). Available databases are listed here: https://busco-data.ezlab.org/v5/data/lineages/."
},
"busco_auto_lineage_prok": {
"type": "boolean",
"description": "Run BUSCO with automated lineage selection, but ignoring eukaryotes (saves runtime)."
},
- "save_busco_reference": {
+ "save_busco_db": {
"type": "boolean",
- "description": "Save the used BUSCO lineage datasets provided via --busco_reference or downloaded when not using --busco_reference or --busco_download_path.",
+ "description": "Save the used BUSCO lineage datasets provided via `--busco_db`, or downloaded by BUSCO when `--busco_db` is not set.",
"help_text": "Useful to allow reproducibility, as BUSCO datasets are frequently updated and old versions do not always remain accessible."
},
"busco_clean": {
diff --git a/subworkflows/local/busco_qc.nf b/subworkflows/local/busco_qc.nf
index 6165be47..ce92efd5 100644
--- a/subworkflows/local/busco_qc.nf
+++ b/subworkflows/local/busco_qc.nf
@@ -9,23 +9,60 @@ include { BUSCO_SUMMARY } from '../../modules/local/busco_summ
 workflow BUSCO_QC {
 take:
- busco_db_file // channel: path
- busco_download_folder // channel: path
- bins // channel: [ val(meta), path(bin) ]
+ busco_db // path or []: BUSCO database given as tar.gz archive or directory; [] when --busco_db is unset
+ bins // channel: [ val(meta), path(bin) ]
 main:
- if (params.busco_reference){
- BUSCO_DB_PREPARATION ( busco_db_file )
- ch_busco_db = BUSCO_DB_PREPARATION.out.db
+ if ( !busco_db.isEmpty() ) {
+ if ( busco_db.extension in ['gz', 'tgz'] ) {
+ // Expects to be tar.gz!
+ ch_db_for_busco = BUSCO_DB_PREPARATION ( busco_db ).db
+ .map{
+ meta, db ->
+ def meta_new = [:]
+ meta_new['id'] = meta
+ meta_new['lineage'] = 'Y'
+ [ meta_new, db ]
+ }
+ } else if ( busco_db.isDirectory() ) {
+ // Set meta to match expected channel cardinality for BUSCO
+ ch_db_for_busco = Channel
+ .of(busco_db)
+ .map{
+ db ->
+ def meta = [:]
+ meta['id'] = db.getBaseName()
+ if ( meta['id'].contains('odb10') == true ) {
+ meta['lineage'] = 'Y'
+ } else {
+ meta['lineage'] = 'N'
+ }
+ [ meta, db ]
+ }
+ .collect()
+ } else {
+ // Fail early with a clear message instead of leaving ch_db_for_busco undefined
+ error("Unsupported object given to --busco_db, database must be supplied as either a directory or a .tar.gz file!")
+ }
 } else {
- ch_busco_db = Channel.empty()
+ // Set BUSCO database to empty to allow for --auto-lineage
+ ch_db_for_busco = Channel
+ .of([])
+ .map{
+ empty_db ->
+ def meta = [:]
+ meta['lineage'] = ''
+ [ meta, [] ]
+ }
+ .collect()
 }
+
 BUSCO (
 bins,
- ch_busco_db.collect().ifEmpty([]),
- busco_download_folder.collect().ifEmpty([])
+ ch_db_for_busco
 )
- if (params.save_busco_reference){
+
+ if (params.save_busco_db){
 // publish files downloaded by Busco
 ch_downloads = BUSCO.out.busco_downloads.groupTuple().map{lin,downloads -> downloads[0]}.toSortedList().flatten()
 BUSCO_SAVE_DOWNLOAD ( ch_downloads )
diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf
index 21823962..7586b142 100644
--- a/subworkflows/local/gtdbtk.nf
+++ b/subworkflows/local/gtdbtk.nf
@@ -24,7 +24,7 @@ workflow GTDBTK {
def completeness = -1
def contamination = -1
def missing, duplicated
- if (params.busco_reference) {
+ if (params.busco_db && file(params.busco_db).getBaseName().contains('odb10')) { // params.busco_db is a plain string (null by default): guard and convert to a path before calling getBaseName()
missing = row.'%Missing (specific)' // TODO or just take '%Complete'?
duplicated = row.'%Complete and duplicated (specific)'
} else {
diff --git a/workflows/mag.nf b/workflows/mag.nf
index 65394ae2..11449c8c 100644
--- a/workflows/mag.nf
+++ b/workflows/mag.nf
@@ -31,7 +31,7 @@ log.info logo + paramsSummaryLog(workflow) + citation
WorkflowMag.initialise(params, log, hybrid)
// Check input path parameters to see if they exist
-def checkPathParamList = [ params.input, params.multiqc_config, params.phix_reference, params.host_fasta, params.centrifuge_db, params.kraken2_db, params.cat_db, params.krona_db, params.gtdb_db, params.lambda_reference, params.busco_reference ]
+def checkPathParamList = [ params.input, params.multiqc_config, params.phix_reference, params.host_fasta, params.centrifuge_db, params.kraken2_db, params.cat_db, params.krona_db, params.gtdb_db, params.lambda_reference, params.busco_db ]
for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } }
/*
@@ -145,17 +145,10 @@ if ( params.host_genome ) {
ch_host_fasta = Channel.empty()
}
-if(params.busco_reference){
- ch_busco_db_file = Channel
- .value(file( "${params.busco_reference}" ))
+if (params.busco_db) {
+ ch_busco_db = file(params.busco_db, checkIfExists: true)
} else {
- ch_busco_db_file = Channel.empty()
-}
-if (params.busco_download_path) {
- ch_busco_download_folder = Channel
- .value(file( "${params.busco_download_path}" ))
-} else {
- ch_busco_download_folder = Channel.empty()
+ ch_busco_db = []
}
if(params.checkm_db) {
@@ -169,17 +162,15 @@ if (params.gunc_db) {
}
if(params.centrifuge_db){
- ch_centrifuge_db_file = Channel
- .value(file( "${params.centrifuge_db}" ))
+ ch_centrifuge_db_file = file(params.centrifuge_db, checkIfExists: true)
} else {
- ch_centrifuge_db_file = Channel.empty()
+ ch_centrifuge_db_file = []
}
if(params.kraken2_db){
- ch_kraken2_db_file = Channel
- .value(file( "${params.kraken2_db}" ))
+ ch_kraken2_db_file = file(params.kraken2_db, checkIfExists: true)
} else {
- ch_kraken2_db_file = Channel.empty()
+ ch_kraken2_db_file = []
}
if(params.cat_db){
@@ -463,20 +454,63 @@ workflow MAG {
Taxonomic information
================================================================================
*/
- CENTRIFUGE_DB_PREPARATION ( ch_centrifuge_db_file )
+ if ( !ch_centrifuge_db_file.isEmpty() ) {
+ if ( ch_centrifuge_db_file.extension in ['gz', 'tgz'] ) {
+ // Expects to be tar.gz!
+ ch_db_for_centrifuge = CENTRIFUGE_DB_PREPARATION ( ch_centrifuge_db_file ).db
+ } else if ( ch_centrifuge_db_file.isDirectory() ) {
+ ch_db_for_centrifuge = Channel
+ .fromPath( "${ch_centrifuge_db_file}/*.cf" )
+ } else {
+ ch_db_for_centrifuge = Channel.empty()
+ }
+ } else {
+ ch_db_for_centrifuge = Channel.empty()
+ }
+
+ // Centrifuge val(db_name) has to be the basename of any of the
+ // index files up to but not including the final .1.cf
+ ch_db_for_centrifuge = ch_db_for_centrifuge
+ .collect()
+ .map{
+ db ->
+ def db_name = db[0].getBaseName().split('\\.')[0]
+ [ db_name, db ]
+ }
CENTRIFUGE (
ch_short_reads,
- CENTRIFUGE_DB_PREPARATION.out.db
+ ch_db_for_centrifuge
)
ch_versions = ch_versions.mix(CENTRIFUGE.out.versions.first())
- KRAKEN2_DB_PREPARATION (
- ch_kraken2_db_file
- )
+ if ( !ch_kraken2_db_file.isEmpty() ) {
+ if ( ch_kraken2_db_file.extension in ['gz', 'tgz'] ) {
+ // Expects to be tar.gz!
+ ch_db_for_kraken2 = KRAKEN2_DB_PREPARATION ( ch_kraken2_db_file ).db
+ } else if ( ch_kraken2_db_file.isDirectory() ) {
+ ch_db_for_kraken2 = Channel
+ .fromPath( "${ch_kraken2_db_file}/*.k2d" )
+ .collect()
+ .map{
+ file ->
+ if (file.size() >= 3) {
+ def db_name = file[0].getParent().getName()
+ [ db_name, file ]
+ } else {
+ error("Kraken2 requires '{hash,opts,taxo}.k2d' files.")
+ }
+ }
+ } else {
+ ch_db_for_kraken2 = Channel.empty()
+ }
+ } else {
+ ch_db_for_kraken2 = Channel.empty()
+ }
+
KRAKEN2 (
ch_short_reads,
- KRAKEN2_DB_PREPARATION.out.db
+ ch_db_for_kraken2
)
ch_versions = ch_versions.mix(KRAKEN2.out.versions.first())
@@ -805,8 +839,7 @@ workflow MAG {
*/
BUSCO_QC (
- ch_busco_db_file,
- ch_busco_download_folder,
+ ch_busco_db,
ch_input_bins_for_qc
)
ch_busco_summary = BUSCO_QC.out.summary