diff --git a/CHANGELOG.md b/CHANGELOG.md index 3d8fc04a..32ca969a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,9 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` +- [#504](https://github.com/nf-core/mag/pull/504) - New parameter `--busco_db`; `--busco_db`, `--kraken2_db`, and `--centrifuge_db` now accept a directory containing an already uncompressed database as input (by @gregorysprenger). + ### `Changed` - [#511](https://github.com/nf-core/mag/pull/511) - Update to nf-core 2.10 `TEMPLATE` (by @jfy133) +- [#504](https://github.com/nf-core/mag/pull/504) - `--save_busco_reference` is now replaced by `--save_busco_db` (by @gregorysprenger). ### `Fixed` @@ -21,6 +24,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Dependencies` +### `Deprecated` + +- [#504](https://github.com/nf-core/mag/pull/504) - `--busco_reference`, `--busco_download_path`, and `--save_busco_reference` parameters have been deprecated and replaced by `--busco_db` and `--save_busco_db` (by @gregorysprenger).
+ ## 2.4.0 - 2023-09-26 ### `Added` diff --git a/conf/test.config b/conf/test.config index 348b95d5..9c93278f 100644 --- a/conf/test.config +++ b/conf/test.config @@ -26,7 +26,7 @@ params { skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" busco_clean = true skip_gtdbtk = true skip_concoct = true diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config index 92d51aec..d8bd581a 100644 --- a/conf/test_adapterremoval.config +++ b/conf/test_adapterremoval.config @@ -27,7 +27,7 @@ params { skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" skip_gtdbtk = true clip_tool = 'adapterremoval' skip_concoct = true diff --git a/conf/test_ancient_dna.config b/conf/test_ancient_dna.config index 325362fc..ea672651 100644 --- a/conf/test_ancient_dna.config +++ b/conf/test_ancient_dna.config @@ -26,7 +26,7 @@ params { skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" skip_gtdbtk = true ancient_dna = true binning_map_mode = 'own' diff --git a/conf/test_bbnorm.config b/conf/test_bbnorm.config index 5f481adf..55d48a8b 100644 --- a/conf/test_bbnorm.config +++ b/conf/test_bbnorm.config @@ -32,7 +32,7 @@ params { skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" + busco_db = 
"https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" busco_clean = true skip_gtdbtk = true bbnorm = true diff --git a/conf/test_binrefinement.config b/conf/test_binrefinement.config index 85dda8db..bc1796d5 100644 --- a/conf/test_binrefinement.config +++ b/conf/test_binrefinement.config @@ -27,7 +27,7 @@ params { skip_krona = true min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" skip_gtdbtk = true refine_bins_dastool = true refine_bins_dastool_threshold = 0 diff --git a/conf/test_full.config b/conf/test_full.config index 4917332e..ea694247 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -28,7 +28,7 @@ params { spades_fix_cpus = 10 spadeshybrid_fix_cpus = 10 megahit_fix_cpu_1 = true - // available options to enable reproducibility for BUSCO (--busco_download_path or --busco_reference) not used here + // available options to enable reproducibility for BUSCO (--busco_db) not used here // to allow detection of possible problems in automated lineage selection mode using public databases // test CAT with official taxonomic ranks only diff --git a/conf/test_host_rm.config b/conf/test_host_rm.config index b3487c6b..7af3bcd4 100644 --- a/conf/test_host_rm.config +++ b/conf/test_host_rm.config @@ -24,7 +24,7 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.host_rm.csv' min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" skip_gtdbtk = true skip_concoct = true } diff --git a/conf/test_hybrid.config b/conf/test_hybrid.config index bc22d3d2..0600c88c 100644 --- 
a/conf/test_hybrid.config +++ b/conf/test_hybrid.config @@ -23,7 +23,7 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mag/samplesheets/samplesheet.hybrid.csv' min_length_unbinned_contigs = 1 max_unbinned_contigs = 2 - busco_reference = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" - skip_gtdbtk = true + busco_db = "https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz" + skip_gtdbtk = true skip_concoct = true } diff --git a/docs/output.md b/docs/output.md index 20cab1da..1061870f 100644 --- a/docs/output.md +++ b/docs/output.md @@ -484,7 +484,7 @@ For each bin or refined bin the median sequencing depth is computed based on the #### BUSCO -[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by the applied binning and/or binning refinement methods (depending on the `--postbinning_input` parameter). By default, BUSCO is run in automated lineage selection mode in which it first tries to select the domain and then a more specific lineage based on phylogenetic placement. If available, result files for both the selected domain lineage and the selected more specific lineage are placed in the output directory. If a lineage dataset is specified already with `--busco_reference`, only results for this specific lineage will be generated. +[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by the applied binning and/or binning refinement methods (depending on the `--postbinning_input` parameter). By default, BUSCO is run in automated lineage selection mode in which it first tries to select the domain and then a more specific lineage based on phylogenetic placement. 
If available, result files for both the selected domain lineage and the selected more specific lineage are placed in the output directory. If a lineage dataset is specified already with `--busco_db`, only results for this specific lineage will be generated.
Output files @@ -493,21 +493,21 @@ For each bin or refined bin the median sequencing depth is computed based on the - `[assembler]-[bin]_busco.log`: Log file containing the standard output of BUSCO. - `[assembler]-[bin]_busco.err`: File containing potential error messages returned from BUSCO. - `short_summary.domain.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results for the selected domain when run in automated lineage selection mode. Not available for bins for which a viral lineage was selected. - - `short_summary.specific_lineage.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results in case a more specific lineage than the domain could be selected or for the lineage provided via `--busco_reference`. + - `short_summary.specific_lineage.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results in case a more specific lineage than the domain could be selected or for the lineage provided via `--busco_db`. - `[assembler]-[bin]_buscos.[lineage].fna.gz`: Nucleotide sequence of all identified BUSCOs for used lineages (domain or specific). - `[assembler]-[bin]_buscos.[lineage].faa.gz`: Aminoacid sequence of all identified BUSCOs for used lineages (domain or specific). - `[assembler]-[bin]_prodigal.gff`: Genes predicted with Prodigal.
-If the parameter `--save_busco_reference` is set, additionally the used BUSCO lineage datasets are stored in the output directory. +If the parameter `--save_busco_db` is set, additionally the used BUSCO lineage datasets are stored in the output directory.
Output files - `GenomeBinning/QC/BUSCO/` - `busco_downloads/`: All files and lineage datasets downloaded by BUSCO when run in automated lineage selection mode. (Can currently not be used to reproduce analysis, see the [nf-core/mag website documentation](https://nf-co.re/mag/usage#reproducibility) how to achieve reproducible BUSCO results). - - `reference/*.tar.gz`: BUSCO reference lineage dataset that was provided via `--busco_reference`. + - `reference/*.tar.gz`: BUSCO reference lineage dataset that was provided via `--busco_db`.
diff --git a/docs/usage.md b/docs/usage.md index ed23936d..a644a49b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -190,7 +190,7 @@ You can fix this by using the prameter `--megahit_fix_cpu_1`. In both cases, do MetaBAT2 is run by default with a fixed seed within this pipeline, thus producing reproducible results. -To allow also reproducible bin QC with BUSCO, run BUSCO providing already downloaded lineage datasets with `--busco_download_path` (BUSCO will be run using automated lineage selection in offline mode) or provide a specific lineage dataset via `--busco_reference` and use the parameter `--save_busco_reference`. This may be useful since BUSCO datasets are frequently updated and old versions do not always remain (easily) accessible. +To also allow reproducible bin QC with BUSCO, either provide already downloaded and unpacked lineage datasets as a local directory via `--busco_db` (BUSCO will be run using automated lineage selection in offline mode) or provide a specific lineage dataset via `--busco_db` and use the parameter `--save_busco_db`. This may be useful since BUSCO datasets are frequently updated and old versions do not always remain (easily) accessible. For the taxonomic bin classification with [CAT](https://github.com/dutilh/CAT), when running the pipeline with `--cat_db_generate` the parameter `--save_cat_db` can be used to also save the generated database to allow reproducibility in future runs. Note that when specifying a pre-built database with `--cat_db`, currently the database can not be saved. diff --git a/lib/WorkflowMag.groovy b/lib/WorkflowMag.groovy index de66f06e..cd1a4456 100755 --- a/lib/WorkflowMag.groovy +++ b/lib/WorkflowMag.groovy @@ -102,22 +102,13 @@ class WorkflowMag { Nextflow.error('Both --skip_binqc and --binqc_tool \'checkm\' are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool.') } if (params.skip_binqc) { - if (params.busco_reference) { - Nextflow.error('Both --skip_binqc and --busco_reference are specified!
Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_reference.') - } - if (params.busco_download_path) { - Nextflow.error('Both --skip_binqc and --busco_download_path are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_download_path.') + if (params.busco_db) { + Nextflow.error('Both --skip_binqc and --busco_db are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_db.') } if (params.busco_auto_lineage_prok) { Nextflow.error('Both --skip_binqc and --busco_auto_lineage_prok are specified! Invalid combination, please specify either --skip_binqc or --binqc_tool \'busco\' with --busco_auto_lineage_prok.') } } - if (params.busco_reference && params.busco_download_path) { - Nextflow.error('Both --busco_reference and --busco_download_path are specified! Invalid combination, please specify either --busco_reference or --busco_download_path.') - } - if (params.busco_auto_lineage_prok && params.busco_reference) { - Nextflow.error('Both --busco_auto_lineage_prok and --busco_reference are specified! Invalid combination, please specify either --busco_auto_lineage_prok or --busco_reference.') - } if (params.skip_binqc && !params.skip_gtdbtk) { log.warn '--skip_binqc is specified, but --skip_gtdbtk is explictly set to run! GTDB-tk will be omitted because GTDB-tk bin classification requires bin filtering based on BUSCO or CheckM QC results to avoid GTDB-tk errors.' 
diff --git a/modules/local/busco.nf b/modules/local/busco.nf index 58e79efc..014046f0 100644 --- a/modules/local/busco.nf +++ b/modules/local/busco.nf @@ -8,8 +8,7 @@ process BUSCO { input: tuple val(meta), path(bin) - path(db) - path(download_folder) + tuple val(db_meta), path(db) output: tuple val(meta), path("short_summary.domain.*.${bin}.txt") , optional:true , emit: summary_domain @@ -25,17 +24,16 @@ process BUSCO { script: def cp_augustus_config = workflow.profile.toString().indexOf("conda") != -1 ? "N" : "Y" - def lineage_dataset_provided = params.busco_reference ? "Y" : "N" + def lineage_dataset_provided = "${db_meta.lineage}" def busco_clean = params.busco_clean ? "Y" : "N" - def p = "--auto-lineage" - if (params.busco_reference){ + def p = params.busco_auto_lineage_prok ? "--auto-lineage-prok" : "--auto-lineage" + if ( "${lineage_dataset_provided}" == "Y" ) { p = "--lineage_dataset dataset/${db}" + } else if ( "${lineage_dataset_provided}" == "N" ) { + p += " --offline --download_path ${db}" } else { - if (params.busco_auto_lineage_prok) - p = "--auto-lineage-prok" - if (params.busco_download_path) - p += " --offline --download_path ${download_folder}" + lineage_dataset_provided = "" } """ run_busco.sh "${p}" "${cp_augustus_config}" "${db}" "${bin}" ${task.cpus} "${lineage_dataset_provided}" "${busco_clean}" diff --git a/modules/local/busco_db_preparation.nf b/modules/local/busco_db_preparation.nf index cddc130d..e3418cb6 100644 --- a/modules/local/busco_db_preparation.nf +++ b/modules/local/busco_db_preparation.nf @@ -10,9 +10,8 @@ process BUSCO_DB_PREPARATION { path database output: - path "buscodb/*" , emit: db - path database , emit: database - path "versions.yml" , emit: versions + tuple val("${database.getSimpleName()}"), path("buscodb/*"), emit: db + path "versions.yml" , emit: versions script: """ diff --git a/modules/local/busco_summary.nf b/modules/local/busco_summary.nf index 799196d7..bafcc495 100644 --- a/modules/local/busco_summary.nf +++ 
b/modules/local/busco_summary.nf @@ -15,11 +15,12 @@ process BUSCO_SUMMARY { path "versions.yml" , emit: versions script: - def auto = params.busco_reference ? "" : "-a" + def reference = params.busco_db.toString().contains('odb10') + def auto = reference ? "" : "-a" def ss = summaries_specific.sort().size() > 0 ? "-ss ${summaries_specific}" : "" def sd = summaries_domain.sort().size() > 0 ? "-sd ${summaries_domain}" : "" def f = "" - if (!params.busco_reference && failed_bins.sort().size() > 0) + if (!reference && failed_bins.sort().size() > 0) f = "-f ${failed_bins}" """ summary_busco.py $auto $ss $sd $f -o busco_summary.tsv diff --git a/modules/local/centrifuge_db_preparation.nf b/modules/local/centrifuge_db_preparation.nf index 5bd76c69..fe48512c 100644 --- a/modules/local/centrifuge_db_preparation.nf +++ b/modules/local/centrifuge_db_preparation.nf @@ -9,12 +9,15 @@ process CENTRIFUGE_DB_PREPARATION { path db output: - tuple val("${db.toString().replace(".tar.gz", "")}"), path("*.cf"), emit: db - path "versions.yml" , emit: versions + path "database/*.cf", emit: db + path "versions.yml" , emit: versions script: """ - tar -xf "${db}" + mkdir db_tmp + tar -xf "${db}" -C db_tmp + mkdir database + mv `find db_tmp/ -name "*.cf"` database/ cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/nextflow.config b/nextflow.config index 6db8af4e..74b1878f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -121,10 +121,9 @@ params { // Bin QC skip_binqc = false binqc_tool = 'busco' - busco_reference = null - busco_download_path = null + busco_db = null busco_auto_lineage_prok = false - save_busco_reference = false + save_busco_db = false busco_clean = false checkm_download_url = "https://data.ace.uq.edu.au/public/CheckM_databases/checkm_data_2015_01_16.tar.gz" checkm_db = null diff --git a/nextflow_schema.json b/nextflow_schema.json index 5dbd2a26..837b4e20 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -481,12 +481,12 @@
"centrifuge_db": { "type": "string", "description": "Database for taxonomic binning with centrifuge.", - "help_text": "E.g. ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz." + "help_text": "Local directory containing `*.cf` files, or a URL or local path to a downloaded compressed tar archive of a Centrifuge database. E.g. ftp://ftp.ccb.jhu.edu/pub/infphilo/centrifuge/data/p_compressed+h+v.tar.gz." }, "kraken2_db": { "type": "string", "description": "Database for taxonomic binning with kraken2.", - "help_text": "The database file must be a compressed tar archive that contains at least the three files `hash.k2d`, `opts.k2d` and `taxo.k2d`. E.g. ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz." + "help_text": "Path to a local directory, archive file, or a URL to compressed tar archive that contains at least the three files `hash.k2d`, `opts.k2d` and `taxo.k2d`. E.g. ftp://ftp.ccb.jhu.edu/pub/data/kraken2_dbs/minikraken_8GB_202003.tgz." }, "krona_db": { "type": "string", @@ -757,23 +757,18 @@ "description": "Specify which tool for bin quality-control validation to use.", "enum": ["busco", "checkm"] }, - "busco_reference": { + "busco_db": { "type": "string", - "description": "Download path for BUSCO lineage dataset, instead of using automated lineage selection.", - "help_text": "E.g. https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz. Available databases are listed here: https://busco-data.ezlab.org/v5/data/lineages/." - }, - "busco_download_path": { - "type": "string", - "description": "Path to local folder containing already downloaded and unpacked lineage datasets.", - "help_text": "If provided, BUSCO analysis will be run in offline mode. Data can be downloaded from https://busco-data.ezlab.org/v5/data/ (files still need to be unpacked manually). Run in combination with automated lineage selection." 
+ "description": "Download URL for BUSCO lineage dataset, or path to a tar.gz archive, or local directory containing already downloaded and unpacked lineage datasets.", + "help_text": "E.g. https://busco-data.ezlab.org/v5/data/lineages/bacteria_odb10.2020-03-06.tar.gz or '/path/to/buscodb' (files still need to be unpacked manually). Available databases are listed here: https://busco-data.ezlab.org/v5/data/lineages/." }, "busco_auto_lineage_prok": { "type": "boolean", "description": "Run BUSCO with automated lineage selection, but ignoring eukaryotes (saves runtime)." }, - "save_busco_reference": { + "save_busco_db": { "type": "boolean", - "description": "Save the used BUSCO lineage datasets provided via --busco_reference or downloaded when not using --busco_reference or --busco_download_path.", + "description": "Save the used BUSCO lineage datasets provided via `--busco_db`.", "help_text": "Useful to allow reproducibility, as BUSCO datasets are frequently updated and old versions do not always remain accessible." }, "busco_clean": { diff --git a/subworkflows/local/busco_qc.nf b/subworkflows/local/busco_qc.nf index 6165be47..ce92efd5 100644 --- a/subworkflows/local/busco_qc.nf +++ b/subworkflows/local/busco_qc.nf @@ -9,23 +9,57 @@ include { BUSCO_SUMMARY } from '../../modules/local/busco_summ workflow BUSCO_QC { take: - busco_db_file // channel: path - busco_download_folder // channel: path - bins // channel: [ val(meta), path(bin) ] + busco_db // channel: path + bins // channel: [ val(meta), path(bin) ] main: - if (params.busco_reference){ - BUSCO_DB_PREPARATION ( busco_db_file ) - ch_busco_db = BUSCO_DB_PREPARATION.out.db + if ( !busco_db.isEmpty() ) { + if ( busco_db.extension in ['gz', 'tgz'] ) { + // Expects to be tar.gz! 
+ ch_db_for_busco = BUSCO_DB_PREPARATION ( busco_db ).db + .map{ + meta, db -> + def meta_new = [:] + meta_new['id'] = meta + meta_new['lineage'] = 'Y' + [ meta_new, db ] + } + } else if ( busco_db.isDirectory() ) { + // Set meta to match expected channel cardinality for BUSCO + ch_db_for_busco = Channel + .of(busco_db) + .map{ + db -> + def meta = [:] + meta['id'] = db.getBaseName() + if ( meta['id'].contains('odb10') == true ) { + meta['lineage'] = 'Y' + } else { + meta['lineage'] = 'N' + } + [ meta, db ] + } + .collect() + } + } else { + // Set BUSCO database to empty to allow for --auto-lineage + ch_db_for_busco = Channel + .of([]) + .map{ + empty_db -> + def meta = [:] + meta['lineage'] = '' + [ meta, [] ] + } + .collect() } + BUSCO ( bins, - ch_busco_db.collect().ifEmpty([]), - busco_download_folder.collect().ifEmpty([]) + ch_db_for_busco ) - if (params.save_busco_reference){ + + if (params.save_busco_db){ // publish files downloaded by Busco ch_downloads = BUSCO.out.busco_downloads.groupTuple().map{lin,downloads -> downloads[0]}.toSortedList().flatten() BUSCO_SAVE_DOWNLOAD ( ch_downloads ) diff --git a/subworkflows/local/gtdbtk.nf b/subworkflows/local/gtdbtk.nf index 21823962..7586b142 100644 --- a/subworkflows/local/gtdbtk.nf +++ b/subworkflows/local/gtdbtk.nf @@ -24,7 +24,7 @@ workflow GTDBTK { def completeness = -1 def contamination = -1 def missing, duplicated - if (params.busco_reference) { + if (params.busco_db && file(params.busco_db).getBaseName().contains('odb10')) { missing = row.'%Missing (specific)' // TODO or just take '%Complete'?
duplicated = row.'%Complete and duplicated (specific)' } else { diff --git a/workflows/mag.nf b/workflows/mag.nf index 65394ae2..11449c8c 100644 --- a/workflows/mag.nf +++ b/workflows/mag.nf @@ -31,7 +31,7 @@ log.info logo + paramsSummaryLog(workflow) + citation WorkflowMag.initialise(params, log, hybrid) // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.phix_reference, params.host_fasta, params.centrifuge_db, params.kraken2_db, params.cat_db, params.krona_db, params.gtdb_db, params.lambda_reference, params.busco_reference ] +def checkPathParamList = [ params.input, params.multiqc_config, params.phix_reference, params.host_fasta, params.centrifuge_db, params.kraken2_db, params.cat_db, params.krona_db, params.gtdb_db, params.lambda_reference, params.busco_db ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } /* @@ -145,17 +145,10 @@ if ( params.host_genome ) { ch_host_fasta = Channel.empty() } -if(params.busco_reference){ - ch_busco_db_file = Channel - .value(file( "${params.busco_reference}" )) +if (params.busco_db) { + ch_busco_db = file(params.busco_db, checkIfExists: true) } else { - ch_busco_db_file = Channel.empty() -} -if (params.busco_download_path) { - ch_busco_download_folder = Channel - .value(file( "${params.busco_download_path}" )) -} else { - ch_busco_download_folder = Channel.empty() + ch_busco_db = [] } if(params.checkm_db) { @@ -169,17 +162,15 @@ if (params.gunc_db) { } if(params.centrifuge_db){ - ch_centrifuge_db_file = Channel - .value(file( "${params.centrifuge_db}" )) + ch_centrifuge_db_file = file(params.centrifuge_db, checkIfExists: true) } else { - ch_centrifuge_db_file = Channel.empty() + ch_centrifuge_db_file = [] } if(params.kraken2_db){ - ch_kraken2_db_file = Channel - .value(file( "${params.kraken2_db}" )) + ch_kraken2_db_file = file(params.kraken2_db, checkIfExists: true) } else { - ch_kraken2_db_file = Channel.empty() + 
ch_kraken2_db_file = [] } if(params.cat_db){ @@ -463,20 +454,63 @@ workflow MAG { Taxonomic information ================================================================================ */ - CENTRIFUGE_DB_PREPARATION ( ch_centrifuge_db_file ) + if ( !ch_centrifuge_db_file.isEmpty() ) { + if ( ch_centrifuge_db_file.extension in ['gz', 'tgz'] ) { + // Expects to be tar.gz! + ch_db_for_centrifuge = CENTRIFUGE_DB_PREPARATION ( ch_centrifuge_db_file ).db + } else if ( ch_centrifuge_db_file.isDirectory() ) { + ch_db_for_centrifuge = Channel + .fromPath( "${ch_centrifuge_db_file}/*.cf" ) + } else { + ch_db_for_centrifuge = Channel.empty() + } + } else { + ch_db_for_centrifuge = Channel.empty() + } + + // Centrifuge val(db_name) has to be the basename of any of the + // index files up to but not including the final .1.cf + ch_db_for_centrifuge = ch_db_for_centrifuge + .collect() + .map{ + db -> + def db_name = db[0].getBaseName().split('\\.')[0] + [ db_name, db ] + } CENTRIFUGE ( ch_short_reads, - CENTRIFUGE_DB_PREPARATION.out.db + ch_db_for_centrifuge ) ch_versions = ch_versions.mix(CENTRIFUGE.out.versions.first()) - KRAKEN2_DB_PREPARATION ( - ch_kraken2_db_file - ) + if ( !ch_kraken2_db_file.isEmpty() ) { + if ( ch_kraken2_db_file.extension in ['gz', 'tgz'] ) { + // Expects to be tar.gz! 
+ ch_db_for_kraken2 = KRAKEN2_DB_PREPARATION ( ch_kraken2_db_file ).db + } else if ( ch_kraken2_db_file.isDirectory() ) { + ch_db_for_kraken2 = Channel + .fromPath( "${ch_kraken2_db_file}/*.k2d" ) + .collect() + .map{ + file -> + if (file.size() >= 3) { + def db_name = file[0].getParent().getName() + [ db_name, file ] + } else { + error("Kraken2 requires '{hash,opts,taxo}.k2d' files.") + } + } + } else { + ch_db_for_kraken2 = Channel.empty() + } + } else { + ch_db_for_kraken2 = Channel.empty() + } + KRAKEN2 ( ch_short_reads, - KRAKEN2_DB_PREPARATION.out.db + ch_db_for_kraken2 ) ch_versions = ch_versions.mix(KRAKEN2.out.versions.first()) @@ -805,8 +839,7 @@ workflow MAG { */ BUSCO_QC ( - ch_busco_db_file, - ch_busco_download_folder, + ch_busco_db, ch_input_bins_for_qc ) ch_busco_summary = BUSCO_QC.out.summary