Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use GVCFs for genotyping - run time/CPU hours substantially reduced (0.52) #158

Merged
merged 16 commits into from
Jun 7, 2024
Merged
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
---

## [Unreleased]

### Added
- Add workflow for genotyping from GVCFs
---

## [10.0.0] - 2024-03-08
Expand Down
54 changes: 33 additions & 21 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Current Configuration:
bundle_omni_1000g_2p5_vcf_gz: ${params.bundle_omni_1000g_2p5_vcf_gz}
bundle_phase1_1000g_snps_high_conf_vcf_gz: ${params.bundle_phase1_1000g_snps_high_conf_vcf_gz}

- output:
- output:
output: ${params.output_dir}
output_dir_base: ${params.output_dir_base}
log_output_dir: ${params.log_output_dir}
Expand Down Expand Up @@ -61,6 +61,9 @@ include {
run_HaplotypeCallerVCF_GATK
run_HaplotypeCallerGVCF_GATK
} from './module/haplotypecaller.nf'
include { run_GenomicsDBImport_GATK } from './module/genomicsdb-import.nf'
include { run_CombineGVCFs_GATK } from './module/combine-gvcfs.nf'
include { run_GenotypeGVCFs_GATK } from './module/genotype-gvcfs.nf'
include {
run_MergeVcfs_Picard as run_MergeVcfs_Picard_VCF
run_MergeVcfs_Picard as run_MergeVcfs_Picard_GVCF
Expand Down Expand Up @@ -147,25 +150,6 @@ workflow {
/**
* Haplotype calling
*/
input_ch_collected_files.combine(input_ch_intervals)
.map{ it ->
[
it[0].bams,
it[0].indices,
it[1].interval_path,
it[1].interval_id
]
}
.set{ input_ch_haplotypecallervcf }

run_HaplotypeCallerVCF_GATK(
params.reference_fasta,
"${params.reference_fasta}.fai",
"${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
params.bundle_v0_dbsnp138_vcf_gz,
"${params.bundle_v0_dbsnp138_vcf_gz}.tbi",
input_ch_haplotypecallervcf
)

input_ch_samples_with_index.combine(input_ch_intervals)
.map{ it ->
Expand All @@ -188,10 +172,38 @@ workflow {
input_ch_haplotypecallergvcf
)

run_HaplotypeCallerGVCF_GATK.out.gvcfs
.groupTuple(by: 4) // Group by interval_path
.map{ it ->
[
it[1].flatten(), // GVCFs
it[2].flatten(), // Indices
it[3][0], // Interval path
it[4] // Interval ID
]
}
.set { input_ch_combine_gvcfs }

run_CombineGVCFs_GATK(
params.reference_fasta,
"${params.reference_fasta}.fai",
"${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
input_ch_combine_gvcfs
)

run_GenotypeGVCFs_GATK(
params.reference_fasta,
"${params.reference_fasta}.fai",
"${file(params.reference_fasta).parent}/${file(params.reference_fasta).baseName}.dict",
params.bundle_v0_dbsnp138_vcf_gz,
"${params.bundle_v0_dbsnp138_vcf_gz}.tbi",
run_CombineGVCFs_GATK.out.combined_gvcf
)

/**
* Merge VCFs
*/
run_HaplotypeCallerVCF_GATK.out.vcfs
run_GenotypeGVCFs_GATK.out.vcfs
.reduce( ['vcfs': [], 'indices': []] ){ a, b ->
a.vcfs.add(b[0]);
a.indices.add(b[1]);
Expand Down
52 changes: 52 additions & 0 deletions module/combine-gvcfs.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'

/*
Nextflow module for merging GVCFs for joint genotyping with GATK
*/
process run_CombineGVCFs_GATK {
tyamaguchi-ucla marked this conversation as resolved.
Show resolved Hide resolved
container params.docker_image_gatk
publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
mode: "copy",
enabled: params.save_intermediate_files,
pattern: '*g.gvcf.gz*'
publishDir path: "${params.log_output_dir}/process-log",
pattern: ".command.*",
mode: "copy",
saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" }

input:
path(reference_fasta)
path(reference_fasta_fai)
path(reference_fasta_dict)
tuple path(gvcfs), path(gvcf_indices), path(interval_path), val(interval_id)

output:
path(".command.*")
tuple path(output_filename), path("${output_filename}.tbi"), path(interval_path), val(interval_id), emit: combined_gvcf

script:
output_filename = generate_standard_filename(
"GATK-${params.gatk_version}",
params.dataset_id,
params.patient_id,
[
'additional_information': "${interval_id}.g.vcf.gz"
]
)
gvcf_input_str = gvcfs.collect{ "--variant '${it}'" }.join(' ')
interval_str = "--intervals ${interval_path}"
interval_padding = params.is_targeted ? "--interval-padding 100" : ""
"""
set -euo pipefail

gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \
CombineGVCFs \
--reference ${reference_fasta} \
${gvcf_input_str} \
--output ${output_filename} \
--create-output-variant-index true \
--verbosity INFO \
${interval_str} \
${interval_padding}
"""
}
48 changes: 48 additions & 0 deletions module/genomicsdb-import.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'

/*
Nextflow module for importing GVCFs into GenomicsDB for joint genotyping with GATK
*/
process run_GenomicsDBImport_GATK {
container params.docker_image_gatk
publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
mode: "copy",
enabled: params.save_intermediate_files,
pattern: '*genomicsdb'

publishDir path: "${params.log_output_dir}/process-log",
pattern: ".command.*",
mode: "copy",
saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" }

input:
tuple path(gvcfs), path(gvcf_indices), path(interval_path), val(interval_id)

output:
path(".command.*")
tuple path(output_filename), path(interval_path), val(interval_id), emit: genomicsdb

script:
output_filename = generate_standard_filename(
"GATK-${params.gatk_version}",
params.dataset_id,
params.patient_id,
[
'additional_information': "${interval_id}.genomicsdb"
]
)
gvcf_input_str = gvcfs.collect{ "--variant '${it}'" }.join(' ')
interval_str = "--intervals ${interval_path}"
interval_padding = params.is_targeted ? "--interval-padding 100" : ""
"""
set -euo pipefail

gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \
GenomicsDBImport \
${gvcf_input_str} \
--genomicsdb-workspace-path ${output_filename} \
--verbosity INFO \
${interval_str} \
${interval_padding}
"""
}
55 changes: 55 additions & 0 deletions module/genotype-gvcfs.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'

/*
Nextflow module for joint genotyping merged GVCFs with GATK
*/
process run_GenotypeGVCFs_GATK {
container params.docker_image_gatk
publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
mode: "copy",
enabled: params.save_intermediate_files,
pattern: '*.vcf*'

publishDir path: "${params.log_output_dir}/process-log",
pattern: ".command.*",
mode: "copy",
saveAs: { "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${interval_id}/log${file(it).getName()}" }

input:
path(reference_fasta)
path(reference_fasta_fai)
path(reference_fasta_dict)
path(dbsnp_bundle)
path(dbsnp_bundle_index)
tuple path(combined_gvcf), path(combined_gvcf_index), path(interval_path), val(interval_id)

output:
path(".command.*")
tuple path(output_filename), path("${output_filename}.tbi"), emit: vcfs

script:
output_filename = generate_standard_filename(
"GATK-${params.gatk_version}",
params.dataset_id,
params.patient_id,
[
'additional_information': "${interval_id}.vcf.gz"
]
)
interval_str = "--intervals ${interval_path}"
interval_padding = params.is_targeted ? "--interval-padding 100" : ""
"""
set -euo pipefail

gatk --java-options "-Xmx${(task.memory - params.gatk_command_mem_diff).getMega()}m" \
GenotypeGVCFs \
--variant ${combined_gvcf} \
--reference ${reference_fasta} \
--verbosity INFO \
--output ${output_filename} \
--dbsnp ${dbsnp_bundle} \
--standard-min-confidence-threshold-for-calling 50 \
${interval_str} \
${interval_padding}
"""
}
6 changes: 3 additions & 3 deletions module/haplotypecaller.nf
yashpatel6 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,12 @@ process run_HaplotypeCallerGVCF_GATK {
path(reference_fasta_dict)
path(dbsnp_bundle)
path(dbsnp_bundle_index)
tuple val(sample_id), path(bam), path(bam_index), path(interval), val(interval_id)
tuple val(sample_id), path(bam), path(bam_index), path(interval_path), val(interval_id)


output:
path(".command.*")
tuple val(sample_id), path(output_filename), path("${output_filename}.tbi"), emit: gvcfs
tuple val(sample_id), path(output_filename), path("${output_filename}.tbi"), path(interval_path), val(interval_id), emit: gvcfs

script:
output_filename = generate_standard_filename(
Expand All @@ -136,7 +136,7 @@ process run_HaplotypeCallerGVCF_GATK {
'additional_information': "${interval_id}_raw_variants.g.vcf.gz"
]
)
interval_str = "--intervals ${interval}"
interval_str = "--intervals ${interval_path}"
interval_padding = params.is_targeted ? "--interval-padding 100" : ""
output_mode = params.emit_all_confident_sites ? "EMIT_ALL_CONFIDENT_SITES" : "EMIT_VARIANTS_ONLY"
"""
Expand Down
Loading