Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add PDC as an input file source #17

Merged
merged 6 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions container_images.config
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ params {
diann: 'quay.io/protio/diann:1.8.1',
bibliospec: 'quay.io/protio/bibliospec-linux:3.0',
panorama_client: 'quay.io/protio/panorama-client:1.1.0',
pdc_client: 'quay.io/mauraisa/pdc_client:0.15',
encyclopedia: 'quay.io/protio/encyclopedia:2.12.30-2',
encyclopedia3_mriffle: 'quay.io/protio/encyclopedia:3.0.0-MRIFFLE',
qc_pipeline: 'quay.io/mauraisa/dia_qc_report:2.2.4',
Expand Down
12 changes: 12 additions & 0 deletions docs/source/workflow_parameters.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,18 @@ The ``params`` Section
* -
- ``search_engine``
- Must be set to either ``'encyclopedia'`` or ``'diann'``. If set to ``'diann'``, ``chromatogram_library_spectra_dir``, ``chromatogram_library_spectra_glob``, and EncyclopeDIA-specific parameters will be ignored. Default: ``'encyclopedia'``.
* -
- ``pdc.study_id``
- When this option is set, raw files and metadata will be downloaded from the PDC. Default: ``null``.
* -
- ``pdc.gene_level_data``
- A ``tsv`` file mapping gene names to NCBI gene IDs and gene metadata. Required for PDC gene reports. Default: ``null``.
* -
- ``pdc.n_raw_files``
- If this option is set, only ``n`` raw files are downloaded. This is useful for testing but otherwise should be ``null``.
* -
- ``pdc.client_args``
- Additional command line arguments passed to ``PDC_client``. Default: ``''`` (an empty string).
* -
- ``skyline.skip``
- If set to ``true``, will skip the creation of a Skyline document. Default: ``false``.
Expand Down
83 changes: 49 additions & 34 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@ include { generate_dia_qc_report } from "./workflows/generate_qc_report"
include { panorama_upload_results } from "./workflows/panorama_upload"
include { panorama_upload_mzmls } from "./workflows/panorama_upload"
include { save_run_details } from "./workflows/save_run_details"
include { get_pdc_files } from "./workflows/get_pdc_files"

// modules
include { ENCYCLOPEDIA_BLIB_TO_DLIB } from "./modules/encyclopedia"
include { ENCYCLOPEDIA_DLIB_TO_TSV } from "./modules/encyclopedia"
include { BLIB_BUILD_LIBRARY } from "./modules/diann"
include { GET_AWS_USER_ID } from "./modules/aws"
include { BUILD_AWS_SECRETS } from "./modules/aws"
include { EXPORT_GENE_REPORTS } from "./modules/qc_report"

// useful functions and variables
include { param_to_list } from "./workflows/get_input_files"
Expand Down Expand Up @@ -96,25 +98,37 @@ workflow {
aws_secret_id = Channel.of('none').collect() // ensure this is a value channel
}

// only perform msconvert and terminate
if(params.msconvert_only) {
get_wide_mzmls(params.quant_spectra_dir, params.quant_spectra_glob, aws_secret_id) // get wide windows mzmls
// get mzML files
if(params.pdc.study_id) {
get_pdc_files()
wide_mzml_ch = get_pdc_files.out.wide_mzml_ch
pdc_study_name = get_pdc_files.out.study_name
} else{
get_wide_mzmls(params.quant_spectra_dir, params.quant_spectra_glob, aws_secret_id)
wide_mzml_ch = get_wide_mzmls.out.mzml_ch
}
narrow_mzml_ch = null
if(params.chromatogram_library_spectra_dir != null) {
get_narrow_mzmls(params.chromatogram_library_spectra_dir,
params.chromatogram_library_spectra_glob,
aws_secret_id)

narrow_mzml_ch = get_narrow_mzmls.out.mzml_ch
all_mzml_ch = wide_mzml_ch.concat(narrow_mzml_ch)
} else {
all_mzml_ch = wide_mzml_ch
}

if(params.chromatogram_library_spectra_dir != null) {
get_narrow_mzmls(params.chromatogram_library_spectra_dir,
params.chromatogram_library_spectra_glob,
aws_secret_id)

narrow_mzml_ch = get_narrow_mzmls.out.mzml_ch
all_mzml_ch = wide_mzml_ch.concat(narrow_mzml_ch)
} else {
all_mzml_ch = wide_mzml_ch
}
// only perform msconvert and terminate
if(params.msconvert_only) {
// save details about this run
input_files = all_mzml_ch.map{ it -> ['Spectra File', it.baseName] }
version_files = Channel.empty()
save_run_details(input_files.collect(), version_files.collect())
run_details_file = save_run_details.out.run_details

// if requested, upload mzMLs to panorama
if(params.panorama.upload) {

panorama_upload_mzmls(
params.panorama.upload_url,
all_mzml_ch,
Expand All @@ -124,30 +138,27 @@ workflow {
)
}


// save details about this run
input_files = all_mzml_ch.map{ it -> ['Spectra File', it.baseName] }
version_files = Channel.empty()
save_run_details(input_files.collect(), version_files.collect())
run_details_file = save_run_details.out.run_details

return
}

get_input_files(aws_secret_id) // get input files
get_wide_mzmls(params.quant_spectra_dir, params.quant_spectra_glob, aws_secret_id) // get wide windows mzmls

// set up some convenience variables

if(params.spectral_library) {
spectral_library = get_input_files.out.spectral_library
} else {
spectral_library = Channel.empty()
}

if(params.pdc.study_id) {
if(params.replicate_metadata) {
log.warn "params.replicate_metadata will be overritten by PDC metadata"
}
replicate_metadata = get_pdc_files.out.annotations_csv
} else {
replicate_metadata = get_input_files.out.replicate_metadata
}
fasta = get_input_files.out.fasta
skyline_template_zipfile = get_input_files.out.skyline_template_zipfile
wide_mzml_ch = get_wide_mzmls.out.mzml_ch
skyr_file_ch = get_input_files.out.skyr_files

final_elib = null
Expand Down Expand Up @@ -175,13 +186,6 @@ workflow {

// create elib if requested
if(params.chromatogram_library_spectra_dir != null) {
// get narrow windows mzmls
get_narrow_mzmls(params.chromatogram_library_spectra_dir,
params.chromatogram_library_spectra_glob,
aws_secret_id)
narrow_mzml_ch = get_narrow_mzmls.out.mzml_ch

all_mzml_ch = wide_mzml_ch.concat(narrow_mzml_ch)

// create chromatogram library
encyclopeda_export_elib(
Expand Down Expand Up @@ -327,16 +331,26 @@ workflow {
// annotate skyline document if replicate_metadata was specified
if(params.replicate_metadata != null) {
skyline_annotate_doc(skyline_import.out.skyline_results,
get_input_files.out.replicate_metadata)
replicate_metadata)
final_skyline_file = skyline_annotate_doc.out.skyline_results
} else {
final_skyline_file = skyline_import.out.skyline_results
}

// generate QC report
if(!params.qc_report.skip) {
generate_dia_qc_report(final_skyline_file, get_input_files.out.replicate_metadata)
generate_dia_qc_report(final_skyline_file, replicate_metadata)
dia_qc_version = generate_dia_qc_report.out.dia_qc_version

// Export PDC gene tables
if(params.pdc.gene_level_data != null) {
EXPORT_GENE_REPORTS(generate_dia_qc_report.out.qc_report_db,
params.pdc.gene_level_data,
pdc_study_name)
EXPORT_GENE_REPORTS.out.gene_reports | flatten | set{ gene_reports }
} else {
gene_reports = Channel.empty()
}
} else {
dia_qc_version = Channel.empty()
}
Expand All @@ -361,6 +375,7 @@ workflow {
qc_report_files = Channel.empty()
proteowizard_version = Channel.empty()
dia_qc_version = Channel.empty()
gene_reports = Channel.empty()
}

version_files = encyclopedia_version.concat(diann_version,
Expand Down
75 changes: 75 additions & 0 deletions modules/pdc.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@

/**
 * Normalize an optional string of extra PDC_client command line arguments.
 *
 * @param var Extra command line arguments, or null when none were supplied.
 * @return var unchanged, or an empty string when var is null.
 */
def format_client_args(var) {
    // Return the expression directly instead of assigning it to an undeclared
    // variable, which Groovy would store in the script binding rather than in
    // a local.
    return var == null ? "" : var
}

// Resolve a PDC study and fetch its file metadata with PDC_client.
//
// Input:
//   pdc_study_id - study identifier handed to `PDC_client studyID`.
//
// Outputs:
//   metadata            - study_metadata.tsv
//   skyline_annotations - study_metadata_annotations.csv
//   study_id            - value of the `study_id` shell variable
//   study_name          - value of the `study_name` shell variable
//   version             - pdc_client_version.txt
//
// The task is retried up to 5 times on failure and its outputs are copied to
// "${params.result_dir}/pdc".
process GET_STUDY_METADATA {
    publishDir "${params.result_dir}/pdc", failOnError: true, mode: 'copy'
    errorStrategy 'retry'
    maxRetries 5
    label 'process_low_constant'
    container params.images.pdc_client

    input:
        val pdc_study_id

    output:
        path('study_metadata.tsv'), emit: metadata
        path('study_metadata_annotations.csv'), emit: skyline_annotations
        env(study_id), emit: study_id
        env(study_name), emit: study_name
        path('pdc_client_version.txt'), emit: version

    shell:
    // Optional arguments collapse to an empty string when the parameter is null.
    n_files_arg = params.pdc.n_raw_files == null ? "" : "--nFiles ${params.pdc.n_raw_files}"
    pdc_client_args = params.pdc.client_args == null ? "" : params.pdc.client_args

    '''
    # The lookups below are piped through tee. Without pipefail the pipeline
    # status is the status of tee, so a failed PDC_client call would be ignored
    # and the task would continue with an empty variable instead of retrying.
    set -o pipefail

    study_id=$(PDC_client studyID !{pdc_client_args} !{pdc_study_id} | tee study_id.txt)
    study_name=$(PDC_client studyName --normalize !{pdc_client_args} ${study_id} | tee study_name.txt)
    PDC_client metadata !{pdc_client_args} -f tsv !{n_files_arg} --skylineAnnotations ${study_id}

    echo "pdc_client_git_repo='$GIT_REPO - $GIT_BRANCH [$GIT_SHORT_HASH]'" > pdc_client_version.txt
    '''
}

// Run `PDC_client metadataToSky` on an existing study metadata file.
//
// Input:
//   pdc_study_metadata - study metadata file passed to PDC_client.
//
// Output:
//   skyline_annotations - skyline_annotations.csv
process METADATA_TO_SKY_ANNOTATIONS {
    container params.images.pdc_client
    label 'process_low_constant'

    input:
        path pdc_study_metadata

    output:
        path 'skyline_annotations.csv', emit: skyline_annotations

    script:
    """
    PDC_client metadataToSky ${pdc_study_metadata}
    """
}

// Download a single file with `PDC_client file`.
//
// Input:
//   tuple (url, file_name, md5) - passed to PDC_client as the positional URL,
//                                 the `-o` value and the `-m` value.
//                                 NOTE(review): `-m` presumably enables checksum
//                                 verification; confirm against PDC_client.
//
// Output:
//   downloaded_file - the file named `file_name`.
//
// Results are kept in params.panorama_cache_directory via storeDir, and a
// failed task is retried once.
process GET_FILE {
    storeDir "${params.panorama_cache_directory}"
    label 'process_low_constant'
    container params.images.pdc_client
    errorStrategy 'retry'
    maxRetries 1

    input:
        tuple val(url), val(file_name), val(md5)

    output:
        path(file_name), emit: downloaded_file

    shell:
    '''
    PDC_client file -o '!{file_name}' -m '!{md5}' '!{url}'
    '''

    stub:
    // Quote the name exactly as the real command does, so that file names
    // containing spaces or shell metacharacters produce one stub file.
    """
    touch '${file_name}'
    """
}
28 changes: 28 additions & 0 deletions modules/qc_report.nf
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,31 @@ process RENDER_QC_REPORT {
"""
}

// Export gene level tables with `dia_qc export_gene_matrix`.
//
// Inputs:
//   batch_db        - database file given as the last positional argument.
//   gene_level_data - gene table given as the first positional argument.
//   file_prefix     - value used for the `--prefix` option.
//
// Outputs:
//   gene_reports - every *.tsv file produced by the task
//   stdout       - captured standard output (*.stdout)
//   stderr       - captured standard error (*.stderr)
//
// Outputs are copied to "${params.result_dir}/gene_reports".
process EXPORT_GENE_REPORTS {
    label 'process_high_memory'
    container params.images.qc_pipeline
    publishDir "${params.result_dir}/gene_reports", mode: 'copy', failOnError: true

    input:
        path batch_db
        path gene_level_data
        val file_prefix

    output:
        path "*.tsv", emit: gene_reports
        path "*.stdout", emit: stdout
        path "*.stderr", emit: stderr

    script:
    // Both streams are duplicated through tee so that they are saved to files
    // while still reaching the task's own stdout and stderr.
    """
    dia_qc export_gene_matrix --prefix=${file_prefix} --useAliquotId \
        '${gene_level_data}' '${batch_db}' \
        > >(tee "export_reports.stdout") 2> >(tee "export_reports.stderr" >&2)
    """

    stub:
    """
    touch stub.tsv stub.stdout stub.stderr
    """
}
7 changes: 7 additions & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ params {
skip_skyline = null
skyline_skyr_file = null

// Optional PDC study settings
pdc.client_args = ''
pdc.study_id = null
pdc.n_raw_files = null
pdc.metadata_tsv = null
pdc.gene_level_data = null

// The final skyline document will be named using this name. For example,
// if skyline_custom_name = 'human_dia' then the final Skyline document
// will be named "human_dia.sky.zip". When importing into PanoramaWeb--this
Expand Down
51 changes: 51 additions & 0 deletions workflows/get_pdc_files.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@

include { GET_STUDY_METADATA } from "../modules/pdc.nf"
include { METADATA_TO_SKY_ANNOTATIONS } from "../modules/pdc.nf"
include { GET_FILE } from "../modules/pdc.nf"
include { MSCONVERT } from "../modules/msconvert.nf"

// Provide the PDC study metadata, Skyline annotations and study name.
//
// When params.pdc.metadata_tsv is null everything is fetched with
// GET_STUDY_METADATA using params.pdc.study_id. Otherwise the given metadata
// file is used as is and only the Skyline annotations are derived from it.
//
// Emits:
//   study_name      - study name
//   metadata        - study metadata tsv
//   annotations_csv - Skyline annotations csv
workflow get_pdc_study_metadata {
    emit:
        study_name
        metadata
        annotations_csv

    main:
        if(params.pdc.metadata_tsv == null) {
            GET_STUDY_METADATA(params.pdc.study_id)
            metadata = GET_STUDY_METADATA.out.metadata
            annotations_csv = GET_STUDY_METADATA.out.skyline_annotations
            study_name = GET_STUDY_METADATA.out.study_name
        } else {
            metadata = Channel.fromPath(file(params.pdc.metadata_tsv, checkIfExists: true))
            METADATA_TO_SKY_ANNOTATIONS(metadata)
            // Refer to the output by name rather than relying on the process
            // having exactly one output.
            annotations_csv = METADATA_TO_SKY_ANNOTATIONS.out.skyline_annotations
            // params.pdc.study_name may be unset; fall back to the study id so
            // that the emitted study_name is never null.
            study_name = params.pdc.study_name ?: params.pdc.study_id
        }
}

// Download every file listed in the PDC study metadata and convert it with
// MSCONVERT.
//
// Emits:
//   study_name      - study name from get_pdc_study_metadata
//   metadata        - study metadata tsv
//   annotations_csv - Skyline annotations csv
//   wide_mzml_ch    - mzML files produced by MSCONVERT
workflow get_pdc_files {
    emit:
        study_name
        metadata
        annotations_csv
        wide_mzml_ch

    main:
        get_pdc_study_metadata()
        study_name = get_pdc_study_metadata.out.study_name
        metadata = get_pdc_study_metadata.out.metadata
        annotations_csv = get_pdc_study_metadata.out.annotations_csv

        // One (url, file_name, md5sum) tuple per row of the tab separated metadata.
        download_args = metadata
            .splitCsv(header: true, sep: '\t')
            .map{ entry -> tuple(entry.url, entry.file_name, entry.md5sum) }
        GET_FILE(download_args)

        MSCONVERT(
            GET_FILE.out.downloaded_file,
            params.msconvert.do_demultiplex,
            params.msconvert.do_simasspectra
        )

        wide_mzml_ch = MSCONVERT.out.mzml_file
}

Loading