Skip to content

Commit

Permalink
Merge pull request #20 from ajmaurais/file_hashes
Browse files Browse the repository at this point in the history
Calculate output file md5 hashes and file sizes.
  • Loading branch information
mriffle authored Sep 19, 2024
2 parents bca28cd + 80b2119 commit 6501e15
Show file tree
Hide file tree
Showing 16 changed files with 255 additions and 55 deletions.
23 changes: 23 additions & 0 deletions conf/output_directories.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@

params {
output_directories = [
panorama: "${params.result_dir}/panorama",
aws: "${params.result_dir}/aws",
msconvert: "${params.result_dir}/msconvert",
diann: "${params.result_dir}/diann",
qc_report: "${params.result_dir}/qc_report",
qc_report_tables: "${params.result_dir}/qc_report/tables",
gene_reports: "${params.result_dir}/gene_reports",
encyclopedia: [
convert_blib: "${params.result_dir}/encyclopedia/convert-blib",
search_file: "${params.result_dir}/encyclopedia/search-file",
create_elib: "${params.result_dir}/encyclopedia/create-elib"
],
skyline: [
add_lib: "${params.result_dir}/skyline/add-lib",
import_spectra: "${params.result_dir}/skyline/import-spectra",
minimize: "${params.result_dir}/skyline/minimize",
reports: "${params.result_dir}/skyline/reports"
]
]
}
23 changes: 23 additions & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ include { panorama_upload_results } from "./workflows/panorama_upload"
include { panorama_upload_mzmls } from "./workflows/panorama_upload"
include { save_run_details } from "./workflows/save_run_details"
include { get_pdc_files } from "./workflows/get_pdc_files"
include { combine_file_hashes } from "./workflows/combine_file_hashes"

// modules
include { ENCYCLOPEDIA_BLIB_TO_DLIB } from "./modules/encyclopedia"
Expand Down Expand Up @@ -197,12 +198,14 @@ workflow {
)

quant_library = encyclopeda_export_elib.out.elib
spec_lib_hashes = encyclopeda_export_elib.out.output_file_stats

all_elib_ch = encyclopeda_export_elib.out.elib.concat(
encyclopeda_export_elib.out.individual_elibs
)
} else {
quant_library = spectral_library_to_use
spec_lib_hashes = Channel.empty()
all_mzml_ch = wide_mzml_ch
all_elib_ch = Channel.empty()
}
Expand All @@ -219,6 +222,7 @@ workflow {
)

encyclopedia_version = encyclopedia_quant.out.encyclopedia_version
search_file_stats = encyclopedia_quant.out.output_file_stats.concat(spec_lib_hashes)

final_elib = encyclopedia_quant.out.elib
all_elib_ch = all_elib_ch.concat(
Expand Down Expand Up @@ -284,6 +288,7 @@ workflow {
)

diann_version = diann_search.out.diann_version
search_file_stats = diann_search.out.output_file_stats

// create compatible spectral library for Skyline, if needed
if(!params.skyline.skip) {
Expand Down Expand Up @@ -329,11 +334,17 @@ workflow {
}

final_skyline_file = skyline_import.out.skyline_results
final_skyline_hash = skyline_import.out.skyline_results_hash

// generate QC report
if(!params.qc_report.skip) {
generate_dia_qc_report(final_skyline_file, replicate_metadata)
dia_qc_version = generate_dia_qc_report.out.dia_qc_version
qc_report_files = generate_dia_qc_report.out.qc_reports.concat(
generate_dia_qc_report.out.qc_report_qmd,
generate_dia_qc_report.out.qc_report_db,
generate_dia_qc_report.out.qc_tables
)

// Export PDC gene tables
if(params.pdc.gene_level_data != null) {
Expand All @@ -346,6 +357,8 @@ workflow {
}
} else {
dia_qc_version = Channel.empty()
qc_report_files = Channel.empty()
gene_reports = Channel.empty()
}

// run reports if requested
Expand All @@ -367,6 +380,7 @@ workflow {
final_skyline_file = Channel.empty()
qc_report_files = Channel.empty()
proteowizard_version = Channel.empty()
final_skyline_hash = Channel.empty()
dia_qc_version = Channel.empty()
gene_reports = Channel.empty()
}
Expand All @@ -382,6 +396,15 @@ workflow {
save_run_details(input_files.collect(), version_files.collect())
run_details_file = save_run_details.out.run_details

combine_file_hashes(fasta, spectral_library,
search_file_stats,
final_skyline_file,
final_skyline_hash,
skyline_reports_ch,
qc_report_files,
gene_reports,
run_details_file)

// upload results to Panorama
if(params.panorama.upload) {

Expand Down
4 changes: 2 additions & 2 deletions modules/aws.nf
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ process BUILD_AWS_SECRETS {
label 'process_low_constant'
secret 'PANORAMA_API_KEY'
executor 'local' // always run this locally
publishDir "${params.result_dir}/aws", failOnError: true, mode: 'copy'
publishDir params.output_directories.aws, failOnError: true, mode: 'copy'
cache false // never cache

input:
Expand Down Expand Up @@ -123,4 +123,4 @@ process BUILD_AWS_SECRETS {
// touch aws-destroy-secrets.stderr
// touch aws-destroy-secrets.stdout
// """
// }
// }
24 changes: 21 additions & 3 deletions modules/diann.nf
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
process DIANN_SEARCH {
publishDir "${params.result_dir}/diann", failOnError: true, mode: 'copy'
publishDir params.output_directories.diann, failOnError: true, mode: 'copy'
label 'process_high_constant'
container params.images.diann

Expand All @@ -16,6 +16,7 @@ process DIANN_SEARCH {
path("report.tsv"), emit: precursor_tsv
path("*.quant"), emit: quant_files
path("diann_version.txt"), emit: version
path("output_file_stats.txt"), emit: output_file_stats

script:

Expand All @@ -37,18 +38,26 @@ process DIANN_SEARCH {
mv -v lib.tsv.speclib report.tsv.speclib
head -n 1 diann.stdout | egrep -o '[0-9]+\\.[0-9]+\\.[0-9]+' | xargs printf "diann_version=%s\\n" > diann_version.txt
md5sum '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
stat -L --printf='%n\t%s\n' '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sort > sizes.txt
join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
"""

stub:
"""
touch report.tsv.speclib report.tsv stub.quant
touch stub.stderr stub.stdout
diann | egrep -o '[0-9]+\\.[0-9]+\\.[0-9]+' | xargs printf "diann_version=%s\\n" > diann_version.txt
md5sum '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
stat -L --printf='%n\t%s\n' '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sort > sizes.txt
join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
"""
}

process DIANN_SEARCH_LIB_FREE {
publishDir "${params.result_dir}/diann", failOnError: true, mode: 'copy'
publishDir params.output_directories.diann, failOnError: true, mode: 'copy'
label 'process_high_constant'
container params.images.diann

Expand All @@ -65,6 +74,7 @@ process DIANN_SEARCH_LIB_FREE {
path("*.quant"), emit: quant_files
path("lib.predicted.speclib"), emit: predicted_speclib
path("diann_version.txt"), emit: version
path("output_file_stats.txt"), emit: output_file_stats

script:

Expand All @@ -87,19 +97,27 @@ process DIANN_SEARCH_LIB_FREE {
mv -v lib.tsv.speclib report.tsv.speclib
head -n 1 diann.stdout | egrep -o '[0-9]+\\.[0-9]+\\.[0-9]+' | xargs printf "diann_version=%s\\n" > diann_version.txt
md5sum '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
stat -L --printf='%n\t%s\n' '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sort > sizes.txt
join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
"""

stub:
"""
touch lib.predicted.speclib report.tsv.speclib report.tsv stub.quant
touch stub.stderr stub.stdout
diann | egrep -o '[0-9]+\\.[0-9]+\\.[0-9]+' | xargs printf "diann_version=%s\\n" > diann_version.txt
md5sum '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
stat -L --printf='%n\t%s\n' '${ms_files.join('\' \'')}' report.tsv.speclib report.tsv *.quant | sort > sizes.txt
join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
"""
}


process BLIB_BUILD_LIBRARY {
publishDir "${params.result_dir}/diann", failOnError: true, mode: 'copy'
publishDir params.output_directories.diann, failOnError: true, mode: 'copy'
label 'process_medium'
container params.images.bibliospec

Expand Down
28 changes: 18 additions & 10 deletions modules/encyclopedia.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,12 @@ def exec_java_command(mem) {
}

process ENCYCLOPEDIA_SEARCH_FILE {
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.stderr", failOnError: true, mode: 'copy'
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.stdout", failOnError: true, mode: 'copy'
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.elib", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.dia", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.features.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.encyclopedia.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir "${params.result_dir}/encyclopedia/search-file", pattern: "*.encyclopedia.decoy.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir params.output_directories.encyclopedia.search_file, pattern: "*.stderr", failOnError: true, mode: 'copy'
publishDir params.output_directories.encyclopedia.search_file, pattern: "*.stdout", failOnError: true, mode: 'copy'
publishDir params.output_directories.encyclopedia.search_file, pattern: "*.elib", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir params.output_directories.encyclopedia.search_file, pattern: "*.features.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir params.output_directories.encyclopedia.search_file, pattern: "*.encyclopedia.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
publishDir params.output_directories.encyclopedia.search_file, pattern: "*.encyclopedia.decoy.txt", failOnError: true, mode: 'copy', enabled: params.encyclopedia.save_output
label 'process_high_constant'
container params.images.encyclopedia

Expand All @@ -28,6 +27,7 @@ process ENCYCLOPEDIA_SEARCH_FILE {
path("${mzml_file}.features.txt"), emit: features
path("${mzml_file}.encyclopedia.txt"), emit: results_targets
path("${mzml_file}.encyclopedia.decoy.txt"), emit: results_decoys
path("output_file_stats.txt"), emit: output_file_stats


script:
Expand All @@ -40,6 +40,10 @@ process ENCYCLOPEDIA_SEARCH_FILE {
-percolatorVersion /usr/local/bin/percolator \\
${encyclopedia_params} \\
> >(tee "encyclopedia-${mzml_file.baseName}.stdout") 2> >(tee "encyclopedia-${mzml_file.baseName}.stderr" >&2)
md5sum *.elib *.features.txt *.encyclopedia.txt *.encyclopedia.decoy.txt *.mzML | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
stat -L --printf='%n\t%s\n' *.elib *.features.txt *.encyclopedia.txt *.encyclopedia.decoy.txt *.mzML | sort > sizes.txt
join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
"""

stub:
Expand All @@ -50,11 +54,15 @@ process ENCYCLOPEDIA_SEARCH_FILE {
touch "${mzml_file}.features.txt"
touch "${mzml_file}.encyclopedia.txt"
touch "${mzml_file}.encyclopedia.decoy.txt"
md5sum *.elib *.features.txt *.encyclopedia.txt *.encyclopedia.decoy.txt *.mzML | sed -E 's/([a-f0-9]{32}) [ \\*](.*)/\\2\\t\\1/' | sort > hashes.txt
stat -L --printf='%n\t%s\n' *.elib *.features.txt *.encyclopedia.txt *.encyclopedia.decoy.txt *.mzML | sort > sizes.txt
join -t'\t' hashes.txt sizes.txt > output_file_stats.txt
"""
}

process ENCYCLOPEDIA_CREATE_ELIB {
publishDir "${params.result_dir}/encyclopedia/create-elib", failOnError: true, mode: 'copy'
publishDir params.output_directories.encyclopedia.create_elib, failOnError: true, mode: 'copy'
label 'process_memory_high_constant'
container params.images.encyclopedia

Expand Down Expand Up @@ -113,7 +121,7 @@ process ENCYCLOPEDIA_CREATE_ELIB {
}

process ENCYCLOPEDIA_BLIB_TO_DLIB {
publishDir "${params.result_dir}/encyclopedia/convert-blib", failOnError: true, mode: 'copy'
publishDir params.output_directories.encyclopedia.convert_blib, failOnError: true, mode: 'copy'
label 'process_medium'
label 'process_high_memory'
container params.images.encyclopedia
Expand Down Expand Up @@ -147,7 +155,7 @@ process ENCYCLOPEDIA_BLIB_TO_DLIB {
}

process ENCYCLOPEDIA_DLIB_TO_TSV {
publishDir "${params.result_dir}/encyclopedia/convert-blib", failOnError: true, mode: 'copy'
publishDir params.output_directories.encyclopedia.convert_blib, failOnError: true, mode: 'copy'
label 'process_medium'
label 'process_high_memory'
container params.images.encyclopedia3_mriffle
Expand Down
37 changes: 37 additions & 0 deletions modules/file_stats.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@

// Compute the MD5 digest of a single staged file.
//
// Emits a (file name, md5 hex digest) tuple; the digest is captured from the
// task environment via the `env(md5_sum)` output qualifier.
process CALCULATE_MD5 {
    label 'process_low'
    container params.images.ubuntu

    input:
    path(file_to_check)

    output:
    tuple val("${file_to_check.name}"), env(md5_sum)

    shell:
    // Quote the interpolated path: without quotes, file names containing
    // spaces or glob characters would be word-split by the shell and
    // md5sum would fail (or hash the wrong files).
    '''
    md5_sum=$( md5sum "!{file_to_check}" | awk '{print $1}' )
    '''
}

// Collect per-file stat rows (file, path, md5_hash, size) into a single
// tab-separated summary published at the root of the results directory.
//
// `file_stats` is a collected list of pre-formatted, tab-delimited row
// strings; rows are joined with literal '\n' sequences which `echo -e`
// expands back into real newlines at runtime.
process WRITE_FILE_STATS {
    label 'process_low'
    container params.images.ubuntu
    publishDir "${params.result_dir}", failOnError: true, mode: 'copy'

    input:
    val file_stats

    output:
    path("file_checksums.tsv")

    script:
    data = file_stats.join('\\n')
    """
    text="${data}"
    echo -e 'file\\tpath\\tmd5_hash\\tsize' > file_checksums.tsv
    echo -e "\$text" >> file_checksums.tsv
    """
}
2 changes: 1 addition & 1 deletion modules/msconvert.nf
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
process MSCONVERT {
storeDir "${params.mzml_cache_directory}/${workflow.commitId}/${params.msconvert.do_demultiplex}/${params.msconvert.do_simasspectra}"
publishDir "${params.result_dir}/msconvert", pattern: "*.mzML", failOnError: true, mode: 'copy', enabled: params.msconvert_only && !params.panorama.upload
publishDir params.output_directories.msconvert, pattern: "*.mzML", failOnError: true, mode: 'copy', enabled: params.msconvert_only && !params.panorama.upload
label 'process_medium'
label 'process_high_memory'
label 'error_retry'
Expand Down
18 changes: 9 additions & 9 deletions modules/panorama.nf
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ process PANORAMA_GET_RAW_FILE_LIST {
label 'process_low_constant'
label 'error_retry'
container params.images.panorama_client
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy'
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy'
secret 'PANORAMA_API_KEY'

input:
Expand Down Expand Up @@ -93,8 +93,8 @@ process PANORAMA_GET_FILE {
label 'process_low_constant'
label 'error_retry'
container params.images.panorama_client
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stderr"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stderr"
secret 'PANORAMA_API_KEY'

input:
Expand Down Expand Up @@ -169,8 +169,8 @@ process PANORAMA_GET_SKYR_FILE {
label 'process_low_constant'
label 'error_retry'
container params.images.panorama_client
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stderr"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stderr"
secret 'PANORAMA_API_KEY'

input:
Expand Down Expand Up @@ -202,8 +202,8 @@ process UPLOAD_FILE {
label 'error_retry'
maxForks 2
container params.images.panorama_client
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stderr"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stderr"
secret 'PANORAMA_API_KEY'

input:
Expand Down Expand Up @@ -239,8 +239,8 @@ process UPLOAD_FILE {
process IMPORT_SKYLINE {
label 'process_low_constant'
container params.images.panorama_client
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir "${params.result_dir}/panorama", failOnError: true, mode: 'copy', pattern: "*.stderr"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stdout"
publishDir params.output_directories.panorama, failOnError: true, mode: 'copy', pattern: "*.stderr"
secret 'PANORAMA_API_KEY'

input:
Expand Down
Loading

0 comments on commit 6501e15

Please sign in to comment.