Skip to content

Commit

Permalink
Demo prefilter rule for Nextstrain GISAID build
Browse files Browse the repository at this point in the history
Adds a prefilter rule to reduce the size of the input metadata for the
GISAID build before running the whole workflow.
  • Loading branch information
huddlej committed Jul 25, 2024
1 parent 4477b96 commit 4919079
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 1 deletion.
3 changes: 2 additions & 1 deletion nextstrain_profiles/nextstrain-gisaid/builds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ auspice_json_prefix: ncov_gisaid

# Define custom rules for pre- or post-standard workflow processing of data.
custom_rules:
- workflow/snakemake_rules/prefilter.smk
- workflow/snakemake_rules/export_for_nextstrain.smk

# These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified.
Expand All @@ -25,7 +26,7 @@ files:

inputs:
- name: gisaid
metadata: "s3://nextstrain-ncov-private/metadata.tsv.zst"
metadata: "data/prefiltered_metadata.tsv"
aligned: "s3://nextstrain-ncov-private/aligned.fasta.zst"
skip_sanitize_metadata: true

Expand Down
26 changes: 26 additions & 0 deletions workflow/snakemake_rules/prefilter.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Copy the zstd-compressed metadata table (.tsv.zst) from the
# nextstrain-ncov-private S3 bucket to a local file under data/.
# Uses the AWS CLI, so `aws` must be on PATH with credentials that can read
# the bucket.
rule download_metadata:
    output:
        # Local copy; this is the same path `filter_metadata` reads as input.
        metadata="data/metadata.tsv.zst",
    params:
        # Remote object to fetch.
        metadata_url="s3://nextstrain-ncov-private/metadata.tsv.zst",
    shell:
        """
        aws s3 cp {params.metadata_url} {output.metadata}
        """

# Subsample the downloaded metadata with `augur filter` and write the result
# as an uncompressed TSV. The subsample is capped at `max_sequences` records
# and grouped by the columns listed in `group_by`.
rule filter_metadata:
    input:
        # Path produced by the `download_metadata` rule.
        metadata="data/metadata.tsv.zst",
    params:
        # Space-separated on purpose: the value is interpolated unquoted, so
        # the shell passes each column name to --group-by as its own argument.
        group_by="division year month",
        # Upper bound handed to --subsample-max-sequences.
        max_sequences=500000,
    output:
        metadata="data/prefiltered_metadata.tsv",
    shell:
        """
        augur filter \
            --metadata {input.metadata} \
            --subsample-max-sequences {params.max_sequences} \
            --group-by {params.group_by} \
            --output-metadata {output.metadata}
        """

0 comments on commit 4919079

Please sign in to comment.