diff --git a/nextstrain_profiles/nextstrain-gisaid/builds.yaml b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
index 4034e4730..3b3f517b8 100644
--- a/nextstrain_profiles/nextstrain-gisaid/builds.yaml
+++ b/nextstrain_profiles/nextstrain-gisaid/builds.yaml
@@ -2,6 +2,7 @@ auspice_json_prefix: ncov_gisaid
 
 # Define custom rules for pre- or post-standard workflow processing of data.
 custom_rules:
+  - workflow/snakemake_rules/prefilter.smk
   - workflow/snakemake_rules/export_for_nextstrain.smk
 
 # These parameters are only used by the `export_for_nextstrain` rule and shouldn't need to be modified.
@@ -25,7 +26,7 @@ files:
 
 inputs:
   - name: gisaid
-    metadata: "s3://nextstrain-ncov-private/metadata.tsv.zst"
+    metadata: "data/prefiltered_metadata.tsv"
     aligned: "s3://nextstrain-ncov-private/aligned.fasta.zst"
     skip_sanitize_metadata: true
 
diff --git a/workflow/snakemake_rules/prefilter.smk b/workflow/snakemake_rules/prefilter.smk
new file mode 100644
index 000000000..d66f009a9
--- /dev/null
+++ b/workflow/snakemake_rules/prefilter.smk
@@ -0,0 +1,35 @@
+# Custom rules that download the full metadata and subsample it into the
+# table referenced as the `gisaid` input's metadata in
+# nextstrain_profiles/nextstrain-gisaid/builds.yaml.
+
+# Copy the compressed metadata from S3 to a local file.
+rule download_metadata:
+    params:
+        metadata_url="s3://nextstrain-ncov-private/metadata.tsv.zst",
+    output:
+        metadata="data/metadata.tsv.zst",
+    shell:
+        """
+        aws s3 cp {params.metadata_url:q} {output.metadata:q}
+        """
+
+# Subsample the downloaded metadata with `augur filter`, requesting at most
+# `max_sequences` records grouped by the `group_by` columns.
+rule filter_metadata:
+    input:
+        # Reference the upstream rule's output so the path is defined once.
+        metadata=rules.download_metadata.output.metadata,
+    output:
+        metadata="data/prefiltered_metadata.tsv",
+    params:
+        max_sequences=500000,
+        # Left unquoted in the command so each column is a separate argument.
+        group_by="division year month",
+    shell:
+        """
+        augur filter \
+            --metadata {input.metadata:q} \
+            --subsample-max-sequences {params.max_sequences} \
+            --group-by {params.group_by} \
+            --output-metadata {output.metadata:q}
+        """