Skip to content

Commit

Permalink
Merge branch 'trs/nextclade-merge'
Browse files Browse the repository at this point in the history
  • Loading branch information
tsibley committed Sep 19, 2024
2 parents 168b91a + 4d73b7f commit e416110
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 51 deletions.
29 changes: 28 additions & 1 deletion ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -124,5 +124,32 @@ curate:
genotype_field: "virus_name"
nextclade:
dataset_name: "nextstrain/measles/N450/WHO-2012"
field_map: "defaults/nextclade_field_map.tsv"
field_map:
# The first column should be the original column name of the Nextclade TSV
# The second column should be the new column name to use in the final metadata TSV
# Nextclade can have pathogen specific output columns so make sure to check which
# columns would be useful for your downstream phylogenetic analysis.
seqName: seqName
clade: clade
coverage: coverage
totalMissing: missing_data
totalSubstitutions: divergence
totalNonACGTNs: nonACGTN
qc.overallStatus: QC_overall
qc.missingData.status: QC_missing_data
qc.mixedSites.status: QC_mixed_sites
qc.privateMutations.status: QC_rare_mutations
qc.snpClusters.status: QC_snp_clusters
qc.frameShifts.status: QC_frame_shifts
qc.stopCodons.status: QC_stop_codons
frameShifts: frame_shifts
privateNucMutations.reversionSubstitutions: private_reversion_substitutions
privateNucMutations.labeledSubstitutions: private_labeled_substitutions
privateNucMutations.unlabeledSubstitutions: private_unlabeled_substitutions
privateNucMutations.totalReversionSubstitutions: private_total_reversion_substitutions
privateNucMutations.totalLabeledSubstitutions: private_total_labeled_substitutions
privateNucMutations.totalUnlabeledSubstitutions: private_total_unlabeled_substitutions
privateNucMutations.totalPrivateSubstitutions: private_total_private_substitutions
qc.snpClusters.clusteredSNPs: private_snp_clusters
qc.snpClusters.totalSNPs: private_total_snp_clusters
id_field: "seqName"
28 changes: 0 additions & 28 deletions ingest/defaults/nextclade_field_map.tsv

This file was deleted.

64 changes: 42 additions & 22 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ and sequences.
See Nextclade docs for more details on usage, inputs, and outputs if you would
like to customize the rules
"""
import sys

DATASET_NAME = config["nextclade"]["dataset_name"]


Expand Down Expand Up @@ -46,35 +48,53 @@ rule run_nextclade:
"""


rule join_metadata_and_nextclade:
if isinstance(config["nextclade"]["field_map"], str):
print(f"Converting config['nextclade']['field_map'] from TSV file ({config['nextclade']['field_map']}) to dictionary; "
f"consider putting the field map directly in the config file.", file=sys.stderr)

with open(config["nextclade"]["field_map"], "r") as f:
config["nextclade"]["field_map"] = dict(line.rstrip("\n").split("\t", 1) for line in f if not line.startswith("#"))


rule nextclade_metadata:
input:
nextclade="results/nextclade.tsv",
output:
nextclade_metadata=temp("results/nextclade_metadata.tsv"),
params:
nextclade_id_field=config["nextclade"]["id_field"],
nextclade_field_map=[f"{old}={new}" for old, new in config["nextclade"]["field_map"].items()],
nextclade_fields=",".join(config["nextclade"]["field_map"].values()),
shell:
r"""
augur curate rename \
--metadata {input.nextclade:q} \
--id-column {params.nextclade_id_field:q} \
--field-map {params.nextclade_field_map:q} \
--output-metadata - \
| tsv-select --header --fields {params.nextclade_fields:q} \
> {output.nextclade_metadata:q}
"""


rule join_metadata_and_nextclade:
input:
metadata="data/subset_metadata.tsv",
nextclade_field_map=config["nextclade"]["field_map"],
nextclade_metadata="results/nextclade_metadata.tsv",
output:
metadata="results/metadata.tsv",
params:
metadata_id_field=config["curate"]["output_id_field"],
nextclade_id_field=config["nextclade"]["id_field"],
shell:
r"""
augur merge \
--metadata \
metadata={input.metadata:q} \
nextclade={input.nextclade_metadata:q} \
--metadata-id-columns \
metadata={params.metadata_id_field:q} \
nextclade={params.nextclade_id_field:q} \
--output-metadata {output.metadata:q} \
--no-source-columns
"""
export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'`
csvtk -tl cut -f $SUBSET_FIELDS \
{input.nextclade} \
| csvtk -tl rename2 \
-F \
-f '*' \
-p '(.+)' \
-r '{{kv}}' \
-k {input.nextclade_field_map} \
| tsv-join -H \
--filter-file - \
--key-fields {params.nextclade_id_field} \
--data-fields {params.metadata_id_field} \
--append-fields '*' \
--write-all ? \
{input.metadata} \
| tsv-select -H --exclude {params.nextclade_id_field} \
> {output.metadata}
"""

0 comments on commit e416110

Please sign in to comment.