Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(mask): Add --mask-gaps {all,terminals} option #1286

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,16 @@

## __NEXT__

### Features

* mask: Add `--mask-gaps` option to `augur mask` to allow masking of gaps at terminals via `--mask-gaps terminals` or all gaps via `--mask-gaps all`. [#1286][] (@corneliusroemer)

### Bug fixes

* distance: Improve documentation by describing how gaps get treated as indels and how users can ignore specific characters in distance calculations. [#1285][] (@huddlej)

[#1285]: https://github.com/nextstrain/augur/pull/1285
[#1286]: https://github.com/nextstrain/augur/pull/1286

## 22.3.0 (14 August 2023)

Expand Down
30 changes: 24 additions & 6 deletions augur/mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def mask_vcf(mask_sites, in_file, out_file, cleanup=True):
pass


def mask_sequence(sequence, mask_sites, mask_from_beginning, mask_from_end, mask_invalid):
def mask_sequence(sequence, mask_sites, mask_from_beginning, mask_from_end, mask_invalid, mask_gaps):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def mask_sequence(sequence, mask_sites, mask_from_beginning, mask_from_end, mask_invalid, mask_gaps):
def mask_sequence(sequence, mask_sites, mask_from_beginning, mask_from_end, mask_invalid, mask_gaps=None):

To match the default noted in docstring.

"""Mask characters at the given sites in a single sequence record, modifying the
record in place.

Expand All @@ -94,6 +94,8 @@ def mask_sequence(sequence, mask_sites, mask_from_beginning, mask_from_end, mask
Number of sites to mask from the end of each sequence (default 0)
mask_invalid: bool
Mask invalid nucleotides (default False)
mask_gaps: str
Mask terminal gaps ("terminals") or all gaps ("all") (default None)

Returns
-------
Expand All @@ -105,13 +107,24 @@ def mask_sequence(sequence, mask_sites, mask_from_beginning, mask_from_end, mask
sequence_length = len(sequence.seq)
beginning, end = mask_from_beginning, mask_from_end

if mask_gaps == "terminals":
# Count leading and trailing gaps
leading_gaps = len(sequence.seq) - len(sequence.seq.lstrip('-'))
trailing_gaps = len(sequence.seq) - len(sequence.seq.rstrip('-'))
# Update beginning and end to account for leading and trailing gaps
beginning = max(beginning, leading_gaps)
end = max(end, trailing_gaps)

if beginning + end > sequence_length:
beginning, end = sequence_length, 0

seq = str(sequence.seq)[beginning:-end or None]

if mask_invalid:
seq = "".join(nuc if nuc in VALID_NUCLEOTIDES else "N" for nuc in seq)

if mask_gaps == "all":
seq = "".join(nuc if nuc != "-" else "N" for nuc in seq)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(non-blocking)

Suggested change
seq = "".join(nuc if nuc != "-" else "N" for nuc in seq)
seq = "".join("N" if nuc == "-" else nuc for nuc in seq)

This reads slightly better to me - it pairs the mask character with the gap character.


masked_sequence = MutableSeq("N" * beginning + seq + "N" * end)

Expand All @@ -125,7 +138,7 @@ def mask_sequence(sequence, mask_sites, mask_from_beginning, mask_from_end, mask
return sequence


def mask_fasta(mask_sites, in_file, out_file, mask_from_beginning=0, mask_from_end=0, mask_invalid=False):
def mask_fasta(mask_sites, in_file, out_file, mask_from_beginning=0, mask_from_end=0, mask_invalid=False, mask_gaps=None):
"""Mask the provided site list from a FASTA file and write to a new file.

Masked sites are overwritten as "N"s.
Expand All @@ -144,6 +157,8 @@ def mask_fasta(mask_sites, in_file, out_file, mask_from_beginning=0, mask_from_e
Number of sites to mask from the end of each sequence (default 0)
mask_invalid: bool
Mask invalid nucleotides (default False)
mask_gaps: str
Mask terminal gaps ("terminals") or all gaps ("all") (default None)
"""
# Load alignment as FASTA generator to prevent loading the whole alignment
# into memory.
Expand All @@ -159,6 +174,7 @@ def mask_fasta(mask_sites, in_file, out_file, mask_from_beginning=0, mask_from_e
mask_from_beginning,
mask_from_end,
mask_invalid,
mask_gaps,
)
for sequence in alignment
)
Expand All @@ -176,6 +192,7 @@ def register_arguments(parser):
"""
parser.add_argument('--sequences', '-s', required=True, help="sequences in VCF or FASTA format")
parser.add_argument('--mask', dest="mask_file", required=False, help="locations to be masked in either BED file format, DRM format, or one 1-indexed site per line.")
parser.add_argument('--mask-gaps', choices=['all', 'terminals'], help="FASTA Only: Mask terminal gaps or all gaps, default is to not mask gaps")
parser.add_argument('--mask-from-beginning', type=int, default=0, help="FASTA Only: Number of sites to mask from beginning")
parser.add_argument('--mask-from-end', type=int, default=0, help="FASTA Only: Number of sites to mask from end")
parser.add_argument('--mask-invalid', action='store_true', help="FASTA Only: Mask invalid nucleotides")
Expand Down Expand Up @@ -216,8 +233,8 @@ def run(args):
if os.path.getsize(args.mask_file) == 0:
print("ERROR: {} is an empty file.".format(args.mask_file))
sys.exit(1)
if not any((args.mask_file, args.mask_from_beginning, args.mask_from_end, args.mask_sites, args.mask_invalid)):
print("No masking sites provided. Must include one of --mask, --mask-from-beginning, --mask-from-end, --mask-invalid, or --mask-sites")
if not any((args.mask_file, args.mask_gaps, args.mask_from_beginning, args.mask_from_end, args.mask_sites, args.mask_invalid)):
print("No masking sites provided. Must include one of --mask, --mask-gaps, --mask-from-beginning, --mask-from-end, --mask-invalid, or --mask-sites")
sys.exit(1)

mask_sites = set()
Expand All @@ -236,12 +253,13 @@ def run(args):
"masked_" + os.path.basename(args.sequences))

if is_vcf(args.sequences):
if args.mask_from_beginning or args.mask_from_end or args.mask_invalid:
print("Cannot use --mask-from-beginning, --mask-from-end, or --mask-invalid with VCF files!")
if args.mask_gaps or args.mask_from_beginning or args.mask_from_end or args.mask_invalid:
print("Cannot use --mask-gaps, --mask-from-beginning, --mask-from-end, or --mask-invalid with VCF files!")
sys.exit(1)
mask_vcf(mask_sites, args.sequences, out_file, args.cleanup)
else:
mask_fasta(mask_sites, args.sequences, out_file,
mask_gaps=args.mask_gaps,
mask_from_beginning=args.mask_from_beginning,
mask_from_end=args.mask_from_end,
mask_invalid=args.mask_invalid)
Expand Down
41 changes: 34 additions & 7 deletions tests/functional/mask.t
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Integration tests for augur mask.
Try masking a VCF without any specified mask.

$ ${AUGUR} mask --sequences mask/variants.vcf.gz
No masking sites provided. Must include one of --mask, --mask-from-beginning, --mask-from-end, --mask-invalid, or --mask-sites
No masking sites provided. Must include one of --mask, --mask-gaps, --mask-from-beginning, --mask-from-end, --mask-invalid, or --mask-sites
[1]

Mask a VCF with a BED file and no specified output file.
Expand All @@ -32,7 +32,7 @@ Mask a VCF with a BED file and a specified output file.
Try masking sequences without any specified mask.

$ ${AUGUR} mask --sequences mask/sequences.fasta
No masking sites provided. Must include one of --mask, --mask-from-beginning, --mask-from-end, --mask-invalid, or --mask-sites
No masking sites provided. Must include one of --mask, --mask-gaps, --mask-from-beginning, --mask-from-end, --mask-invalid, or --mask-sites
[1]

Mask sequences with a BED file and no specified output file.
Expand All @@ -45,7 +45,7 @@ Since no output is provided, the input file is overridden with the masked sequen

$ cat "$TMP/sequences.fasta"
>sequence_1
NNGCNG
NN-ANGCT--G
$ rm -f "$TMP/sequences.fasta"

Mask sequences with a BED file and a specified output file.
Expand All @@ -59,7 +59,7 @@ Mask sequences with a BED file and a specified output file.

$ cat "$TMP/masked.fasta"
>sequence_1
NNGCNG
NN-ANGCT--G
$ rm -f "$TMP/masked.fasta"

Mask one base from the beginning and the end.
Expand All @@ -73,7 +73,7 @@ Mask one base from the beginning and the end.

$ cat "$TMP/masked.fasta"
>sequence_1
NTGCTN
N--ATGCT--N
$ rm -f "$TMP/masked.fasta"

Mask a specific list of sites and also mask one base from the beginning and the end.
Expand All @@ -88,7 +88,7 @@ Mask a specific list of sites and also mask one base from the beginning and the

$ cat "$TMP/masked.fasta"
>sequence_1
NTNNTN
N-NNTGCT--N
$ rm -f "$TMP/masked.fasta"

Mask invalid nucleotides
Expand All @@ -104,4 +104,31 @@ Mask invalid nucleotides
ATCGNNNN
$ rm -f "$TMP/masked.fasta"

$ popd > /dev/null
Mask all gaps
$ ${AUGUR} mask \
> --sequences mask/sequences.fasta \
> --mask-gaps all \
> --output "$TMP/masked.fasta"
Removing masked sites from FASTA file.

$ cat "$TMP/masked.fasta"
>sequence_1
NNNATGCTNNG

Mask terminal gaps as well as one character from beginning and end

$ ${AUGUR} mask \
> --sequences mask/sequences.fasta \
> --mask-gaps terminals \
> --mask-from-beginning 1 \
> --mask-from-end 1 \
> --output "$TMP/masked.fasta"
Removing masked sites from FASTA file.

$ cat "$TMP/masked.fasta"
>sequence_1
NNNATGCT--N

Go back to the original directory.

$ popd > /dev/null
2 changes: 1 addition & 1 deletion tests/functional/mask/sequences.fasta
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
>sequence_1
ATGCTG
---ATGCT--G
Loading