Skip to content

Commit

Permalink
Merge pull request #11 from NCI-GDC/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
Kyle Hernandez authored Jul 17, 2020
2 parents 114de69 + 2e2d703 commit 434d899
Show file tree
Hide file tree
Showing 7 changed files with 211 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ stages:
- name: publish
if: branch = master AND type != pull_request
- name: publish-staging
if: branch != master
if: branch != master AND type != pull_request

jobs:
include:
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
VERSION = 1.0.0
VERSION = 1.0.1
REPO = variant-filtration-tool
MODULE = gdc_filtration_tools
BRANCH_NAME?=unknown
Expand Down
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,24 @@ optional arguments:
-h, --help show this help message and exit
```

### `format-sanger-pindel-vcf`

Formats the Sanger Pindel VCF by filling in the `./.` genotypes to `0/0` for normal and
`0/1` for tumor.

```
usage: gdc_filtration_tools format-sanger-pindel-vcf [-h] input_vcf output_vcf
Formats Sanger Pindel VCFs to work better with GDC downstream workflows.
positional arguments:
input_vcf The input VCF file to format.
output_vcf The output formatted VCF file to create. BGzip and tabix-index created if ends with '.gz'.
optional arguments:
-h, --help show this help message and exit
```

### `position-filter-dkfz`

Removes VCF records where the POS-2 is less than 0 which
Expand Down
2 changes: 2 additions & 0 deletions gdc_filtration_tools/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from gdc_filtration_tools.tools.filter_somatic_score import filter_somatic_score
from gdc_filtration_tools.tools.format_gdc_vcf import format_gdc_vcf
from gdc_filtration_tools.tools.format_pindel_vcf import format_pindel_vcf
from gdc_filtration_tools.tools.format_sanger_pindel_vcf import format_sanger_pindel_vcf


def main(args: List[str] = None) -> None:
Expand All @@ -39,6 +40,7 @@ def main(args: List[str] = None) -> None:
filter_somatic_score,
format_gdc_vcf,
format_pindel_vcf,
format_sanger_pindel_vcf,
position_filter_dkfz,
]
defopt.run(
Expand Down
69 changes: 69 additions & 0 deletions gdc_filtration_tools/tools/format_sanger_pindel_vcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
"""Format Sanger PINDEL VCFs for downstream GDC workflows. This includes:
1. Force NORMAL genotypes to be 0/0 and TUMOR genotypes to be 0/1.
@author: Kyle Hernandez <[email protected]>
"""
import pysam

from gdc_filtration_tools.logger import Logger
from gdc_filtration_tools.utils import get_pysam_outmode


def format_sanger_pindel_vcf(input_vcf: str, output_vcf: str) -> None:
"""
Formats Sanger Pindel VCFs to work better with GDC downstream workflows.
:param input_vcf: The input VCF file to format.
:param output_vcf: The output formatted VCF file to create. BGzip and tabix-index created if ends with '.gz'.
"""
logger = Logger.get_logger("format_sanger_pindel_vcf")
logger.info("Formats Sanger Pindel VCFs.")

# setup
total = 0
reader = pysam.VariantFile(input_vcf)
mode = get_pysam_outmode(output_vcf)
writer = pysam.VariantFile(output_vcf, mode=mode, header=reader.header)

# Process
try:
for record in reader.fetch():
total += 1

record.samples["TUMOR"]["GT"] = (0, 1)
record.samples["NORMAL"]["GT"] = (0, 0)

# New record
new_record = writer.new_record()
new_record.contig = record.contig
new_record.alleles = record.alleles
new_record.start = record.start
new_record.stop = record.stop
new_record.id = record.id
new_record.qual = record.qual

for f in record.filter:
new_record.filter.add(f)

for k, v in record.info.items():
new_record.info[k] = v

for i, sample in enumerate(record.samples):
for k, v in record.samples[sample].items():
new_record.samples[i][k] = v

writer.write(new_record)

if total % 100000 == 0:
logger.info("Processed {0} records...".format(total))

finally:
reader.close()
writer.close()

if mode == "wz":
logger.info("Creating tabix index...")
tbx = pysam.tabix_index(output_vcf, preset="vcf", force=True)

logger.info("Processed {} records.".format(total))
36 changes: 36 additions & 0 deletions tests/data/sanger_pindel_test.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description="All filters passed">
##FILTER=<ID=FF004,Description="Tum medium read depth strand bias check: Calls In 8% Reads Bt Depth 10 And 200 (inclusive)">
##FILTER=<ID=FF005,Description="Tum high read depth strand bias check: Calls In 4% Reads > Depth 200">
##FILTER=<ID=FF006,Description="Small call excessive repeat check: Fail if Length <= 4 and Repeats > 9">
##FILTER=<ID=FF010,Description="Variant must not exist within the Unmatched Normal Panel">
##FILTER=<ID=FF012,Description="Germline: When length < 11 and depth > 9, fail if the variant is seen in both 20% of normal reads AND 20% of tumour reads in either pindel or bwa">
##FILTER=<ID=FF015,Description="No normal calls">
##FILTER=<ID=FF016,Description="Verify indel condition">
##FILTER=<ID=FF018,Description="Sufficient Depth: Pass if depth > 10">
##FORMAT=<ID=FC,Number=1,Type=Integer,Description="Fragment calls">
##FORMAT=<ID=FD,Number=1,Type=Integer,Description="Fragment depth">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=NB,Number=1,Type=Integer,Description="BWA calls on the negative strand">
##FORMAT=<ID=ND,Number=1,Type=Integer,Description="BWA mapped reads on the negative strand">
##FORMAT=<ID=NP,Number=1,Type=Integer,Description="Pindel calls on the negative strand">
##FORMAT=<ID=NR,Number=1,Type=Integer,Description="Total mapped reads on the negative strand">
##FORMAT=<ID=NU,Number=1,Type=Integer,Description="Unique calls on the negative strand">
##FORMAT=<ID=PB,Number=1,Type=Integer,Description="BWA calls on the positive strand">
##FORMAT=<ID=PD,Number=1,Type=Integer,Description="BWA mapped reads on the positive strand">
##FORMAT=<ID=PP,Number=1,Type=Integer,Description="Pindel calls on the positive strand">
##FORMAT=<ID=PR,Number=1,Type=Integer,Description="Total mapped reads on the positive strand">
##FORMAT=<ID=PU,Number=1,Type=Integer,Description="Unique calls on the positive strand">
##INFO=<ID=FF017,Number=0,Type=Flag,Description="Variant must not overlap with a simple repeat">
##INFO=<ID=LEN,Number=1,Type=Integer,Description="Length">
##INFO=<ID=OLD_VARIANT,Number=.,Type=String,Description="Original chr:pos:ref:alt encoding">
##INFO=<ID=PC,Number=1,Type=String,Description="Pindel call">
##INFO=<ID=RE,Number=1,Type=Integer,Description="Range end">
##INFO=<ID=REP,Number=1,Type=Integer,Description="Change repeat count within range">
##INFO=<ID=RS,Number=1,Type=Integer,Description="Range start">
##INFO=<ID=S1,Number=1,Type=Integer,Description="S1">
##INFO=<ID=S2,Number=1,Type=Float,Description="S2">
##contig=<ID=chr1,length=248956422>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
chr1 10 7c794825-35ff-4acc-8d08-94f5e5e2b554 T TA 20 FF010;FF015;FF018 FF017;LEN=1;PC=I;RE=10;REP=1;RS=9;S1=39;S2=1172.77 GT:PP:NP:PB:NB:PD:ND:PR:NR:PU:NU:FD:FC ./.:8:1:13:2:13:5:13:5:13:2:18:15 ./.:4:1:6:1:6:3:6:3:6:1:8:7
chr1 20 1c5a543d-0382-4d18-9c30-9077ac5fb209 CT C 60 FF010;FF012;FF015;FF016 FF017;LEN=1;PC=D;RE=20;REP=4;RS=19;S1=10;S2=430.85 GT:PP:NP:PB:NB:PD:ND:PR:NR:PU:NU:FD:FC ./.:3:0:5:3:13:12:13:12:5:3:24:8 ./.:1:1:5:3:22:10:22:10:5:3:31:7
84 changes: 84 additions & 0 deletions tests/test_format_sanger_pindel_vcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Tests the ``gdc_filtration_tools.tools.format_sanger_pindel_vcf`` module.
"""
import tempfile
import unittest

import attr
import pysam

from gdc_filtration_tools.__main__ import main
from gdc_filtration_tools.tools.format_sanger_pindel_vcf import format_sanger_pindel_vcf
from tests.utils import captured_output, cleanup_files, get_test_data_path


class TestFormatSangerPindelVcf(unittest.TestCase):
def test_format_sanger_pindel_vcf(self):
ivcf = get_test_data_path("sanger_pindel_test.vcf")
(fd, fn) = tempfile.mkstemp(suffix=".vcf.gz")
try:
with captured_output() as (_, stderr):
format_sanger_pindel_vcf(ivcf, fn)

vcf = pysam.VariantFile(fn)
self.assertEqual(list(vcf.header.samples), ["NORMAL", "TUMOR"])
rec = next(vcf)
self.assertEqual(rec.pos, 10)
self.assertEqual(rec.samples["TUMOR"]["GT"], (0, 1))
self.assertEqual(rec.samples["NORMAL"]["GT"], (0, 0))

rec = next(vcf)
self.assertEqual(rec.pos, 20)
self.assertEqual(rec.samples["TUMOR"]["GT"], (0, 1))
self.assertEqual(rec.samples["NORMAL"]["GT"], (0, 0))

with self.assertRaises(StopIteration):
rec = next(vcf)
vcf.close()

serr = stderr.getvalue()
self.assertTrue(
"[gdc_filtration_tools.format_sanger_pindel_vcf] - Creating tabix index..."
in serr
)
self.assertTrue(
"[gdc_filtration_tools.format_sanger_pindel_vcf] - Processed 2 records."
in serr
)
finally:
cleanup_files(fn)

def test_cli(self):
ivcf = get_test_data_path("sanger_pindel_test.vcf")
(fd, fn) = tempfile.mkstemp(suffix=".vcf.gz")
try:
with captured_output() as (_, stderr):
main(["format-sanger-pindel-vcf", ivcf, fn])

vcf = pysam.VariantFile(fn)
self.assertEqual(list(vcf.header.samples), ["NORMAL", "TUMOR"])
rec = next(vcf)
self.assertEqual(rec.pos, 10)
self.assertEqual(rec.samples["TUMOR"]["GT"], (0, 1))
self.assertEqual(rec.samples["NORMAL"]["GT"], (0, 0))

rec = next(vcf)
self.assertEqual(rec.pos, 20)
self.assertEqual(rec.samples["TUMOR"]["GT"], (0, 1))
self.assertEqual(rec.samples["NORMAL"]["GT"], (0, 0))

with self.assertRaises(StopIteration):
rec = next(vcf)
vcf.close()

serr = stderr.getvalue()
self.assertTrue(
"[gdc_filtration_tools.format_sanger_pindel_vcf] - Creating tabix index..."
in serr
)
self.assertTrue(
"[gdc_filtration_tools.format_sanger_pindel_vcf] - Processed 2 records."
in serr
)
self.assertTrue("gdc_filtration_tools.main" in serr)
finally:
cleanup_files(fn)

0 comments on commit 434d899

Please sign in to comment.