-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #11 from NCI-GDC/develop
Develop
- Loading branch information
Showing
7 changed files
with
211 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
"""Format Sanger PINDEL VCFs for downstream GDC workflows. This includes: | ||
1. Force NORMAL genotypes to be 0/0 and TUMOR genotypes to be 0/1. | ||
@author: Kyle Hernandez <[email protected]> | ||
""" | ||
import pysam | ||
|
||
from gdc_filtration_tools.logger import Logger | ||
from gdc_filtration_tools.utils import get_pysam_outmode | ||
|
||
|
||
def format_sanger_pindel_vcf(input_vcf: str, output_vcf: str) -> None: | ||
""" | ||
Formats Sanger Pindel VCFs to work better with GDC downstream workflows. | ||
:param input_vcf: The input VCF file to format. | ||
:param output_vcf: The output formatted VCF file to create. BGzip and tabix-index created if ends with '.gz'. | ||
""" | ||
logger = Logger.get_logger("format_sanger_pindel_vcf") | ||
logger.info("Formats Sanger Pindel VCFs.") | ||
|
||
# setup | ||
total = 0 | ||
reader = pysam.VariantFile(input_vcf) | ||
mode = get_pysam_outmode(output_vcf) | ||
writer = pysam.VariantFile(output_vcf, mode=mode, header=reader.header) | ||
|
||
# Process | ||
try: | ||
for record in reader.fetch(): | ||
total += 1 | ||
|
||
record.samples["TUMOR"]["GT"] = (0, 1) | ||
record.samples["NORMAL"]["GT"] = (0, 0) | ||
|
||
# New record | ||
new_record = writer.new_record() | ||
new_record.contig = record.contig | ||
new_record.alleles = record.alleles | ||
new_record.start = record.start | ||
new_record.stop = record.stop | ||
new_record.id = record.id | ||
new_record.qual = record.qual | ||
|
||
for f in record.filter: | ||
new_record.filter.add(f) | ||
|
||
for k, v in record.info.items(): | ||
new_record.info[k] = v | ||
|
||
for i, sample in enumerate(record.samples): | ||
for k, v in record.samples[sample].items(): | ||
new_record.samples[i][k] = v | ||
|
||
writer.write(new_record) | ||
|
||
if total % 100000 == 0: | ||
logger.info("Processed {0} records...".format(total)) | ||
|
||
finally: | ||
reader.close() | ||
writer.close() | ||
|
||
if mode == "wz": | ||
logger.info("Creating tabix index...") | ||
tbx = pysam.tabix_index(output_vcf, preset="vcf", force=True) | ||
|
||
logger.info("Processed {} records.".format(total)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
##fileformat=VCFv4.2 | ||
##FILTER=<ID=PASS,Description="All filters passed"> | ||
##FILTER=<ID=FF004,Description="Tum medium read depth strand bias check: Calls In 8% Reads Bt Depth 10 And 200 (inclusive)"> | ||
##FILTER=<ID=FF005,Description="Tum high read depth strand bias check: Calls In 4% Reads > Depth 200"> | ||
##FILTER=<ID=FF006,Description="Small call excessive repeat check: Fail if Length <= 4 and Repeats > 9"> | ||
##FILTER=<ID=FF010,Description="Variant must not exist within the Unmatched Normal Panel"> | ||
##FILTER=<ID=FF012,Description="Germline: When length < 11 and depth > 9, fail if the variant is seen in both 20% of normal reads AND 20% of tumour reads in either pindel or bwa"> | ||
##FILTER=<ID=FF015,Description="No normal calls"> | ||
##FILTER=<ID=FF016,Description="Verify indel condition"> | ||
##FILTER=<ID=FF018,Description="Sufficient Depth: Pass if depth > 10"> | ||
##FORMAT=<ID=FC,Number=1,Type=Integer,Description="Fragment calls"> | ||
##FORMAT=<ID=FD,Number=1,Type=Integer,Description="Fragment depth"> | ||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | ||
##FORMAT=<ID=NB,Number=1,Type=Integer,Description="BWA calls on the negative strand"> | ||
##FORMAT=<ID=ND,Number=1,Type=Integer,Description="BWA mapped reads on the negative strand"> | ||
##FORMAT=<ID=NP,Number=1,Type=Integer,Description="Pindel calls on the negative strand"> | ||
##FORMAT=<ID=NR,Number=1,Type=Integer,Description="Total mapped reads on the negative strand"> | ||
##FORMAT=<ID=NU,Number=1,Type=Integer,Description="Unique calls on the negative strand"> | ||
##FORMAT=<ID=PB,Number=1,Type=Integer,Description="BWA calls on the positive strand"> | ||
##FORMAT=<ID=PD,Number=1,Type=Integer,Description="BWA mapped reads on the positive strand"> | ||
##FORMAT=<ID=PP,Number=1,Type=Integer,Description="Pindel calls on the positive strand"> | ||
##FORMAT=<ID=PR,Number=1,Type=Integer,Description="Total mapped reads on the positive strand"> | ||
##FORMAT=<ID=PU,Number=1,Type=Integer,Description="Unique calls on the positive strand"> | ||
##INFO=<ID=FF017,Number=0,Type=Flag,Description="Variant must not overlap with a simple repeat"> | ||
##INFO=<ID=LEN,Number=1,Type=Integer,Description="Length"> | ||
##INFO=<ID=OLD_VARIANT,Number=.,Type=String,Description="Original chr:pos:ref:alt encoding"> | ||
##INFO=<ID=PC,Number=1,Type=String,Description="Pindel call"> | ||
##INFO=<ID=RE,Number=1,Type=Integer,Description="Range end"> | ||
##INFO=<ID=REP,Number=1,Type=Integer,Description="Change repeat count within range"> | ||
##INFO=<ID=RS,Number=1,Type=Integer,Description="Range start"> | ||
##INFO=<ID=S1,Number=1,Type=Integer,Description="S1"> | ||
##INFO=<ID=S2,Number=1,Type=Float,Description="S2"> | ||
##contig=<ID=chr1,length=248956422> | ||
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR | ||
chr1 10 7c794825-35ff-4acc-8d08-94f5e5e2b554 T TA 20 FF010;FF015;FF018 FF017;LEN=1;PC=I;RE=10;REP=1;RS=9;S1=39;S2=1172.77 GT:PP:NP:PB:NB:PD:ND:PR:NR:PU:NU:FD:FC ./.:8:1:13:2:13:5:13:5:13:2:18:15 ./.:4:1:6:1:6:3:6:3:6:1:8:7 | ||
chr1 20 1c5a543d-0382-4d18-9c30-9077ac5fb209 CT C 60 FF010;FF012;FF015;FF016 FF017;LEN=1;PC=D;RE=20;REP=4;RS=19;S1=10;S2=430.85 GT:PP:NP:PB:NB:PD:ND:PR:NR:PU:NU:FD:FC ./.:3:0:5:3:13:12:13:12:5:3:24:8 ./.:1:1:5:3:22:10:22:10:5:3:31:7 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
"""Tests the ``gdc_filtration_tools.tools.format_sanger_pindel_vcf`` module. | ||
""" | ||
import tempfile | ||
import unittest | ||
|
||
import attr | ||
import pysam | ||
|
||
from gdc_filtration_tools.__main__ import main | ||
from gdc_filtration_tools.tools.format_sanger_pindel_vcf import format_sanger_pindel_vcf | ||
from tests.utils import captured_output, cleanup_files, get_test_data_path | ||
|
||
|
||
class TestFormatSangerPindelVcf(unittest.TestCase): | ||
def test_format_sanger_pindel_vcf(self): | ||
ivcf = get_test_data_path("sanger_pindel_test.vcf") | ||
(fd, fn) = tempfile.mkstemp(suffix=".vcf.gz") | ||
try: | ||
with captured_output() as (_, stderr): | ||
format_sanger_pindel_vcf(ivcf, fn) | ||
|
||
vcf = pysam.VariantFile(fn) | ||
self.assertEqual(list(vcf.header.samples), ["NORMAL", "TUMOR"]) | ||
rec = next(vcf) | ||
self.assertEqual(rec.pos, 10) | ||
self.assertEqual(rec.samples["TUMOR"]["GT"], (0, 1)) | ||
self.assertEqual(rec.samples["NORMAL"]["GT"], (0, 0)) | ||
|
||
rec = next(vcf) | ||
self.assertEqual(rec.pos, 20) | ||
self.assertEqual(rec.samples["TUMOR"]["GT"], (0, 1)) | ||
self.assertEqual(rec.samples["NORMAL"]["GT"], (0, 0)) | ||
|
||
with self.assertRaises(StopIteration): | ||
rec = next(vcf) | ||
vcf.close() | ||
|
||
serr = stderr.getvalue() | ||
self.assertTrue( | ||
"[gdc_filtration_tools.format_sanger_pindel_vcf] - Creating tabix index..." | ||
in serr | ||
) | ||
self.assertTrue( | ||
"[gdc_filtration_tools.format_sanger_pindel_vcf] - Processed 2 records." | ||
in serr | ||
) | ||
finally: | ||
cleanup_files(fn) | ||
|
||
def test_cli(self): | ||
ivcf = get_test_data_path("sanger_pindel_test.vcf") | ||
(fd, fn) = tempfile.mkstemp(suffix=".vcf.gz") | ||
try: | ||
with captured_output() as (_, stderr): | ||
main(["format-sanger-pindel-vcf", ivcf, fn]) | ||
|
||
vcf = pysam.VariantFile(fn) | ||
self.assertEqual(list(vcf.header.samples), ["NORMAL", "TUMOR"]) | ||
rec = next(vcf) | ||
self.assertEqual(rec.pos, 10) | ||
self.assertEqual(rec.samples["TUMOR"]["GT"], (0, 1)) | ||
self.assertEqual(rec.samples["NORMAL"]["GT"], (0, 0)) | ||
|
||
rec = next(vcf) | ||
self.assertEqual(rec.pos, 20) | ||
self.assertEqual(rec.samples["TUMOR"]["GT"], (0, 1)) | ||
self.assertEqual(rec.samples["NORMAL"]["GT"], (0, 0)) | ||
|
||
with self.assertRaises(StopIteration): | ||
rec = next(vcf) | ||
vcf.close() | ||
|
||
serr = stderr.getvalue() | ||
self.assertTrue( | ||
"[gdc_filtration_tools.format_sanger_pindel_vcf] - Creating tabix index..." | ||
in serr | ||
) | ||
self.assertTrue( | ||
"[gdc_filtration_tools.format_sanger_pindel_vcf] - Processed 2 records." | ||
in serr | ||
) | ||
self.assertTrue("gdc_filtration_tools.main" in serr) | ||
finally: | ||
cleanup_files(fn) |