diff --git a/.travis.yml b/.travis.yml index bf3ac21..f58ebfe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ stages: - name: publish if: branch = master AND type != pull_request - name: publish-staging - if: branch != master + if: branch != master AND type != pull_request jobs: include: diff --git a/Makefile b/Makefile index 2ede8e9..0f83c9f 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -VERSION = 1.0.0 +VERSION = 1.0.1 REPO = variant-filtration-tool MODULE = gdc_filtration_tools BRANCH_NAME?=unknown diff --git a/README.md b/README.md index 823ec95..3a6476c 100644 --- a/README.md +++ b/README.md @@ -244,6 +244,24 @@ optional arguments: -h, --help show this help message and exit ``` +### `format-sanger-pindel-vcf` + +Formats the Sanger Pindel VCF by filling in the `./.` genotypes to `0/0` for normal and +`0/1` for tumor. + +``` +usage: gdc_filtration_tools format-sanger-pindel-vcf [-h] input_vcf output_vcf + +Formats Sanger Pindel VCFs to work better with GDC downstream workflows. + +positional arguments: + input_vcf The input VCF file to format. + output_vcf The output formatted VCF file to create. BGzip and tabix-index created if ends with '.gz'. 
"""Format Sanger PINDEL VCFs for downstream GDC workflows. This includes:

1. Force NORMAL genotypes to be 0/0 and TUMOR genotypes to be 0/1.

@author: Kyle Hernandez
"""
import pysam

from gdc_filtration_tools.logger import Logger
from gdc_filtration_tools.utils import get_pysam_outmode


def _copy_record(
    record: pysam.VariantRecord, writer: pysam.VariantFile
) -> pysam.VariantRecord:
    """Return a new record created by ``writer`` that mirrors ``record``.

    Site-level fields, filters, INFO entries and every per-sample FORMAT
    value are copied across. Samples are matched by position, not by name.
    """
    new_record = writer.new_record()
    new_record.contig = record.contig
    new_record.alleles = record.alleles
    new_record.start = record.start
    new_record.stop = record.stop
    new_record.id = record.id
    new_record.qual = record.qual

    for filter_name in record.filter:
        new_record.filter.add(filter_name)

    for key, value in record.info.items():
        new_record.info[key] = value

    for index, sample in enumerate(record.samples):
        for key, value in record.samples[sample].items():
            new_record.samples[index][key] = value

    return new_record


def format_sanger_pindel_vcf(input_vcf: str, output_vcf: str) -> None:
    """
    Formats Sanger Pindel VCFs to work better with GDC downstream workflows.

    :param input_vcf: The input VCF file to format.
    :param output_vcf: The output formatted VCF file to create. BGzip and tabix-index created if ends with '.gz'.
    """
    # NOTE: the docstring above is kept verbatim on purpose; the README's
    # usage output shows the CLI help text is generated from it.
    logger = Logger.get_logger("format_sanger_pindel_vcf")
    logger.info("Formats Sanger Pindel VCFs.")

    total = 0

    # Context managers guarantee both handles are released: the reader is
    # closed even if opening the writer fails, and the writer is closed
    # (flushing its output) even if processing raises part-way through.
    with pysam.VariantFile(input_vcf) as reader:
        mode = get_pysam_outmode(output_vcf)
        with pysam.VariantFile(
            output_vcf, mode=mode, header=reader.header
        ) as writer:
            for record in reader.fetch():
                total += 1

                # Overwrite the genotypes before copying so the forced
                # values are what ends up in the output record.
                record.samples["TUMOR"]["GT"] = (0, 1)
                record.samples["NORMAL"]["GT"] = (0, 0)

                writer.write(_copy_record(record, writer))

                if total % 100000 == 0:
                    logger.info("Processed {0} records...".format(total))

    # Indexing happens only after the writer is closed, and only when no
    # exception was raised above. "wz" is presumably the mode that
    # get_pysam_outmode returns for '.gz' outputs -- confirm in utils.
    if mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records.".format(total))
"""Tests the ``gdc_filtration_tools.tools.format_sanger_pindel_vcf`` module.
"""
import tempfile
import unittest

import attr
import pysam

from gdc_filtration_tools.__main__ import main
from gdc_filtration_tools.tools.format_sanger_pindel_vcf import format_sanger_pindel_vcf
from tests.utils import captured_output, cleanup_files, get_test_data_path


class TestFormatSangerPindelVcf(unittest.TestCase):
    """Checks the Sanger Pindel formatter through the function and the CLI.

    Both tests run against ``sanger_pindel_test.vcf``, which holds two
    records (positions 10 and 20) with ``./.`` genotypes for both samples.
    """

    @staticmethod
    def _make_output_path():
        """Create an empty ``.vcf.gz`` temp file and return its path.

        The handle is closed straight away so no descriptor stays open for
        the duration of the test; ``delete=False`` keeps the file on disk
        for the tool to overwrite.
        """
        with tempfile.NamedTemporaryFile(suffix=".vcf.gz", delete=False) as handle:
            return handle.name

    def _assert_formatted_vcf(self, path):
        """Assert ``path`` holds exactly the two expected, re-genotyped records."""
        # The context manager closes the file even when an assertion fails.
        with pysam.VariantFile(path) as vcf:
            self.assertEqual(list(vcf.header.samples), ["NORMAL", "TUMOR"])

            for expected_pos in (10, 20):
                rec = next(vcf)
                self.assertEqual(rec.pos, expected_pos)
                self.assertEqual(rec.samples["TUMOR"]["GT"], (0, 1))
                self.assertEqual(rec.samples["NORMAL"]["GT"], (0, 0))

            # No further records may follow the two expected ones.
            with self.assertRaises(StopIteration):
                next(vcf)

    def _assert_tool_log_lines(self, serr):
        """Assert the tool logged the indexing step and the final record count."""
        self.assertIn(
            "[gdc_filtration_tools.format_sanger_pindel_vcf] - Creating tabix index...",
            serr,
        )
        self.assertIn(
            "[gdc_filtration_tools.format_sanger_pindel_vcf] - Processed 2 records.",
            serr,
        )

    def test_format_sanger_pindel_vcf(self):
        """Calling the function directly produces the expected VCF and logs."""
        ivcf = get_test_data_path("sanger_pindel_test.vcf")
        fn = self._make_output_path()
        try:
            with captured_output() as (_, stderr):
                format_sanger_pindel_vcf(ivcf, fn)

            self._assert_formatted_vcf(fn)
            self._assert_tool_log_lines(stderr.getvalue())
        finally:
            cleanup_files(fn)

    def test_cli(self):
        """Running the ``format-sanger-pindel-vcf`` subcommand gives the same result."""
        ivcf = get_test_data_path("sanger_pindel_test.vcf")
        fn = self._make_output_path()
        try:
            with captured_output() as (_, stderr):
                main(["format-sanger-pindel-vcf", ivcf, fn])

            self._assert_formatted_vcf(fn)

            serr = stderr.getvalue()
            self._assert_tool_log_lines(serr)
            # The CLI entry point logs under its own logger name as well.
            self.assertIn("gdc_filtration_tools.main", serr)
        finally:
            cleanup_files(fn)