Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update for gen_fusion_file #23

Merged
merged 1 commit into from
May 20, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions scripts/make_fusion_genes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
"""
Generate a user-defined fusion file

Input file should be a text file containing
1. gene name and
2. transcript name (optional)

For example:
gene1 transcript1
gene2
gene3 transcript3
"""
__author__ = "Kai"
__date__ = "20/05/2020"

import argparse


def read_genelist(inputf):
with open(inputf, "r") as fh:
for line in fh:
yield line.rstrip("\n").split()


def make_fusion_gene(gene, fw, refflat):
# no transcript specified --> use the longest transcript
if len(gene) == 1:
transcripts = {}
with open(refflat, "r") as fh:
for line in fh:
if gene[0] not in line:
continue
_, transcript, chrom, _, start, end, _, _, _, exonstart, exonend = line.rstrip("\n").split("\t")
transcripts[transcript] = (chrom, start, end, exonstart, exonend)
transcript = get_longest_transcript(transcripts.keys(), refflat)
chrom, start, end, exonstart, exonend = transcripts[transcript]

# use user-specified transcript
elif len(gene) == 2:
with open(refflat, "r") as fh:
for line in fh:
if gene[1] not in line:
continue
_, transcript, chrom, _, start, end, _, _, _, exonstart, exonend = line.rstrip("\n").split("\t")
break

# write to a file
header = f">{gene[0]}_{transcript},{chrom}:{start}-{end}\n"
fw.write(header)
exons = list(zip(exonstart.split(","), exonend.split(",")))[:-1]
for index, each_exon in enumerate(exons, start=1):
fw.write(f'{index},{each_exon[0]},{each_exon[1]}\n')
fw.write("\n")


def get_longest_transcript(transcripts, refflat):
longest_length = 0
longest_transcript = ""
with open(refflat, "r") as fh:
for line in fh:
line = line.strip().split()
if line[1] in transcripts:
length = int(line[5]) - int(line[4])
if length > longest_length:
longest_length = length
longest_transcript = line[1]
return longest_transcript


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("input", help="Input filename")
parser.add_argument("-r", "--refflat", required=True, help="Path to the refFlat.txt file, need to be downloaded from UCSC in advance")
parser.add_argument("-o", "--output", required=True, help="The output filename")
args = parser.parse_args()

with open(args.output, "w") as fw:
for gene in read_genelist(args.input):
make_fusion_gene(gene, fw, args.refflat)