-
Notifications
You must be signed in to change notification settings - Fork 0
/
enrichmenTE.py
150 lines (122 loc) · 4.43 KB
/
enrichmenTE.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python3
import sys
import os
import time
from shutil import copyfile
import datetime
import logging
from enrichmenTE_utility import mkdir, get_lines, format_time
from enrichmenTE_input import get_args
from enrichmenTE_detect import detect
from enrichmenTE_cluster import cluster
# from enrichmenTE_cluster import cluster
"""
Author: Shunhua Han <[email protected]>
"""
def main():
# Fetch command-line options
args = get_args()
# create output directory
out_dir = args.out
mkdir(out_dir)
# logging config
formatstr = "%(asctime)s: %(levelname)s: %(message)s"
datestr = "%m/%d/%Y %H:%M:%S"
# log_filename = ".".join(["enrichmenTE", prefix, "log"])
now = datetime.datetime.now()
log_filename = "enrichmenTE_{}.log".format(now.strftime("%d_%m_%Y_%T"))
logging.basicConfig(
level=logging.DEBUG,
filename=os.path.join(out_dir, log_filename),
filemode="w",
format=formatstr,
datefmt=datestr,
)
logging.info("CMD: " + " ".join(sys.argv))
start_time = time.time()
if not args.sub:
print(
"Please choose one of the two modes ('detect' or 'cluster'). See --help for more information."
)
return
logging.info("WORKING DIR: {0}".format(os.path.abspath(out_dir)))
logging.info("Start enrichmenTE...")
# detect
if args.sub == "detect":
logging.info("MODE: DETECT")
logging.info("Read 1: {0}".format(os.path.abspath(args.read1)))
logging.info("Read 2: {0}".format(os.path.abspath(args.read2)))
logging.info("Reference genome: {0}".format(os.path.abspath(args.reference)))
# set up default parameters
if args.thread is None:
args.thread = 1
if args.window is None:
args.window = 100
if args.tsd_max is None:
args.tsd_max = 25
if args.gap_max is None:
args.gap_max = 25
# get prefix
if args.prefix is None:
print(args.read1)
r1 = (args.read1).split(",")
prefix = os.path.basename(r1[0]).replace(".fastq.gz", "")
else:
prefix = args.prefix
# create output directory
out_dir = os.path.join(out_dir, prefix)
mkdir(out_dir)
# create module output directory
detect_out_dir = os.path.join(out_dir, "detect")
mkdir(detect_out_dir)
# create directory for intermediate files
tmp_dir = os.path.join(detect_out_dir, "detect_intermediate_files")
mkdir(tmp_dir)
nonref_pred = detect(
prefix=prefix,
read1=args.read1,
read2=args.read2,
reference=args.reference,
outdir=tmp_dir,
depth_config=args.depth_config,
ref_te_bed=args.gff,
window=args.window,
tsd_max=args.tsd_max,
gap_max=args.gap_max,
filter_region=args.filter_region,
thread=args.thread,
)
nonref_output = os.path.join(detect_out_dir, prefix + ".nonref.bed")
copyfile(nonref_pred, nonref_output)
num_nonref = get_lines(nonref_output)
proc_time_all = time.time() - start_time
logging.info("enrichmenTE DETECT finished in " + format_time(proc_time_all))
logging.info("Number of non-reference TEs: " + str(num_nonref))
elif args.sub == "cluster":
logging.info("MODE: CLUSTER")
# get prefix
if args.prefix is None:
prefix = "clustergram"
else:
prefix = args.prefix
# create output directory
out_dir = os.path.join(out_dir, prefix)
mkdir(out_dir)
# create module output directory
cluster_out_dir = os.path.join(out_dir, "cluster")
mkdir(cluster_out_dir)
tree_file, pdf_file = cluster(
prefix=prefix,
enrichmente_out_dirs=args.enrichmente_out_dirs,
filter_region=args.filter_region,
outgroup=args.outgroup,
out=cluster_out_dir,
include_families=args.include_families,
exclude_families=args.exclude_families,
exclude_samples=args.exclude_samples,
)
proc_time_all = time.time() - start_time
logging.info("enrichmenTE CLUSTER finished in " + format_time(proc_time_all))
logging.info("Clustergram file in newick format: " + tree_file)
logging.info("Clustergram file in PDF format: " + pdf_file)
main()