-
Notifications
You must be signed in to change notification settings - Fork 2
/
gdc-tsv-tool.py
executable file
·291 lines (269 loc) · 10.8 KB
/
gdc-tsv-tool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
import sys
import json
import requests
import re
import argparse
def arg_parse(argv=None):
    '''
    Builds the command line parser and parses the arguments

    argv -- optional list of argument strings; when None (the default)
            argparse reads the arguments from sys.argv[1:]

    Returns the argparse Namespace with the attributes o, clinical,
    biospecimen, uuid_list, legacy, simple, mafout, allop and
    manifest_file.

    Note: -u uses "store_false", so uuid_list is True when the input
    is a manifest and False when -u is passed.
    '''
    parser = argparse.ArgumentParser(
        description = '----GDC Metadata TSV Download Tool v2.0----',
        usage = 'python gdc-tsv-tool.py <options> MANIFEST_FILE')
    parser.add_argument('-o', '--output', metavar = 'FILE_PREFIX',
        action = "store", dest = 'o', type = str, default = "metadata",
        help = 'Designates a prefix for output files')
    parser.add_argument('-c', '--clinical', action = "store_true",
        help = 'Only outputs clinical metadata')
    parser.add_argument('-b', '--biospecimen', action = "store_true",
        help = 'Only outputs biospecimen metadata')
    parser.add_argument('-u', '--uuid_list', action = "store_false",
        help = 'Pass a plain text list of UUIDs ' \
               '(one UUID per line) instead of a manifest')
    parser.add_argument('-l', '--legacy', action = "store_true",
        help = 'Manifest from GDC Legacy Archive')
    # A space was missing between "fields" and the parenthesis
    parser.add_argument('-s', '--simple', action = "store_true",
        help = 'Output a simple set of fields ' \
               '(file name, file id, project id, ' \
               'case barcode, sample type)')
    parser.add_argument('-x', '--mafout', action = "store_true",
        help = 'Output separate metadata file for ' \
               'MAF or XLSX file (warning: messy)')
    parser.add_argument('-a', '--allop', action = "store_true",
        help = 'Empty or datetime columns are not removed' \
               ' from the output file')
    parser.add_argument('manifest_file', action = "store",
        help = 'Path to manifest file (or UUID List with -u)')
    return parser.parse_args(argv)
def error_parse(code):
    '''
    Prints the message that belongs to an error code and exits

    code -- "bad_mani" or "no_result"; any other key raises KeyError

    The message is printed with an "ERROR: " prefix and the
    script is then stopped with exit status 2.
    '''
    bad_manifest = ("Input must be valid GDC Manifest. "
                    "\n\tGo to https://portal.gdc.cancer.gov/ to download a manifest")
    empty_query = ("Query produced no results, "
                   "are these files from the Legacy Archive? (use -l)")
    messages = {"bad_mani": bad_manifest, "no_result": empty_query}
    print("ERROR: " + messages[code])
    sys.exit(2)
def verbose():
    '''
    Prints which kinds of metadata are going to be retrieved

    Reads the module-level flags sim_arg, get_clin, get_bio and
    maf_info (set by main) and prints one line for each flag that
    is switched on, followed by a separator line.
    '''
    # (flag, line to print) pairs, in the order they are reported
    announcements = (
        (sim_arg, ">-- Retrieving basic metadata\n"),
        (get_clin, ">-- Retrieving clinical metadata\n"),
        (get_bio, ">-- Retrieving biospecimen metadata\n"),
        (maf_info, ">-- Retrieving MAF/XLSX metadata\n"),
    )
    message = '\n' + ''.join(line for flag, line in announcements if flag)
    print(message)
    print("***************************************\n")
def main(args):
    '''
    Copies the parsed arguments into the module-level settings

    args -- Namespace returned by arg_parse

    Besides the plain copies, this works out get_clin and get_bio:
    both are on by default, exactly one of -b / -c switches the
    other kind off, and -s switches both off.
    '''
    global get_clin, get_bio, maf_info, is_manifest, bio_arg, clin_arg,\
           sim_arg, all_columns, o_filename, legacy, manifest_file
    manifest_file = args.manifest_file
    o_filename = args.o
    legacy = args.legacy
    is_manifest = args.uuid_list
    maf_info = args.mafout
    all_columns = args.allop
    sim_arg = args.simple
    clin_arg = args.clinical
    bio_arg = args.biospecimen
    if sim_arg:
        # The simple output never expands clinical or biospecimen data
        get_clin = get_bio = False
    elif bio_arg != clin_arg:
        # Exactly one of -b / -c was given: keep only that kind
        get_clin, get_bio = clin_arg, bio_arg
    else:
        # Neither or both were given: retrieve everything
        get_clin = get_bio = True
def get_uuid_list(manifest):
    '''
    Retrieves the list of UUIDs from
    the manifest passed to the script

    manifest -- path to the manifest (or to the plain UUID list
                when the module-level is_manifest flag is off)

    Returns a dict that maps each UUID to its file name; the file
    name is an empty string when reading a plain UUID list.

    Blank lines are skipped. A manifest whose first line does not
    start with "id", or that has a row without a file name column,
    is reported through error_parse("bad_mani").
    '''
    uuid_dict = {}
    with open(manifest, 'r') as myfile:
        if is_manifest:
            if myfile.readline()[0:2] != 'id':
                error_parse("bad_mani")
            for line in myfile:
                # A trailing empty line would otherwise raise IndexError
                if not line.strip():
                    continue
                columns = line.rstrip('\r\n').split('\t')
                if len(columns) < 2:
                    error_parse("bad_mani")
                uuid_dict[columns[0]] = columns[1]
        else:
            for line in myfile:
                uuid = line.strip()
                # An empty UUID must not end up in the query
                if uuid:
                    uuid_dict[uuid] = ''
    return uuid_dict
def classify_file_list(file_dict):
    '''
    Splits the files into 'mono', 'di' and 'poly' -aliquot groups
    so that files with different numbers of aliquots are not mixed.

    file_dict -- dict of UUID -> file name (see get_uuid_list)

    Returns the three UUID lists in the order mono, di, poly.
    The group is chosen from the file name extension, looking
    through a trailing ".gz". Without a manifest (is_manifest off)
    there are no file names, so every UUID is 'mono'.
    '''
    mono, di, poly = [], [], []
    if not is_manifest:
        return list(file_dict.keys()), di, poly
    # Extension -> group for gzipped and for plain files.
    # NOTE: 'xlsx' only counts as poly when the file is gzipped.
    gzipped_groups = {'maf': poly, 'xlsx': poly, 'vcf': di}
    plain_groups = {'vcf': di, 'maf': poly}
    for uuid, raw_name in file_dict.items():
        parts = raw_name.strip().split(".")
        if parts[-1] == 'gz':
            bucket = gzipped_groups.get(parts[-2], mono)
        else:
            bucket = plain_groups.get(parts[-1], mono)
        bucket.append(uuid)
    return mono, di, poly
def retrieve_metadata_for_list(file_list):
    '''
    Queries the GDC files endpoint for a list of UUIDs

    file_list -- list of file UUIDs used as the "file_id" filter

    The endpoint (current or legacy) and the expanded entities
    (clinical, biospecimen) follow the module-level settings
    legacy, get_clin and get_bio. The query asks for TSV output.

    Returns the body of the response as received; an empty body
    is reported through error_parse("no_result").
    '''
    if legacy:
        endpoint = 'https://api.gdc.cancer.gov/legacy/files'
    else:
        endpoint = 'https://api.gdc.cancer.gov/files'
    wanted_fields = "file_id,file_name,cases.submitter_id,cases.samples.sample_type," \
                    "cases.project.project_id, cases.project.name"
    # The clinical part ends with a comma so that the
    # biospecimen part can be appended directly after it
    expansions = []
    if get_clin:
        expansions.append("cases,cases.demographic,cases.exposures,"
                          "cases.diagnoses,cases.diagnoses.treatments,"
                          "cases.diagnoses,cases.family_histories,")
    if get_bio:
        expansions.append("cases,cases.samples,cases.samples.portions,"
                          "cases.samples.portions.analytes,"
                          "cases.samples.portions.analytes.aliquots,"
                          "cases.samples.portions.slides,"
                          "analysis.metadata.read_groups")
    id_filter = {"op": "in",
                 "content": {"field": "file_id", "value": file_list}}
    payload = {"filters": id_filter,
               "format": "TSV",
               "size": "10000",
               "fields": wanted_fields,
               "expand": "".join(expansions)}
    # NOTE(review): the HTTP status of the response is not checked here
    response = requests.post(endpoint,
                             data = json.dumps(payload),
                             headers = {'Content-Type': 'application/json'},
                             stream = True)
    body = response.content
    if not body.strip():
        error_parse("no_result")
    return body
def order_columns(matrix_list):
    '''
    Note: This function is used in the clean_matrix function
    It puts the columns in a somewhat desirable order

    matrix_list -- list of columns, each column being a list whose
                   first element is the column name

    Returns a new list with the same columns, grouped by their
    entity-of-origin: special fields first, then the groups in
    the_order, then whatever was not recognized. Within a group
    the submitter_id columns come first.
    '''
    # Special fields go first
    special = ["file_name", "file_id", "project_project_id", "project_name"]
    # This step looks at the_order and rearranges the column based on
    # their entity-of-origin
    the_order = ["special", "cases", "samples", "portions", "analytes",
                 "aliquots", "slides", "demographic", "exposures", "diagnoses",
                 "treatments", "family_histories", "analysis_metadata_read_groups"]
    clinfields = ["demographic", "exposures", "treatments",
                  "diagnoses", "family_histories", "cases"]
    biofields = ["samples", "portions", "analytes", "aliquots", "slides",
                 "analysis_metadata_read_groups", "project"]
    # Entities are separated by their index, e.g. samples_0_portions_12_;
    # the index can have more than one digit
    index_sep = re.compile(r'_[0-9]+_')
    nmdict = {"special": []}
    donefields = []

    for item in special:
        for j in matrix_list:
            # The done check keeps a column from being added twice
            if item in j[0] and j not in donefields:
                nmdict["special"].append(j)
                donefields.append(j)

    # Biospecimen columns belong to the entity right before the last index
    for item in biofields:
        subid = []
        nmdict[item] = []
        for j in matrix_list:
            col_name = j[0]
            ents = index_sep.split(col_name)
            if len(ents) > 1 and ents[-2] == item and j not in donefields:
                # Adding submitter_id to be start of entity
                if "submitter_id" in col_name:
                    subid.append(j)
                else:
                    nmdict[item].append(j)
                donefields.append(j)
        nmdict[item] = subid + nmdict[item]

    # Clinical columns are matched on the entity name anywhere in the name
    for item in clinfields:
        # Starts empty for every entity so nothing carries over
        subid = []
        nmdict[item] = []
        for j in matrix_list:
            col_name = j[0]
            if item in col_name and j not in donefields:
                if "submitter_id" in col_name:
                    subid.append(j)
                else:
                    nmdict[item].append(j)
                donefields.append(j)
        nmdict[item] = subid + nmdict[item]

    new_matrix = []
    for item in the_order:
        new_matrix += nmdict[item]
    # Groups that are not part of the_order ("project") would be lost
    # otherwise, since their columns are already marked as done
    for item in nmdict:
        if item not in the_order:
            new_matrix += nmdict[item]
    # Adding remainder of fields to the matrix
    for j in matrix_list:
        if j not in donefields:
            new_matrix.append(j)
    return new_matrix
def clean_matrix(testcase):
    '''
    Removes empty columns and datetime columns (unless -a is specified)

    testcase -- encoded TSV text as returned by
                retrieve_metadata_for_list

    Returns the cleaned table as one tab/newline separated string
    with the columns arranged by order_columns.

    A column is kept when it has more than one value (the header
    counts as one) that is neither empty nor 'live', and its name
    does not contain 'datetime'. With all_columns switched on every
    column is kept.
    '''
    # Only the line ends are trimmed: trimming tabs as well would
    # shorten a last row that ends with empty cells
    text = testcase.decode().strip('\r\n')
    # Turning output into list of lists (matrix)
    matrix = [row.replace('\r', '').split('\t') for row in text.split('\n')]
    # zip stops at the shortest row, so short rows are padded
    # to keep them from cutting off columns of the other rows
    width = max(len(row) for row in matrix)
    for row in matrix:
        row.extend([''] * (width - len(row)))
    # The good_cols list will contain the transposed columns
    good_cols = []
    for column in zip(*matrix):
        column = list(column)
        col_name = column[0]
        informative = sum(1 for value in column if value and value != 'live')
        if (informative > 1 and 'datetime' not in col_name) or all_columns:
            good_cols.append(column)
    good_cols = order_columns(good_cols)
    return '\n'.join('\t'.join(row) for row in zip(*good_cols))
def run_main(my_list, extension, file_type):
    '''
    Retrieves the metadata for one group of files and writes it out

    my_list   -- list of file UUIDs to query
    extension -- suffix added to the output prefix (o_filename)
    file_type -- name of the group, only used in the message

    The output file is opened after the metadata was retrieved and
    cleaned, so a query that fails does not leave an empty file.
    '''
    fn = o_filename + extension
    matrix = retrieve_metadata_for_list(my_list)
    table = clean_matrix(matrix)
    with open(fn, 'w') as my_file:
        my_file.write(table)
    print(">-- {} file metadata written to {}\n".format(file_type, fn))
# Reads the arguments, then loads and classifies the requested files
main(arg_parse())
uuid_dictionary = get_uuid_list(str(manifest_file))
mono, di, poly = classify_file_list(uuid_dictionary)
master = [(mono, ".files.txt", "Standard"),
          (di, ".vcfs.txt", "VCF"),
          (poly, ".mafs.txt", "MAF/XLSX")]

# Without any file to look up there is nothing to query
if not (mono or di or poly):
    error_parse("no_result")
verbose()

# The MAF/XLSX group is only written out when it was asked for (-x).
# Every other group that actually contains files is queried
# and written to its own output file.
selected = master if maf_info else master[:2]
for my_list, extension, file_type in selected:
    if my_list:
        run_main(my_list, extension, file_type)