-
Notifications
You must be signed in to change notification settings - Fork 0
/
blurb_benchmark.py
157 lines (139 loc) · 7.63 KB
/
blurb_benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
import datasets
from typing import Tuple, List
TASKS = {
"BC5CDR-chem-IOB":"NER chemical mentions",
"BC5CDR-disease-IOB":"NER disease mentions",
"BC2GM-IOB": "NER gene mentions",
"NCBI-disease-IOB":"NER disease mentions",
"JNLPBA":"NER jnlpba protein, cell_type, cell_line, DNA, RNA mentions",
"BIOSSES":"Sentence similarity",
"PubMedQA": "Question answering"
}
class BlurbIOBToSeq2Seq:
"""Converts the BLURB datasets, or any dataset follwoing the
IOB tagging schema into a sequence to sequence problem. Any
dataset out of BLURB can be added, given that a task for it is added
to the TASK dictionary and the dataset follows the:
{'tokens': List['str'], 'ner_tags': List[int]}.
And the dataset is loadable using `datasets.load_dataset`.
"""
def __init__(self,
dataset_name: str = "BC5CDR-chem-IOB",
separator: str = "|separator|",
label_mode: str = "full-text",
output: str = None, #gpt or hf,
closing_token_outputs: str = "[END]", # Needed for GPT-3 API
closing_token_inputs: str = "\n\n[END]\n\n", # Needed for GPT-3 API
class_start: str = "_class_***", # Needed for GPT-3 API
class_end: str = "***_class_" # Needed for GPT-3 API
):
"""Converts the BLURB datasets, or any dataset follwoing the
IOB tagging schema into a sequence to sequence problem.
Args:
dataset_name (str, optional): Name of the dataset in the EMBO/BLURB dataset repository. Defaults to "BC5CDR-chem-IOB".
separator (str, optional): string to separate to consecutive tag mentions in the generated text.
Ignored if `label_mode` is not `labels-only`. Defaults to "|separator|".
label_mode (str, optional): Possible values are `labels-only` or `full-text`. Modifies the output string
of the dataset. `labels-only` lists the labels mentioned in the input. `
full-text` repeats the entire text, adding the labels in the text. The
labels are indicated with `class_start` and `class_end`. Relevant only f
or token clasification (NER) tasks.. Defaults to "full_text".
output (str, optional): Whether to generate the dataset on a `gpt` or 🤗 (`hf`) format. Defaults to None.
closing_token_outputs (str, optional): Indicates end of sentence in outputs. To be added to inference in GPT. Defaults to "[END]".
closing_token_inputs (str, optional): Indicates end of inputs. Defaults to "[END]".
class_start (str, optional): Beginning of an entity in the text. Defaults to "[END]".
class_end (str, optional): Indicates endd of entityu in text. Defaults to "[END]".
"""
self.base_data = "EMBO/BLURB"
self.dataset_name = dataset_name
self.task_name = TASKS[self.dataset_name]
self.model = model
self.separator = separator
self.label_mode = label_mode
self.output = output
self.closing_token_outputs = closing_token_outputs
self.class_start = class_start
self.class_end = class_end
self.closing_token_inputs = closing_token_inputs
self.dataset = datasets.load_dataset(self.base_data, self.dataset_name, download_mode='force_redownload')
self.id2label, self.label2id = self._get_data_labels()
self.dataset_seq2seq = self.dataset.map(
self._convert_into_seq2seq,
batched=True,
remove_columns=["tokens", "ner_tags", "id"]
)
def _convert_into_seq2seq(self, examples) -> dict:
"""Converts IOB tagged datasets into Seq2seq datasets
Args:
examples (_type_): Batch of data generated by HuggingFace
Raises:
NotImplemented: Currently only supports NER related tasks
Returns: batch of dict
dict: {'prompt': str, 'completion': str} if `output='gpt'`
{'tokens': str, 'ner_tags': str} if `output='hf'`
"""
if "NER" in self.task_name:
modified_data = self._ner_into_seq2seq(examples)
else:
raise NotImplemented
return modified_data
def _ner_into_seq2seq(self, examples) -> dict:
inputs_ = 'prompt' if self.output == "gpt" else 'inputs'
targets_ = 'completion' if self.output == "gpt" else 'targets'
modified_data = {inputs_: [], targets_: []}
for i, t in zip(examples['tokens'], examples['ner_tags']):
modified_data[inputs_].append(f"""{self.task_name}: {" ".join(i)}{self.closing_token_inputs}""")
modified_data[targets_].append(self._get_text_labels(i, t))
return modified_data
def _get_data_labels(self) -> Tuple[dict, dict]:
"""_summary_
Returns:
Tuple[dict, dict]: _description_
"""
num_labels = self.dataset['train'].info.features['ner_tags'].feature.num_classes
label_list = self.dataset['train'].info.features['ner_tags'].feature.names
id2label, label2id = {}, {}
for class_, label in zip(range(num_labels), label_list):
id2label[class_] = label
label2id[label] = class_
print(f"\nTraining on {num_labels} features:")
print(", ".join(label_list))
return id2label, label2id
def _get_text_labels(self, inputs: List[str], targets: List[int]) -> str:
if self.label_mode == "labels-only":
output_string = "We found the following chemical mentions -> "
for i, t in zip(inputs, targets):
if t != 0:
output_string += f"{self.id2label[t]}:{i} {self.separator}"
if output_string == "We found the following chemical mentions -> ":
output_string = "No entities found"
output_string += f" {self.closing_token_outputs}"
if self.label_mode == "full-text":
inside_label = None
output_string = ""
for i, t in zip(inputs, targets):
if inside_label:
if t == 0:
output_string += f""" {self.class_end.replace('_class_', inside_label)} {i}"""
inside_label = None
if self.id2label[t].startswith("B"):
output_string += f""" {self.class_end.replace("_class_", inside_label)} {self.class_start.replace("_class_", inside_label)} {i}"""
if self.id2label[t].startswith("I"):
output_string += f" {i}"
else:
if t == 0:
output_string += f"{i} "
else:
if self.id2label[t].startswith("B"):
inside_label = self.id2label[t].split("-")[-1]
output_string += f""" {self.class_start.replace("_class_", inside_label)} {i}"""
output_string += f" {self.closing_token_outputs}"
return output_string
def to_jsonl(self, path_to_file: str, splits: List[str]) -> None:
"""Writes the seq2seq dataset into a jsonl file. Specially useful to generate compatible GPT API data.
Args:
path_to_file (str): _description_
splits (List[str]): _description_
"""
for split in splits:
self.dataset_seq2seq[split].to_json(f"{path_to_file}_{split}.jsonl", lines=True, orient='records')