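"""Parse a Wikipedia XML dump into JSON training shards.

For each article, the text before the first section heading becomes the
"context" (the abstract) and the remainder the "full_context" (the body);
the abstract's first sentence, stripped of parentheticals, is the "target".
Articles with too little text are skipped. Two shard families land in
./data/: *-MD*.json holds on-context records (an article paired with its
own first sentence) and *-OC*.json holds off-context records (an article's
context paired with the first sentences of the pages it links to).
"""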
import re
import json

from tqdm import tqdm
from wiki_dump_reader import Cleaner, iterate
from nltk.tokenize import sent_tokenize

database = []  # one parsed record per usable article (first pass)
index = {}     # title -> first sentence, used to resolve links in the second pass
prefix = "enwiki"
cleaner = Cleaner()
# Iterate the full English dump (~21.2M pages); the commented-out variant
# below targets a smaller test dump.
for title, text in tqdm(iterate(f"./source/{prefix}-latest-pages-articles.xml"), total=21181268):
    # for title, text in tqdm(iterate(f"./source/{prefix}-latest-pages-articles.xml"), total=346229):
    text = cleaner.clean_text(text)
    cleaned_text, links = cleaner.build_links(text)
    abstract = []
    passage = []
    abstracting = True
    clean_text = re.sub(r"__NOTOC__", "", cleaned_text)
    # Everything before the first "=" heading is the abstract; the rest is the body.
    for line in clean_text.split("\n"):
        try:
            if line[0] == "=":
                abstracting = False
            elif abstracting:
                abstract.append(line)
            else:
                passage.append(line + "\n")
        except IndexError:  # blank line
            continue
    # Skip stubs: too few abstract or body lines.
    if (len(abstract) < 2) or (len(passage) < 10):
        continue
    abstract_text = " ".join(abstract)
    passage_text = " ".join(passage)
    linkdb = [link["link"] for link in links]
    abstract_splits = sent_tokenize(abstract_text)
    passage_splits = sent_tokenize(passage_text)
    if len(abstract_splits) < 2 or len(passage_splits) < 20:
        continue
    front_raw = abstract_splits.pop(0)  # first sentence: the prediction target
    back_raw = passage_splits.pop()     # last sentence: trimmed from the body
    # Things in the parens often suck: drop parenthetical asides, then
    # collapse the doubled whitespace left behind.
    front = re.sub(r"\s{2,}", " ", re.sub(r"\(.*?\)", "", front_raw))
    abstract_text = abstract_text.replace(front_raw, "").strip()
    passage_text = passage_text.replace(back_raw, "").strip()
    database.append({"title": title, "context": abstract_text, "full_context": passage_text, "target": front, "links": linkdb, "oncontext": True})
    index[title] = front
# First write pass: on-context (MD) records, in shards of at most ~53k records.
ldatabase = []
i = 0
for item in tqdm(database, total=len(database)):
    ldatabase.append(item)
    if len(ldatabase) > 53760:
        with open(f"./data/{prefix}-parsed-long-oc-MD{i}.json", "w") as df:
            df.write(json.dumps(ldatabase))
        ldatabase = []
        i += 1
# Flush the final partial shard.
with open(f"./data/{prefix}-parsed-long-oc-MD{i}.json", "w") as df:
    df.write(json.dumps(ldatabase))

ldatabase = []
i = 0
# Second write pass: off-context (OC) records. Each article's context is
# re-paired with the first sentence of every page it links to, when that
# page was itself parsed in the first pass.
for item in tqdm(database, total=len(database)):
    for link in item["links"]:
        try:
            ldatabase.append({"title": link, "context": item["context"], "full_context": item["full_context"], "target": index[link], "oncontext": False})
        except KeyError:  # linked page was not parsed; skip it
            continue
    if len(ldatabase) > 53760:
        with open(f"./data/{prefix}-parsed-long-oc-OC{i}.json", "w") as df:
            df.write(json.dumps(ldatabase))
        ldatabase = []
        i += 1
# Flush the final partial shard.
with open(f"./data/{prefix}-parsed-long-oc-OC{i}.json", "w") as df:
    df.write(json.dumps(ldatabase))

breakpoint()  # pause here to inspect the results interactively
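# A minimal sketch of downstream usage (hypothetical, but the shard name and
# record keys match what this script writes): load one shard and inspect it.
#
#   import json
#   with open("./data/enwiki-parsed-long-oc-MD0.json") as f:
#       records = json.load(f)
#   print(records[0]["title"], records[0]["target"])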