-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_surrogate.py
52 lines (39 loc) · 1.42 KB
/
scrape_surrogate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import csv
import re
import json
import urllib2
# Look up every character id listed in dataset.csv on fileformat.info, pull
# the "UTF-16 (decimal)" value out of the returned page, and write a JSON
# object mapping each id to [utf16_code, annotation, annotation, ...].
#
# NOTE(review): the copy of this script under review had lost its
# indentation. The loop body below assumes that every statement between the
# `for` and the final `with` belongs to the loop, and that the len(arr)
# if/else is NOT nested under the "first time we see this id" check --
# confirm against the original file.
input_url = 'http://www.fileformat.info/info/unicode/char/'
suffix = '/index.htm'

# Read all rows up front so the CSV handle is closed before the slow network
# loop starts (the handle used to stay open for the whole run and was never
# closed).
with open('/Users/brian_jo/Desktop/Playground/text_to_emoji/dataset.csv') as input_file:
    input_file.readline()  # discard the first line (presumably a header row)
    rows = input_file.readlines()

worddic = {}
i = 0  # number of ids successfully scraped so far; drives progress output
for line in rows:
    fields = line.strip().split(',')
    html_id = fields[0]                 # first column: id used in the URL
    anno_array = fields[-1].split(';')  # last column: ';'-separated annotations
    if not html_id:
        # Blank row: there is nothing to look up, so don't hit the site.
        continue

    search_url = input_url + html_id + suffix
    try:
        conn = urllib2.urlopen(search_url)
        try:
            text = conn.read()
        finally:
            conn.close()
    except Exception:
        # Best effort: skip ids whose page cannot be fetched. Catching
        # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
        continue

    ind = text.find('UTF-16 (decimal)')
    if ind == -1:
        # Marker missing: slicing from find()'s -1 would store unrelated HTML.
        continue
    # The fixed offset 43 skips the label and the markup that follows it
    # (hard-coded for the page layout -- TODO confirm it is still correct).
    new_text = text[ind + 43:]
    ind2 = new_text.find(' <')
    if ind2 == -1:
        # No terminator found: the value cannot be delimited reliably.
        continue
    surr_code = new_text[:ind2]
    # Drop thousands separators, then split into the individual code values.
    arr = surr_code.replace(',', '').split(" ")

    i = i + 1
    if i % 100 == 0:
        # Single-argument parenthesised form prints the same text under the
        # Python 2 print statement.
        print(str(i))

    entry = worddic.setdefault(html_id, [])
    if len(arr) == 2:
        # Two values (presumably a surrogate pair) are stored as "first_second".
        entry.append('_'.join(arr))
    else:
        entry.append(arr[0])
    entry.extend(anno_array)

with open('/Users/brian_jo/Desktop/Playground/text_to_emoji/jsondata.txt', 'w') as outfile:
    json.dump(worddic, outfile)