-
Notifications
You must be signed in to change notification settings - Fork 0
/
words_pool.py
67 lines (62 loc) · 2.44 KB
/
words_pool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from hyper_params import *
import numpy as np
import time
import unicodedata
from gensim.models import Word2Vec
from copy import deepcopy
# Language-dependent character sets and accent stripping.
# `Language` is supplied by the star import from hyper_params above.
if Language == 'ru':
    # Cyrillic alphabet (lowercase only) and its vowels.
    alphabet = set('ёйцукенгшщзхъфываролджэячсмитьбюп')
    vowels = set('аэыоуяеиёю')

    def remove_accents(input_str):
        """Identity for Russian: NFKD would decompose letters such as
        'ё'/'й' into base char + combining mark, which we must keep intact."""
        return input_str
else:
    alphabet = 'abcdefghijklmnopqrstuvwxyzıß'
    if Language == 'de':
        # German nouns are capitalized, so keep uppercase letters too.
        alphabet = set(alphabet + alphabet.upper())
    else:
        alphabet = set(alphabet)
    # Set (not str) for O(1) membership tests, consistent with the 'ru' branch.
    vowels = set('aeiouyı')

    def remove_accents(input_str):
        """Strip diacritics: NFKD-decompose, then drop combining marks."""
        nfkd_form = unicodedata.normalize('NFKD', input_str)
        return "".join([c for c in nfkd_form if not unicodedata.combining(c)])
# Build the pool of validated words (when enabled) used below to filter
# the pre-trained embeddings.
print('loading word vectors...')
start = time.time()
if UseValidatedWords:
    # Accept only all-lowercase words; NOTE the check runs on the raw line
    # (before .strip()), so it also sees the trailing newline.
    cond = lambda x: x == x.lower()
    if Language=='de':
        # German nouns are capitalized: exempt the first character from the
        # lowercase requirement.
        cond = lambda x: x[1:] == x[1:].lower()
    with open(ValidatedWordsPath, encoding='utf8') as f:
        validated_words = f.readlines()
    validated_words = {x.strip() for x in validated_words if cond(x)}
    # Keep a word only if, after accent removal, every character is in the
    # language's alphabet AND it contains at least one vowel.
    validated_words = {x for x in validated_words if all([c in alphabet for c in remove_accents(x)])
                       and any([c in vowels for c in remove_accents(x)])}
# Load pre-trained word vectors, filter them down to the allowed alphabet
# (and the validated-word pool, when enabled), and L2-normalise each vector.
word_vectors = {}
if PreTrainedVecsPath.endswith(('.txt', '.vec')):
    # Text format (word2vec/fastText): a "vocab_size vector_size" header,
    # then one "word v1 v2 ... vD" line per word.
    with open(PreTrainedVecsPath, encoding='utf8', errors='ignore') as f:
        header = f.readline()
        vocab_size, vector_size = (int(x) for x in header.split())  # raises on an invalid file format
        for line in f:
            parts = line.rstrip().split()
            # Skip malformed rows (e.g. words containing spaces) whose token
            # count does not match the declared dimensionality — the original
            # code crashed on float() for such lines.
            if len(parts) != vector_size + 1:
                continue
            word, vec = parts[0], np.asarray([float(x) for x in parts[1:]])
            if UseValidatedWords:
                if word not in validated_words:
                    continue
            if any([c not in alphabet for c in remove_accents(word)]):
                continue
            norm = np.linalg.norm(vec)
            if norm == 0:
                continue  # a zero vector cannot be normalised (would yield NaN)
            word_vectors[word] = vec / norm
elif PreTrainedVecsPath.endswith('.bin'):
    # Native gensim model.
    wv = Word2Vec.load(PreTrainedVecsPath).wv
    # gensim >= 4.0 removed `wv.vocab` in favour of `wv.key_to_index`;
    # support both so the script works across gensim versions.
    vocab = wv.key_to_index if hasattr(wv, 'key_to_index') else wv.vocab
    for word in vocab:
        if UseValidatedWords:
            if word not in validated_words:
                continue
        if any([c not in alphabet for c in remove_accents(word)]):
            continue
        vec = deepcopy(wv[word])
        norm = np.linalg.norm(vec)
        if norm == 0:
            continue  # a zero vector cannot be normalised (would yield NaN)
        word_vectors[word] = vec / norm
else:
    raise NotImplementedError
print(len(word_vectors))
print('took {:.0f} s'.format(time.time()-start))