-
Notifications
You must be signed in to change notification settings - Fork 0
/
rewriteTiccl.v3.py
70 lines (55 loc) · 2.6 KB
/
rewriteTiccl.v3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import collections
import os
import sys
# Command-line arguments: the ranked TICCL output file and the word list to
# rewrite. Exit with a usage message instead of a bare IndexError when either
# path is missing.
if len(sys.argv) < 3:
    sys.exit("usage: {} TICCL_RANKED_OUTPUT WORDLIST".format(
        sys.argv[0] if sys.argv else "rewriteTiccl.py"))
ticcloutput = sys.argv[1]  # nlfiscaal: TICCLv2.OUT.clean.ldcalc.ranked
inputwords = sys.argv[2]  # nlfiscaal: nlf_wordlist.v2
# Read in the spelling corrections.
# File format: one record per line, fields separated by '#':
# Word variant # frequency word variant # Correction # frequency correction # Anagram Value Difference # Levenshtein Distance # Confidence Score
# Example records:
#pprocureur#1#procureur#100002624#20113571875#1#0.865979
#pereel#1#perceel#100002620#24883200000#1#0.999284
#erceel#1#perceel#100002620#20113571875#1#0.995794
#uitbreding#1#uitbreiding#100002620#14693280768#1#0.989803
#tegmoet#1#tegemoet#100002620#11592740743#1#0.982571
#tegenmoet#6#tegemoet#100002620#12166529024#1#0.941964
### Checks whether a spelling suggestion should be kept, based on simple heuristic rules.
# Longer words that occur twice or more do not need a spelling correction.
# A length of 12 was also checked, but that is too short.
# This entails:
# if the long word has a higher frequency than the spelling suggestion -> not a candidate, and a valid long word
# only if the correction occurs more often than the word - it is a valid long word
#   if len(word)>14 and int(freqw)>1 and int(freqcorr) > int(freqw):
# nope, not useful - the word is usually simply correct
def validlongword(word, freqw, correct, freqcorr, anadiff, ldiff, conf_score,
                  length_threshold=14, freq_threshold=1):
    """Return True when `word` is long and frequent enough to be left uncorrected.

    Only `word` and `freqw` take part in the decision; the remaining record
    fields are accepted so a whole TICCL record can be passed in unchanged.

    Parameters:
        word: the word variant found in the corpus.
        freqw: frequency of the word variant; anything int() accepts.
        correct, freqcorr, anadiff, ldiff, conf_score: unused record fields.
        length_threshold: the word must be strictly longer than this
            (default 14).
        freq_threshold: the word must occur strictly more often than this
            (default 1).

    Returns:
        True if len(word) > length_threshold and int(freqw) > freq_threshold,
        otherwise False.

    Raises:
        ValueError: if the word is long enough and `freqw` is not a valid
            integer.
    """
    # `and` short-circuits, so freqw is only converted for long enough words.
    return len(word) > length_threshold and int(freqw) > freq_threshold
# Load the TICCL corrections into a mapping: word variant -> correction.
# Variants accepted by validlongword() are reported on stderr and get no
# entry, so they are left uncorrected later on.
# NOTE(review): if a variant occurs on several lines, the last line wins -
# confirm this is the intended choice for a ranked file.
ticcl_corrections = {}
with open(ticcloutput, 'r', encoding='utf-8') as ticcl:
    for lineno, line in enumerate(ticcl, start=1):
        # Drop the line terminator so the last field (confidence score) is clean.
        record = line.rstrip('\n')
        if not record.strip():
            # Blank lines (e.g. a trailing empty line) carry no record.
            continue
        fields = record.split('#')
        if len(fields) != 7:
            # Same exception type as the failed tuple unpacking, but it now
            # names the offending file and line.
            raise ValueError(
                "{}:{}: expected 7 '#'-separated fields, got {}: {!r}".format(
                    ticcloutput, lineno, len(fields), record))
        (word, freqw, correct, freqcorr, anadiff, ldiff, conf_score) = fields
        if validlongword(word, freqw, correct, freqcorr, anadiff, ldiff, conf_score):
            print("valid word ", word, file=sys.stderr)
        else:
            ticcl_corrections[word] = correct
# Read the lexicon and write it to stdout, one word per line: a word that has
# a TICCL correction is replaced by that correction, every other word is
# printed unchanged.
# new_lexicon and correct_counter belong to the disabled reporting code after
# this loop (a dictionary linking every word to its correction, and a count of
# the words that received no correction); nothing fills them at the moment.
new_lexicon = {}
correct_counter = 0
with open(inputwords, 'r', encoding='utf-8') as lexicon:
    # Stream the file line by line instead of loading it whole with readlines().
    for w in lexicon:
        w = w.strip()
        # Fall back to the word itself when TICCL has no correction for it.
        print(ticcl_corrections.get(w, w))
#newlexlength = len(new_lexicon)
#print("new lexicon has ",newlexlength, " items and ", correct_counter, " words that were not corrected by ticcl \n", file=sys.stderr)
#sorted_new_lexicon = collections.OrderedDict(sorted(new_lexicon.items()))
#for k, v in sorted_new_lexicon.items(): print(k + "\t" + str(v))
#for k, v in sorted_new_lexicon.items(): print(str(v))
# z.isupper() or z.islower()