-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
New config file for English Twitter data. Recognizes and retains #has…
…tags and @mentions.
- Loading branch information
Showing
1 changed file
with
289 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,289 @@ | ||
version=0.2 | ||
[RULE-ORDER] | ||
SUFFIX WORD-TOKEN ABBREVIATION-KNOWN NUMBER-ORDINAL URL URL-WWW URL-DOMAIN | ||
E-MAIL WORD-PARPREFIX WORD-PARSUFFIX WORD-COMPOUND | ||
ABBREVIATION INITIAL SMILEY REVERSE-SMILEY PUNCTUATION-MULTI DATE-REVERSE DATE | ||
NUMBER-YEAR TIME FRACNUMBER NUMBER CURRENCY WORD PUNCTUATION UNKNOWN | ||
|
||
[META-RULES] | ||
SPLITTER=% | ||
NUMBER-ORDINAL = (?>(?:[\.,]?\p{N}+)+)-?(?i)(?: %ORDINALS% )(?:\Z|\P{Lu}|\P{Ll})$ | ||
#ABBREVIATION-KNOWN = (?:\p{P}*)?(?:\A|[^\p{L}\.])((?:%ABBREVIATIONS%)\.)(?:\Z|\P{L}) | ||
ABBREVIATION-KNOWN = (?:\p{P}*)?(?:\A|[^\p{L}\.])(?:\A)((?:%ABBREVIATIONS%)(?:\.{0,1}))(?:\Z|\P{L}) | ||
WORD-TOKEN =(%TOKENS%)(?:\p{P}*)?$ | ||
#PREFIX = (?:\A|[^\p{Lu}\.]|[^\p{Ll}\.])(%PREFIXES% )(?:\p{L}+) | ||
SUFFIX = (?:\A|\p{L})+( %SUFFIXES% )(?:\Z|\P{L}) | ||
|
||
[RULES] | ||
%include url | ||
%include e-mail | ||
%include smiley | ||
|
||
#Ex: (dis)information | ||
WORD-PARPREFIX=(?:\p{Ps}\p{L}+[\p{Pc}\p{Pd}]?\p{Pe}[\p{Pc}\p{Pd}]?)\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)* | ||
|
||
#Ex: understand(s) | ||
WORD-PARSUFFIX=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)*(?:[\p{Pc}\p{Pd}]?\p{Ps}[\p{Pc}\p{Pd}]?\p{L}+\p{Pe}) | ||
|
||
#Keep dash/underscore connected parts (even if they are in parenthesis) | ||
WORD-COMPOUND=\p{L}+(?:[\p{Pc}\p{Pd}]\p{L}+)+ | ||
|
||
#Abbreviations with multiple periods | ||
ABBREVIATION=^(?:[\{\(\[\<]?)(\p{L}{1,3}(?:\.\p{L}{1,3})+\.?)(?:\Z|[,:;\}\)\]\>]) | ||
|
||
#retain initials | ||
INITIAL=^(?:\p{Lt}|\p{Lu})\.$ | ||
|
||
#retain hashtag string | ||
HASHTAG=#[\p{L}\p{Mn}\p{N}_\-]+ | ||
|
||
#retain addressee string | ||
ADDRESSEE=@[\p{L}\p{Mn}\p{N}_\-]+ | ||
|
||
#Homogeneous punctuation (ellipsis etc) | ||
PUNCTUATION-MULTI=(?:\.|\-|[!\?]){2,} | ||
|
||
#Date | ||
DATE=\p{N}{1,2}/\p{Ps}?\p{N}{1,2}[/]\p{Ps}?\p{N}{2,4} | ||
DATE-REVERSE=\p{N}{4}/\p{N}{1,2}/\p{N}{1,2} | ||
|
||
FRACNUMBER=\p{N}+(?:/\p{N}+)+ | ||
|
||
NUMBER-YEAR=('\p{N}{2})(?:\P{N}|\z) | ||
#NUMBER-YEAR=('\p{N}{2})\P{N} | ||
|
||
#Times | ||
TIME=\p{N}{1,2}:\p{N}{1,2}(?::\p{N})?(?i:a\.?m\.?|p\.?m\.?)? | ||
|
||
#retain digits, including those starting with initial period (.22), and negative numbers | ||
NUMBER=-?(?:[\.,]?\p{N}+)+ | ||
|
||
CURRENCY=\p{Sc} | ||
|
||
WORD=[\p{L}\p{Mn}]+ | ||
|
||
PUNCTUATION=\p{P} | ||
|
||
UNKNOWN=. | ||
|
||
[PREFIXES] | ||
|
||
[SUFFIXES] | ||
n't|N'T | ||
're|'RE | ||
'm|'M | ||
's|'S | ||
'd|'D | ||
've|'VE | ||
'll|'LL | ||
|
||
[ORDINALS] | ||
st|ST | ||
nd|ND | ||
rd|RD | ||
th|TH | ||
|
||
[TOKENS] | ||
't|'T | ||
n/a|N/A | ||
|
||
[UNITS] | ||
km | ||
m | ||
cm | ||
mm | ||
g | ||
kg | ||
C | ||
l | ||
s | ||
sec | ||
min | ||
gb | ||
mb | ||
kb | ||
|
||
|
||
[CURRENCY] | ||
USD | ||
GBP | ||
CAD | ||
NZD | ||
AUD | ||
SGD | ||
HKD | ||
EUR | ||
|
||
[ABBREVIATIONS] | ||
acc | ||
AD | ||
Adm | ||
al | ||
Ala | ||
anon | ||
Apr | ||
Ariz | ||
Ark | ||
arr | ||
assoc | ||
Aug | ||
av | ||
Ave | ||
Bancorp | ||
Bart | ||
BC | ||
Bhd | ||
Bros | ||
B.S | ||
B.Sc | ||
Calif | ||
cap | ||
Capt | ||
cf | ||
Cie | ||
Co | ||
CO | ||
col | ||
Col | ||
Colo | ||
comb | ||
comb.form | ||
compar | ||
Conn | ||
cont | ||
contd | ||
contr | ||
Corp | ||
CORP | ||
Cos | ||
COS | ||
cu | ||
Dec | ||
Del | ||
dept | ||
Dept | ||
dist | ||
div | ||
D-Mass | ||
doc | ||
doz | ||
Dr | ||
e.g | ||
esp | ||
Esq | ||
est | ||
etc | ||
Etc | ||
Ex | ||
Feb | ||
fem | ||
ff | ||
fig | ||
Fla | ||
for | ||
Fri | ||
ft | ||
Ga | ||
Gen | ||
gm | ||
Gov | ||
Hon | ||
Ill | ||
Inc | ||
INC | ||
Ind | ||
inst | ||
Jan | ||
Jansz | ||
Jos | ||
Jr | ||
Jul | ||
Jun | ||
Kan | ||
Ky | ||
La | ||
Lt | ||
Ltd | ||
M\.A | ||
M\.Sc | ||
MA | ||
MSc | ||
Maj | ||
masc | ||
Mass | ||
Md | ||
Messrs | ||
met | ||
Mfg | ||
Mich | ||
Minn | ||
Miss | ||
Mo | ||
Mon | ||
Mr | ||
Mrs | ||
Ms | ||
Neb | ||
neg | ||
Nev | ||
no | ||
No | ||
nom | ||
Nos | ||
Nov | ||
Oct | ||
Okla | ||
Ore | ||
Pa | ||
pass | ||
pers | ||
Ph | ||
phr | ||
pla | ||
poss | ||
pres | ||
Prof | ||
Prop | ||
Pty | ||
ref | ||
refl | ||
Rep | ||
Reps | ||
Rev | ||
sc | ||
Sen | ||
Sens | ||
Sept | ||
sing | ||
Sr | ||
St | ||
superl | ||
Tenn | ||
Tex | ||
Tues | ||
usu | ||
v | ||
Va | ||
var | ||
viz | ||
vs | ||
Vt | ||
Wash | ||
Wis | ||
Wyo | ||
|
||
|
||
[FILTER] | ||
fl fl | ||
ff ff | ||
ffi ffi | ||
ffl ffl | ||
# also filter soft hyphen | ||
\u00AD | ||
|
||
|
||
[EOSMARKERS] | ||
%include standard-eos | ||
|
||
[QUOTES] | ||
%include standard-quotes |