Skip to content

Commit

Permalink
#42 #48 #53 Python module for separating clitics from verbs; minor ch…
Browse files Browse the repository at this point in the history
…anges in the other Python modules
  • Loading branch information
leoalenc committed Jul 2, 2018
1 parent 8465a8f commit 1fb3897
Show file tree
Hide file tree
Showing 8 changed files with 52 additions and 5 deletions.
2 changes: 1 addition & 1 deletion tools/prepare-delaf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,5 +30,5 @@ grep -F ".N:" $1 | # select nouns
grep -F ".V+PRO:" $1 | # select verbs with clitics
# rm spurious colon like in abstinhas:-lhe,abster.V+PRO:I2s
sed "s/:-/-/" |
splitW31 > delaf.clitics
splitW31 | SeparateHyphen.py > delaf.clitics

Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

# Author: Leonel Figueiredo de Alencar - Federal University of Ceará
# [email protected]
# Date: June 27, 2018
# Date: July 2, 2018
"""
This module annotates enclitic or mesoclitic pronouns in entries in the MBR format
Expand All @@ -21,7 +21,7 @@
degustares-lhe degustar+V.ele.DAT.3.SG+SBJF+2+SG
Tag conversion is performed by the AnnotateClitic function from
the module ConvertDELAF.py. Ambiguity of clitic "nos" is also handled.
module ConvertDELAF.py. Ambiguity of clitic "nos" is also handled.
For more details, see the respective module documentation.
"""
import sys
Expand All @@ -34,8 +34,9 @@ def main():
if HasClitic(entry):
parts=ParseEntry(entry,r"\t|\+")
word,lemma,cat,feats=parts[0],parts[1],parts[2],parts[4:]
print AnnotateClitic(word,lemma,cat,feats).encode("utf-8")
sys.stdout.write("%s\n" % AnnotateClitic(word,lemma,cat,feats).encode("utf-8"))
else:
print entry.encode("utf-8")
sys.stdout.write("%s\n" % entry.encode("utf-8"))

if __name__ == '__main__':
main()
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@
EXTENSION="mbr" # output file extension
# Regex matching one or more consecutive commas, periods or colons.
SEPARATOR=r"[,.:]+"
# Tag string "PRO"; presumably the clitic-pronoun marker of V+PRO entries (TODO confirm usage).
PRO="PRO"
# Earlier single-pattern attempt, kept for reference only:
# PATTERN=r"(^[^-]+)(vos|n[oa]s?|l[oa]s?|lhes?|me|se|te)(,.+\V\+PRO)"
# PATTERN1: a hyphen-free prefix anchored at the start of the entry (group 1),
# a consonant-initial clitic -- vos, no(s)/na(s), lo(s)/la(s), lhe(s), me, se, te --
# (group 2), and the comma that immediately follows it (group 3).
PATTERN1=r"(^[^-]+)(vos|n[oa]s?|l[oa]s?|lhes?|me|se|te)(,)"
# PATTERN2: same three-group shape, but for the vowel-initial clitics o, a, os, as.
# SeparateClitic applies it only after PATTERN1, because these forms are
# suffixes of consonant-initial clitics such as "nos".
PATTERN2=r"(^[^-]+)([oa]s?)(,)"

def UnpickleMapping(infile):
f=open(infile,"rb")
Expand Down Expand Up @@ -109,6 +112,13 @@ def ConcatenateFeatures(feats):
def ConvertFeatures(feats,dic=TAG_MAPPING):
    """Return a new list with every feature in feats looked up in dic.

    A feature that has no entry in dic is kept unchanged.
    """
    converted = []
    for feature in feats:
        converted.append(dic.get(feature, feature))
    return converted

def SeparateClitic(entry):
    """Insert the missing hyphen between verb form and clitic in an entry.

    An entry such as abluirlhe,abluir.V+PRO:U1s is returned as
    abluir-lhe,abluir.V+PRO:U1s.

    The separation runs in two passes. PATTERN1 first splits off clitics
    that begin with a consonant; PATTERN2 then splits off clitics that begin
    with a vowel. The order matters because the vowel-initial forms are
    contained in the consonant-initial ones: running them the other way round
    would give zuirn-os,zuir.V+PRO:W3s instead of zuir-nos,zuir.V+PRO:W3s.
    """
    # Same replacement for both passes: prefix, hyphen, clitic, comma.
    with_hyphen = r"\1-\2\3"
    after_consonant_pass = re.sub(PATTERN1, with_hyphen, entry)
    return re.sub(PATTERN2, with_hyphen, after_consonant_pass)

def CorrectEntry(entry):
"Eliminate spurious colon in cases like abstinhas:-lhe,abster.V+PRO:I2s"
error=":-"
Expand Down
36 changes: 36 additions & 0 deletions tools/python-tools/SeparateHyphen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#! /usr/bin/env python2.7
# -*- coding: utf-8 -*-

# Author: Leonel Figueiredo de Alencar - Federal University of Ceará
# [email protected]
# Date: July 2, 2018

"""
This module corrects DELAF entries with the V+PRO tag from standard input
by inserting the missing hyphen separating the clitic pronoun from
the verb form in entries like the following:
abluirlhe,abluir.V+PRO:U1s
The output consists of corrected entries, e.g.:
abluir-lhe,abluir.V+PRO:U1s
Usage: cat INFILE | SeparateHyphen.py > OUTFILE
The module uses the SeparateClitic function from module ConvertDELAF.
Clitic separation is performed using PATTERN1 and PATTERN2, which presuppose
that the input entries contain the V+PRO tag.
"""

import sys
from ConvertDELAF import *


def main():
    """Read entries from standard input and write them out hyphenated.

    Each entry obtained from ExtractEntries is passed through SeparateClitic
    and written to standard output as one UTF-8 encoded line.
    """
    emit = sys.stdout.write
    for entry in ExtractEntries(sys.stdin):
        separated = SeparateClitic(entry)
        emit("%s\n" % separated.encode("utf-8"))


if __name__ == "__main__":
    main()
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 comments on commit 1fb3897

Please sign in to comment.