#42 #48 #53 Python module for separating clitics from verbs; minor changes in the other Python modules
LR-POR · Jul 2, 2018 · 1fb3897 · 1fb3897
1 parent 8465a8f
commit 1fb3897
Show file tree

Hide file tree

Showing 8 changed files with 52 additions and 5 deletions.
diff --git a/tools/prepare-delaf.sh b/tools/prepare-delaf.sh
@@ -30,5 +30,5 @@ grep -F ".N:" $1 | # select nouns
 grep -F ".V+PRO:" $1 | # select verbs with clitics
 # rm spurious colon like in abstinhas:-lhe,abster.V+PRO:I2s
     sed "s/:-/-/" |
-    splitW31 > delaf.clitics
+    splitW31 | SeparateHyphen.py > delaf.clitics
 
diff --git a/tools/python-converter/AnnotateClitics.py → tools/python-tools/AnnotateClitics.py b/tools/python-converter/AnnotateClitics.py → tools/python-tools/AnnotateClitics.py
@@ -3,7 +3,7 @@
 
 # Author: Leonel Figueiredo de Alencar - Federal University of Ceará
 # [email protected]
-# Date: June 27, 2018
+# Date: July 2, 2018
 """
 This module annotates enclitic or mesoclitic pronouns in entries in the MBR format
 
@@ -21,7 +21,7 @@
 degustares-lhe	degustar+V.ele.DAT.3.SG+SBJF+2+SG
 
 Tag conversion is performed by the AnnotateClitic function from
-the module ConvertDELAF.py. Ambiguity of clitic "nos" is also handled. 
+module ConvertDELAF.py. Ambiguity of clitic "nos" is also handled. 
 For more details, see the respective module documentation.
 """
 import sys
@@ -34,8 +34,9 @@ def main():
         if HasClitic(entry):
             parts=ParseEntry(entry,r"\t|\+")
             word,lemma,cat,feats=parts[0],parts[1],parts[2],parts[4:]
-            print AnnotateClitic(word,lemma,cat,feats).encode("utf-8")
+            sys.stdout.write("%s\n" % AnnotateClitic(word,lemma,cat,feats).encode("utf-8"))
         else:
-            print entry.encode("utf-8")
+            sys.stdout.write("%s\n" % entry.encode("utf-8"))
+
 if __name__ == '__main__':
 	main()
diff --git a/tools/python-converter/ConvertDELAF.py → tools/python-tools/ConvertDELAF.py b/tools/python-converter/ConvertDELAF.py → tools/python-tools/ConvertDELAF.py
@@ -44,6 +44,9 @@
 EXTENSION="mbr" # output file extension
 SEPARATOR=r"[,.:]+"
 PRO="PRO"
+# PATTERN=r"(^[^-]+)(vos|n[oa]s?|l[oa]s?|lhes?|me|se|te)(,.+\V\+PRO)"
+PATTERN1=r"(^[^-]+)(vos|n[oa]s?|l[oa]s?|lhes?|me|se|te)(,)"
+PATTERN2=r"(^[^-]+)([oa]s?)(,)"
 
 def UnpickleMapping(infile):
     f=open(infile,"rb")
@@ -109,6 +112,13 @@ def ConcatenateFeatures(feats):
 def ConvertFeatures(feats,dic=TAG_MAPPING):
     return [dic.get(f,f) for f in feats]
 
+def SeparateClitic(entry):
+    """Separate clitic from verb form in entries like abluirlhe,abluir.V+PRO:U1s,
+    returning entries like abluir-lhe,abluir.V+PRO:U1s. Clitic separation is performed in two steps: first, clitics beginning with a consonant are separated; then, clitics beginning with a vowel are separated. This is necessary to prevent unwanted separations like
+zuirn-os,zuir.V+PRO:W3s instead of zuir-nos,zuir.V+PRO:W3s, since the forms of the
+vowel-initial clitics are contained in those of the consonant-initial ones."""
+    return re.sub(PATTERN2,r"\1-\2\3",re.sub(PATTERN1,r"\1-\2\3",entry))
+
 def CorrectEntry(entry):
     "Eliminate spurious colon in cases like abstinhas:-lhe,abster.V+PRO:I2s"
     error=":-"

diff --git a/tools/python-tools/SeparateHyphen.py b/tools/python-tools/SeparateHyphen.py
@@ -0,0 +1,36 @@
+#! /usr/bin/env python2.7
+# -*- coding: utf-8 -*-
+
+# Author: Leonel Figueiredo de Alencar - Federal University of Ceará
+# [email protected]
+# Date: July 2, 2018
+
+"""
+This module corrects DELAF entries with the V+PRO tag from standard input
+by inserting the missing hyphen separating the clitic pronoun from
+the verb form in entries like the following:
+
+abluirlhe,abluir.V+PRO:U1s
+
+The output consists of corrected entries, e.g.:
+
+abluir-lhe,abluir.V+PRO:U1s
+
+Usage: cat INFILE | SeparateHyphen.py > OUTFILE
+
+The module uses the SeparateClitic function from module ConvertDELAF.
+Clitic separation is performed using PATTERN1 and PATTERN2, which presuppose that the
+input entries contain the V+PRO tag.
+"""
+
+import sys
+from ConvertDELAF import *
+
+
+def main():
+    entries=ExtractEntries(sys.stdin)
+    for entry in entries:
+        sys.stdout.write("%s\n" % SeparateClitic(entry).encode("utf-8"))
+
+if __name__ == '__main__':
+	main()
diff --git a/tools/python-converter/clitics.pkl → tools/python-tools/clitics.pkl b/tools/python-converter/clitics.pkl → tools/python-tools/clitics.pkl
diff --git a/tools/python-converter/clitics.txt → tools/python-tools/clitics.txt b/tools/python-converter/clitics.txt → tools/python-tools/clitics.txt
diff --git a/tools/python-converter/tag_mapping.pkl → tools/python-tools/tag_mapping.pkl b/tools/python-converter/tag_mapping.pkl → tools/python-tools/tag_mapping.pkl
diff --git a/tools/python-converter/tag_mapping.txt → tools/python-tools/tag_mapping.txt b/tools/python-converter/tag_mapping.txt → tools/python-tools/tag_mapping.txt