about | summary | refs | log | tree | commit | diff | stats
path: root/string_utils.py
diff options
context:
space:
mode:
author: Guillaume Horel <guillaume.horel@gmail.com> 2013-08-17 19:01:24 -0400
committer: Guillaume Horel <guillaume.horel@gmail.com> 2013-08-17 19:01:24 -0400
commit: 1d53df7c99126679d391a1efc96b30aa3848b4d3 (patch)
tree: 2d2977bac09ef557b12faa8f8a3a3707f37ec128 /string_utils.py
parent: f633ea807ef9a83f84f41767e71fad71656a4439 (diff)
download: ocr-layer-curation-1d53df7c99126679d391a1efc96b30aa3848b4d3.tar.gz
add function for converting alignment to sexp
Diffstat (limited to 'string_utils.py')
-rw-r--r-- string_utils.py 18
1 files changed, 18 insertions, 0 deletions
diff --git a/string_utils.py b/string_utils.py
index 1b94ce3..3e0706b 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from Levenshtein import distance as levenshtein
+import re
def simplify(text):
mapp = [(u"’", u"'"), (u"↑", u"."), (u"…", u"..."), (u"É", u"E"),
@@ -171,3 +172,20 @@ def print_alignment(l1, l2, c2, alignment):
for word in l2:
print u"{0:>25} | {1}".format("", word)
+
def _string_escape(text):
    """Return *text* with backslashes and non-printable characters escaped."""
    try:
        # Python 2 byte strings: the codec this module was written against.
        return text.encode('string-escape')
    except LookupError:
        # The 'string-escape' codec does not exist on Python 3;
        # 'unicode_escape' is the closest equivalent for text strings.
        # NOTE(review): the two codecs spell non-ASCII characters
        # differently -- confirm which form the consumer of the output
        # expects.
        return text.encode('unicode_escape').decode('ascii')


def alignment_to_sexp(alignment, sexp, l2):
    """Print *sexp* with the text of its word lines replaced from *l2*.

    Lines of *sexp* that do not contain the substring "word" are printed
    unchanged.  Every other line consumes the next entry of *alignment*:

    * if the entry equals -1, processing stops and nothing further is
      printed;
    * otherwise the entry is iterated as indices into *l2*; the selected
      items are joined with single spaces and substituted for the word
      that follows the four integers of the line, keeping the closing
      parentheses.  The resulting line is escaped and printed.

    A line containing "word" that does not match the expected layout still
    consumes an alignment entry and is printed with only the escaping
    applied.

    Parameters:
        alignment: iterable yielding, per word line, either -1 or an
            iterable of indices into *l2*.
        sexp: iterable of text lines.
        l2: indexable collection of replacement strings.

    Raises:
        StopIteration: if *alignment* runs out before the word lines do.
    """
    # Compiled once: the pattern does not change between lines.  Raw string
    # so that the backslashes reach the regex engine untouched.
    word_re = re.compile(r"(?P<begin>\d+ \d+ \d+ \d+\s) \w+(?P<end>\)+$)")
    alignment = iter(alignment)
    for line in sexp:
        if "word" not in line:
            # A single parenthesised argument prints identically under the
            # Python 2 print statement and the Python 3 print function.
            print(line)
            continue
        index = next(alignment)
        if index == -1:
            break
        words = " ".join(l2[i] for i in index)
        # Strings are immutable: the substituted text must be rebound,
        # otherwise the original line is printed.  A callable replacement
        # is used so that backslashes inside the words are inserted
        # literally instead of being parsed as template escapes.
        line = word_re.sub(
            lambda match: match.group("begin") + words + match.group("end"),
            line)
        print(_string_escape(line))