preliminary alignment_to_sexp

* right now this just outputs a list of pairs (corrected_word, coords)
* need to generate a sexp file if we want to reinsert into the djvu
* bounding boxes are not smart at all right now (no merges or splits)
author: Guillaume Horel <guillaume.horel@gmail.com> 2014-03-01 15:28:42 -0500
committer: Guillaume Horel <guillaume.horel@gmail.com> 2014-03-01 15:30:27 -0500
commit: c5734b6b776727959f1b485651f1ddc7c8121a85 (patch)
tree: 0d81bc762052bed11fd08e811ef26fdccf974cdd
parent: 424aa29f600bc17c8391a3802206385962648519 (diff)
download: ocr-layer-curation-c5734b6b776727959f1b485651f1ddc7c8121a85.tar.gz
1 files changed, 13 insertions, 20 deletions
diff --git a/string_utils.py b/string_utils.py
index 186f2eb..a8a38c0 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -156,25 +156,18 @@ def print_alignment(l1, l2, c2, alignment):
         for word in l2:
             print u"{0:>25} | {1}".format("", word)
 
-
-def alignment_to_sexp(alignment, sexp, l2):
-    alignment = iter(alignment)
-    for line in sexp:
-        if "word" not in line:
-            print line
+def alignment_to_sexp(l1, l2, c2, alignment):
+    r = []
+    prev = 0
+    for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]):
+        word = " ".join([a[0] for a in g])
+        if not index:
+            r.append([word, []])
         else:
-            index = alignment.next()
-            if index == -1:
-                break
+            begin, end = index[0], index[-1]
+            if end > begin:
+                #need to find a better way to get the box coordinates
+                r.append([word, c2[begin]])
             else:
-                if type(index) == tuple:
-                    word = " ".join([l2[i] for i in list(index)])
-                else:
-                    try:
-                        word = l2[index]
-                    except IndexError:
-                        print index
-                word = word.encode("utf-8").encode("string-escape")
-                re.sub("(?P<begin>\d+ \d+ \d+ \d+\s)    \w+(?P<end>\)+$)",
-                       "\g<begin>{0}\g<end>".format(word), line)
-                print line
+                r.append([word, c2[begin]])
+    return r
author	Guillaume Horel <guillaume.horel@gmail.com>	2014-03-01 15:28:42 -0500
committer	Guillaume Horel <guillaume.horel@gmail.com>	2014-03-01 15:30:27 -0500
commit	c5734b6b776727959f1b485651f1ddc7c8121a85 (patch)
tree	0d81bc762052bed11fd08e811ef26fdccf974cdd
parent	424aa29f600bc17c8391a3802206385962648519 (diff)
download	ocr-layer-curation-c5734b6b776727959f1b485651f1ddc7c8121a85.tar.gz