add function for converting alignment to sexp

author: Guillaume Horel <guillaume.horel@gmail.com> 2013-08-17 19:01:24 -0400
committer: Guillaume Horel <guillaume.horel@gmail.com> 2013-08-17 19:01:24 -0400
commit: 1d53df7c99126679d391a1efc96b30aa3848b4d3 (patch)
tree: 2d2977bac09ef557b12faa8f8a3a3707f37ec128
parent: f633ea807ef9a83f84f41767e71fad71656a4439 (diff)
download: ocr-layer-curation-1d53df7c99126679d391a1efc96b30aa3848b4d3.tar.gz
2 files changed, 24 insertions, 3 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 3d4ee96..773a1d4 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -33,11 +33,14 @@ def parse_wordline(line):
     coords = map(int, line[1:5])
     return word, coords
 
-def parse_page_sexp(djvubook, pagenumber):
+def page_sexp(djvubook, pagenumber):  # returns djvused's print-txt output for one page, split on "\n"
     args = ["djvused", "-e", "select {0};print-txt".format(pagenumber),
             djvubook]  # argument list (no shell): djvused -e "select N;print-txt" <djvubook>
-    page = [parse_wordline(line) for line in \
-            subprocess.check_output(args).split("\n") if "word" in line]
+    return subprocess.check_output(args).split("\n")  # raw lines, unfiltered; callers pick the ones they need
+
+def parse_page_sexp(djvubook, pagenumber):  # returns {"words": [...], "coords": [...]} for one page
+    page = [parse_wordline(line) for line in page_sexp(djvubook, pagenumber) \
+            if "word" in line]  # plain substring test: only lines containing "word" are parsed
     return {"words": [a for a, b in page], "coords": [b for a, b in page]}  # parallel lists, same order as page
 
 def parse_book_sexp(djvubook):
diff --git a/string_utils.py b/string_utils.py
index 1b94ce3..3e0706b 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 from Levenshtein import distance as levenshtein
+import re
 
 def simplify(text):
     mapp = [(u"’", u"'"), (u"↑", u"."), (u"…", u"..."), (u"É", u"E"),
@@ -171,3 +172,20 @@ def print_alignment(l1, l2, c2, alignment):
         for word in l2:
             print u"{0:>25} | {1}".format("", word)
 
+
+def alignment_to_sexp(alignment, sexp, l2):  # prints sexp lines; appears intended to swap in words from l2
+    alignment = iter(alignment)  # one alignment entry is consumed per line containing "word"
+    for line in sexp:
+        if "word" not in line:
+            print line  # lines without "word" are echoed unchanged
+        else:
+            index = alignment.next()  # Python 2 iterator API; StopIteration if alignment runs out first
+            if index == -1:
+                break  # NOTE(review): -1 ends the whole loop, so no later line is printed -- confirm intended
+            else:
+                re.sub("(?P<begin>\d+ \d+ \d+ \d+\s) \w+(?P<end>\)+$)",  # NOTE(review): return value is discarded
+                       "\g<begin>{0}\g<end>".format(
+                           " ".join([l2[i] for i in list(index)])),  # index is iterated as positions into l2
+                       line)
+                line.encode('string-escape')  # NOTE(review): result discarded as well; line is not modified
+                print line  # as written, this prints the original, unsubstituted line
author	Guillaume Horel <guillaume.horel@gmail.com>	2013-08-17 19:01:24 -0400
committer	Guillaume Horel <guillaume.horel@gmail.com>	2013-08-17 19:01:24 -0400
commit	1d53df7c99126679d391a1efc96b30aa3848b4d3 (patch)
tree	2d2977bac09ef557b12faa8f8a3a3707f37ec128
parent	f633ea807ef9a83f84f41767e71fad71656a4439 (diff)
download	ocr-layer-curation-1d53df7c99126679d391a1efc96b30aa3848b4d3.tar.gz