Try to fix the alignment_to_sexp function

author: Guillaume Horel <guillaume.horel@gmail.com> 2013-08-18 16:54:53 -0400
committer: Guillaume Horel <guillaume.horel@gmail.com> 2013-08-18 16:54:53 -0400
commit: 17c23c5d2b6680f90117a7804e65dd7fe541848f (patch)
tree: 767a541bde7c7d2f5141994fa503b2f1f4425dc1
parent: 4e99558cb00144d045fe1fc00793b4b16f0e6fab (diff)
download: ocr-layer-curation-17c23c5d2b6680f90117a7804e65dd7fe541848f.tar.gz
2 files changed, 18 insertions, 8 deletions
diff --git a/compare.py b/compare.py
index 7305517..b6a4c8a 100644
--- a/compare.py
+++ b/compare.py
@@ -1,16 +1,21 @@
 # -*- coding: utf-8 -*-
 
 from wikisource import get_page
-from parsedjvutext import parse_page_sexp
+from parsedjvutext import page_sexp, parse_page_sexp
 import string_utils as su
+import pdb
 
-# wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
-wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
+wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
+#wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
 
 n = 88
 ocrpage = parse_page_sexp(wikibook, n)
 l1, c1 = ocrpage['words'], ocrpage["coords"]
 l2 = get_page(wikibook, n)
+print len(l2.split())
 l3 = su.simplify(l2)
 C = su.align(l3.split(), l1, c1)
+pdb.set_trace()
+sexp = page_sexp(wikibook, n)
+su.alignment_to_sexp(C[1], sexp, l2.split())
 su.print_alignment(l2.split(), l1, c1, C[1])
diff --git a/string_utils.py b/string_utils.py
index c4439da..12d22b8 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -172,9 +172,14 @@ def alignment_to_sexp(alignment, sexp, l2):
             if index == -1:
                 break
             else:
-                re.sub("(?P<begin>\d+ \d+ \d+ \d+\s) \w+(?P<end>\)+$)",
-                       "\g<begin>{0}\g<end>".format(
-                           " ".join([l2[i] for i in list(index)])),
-                       line)
-                line.encode('string-escape')
+                if type(index) == tuple:
+                    word = " ".join([l2[i] for i in list(index)])
+                else:
+                    try:
+                        word = l2[index]
+                    except IndexError:
+                        print index
+                word = word.encode("utf-8").encode("string-escape")
+                re.sub("(?P<begin>\d+ \d+ \d+ \d+\s)    \w+(?P<end>\)+$)",
+                       "\g<begin>{0}\g<end>".format(word), line)
                 print line
author	Guillaume Horel <guillaume.horel@gmail.com>	2013-08-18 16:54:53 -0400
committer	Guillaume Horel <guillaume.horel@gmail.com>	2013-08-18 16:54:53 -0400
commit	17c23c5d2b6680f90117a7804e65dd7fe541848f (patch)
tree	767a541bde7c7d2f5141994fa503b2f1f4425dc1
parent	4e99558cb00144d045fe1fc00793b4b16f0e6fab (diff)
download	ocr-layer-curation-17c23c5d2b6680f90117a7804e65dd7fe541848f.tar.gz