Update compare.py to use the new parse_book function; parse_page now yields words decoded from UTF-8

author: Guillaume Horel <guillaume.horel@serenitascapital.com> 2014-02-28 17:31:19 -0500
committer: Guillaume Horel <guillaume.horel@serenitascapital.com> 2014-02-28 17:31:54 -0500
commit: 6e694d555e1004da58ec3425d33043b2f1b5f715 (patch)
tree: 01fc9f907401f50cf6c869c7e1c057287b2dc405
parent: aaa42a8efcd53576ced9bf2311e84d8ff2a5c8cf (diff)
download: ocr-layer-curation-6e694d555e1004da58ec3425d33043b2f1b5f715.tar.gz
2 files changed, 6 insertions, 9 deletions
diff --git a/compare.py b/compare.py
index 5f2352b..492dd53 100644
--- a/compare.py
+++ b/compare.py
@@ -1,21 +1,18 @@
 # -*- coding: utf-8 -*-
 
 from wikisource import get_page
-from parsedjvutext import page_sexp, parse_page_sexp
+from parsedjvutext import parse_book
 import string_utils as su
-import pdb
 
 wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
 #wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
 
 n = 88
-ocrpage = parse_page_sexp(wikibook, n)
-l1, c1 = ocrpage['words'], ocrpage["coords"]
+ocrpage = parse_book(wikibook, n)
+l1, c1 = zip(*ocrpage[0])
+l1 = list(l1)
+c1 = list(c1)
 l2 = get_page(wikibook, n)
-print len(l2.split())
 l3 = su.simplify(l2)
 C = su.align(l3.split(), l1, c1)
-#pdb.set_trace()
-#sexp = page_sexp(wikibook, n)
-#su.alignment_to_sexp(C[1], sexp, l2.split())
 su.print_alignment(l2.split(), l1, c1, C[1])
diff --git a/parsedjvutext.py b/parsedjvutext.py
index ad98d1d..6bd9950 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -21,7 +21,7 @@ def parse_page(page, html=False):
                 else:
                     coords = [s[i].value for i in xrange(1, 5)]
                 word = s[5].value
-                yield (word, coords)
+                yield (word.decode("utf-8"), coords)
             else:
                 for c in chain.from_iterable(aux(child, html) for child in s[5:]):
                     yield c
author	Guillaume Horel <guillaume.horel@serenitascapital.com>	2014-02-28 17:31:19 -0500
committer	Guillaume Horel <guillaume.horel@serenitascapital.com>	2014-02-28 17:31:54 -0500
commit	6e694d555e1004da58ec3425d33043b2f1b5f715 (patch)
tree	01fc9f907401f50cf6c869c7e1c057287b2dc405
parent	aaa42a8efcd53576ced9bf2311e84d8ff2a5c8cf (diff)
download	ocr-layer-curation-6e694d555e1004da58ec3425d33043b2f1b5f715.tar.gz