From 6e694d555e1004da58ec3425d33043b2f1b5f715 Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@serenitascapital.com>
Date: Fri, 28 Feb 2014 17:31:19 -0500
Subject: update compare.py to use the new parse_book function; yield OCR words decoded from UTF-8

---
 compare.py       | 13 +++++--------
 parsedjvutext.py |  2 +-
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/compare.py b/compare.py
index 5f2352b..492dd53 100644
--- a/compare.py
+++ b/compare.py
@@ -1,21 +1,18 @@
 # -*- coding: utf-8 -*-
 
 from wikisource import get_page
-from parsedjvutext import page_sexp, parse_page_sexp
+from parsedjvutext import parse_book
 import string_utils as su
-import pdb
 
 wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
 #wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
 
 n = 88
-ocrpage = parse_page_sexp(wikibook, n)
-l1, c1 = ocrpage['words'], ocrpage["coords"]
+ocrpage = parse_book(wikibook, n)
+l1, c1 = zip(*ocrpage[0])
+l1 = list(l1)
+c1 = list(c1)
 l2 = get_page(wikibook, n)
-print len(l2.split())
 l3 = su.simplify(l2)
 C = su.align(l3.split(), l1, c1)
-#pdb.set_trace()
-#sexp = page_sexp(wikibook, n)
-#su.alignment_to_sexp(C[1], sexp, l2.split())
 su.print_alignment(l2.split(), l1, c1, C[1])
diff --git a/parsedjvutext.py b/parsedjvutext.py
index ad98d1d..6bd9950 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -21,7 +21,7 @@ def parse_page(page, html=False):
                 else:
                     coords = [s[i].value for i in xrange(1, 5)]
                 word = s[5].value
-                yield (word, coords)
+                yield (word.decode("utf-8"), coords)
             else:
                 for c in chain.from_iterable(aux(child, html) for child in s[5:]):
                     yield c
-- 
cgit v1.2.3-70-g09d2