about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- compare.py 13
-rw-r--r-- parsedjvutext.py 2
2 files changed, 6 insertions, 9 deletions
diff --git a/compare.py b/compare.py
index 5f2352b..492dd53 100644
--- a/compare.py
+++ b/compare.py
@@ -1,21 +1,18 @@
# -*- coding: utf-8 -*-
from wikisource import get_page
-from parsedjvutext import page_sexp, parse_page_sexp
+from parsedjvutext import parse_book
import string_utils as su
-import pdb
wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
#wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
n = 88
-ocrpage = parse_page_sexp(wikibook, n)
-l1, c1 = ocrpage['words'], ocrpage["coords"]
+ocrpage = parse_book(wikibook, n)
+l1, c1 = zip(*ocrpage[0])
+l1 = list(l1)
+c1 = list(c1)
l2 = get_page(wikibook, n)
-print len(l2.split())
l3 = su.simplify(l2)
C = su.align(l3.split(), l1, c1)
-#pdb.set_trace()
-#sexp = page_sexp(wikibook, n)
-#su.alignment_to_sexp(C[1], sexp, l2.split())
su.print_alignment(l2.split(), l1, c1, C[1])
diff --git a/parsedjvutext.py b/parsedjvutext.py
index ad98d1d..6bd9950 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -21,7 +21,7 @@ def parse_page(page, html=False):
else:
coords = [s[i].value for i in xrange(1, 5)]
word = s[5].value
- yield (word, coords)
+ yield (word.decode("utf-8"), coords)
else:
for c in chain.from_iterable(aux(child, html) for child in s[5:]):
yield c