From 6e694d555e1004da58ec3425d33043b2f1b5f715 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Fri, 28 Feb 2014 17:31:19 -0500 Subject: update with the new functions --- compare.py | 13 +++++-------- parsedjvutext.py | 2 +- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/compare.py b/compare.py index 5f2352b..492dd53 100644 --- a/compare.py +++ b/compare.py @@ -1,21 +1,18 @@ # -*- coding: utf-8 -*- from wikisource import get_page -from parsedjvutext import page_sexp, parse_page_sexp +from parsedjvutext import parse_book import string_utils as su -import pdb wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_") #wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_") n = 88 -ocrpage = parse_page_sexp(wikibook, n) -l1, c1 = ocrpage['words'], ocrpage["coords"] +ocrpage = parse_book(wikibook, n) +l1, c1 = zip(*ocrpage[0]) +l1 = list(l1) +c1 = list(c1) l2 = get_page(wikibook, n) -print len(l2.split()) l3 = su.simplify(l2) C = su.align(l3.split(), l1, c1) -#pdb.set_trace() -#sexp = page_sexp(wikibook, n) -#su.alignment_to_sexp(C[1], sexp, l2.split()) su.print_alignment(l2.split(), l1, c1, C[1]) diff --git a/parsedjvutext.py b/parsedjvutext.py index ad98d1d..6bd9950 100644 --- a/parsedjvutext.py +++ b/parsedjvutext.py @@ -21,7 +21,7 @@ def parse_page(page, html=False): else: coords = [s[i].value for i in xrange(1, 5)] word = s[5].value - yield (word, coords) + yield (word.decode("utf-8"), coords) else: for c in chain.from_iterable(aux(child, html) for child in s[5:]): yield c -- cgit v1.2.3-70-g09d2