diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-18 16:54:53 -0400 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-18 16:54:53 -0400 |
| commit | 17c23c5d2b6680f90117a7804e65dd7fe541848f (patch) | |
| tree | 767a541bde7c7d2f5141994fa503b2f1f4425dc1 /compare.py | |
| parent | 4e99558cb00144d045fe1fc00793b4b16f0e6fab (diff) | |
| download | ocr-layer-curation-17c23c5d2b6680f90117a7804e65dd7fe541848f.tar.gz | |
try to fix the alignment_to_sexp function
Diffstat (limited to 'compare.py')
| -rw-r--r-- | compare.py | 11 |
1 files changed, 8 insertions, 3 deletions
```diff
@@ -1,16 +1,21 @@
 # -*- coding: utf-8 -*-
 from wikisource import get_page
-from parsedjvutext import parse_page_sexp
+from parsedjvutext import page_sexp, parse_page_sexp
 import string_utils as su
+import pdb
-# wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
-wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
+wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
+#wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
 n = 88
 ocrpage = parse_page_sexp(wikibook, n)
 l1, c1 = ocrpage['words'], ocrpage["coords"]
 l2 = get_page(wikibook, n)
+print len(l2.split())
 l3 = su.simplify(l2)
 C = su.align(l3.split(), l1, c1)
+pdb.set_trace()
+sexp = page_sexp(wikibook, n)
+su.alignment_to_sexp(C[1], sexp, l2.split())
 su.print_alignment(l2.split(), l1, c1, C[1])
```
