diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2013-08-17 14:59:36 +0200 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2013-08-17 14:59:36 +0200 |
| commit | b5a00a5e914da988dcd81c6d276f7bb22a46aa20 (patch) | |
| tree | 366237b3f5f64acb02a9089e0635eca3ddb7300e /compare.py | |
| parent | 241e0f3f7cf72f1d771ed0b4651542168b16329e (diff) | |
| download | ocr-layer-curation-b5a00a5e914da988dcd81c6d276f7bb22a46aa20.tar.gz | |
Some tweaks
Diffstat (limited to 'compare.py')
| -rw-r--r-- | compare.py | 11 |
1 files changed, 6 insertions, 5 deletions
```diff
@@ -4,12 +4,13 @@ from wikisource import get_page
 from parsedjvutext import parse_page_sexp
 import string_utils as su
-wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
-# wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
+# wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
+wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
 n = 79
 ocrpage = parse_page_sexp(wikibook, n)
 l1 = ocrpage['words']
-l2 = get_page(wikibook, n).replace(u"’", u"'").split()
-C = su.align(l2, l1)
-su.print_alignment(l2, l1, C[1])
+l2 = get_page(wikibook, n)
+l3 = su.simplify(l2)
+C = su.align(l2.split(), l1)
+su.print_alignment(l3.split(), l1, C[1])
```
