diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2013-08-05 22:29:24 +0200 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2013-08-05 22:45:45 +0200 |
| commit | b51137379fab2d5579e0caeada5389c682ec3cc5 (patch) | |
| tree | 49e1c6bab1a69143e9aa290e48e5849e7b2b6756 /compare.py | |
| parent | 80133c8d2d0a7334e8f53a11eaa48a8d7b70c1da (diff) | |
| download | ocr-layer-curation-b51137379fab2d5579e0caeada5389c682ec3cc5.tar.gz | |
Use a Needleman-Wunsch type algorithm for text alignment
Diffstat (limited to 'compare.py')
| -rw-r--r-- | compare.py | 15 |
1 file changed, 9 insertions, 6 deletions
```diff
@@ -1,12 +1,15 @@
+# -*- coding: utf-8 -*-
+
 from wikisource import get_page
 from parsedjvutext import parse_page_sexp
-from string_utils import LCS, printDiff
+import string_utils as su
 
-wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu"
+wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
+# wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
 
-n = 42
+n = 79
 ocrpage = parse_page_sexp(wikibook, n)
 l1 = ocrpage['words']
-l2 = get_page(wikibook, n).split()
-C = LCS(l1, l2)
-printDiff(C, l1, l2, len(l1), len(l2))
+l2 = get_page(wikibook, n).replace(u"’", u"'").split()
+C = su.align(l2, l1)
+su.print_alignment(l2, l1, C[1])
```
