diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-05 00:05:59 -0400 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-05 00:05:59 -0400 |
| commit | 80133c8d2d0a7334e8f53a11eaa48a8d7b70c1da (patch) | |
| tree | d3341f75c05fe7c8ff49638ebb57c8f8ab242a27 | |
| parent | 47f58b93fc71a2820dd6d1e1038d11b1b80a475c (diff) | |
| download | ocr-layer-curation-80133c8d2d0a7334e8f53a11eaa48a8d7b70c1da.tar.gz | |
use new functions in compare.py
| -rw-r--r-- | compare.py | 19 | ||||
| -rw-r--r-- | string_utils.py | 2 |
2 files changed, 10 insertions, 11 deletions
```diff
@@ -1,15 +1,12 @@
-import pdb
 from wikisource import get_page
-from parsedjvutext import parse_book
-import lcs
+from parsedjvutext import parse_page_sexp
+from string_utils import LCS, printDiff
 
 wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu"
 
-ocrbook = "Tribulat Bonhomet.xml"
-ocrbook = parse_book(ocrbook)
-
-n = 14
-l1 = ocrbook['words'][n]
-l2 = get_page(wikibook, n+1).split()
-C = lcs.LCS(l1, l2)
-lcs.printDiff(C, l1, l2, len(l1), len(l2))
+n = 42
+ocrpage = parse_page_sexp(wikibook, n)
+l1 = ocrpage['words']
+l2 = get_page(wikibook, n).split()
+C = LCS(l1, l2)
+printDiff(C, l1, l2, len(l1), len(l2))
diff --git a/string_utils.py b/string_utils.py
index 81f448f..8b7a3a3 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 def levenshtein(word1, word2):
     """Return triplet of number of (substitutions, insertions, deletions) to
     transform word1 into word2.
```
