1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
# -*- coding: utf-8 -*-
from wikisource import get_page
from parsedjvutext import page_sexp, parse_page_sexp
import string_utils as su
import pdb
# Input selection: the book is identified by its .djvu file name with spaces
# replaced by underscores, and `n` is the page number handed to every helper.
wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
# Alternative book kept for manual switching.
#wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu".replace(" ", "_")
n = 88

# OCR side: the parsed page is indexed by 'words' and "coords".
ocrpage = parse_page_sexp(wikibook, n)
l1, c1 = ocrpage['words'], ocrpage["coords"]

# Text side: page text fetched through the wikisource helper.
l2 = get_page(wikibook, n)
# Number of whitespace-separated words in the fetched text.  The call is
# parenthesised so the line parses on Python 3 as well; with a single
# argument it prints exactly the same thing on Python 2.
print(len(l2.split()))
# Simplified form of the text; this is what gets aligned further down.
l3 = su.simplify(l2)
def del_cost1(w, pos):
    """Return a constant cost of 50.

    Both ``w`` and ``pos`` are accepted but ignored, so this function shares
    its signature with ``del_cost2``. Judging by the name this is a deletion
    cost; no caller is visible in this file.
    """
    return 50
def del_cost2(w, pos):
    """Return a cost that grows with the number of alphanumeric characters.

    The result is ``1 + 3 * k``, where ``k`` counts the characters of ``w``
    for which ``isalnum()`` is true. Punctuation and whitespace therefore add
    nothing, and an empty ``w`` costs 1. ``pos`` is accepted but ignored.
    Judging by the name this is a deletion cost; no caller is visible in
    this file.
    """
    # Count with a generator so no intermediate list is built.
    alnum_count = sum(1 for c in w if c.isalnum())
    return 1 + 3 * alnum_count
# NOTE(review): neither of these two values is read anywhere in the visible
# script, and "bactrack1" looks like a typo for "backtrack1" -- confirm that
# nothing else relies on the misspelt name before renaming it.
bactrack1 = 8
backtrack2 = 5
# Align the words of the simplified text (l3) with the OCR words (l1) and
# their coordinates (c1).
C = su.align(l3.split(), l1, c1)
# Interactive breakpoint: execution stops here and drops into the debugger,
# so the script cannot run unattended while this line is present.
pdb.set_trace()
# Only element 1 of the align() result is used from here on.  Note that the
# words passed below come from the raw text (l2), not the simplified text
# (l3) that was aligned -- presumably simplify() keeps the word count
# unchanged; TODO confirm.
su.print_alignment(l2.split(), l1, c1, C[1])
sexp = page_sexp(wikibook, n)
su.alignment_to_sexp(C[1], sexp, l2.split())
# Same call and arguments as the print_alignment() above; presumably repeated
# to show the state after alignment_to_sexp() has run -- TODO confirm whether
# that helper changes any of its arguments.
su.print_alignment(l2.split(), l1, c1, C[1])
|