# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from itertools import takewhile, count
import string_utils as su
import djvu_utils as du

URL = "http://fr.wikisource.org/w/index.php"


def spanify(string, start=0):
    """Replace a NavigableString with one <span data-id=i> per word.

    `string` must contain at least one word (whitespace-only strings are
    filtered out by the caller); `start` is the data-id given to the first
    word.  Returns the data-id following the last word.
    """
    soup = BeautifulSoup("", "lxml")  # throwaway soup, only used to build tags
    for i, word in enumerate(string.split()):
        span = soup.new_tag("span")
        span["data-id"] = str(start + i)  # store as str so find() lookups match
        span.string = word
        string.insert_before(span)
        string.insert_before(" ")
    string.replace_with("")
    return start + i + 1
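
# A minimal sketch of what spanify does to a text node (the markup is an
# assumption for illustration):
#   before: <p>Le sang du pauvre</p>
#   after:  <p><span data-id="0">Le</span> <span data-id="1">sang</span>
#           <span data-id="2">du</span> <span data-id="3">pauvre</span> </p>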


class HtmlText(object):
    """Wrap the corrected HTML of a Wikisource page with extra information
    to facilitate the mapping.

    At initialization, each word is wrapped in a <span> with attribute
    data-id=i, where i is the index of the corrected word.  Once `align`
    is set, each span also gets an id attribute of the form
    id="corr-x,y,z", where x, y and z are ids in the image map.
    """

    def __init__(self, elem):
        self._elem = elem
        start = 0
        # Materialize the list first: spanify mutates the tree, which would
        # invalidate a live .strings generator.
        strings = [string for string in self._elem.strings
                   if string.strip()]
        for string in strings:
            start = spanify(string, start)
        self._length = start
        self._align = None

    def __len__(self):
        return self._length

    def __getitem__(self, key):
        if isinstance(key, slice):
            return [unicode(self[w]) for w in range(*key.indices(len(self)))]
        if key < 0:
            key += len(self)
        if not 0 <= key < len(self):
            raise IndexError("word index out of range")
        return self._elem.find("span", {"data-id": str(key)}).text

    def __unicode__(self):
        return self._elem.text

    def __str__(self):
        return unicode(self).encode("utf-8")

    @property
    def align(self):
        return self._align

    @align.setter
    def align(self, val):
        self._align = val
        for i in range(len(self)):
            span = self._elem.find("span", {"data-id": str(i)})
            span["id"] = "corr-" + ",".join(map(str, val[i]))


def get_page(title, page):
    """Fetch one rendered page of a proofread book from fr.wikisource.org,
    or return None if the request fails."""
    params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
    r = requests.get(URL, params=params)
    if r.status_code == requests.codes.ok:
        soup = BeautifulSoup(r.text, "lxml")
        return HtmlText(soup.select("div.pagetext")[0])
    else:
        return None


def get_pages(title, begin=1, end=None):
    """Generate pages `begin` through `end` (inclusive); when `end` is
    None, keep going until a page cannot be fetched."""
    if end:
        return (get_page(title, i) for i in xrange(begin, end + 1))
    else:
        return takewhile(lambda x: x is not None,
                         (get_page(title, i) for i in count(begin)))
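
# Example use (network access assumed; the title comes from the __main__
# block below):
#   for page in get_pages("Bloy_-_Le_Sang_du_pauvre,_Stock,_1932.djvu", 28, 30):
#       print unicode(page)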


def gen_html(book, page_number):
    """Align the corrected Wikisource text of one page with the words and
    coordinates extracted from the DjVu original."""
    d = du.parse_book(book, page_number)[0]
    corrected_text = get_page(book, int(page_number))
    corrected_words = su.simplify(unicode(corrected_text)).split()
    orig_words, orig_coords = (), ()  # stay defined even if the page is empty
    if d:
        orig_words, orig_coords = zip(*d)
        C = su.align(corrected_words, list(orig_words), list(orig_coords))
        corrected_text.align = C[1]
    return orig_coords, orig_words, corrected_text


if __name__ == "__main__":
    wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
    coords, words, text = gen_html(wikibook, 28)