utils/wikisource.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86

# -*- coding: utf-8 -*-
import requests
import sys
from bs4 import BeautifulSoup, NavigableString
from itertools import takewhile, count
from types import SliceType
import string_utils as su
import djvu_utils as du

URL = "http://fr.wikisource.org/w/index.php"


def spanify(string, start=0):
    soup = BeautifulSoup()
    for i, word in enumerate(string.split()):
        span = soup.new_tag("span", id="word-" + str(start + i))
        span.string = word
        string.insert_before(span)
        string.insert_before(" ")
    string.replace_with("")
    return start + i + 1


class HtmlText():

    def __init__(self, elem):
        self.elem = elem
        start = 0
        strings = list(string for string in self.elem.strings
                       if string.strip())

        for string in strings:
            start = spanify(string, start)
        self.length = start

    def __len__(self):
        return self.length

    def __getitem__(self, key):
        if type(key) is SliceType:
            return [unicode(self[w]) for w in range(*key.indices(self.length))]
        if key >= self.length:
            raise IndexError
        if key < 0:
            key = self.length - key
        return self.elem.find("span", {"id": "word-" + str(key)}).text

    def __str__(self):
        return str(self.elem)


def get_page(title, page):
    params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
    r = requests.get(URL, params=params)
    if r.status_code == requests.codes.ok:
        soup = BeautifulSoup(r.text, "lxml")
        return HtmlText(soup.select("div.pagetext")[0])
    else:
        return None

def get_pages(title, begin=1, end=None):
    if end:
        return (get_page(title, i) for i in xrange(begin, end + 1))
    else:
        return takewhile(lambda x: x is not None,
                         (get_page(title, i) for i in count(begin)))

def gen_html(book, page_number):
    doc = du.get_document(book)
    page = doc.pages[int(page_number)-1]
    d = du.parse_page(page)
    corrected_text = get_page(book, int(page_number))
    corrected_words = su.simplify(corrected_text.elem.text).split()
    if d:
        orig_words, orig_coords = zip(*d)
        C = su.align(corrected_words, list(orig_words), list(orig_coords))
        corr_words = corrected_text
        orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1])
    return orig_coords_html, orig_words, corr_words, C[1]

if __name__ == "__main__":
    wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
    test = get_page(wikibook, 28)
    # print type(c[0])
    # print su.align(c, [u"asd"], None)
    # print c[0:1]