1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
|
# -*- coding: utf-8 -*-
import requests
import sys
from bs4 import BeautifulSoup, NavigableString
from itertools import takewhile, count
from types import SliceType
from string_utils import align
# index.php endpoint of French Wikisource (fr.wikisource.org); get_page()
# sends its requests to this address.
URL = "http://fr.wikisource.org/w/index.php"
def spanify(string, start=0):
    """Replace a text node with one ``<span id="word-N">`` per word.

    The node is split on whitespace. Every word is wrapped in a new
    ``span`` tag whose id is ``"word-"`` followed by a running index, and
    each span, followed by a single space, is inserted in front of the
    original node. The original node is finally replaced by an empty
    string.

    Args:
        string: Text node attached to a parsed tree. It must provide
            ``split``, ``insert_before`` and ``replace_with``.
        start: Index given to the first word of this node.

    Returns:
        The index to use for the first word of the next node, i.e.
        ``start`` plus the number of words found in this node.
    """
    soup = BeautifulSoup()  # only serves as a factory for the new tags
    words = string.split()
    for offset, word in enumerate(words):
        span = soup.new_tag("span", id="word-" + str(start + offset))
        span.string = word
        string.insert_before(span)
        string.insert_before(" ")
    string.replace_with("")
    # Count the words explicitly: reading the loop variable after the loop
    # raises UnboundLocalError for a node that contains no word at all.
    return start + len(words)
class HtmlText(object):
    """Word-indexed, read-only view over the text of a parsed HTML element.

    On construction every non-blank text node below the element is passed
    to ``spanify`` so that each word ends up in its own
    ``<span id="word-N">``. Afterwards the instance supports ``len()``,
    integer indexing (including negative indices) and slicing.
    """

    def __init__(self, elem):
        """Wrap *elem* and number each of its words.

        Args:
            elem: Parsed element exposing ``strings`` and ``find``.
        """
        self.elem = elem
        # spanify() edits the tree, so all text nodes are collected before
        # any of them is modified.
        text_nodes = [node for node in self.elem.strings if node.strip()]
        next_index = 0
        for node in text_nodes:
            next_index = spanify(node, next_index)
        # Total number of words found below the element.
        self.length = next_index

    def __len__(self):
        """Return the number of words."""
        return self.length

    def __getitem__(self, key):
        """Return the word at *key*, or a list of words for a slice.

        Raises:
            IndexError: If an integer *key* lies outside
                ``-len(self) <= key < len(self)``.
        """
        if isinstance(key, slice):
            return [self[i] for i in range(*key.indices(self.length))]
        if key < 0:
            # Negative indices count from the end, as for built-in sequences.
            key += self.length
        if not 0 <= key < self.length:
            raise IndexError("word index out of range")
        return self.elem.find("span", {"id": "word-" + str(key)}).text

    def __str__(self):
        """Return the string form of the wrapped element."""
        return str(self.elem)
def get_page(title, page):
    """Fetch the text of one page of a work.

    Requests ``URL`` with ``action=render`` and the title
    ``"Page:<title>/<page>"``, then extracts the text of the first
    ``div.pagetext`` element found in the response.

    Args:
        title: Title of the work, without the ``Page:`` prefix.
        page: Page number; it is converted with ``str()``.

    Returns:
        The text of the page, or ``None`` when the response status is not
        OK or the response contains no ``div.pagetext`` element.
    """
    params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
    response = requests.get(URL, params=params)
    if response.status_code != requests.codes.ok:
        return None
    matches = BeautifulSoup(response.text, "lxml").select("div.pagetext")
    if not matches:
        # An OK answer without the expected container is reported the same
        # way as a failed request instead of raising IndexError.
        return None
    return matches[0].text
def get_page2(text):
    """Parse *text* as HTML and wrap its first ``div.pagetext`` element.

    Args:
        text: HTML markup containing at least one ``div.pagetext`` element.

    Returns:
        A ``(HtmlText, text)`` tuple. The second item is the element's
        text read after the ``HtmlText`` wrapper has been built.

    Raises:
        IndexError: If the markup has no ``div.pagetext`` element.
    """
    page_div = BeautifulSoup(text, "lxml").select("div.pagetext")[0]
    # Build the wrapper first: it rewrites the element, and the plain text
    # is taken from the rewritten tree.
    wrapped = HtmlText(page_div)
    plain_text = page_div.text
    return wrapped, plain_text
def get_pages(title, begin=1, end=None):
    """Lazily fetch consecutive pages of a work with ``get_page``.

    Args:
        title: Title of the work, passed through to ``get_page``.
        begin: Number of the first page to fetch.
        end: Number of the last page to fetch (inclusive). When ``None``,
            pages are fetched until ``get_page`` returns ``None``.

    Returns:
        An iterator over the values returned by ``get_page``. With an
        explicit *end* the iterator may contain ``None`` entries; without
        one it stops before the first ``None``.
    """
    # Compare against None explicitly so that end=0 is honoured as a bound
    # instead of silently switching to the unbounded mode.
    if end is not None:
        return (get_page(title, number) for number in range(begin, end + 1))
    fetched = (get_page(title, number) for number in count(begin))
    return takewhile(lambda text: text is not None, fetched)
if __name__ == "__main__":
    # Manual smoke test: wrap a one-word document, then show the type of
    # its first word, the result of aligning it, and a one-word slice.
    sample_soup = BeautifulSoup("<a>asd</a>")
    words = HtmlText(sample_soup)
    print(type(words[0]))
    print(align(words, [u"asd"], None))
    print(words[0:1])
|