aboutsummaryrefslogtreecommitdiffstats
path: root/utils/wikisource.py
diff options
context:
space:
mode:
Diffstat (limited to 'utils/wikisource.py')
-rw-r--r-- utils/wikisource.py 25
1 files changed, 9 insertions, 16 deletions
diff --git a/utils/wikisource.py b/utils/wikisource.py
index 589c88e..283d5fe 100644
--- a/utils/wikisource.py
+++ b/utils/wikisource.py
@@ -38,7 +38,7 @@ class HtmlText():
def __getitem__(self, key):
if type(key) is SliceType:
- return [self[w] for w in range(*key.indices(self.length))]
+ return [unicode(self[w]) for w in range(*key.indices(self.length))]
if key >= self.length:
raise IndexError
if key < 0:
@@ -54,17 +54,10 @@ def get_page(title, page):
r = requests.get(URL, params=params)
if r.status_code == requests.codes.ok:
soup = BeautifulSoup(r.text, "lxml")
- return soup.select("div.pagetext")[0].text
+ return HtmlText(soup.select("div.pagetext")[0])
else:
return None
-
-def get_page2(text):
- soup = BeautifulSoup(text, "lxml")
- elem = soup.select("div.pagetext")[0]
- return HtmlText(elem), elem.text
-
-
def get_pages(title, begin=1, end=None):
if end:
return (get_page(title, i) for i in xrange(begin, end + 1))
@@ -77,17 +70,17 @@ def gen_html(book, page_number):
page = doc.pages[int(page_number)-1]
d = du.parse_page(page)
corrected_text = get_page(book, int(page_number))
- corrected_words = su.simplify(corrected_text).split()
+ corrected_words = su.simplify(corrected_text.elem.text).split()
if d:
orig_words, orig_coords = zip(*d)
C = su.align(corrected_words, list(orig_words), list(orig_coords))
- corr_words = corrected_text.split()
+ corr_words = corrected_text
orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1])
return orig_coords_html, orig_words, corr_words, C[1]
if __name__ == "__main__":
- b = BeautifulSoup("<a>asd</a>")
- c = HtmlText(b)
- print type(c[0])
- print su.align(c, [u"asd"], None)
- print c[0:1]
+ wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
+ test = get_page(wikibook, 28)
+ # print type(c[0])
+ # print su.align(c, [u"asd"], None)
+ # print c[0:1]