From 0e8b0c88a4d3009cbbea695f606e49faef27f373 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Sun, 7 Sep 2014 18:21:37 -0400 Subject: Reorganize the code hope I did it right. We have two packages now, one for the server and one for the actual library. --- djvu_utils.py | 67 --------------------- string_utils.py | 161 -------------------------------------------------- utils/__init__.py | 0 utils/djvu_utils.py | 67 +++++++++++++++++++++ utils/string_utils.py | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++ utils/wikisource.py | 93 +++++++++++++++++++++++++++++ web/__init__.py | 2 + web/djvu_utils.py | 1 - web/main.py | 37 ------------ web/server.py | 35 +++++++++++ web/settings.py | 8 +-- web/utils.py | 20 ------- wikisource.py | 80 ------------------------- 13 files changed, 362 insertions(+), 370 deletions(-) delete mode 100644 djvu_utils.py delete mode 100644 string_utils.py create mode 100644 utils/__init__.py create mode 100644 utils/djvu_utils.py create mode 100644 utils/string_utils.py create mode 100644 utils/wikisource.py create mode 100644 web/__init__.py delete mode 120000 web/djvu_utils.py delete mode 100644 web/main.py create mode 100644 web/server.py delete mode 100644 web/utils.py delete mode 100644 wikisource.py diff --git a/djvu_utils.py b/djvu_utils.py deleted file mode 100644 index 21692a1..0000000 --- a/djvu_utils.py +++ /dev/null @@ -1,67 +0,0 @@ -import sys -from bs4 import BeautifulSoup -import djvu -from djvu.decode import Context -from itertools import chain -import collections -from PIL import Image - -def parse_page(page): - s = page.text.sexpr - - def aux(s): - if type(s) is djvu.sexpr.ListExpression: - if len(s) == 0: - pass - if str(s[0].value) == "word": - coords = [s[i].value for i in xrange(1, 5)] - word = s[5].value - yield (word.decode("utf-8"), coords) - else: - for c in chain.from_iterable(aux(child) for child in s[5:]): - yield c - else: - pass - return aux(s) if s else None - -def convert_to_htmlcoord(coords, page_size): - return [",".join(map(str, [c[0], page_size - c[3], - c[2], page_size - c[1]])) for c in coords] - -def get_document(djvufile): - c = Context() - document = c.new_document(djvu.decode.FileURI(djvufile)) - document.decoding_job.wait() - return document - -def parse_book(djvubook, page=None): - """ - returns the list of words and coordinates from a djvu book. - if page is None, returns the whole book. - """ - document = get_document(djvubook) - - if type(page) is int: - toparse = [document.pages[page - 1]] - elif isinstance(page, collections.Iterable): - toparse = [document.pages[p - 1] for p in page] - else: - toparse = document.pages - - return [parse_page(page) for page in toparse] - -def image_from_book(djvubook, page): - document = get_document(djvubook) - mode = djvu.decode.RENDER_COLOR - djvu_pixel_format = djvu.decode.PixelFormatRgb() - page = document.pages[page-1] - page_job = page.decode(wait=True) - width, height = page_job.size - rect = (0, 0, width, height) - buf = page_job.render(mode, rect, rect, djvu_pixel_format) - return Image.frombuffer("RGB", (width, height), buf, 'raw', 'RGB', 0, -1) - -if __name__ == "__main__": - book = parse_book(sys.argv[1], page=[10,11], html=True) - im = image_from_book(sys.argv[1], 11) - im.save("test.jpeg") diff --git a/string_utils.py b/string_utils.py deleted file mode 100644 index b6c8ce0..0000000 --- a/string_utils.py +++ /dev/null @@ -1,161 +0,0 @@ -# -*- coding: utf-8 -*- -from Levenshtein import distance as levenshtein -import re -import itertools - -def simplify(text): - mapp = [(u"’", u"'"), (u"↑", u"."), (u"…", u"..."), (u"É", u"E"), - (u"À", u"A"), (u"Ô", u"O"), (u"—", u"-")] - - for a, b in mapp: - text = text.replace(a, b) - - return text - -def cut(word, left, right): - """Return pair of strings (p + "-", s) such that p+s == word and - L(p + "-", left) + L(s, right) is minimal, where L is the levenshtein - distance. - - Implementation is suboptimal since the computation of the Levenshtein - distances will involve comparing the same segments repeatedly. - - TODO: handle the case when word contains an hyphen (e.g. c'est-a-dire) - """ - - def aux(i): - leftw, rightw = word[:i] + "-", word[i:] - return (leftw, rightw, - levenshtein(leftw, left) + levenshtein(rightw, right)) - - l = [aux(i) for i in xrange(len(word) + 1)] - return min(l, key=lambda x: x[2])[:2] - -def join_ocr_words(l, c): - m = list(l) - if len(l) >= 2 and c[-2][2] > c[-1][0] and (not l[-2][-1].isalnum()): - l[-2] = l[-2][:-1] - return "".join(l) - -def join_words(l): - return "".join(l) - -def align(l1, l2, c2): - """Compute the optimal alignment between two list of words - à la Needleman-Wunsch. - - The function returns a (score, alignment) pair. An alignment is simply - a list of list of size len(l1) giving for each word in l1, the list of - indices in l2 it maps to (the list is empty if the word maps to nothing). - - Note that if the list is of size>1, the word in l1 will map to a sequence - of words in l2. Conversly, consecutive words in l1 can map to - the same word in l2. - """ - - # Throughout the function, l1 is to be thought of as the proofread text, - # and l2 as the OCR text. The deletion costs are not symmetric: removing - # junk from the OCR is frequent while removing a word from the proofread - # text should be rare. - del_cost1 = 50 - def del_cost2(w): - return 1+3*len([c for c in w if c.isalnum()]) - w = 3 # multiplicative cost factor for the Levenshtein distance - - n, m = len(l1), len(l2) - # a is the (score, alignment) matrix. a[i][j] is the (score, alignment) - # pair of the first i words of l1 to the first j words of l2 - a = [[(0, [])] * (m + 1) for i in xrange(n + 1)] - - for j in xrange(1, m + 1): - a[0][j] = j, [] - - for i in xrange(1, n + 1): - a[i][0] = i * del_cost1, [[]] * i - - for j in xrange(1, m + 1): - - s, b = a[i-1][j-1] - d = levenshtein(l1[i-1], l2[j-1]) - min_s, min_b = s + w * d, b + [[j-1]] - - s, b = a[i-1][j] - if s + del_cost1 < min_s: - min_s, min_b = s + del_cost1, b + [[]] - - s, b = a[i][j-1] - if s + del_cost2(l2[j-1]) < min_s: - min_s, min_b = s + del_cost2(l2[j-1]), b - - for k in xrange(1, 8): - for l in xrange(1, 5): - if k + l <= 2: - continue - if k+l > 7: - break - if j < l or i < k: - break - s, b = a[i-k][j-l] - d = levenshtein(join_words(l1[i-k:i]), - join_ocr_words(l2[j-l:j], c2[j-l:j])) - if s + w * d < min_s: - temp = [[j-1]] if l == 1 else [range(j-l, j)] - min_s, min_b = s + w * d, b + temp * k - - a[i][j] = min_s, min_b - - return a[n][m] - -def print_alignment(l1, l2, c2, alignment): - """Given two list of words and an alignment (as defined in :func:`align`) - print the two list of words side-by-side and aligned. - """ - prev = 0 - for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]): - word = " ".join([a[0] for a in g]) - if not index: - print u"{0:>25} | ".format(word) - else: - begin, end = index[0], index[-1] - for i in range(prev, begin-1): - print u"{0:>25} | {1}".format("", l2[i+1]) - prev = end - - if end > begin: - print u"{0:>25} | {1:<25} (M)".format(word, - join_ocr_words(l2[begin:end+1], c2[begin:end+1])) - else: - print u"{0:>25} | {1:<25}".format(word, l2[begin]) - - if not l1: - for word in l2: - print u"{0:>25} | {1}".format("", word) - -def invert_align(alignment, n): - l = [[] for _ in range(n)] - for i, e in enumerate(alignment): - for a in e: - l[a].append(i) - return l - -def alignment_to_coord(l1, alignment): - # l1 list of corrected words - # alignment list of size len(l1) qui mappe mots dans l2 - # returns indices in l2 - - r = [] - prev = 0 - for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]): - word = " ".join([a[0] for a in g]) - r.append([word, index]) - # if not index: - # r.append([word, None]) - # else: - - # begin, end = index[0], index[-1] - # if end > begin: - # #need to find a better way to get the box coordinates - # r.append([word, begin]) - # else: - # r.append([word, begin]) - return r diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/utils/djvu_utils.py b/utils/djvu_utils.py new file mode 100644 index 0000000..21692a1 --- /dev/null +++ b/utils/djvu_utils.py @@ -0,0 +1,67 @@ +import sys +from bs4 import BeautifulSoup +import djvu +from djvu.decode import Context +from itertools import chain +import collections +from PIL import Image + +def parse_page(page): + s = page.text.sexpr + + def aux(s): + if type(s) is djvu.sexpr.ListExpression: + if len(s) == 0: + pass + if str(s[0].value) == "word": + coords = [s[i].value for i in xrange(1, 5)] + word = s[5].value + yield (word.decode("utf-8"), coords) + else: + for c in chain.from_iterable(aux(child) for child in s[5:]): + yield c + else: + pass + return aux(s) if s else None + +def convert_to_htmlcoord(coords, page_size): + return [",".join(map(str, [c[0], page_size - c[3], + c[2], page_size - c[1]])) for c in coords] + +def get_document(djvufile): + c = Context() + document = c.new_document(djvu.decode.FileURI(djvufile)) + document.decoding_job.wait() + return document + +def parse_book(djvubook, page=None): + """ + returns the list of words and coordinates from a djvu book. + if page is None, returns the whole book. + """ + document = get_document(djvubook) + + if type(page) is int: + toparse = [document.pages[page - 1]] + elif isinstance(page, collections.Iterable): + toparse = [document.pages[p - 1] for p in page] + else: + toparse = document.pages + + return [parse_page(page) for page in toparse] + +def image_from_book(djvubook, page): + document = get_document(djvubook) + mode = djvu.decode.RENDER_COLOR + djvu_pixel_format = djvu.decode.PixelFormatRgb() + page = document.pages[page-1] + page_job = page.decode(wait=True) + width, height = page_job.size + rect = (0, 0, width, height) + buf = page_job.render(mode, rect, rect, djvu_pixel_format) + return Image.frombuffer("RGB", (width, height), buf, 'raw', 'RGB', 0, -1) + +if __name__ == "__main__": + book = parse_book(sys.argv[1], page=[10,11], html=True) + im = image_from_book(sys.argv[1], 11) + im.save("test.jpeg") diff --git a/utils/string_utils.py b/utils/string_utils.py new file mode 100644 index 0000000..b6c8ce0 --- /dev/null +++ b/utils/string_utils.py @@ -0,0 +1,161 @@ +# -*- coding: utf-8 -*- +from Levenshtein import distance as levenshtein +import re +import itertools + +def simplify(text): + mapp = [(u"’", u"'"), (u"↑", u"."), (u"…", u"..."), (u"É", u"E"), + (u"À", u"A"), (u"Ô", u"O"), (u"—", u"-")] + + for a, b in mapp: + text = text.replace(a, b) + + return text + +def cut(word, left, right): + """Return pair of strings (p + "-", s) such that p+s == word and + L(p + "-", left) + L(s, right) is minimal, where L is the levenshtein + distance. + + Implementation is suboptimal since the computation of the Levenshtein + distances will involve comparing the same segments repeatedly. + + TODO: handle the case when word contains an hyphen (e.g. c'est-a-dire) + """ + + def aux(i): + leftw, rightw = word[:i] + "-", word[i:] + return (leftw, rightw, + levenshtein(leftw, left) + levenshtein(rightw, right)) + + l = [aux(i) for i in xrange(len(word) + 1)] + return min(l, key=lambda x: x[2])[:2] + +def join_ocr_words(l, c): + m = list(l) + if len(l) >= 2 and c[-2][2] > c[-1][0] and (not l[-2][-1].isalnum()): + l[-2] = l[-2][:-1] + return "".join(l) + +def join_words(l): + return "".join(l) + +def align(l1, l2, c2): + """Compute the optimal alignment between two list of words + à la Needleman-Wunsch. + + The function returns a (score, alignment) pair. An alignment is simply + a list of list of size len(l1) giving for each word in l1, the list of + indices in l2 it maps to (the list is empty if the word maps to nothing). + + Note that if the list is of size>1, the word in l1 will map to a sequence + of words in l2. Conversly, consecutive words in l1 can map to + the same word in l2. + """ + + # Throughout the function, l1 is to be thought of as the proofread text, + # and l2 as the OCR text. The deletion costs are not symmetric: removing + # junk from the OCR is frequent while removing a word from the proofread + # text should be rare. + del_cost1 = 50 + def del_cost2(w): + return 1+3*len([c for c in w if c.isalnum()]) + w = 3 # multiplicative cost factor for the Levenshtein distance + + n, m = len(l1), len(l2) + # a is the (score, alignment) matrix. a[i][j] is the (score, alignment) + # pair of the first i words of l1 to the first j words of l2 + a = [[(0, [])] * (m + 1) for i in xrange(n + 1)] + + for j in xrange(1, m + 1): + a[0][j] = j, [] + + for i in xrange(1, n + 1): + a[i][0] = i * del_cost1, [[]] * i + + for j in xrange(1, m + 1): + + s, b = a[i-1][j-1] + d = levenshtein(l1[i-1], l2[j-1]) + min_s, min_b = s + w * d, b + [[j-1]] + + s, b = a[i-1][j] + if s + del_cost1 < min_s: + min_s, min_b = s + del_cost1, b + [[]] + + s, b = a[i][j-1] + if s + del_cost2(l2[j-1]) < min_s: + min_s, min_b = s + del_cost2(l2[j-1]), b + + for k in xrange(1, 8): + for l in xrange(1, 5): + if k + l <= 2: + continue + if k+l > 7: + break + if j < l or i < k: + break + s, b = a[i-k][j-l] + d = levenshtein(join_words(l1[i-k:i]), + join_ocr_words(l2[j-l:j], c2[j-l:j])) + if s + w * d < min_s: + temp = [[j-1]] if l == 1 else [range(j-l, j)] + min_s, min_b = s + w * d, b + temp * k + + a[i][j] = min_s, min_b + + return a[n][m] + +def print_alignment(l1, l2, c2, alignment): + """Given two list of words and an alignment (as defined in :func:`align`) + print the two list of words side-by-side and aligned. + """ + prev = 0 + for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]): + word = " ".join([a[0] for a in g]) + if not index: + print u"{0:>25} | ".format(word) + else: + begin, end = index[0], index[-1] + for i in range(prev, begin-1): + print u"{0:>25} | {1}".format("", l2[i+1]) + prev = end + + if end > begin: + print u"{0:>25} | {1:<25} (M)".format(word, + join_ocr_words(l2[begin:end+1], c2[begin:end+1])) + else: + print u"{0:>25} | {1:<25}".format(word, l2[begin]) + + if not l1: + for word in l2: + print u"{0:>25} | {1}".format("", word) + +def invert_align(alignment, n): + l = [[] for _ in range(n)] + for i, e in enumerate(alignment): + for a in e: + l[a].append(i) + return l + +def alignment_to_coord(l1, alignment): + # l1 list of corrected words + # alignment list of size len(l1) qui mappe mots dans l2 + # returns indices in l2 + + r = [] + prev = 0 + for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]): + word = " ".join([a[0] for a in g]) + r.append([word, index]) + # if not index: + # r.append([word, None]) + # else: + + # begin, end = index[0], index[-1] + # if end > begin: + # #need to find a better way to get the box coordinates + # r.append([word, begin]) + # else: + # r.append([word, begin]) + return r diff --git a/utils/wikisource.py b/utils/wikisource.py new file mode 100644 index 0000000..589c88e --- /dev/null +++ b/utils/wikisource.py @@ -0,0 +1,93 @@ +# -*- coding: utf-8 -*- +import requests +import sys +from bs4 import BeautifulSoup, NavigableString +from itertools import takewhile, count +from types import SliceType +import string_utils as su +import djvu_utils as du + +URL = "http://fr.wikisource.org/w/index.php" + + +def spanify(string, start=0): + soup = BeautifulSoup() + for i, word in enumerate(string.split()): + span = soup.new_tag("span", id="word-" + str(start + i)) + span.string = word + string.insert_before(span) + string.insert_before(" ") + string.replace_with("") + return start + i + 1 + + +class HtmlText(): + + def __init__(self, elem): + self.elem = elem + start = 0 + strings = list(string for string in self.elem.strings + if string.strip()) + + for string in strings: + start = spanify(string, start) + self.length = start + + def __len__(self): + return self.length + + def __getitem__(self, key): + if type(key) is SliceType: + return [self[w] for w in range(*key.indices(self.length))] + if key >= self.length: + raise IndexError + if key < 0: + key = self.length - key + return self.elem.find("span", {"id": "word-" + str(key)}).text + + def __str__(self): + return str(self.elem) + + +def get_page(title, page): + params = {"action": "render", "title": "Page:" + title + "/" + str(page)} + r = requests.get(URL, params=params) + if r.status_code == requests.codes.ok: + soup = BeautifulSoup(r.text, "lxml") + return soup.select("div.pagetext")[0].text + else: + return None + + +def get_page2(text): + soup = BeautifulSoup(text, "lxml") + elem = soup.select("div.pagetext")[0] + return HtmlText(elem), elem.text + + +def get_pages(title, begin=1, end=None): + if end: + return (get_page(title, i) for i in xrange(begin, end + 1)) + else: + return takewhile(lambda x: x is not None, + (get_page(title, i) for i in count(begin))) + +def gen_html(book, page_number): + doc = du.get_document(book) + page = doc.pages[int(page_number)-1] + d = du.parse_page(page) + corrected_text = get_page(book, int(page_number)) + corrected_words = su.simplify(corrected_text).split() + if d: + orig_words, orig_coords = zip(*d) + C = su.align(corrected_words, list(orig_words), list(orig_coords)) + corr_words = corrected_text.split() + orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1]) + return orig_coords_html, orig_words, corr_words, C[1] + +if __name__ == "__main__": + b = BeautifulSoup("asd") + c = HtmlText(b) + print type(c[0]) + print su.align(c, [u"asd"], None) + print c[0:1] diff --git a/web/__init__.py b/web/__init__.py new file mode 100644 index 0000000..139597f --- /dev/null +++ b/web/__init__.py @@ -0,0 +1,2 @@ + + diff --git a/web/djvu_utils.py b/web/djvu_utils.py deleted file mode 120000 index 0742170..0000000 --- a/web/djvu_utils.py +++ /dev/null @@ -1 +0,0 @@ -../djvu_utils.py \ No newline at end of file diff --git a/web/main.py b/web/main.py deleted file mode 100644 index a6826c1..0000000 --- a/web/main.py +++ /dev/null @@ -1,37 +0,0 @@ -import tornado.httpserver -from tornado.web import RequestHandler, Application -import tornado.ioloop -from settings import settings -import utils -from djvu_utils import image_from_book -import io - -class MainHandler(RequestHandler): - - def get(self, page_number): - orig_coords, orig_words, corr_words, align = \ - utils.gen_html(self.settings["book"], page_number) - self.render("index.html", page_number=page_number, orig_coords=orig_coords, - orig_words=orig_words, corr_words=corr_words, align=align) - -class ImageHandler(RequestHandler): - - def get(self, page_number): - im = image_from_book("../" + self.settings["book"], int(page_number)) - self.set_header('Content-Type', 'image/jpg') - img_buff = io.BytesIO() - im.save(img_buff, format="JPEG") - img_buff.seek(0) - self.write(img_buff.read()) - self.finish() - -application = Application([ - (r'/(\d+)/?', MainHandler), - (r'/(\d+)\.jpg/?', ImageHandler)] - , **settings) - -if __name__ == '__main__': - http_server = tornado.httpserver.HTTPServer(application) - http_server.listen(8888) - print "Listening on 8888" - tornado.ioloop.IOLoop.instance().start() diff --git a/web/server.py b/web/server.py new file mode 100644 index 0000000..1e67ad4 --- /dev/null +++ b/web/server.py @@ -0,0 +1,35 @@ +import tornado.httpserver +from tornado.web import RequestHandler, Application +import tornado.ioloop +from settings import settings +from utils.djvu_utils import image_from_book +from utils.wikisource import gen_html +import io + +class MainHandler(RequestHandler): + + def get(self, page_number): + orig_coords, orig_words, corr_words, align = \ + gen_html(self.settings["book"], page_number) + self.render("index.html", page_number=page_number, orig_coords=orig_coords, + orig_words=orig_words, corr_words=corr_words, align=align) + +class ImageHandler(RequestHandler): + + def get(self, page_number): + im = image_from_book(self.settings["book"], int(page_number)) + self.set_header('Content-Type', 'image/jpg') + img_buff = io.BytesIO() + im.save(img_buff, format="JPEG") + img_buff.seek(0) + self.write(img_buff.read()) + self.finish() + +def run(): + application = Application([ + (r'/(\d+)/?', MainHandler), + (r'/(\d+)\.jpg/?', ImageHandler)], **settings) + http_server = tornado.httpserver.HTTPServer(application) + http_server.listen(8888) + print "Listening on 8888" + tornado.ioloop.IOLoop.instance().start() diff --git a/web/settings.py b/web/settings.py index 5a8c9aa..32693b8 100644 --- a/web/settings.py +++ b/web/settings.py @@ -1,9 +1,9 @@ settings = { "debug": True, - "template_path": "templates", - "static_path": "static", + "template_path": "web/templates", + "static_path": "web/static", "cookie_secret": "toto", "login_url": "/login", - #"book": "Bloy_-_Le_Sang_du_pauvre,_Stock,_1932.djvu" - "book": "Villiers_de_L'Isle-Adam_-_Tribulat_Bonhomet,_1908.djvu" + "book": "Bloy_-_Le_Sang_du_pauvre,_Stock,_1932.djvu" + #"book": "Villiers_de_L'Isle-Adam_-_Tribulat_Bonhomet,_1908.djvu" } diff --git a/web/utils.py b/web/utils.py deleted file mode 100644 index 7e20858..0000000 --- a/web/utils.py +++ /dev/null @@ -1,20 +0,0 @@ -import djvu_utils as du -import sys -import string_utils as su -from wikisource import get_page - -def gen_html(book, page_number): - doc = du.get_document("../" + book) - page = doc.pages[int(page_number)-1] - d = du.parse_page(page) - corrected_text = get_page(book, int(page_number)) - corrected_words = su.simplify(corrected_text).split() - if d: - orig_words, orig_coords = zip(*d) - C = su.align(corrected_words, list(orig_words), list(orig_coords)) - corr_words = corrected_text.split() - orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1]) - return orig_coords_html, orig_words, corr_words, C[1] - -if __name__ == "__main__": - gen_html(*sys.argv[1:3]) diff --git a/wikisource.py b/wikisource.py deleted file mode 100644 index af72d34..0000000 --- a/wikisource.py +++ /dev/null @@ -1,80 +0,0 @@ -# -*- coding: utf-8 -*- -import requests -import sys -from bs4 import BeautifulSoup, NavigableString -from itertools import takewhile, count -from types import SliceType -from string_utils import align - -URL = "http://fr.wikisource.org/w/index.php" - - -def spanify(string, start=0): - soup = BeautifulSoup() - for i, word in enumerate(string.split()): - span = soup.new_tag("span", id="word-" + str(start + i)) - span.string = word - string.insert_before(span) - string.insert_before(" ") - string.replace_with("") - return start + i + 1 - - -class HtmlText(): - - def __init__(self, elem): - self.elem = elem - start = 0 - strings = list(string for string in self.elem.strings - if string.strip()) - - for string in strings: - start = spanify(string, start) - self.length = start - - def __len__(self): - return self.length - - def __getitem__(self, key): - if type(key) is SliceType: - return [self[w] for w in range(*key.indices(self.length))] - if key >= self.length: - raise IndexError - if key < 0: - key = self.length - key - return self.elem.find("span", {"id": "word-" + str(key)}).text - - def __str__(self): - return str(self.elem) - - -def get_page(title, page): - params = {"action": "render", "title": "Page:" + title + "/" + str(page)} - r = requests.get(URL, params=params) - if r.status_code == requests.codes.ok: - soup = BeautifulSoup(r.text, "lxml") - return soup.select("div.pagetext")[0].text - else: - return None - - -def get_page2(text): - soup = BeautifulSoup(text, "lxml") - elem = soup.select("div.pagetext")[0] - return HtmlText(elem), elem.text - - -def get_pages(title, begin=1, end=None): - if end: - return (get_page(title, i) for i in xrange(begin, end + 1)) - else: - return takewhile(lambda x: x is not None, - (get_page(title, i) for i in count(begin))) - - -if __name__ == "__main__": - b = BeautifulSoup("asd") - c = HtmlText(b) - print type(c[0]) - print align(c, [u"asd"], None) - print c[0:1] -- cgit v1.2.3-70-g09d2