diff options
Diffstat (limited to 'utils/wikisource.py')
| -rw-r--r-- | utils/wikisource.py | 25 |
1 file changed, 9 insertions, 16 deletions
diff --git a/utils/wikisource.py b/utils/wikisource.py
index 589c88e..283d5fe 100644
--- a/utils/wikisource.py
+++ b/utils/wikisource.py
@@ -38,7 +38,7 @@ class HtmlText():
 
     def __getitem__(self, key):
         if type(key) is SliceType:
-            return [self[w] for w in range(*key.indices(self.length))]
+            return [unicode(self[w]) for w in range(*key.indices(self.length))]
         if key >= self.length:
             raise IndexError
         if key < 0:
@@ -54,17 +54,10 @@ def get_page(title, page):
     r = requests.get(URL, params=params)
     if r.status_code == requests.codes.ok:
         soup = BeautifulSoup(r.text, "lxml")
-        return soup.select("div.pagetext")[0].text
+        return HtmlText(soup.select("div.pagetext")[0])
     else:
         return None
 
-
-def get_page2(text):
-    soup = BeautifulSoup(text, "lxml")
-    elem = soup.select("div.pagetext")[0]
-    return HtmlText(elem), elem.text
-
-
 def get_pages(title, begin=1, end=None):
     if end:
         return (get_page(title, i) for i in xrange(begin, end + 1))
@@ -77,17 +70,17 @@ def gen_html(book, page_number):
     page = doc.pages[int(page_number)-1]
     d = du.parse_page(page)
     corrected_text = get_page(book, int(page_number))
-    corrected_words = su.simplify(corrected_text).split()
+    corrected_words = su.simplify(corrected_text.elem.text).split()
     if d:
         orig_words, orig_coords = zip(*d)
         C = su.align(corrected_words, list(orig_words), list(orig_coords))
-        corr_words = corrected_text.split()
+        corr_words = corrected_text
         orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1])
         return orig_coords_html, orig_words, corr_words, C[1]
 
 if __name__ == "__main__":
-    b = BeautifulSoup("<a>asd</a>")
-    c = HtmlText(b)
-    print type(c[0])
-    print su.align(c, [u"asd"], None)
-    print c[0:1]
+    wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
+    test = get_page(wikibook, 28)
+    # print type(c[0])
+    # print su.align(c, [u"asd"], None)
+    # print c[0:1]
