diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2014-09-07 19:10:38 -0400 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2014-09-07 19:10:38 -0400 |
| commit | 43806716c725f94324d792d68431b9417618b426 (patch) | |
| tree | 70961680c214a548f0bd0d5e9993fe6e98c368b5 /utils | |
| parent | e9e15829e5dc39c825f8226010884c2b65d2a5fc (diff) | |
| download | ocr-layer-curation-43806716c725f94324d792d68431b9417618b426.tar.gz | |
use the new HtmlText where we can
Diffstat (limited to 'utils')
| -rw-r--r-- | utils/wikisource.py | 25 |
1 file changed, 9 insertions, 16 deletions
```diff
diff --git a/utils/wikisource.py b/utils/wikisource.py
index 589c88e..283d5fe 100644
--- a/utils/wikisource.py
+++ b/utils/wikisource.py
@@ -38,7 +38,7 @@ class HtmlText():
 
     def __getitem__(self, key):
         if type(key) is SliceType:
-            return [self[w] for w in range(*key.indices(self.length))]
+            return [unicode(self[w]) for w in range(*key.indices(self.length))]
         if key >= self.length:
             raise IndexError
         if key < 0:
@@ -54,17 +54,10 @@ def get_page(title, page):
     r = requests.get(URL, params=params)
     if r.status_code == requests.codes.ok:
         soup = BeautifulSoup(r.text, "lxml")
-        return soup.select("div.pagetext")[0].text
+        return HtmlText(soup.select("div.pagetext")[0])
     else:
         return None
 
-
-def get_page2(text):
-    soup = BeautifulSoup(text, "lxml")
-    elem = soup.select("div.pagetext")[0]
-    return HtmlText(elem), elem.text
-
-
 def get_pages(title, begin=1, end=None):
     if end:
         return (get_page(title, i) for i in xrange(begin, end + 1))
@@ -77,17 +70,17 @@ def gen_html(book, page_number):
     page = doc.pages[int(page_number)-1]
     d = du.parse_page(page)
     corrected_text = get_page(book, int(page_number))
-    corrected_words = su.simplify(corrected_text).split()
+    corrected_words = su.simplify(corrected_text.elem.text).split()
     if d:
         orig_words, orig_coords = zip(*d)
         C = su.align(corrected_words, list(orig_words), list(orig_coords))
-        corr_words = corrected_text.split()
+        corr_words = corrected_text
         orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1])
         return orig_coords_html, orig_words, corr_words, C[1]
 
 if __name__ == "__main__":
-    b = BeautifulSoup("<a>asd</a>")
-    c = HtmlText(b)
-    print type(c[0])
-    print su.align(c, [u"asd"], None)
-    print c[0:1]
+    wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
+    test = get_page(wikibook, 28)
+    # print type(c[0])
+    # print su.align(c, [u"asd"], None)
+    # print c[0:1]
```
