Use the new HtmlText class where we can

author: Guillaume Horel <guillaume.horel@gmail.com> 2014-09-07 19:10:38 -0400
committer: Guillaume Horel <guillaume.horel@gmail.com> 2014-09-07 19:10:38 -0400
commit: 43806716c725f94324d792d68431b9417618b426 (patch)
tree: 70961680c214a548f0bd0d5e9993fe6e98c368b5 /utils
parent: e9e15829e5dc39c825f8226010884c2b65d2a5fc (diff)
download: ocr-layer-curation-43806716c725f94324d792d68431b9417618b426.tar.gz
1 file changed, 9 insertions, 16 deletions
diff --git a/utils/wikisource.py b/utils/wikisource.py
index 589c88e..283d5fe 100644
--- a/utils/wikisource.py
+++ b/utils/wikisource.py
@@ -38,7 +38,7 @@ class HtmlText():
 
     def __getitem__(self, key):
         if type(key) is SliceType:
-            return [self[w] for w in range(*key.indices(self.length))]
+            return [unicode(self[w]) for w in range(*key.indices(self.length))]
         if key >= self.length:
             raise IndexError
         if key < 0:
@@ -54,17 +54,10 @@ def get_page(title, page):
     r = requests.get(URL, params=params)
     if r.status_code == requests.codes.ok:
         soup = BeautifulSoup(r.text, "lxml")
-        return soup.select("div.pagetext")[0].text
+        return HtmlText(soup.select("div.pagetext")[0])
     else:
         return None
 
-
-def get_page2(text):
-    soup = BeautifulSoup(text, "lxml")
-    elem = soup.select("div.pagetext")[0]
-    return HtmlText(elem), elem.text
-
-
 def get_pages(title, begin=1, end=None):
     if end:
         return (get_page(title, i) for i in xrange(begin, end + 1))
@@ -77,17 +70,17 @@ def gen_html(book, page_number):
     page = doc.pages[int(page_number)-1]
     d = du.parse_page(page)
     corrected_text = get_page(book, int(page_number))
-    corrected_words = su.simplify(corrected_text).split()
+    corrected_words = su.simplify(corrected_text.elem.text).split()
     if d:
         orig_words, orig_coords = zip(*d)
         C = su.align(corrected_words, list(orig_words), list(orig_coords))
-        corr_words = corrected_text.split()
+        corr_words = corrected_text
         orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1])
     return orig_coords_html, orig_words, corr_words, C[1]
 
 if __name__ == "__main__":
-    b = BeautifulSoup("<a>asd</a>")
-    c = HtmlText(b)
-    print type(c[0])
-    print su.align(c, [u"asd"], None)
-    print c[0:1]
+    wikibook = "Bloy - Le Sang du pauvre, Stock, 1932.djvu".replace(" ", "_")
+    test = get_page(wikibook, 28)
+    # print type(c[0])
+    # print su.align(c, [u"asd"], None)
+    # print c[0:1]
author	Guillaume Horel <guillaume.horel@gmail.com>	2014-09-07 19:10:38 -0400
committer	Guillaume Horel <guillaume.horel@gmail.com>	2014-09-07 19:10:38 -0400
commit	43806716c725f94324d792d68431b9417618b426 (patch)
tree	70961680c214a548f0bd0d5e9993fe6e98c368b5 /utils
parent	e9e15829e5dc39c825f8226010884c2b65d2a5fc (diff)
download	ocr-layer-curation-43806716c725f94324d792d68431b9417618b426.tar.gz