| author | Guillaume Horel <guillaume.horel@gmail.com> | 2014-09-07 18:21:37 -0400 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2014-09-07 18:24:08 -0400 |
| commit | 0e8b0c88a4d3009cbbea695f606e49faef27f373 | |
| tree | 85a14a7aef3ee36e73544382c6fdec8aa6bd375c /utils/wikisource.py | |
| parent | 74604d7b8ae98b125f1c800da753f8ab67474eb5 | |
| download | ocr-layer-curation-0e8b0c88a4d3009cbbea695f606e49faef27f373.tar.gz | |
Reorganize the code
Hope I did it right. We have two packages now: one for the server
and one for the actual library.
Diffstat (limited to 'utils/wikisource.py')
| -rw-r--r-- | utils/wikisource.py | 93 |
1 file changed, 93 insertions, 0 deletions
```diff
diff --git a/utils/wikisource.py b/utils/wikisource.py
new file mode 100644
index 0000000..589c88e
--- /dev/null
+++ b/utils/wikisource.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+import requests
+import sys
+from bs4 import BeautifulSoup, NavigableString
+from itertools import takewhile, count
+from types import SliceType
+import string_utils as su
+import djvu_utils as du
+
+URL = "http://fr.wikisource.org/w/index.php"
+
+
+def spanify(string, start=0):
+    soup = BeautifulSoup()
+    for i, word in enumerate(string.split()):
+        span = soup.new_tag("span", id="word-" + str(start + i))
+        span.string = word
+        string.insert_before(span)
+        string.insert_before(" ")
+    string.replace_with("")
+    return start + i + 1
+
+
+class HtmlText():
+
+    def __init__(self, elem):
+        self.elem = elem
+        start = 0
+        strings = list(string for string in self.elem.strings
+                       if string.strip())
+
+        for string in strings:
+            start = spanify(string, start)
+        self.length = start
+
+    def __len__(self):
+        return self.length
+
+    def __getitem__(self, key):
+        if type(key) is SliceType:
+            return [self[w] for w in range(*key.indices(self.length))]
+        if key >= self.length:
+            raise IndexError
+        if key < 0:
+            key = self.length - key
+        return self.elem.find("span", {"id": "word-" + str(key)}).text
+
+    def __str__(self):
+        return str(self.elem)
+
+
+def get_page(title, page):
+    params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
+    r = requests.get(URL, params=params)
+    if r.status_code == requests.codes.ok:
+        soup = BeautifulSoup(r.text, "lxml")
+        return soup.select("div.pagetext")[0].text
+    else:
+        return None
+
+
+def get_page2(text):
+    soup = BeautifulSoup(text, "lxml")
+    elem = soup.select("div.pagetext")[0]
+    return HtmlText(elem), elem.text
+
+
+def get_pages(title, begin=1, end=None):
+    if end:
+        return (get_page(title, i) for i in xrange(begin, end + 1))
+    else:
+        return takewhile(lambda x: x is not None,
+                         (get_page(title, i) for i in count(begin)))
+def gen_html(book, page_number):
+    doc = du.get_document(book)
+    page = doc.pages[int(page_number)-1]
+    d = du.parse_page(page)
+    corrected_text = get_page(book, int(page_number))
+    corrected_words = su.simplify(corrected_text).split()
+    if d:
+        orig_words, orig_coords = zip(*d)
+        C = su.align(corrected_words, list(orig_words), list(orig_coords))
+        corr_words = corrected_text.split()
+        orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1])
+        return orig_coords_html, orig_words, corr_words, C[1]
+
+if __name__ == "__main__":
+    b = BeautifulSoup("<a>asd</a>")
+    c = HtmlText(b)
+    print type(c[0])
+    print su.align(c, [u"asd"], None)
+    print c[0:1]
```
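For orientation, a minimal sketch of how the new module can be driven. It is not part of the commit: it assumes the file is importable as `utils.wikisource` and that `requests`, `bs4`, and `lxml` are installed; since the module is Python 2 (`print` statements, `xrange`, `types.SliceType`), the sketch is too.

```python
# -*- coding: utf-8 -*-
# Hypothetical driver (not in the repository): exercises HtmlText on a
# local HTML fragment, so no network request to Wikisource is needed.
from bs4 import BeautifulSoup

from utils.wikisource import HtmlText, get_pages

# HtmlText wraps each word of the element in a <span id="word-N"> tag;
# len(), integer indexing, and slicing then address words by that id.
soup = BeautifulSoup("<div><p>Lorem ipsum</p><p>dolor</p></div>", "lxml")
text = HtmlText(soup.div)
print len(text)    # 3
print text[1]      # ipsum
print text[0:2]    # [u'Lorem', u'ipsum']

# Fetching real pages needs network access; the title below is made up.
# get_pages yields the rendered text of Page:<title>/<n> for consecutive
# n, stopping as soon as a request fails (get_page returns None).
# for page_text in get_pages(u"Candide.djvu", begin=1, end=3):
#     print page_text[:100]
```

`gen_html` is harder to demo in isolation: it also depends on the local `string_utils` and `djvu_utils` helpers (word alignment and DjVu coordinate handling), which are outside this diff.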
