From 0e8b0c88a4d3009cbbea695f606e49faef27f373 Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Sun, 7 Sep 2014 18:21:37 -0400
Subject: Reorganize the code

I hope I did it right. We now have two packages: one for the server
and one for the actual library.
---
 djvu_utils.py         |  67 ---------------------
 string_utils.py       | 161 --------------------------------------------------
 utils/__init__.py     |   0
 utils/djvu_utils.py   |  67 +++++++++++++++++++++
 utils/string_utils.py | 161 ++++++++++++++++++++++++++++++++++++++++++++++++++
 utils/wikisource.py   |  93 +++++++++++++++++++++++++++++
 web/__init__.py       |   2 +
 web/djvu_utils.py     |   1 -
 web/main.py           |  37 ------------
 web/server.py         |  35 +++++++++++
 web/settings.py       |   8 +--
 web/utils.py          |  20 -------
 wikisource.py         |  80 -------------------------
 13 files changed, 362 insertions(+), 370 deletions(-)
 delete mode 100644 djvu_utils.py
 delete mode 100644 string_utils.py
 create mode 100644 utils/__init__.py
 create mode 100644 utils/djvu_utils.py
 create mode 100644 utils/string_utils.py
 create mode 100644 utils/wikisource.py
 create mode 100644 web/__init__.py
 delete mode 120000 web/djvu_utils.py
 delete mode 100644 web/main.py
 create mode 100644 web/server.py
 delete mode 100644 web/utils.py
 delete mode 100644 wikisource.py

diff --git a/djvu_utils.py b/djvu_utils.py
deleted file mode 100644
index 21692a1..0000000
--- a/djvu_utils.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import sys
-from bs4 import BeautifulSoup
-import djvu
-from djvu.decode import Context
-from itertools import chain
-import collections
-from PIL import Image
-
-def parse_page(page):
-    s = page.text.sexpr
-
-    def aux(s):
-        if type(s) is djvu.sexpr.ListExpression:
-            if len(s) == 0:
-                pass
-            if str(s[0].value) == "word":
-                coords = [s[i].value for i in xrange(1, 5)]
-                word = s[5].value
-                yield (word.decode("utf-8"), coords)
-            else:
-                for c in chain.from_iterable(aux(child) for child in s[5:]):
-                    yield c
-        else:
-            pass
-    return aux(s) if s else None
-
-def convert_to_htmlcoord(coords, page_size):
-    return [",".join(map(str, [c[0], page_size - c[3],
-                               c[2], page_size - c[1]])) for c in coords]
-
-def get_document(djvufile):
-    c = Context()
-    document = c.new_document(djvu.decode.FileURI(djvufile))
-    document.decoding_job.wait()
-    return document
-
-def parse_book(djvubook, page=None):
-    """
-    returns the list of words and coordinates from a djvu book.
-    if page is None, returns the whole book.
-    """
-    document = get_document(djvubook)
-
-    if type(page) is int:
-        toparse = [document.pages[page - 1]]
-    elif isinstance(page, collections.Iterable):
-        toparse = [document.pages[p - 1] for p in page]
-    else:
-        toparse = document.pages
-
-    return [parse_page(page) for page in toparse]
-
-def image_from_book(djvubook, page):
-    document = get_document(djvubook)
-    mode = djvu.decode.RENDER_COLOR
-    djvu_pixel_format = djvu.decode.PixelFormatRgb()
-    page = document.pages[page-1]
-    page_job = page.decode(wait=True)
-    width, height = page_job.size
-    rect = (0, 0, width, height)
-    buf = page_job.render(mode, rect, rect, djvu_pixel_format)
-    return Image.frombuffer("RGB", (width, height), buf, 'raw', 'RGB', 0, -1)
-
-if __name__ == "__main__":
-    book = parse_book(sys.argv[1], page=[10,11], html=True)
-    im = image_from_book(sys.argv[1], 11)
-    im.save("test.jpeg")
diff --git a/string_utils.py b/string_utils.py
deleted file mode 100644
index b6c8ce0..0000000
--- a/string_utils.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# -*- coding: utf-8 -*-
-from Levenshtein import distance as levenshtein
-import re
-import itertools
-
-def simplify(text):
-    mapp = [(u"’", u"'"), (u"↑", u"."), (u"…", u"..."), (u"É", u"E"),
-            (u"À", u"A"), (u"Ô", u"O"), (u"—", u"-")]
-
-    for a, b in mapp:
-        text = text.replace(a, b)
-
-    return text
-
-def cut(word, left, right):
-    """Return pair of strings (p + "-", s) such that p+s == word and
-    L(p + "-", left) + L(s, right) is minimal, where L is the levenshtein
-    distance.
-
-    Implementation is suboptimal since the computation of the Levenshtein
-    distances will involve comparing the same segments repeatedly.
-
-    TODO: handle the case when word contains an hyphen (e.g. c'est-a-dire)
-    """
-
-    def aux(i):
-        leftw, rightw = word[:i] + "-", word[i:]
-        return (leftw, rightw,
-                levenshtein(leftw, left) + levenshtein(rightw, right))
-
-    l = [aux(i) for i in xrange(len(word) + 1)]
-    return min(l, key=lambda x: x[2])[:2]
-
-def join_ocr_words(l, c):
-    m = list(l)
-    if len(l) >= 2 and c[-2][2] > c[-1][0] and (not l[-2][-1].isalnum()):
-        l[-2] = l[-2][:-1]
-    return "".join(l)
-
-def join_words(l):
-    return "".join(l)
-
-def align(l1, l2, c2):
-    """Compute the optimal alignment between two list of words
-    à la Needleman-Wunsch.
-
-    The function returns a (score, alignment) pair. An alignment is simply
-    a list of list of size len(l1) giving for each word in l1, the list of
-    indices in l2 it maps to (the list is empty if the word maps to nothing).
-
-    Note that if the list is of size>1, the word in l1 will map to a sequence
-    of words in l2. Conversly, consecutive words in l1 can map to
-    the same word in l2.
-    """
-
-    # Throughout the function, l1 is to be thought of as the proofread text,
-    # and l2 as the OCR text. The deletion costs are not symmetric: removing
-    # junk from the OCR is frequent while removing a word from the proofread
-    # text should be rare.
-    del_cost1 = 50
-    def del_cost2(w):
-        return 1+3*len([c for c in w if c.isalnum()])
-    w = 3 # multiplicative cost factor for the Levenshtein distance
-
-    n, m = len(l1), len(l2)
-    # a is the (score, alignment) matrix. a[i][j] is the (score, alignment)
-    # pair of the first i words of l1 to the first j words of l2
-    a = [[(0, [])] * (m + 1) for i in xrange(n + 1)]
-
-    for j in xrange(1, m + 1):
-        a[0][j] = j, []
-
-    for i in xrange(1, n + 1):
-        a[i][0] = i * del_cost1, [[]] * i
-
-        for j in xrange(1, m + 1):
-
-            s, b = a[i-1][j-1]
-            d = levenshtein(l1[i-1], l2[j-1])
-            min_s, min_b  = s + w * d, b + [[j-1]]
-
-            s, b = a[i-1][j]
-            if s + del_cost1 < min_s:
-                min_s, min_b = s + del_cost1, b + [[]]
-
-            s, b = a[i][j-1]
-            if s + del_cost2(l2[j-1]) < min_s:
-                min_s, min_b = s + del_cost2(l2[j-1]), b
-
-            for k in xrange(1, 8):
-                for l in xrange(1, 5):
-                    if k + l <= 2:
-                        continue
-                    if k+l > 7:
-                        break
-                    if j < l or i < k:
-                        break
-                    s, b = a[i-k][j-l]
-                    d = levenshtein(join_words(l1[i-k:i]),
-                                    join_ocr_words(l2[j-l:j], c2[j-l:j]))
-                    if s + w * d < min_s:
-                        temp = [[j-1]] if l == 1 else [range(j-l, j)]
-                        min_s, min_b = s + w * d, b + temp * k
-
-            a[i][j] = min_s, min_b
-
-    return a[n][m]
-
-def print_alignment(l1, l2, c2, alignment):
-    """Given two list of words and an alignment (as defined in :func:`align`)
-    print the two list of words side-by-side and aligned.
-    """
-    prev = 0
-    for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]):
-        word = " ".join([a[0] for a in g])
-        if not index:
-            print u"{0:>25} | ".format(word)
-        else:
-            begin, end = index[0], index[-1]
-            for i in range(prev, begin-1):
-                print u"{0:>25} | {1}".format("", l2[i+1])
-            prev = end
-
-            if end > begin:
-                print u"{0:>25} | {1:<25} (M)".format(word,
-                                                      join_ocr_words(l2[begin:end+1], c2[begin:end+1]))
-            else:
-                print u"{0:>25} | {1:<25}".format(word, l2[begin])
-
-    if not l1:
-        for word in l2:
-            print u"{0:>25} | {1}".format("", word)
-
-def invert_align(alignment, n):
-    l = [[] for _ in range(n)]
-    for i, e in enumerate(alignment):
-        for a in e:
-            l[a].append(i)
-    return l
-
-def alignment_to_coord(l1, alignment):
-    # l1 list of corrected words
-    # alignment list of size len(l1) qui mappe mots dans l2
-    # returns indices in l2
-
-    r = []
-    prev = 0
-    for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]):
-        word = " ".join([a[0] for a in g])
-        r.append([word, index])
-        # if not index:
-        #     r.append([word, None])
-        # else:
-
-        #     begin, end = index[0], index[-1]
-        #     if end > begin:
-        #         #need to find a better way to get the box coordinates
-        #         r.append([word, begin])
-        #     else:
-        #         r.append([word, begin])
-    return r
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/djvu_utils.py b/utils/djvu_utils.py
new file mode 100644
index 0000000..21692a1
--- /dev/null
+++ b/utils/djvu_utils.py
@@ -0,0 +1,67 @@
+import sys
+from bs4 import BeautifulSoup
+import djvu
+from djvu.decode import Context
+from itertools import chain
+import collections
+from PIL import Image
+
+def parse_page(page):
+    s = page.text.sexpr
+
+    def aux(s):
+        if type(s) is djvu.sexpr.ListExpression:
+            if len(s) == 0:
+                pass
+            if str(s[0].value) == "word":
+                coords = [s[i].value for i in xrange(1, 5)]
+                word = s[5].value
+                yield (word.decode("utf-8"), coords)
+            else:
+                for c in chain.from_iterable(aux(child) for child in s[5:]):
+                    yield c
+        else:
+            pass
+    return aux(s) if s else None
+
+def convert_to_htmlcoord(coords, page_size):
+    return [",".join(map(str, [c[0], page_size - c[3],
+                               c[2], page_size - c[1]])) for c in coords]
+
+def get_document(djvufile):
+    c = Context()
+    document = c.new_document(djvu.decode.FileURI(djvufile))
+    document.decoding_job.wait()
+    return document
+
+def parse_book(djvubook, page=None):
+    """
+    returns the list of words and coordinates from a djvu book.
+    if page is None, returns the whole book.
+    """
+    document = get_document(djvubook)
+
+    if type(page) is int:
+        toparse = [document.pages[page - 1]]
+    elif isinstance(page, collections.Iterable):
+        toparse = [document.pages[p - 1] for p in page]
+    else:
+        toparse = document.pages
+
+    return [parse_page(page) for page in toparse]
+
+def image_from_book(djvubook, page):
+    document = get_document(djvubook)
+    mode = djvu.decode.RENDER_COLOR
+    djvu_pixel_format = djvu.decode.PixelFormatRgb()
+    page = document.pages[page-1]
+    page_job = page.decode(wait=True)
+    width, height = page_job.size
+    rect = (0, 0, width, height)
+    buf = page_job.render(mode, rect, rect, djvu_pixel_format)
+    return Image.frombuffer("RGB", (width, height), buf, 'raw', 'RGB', 0, -1)
+
+if __name__ == "__main__":
+    book = parse_book(sys.argv[1], page=[10,11], html=True)
+    im = image_from_book(sys.argv[1], 11)
+    im.save("test.jpeg")
diff --git a/utils/string_utils.py b/utils/string_utils.py
new file mode 100644
index 0000000..b6c8ce0
--- /dev/null
+++ b/utils/string_utils.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+from Levenshtein import distance as levenshtein
+import re
+import itertools
+
+def simplify(text):
+    mapp = [(u"’", u"'"), (u"↑", u"."), (u"…", u"..."), (u"É", u"E"),
+            (u"À", u"A"), (u"Ô", u"O"), (u"—", u"-")]
+
+    for a, b in mapp:
+        text = text.replace(a, b)
+
+    return text
+
+def cut(word, left, right):
+    """Return pair of strings (p + "-", s) such that p+s == word and
+    L(p + "-", left) + L(s, right) is minimal, where L is the levenshtein
+    distance.
+
+    Implementation is suboptimal since the computation of the Levenshtein
+    distances will involve comparing the same segments repeatedly.
+
+    TODO: handle the case when word contains an hyphen (e.g. c'est-a-dire)
+    """
+
+    def aux(i):
+        leftw, rightw = word[:i] + "-", word[i:]
+        return (leftw, rightw,
+                levenshtein(leftw, left) + levenshtein(rightw, right))
+
+    l = [aux(i) for i in xrange(len(word) + 1)]
+    return min(l, key=lambda x: x[2])[:2]
+
+def join_ocr_words(l, c):
+    m = list(l)
+    if len(l) >= 2 and c[-2][2] > c[-1][0] and (not l[-2][-1].isalnum()):
+        l[-2] = l[-2][:-1]
+    return "".join(l)
+
+def join_words(l):
+    return "".join(l)
+
+def align(l1, l2, c2):
+    """Compute the optimal alignment between two list of words
+    à la Needleman-Wunsch.
+
+    The function returns a (score, alignment) pair. An alignment is simply
+    a list of list of size len(l1) giving for each word in l1, the list of
+    indices in l2 it maps to (the list is empty if the word maps to nothing).
+
+    Note that if the list is of size>1, the word in l1 will map to a sequence
+    of words in l2. Conversly, consecutive words in l1 can map to
+    the same word in l2.
+    """
+
+    # Throughout the function, l1 is to be thought of as the proofread text,
+    # and l2 as the OCR text. The deletion costs are not symmetric: removing
+    # junk from the OCR is frequent while removing a word from the proofread
+    # text should be rare.
+    del_cost1 = 50
+    def del_cost2(w):
+        return 1+3*len([c for c in w if c.isalnum()])
+    w = 3 # multiplicative cost factor for the Levenshtein distance
+
+    n, m = len(l1), len(l2)
+    # a is the (score, alignment) matrix. a[i][j] is the (score, alignment)
+    # pair of the first i words of l1 to the first j words of l2
+    a = [[(0, [])] * (m + 1) for i in xrange(n + 1)]
+
+    for j in xrange(1, m + 1):
+        a[0][j] = j, []
+
+    for i in xrange(1, n + 1):
+        a[i][0] = i * del_cost1, [[]] * i
+
+        for j in xrange(1, m + 1):
+
+            s, b = a[i-1][j-1]
+            d = levenshtein(l1[i-1], l2[j-1])
+            min_s, min_b  = s + w * d, b + [[j-1]]
+
+            s, b = a[i-1][j]
+            if s + del_cost1 < min_s:
+                min_s, min_b = s + del_cost1, b + [[]]
+
+            s, b = a[i][j-1]
+            if s + del_cost2(l2[j-1]) < min_s:
+                min_s, min_b = s + del_cost2(l2[j-1]), b
+
+            for k in xrange(1, 8):
+                for l in xrange(1, 5):
+                    if k + l <= 2:
+                        continue
+                    if k+l > 7:
+                        break
+                    if j < l or i < k:
+                        break
+                    s, b = a[i-k][j-l]
+                    d = levenshtein(join_words(l1[i-k:i]),
+                                    join_ocr_words(l2[j-l:j], c2[j-l:j]))
+                    if s + w * d < min_s:
+                        temp = [[j-1]] if l == 1 else [range(j-l, j)]
+                        min_s, min_b = s + w * d, b + temp * k
+
+            a[i][j] = min_s, min_b
+
+    return a[n][m]
+
+def print_alignment(l1, l2, c2, alignment):
+    """Given two list of words and an alignment (as defined in :func:`align`)
+    print the two list of words side-by-side and aligned.
+    """
+    prev = 0
+    for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]):
+        word = " ".join([a[0] for a in g])
+        if not index:
+            print u"{0:>25} | ".format(word)
+        else:
+            begin, end = index[0], index[-1]
+            for i in range(prev, begin-1):
+                print u"{0:>25} | {1}".format("", l2[i+1])
+            prev = end
+
+            if end > begin:
+                print u"{0:>25} | {1:<25} (M)".format(word,
+                                                      join_ocr_words(l2[begin:end+1], c2[begin:end+1]))
+            else:
+                print u"{0:>25} | {1:<25}".format(word, l2[begin])
+
+    if not l1:
+        for word in l2:
+            print u"{0:>25} | {1}".format("", word)
+
+def invert_align(alignment, n):
+    l = [[] for _ in range(n)]
+    for i, e in enumerate(alignment):
+        for a in e:
+            l[a].append(i)
+    return l
+
+def alignment_to_coord(l1, alignment):
+    # l1 list of corrected words
+    # alignment list of size len(l1) qui mappe mots dans l2
+    # returns indices in l2
+
+    r = []
+    prev = 0
+    for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]):
+        word = " ".join([a[0] for a in g])
+        r.append([word, index])
+        # if not index:
+        #     r.append([word, None])
+        # else:
+
+        #     begin, end = index[0], index[-1]
+        #     if end > begin:
+        #         #need to find a better way to get the box coordinates
+        #         r.append([word, begin])
+        #     else:
+        #         r.append([word, begin])
+    return r
diff --git a/utils/wikisource.py b/utils/wikisource.py
new file mode 100644
index 0000000..589c88e
--- /dev/null
+++ b/utils/wikisource.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+import requests
+import sys
+from bs4 import BeautifulSoup, NavigableString
+from itertools import takewhile, count
+from types import SliceType
+import string_utils as su
+import djvu_utils as du
+
+URL = "http://fr.wikisource.org/w/index.php"
+
+
+def spanify(string, start=0):
+    soup = BeautifulSoup()
+    for i, word in enumerate(string.split()):
+        span = soup.new_tag("span", id="word-" + str(start + i))
+        span.string = word
+        string.insert_before(span)
+        string.insert_before(" ")
+    string.replace_with("")
+    return start + i + 1
+
+
+class HtmlText():
+
+    def __init__(self, elem):
+        self.elem = elem
+        start = 0
+        strings = list(string for string in self.elem.strings
+                       if string.strip())
+
+        for string in strings:
+            start = spanify(string, start)
+        self.length = start
+
+    def __len__(self):
+        return self.length
+
+    def __getitem__(self, key):
+        if type(key) is SliceType:
+            return [self[w] for w in range(*key.indices(self.length))]
+        if key >= self.length:
+            raise IndexError
+        if key < 0:
+            key = self.length - key
+        return self.elem.find("span", {"id": "word-" + str(key)}).text
+
+    def __str__(self):
+        return str(self.elem)
+
+
+def get_page(title, page):
+    params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
+    r = requests.get(URL, params=params)
+    if r.status_code == requests.codes.ok:
+        soup = BeautifulSoup(r.text, "lxml")
+        return soup.select("div.pagetext")[0].text
+    else:
+        return None
+
+
+def get_page2(text):
+    soup = BeautifulSoup(text, "lxml")
+    elem = soup.select("div.pagetext")[0]
+    return HtmlText(elem), elem.text
+
+
+def get_pages(title, begin=1, end=None):
+    if end:
+        return (get_page(title, i) for i in xrange(begin, end + 1))
+    else:
+        return takewhile(lambda x: x is not None,
+                         (get_page(title, i) for i in count(begin)))
+
+def gen_html(book, page_number):
+    doc = du.get_document(book)
+    page = doc.pages[int(page_number)-1]
+    d = du.parse_page(page)
+    corrected_text = get_page(book, int(page_number))
+    corrected_words = su.simplify(corrected_text).split()
+    if d:
+        orig_words, orig_coords = zip(*d)
+        C = su.align(corrected_words, list(orig_words), list(orig_coords))
+        corr_words = corrected_text.split()
+        orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1])
+    return orig_coords_html, orig_words, corr_words, C[1]
+
+if __name__ == "__main__":
+    b = BeautifulSoup("<a>asd</a>")
+    c = HtmlText(b)
+    print type(c[0])
+    print su.align(c, [u"asd"], None)
+    print c[0:1]
diff --git a/web/__init__.py b/web/__init__.py
new file mode 100644
index 0000000..139597f
--- /dev/null
+++ b/web/__init__.py
@@ -0,0 +1,2 @@
+
+
diff --git a/web/djvu_utils.py b/web/djvu_utils.py
deleted file mode 120000
index 0742170..0000000
--- a/web/djvu_utils.py
+++ /dev/null
@@ -1 +0,0 @@
-../djvu_utils.py
\ No newline at end of file
diff --git a/web/main.py b/web/main.py
deleted file mode 100644
index a6826c1..0000000
--- a/web/main.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import tornado.httpserver
-from tornado.web import RequestHandler, Application
-import tornado.ioloop
-from settings import settings
-import utils
-from djvu_utils import image_from_book
-import io
-
-class MainHandler(RequestHandler):
-
-    def get(self, page_number):
-        orig_coords, orig_words, corr_words, align = \
-            utils.gen_html(self.settings["book"], page_number)
-        self.render("index.html", page_number=page_number, orig_coords=orig_coords,
-                    orig_words=orig_words, corr_words=corr_words, align=align)
-
-class ImageHandler(RequestHandler):
-
-    def get(self, page_number):
-        im = image_from_book("../" + self.settings["book"], int(page_number))
-        self.set_header('Content-Type', 'image/jpg')
-        img_buff = io.BytesIO()
-        im.save(img_buff, format="JPEG")
-        img_buff.seek(0)
-        self.write(img_buff.read())
-        self.finish()
-
-application = Application([
-    (r'/(\d+)/?', MainHandler),
-    (r'/(\d+)\.jpg/?', ImageHandler)]
-    , **settings)
-
-if __name__ == '__main__':
-    http_server = tornado.httpserver.HTTPServer(application)
-    http_server.listen(8888)
-    print "Listening on 8888"
-    tornado.ioloop.IOLoop.instance().start()
diff --git a/web/server.py b/web/server.py
new file mode 100644
index 0000000..1e67ad4
--- /dev/null
+++ b/web/server.py
@@ -0,0 +1,35 @@
+import tornado.httpserver
+from tornado.web import RequestHandler, Application
+import tornado.ioloop
+from settings import settings
+from utils.djvu_utils import image_from_book
+from utils.wikisource import gen_html
+import io
+
+class MainHandler(RequestHandler):
+
+    def get(self, page_number):
+        orig_coords, orig_words, corr_words, align = \
+            gen_html(self.settings["book"], page_number)
+        self.render("index.html", page_number=page_number, orig_coords=orig_coords,
+                    orig_words=orig_words, corr_words=corr_words, align=align)
+
+class ImageHandler(RequestHandler):
+
+    def get(self, page_number):
+        im = image_from_book(self.settings["book"], int(page_number))
+        self.set_header('Content-Type', 'image/jpg')
+        img_buff = io.BytesIO()
+        im.save(img_buff, format="JPEG")
+        img_buff.seek(0)
+        self.write(img_buff.read())
+        self.finish()
+
+def run():
+    application = Application([
+        (r'/(\d+)/?', MainHandler),
+        (r'/(\d+)\.jpg/?', ImageHandler)], **settings)
+    http_server = tornado.httpserver.HTTPServer(application)
+    http_server.listen(8888)
+    print "Listening on 8888"
+    tornado.ioloop.IOLoop.instance().start()
diff --git a/web/settings.py b/web/settings.py
index 5a8c9aa..32693b8 100644
--- a/web/settings.py
+++ b/web/settings.py
@@ -1,9 +1,9 @@
 settings = {
     "debug": True,
-    "template_path": "templates",
-    "static_path": "static",
+    "template_path": "web/templates",
+    "static_path": "web/static",
     "cookie_secret": "toto",
     "login_url": "/login",
-    #"book": "Bloy_-_Le_Sang_du_pauvre,_Stock,_1932.djvu"
-    "book": "Villiers_de_L'Isle-Adam_-_Tribulat_Bonhomet,_1908.djvu"
+    "book": "Bloy_-_Le_Sang_du_pauvre,_Stock,_1932.djvu"
+    #"book": "Villiers_de_L'Isle-Adam_-_Tribulat_Bonhomet,_1908.djvu"
 }
diff --git a/web/utils.py b/web/utils.py
deleted file mode 100644
index 7e20858..0000000
--- a/web/utils.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import djvu_utils as du
-import sys
-import string_utils as su
-from wikisource import get_page
-
-def gen_html(book, page_number):
-    doc = du.get_document("../" + book)
-    page = doc.pages[int(page_number)-1]
-    d = du.parse_page(page)
-    corrected_text = get_page(book, int(page_number))
-    corrected_words = su.simplify(corrected_text).split()
-    if d:
-        orig_words, orig_coords = zip(*d)
-        C = su.align(corrected_words, list(orig_words), list(orig_coords))
-        corr_words = corrected_text.split()
-        orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1])
-    return orig_coords_html, orig_words, corr_words, C[1]
-
-if __name__ == "__main__":
-    gen_html(*sys.argv[1:3])
diff --git a/wikisource.py b/wikisource.py
deleted file mode 100644
index af72d34..0000000
--- a/wikisource.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# -*- coding: utf-8 -*-
-import requests
-import sys
-from bs4 import BeautifulSoup, NavigableString
-from itertools import takewhile, count
-from types import SliceType
-from string_utils import align
-
-URL = "http://fr.wikisource.org/w/index.php"
-
-
-def spanify(string, start=0):
-    soup = BeautifulSoup()
-    for i, word in enumerate(string.split()):
-        span = soup.new_tag("span", id="word-" + str(start + i))
-        span.string = word
-        string.insert_before(span)
-        string.insert_before(" ")
-    string.replace_with("")
-    return start + i + 1
-
-
-class HtmlText():
-
-    def __init__(self, elem):
-        self.elem = elem
-        start = 0
-        strings = list(string for string in self.elem.strings
-                       if string.strip())
-
-        for string in strings:
-            start = spanify(string, start)
-        self.length = start
-
-    def __len__(self):
-        return self.length
-
-    def __getitem__(self, key):
-        if type(key) is SliceType:
-            return [self[w] for w in range(*key.indices(self.length))]
-        if key >= self.length:
-            raise IndexError
-        if key < 0:
-            key = self.length - key
-        return self.elem.find("span", {"id": "word-" + str(key)}).text
-
-    def __str__(self):
-        return str(self.elem)
-
-
-def get_page(title, page):
-    params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
-    r = requests.get(URL, params=params)
-    if r.status_code == requests.codes.ok:
-        soup = BeautifulSoup(r.text, "lxml")
-        return soup.select("div.pagetext")[0].text
-    else:
-        return None
-
-
-def get_page2(text):
-    soup = BeautifulSoup(text, "lxml")
-    elem = soup.select("div.pagetext")[0]
-    return HtmlText(elem), elem.text
-
-
-def get_pages(title, begin=1, end=None):
-    if end:
-        return (get_page(title, i) for i in xrange(begin, end + 1))
-    else:
-        return takewhile(lambda x: x is not None,
-                         (get_page(title, i) for i in count(begin)))
-
-
-if __name__ == "__main__":
-    b = BeautifulSoup("<a>asd</a>")
-    c = HtmlText(b)
-    print type(c[0])
-    print align(c, [u"asd"], None)
-    print c[0:1]
-- 
cgit v1.2.3-70-g09d2