| author | Guillaume Horel <guillaume.horel@gmail.com> | 2014-09-07 18:21:37 -0400 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2014-09-07 18:24:08 -0400 |
| commit | 0e8b0c88a4d3009cbbea695f606e49faef27f373 (patch) | |
| tree | 85a14a7aef3ee36e73544382c6fdec8aa6bd375c /wikisource.py | |
| parent | 74604d7b8ae98b125f1c800da753f8ab67474eb5 (diff) | |
| download | ocr-layer-curation-0e8b0c88a4d3009cbbea695f606e49faef27f373.tar.gz | |
Reorganize the code
Hope I did it right. We have two packages now, one for the server
and one for the actual library.
Diffstat (limited to 'wikisource.py')
| -rw-r--r-- | wikisource.py | 80 |
1 file changed, 0 insertions, 80 deletions
diff --git a/wikisource.py b/wikisource.py
deleted file mode 100644
index af72d34..0000000
--- a/wikisource.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# -*- coding: utf-8 -*-
-import requests
-import sys
-from bs4 import BeautifulSoup, NavigableString
-from itertools import takewhile, count
-from types import SliceType
-from string_utils import align
-
-URL = "http://fr.wikisource.org/w/index.php"
-
-
-def spanify(string, start=0):
-    soup = BeautifulSoup()
-    for i, word in enumerate(string.split()):
-        span = soup.new_tag("span", id="word-" + str(start + i))
-        span.string = word
-        string.insert_before(span)
-        string.insert_before(" ")
-    string.replace_with("")
-    return start + i + 1
-
-
-class HtmlText():
-
-    def __init__(self, elem):
-        self.elem = elem
-        start = 0
-        strings = list(string for string in self.elem.strings
-                       if string.strip())
-
-        for string in strings:
-            start = spanify(string, start)
-        self.length = start
-
-    def __len__(self):
-        return self.length
-
-    def __getitem__(self, key):
-        if type(key) is SliceType:
-            return [self[w] for w in range(*key.indices(self.length))]
-        if key >= self.length:
-            raise IndexError
-        if key < 0:
-            key = self.length - key
-        return self.elem.find("span", {"id": "word-" + str(key)}).text
-
-    def __str__(self):
-        return str(self.elem)
-
-
-def get_page(title, page):
-    params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
-    r = requests.get(URL, params=params)
-    if r.status_code == requests.codes.ok:
-        soup = BeautifulSoup(r.text, "lxml")
-        return soup.select("div.pagetext")[0].text
-    else:
-        return None
-
-
-def get_page2(text):
-    soup = BeautifulSoup(text, "lxml")
-    elem = soup.select("div.pagetext")[0]
-    return HtmlText(elem), elem.text
-
-
-def get_pages(title, begin=1, end=None):
-    if end:
-        return (get_page(title, i) for i in xrange(begin, end + 1))
-    else:
-        return takewhile(lambda x: x is not None,
-                         (get_page(title, i) for i in count(begin)))
-
-
-if __name__ == "__main__":
-    b = BeautifulSoup("<a>asd</a>")
-    c = HtmlText(b)
-    print type(c[0])
-    print align(c, [u"asd"], None)
-    print c[0:1]
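For context, the core trick in the deleted file is `spanify`: each text node is replaced by a run of numbered `<span id="word-N">` elements, so that `HtmlText` can address individual words by position (which is what `align` consumes). Below is a minimal Python 3 sketch of that idea; the sample markup, the `html.parser` backend, and the explicit `soup` parameter are assumptions for illustration, not part of the repository.

```python
from bs4 import BeautifulSoup


def spanify(soup, string, start=0):
    """Replace a text node with one <span id="word-N"> per word.

    Returns the index one past the last word, so consecutive text
    nodes can continue the numbering. (Sketch only; the deleted
    module's version took no soup argument and targeted Python 2.)
    """
    i = -1  # handles the all-whitespace case, where the loop never runs
    for i, word in enumerate(string.split()):
        span = soup.new_tag("span", id="word-" + str(start + i))
        span.string = word
        string.insert_before(span)
        string.insert_before(" ")
    string.replace_with("")
    return start + i + 1


soup = BeautifulSoup("<p>some OCR page text</p>", "html.parser")
total = 0
# Materialize the text nodes first: mutating the tree while iterating
# soup.strings directly would skip elements.
for node in [s for s in soup.strings if s.strip()]:
    total = spanify(soup, node, total)

print(total)                                      # 4 words indexed
print(soup.find("span", {"id": "word-1"}).text)   # "OCR"
```

Note that the deleted `HtmlText.__init__` takes the same precaution, capturing `self.elem.strings` in a list before calling `spanify`.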

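Similarly, `get_pages` illustrates an open-ended pagination pattern: when no `end` is given, it draws page numbers from `itertools.count` and stops at the first page that comes back `None` via `takewhile`. A self-contained Python 3 sketch of that pattern follows; the `fetch` helper is a hypothetical stand-in for `get_page` (no network access), not code from the repository.

```python
from itertools import count, takewhile


def fetch(title, page):
    """Stand-in for get_page: pretend the work has 3 pages, then nothing."""
    return "text of %s p.%d" % (title, page) if page <= 3 else None


def get_pages(title, begin=1, end=None):
    if end:
        # Bounded case: a plain generator over the requested range.
        return (fetch(title, i) for i in range(begin, end + 1))
    # Open-ended case: keep fetching until the first missing page.
    return takewhile(lambda x: x is not None,
                     (fetch(title, i) for i in count(begin)))


print(list(get_pages("Demo")))
# ['text of Demo p.1', 'text of Demo p.2', 'text of Demo p.3']
```

Because both branches return lazy generators, no page is requested until the caller actually iterates, which keeps the number of HTTP round trips to exactly the pages consumed (plus the one failed probe that terminates the open-ended case).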