| author | Guillaume Horel <guillaume.horel@gmail.com> | 2014-09-07 18:21:37 -0400 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2014-09-07 18:24:08 -0400 |
| commit | 0e8b0c88a4d3009cbbea695f606e49faef27f373 (patch) | |
| tree | 85a14a7aef3ee36e73544382c6fdec8aa6bd375c /wikisource.py | |
| parent | 74604d7b8ae98b125f1c800da753f8ab67474eb5 (diff) | |
| download | ocr-layer-curation-0e8b0c88a4d3009cbbea695f606e49faef27f373.tar.gz | |
Reorganize the code
Hope I did it right. We have two packages now, one for the server
and one for the actual library.
Diffstat (limited to 'wikisource.py')
| -rw-r--r-- | wikisource.py | 80 |
1 file changed, 0 insertions, 80 deletions
diff --git a/wikisource.py b/wikisource.py
deleted file mode 100644
index af72d34..0000000
--- a/wikisource.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# -*- coding: utf-8 -*-
-import requests
-import sys
-from bs4 import BeautifulSoup, NavigableString
-from itertools import takewhile, count
-from types import SliceType
-from string_utils import align
-
-URL = "http://fr.wikisource.org/w/index.php"
-
-
-def spanify(string, start=0):
-    soup = BeautifulSoup()
-    for i, word in enumerate(string.split()):
-        span = soup.new_tag("span", id="word-" + str(start + i))
-        span.string = word
-        string.insert_before(span)
-        string.insert_before(" ")
-    string.replace_with("")
-    return start + i + 1
-
-
-class HtmlText():
-
-    def __init__(self, elem):
-        self.elem = elem
-        start = 0
-        strings = list(string for string in self.elem.strings
-                       if string.strip())
-
-        for string in strings:
-            start = spanify(string, start)
-        self.length = start
-
-    def __len__(self):
-        return self.length
-
-    def __getitem__(self, key):
-        if type(key) is SliceType:
-            return [self[w] for w in range(*key.indices(self.length))]
-        if key >= self.length:
-            raise IndexError
-        if key < 0:
-            key = self.length - key
-        return self.elem.find("span", {"id": "word-" + str(key)}).text
-
-    def __str__(self):
-        return str(self.elem)
-
-
-def get_page(title, page):
-    params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
-    r = requests.get(URL, params=params)
-    if r.status_code == requests.codes.ok:
-        soup = BeautifulSoup(r.text, "lxml")
-        return soup.select("div.pagetext")[0].text
-    else:
-        return None
-
-
-def get_page2(text):
-    soup = BeautifulSoup(text, "lxml")
-    elem = soup.select("div.pagetext")[0]
-    return HtmlText(elem), elem.text
-
-
-def get_pages(title, begin=1, end=None):
-    if end:
-        return (get_page(title, i) for i in xrange(begin, end + 1))
-    else:
-        return takewhile(lambda x: x is not None,
-                         (get_page(title, i) for i in count(begin)))
-
-
-if __name__ == "__main__":
-    b = BeautifulSoup("<a>asd</a>")
-    c = HtmlText(b)
-    print type(c[0])
-    print align(c, [u"asd"], None)
-    print c[0:1]
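For context, the core trick in the deleted file is `spanify`: each text node is replaced by a run of numbered `<span id="word-N">` elements, so that `HtmlText` can address individual words by position (which is what `align` consumes). Below is a minimal Python 3 sketch of that idea; the sample markup, the `html.parser` backend, and the explicit `soup` parameter are assumptions for illustration, not part of the repository.

```python
from bs4 import BeautifulSoup


def spanify(soup, string, start=0):
    """Replace a text node with one <span id="word-N"> per word.

    Returns the index one past the last word, so consecutive text
    nodes can continue the numbering. (Sketch only; the deleted
    module's version took no soup argument and targeted Python 2.)
    """
    i = -1  # handles the all-whitespace case, where the loop never runs
    for i, word in enumerate(string.split()):
        span = soup.new_tag("span", id="word-" + str(start + i))
        span.string = word
        string.insert_before(span)
        string.insert_before(" ")
    string.replace_with("")
    return start + i + 1


soup = BeautifulSoup("<p>some OCR page text</p>", "html.parser")
total = 0
# Materialize the text nodes first: mutating the tree while iterating
# soup.strings directly would skip elements.
for node in [s for s in soup.strings if s.strip()]:
    total = spanify(soup, node, total)

print(total)                                      # 4 words indexed
print(soup.find("span", {"id": "word-1"}).text)   # "OCR"
```

Note that the deleted `HtmlText.__init__` takes the same precaution, capturing `self.elem.strings` in a list before calling `spanify`.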

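Similarly, `get_pages` illustrates an open-ended pagination pattern: when no `end` is given, it draws page numbers from `itertools.count` and stops at the first page that comes back `None` via `takewhile`. A self-contained Python 3 sketch of that pattern follows; the `fetch` helper is a hypothetical stand-in for `get_page` (no network access), not code from the repository.

```python
from itertools import count, takewhile


def fetch(title, page):
    """Stand-in for get_page: pretend the work has 3 pages, then nothing."""
    return "text of %s p.%d" % (title, page) if page <= 3 else None


def get_pages(title, begin=1, end=None):
    if end:
        # Bounded case: a plain generator over the requested range.
        return (fetch(title, i) for i in range(begin, end + 1))
    # Open-ended case: keep fetching until the first missing page.
    return takewhile(lambda x: x is not None,
                     (fetch(title, i) for i in count(begin)))


print(list(get_pages("Demo")))
# ['text of Demo p.1', 'text of Demo p.2', 'text of Demo p.3']
```

Because both branches return lazy generators, no page is requested until the caller actually iterates, which keeps the number of HTTP round trips to exactly the pages consumed (plus the one failed probe that terminates the open-ended case).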