diff options
| -rw-r--r-- | web/static/css/style.css | 17 | ||||
| -rw-r--r-- | web/templates/layout.html | 2 | ||||
| -rw-r--r-- | web/utils.py | 8 | ||||
| -rw-r--r-- | wikisource.py | 57 |
4 files changed, 75 insertions, 9 deletions
diff --git a/web/static/css/style.css b/web/static/css/style.css index 3005bc5..b293da8 100644 --- a/web/static/css/style.css +++ b/web/static/css/style.css @@ -11,6 +11,23 @@ span:hover { margin-right: 1em; } +#text { + width: 460px; + float: left; +} + +.pagetext p { + text-align: justify; + -moz-hyphens: auto; + margin: 0; + text-indent: 1.5em; +} + +.pagetext { + padding: 3.5em 2em; + font-size: 18px; + line-height: 180%; + #texte-non-corrige { margin-top:0cm; width: 10cm; diff --git a/web/templates/layout.html b/web/templates/layout.html index ff4077d..2fef4d3 100644 --- a/web/templates/layout.html +++ b/web/templates/layout.html @@ -1,5 +1,5 @@ <!doctype html> -<html lang="en"> +<html lang="fr"> <head> <meta charset="utf-8"> <script src="static/js/jquery.js"></script> diff --git a/web/utils.py b/web/utils.py index 7e20858..e6f4309 100644 --- a/web/utils.py +++ b/web/utils.py @@ -1,14 +1,14 @@ import djvu_utils as du import sys import string_utils as su -from wikisource import get_page +from wikisource import get_page2 + def gen_html(book, page_number): doc = du.get_document("../" + book) - page = doc.pages[int(page_number)-1] + page = doc.pages[int(page_number) - 1] d = du.parse_page(page) - corrected_text = get_page(book, int(page_number)) - corrected_words = su.simplify(corrected_text).split() + elem, corrected_text = get_page2(open("test.txt").read()) if d: orig_words, orig_coords = zip(*d) C = su.align(corrected_words, list(orig_words), list(orig_coords)) diff --git a/wikisource.py b/wikisource.py index 1459468..af72d34 100644 --- a/wikisource.py +++ b/wikisource.py @@ -1,12 +1,53 @@ # -*- coding: utf-8 -*- import requests import sys -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, NavigableString from itertools import takewhile, count +from types import SliceType +from string_utils import align URL = "http://fr.wikisource.org/w/index.php" +def spanify(string, start=0): + soup = BeautifulSoup() + for i, word in enumerate(string.split()): + span = soup.new_tag("span", id="word-" + str(start + i)) + span.string = word + string.insert_before(span) + string.insert_before(" ") + string.replace_with("") + return start + i + 1 + + +class HtmlText(): + + def __init__(self, elem): + self.elem = elem + start = 0 + strings = list(string for string in self.elem.strings + if string.strip()) + + for string in strings: + start = spanify(string, start) + self.length = start + + def __len__(self): + return self.length + + def __getitem__(self, key): + if type(key) is SliceType: + return [self[w] for w in range(*key.indices(self.length))] + if key >= self.length: + raise IndexError + if key < 0: + key = self.length - key + return self.elem.find("span", {"id": "word-" + str(key)}).text + + def __str__(self): + return str(self.elem) + + def get_page(title, page): params = {"action": "render", "title": "Page:" + title + "/" + str(page)} r = requests.get(URL, params=params) @@ -17,6 +58,12 @@ def get_page(title, page): return None +def get_page2(text): + soup = BeautifulSoup(text, "lxml") + elem = soup.select("div.pagetext")[0] + return HtmlText(elem), elem.text + + def get_pages(title, begin=1, end=None): if end: return (get_page(title, i) for i in xrange(begin, end + 1)) @@ -26,6 +73,8 @@ def get_pages(title, begin=1, end=None): if __name__ == "__main__": - title = sys.argv[1] - for page in get_pages(title): - print page + b = BeautifulSoup("<a>asd</a>") + c = HtmlText(b) + print type(c[0]) + print align(c, [u"asd"], None) + print c[0:1] |
