Diffstat (limited to 'utils')
-rw-r--r--  utils/__init__.py       0
-rw-r--r--  utils/djvu_utils.py    67
-rw-r--r--  utils/string_utils.py 161
-rw-r--r--  utils/wikisource.py    93
4 files changed, 321 insertions(+), 0 deletions(-)
diff --git a/utils/__init__.py b/utils/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/utils/__init__.py
diff --git a/utils/djvu_utils.py b/utils/djvu_utils.py
new file mode 100644
index 0000000..21692a1
--- /dev/null
+++ b/utils/djvu_utils.py
@@ -0,0 +1,67 @@
+import sys
+import djvu
+from djvu.decode import Context
+from itertools import chain
+import collections
+from PIL import Image
+
+def parse_page(page):
+ s = page.text.sexpr
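+    # The hidden-text s-expression typically nests like
+    #   (page x1 y1 x2 y2 (column ... (para ... (line ...
+    #       (word x1 y1 x2 y2 "text") ...))))
+    # with the actual text carried by the "word" leaves.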
+
+ def aux(s):
+ if type(s) is djvu.sexpr.ListExpression:
+        if len(s) == 0:
+            # Guard: an empty expression has no head to inspect.
+            return
+        if str(s[0].value) == "word":
+ coords = [s[i].value for i in xrange(1, 5)]
+ word = s[5].value
+ yield (word.decode("utf-8"), coords)
+ else:
+ for c in chain.from_iterable(aux(child) for child in s[5:]):
+ yield c
+ else:
+ pass
+ return aux(s) if s else None
+
+def convert_to_htmlcoord(coords, page_size):
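+    # DjVu boxes are (xmin, ymin, xmax, ymax) with the origin at the
+    # bottom-left corner; HTML uses a top-left origin, so flip the y
+    # axis with page_size (the page height).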
+ return [",".join(map(str, [c[0], page_size - c[3],
+ c[2], page_size - c[1]])) for c in coords]
+
+def get_document(djvufile):
+ c = Context()
+ document = c.new_document(djvu.decode.FileURI(djvufile))
+ document.decoding_job.wait()
+ return document
+
+def parse_book(djvubook, page=None):
+ """
+    Return the list of word/coordinate streams parsed from a djvu
+    book. If page is None, parse the whole book.
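+    page can be a 1-based page number or an iterable of page numbers,
+    e.g. parse_book("book.djvu", page=[10, 11]).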
+ """
+ document = get_document(djvubook)
+
+ if type(page) is int:
+ toparse = [document.pages[page - 1]]
+ elif isinstance(page, collections.Iterable):
+ toparse = [document.pages[p - 1] for p in page]
+ else:
+ toparse = document.pages
+
+ return [parse_page(page) for page in toparse]
+
+def image_from_book(djvubook, page):
+ document = get_document(djvubook)
+ mode = djvu.decode.RENDER_COLOR
+ djvu_pixel_format = djvu.decode.PixelFormatRgb()
+ page = document.pages[page-1]
+ page_job = page.decode(wait=True)
+ width, height = page_job.size
+ rect = (0, 0, width, height)
+ buf = page_job.render(mode, rect, rect, djvu_pixel_format)
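+    # Raw-decoder arguments (stride 0, orientation -1): PIL reads the
+    # rendered buffer bottom-up.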
+ return Image.frombuffer("RGB", (width, height), buf, 'raw', 'RGB', 0, -1)
+
+if __name__ == "__main__":
+    book = parse_book(sys.argv[1], page=[10, 11])
+ im = image_from_book(sys.argv[1], 11)
+ im.save("test.jpeg")
diff --git a/utils/string_utils.py b/utils/string_utils.py
new file mode 100644
index 0000000..b6c8ce0
--- /dev/null
+++ b/utils/string_utils.py
@@ -0,0 +1,161 @@
+# -*- coding: utf-8 -*-
+from Levenshtein import distance as levenshtein
+import re
+import itertools
+
+def simplify(text):
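+    """Replace characters that the proofread Wikisource text renders
+    differently from the OCR layer (curly apostrophes, ellipses,
+    footnote arrows, accented capitals...) with plainer equivalents,
+    so that Levenshtein comparisons are not skewed."""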
+ mapp = [(u"’", u"'"), (u"↑", u"."), (u"…", u"..."), (u"É", u"E"),
+ (u"À", u"A"), (u"Ô", u"O"), (u"—", u"-")]
+
+ for a, b in mapp:
+ text = text.replace(a, b)
+
+ return text
+
+def cut(word, left, right):
+ """Return pair of strings (p + "-", s) such that p+s == word and
+ L(p + "-", left) + L(s, right) is minimal, where L is the levenshtein
+ distance.
+
+ Implementation is suboptimal since the computation of the Levenshtein
+ distances will involve comparing the same segments repeatedly.
+
+    TODO: handle the case when word contains a hyphen (e.g. c'est-à-dire)
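+
+    Illustrative example: cut("everything", "every-", "thing") should
+    return ("every-", "thing"), since both parts then match their
+    context exactly.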
+ """
+
+ def aux(i):
+ leftw, rightw = word[:i] + "-", word[i:]
+ return (leftw, rightw,
+ levenshtein(leftw, left) + levenshtein(rightw, right))
+
+ l = [aux(i) for i in xrange(len(word) + 1)]
+ return min(l, key=lambda x: x[2])[:2]
+
+def join_ocr_words(l, c):
+    """Join OCR words, dropping a trailing hyphen when the coordinates
+    in c indicate a line break between the last two words."""
+    m = list(l)
+    if len(m) >= 2 and c[-2][2] > c[-1][0] and (not m[-2][-1].isalnum()):
+        # The previous word ends to the right of where the last one
+        # starts: the pair straddles a line break, so drop the hyphen.
+        m[-2] = m[-2][:-1]
+    return "".join(m)
+
+def join_words(l):
+ return "".join(l)
+
+def align(l1, l2, c2):
+    """Compute the optimal alignment between two lists of words,
+    à la Needleman-Wunsch.
+
+    The function returns a (score, alignment) pair. An alignment is
+    simply a list of lists of length len(l1) giving, for each word in
+    l1, the list of indices in l2 it maps to (the list is empty if the
+    word maps to nothing).
+
+    Note that if a list has size > 1, the word in l1 maps to a sequence
+    of words in l2. Conversely, consecutive words in l1 can map to
+    the same word in l2.
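+
+    Illustrative example (hypothetical inputs): with
+    l1 = ["avant-propos"] and l2 = ["avant-", "propos"], the expected
+    alignment is [[0, 1]]: the single proofread word maps to both OCR
+    fragments.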
+ """
+
+ # Throughout the function, l1 is to be thought of as the proofread text,
+ # and l2 as the OCR text. The deletion costs are not symmetric: removing
+ # junk from the OCR is frequent while removing a word from the proofread
+ # text should be rare.
+ del_cost1 = 50
+ def del_cost2(w):
+        return 1 + 3 * len([c for c in w if c.isalnum()])
+ w = 3 # multiplicative cost factor for the Levenshtein distance
+
+ n, m = len(l1), len(l2)
+ # a is the (score, alignment) matrix. a[i][j] is the (score, alignment)
+ # pair of the first i words of l1 to the first j words of l2
+ a = [[(0, [])] * (m + 1) for i in xrange(n + 1)]
+
+ for j in xrange(1, m + 1):
+ a[0][j] = j, []
+
+ for i in xrange(1, n + 1):
+ a[i][0] = i * del_cost1, [[]] * i
+
+ for j in xrange(1, m + 1):
+
+ s, b = a[i-1][j-1]
+ d = levenshtein(l1[i-1], l2[j-1])
+ min_s, min_b = s + w * d, b + [[j-1]]
+
+ s, b = a[i-1][j]
+ if s + del_cost1 < min_s:
+ min_s, min_b = s + del_cost1, b + [[]]
+
+ s, b = a[i][j-1]
+ if s + del_cost2(l2[j-1]) < min_s:
+ min_s, min_b = s + del_cost2(l2[j-1]), b
+
+ for k in xrange(1, 8):
+ for l in xrange(1, 5):
+ if k + l <= 2:
+ continue
+                if k + l > 7:
+ break
+ if j < l or i < k:
+ break
+ s, b = a[i-k][j-l]
+ d = levenshtein(join_words(l1[i-k:i]),
+ join_ocr_words(l2[j-l:j], c2[j-l:j]))
+ if s + w * d < min_s:
+ temp = [[j-1]] if l == 1 else [range(j-l, j)]
+ min_s, min_b = s + w * d, b + temp * k
+
+ a[i][j] = min_s, min_b
+
+ return a[n][m]
+
+def print_alignment(l1, l2, c2, alignment):
+    """Given two lists of words and an alignment (as defined in
+    :func:`align`), print the two lists side by side, aligned.
+ """
+ prev = 0
+ for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]):
+ word = " ".join([a[0] for a in g])
+ if not index:
+ print u"{0:>25} | ".format(word)
+ else:
+ begin, end = index[0], index[-1]
+ for i in range(prev, begin-1):
+ print u"{0:>25} | {1}".format("", l2[i+1])
+ prev = end
+
+ if end > begin:
+ print u"{0:>25} | {1:<25} (M)".format(word,
+ join_ocr_words(l2[begin:end+1], c2[begin:end+1]))
+ else:
+ print u"{0:>25} | {1:<25}".format(word, l2[begin])
+
+ if not l1:
+ for word in l2:
+ print u"{0:>25} | {1}".format("", word)
+
+def invert_align(alignment, n):
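+    """Invert an alignment: return, for each of the n words of l2,
+    the list of indices in l1 that map to it. For instance,
+    invert_align([[0], [0], [1]], 2) gives [[0, 1], [2]]."""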
+ l = [[] for _ in range(n)]
+ for i, e in enumerate(alignment):
+ for a in e:
+ l[a].append(i)
+ return l
+
+def alignment_to_coord(l1, alignment):
+    """Group the corrected words of l1 by the l2 indices they map to
+    (alignment is a list of size len(l1), as returned by :func:`align`)
+    and return a list of [joined_words, indices] pairs.
+
+    TODO: find a better way to recover the box coordinates from the
+    indices.
+    """
+    r = []
+    for index, g in itertools.groupby(zip(l1, alignment), lambda x: x[1]):
+        word = " ".join([a[0] for a in g])
+        r.append([word, index])
+    return r
diff --git a/utils/wikisource.py b/utils/wikisource.py
new file mode 100644
index 0000000..589c88e
--- /dev/null
+++ b/utils/wikisource.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+import requests
+import sys
+from bs4 import BeautifulSoup, NavigableString
+from itertools import takewhile, count
+import string_utils as su
+import djvu_utils as du
+
+URL = "http://fr.wikisource.org/w/index.php"
+
+
+def spanify(string, start=0):
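+    """Wrap each word of a NavigableString in a numbered <span> and
+    splice the spans into the tree in its place: with start=0,
+    "foo bar" becomes <span id="word-0">foo</span>
+    <span id="word-1">bar</span>. Return the index after the last
+    word."""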
+    soup = BeautifulSoup("", "lxml")
+ for i, word in enumerate(string.split()):
+ span = soup.new_tag("span", id="word-" + str(start + i))
+ span.string = word
+ string.insert_before(span)
+ string.insert_before(" ")
+ string.replace_with("")
+ return start + i + 1
+
+
+class HtmlText(object):
+
+ def __init__(self, elem):
+ self.elem = elem
+ start = 0
+ strings = list(string for string in self.elem.strings
+ if string.strip())
+
+ for string in strings:
+ start = spanify(string, start)
+ self.length = start
+
+ def __len__(self):
+ return self.length
+
+ def __getitem__(self, key):
+        if isinstance(key, slice):
+ return [self[w] for w in range(*key.indices(self.length))]
+ if key >= self.length:
+ raise IndexError
+        if key < 0:
+            key += self.length
+ return self.elem.find("span", {"id": "word-" + str(key)}).text
+
+ def __str__(self):
+ return str(self.elem)
+
+
+def get_page(title, page):
+ params = {"action": "render", "title": "Page:" + title + "/" + str(page)}
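+    # action=render makes index.php return only the parsed body of the
+    # wiki page, without the surrounding MediaWiki skin.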
+ r = requests.get(URL, params=params)
+ if r.status_code == requests.codes.ok:
+ soup = BeautifulSoup(r.text, "lxml")
+ return soup.select("div.pagetext")[0].text
+ else:
+ return None
+
+
+def get_page2(text):
+ soup = BeautifulSoup(text, "lxml")
+ elem = soup.select("div.pagetext")[0]
+ return HtmlText(elem), elem.text
+
+
+def get_pages(title, begin=1, end=None):
+ if end:
+ return (get_page(title, i) for i in xrange(begin, end + 1))
+ else:
+ return takewhile(lambda x: x is not None,
+ (get_page(title, i) for i in count(begin)))
+
+def gen_html(book, page_number):
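+    """Align the OCR text layer of one djvu page with the proofread
+    text fetched from Wikisource; return the HTML-ready word boxes,
+    the OCR words, the corrected words and the alignment."""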
+ doc = du.get_document(book)
+ page = doc.pages[int(page_number)-1]
+ d = du.parse_page(page)
+ corrected_text = get_page(book, int(page_number))
+ corrected_words = su.simplify(corrected_text).split()
+ if d:
+ orig_words, orig_coords = zip(*d)
+ C = su.align(corrected_words, list(orig_words), list(orig_coords))
+ corr_words = corrected_text.split()
+ orig_coords_html = du.convert_to_htmlcoord(orig_coords, page.size[1])
+ return orig_coords_html, orig_words, corr_words, C[1]
+
+if __name__ == "__main__":
+    b = BeautifulSoup("<a>asd</a>", "lxml")
+ c = HtmlText(b)
+ print type(c[0])
+ print su.align(c, [u"asd"], None)
+ print c[0:1]