diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2014-09-08 22:18:02 -0400 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2014-09-08 22:18:02 -0400 |
| commit | d344cfc46b65763f9e06b3f09c428573614e8fbd (patch) | |
| tree | 1e023945c31b284c81560e60337f592a30093138 /utils | |
| parent | 2e1b9a4df2724b2ac61f39dcb3d9cbdf3a0ee306 (diff) | |
| download | ocr-layer-curation-d344cfc46b65763f9e06b3f09c428573614e8fbd.tar.gz | |
get rid of convert_to_htmlcoord for now
Diffstat (limited to 'utils')
| -rw-r--r-- | utils/djvu_utils.py | 14 |
1 files changed, 6 insertions, 8 deletions
```diff
diff --git a/utils/djvu_utils.py b/utils/djvu_utils.py
index 21692a1..a8d40d3 100644
--- a/utils/djvu_utils.py
+++ b/utils/djvu_utils.py
@@ -9,24 +9,22 @@ from PIL import Image
 def parse_page(page):
     s = page.text.sexpr
 
-    def aux(s):
+    def aux(s, page_size):
         if type(s) is djvu.sexpr.ListExpression:
             if len(s) == 0:
                 pass
             if str(s[0].value) == "word":
-                coords = [s[i].value for i in xrange(1, 5)]
+                c = [s[i].value for i in xrange(1, 5)]
+                coords = ",".join(map(str, [c[0], page_size -c[3],
+                                            c[2], page_size - c[1]]))
                 word = s[5].value
                 yield (word.decode("utf-8"), coords)
             else:
-                for c in chain.from_iterable(aux(child) for child in s[5:]):
+                for c in chain.from_iterable(aux(child, page_size) for child in s[5:]):
                     yield c
         else:
             pass
-    return aux(s) if s else None
-
-def convert_to_htmlcoord(coords, page_size):
-    return [",".join(map(str, [c[0], page_size - c[3],
-                               c[2], page_size - c[1]])) for c in coords]
+    return aux(s, page.size[1]) if s else None
 
 def get_document(djvufile):
     c = Context()
```
