diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-17 19:01:24 -0400 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-17 19:01:24 -0400 |
| commit | 1d53df7c99126679d391a1efc96b30aa3848b4d3 (patch) | |
| tree | 2d2977bac09ef557b12faa8f8a3a3707f37ec128 | |
| parent | f633ea807ef9a83f84f41767e71fad71656a4439 (diff) | |
| download | ocr-layer-curation-1d53df7c99126679d391a1efc96b30aa3848b4d3.tar.gz | |
add function for converting alignment to sexp
| -rw-r--r-- | parsedjvutext.py | 9 | ||||
| -rw-r--r-- | string_utils.py | 18 |
2 files changed, 24 insertions, 3 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py index 3d4ee96..773a1d4 100644 --- a/parsedjvutext.py +++ b/parsedjvutext.py @@ -33,11 +33,14 @@ def parse_wordline(line): coords = map(int, line[1:5]) return word, coords -def parse_page_sexp(djvubook, pagenumber): +def page_sexp(djvubook, pagenumber): args = ["djvused", "-e", "select {0};print-txt".format(pagenumber), djvubook] - page = [parse_wordline(line) for line in \ - subprocess.check_output(args).split("\n") if "word" in line] + return subprocess.check_output(args).split("\n") + +def parse_page_sexp(djvubook, pagenumber): + page = [parse_wordline(line) for line in page_sexp(djvubook, pagenumber) \ + if "word" in line] return {"words": [a for a, b in page], "coords": [b for a, b in page]} def parse_book_sexp(djvubook): diff --git a/string_utils.py b/string_utils.py index 1b94ce3..3e0706b 100644 --- a/string_utils.py +++ b/string_utils.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- from Levenshtein import distance as levenshtein +import re def simplify(text): mapp = [(u"’", u"'"), (u"↑", u"."), (u"…", u"..."), (u"É", u"E"), @@ -171,3 +172,20 @@ def print_alignment(l1, l2, c2, alignment): for word in l2: print u"{0:>25} | {1}".format("", word) + +def alignment_to_sexp(alignment, sexp, l2): + alignment = iter(alignment) + for line in sexp: + if "word" not in line: + print line + else: + index = alignment.next() + if index == -1: + break + else: + re.sub("(?P<begin>\d+ \d+ \d+ \d+\s) \w+(?P<end>\)+$)", + "\g<begin>{0}\g<end>".format( + " ".join([l2[i] for i in list(index)])), + line) + line.encode('string-escape') + print line |
