diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2014-03-01 15:28:42 -0500 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2014-03-01 15:30:27 -0500 |
| commit | c5734b6b776727959f1b485651f1ddc7c8121a85 (patch) | |
| tree | 0d81bc762052bed11fd08e811ef26fdccf974cdd | |
| parent | 424aa29f600bc17c8391a3802206385962648519 (diff) | |
| download | ocr-layer-curation-c5734b6b776727959f1b485651f1ddc7c8121a85.tar.gz | |
preliminary alignment_to_sexp
* right now it just outputs a list of (corrected_word, coords) pairs
* still need to generate a sexp file if we want to reinsert the corrected words into the djvu
* bounding boxes are not smart at all right now (no merges or splits)
| -rw-r--r-- | string_utils.py | 33 |
1 file changed, 13 insertions, 20 deletions
diff --git a/string_utils.py b/string_utils.py index 186f2eb..a8a38c0 100644 --- a/string_utils.py +++ b/string_utils.py @@ -156,25 +156,18 @@ def print_alignment(l1, l2, c2, alignment): for word in l2: print u"{0:>25} | {1}".format("", word) - -def alignment_to_sexp(alignment, sexp, l2): - alignment = iter(alignment) - for line in sexp: - if "word" not in line: - print line +def alignment_to_sexp(l1, l2, c2, alignment): + r = [] + prev = 0 + for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]): + word = " ".join([a[0] for a in g]) + if not index: + r.append([word, []]) else: - index = alignment.next() - if index == -1: - break + begin, end = index[0], index[-1] + if end > begin: + #need to find a better way to get the box coordinates + r.append([word, c2[begin]]) else: - if type(index) == tuple: - word = " ".join([l2[i] for i in list(index)]) - else: - try: - word = l2[index] - except IndexError: - print index - word = word.encode("utf-8").encode("string-escape") - re.sub("(?P<begin>\d+ \d+ \d+ \d+\s) \w+(?P<end>\)+$)", - "\g<begin>{0}\g<end>".format(word), line) - print line + r.append([word, c2[begin]]) + return r |
