From c5734b6b776727959f1b485651f1ddc7c8121a85 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Sat, 1 Mar 2014 15:28:42 -0500 Subject: preliminary alignment_to_sexp * right now just output a list of pairs (corrected_word, coords) * need to generate a sexp file if we want to reinsert into the djvu * bounding boxes are not smart at all right now (no merge or splits) --- string_utils.py | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'string_utils.py') diff --git a/string_utils.py b/string_utils.py index 186f2eb..a8a38c0 100644 --- a/string_utils.py +++ b/string_utils.py @@ -156,25 +156,18 @@ def print_alignment(l1, l2, c2, alignment): for word in l2: print u"{0:>25} | {1}".format("", word) - -def alignment_to_sexp(alignment, sexp, l2): - alignment = iter(alignment) - for line in sexp: - if "word" not in line: - print line +def alignment_to_sexp(l1, l2, c2, alignment): + r = [] + prev = 0 + for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]): + word = " ".join([a[0] for a in g]) + if not index: + r.append([word, []]) else: - index = alignment.next() - if index == -1: - break + begin, end = index[0], index[-1] + if end > begin: + #need to find a better way to get the box coordinates + r.append([word, c2[begin]]) else: - if type(index) == tuple: - word = " ".join([l2[i] for i in list(index)]) - else: - try: - word = l2[index] - except IndexError: - print index - word = word.encode("utf-8").encode("string-escape") - re.sub("(?P\d+ \d+ \d+ \d+\s) \w+(?P\)+$)", - "\g{0}\g".format(word), line) - print line + r.append([word, c2[begin]]) + return r -- cgit v1.2.3-70-g09d2