diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-17 19:02:32 -0400 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-17 19:02:32 -0400 |
| commit | 4e99558cb00144d045fe1fc00793b4b16f0e6fab (patch) | |
| tree | 0a0042b772e63f993b509990a8a0e0fe02f28561 | |
| parent | 1d53df7c99126679d391a1efc96b30aa3848b4d3 (diff) | |
| download | ocr-layer-curation-4e99558cb00144d045fe1fc00793b4b16f0e6fab.tar.gz | |
simplify Thibaut's code
| -rw-r--r-- | string_utils.py | 17 |
1 files changed, 3 insertions, 14 deletions
```diff
diff --git a/string_utils.py b/string_utils.py
index 3e0706b..c4439da 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from Levenshtein import distance as levenshtein
 import re
+import itertools
 
 def simplify(text):
     mapp = [(u"’", u"'"), (u"↑", u"."), (u"…", u"..."), (u"É", u"E"),
@@ -134,21 +135,9 @@ def print_alignment(l1, l2, c2, alignment):
     """Given two list of words and an alignment (as defined in :func:`align`)
     print the two list of words side-by-side and aligned.
     """
-
-    # collapse sequence of consecutive words in l1 which map to the same word
-    # in l2
-    def aux((l, m), (word, index)):
-        if index == m[-1]:
-            l[-1] += " " + word
-        else:
-            l.append(word)
-            m.append(index)
-        return l, m
-    if l1:
-        l1, alignment = reduce(aux, zip(l1, alignment), ([""], [alignment[0]]))
-
     prev = 0
-    for index, word in zip(alignment, l1):
+    for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]):
+        word = " ".join([a[0] for a in g])
         if index == -1:
             print u"{0:>25} | ".format(word)
         else:
```
