From f7168bf05ec4b976dd74b357bff7ff54d0693f13 Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Wed, 21 Aug 2013 15:40:59 -0400 Subject: Simplify datastructure An alignment is now a list of list. Empty list means word maps to nothing, and len(list) greater than one means a word maps to multiple words. This removes the artificial distinction between index and tuple. --- string_utils.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/string_utils.py b/string_utils.py index 12d22b8..61320ac 100644 --- a/string_utils.py +++ b/string_utils.py @@ -70,11 +70,11 @@ def align(l1, l2, c2): à la Needleman-Wunsch. The function returns a (score, alignment) pair. An alignment is simply - a list of size len(l1) giving for each word in l1, the index of the word in - l2 it maps to (or -1 if the word maps to nothing). + a list of list of size len(l1) giving for each word in l1, the list of + indices in l2 it maps to (the list is empty if the word maps to nothing). - Note that we also allow the index to be a tuple when a word in l1 maps to - a sequence of words in l2. Conversly, consecutive words in l1 can map to + Note that if the list is of size>1, the word in l1 will map to a sequence + of words in l2. Conversly, consecutive words in l1 can map to the same word in l2. """ @@ -96,17 +96,17 @@ def align(l1, l2, c2): a[0][j] = j, [] for i in xrange(1, n + 1): - a[i][0] = i * del_cost1, [-1] * i + a[i][0] = i * del_cost1, [] * i for j in xrange(1, m + 1): s, b = a[i-1][j-1] d = levenshtein(l1[i-1], l2[j-1]) - min_s, min_b = s + w * d, b + [j-1] + min_s, min_b = s + w * d, b + [[j-1]] s, b = a[i-1][j] if s + del_cost1 < min_s: - min_s, min_b = s + del_cost1, b + [-1] + min_s, min_b = s + del_cost1, b + [[]] s, b = a[i][j-1] if s + del_cost2(l2[j-1]) < min_s: @@ -124,7 +124,7 @@ def align(l1, l2, c2): d = levenshtein(join_words(l1[i-k:i]), join_ocr_words(l2[j-l:j], c2[j-l:j])) if s + w * d < min_s: - temp = [j-1] if l == 1 else [tuple(range(j-l, j))] + temp = [[j-1]] if l == 1 else [range(j-l, j)] min_s, min_b = s + w * d, b + temp * k a[i][j] = min_s, min_b @@ -138,13 +138,10 @@ def print_alignment(l1, l2, c2, alignment): prev = 0 for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]): word = " ".join([a[0] for a in g]) - if index == -1: + if not index: print u"{0:>25} | ".format(word) else: - if type(index) == tuple: - begin, end = index[0], index[-1] - else: - begin, end = index, index + begin, end = index[0], index[-1] while prev < begin - 1: prev += 1 -- cgit v1.2.3-70-g09d2 From d295c767717874045aab27d30759fd3ec7ed49fa Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: Wed, 21 Aug 2013 16:33:14 -0400 Subject: small simplifaction --- string_utils.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/string_utils.py b/string_utils.py index 61320ac..186f2eb 100644 --- a/string_utils.py +++ b/string_utils.py @@ -96,7 +96,7 @@ def align(l1, l2, c2): a[0][j] = j, [] for i in xrange(1, n + 1): - a[i][0] = i * del_cost1, [] * i + a[i][0] = i * del_cost1, [[]] * i for j in xrange(1, m + 1): @@ -142,10 +142,8 @@ def print_alignment(l1, l2, c2, alignment): print u"{0:>25} | ".format(word) else: begin, end = index[0], index[-1] - - while prev < begin - 1: - prev += 1 - print u"{0:>25} | {1}".format("", l2[prev]) + for i in range(prev, begin-1): + print u"{0:>25} | {1}".format("", l2[i+1]) prev = end if end > begin: -- cgit v1.2.3-70-g09d2