diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2013-08-05 22:29:24 +0200 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2013-08-05 22:45:45 +0200 |
| commit | b51137379fab2d5579e0caeada5389c682ec3cc5 (patch) | |
| tree | 49e1c6bab1a69143e9aa290e48e5849e7b2b6756 /string_utils.py | |
| parent | 80133c8d2d0a7334e8f53a11eaa48a8d7b70c1da (diff) | |
| download | ocr-layer-curation-b51137379fab2d5579e0caeada5389c682ec3cc5.tar.gz | |
Use a Needleman-Wunsch type algorithm for text alignment
Diffstat (limited to 'string_utils.py')
| -rw-r--r-- | string_utils.py | 100 |
1 files changed, 98 insertions, 2 deletions
diff --git a/string_utils.py b/string_utils.py index 8b7a3a3..c441667 100644 --- a/string_utils.py +++ b/string_utils.py @@ -32,13 +32,13 @@ def levenshtein(word1, word2): def cut(word, left, right): """Return pair of strings (p + "-", s) such that p+s == word and - L(p + "-", left) + L(s, right) is minimal, where L is the levenshtein + L(p + "-", left) + L(s, right) is minimal, where L is the levenshtein distance. Implementation is suboptimal since the computation of the Levenshtein distances will involve comparing the same segments repeatedly. - TODO: handle the case when word contains an hyphen (e.g. c'est-à-dire) + TODO: handle the case when word contains an hyphen (e.g. c'est-a-dire) """ def aux(i): @@ -73,3 +73,99 @@ def printDiff(C, X, Y, i, j): elif i > 0 and (j == 0 or C[i][j-1] < C[i-1][j]): printDiff(C, X, Y, i-1, j) print "- " + X[i-1] + +def join_words(l): + if len(l) == 0: + return "" + elif len(l) == 1: + return l[0] + else: + if l[-2][-1] == "-": + l[-2] = l[-2][:-1] + return "".join(l) + +def align(l1, l2): + """Compute the optimal alignment between two list of words + à la Needleman-Wunsch. + + The function returns a (score, alignment) pair. An alignment is simply + a list of size len(l1) giving for each word in l1, the index of the word in + l2 it maps to (or -1 if the word maps to nothing). + + Note that we also allow the index to be a tuple when a word in l1 maps to + a sequence of words in l2. Conversly, consecutive words in l1 can map to + the same word in l2. + """ + + # Throughout the function, l1 is to be thought of as the proofread text, + # and l2 as the OCR text. The deletion costs are not symmetric: removing + # junk from the OCR is frequent while removing a word from the proofread + # text should be rare. + del_cost1 = 20 + del_cost2 = 3 + w = 2 # multiplicative cost factor for the Levenshtein distance + + n, m = len(l1), len(l2) + # a is the (score, alignment) matrix. a[i][j] is the (score, alignment) + # pair of the first i words of l1 to the first j words of l2 + a = [[(0, [])] * (m + 1) for i in xrange(n + 1)] + + for j in xrange(1, m + 1): + a[0][j] = del_cost2 * j, [] + + for i in xrange(1, n + 1): + a[i][0] = i * del_cost1, [-1] * i + for j in xrange(1, m + 1): + s1, a1 = a[i-1][j-1] + d = levenshtein(l1[i-1], l2[j-1]) + s2, a2 = a[i-1][j] + s3, a3 = a[i][j-1] + l = [(s1 + w * d, a1 + [j-1]), + (s2 + del_cost1, a2 + [-1]), + (s3 + del_cost2, a3)] + if (j >= 2): + s4, a4 = a[i-1][j-2] + d2 = levenshtein(l1[i-1], join_words(l2[j-2:j])) + l += [(s4 + w * d2, a4 + [(j-2, j-1)])] + if (i >= 2): + s5, a5 = a[i-2][j-1] + d3 = levenshtein(join_words(l1[i-2:i]), l2[j-1]) + l += [(s5 + w * d3, a5 + [j-1, j-1])] + + a[i][j] = min(l, key=lambda x: x[0]) + + return a[n][m] + +def print_alignment(l1, l2, alignment): + """Given two list of words and an alignment (as defined in :func:`align`) + print the two list of words side-by-side and aligned. + """ + + # collapse sequence of consecutive words in l1 which map to the same word + # in l2 + def aux((l, m), (word, index)): + if index == m[-1]: + l[-1] += " " + word + else: + l.append(word) + m.append(index) + return l, m + l1, alignment = reduce(aux, zip(l1, alignment), ([""], [alignment[0]])) + + prev = 0 + for (i, word) in enumerate(l1): + if alignment[i] == -1: + print u"{0:>25} | ".format(word) + else: + if type(alignment[i]) == tuple: + begin, end = alignment[i][0], alignment[i][-1] + merge = True + else: + begin, end = alignment[i], alignment[i] + merge = False + while prev < begin - 1: + prev += 1 + print u"{0:>25} | {1}".format("", l2[prev]) + print u"{0:>25} | {1:<25} {2}".format(word, join_words(l2[begin:end+1]), + "(M)" if merge else "") + prev = end |
