# -*- coding: utf-8 -*- def levenshtein(word1, word2): """Return triplet of number of (substitutions, insertions, deletions) to transform word1 into word2. Dynamic programming implementation storing only two rows of the full matrix at a time. TODO: write this in a Cython module. """ s, t = len(word1), len(word2) if word1 == word2: return 0 if not min(s, t): return max(s, t) v0 = [i for i in xrange(t + 1)] # v0[i] is d(0, i) v1 = [0] * (t + 1) for i in xrange(s): # v0[j] = d(i, j) for all j and we compute v1[j] = d(i+1, j) for all j v1[0] = i + 1 for j in xrange(t): diff = int(word1[i] != word2[j]) v1[j+1] = min(v1[j] + 1, v0[j+1] + 1, v0[j] + diff) v0 = list(v1) # copy v1 into v0 for the next iteration return v1[t] def cut(word, left, right): """Return pair of strings (p + "-", s) such that p+s == word and L(p + "-", left) + L(s, right) is minimal, where L is the levenshtein distance. Implementation is suboptimal since the computation of the Levenshtein distances will involve comparing the same segments repeatedly. TODO: handle the case when word contains an hyphen (e.g. c'est-a-dire) """ def aux(i): leftw, rightw = word[:i] + "-", word[i:] return (leftw, rightw, levenshtein(leftw, left) + levenshtein(rightw, right)) l = [aux(i) for i in xrange(len(word) + 1)] return min(l, key=lambda x: x[2])[:2] def LCS(X, Y): m = len(X) n = len(Y) # An (m+1) times (n+1) matrix C = [[0] * (n+1) for i in range(m+1)] for i in range(1, m+1): for j in range(1, n+1): if X[i-1] == Y[j-1]: C[i][j] = C[i-1][j-1] + 1 else: C[i][j] = max(C[i][j-1], C[i-1][j]) return C def printDiff(C, X, Y, i, j): if i > 0 and j > 0 and X[i-1] == Y[j-1]: printDiff(C, X, Y, i-1, j-1) print " " + X[i-1] else: if j > 0 and (i == 0 or C[i][j-1] >= C[i-1][j]): printDiff(C, X, Y, i, j-1) print "+ " + Y[j-1] elif i > 0 and (j == 0 or C[i][j-1] < C[i-1][j]): printDiff(C, X, Y, i-1, j) print "- " + X[i-1] def join_words(l): if len(l) == 0: return "" elif len(l) == 1: return l[0] else: if l[-2][-1] == "-": l[-2] = l[-2][:-1] return "".join(l) def align(l1, l2): """Compute the optimal alignment between two list of words à la Needleman-Wunsch. The function returns a (score, alignment) pair. An alignment is simply a list of size len(l1) giving for each word in l1, the index of the word in l2 it maps to (or -1 if the word maps to nothing). Note that we also allow the index to be a tuple when a word in l1 maps to a sequence of words in l2. Conversly, consecutive words in l1 can map to the same word in l2. """ # Throughout the function, l1 is to be thought of as the proofread text, # and l2 as the OCR text. The deletion costs are not symmetric: removing # junk from the OCR is frequent while removing a word from the proofread # text should be rare. del_cost1 = 20 del_cost2 = 3 w = 2 # multiplicative cost factor for the Levenshtein distance n, m = len(l1), len(l2) # a is the (score, alignment) matrix. a[i][j] is the (score, alignment) # pair of the first i words of l1 to the first j words of l2 a = [[(0, [])] * (m + 1) for i in xrange(n + 1)] for j in xrange(1, m + 1): a[0][j] = del_cost2 * j, [] for i in xrange(1, n + 1): a[i][0] = i * del_cost1, [-1] * i for j in xrange(1, m + 1): s1, a1 = a[i-1][j-1] d = levenshtein(l1[i-1], l2[j-1]) s2, a2 = a[i-1][j] s3, a3 = a[i][j-1] l = [(s1 + w * d, a1 + [j-1]), (s2 + del_cost1, a2 + [-1]), (s3 + del_cost2, a3)] if (j >= 2): s4, a4 = a[i-1][j-2] d2 = levenshtein(l1[i-1], join_words(l2[j-2:j])) l += [(s4 + w * d2, a4 + [(j-2, j-1)])] if (i >= 2): s5, a5 = a[i-2][j-1] d3 = levenshtein(join_words(l1[i-2:i]), l2[j-1]) l += [(s5 + w * d3, a5 + [j-1, j-1])] a[i][j] = min(l, key=lambda x: x[0]) return a[n][m] def print_alignment(l1, l2, alignment): """Given two list of words and an alignment (as defined in :func:`align`) print the two list of words side-by-side and aligned. """ # collapse sequence of consecutive words in l1 which map to the same word # in l2 def aux((l, m), (word, index)): if index == m[-1]: l[-1] += " " + word else: l.append(word) m.append(index) return l, m l1, alignment = reduce(aux, zip(l1, alignment), ([""], [alignment[0]])) prev = 0 for (i, word) in enumerate(l1): if alignment[i] == -1: print u"{0:>25} | ".format(word) else: if type(alignment[i]) == tuple: begin, end = alignment[i][0], alignment[i][-1] merge = True else: begin, end = alignment[i], alignment[i] merge = False while prev < begin - 1: prev += 1 print u"{0:>25} | {1}".format("", l2[prev]) print u"{0:>25} | {1:<25} {2}".format(word, join_words(l2[begin:end+1]), "(M)" if merge else "") prev = end