diff options
| -rw-r--r-- | compare.py | 15 | ||||
| -rw-r--r-- | lcs.py | 24 |
2 files changed, 39 insertions, 0 deletions
diff --git a/compare.py b/compare.py new file mode 100644 index 0000000..4fcacd0 --- /dev/null +++ b/compare.py @@ -0,0 +1,15 @@ +import pdb +from wikisource import get_page +from parsedjvutext import parse_book +import lcs + +wikibook = "Villiers de L'Isle-Adam - Tribulat Bonhomet, 1908.djvu" +ocrbook = "Tribulat Bonhomet.xml" + +ocrbook = parse_book(ocrbook) + +n = 14 +l1 = ocrbook['words'][n] +l2 = get_page(wikibook, n+1).split() +C = lcs.LCS(l1, l2) +lcs.printDiff(C, l1, l2, len(l1), len(l2)) @@ -0,0 +1,24 @@ +def LCS(X, Y): + m = len(X) + n = len(Y) + # An (m+1) times (n+1) matrix + C = [[0] * (n+1) for i in range(m+1)] + for i in range(1, m+1): + for j in range(1, n+1): + if X[i-1] == Y[j-1]: + C[i][j] = C[i-1][j-1] + 1 + else: + C[i][j] = max(C[i][j-1], C[i-1][j]) + return C + +def printDiff(C, X, Y, i, j): + if i > 0 and j > 0 and X[i-1] == Y[j-1]: + printDiff(C, X, Y, i-1, j-1) + print " " + X[i-1] + else: + if j > 0 and (i == 0 or C[i][j-1] >= C[i-1][j]): + printDiff(C, X, Y, i, j-1) + print "+ " + Y[j-1] + elif i > 0 and (j == 0 or C[i][j-1] < C[i-1][j]): + printDiff(C, X, Y, i-1, j) + print "- " + X[i-1] |
