aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThibaut Horel <thibaut.horel@gmail.com>2013-08-04 12:53:15 +0200
committerThibaut Horel <thibaut.horel@gmail.com>2013-08-04 12:53:15 +0200
commit8431ad45020540c771fbb1a80a63aca07fcf586a (patch)
treedae6102982348e31c385c58d145af54465e6aeec
parent331499d08be1ef32e5bb6200963ebc63500eb826 (diff)
downloadocr-layer-curation-8431ad45020540c771fbb1a80a63aca07fcf586a.tar.gz
Add some string utils functions
Levenshtein distance and word hyphenation
-rw-r--r--lcs.py24
-rw-r--r--string_utils.py73
2 files changed, 73 insertions, 24 deletions
diff --git a/lcs.py b/lcs.py
deleted file mode 100644
index c180241..0000000
--- a/lcs.py
+++ /dev/null
@@ -1,24 +0,0 @@
-def LCS(X, Y):
- m = len(X)
- n = len(Y)
- # An (m+1) times (n+1) matrix
- C = [[0] * (n+1) for i in range(m+1)]
- for i in range(1, m+1):
- for j in range(1, n+1):
- if X[i-1] == Y[j-1]:
- C[i][j] = C[i-1][j-1] + 1
- else:
- C[i][j] = max(C[i][j-1], C[i-1][j])
- return C
-
-def printDiff(C, X, Y, i, j):
- if i > 0 and j > 0 and X[i-1] == Y[j-1]:
- printDiff(C, X, Y, i-1, j-1)
- print " " + X[i-1]
- else:
- if j > 0 and (i == 0 or C[i][j-1] >= C[i-1][j]):
- printDiff(C, X, Y, i, j-1)
- print "+ " + Y[j-1]
- elif i > 0 and (j == 0 or C[i][j-1] < C[i-1][j]):
- printDiff(C, X, Y, i-1, j)
- print "- " + X[i-1]
diff --git a/string_utils.py b/string_utils.py
new file mode 100644
index 0000000..81f448f
--- /dev/null
+++ b/string_utils.py
@@ -0,0 +1,73 @@
+def levenshtein(word1, word2):
+ """Return triplet of number of (substitutions, insertions, deletions) to
+ transform word1 into word2.
+
+ Dynamic programming implementation storing only two rows of the full matrix
+ at a time.
+
+ TODO: write this in a Cython module.
+ """
+
+ s, t = len(word1), len(word2)
+ if word1 == word2:
+ return 0
+ if not min(s, t):
+ return max(s, t)
+
+ v0 = [i for i in xrange(t + 1)] # v0[i] is d(0, i)
+ v1 = [0] * (t + 1)
+ for i in xrange(s):
+ # v0[j] = d(i, j) for all j and we compute v1[j] = d(i+1, j) for all j
+ v1[0] = i + 1
+
+ for j in xrange(t):
+ diff = int(word1[i] != word2[j])
+ v1[j+1] = min(v1[j] + 1, v0[j+1] + 1, v0[j] + diff)
+
+ v0 = list(v1) # copy v1 into v0 for the next iteration
+
+ return v1[t]
+
+def cut(word, left, right):
+ """Return pair of strings (p + "-", s) such that p+s == word and
+ L(p + "-", left) + L(s, right) is minimal, where L is the levenshtein
+ distance.
+
+ Implementation is suboptimal since the computation of the Levenshtein
+ distances will involve comparing the same segments repeatedly.
+
+ TODO: handle the case when word contains an hyphen (e.g. c'est-à-dire)
+ """
+
+ def aux(i):
+ leftw, rightw = word[:i] + "-", word[i:]
+ return (leftw, rightw,
+ levenshtein(leftw, left) + levenshtein(rightw, right))
+
+ l = [aux(i) for i in xrange(len(word) + 1)]
+ return min(l, key=lambda x: x[2])[:2]
+
+def LCS(X, Y):
+ m = len(X)
+ n = len(Y)
+ # An (m+1) times (n+1) matrix
+ C = [[0] * (n+1) for i in range(m+1)]
+ for i in range(1, m+1):
+ for j in range(1, n+1):
+ if X[i-1] == Y[j-1]:
+ C[i][j] = C[i-1][j-1] + 1
+ else:
+ C[i][j] = max(C[i][j-1], C[i-1][j])
+ return C
+
+def printDiff(C, X, Y, i, j):
+ if i > 0 and j > 0 and X[i-1] == Y[j-1]:
+ printDiff(C, X, Y, i-1, j-1)
+ print " " + X[i-1]
+ else:
+ if j > 0 and (i == 0 or C[i][j-1] >= C[i-1][j]):
+ printDiff(C, X, Y, i, j-1)
+ print "+ " + Y[j-1]
+ elif i > 0 and (j == 0 or C[i][j-1] < C[i-1][j]):
+ printDiff(C, X, Y, i-1, j)
+ print "- " + X[i-1]