aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThibaut Horel <thibaut.horel@gmail.com>2013-08-06 00:27:14 +0200
committerThibaut Horel <thibaut.horel@gmail.com>2013-08-06 00:27:14 +0200
commit5213d148a9fadbd0b7425e8c08298e9e74785d9f (patch)
tree1d8d611a33f12e338712605cdbc2cb8ee54b84f9
parentb9ccbf43f87a5a38f3bfce5b5e24ca6ce95c3974 (diff)
downloadocr-layer-curation-5213d148a9fadbd0b7425e8c08298e9e74785d9f.tar.gz
Adding some comments
-rw-r--r--string_utils.py38
1 files changed, 24 insertions, 14 deletions
diff --git a/string_utils.py b/string_utils.py
index 64bd7b0..3db0a47 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -86,22 +86,32 @@ def align(l1, l2):
for i in xrange(1, n + 1):
a[i][0] = i * del_cost1, [-1] * i
+
for j in xrange(1, m + 1):
- s1, a1 = a[i-1][j-1]
+ l = [] # contains the different options
+
+ # mapping l1[i-1] to l2[j-1]
+ s, b = a[i-1][j-1]
d = levenshtein(l1[i-1], l2[j-1])
- s2, a2 = a[i-1][j]
- s3, a3 = a[i][j-1]
- l = [(s1 + w * d, a1 + [j-1]),
- (s2 + del_cost1, a2 + [-1]),
- (s3 + del_cost2, a3)]
- if (j >= 2):
- s4, a4 = a[i-1][j-2]
- d2 = levenshtein(l1[i-1], join_words(l2[j-2:j]))
- l += [(s4 + w * d2, a4 + [(j-2, j-1)])]
- if (i >= 2):
- s5, a5 = a[i-2][j-1]
- d3 = levenshtein(join_words(l1[i-2:i]), l2[j-1])
- l += [(s5 + w * d3, a5 + [j-1, j-1])]
+ l.append((s + w * d, b + [j-1]))
+
+ # deleting l1[i-1]
+ s, b = a[i-1][j]
+ l.append((s + del_cost1, b + [-1]))
+
+ # deleting l2[j-1]
+ s, b = a[i][j-1]
+ l.append((s + del_cost2, b))
+
+ if (j >= 2): # mapping l1[i-1] to l2[j-2] + l2[j-1]
+ s, b = a[i-1][j-2]
+ d = levenshtein(l1[i-1], join_words(l2[j-2:j]))
+ l.append((s + w * d, b + [(j-2, j-1)]))
+
+ if (i >= 2): # mapping l1[i-2]+l1[i-1] to l2[j-1]
+ s, b = a[i-2][j-1]
+ d = levenshtein(join_words(l1[i-2:i]), l2[j-1])
+ l.append((s + w * d, b + [j-1, j-1]))
a[i][j] = min(l, key=lambda x: x[0])