From f7168bf05ec4b976dd74b357bff7ff54d0693f13 Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@serenitascapital.com>
Date: Wed, 21 Aug 2013 15:40:59 -0400
Subject: Simplify datastructure

An alignment is now a list of list. Empty list means word maps to
nothing, and len(list) greater than one means a word maps to multiple words.
This removes the artificial distinction between index and tuple.
---
 string_utils.py | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/string_utils.py b/string_utils.py
index 12d22b8..61320ac 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -70,11 +70,11 @@ def align(l1, l2, c2):
     à la Needleman-Wunsch.
 
     The function returns a (score, alignment) pair. An alignment is simply
-    a list of size len(l1) giving for each word in l1, the index of the word in
-    l2 it maps to (or -1 if the word maps to nothing).
+    a list of list of size len(l1) giving for each word in l1, the list of
+    indices in l2 it maps to (the list is empty if the word maps to nothing).
 
-    Note that we also allow the index to be a tuple when a word in l1 maps to
-    a sequence of words in l2. Conversly, consecutive words in l1 can map to
+    Note that if the list is of size>1, the word in l1 will map to a sequence
+    of words in l2. Conversly, consecutive words in l1 can map to
     the same word in l2.
     """
 
@@ -96,17 +96,17 @@ def align(l1, l2, c2):
         a[0][j] = j, []
 
     for i in xrange(1, n + 1):
-        a[i][0] = i * del_cost1, [-1] * i
+        a[i][0] = i * del_cost1, [] * i
 
         for j in xrange(1, m + 1):
 
             s, b = a[i-1][j-1]
             d = levenshtein(l1[i-1], l2[j-1])
-            min_s, min_b  = s + w * d, b + [j-1]
+            min_s, min_b  = s + w * d, b + [[j-1]]
 
             s, b = a[i-1][j]
             if s + del_cost1 < min_s:
-                min_s, min_b = s + del_cost1, b + [-1]
+                min_s, min_b = s + del_cost1, b + [[]]
 
             s, b = a[i][j-1]
             if s + del_cost2(l2[j-1]) < min_s:
@@ -124,7 +124,7 @@ def align(l1, l2, c2):
                     d = levenshtein(join_words(l1[i-k:i]),
                                     join_ocr_words(l2[j-l:j], c2[j-l:j]))
                     if s + w * d < min_s:
-                        temp = [j-1] if l == 1 else [tuple(range(j-l, j))]
+                        temp = [[j-1]] if l == 1 else [range(j-l, j)]
                         min_s, min_b = s + w * d, b + temp * k
 
             a[i][j] = min_s, min_b
@@ -138,13 +138,10 @@ def print_alignment(l1, l2, c2, alignment):
     prev = 0
     for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]):
         word = " ".join([a[0] for a in g])
-        if index == -1:
+        if not index:
             print u"{0:>25} | ".format(word)
         else:
-            if type(index) == tuple:
-                begin, end = index[0], index[-1]
-            else:
-                begin, end = index, index
+            begin, end = index[0], index[-1]
 
             while prev < begin - 1:
                 prev += 1
-- 
cgit v1.2.3-70-g09d2


From d295c767717874045aab27d30759fd3ec7ed49fa Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@serenitascapital.com>
Date: Wed, 21 Aug 2013 16:33:14 -0400
Subject: small simplifaction

---
 string_utils.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/string_utils.py b/string_utils.py
index 61320ac..186f2eb 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -96,7 +96,7 @@ def align(l1, l2, c2):
         a[0][j] = j, []
 
     for i in xrange(1, n + 1):
-        a[i][0] = i * del_cost1, [] * i
+        a[i][0] = i * del_cost1, [[]] * i
 
         for j in xrange(1, m + 1):
 
@@ -142,10 +142,8 @@ def print_alignment(l1, l2, c2, alignment):
             print u"{0:>25} | ".format(word)
         else:
             begin, end = index[0], index[-1]
-
-            while prev < begin - 1:
-                prev += 1
-                print u"{0:>25} | {1}".format("", l2[prev])
+            for i in range(prev, begin-1):
+                print u"{0:>25} | {1}".format("", l2[i+1])
             prev = end
 
             if end > begin:
-- 
cgit v1.2.3-70-g09d2