Reorganize the code

hope I did it right. We have two packages now, one for the server and one for the actual library.
author: Guillaume Horel <guillaume.horel@gmail.com> 2014-09-07 18:21:37 -0400
committer: Guillaume Horel <guillaume.horel@gmail.com> 2014-09-07 18:24:08 -0400
commit: 0e8b0c88a4d3009cbbea695f606e49faef27f373 (patch)
tree: 85a14a7aef3ee36e73544382c6fdec8aa6bd375c /string_utils.py
parent: 74604d7b8ae98b125f1c800da753f8ab67474eb5 (diff)
download: ocr-layer-curation-0e8b0c88a4d3009cbbea695f606e49faef27f373.tar.gz
1 files changed, 0 insertions, 161 deletions
diff --git a/string_utils.py b/string_utils.py
deleted file mode 100644
index b6c8ce0..0000000
--- a/string_utils.py
+++ /dev/null
@@ -1,161 +0,0 @@
-# -*- coding: utf-8 -*-
-from Levenshtein import distance as levenshtein
-import re
-import itertools
-
-def simplify(text):
-    mapp = [(u"’", u"'"), (u"↑", u"."), (u"…", u"..."), (u"É", u"E"),
-            (u"À", u"A"), (u"Ô", u"O"), (u"—", u"-")]
-
-    for a, b in mapp:
-        text = text.replace(a, b)
-
-    return text
-
-def cut(word, left, right):
-    """Return pair of strings (p + "-", s) such that p+s == word and
-    L(p + "-", left) + L(s, right) is minimal, where L is the levenshtein
-    distance.
-
-    Implementation is suboptimal since the computation of the Levenshtein
-    distances will involve comparing the same segments repeatedly.
-
-    TODO: handle the case when word contains an hyphen (e.g. c'est-a-dire)
-    """
-
-    def aux(i):
-        leftw, rightw = word[:i] + "-", word[i:]
-        return (leftw, rightw,
-                levenshtein(leftw, left) + levenshtein(rightw, right))
-
-    l = [aux(i) for i in xrange(len(word) + 1)]
-    return min(l, key=lambda x: x[2])[:2]
-
-def join_ocr_words(l, c):
-    m = list(l)
-    if len(l) >= 2 and c[-2][2] > c[-1][0] and (not l[-2][-1].isalnum()):
-        l[-2] = l[-2][:-1]
-    return "".join(l)
-
-def join_words(l):
-    return "".join(l)
-
-def align(l1, l2, c2):
-    """Compute the optimal alignment between two list of words
-    à la Needleman-Wunsch.
-
-    The function returns a (score, alignment) pair. An alignment is simply
-    a list of list of size len(l1) giving for each word in l1, the list of
-    indices in l2 it maps to (the list is empty if the word maps to nothing).
-
-    Note that if the list is of size>1, the word in l1 will map to a sequence
-    of words in l2. Conversly, consecutive words in l1 can map to
-    the same word in l2.
-    """
-
-    # Throughout the function, l1 is to be thought of as the proofread text,
-    # and l2 as the OCR text. The deletion costs are not symmetric: removing
-    # junk from the OCR is frequent while removing a word from the proofread
-    # text should be rare.
-    del_cost1 = 50
-    def del_cost2(w):
-        return 1+3*len([c for c in w if c.isalnum()])
-    w = 3 # multiplicative cost factor for the Levenshtein distance
-
-    n, m = len(l1), len(l2)
-    # a is the (score, alignment) matrix. a[i][j] is the (score, alignment)
-    # pair of the first i words of l1 to the first j words of l2
-    a = [[(0, [])] * (m + 1) for i in xrange(n + 1)]
-
-    for j in xrange(1, m + 1):
-        a[0][j] = j, []
-
-    for i in xrange(1, n + 1):
-        a[i][0] = i * del_cost1, [[]] * i
-
-        for j in xrange(1, m + 1):
-
-            s, b = a[i-1][j-1]
-            d = levenshtein(l1[i-1], l2[j-1])
-            min_s, min_b  = s + w * d, b + [[j-1]]
-
-            s, b = a[i-1][j]
-            if s + del_cost1 < min_s:
-                min_s, min_b = s + del_cost1, b + [[]]
-
-            s, b = a[i][j-1]
-            if s + del_cost2(l2[j-1]) < min_s:
-                min_s, min_b = s + del_cost2(l2[j-1]), b
-
-            for k in xrange(1, 8):
-                for l in xrange(1, 5):
-                    if k + l <= 2:
-                        continue
-                    if k+l > 7:
-                        break
-                    if j < l or i < k:
-                        break
-                    s, b = a[i-k][j-l]
-                    d = levenshtein(join_words(l1[i-k:i]),
-                                    join_ocr_words(l2[j-l:j], c2[j-l:j]))
-                    if s + w * d < min_s:
-                        temp = [[j-1]] if l == 1 else [range(j-l, j)]
-                        min_s, min_b = s + w * d, b + temp * k
-
-            a[i][j] = min_s, min_b
-
-    return a[n][m]
-
-def print_alignment(l1, l2, c2, alignment):
-    """Given two list of words and an alignment (as defined in :func:`align`)
-    print the two list of words side-by-side and aligned.
-    """
-    prev = 0
-    for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]):
-        word = " ".join([a[0] for a in g])
-        if not index:
-            print u"{0:>25} | ".format(word)
-        else:
-            begin, end = index[0], index[-1]
-            for i in range(prev, begin-1):
-                print u"{0:>25} | {1}".format("", l2[i+1])
-            prev = end
-
-            if end > begin:
-                print u"{0:>25} | {1:<25} (M)".format(word,
-                                                      join_ocr_words(l2[begin:end+1], c2[begin:end+1]))
-            else:
-                print u"{0:>25} | {1:<25}".format(word, l2[begin])
-
-    if not l1:
-        for word in l2:
-            print u"{0:>25} | {1}".format("", word)
-
-def invert_align(alignment, n):
-    l = [[] for _ in range(n)]
-    for i, e in enumerate(alignment):
-        for a in e:
-            l[a].append(i)
-    return l
-
-def alignment_to_coord(l1, alignment):
-    # l1 list of corrected words
-    # alignment list of size len(l1) qui mappe mots dans l2
-    # returns indices in l2
-
-    r = []
-    prev = 0
-    for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]):
-        word = " ".join([a[0] for a in g])
-        r.append([word, index])
-        # if not index:
-        #     r.append([word, None])
-        # else:
-
-        #     begin, end = index[0], index[-1]
-        #     if end > begin:
-        #         #need to find a better way to get the box coordinates
-        #         r.append([word, begin])
-        #     else:
-        #         r.append([word, begin])
-    return r
author	Guillaume Horel <guillaume.horel@gmail.com>	2014-09-07 18:21:37 -0400
committer	Guillaume Horel <guillaume.horel@gmail.com>	2014-09-07 18:24:08 -0400
commit	0e8b0c88a4d3009cbbea695f606e49faef27f373 (patch)
tree	85a14a7aef3ee36e73544382c6fdec8aa6bd375c /string_utils.py
parent	74604d7b8ae98b125f1c800da753f8ab67474eb5 (diff)
download	ocr-layer-curation-0e8b0c88a4d3009cbbea695f606e49faef27f373.tar.gz