aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThibaut Horel <thibaut.horel@gmail.com>2013-08-05 23:59:13 +0200
committerThibaut Horel <thibaut.horel@gmail.com>2013-08-05 23:59:13 +0200
commitb9ccbf43f87a5a38f3bfce5b5e24ca6ce95c3974 (patch)
tree2dc17ffe392fb54958e52e426978ff1f948cc14b
parentb51137379fab2d5579e0caeada5389c682ec3cc5 (diff)
downloadocr-layer-curation-b9ccbf43f87a5a38f3bfce5b5e24ca6ce95c3974.tar.gz
Use C implementation of the Levenshtein distance
Requires the python-Levenshtein package on PyPI
-rw-r--r--string_utils.py31
1 files changed, 1 insertions, 30 deletions
diff --git a/string_utils.py b/string_utils.py
index c441667..64bd7b0 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -1,34 +1,5 @@
# -*- coding: utf-8 -*-
-
-def levenshtein(word1, word2):
- """Return triplet of number of (substitutions, insertions, deletions) to
- transform word1 into word2.
-
- Dynamic programming implementation storing only two rows of the full matrix
- at a time.
-
- TODO: write this in a Cython module.
- """
-
- s, t = len(word1), len(word2)
- if word1 == word2:
- return 0
- if not min(s, t):
- return max(s, t)
-
- v0 = [i for i in xrange(t + 1)] # v0[i] is d(0, i)
- v1 = [0] * (t + 1)
- for i in xrange(s):
- # v0[j] = d(i, j) for all j and we compute v1[j] = d(i+1, j) for all j
- v1[0] = i + 1
-
- for j in xrange(t):
- diff = int(word1[i] != word2[j])
- v1[j+1] = min(v1[j] + 1, v0[j+1] + 1, v0[j] + diff)
-
- v0 = list(v1) # copy v1 into v0 for the next iteration
-
- return v1[t]
+from Levenshtein import distance as levenshtein
def cut(word, left, right):
"""Return pair of strings (p + "-", s) such that p+s == word and