diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2013-08-05 23:59:13 +0200 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2013-08-05 23:59:13 +0200 |
| commit | b9ccbf43f87a5a38f3bfce5b5e24ca6ce95c3974 (patch) | |
| tree | 2dc17ffe392fb54958e52e426978ff1f948cc14b /string_utils.py | |
| parent | b51137379fab2d5579e0caeada5389c682ec3cc5 (diff) | |
| download | ocr-layer-curation-b9ccbf43f87a5a38f3bfce5b5e24ca6ce95c3974.tar.gz | |
Use C implementation of the Levenshtein distance
Requires the python-Levenshtein package on PyPI
Diffstat (limited to 'string_utils.py')
| -rw-r--r-- | string_utils.py | 31 |
1 files changed, 1 insertions, 30 deletions
diff --git a/string_utils.py b/string_utils.py index c441667..64bd7b0 100644 --- a/string_utils.py +++ b/string_utils.py @@ -1,34 +1,5 @@ # -*- coding: utf-8 -*- - -def levenshtein(word1, word2): - """Return triplet of number of (substitutions, insertions, deletions) to - transform word1 into word2. - - Dynamic programming implementation storing only two rows of the full matrix - at a time. - - TODO: write this in a Cython module. - """ - - s, t = len(word1), len(word2) - if word1 == word2: - return 0 - if not min(s, t): - return max(s, t) - - v0 = [i for i in xrange(t + 1)] # v0[i] is d(0, i) - v1 = [0] * (t + 1) - for i in xrange(s): - # v0[j] = d(i, j) for all j and we compute v1[j] = d(i+1, j) for all j - v1[0] = i + 1 - - for j in xrange(t): - diff = int(word1[i] != word2[j]) - v1[j+1] = min(v1[j] + 1, v0[j+1] + 1, v0[j] + diff) - - v0 = list(v1) # copy v1 into v0 for the next iteration - - return v1[t] +from Levenshtein import distance as levenshtein def cut(word, left, right): """Return pair of strings (p + "-", s) such that p+s == word and |
