diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2014-09-07 16:01:05 -0400 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2014-09-07 16:01:05 -0400 |
| commit | cd51d7e0d763ed669a2a45555d64d81e3b2478a1 (patch) | |
| tree | 52d27202e5971f1972c2a3e3430913b93f55b9dd /string_utils.py | |
| parent | d28394833d54a68f5ca13d2edaa261128f6c5170 (diff) | |
| parent | 6283b6582960544dc02e438e739775e3239b802c (diff) | |
| download | ocr-layer-curation-cd51d7e0d763ed669a2a45555d64d81e3b2478a1.tar.gz | |
Merge branch 'master' of horel.org:thibaut/ocr-layer-curation
Conflicts:
web/static/css/style.css
web/templates/index.html
web/utils.py
Diffstat (limited to 'string_utils.py')
| -rw-r--r-- | string_utils.py | 33 |
1 files changed, 23 insertions, 10 deletions
```diff
diff --git a/string_utils.py b/string_utils.py
index a8a38c0..0588418 100644
--- a/string_utils.py
+++ b/string_utils.py
@@ -156,18 +156,31 @@ def print_alignment(l1, l2, c2, alignment):
     for word in l2:
         print u"{0:>25} | {1}".format("", word)
 
-def alignment_to_sexp(l1, l2, c2, alignment):
+def invert_align(alignment, n):
+    l = [[] for _ in range(n)]
+    for i, e in enumerate(alignment):
+        for a in e:
+            l[a].append(i)
+    return l
+
+def alignment_to_coord(l1, alignment):
+    # l1 list of corrected words
+    # alignment list of size len(l1) qui mappe mots dans l2
+    # returns indices in l2
+
     r = []
     prev = 0
     for index, g in itertools.groupby(zip(l1, alignment), lambda x:x[1]):
         word = " ".join([a[0] for a in g])
-        if not index:
-            r.append([word, []])
-        else:
-            begin, end = index[0], index[-1]
-            if end > begin:
-                #need to find a better way to get the box coordinates
-                r.append([word, c2[begin]])
-            else:
-                r.append([word, c2[begin]])
+        r.append([word, index])
+        # if not index:
+        #     r.append([word, None])
+        # else:
+
+        #     begin, end = index[0], index[-1]
+        #     if end > begin:
+        #         #need to find a better way to get the box coordinates
+        #         r.append([word, begin])
+        #     else:
+        #         r.append([word, begin])
     return r
```
