diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-05 00:05:17 -0400 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-05 00:05:17 -0400 |
| commit | 47f58b93fc71a2820dd6d1e1038d11b1b80a475c (patch) | |
| tree | f65aa11c19016de44563f239d4e2db796b262fdc | |
| parent | aca17d11b107614915057f15e8cf0e828d4e3381 (diff) | |
| download | ocr-layer-curation-47f58b93fc71a2820dd6d1e1038d11b1b80a475c.tar.gz | |
improve function to parse djvu files
| -rw-r--r-- | parsedjvutext.py | 65 |
1 file changed, 57 insertions, 8 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py index 41c4194..3d4ee96 100644 --- a/parsedjvutext.py +++ b/parsedjvutext.py @@ -1,16 +1,65 @@ import sys from bs4 import BeautifulSoup +import subprocess -def parse_book(book): +def parse_book_xml(djvubook): + args = ["djvutoxml", djvubook] + soup = BeautifulSoup(subprocess.check_output(args), "lxml") words = [] coords = [] - with open(book) as fh: - soup = BeautifulSoup(fh, "lxml") - for page in soup.find_all("hiddentext"): - words.append([word.text for word in page.find_all("word")]) - coords.append([tuple(map(int, word["coords"].split(","))) \ - for word in page.find_all("word")]) + for page in soup.find_all("hiddentext"): + words.append([word.text for word in page.find_all("word")]) + coords.append([tuple(map(int, word["coords"].split(","))) \ + for word in page.find_all("word")]) return {"words": words, "coords": coords} +def get_npages(djvubook): + args = ["djvused", "-e", "n", djvubook] + return int(subprocess.check_output(args)) + +def parse_page_xml(djvubook, pagenumber): + args = ["djvutoxml", "--page", str(pagenumber), djvubook] + soup = BeautifulSoup(subprocess.check_output(args), "lxml") + all_words = soup.find_all("word") + words = [word.text for word in all_words] + coords = [tuple(map(int, word["coords"].split(","))) \ + for word in all_words] + return {"words": words, "coords": coords} + +def parse_wordline(line): + line = line.lstrip(" (").rstrip(")").split(" ") + word = line[5] + word = word[1:-1].decode("string_escape").decode("utf-8") + coords = map(int, line[1:5]) + return word, coords + +def parse_page_sexp(djvubook, pagenumber): + args = ["djvused", "-e", "select {0};print-txt".format(pagenumber), + djvubook] + page = [parse_wordline(line) for line in \ + subprocess.check_output(args).split("\n") if "word" in line] + return {"words": [a for a, b in page], "coords": [b for a, b in page]} + +def parse_book_sexp(djvubook): + book = {"words": [], "coords": []} + page_coords = [] + page_words = [] 
+ firstpage = True + args = ["djvused", "-e", "print-txt", djvubook] + for line in subprocess.check_output(args).split("\n"): + if "page" in line: + if firstpage: + firstpage = False + else: + book["words"].append(page_words) + book["coords"].append(page_coords) + page_coords = [] + page_words = [] + if "word" in line: + word, coords = parse_wordline(line) + page_words.append(word) + page_coords.append(coords) + return book + if __name__=="__main__": - book = parse_book(sys.argv[1]) + book_sexp = parse_book_sexp(sys.argv[1]) |
