diff options
| author | Guillaume Horel <guillaume.horel@serenitascapital.com> | 2014-02-27 13:56:11 -0500 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@serenitascapital.com> | 2014-02-27 13:56:11 -0500 |
| commit | f96752448a537bd6a3378a83ab0e8476653ec59c (patch) | |
| tree | 14669d6d10e9d0343a33b32bf2ab7197fd25b84a | |
| parent | 0d583ea5f9873a5b2a22a89bbb979bf08dd05a90 (diff) | |
| download | ocr-layer-curation-f96752448a537bd6a3378a83ab0e8476653ec59c.tar.gz | |
cleanup
| -rwxr-xr-x | extract_pages.sh | 7 | ||||
| -rw-r--r-- | parsedjvutext.py | 28 | ||||
| -rw-r--r-- | parsepdftext.py | 21 |
3 files changed, 0 insertions, 56 deletions
```diff
diff --git a/extract_pages.sh b/extract_pages.sh
deleted file mode 100755
index c49a0c4..0000000
--- a/extract_pages.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-npages=$(djvused -e 'n' $1)
-
-for i in $(seq 1 $npages); do
-    djvused -e "select $i;output-txt" $1 >page${i}.djvutxt
-done
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 9855786..a13421a 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -6,34 +6,6 @@ from djvu.decode import Context
 from itertools import chain
 import collections
 
-
-def parse_book_xml(djvubook):
-    args = ["djvutoxml", djvubook]
-    soup = BeautifulSoup(subprocess.check_output(args), "lxml")
-    words = []
-    coords = []
-    for page in soup.find_all("hiddentext"):
-        words.append([word.text for word in page.find_all("word")])
-        coords.append([tuple(map(int, word["coords"].split(",")))
-                       for word in page.find_all("word")])
-    return {"words": words, "coords": coords}
-
-
-def get_npages(djvubook):
-    args = ["djvused", "-e", "n", djvubook]
-    return int(subprocess.check_output(args))
-
-
-def parse_page_xml(djvubook, pagenumber):
-    args = ["djvutoxml", "--page", str(pagenumber), djvubook]
-    soup = BeautifulSoup(subprocess.check_output(args), "lxml")
-    all_words = soup.find_all("word")
-    words = [word.text for word in all_words]
-    coords = [tuple(map(int, word["coords"].split(",")))
-              for word in all_words]
-    return {"words": words, "coords": coords}
-
-
 def parse_page(page, html=False):
     s, page_size = page.text.sexpr, page.size[1]
 
diff --git a/parsepdftext.py b/parsepdftext.py
deleted file mode 100644
index d1af47e..0000000
--- a/parsepdftext.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import sys
-from xml.etree import ElementTree as ET
-
-def parse_coords(word):
-    # coordinates are in dpi, and computed from the top left corner
-    return tuple([word.attrib[c] for c in ['xMin', 'xMax', 'yMin', 'yMax']])
-
-def parse_book(book):
-    document = ET.parse(book)
-    ns = 'http://www.w3.org/1999/xhtml'
-
-    words = []
-    coords = []
-    for page in document.findall('.//{{{0}}}page'.format(ns)):
-        words.append([word.text for word in page.getchildren()])
-        coords.append([parse_coords(word) for word in page.getchildren()])
-    return {"words": words, "coords": coords}
-
-if __name__=="__main__":
-    book = parse_book(sys.argv[1])
-    print book['words'][14]
```
