aboutsummaryrefslogtreecommitdiffstats
path: root/parsedjvutext.py
diff options
context:
space:
mode:
authorGuillaume Horel <guillaume.horel@serenitascapital.com>2014-02-27 13:56:11 -0500
committerGuillaume Horel <guillaume.horel@serenitascapital.com>2014-02-27 13:56:11 -0500
commitf96752448a537bd6a3378a83ab0e8476653ec59c (patch)
tree14669d6d10e9d0343a33b32bf2ab7197fd25b84a /parsedjvutext.py
parent0d583ea5f9873a5b2a22a89bbb979bf08dd05a90 (diff)
downloadocr-layer-curation-f96752448a537bd6a3378a83ab0e8476653ec59c.tar.gz
cleanup
Diffstat (limited to 'parsedjvutext.py')
-rw-r--r--parsedjvutext.py28
1 files changed, 0 insertions, 28 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 9855786..a13421a 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -6,34 +6,6 @@ from djvu.decode import Context
from itertools import chain
import collections
-
-def parse_book_xml(djvubook):
- args = ["djvutoxml", djvubook]
- soup = BeautifulSoup(subprocess.check_output(args), "lxml")
- words = []
- coords = []
- for page in soup.find_all("hiddentext"):
- words.append([word.text for word in page.find_all("word")])
- coords.append([tuple(map(int, word["coords"].split(",")))
- for word in page.find_all("word")])
- return {"words": words, "coords": coords}
-
-
-def get_npages(djvubook):
- args = ["djvused", "-e", "n", djvubook]
- return int(subprocess.check_output(args))
-
-
-def parse_page_xml(djvubook, pagenumber):
- args = ["djvutoxml", "--page", str(pagenumber), djvubook]
- soup = BeautifulSoup(subprocess.check_output(args), "lxml")
- all_words = soup.find_all("word")
- words = [word.text for word in all_words]
- coords = [tuple(map(int, word["coords"].split(",")))
- for word in all_words]
- return {"words": words, "coords": coords}
-
-
def parse_page(page, html=False):
s, page_size = page.text.sexpr, page.size[1]