about | summary | refs | log | tree | commit | diff | stats
path: root/parsedjvutext.py
diff options
context:
space:
mode:
Diffstat (limited to 'parsedjvutext.py')
-rw-r--r-- parsedjvutext.py 81
1 files changed, 45 insertions, 36 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 773a1d4..06cecb9 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -1,6 +1,9 @@
import sys
from bs4 import BeautifulSoup
import subprocess
+import djvu
+from djvu.decode import Context
+from itertools import chain
def parse_book_xml(djvubook):
args = ["djvutoxml", djvubook]
@@ -26,43 +29,49 @@ def parse_page_xml(djvubook, pagenumber):
for word in all_words]
return {"words": words, "coords": coords}
-def parse_wordline(line):
- line = line.lstrip(" (").rstrip(")").split(" ")
- word = line[5]
- word = word[1:-1].decode("string_escape").decode("utf-8")
- coords = map(int, line[1:5])
- return word, coords
+def parse_page_sexp(s, page_size=None):
+    """Yield (word, coords) pairs found in a djvu text s-expression; y values are flipped against page_size when given."""
+    if not isinstance(s, djvu.sexpr.ListExpression) or len(s) == 0:
+        return  # nothing to walk: indexing s[0] on an empty list would raise
+    if str(s[0].value) == "word":
+        coords = [s[i].value for i in xrange(1, 5)]
+        if page_size:
+            coords[1] = page_size - coords[1]
+            coords[3] = page_size - coords[3]
+        word = s[5].value
+        yield (word, coords)
+    else:
+        # any other zone: recurse into the elements from index 5 onwards
+        for child in s[5:]:
+            for c in parse_page_sexp(child, page_size):
+                yield c
-def page_sexp(djvubook, pagenumber):
- args = ["djvused", "-e", "select {0};print-txt".format(pagenumber),
- djvubook]
- return subprocess.check_output(args).split("\n")
+def parse_book(djvubook, page=None, html=False):
+    """
+    returns the words and coordinates of a djvu book, one entry per page.
+    if page is None, returns the whole book; otherwise only that page (1-based).
+    if html is True, y coordinates are flipped against size[1] of each page
+    """
+    c = Context()
+    document = c.new_document(djvu.decode.FileURI(djvubook))
+    document.decoding_job.wait()
+    if page:
+        toparse = [document.pages[page-1]]
+    else:
+        toparse = document.pages
+    # one independent list per page ([[]] * n would alias a single list)
+    words = [[] for _ in range(len(toparse))]
+    coords = [[] for _ in range(len(toparse))]
+    for i, p in enumerate(toparse):
+        if p.text.sexpr:
+            # size[1] is presumably the page height -- confirm against djvu docs
+            page_size = p.size[1] if html else None
+            word_coords = list(zip(*parse_page_sexp(p.text.sexpr, page_size)))
+            if word_coords:  # a text layer without words yields no pairs
+                words[i] = word_coords[0]
+                coords[i] = word_coords[1]
-def parse_page_sexp(djvubook, pagenumber):
- page = [parse_wordline(line) for line in page_sexp(djvubook, pagenumber) \
- if "word" in line]
- return {"words": [a for a, b in page], "coords": [b for a, b in page]}
-
-def parse_book_sexp(djvubook):
- book = {"words": [], "coords": []}
- page_coords = []
- page_words = []
- firstpage = True
- args = ["djvused", "-e", "print-txt", djvubook]
- for line in subprocess.check_output(args).split("\n"):
- if "page" in line:
- if firstpage:
- firstpage = False
- else:
- book["words"].append(page_words)
- book["coords"].append(page_coords)
- page_coords = []
- page_words = []
- if "word" in line:
- word, coords = parse_wordline(line)
- page_words.append(word)
- page_coords.append(coords)
- return book
+ return {"words": words, "coords": coords}
if __name__=="__main__":
- book_sexp = parse_book_sexp(sys.argv[1])
+ book = parse_book(sys.argv[1])