about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- parsedjvutext.py 53
1 files changed, 35 insertions, 18 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 773a1d4..696601d 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -1,6 +1,9 @@
import sys
from bs4 import BeautifulSoup
import subprocess
+import djvu
+from djvu.decode import Context
+from itertools import chain
def parse_book_xml(djvubook):
args = ["djvutoxml", djvubook]
@@ -43,25 +46,39 @@ def parse_page_sexp(djvubook, pagenumber):
if "word" in line]
return {"words": [a for a, b in page], "coords": [b for a, b in page]}
-def parse_book_sexp(djvubook):
+def parse_sexp(s):
+    """
+    returns the list of (word, coords) pairs found in a djvu text
+    s-expression, in the order they are encountered.
+    a list expression whose first item is "word" yields a single pair:
+    coords is the list of values of items 1 to 4 (presumably xmin, ymin,
+    xmax, ymax; to be confirmed against the djvu text layer format) and
+    word is the value of item 5.
+    any other list expression is treated as a container: its first five
+    items are skipped and the remaining ones are parsed recursively.
+    empty lists and non-list expressions yield an empty list.
+    """
+    # isinstance rather than an exact type comparison, so that subclasses
+    # of ListExpression are parsed as well
+    if not isinstance(s, djvu.sexpr.ListExpression) or len(s) == 0:
+        return []
+    if str(s[0].value) == "word":
+        # range instead of xrange: same result, valid on python 2 and 3
+        coords = [s[i].value for i in range(1, 5)]
+        return [(s[5].value, coords)]
+    children = (parse_sexp(child) for child in s[5:])
+    return list(chain.from_iterable(children))
+
+def parse_book_sexp(djvubook, page=None, html=False):
+    """
+    returns the list of words and coordinates from a djvu book.
+    the result is a dict with two keys, "words" and "coords", each holding
+    one entry per parsed page; a page without any word contributes an
+    empty entry to both.
+    if page is None (or 0), returns the whole book, otherwise only the
+    page with that 1-based number.
+    html is accepted but is not read by this function: coordinates are
+    returned as produced by parse_sexp.
+    """
book = {"words": [], "coords": []}
- page_coords = []
- page_words = []
- firstpage = True
- args = ["djvused", "-e", "print-txt", djvubook]
- for line in subprocess.check_output(args).split("\n"):
- if "page" in line:
- if firstpage:
- firstpage = False
- else:
- book["words"].append(page_words)
- book["coords"].append(page_coords)
- page_coords = []
- page_words = []
- if "word" in line:
- word, coords = parse_wordline(line)
- page_words.append(word)
- page_coords.append(coords)
+    c = Context()
+    document = c.new_document(djvu.decode.FileURI(djvubook))
+    document.decoding_job.wait()
+    # a falsy page (None or 0) selects every page of the document
+    if page:
+        toparse = [document.pages[page - 1]]
+    else:
+        toparse = document.pages
+    # a distinct loop variable, so that the page argument is not overwritten
+    for djvupage in toparse:
+        pairs = parse_sexp(djvupage.text.sexpr)
+        # explicit unpacking rather than indexing zip(*pairs): the latter
+        # raises IndexError on a page without words, and zip objects cannot
+        # be indexed on python 3
+        book["words"].append(tuple(word for word, _ in pairs))
+        book["coords"].append(tuple(coords for _, coords in pairs))
return book
if __name__=="__main__":