# Origin: commit 41df9107d8f7ae19bbdcadee0f411e9763c6fbbc
# Author: Guillaume Horel, Sat, 3 Aug 2013 13:33:33 -0400
# Subject: script to parse djvu xml (creates parsedjvutext.py)
"""Extract the words and their coordinates from a DjVu XML file."""

import sys

from bs4 import BeautifulSoup


def parse_book(book):
    """Collect the text and coordinates of every word, page by page.

    Args:
        book: Path of the XML file to parse.

    Returns:
        A dict with two parallel lists, each holding one entry per
        ``hiddentext`` element in document order:

        * ``"words"``: list of the text of every ``word`` element;
        * ``"coords"``: list of every ``word`` element's ``coords``
          attribute, split on commas and converted to a tuple of ints.

    Raises:
        KeyError: If a ``word`` element has no ``coords`` attribute.
        ValueError: If a field of a ``coords`` attribute is not an integer.
    """
    # Hand the parser raw bytes: BeautifulSoup then detects the document's
    # encoding itself instead of relying on the locale's default, which can
    # mis-decode (or fail on) a file written in another encoding.
    with open(book, "rb") as fh:
        soup = BeautifulSoup(fh, "lxml")

    # NOTE: "lxml" selects an HTML parser, which lower-cases tag and
    # attribute names, hence the lower-case lookups below.
    words = []
    coords = []
    for page in soup.find_all("hiddentext"):
        # Search the page once and reuse the result for both lists so they
        # are guaranteed to stay aligned.
        page_words = page.find_all("word")
        words.append([word.text for word in page_words])
        coords.append(
            [tuple(map(int, word["coords"].split(","))) for word in page_words]
        )
    return {"words": words, "coords": coords}


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit("usage: {} BOOK_XML".format(sys.argv[0]))
    # Kept as a module-level name so the result can be inspected when the
    # script is run interactively (e.g. ``python -i``).
    book = parse_book(sys.argv[1])