aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGuillaume Horel <guillaume.horel@gmail.com>2013-08-03 13:33:33 -0400
committerGuillaume Horel <guillaume.horel@gmail.com>2013-08-03 13:33:33 -0400
commit41df9107d8f7ae19bbdcadee0f411e9763c6fbbc (patch)
tree0b3cde92ba83436e315693446333d185761d92bd
parentd82fdab22ae3600fc24758a150fc23e2fec36921 (diff)
downloadocr-layer-curation-41df9107d8f7ae19bbdcadee0f411e9763c6fbbc.tar.gz
script to parse DjVu XML
-rw-r--r--parsedjvutext.py38
1 files changed, 38 insertions, 0 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py
new file mode 100644
index 0000000..41c4194
--- /dev/null
+++ b/parsedjvutext.py
@@ -0,0 +1,38 @@
+import sys
+from bs4 import BeautifulSoup
+
+def parse_book(book):
+    """Extract the words and their coordinates from a DjVu XML file.
+
+    Args:
+        book: path of the XML file to read.
+
+    Returns:
+        A dict with two parallel lists, each holding one inner list per
+        "hiddentext" element: "words" has the text of every "word"
+        element, "coords" has the matching tuples of ints parsed from
+        each word's comma-separated "coords" attribute.
+
+    Raises:
+        KeyError: a "word" element has no "coords" attribute.
+        ValueError: a "coords" field is not an integer.
+    """
+    # Binary mode lets the parser pick the encoding declared by the
+    # document instead of decoding with the locale default first.
+    with open(book, "rb") as fh:
+        soup = BeautifulSoup(fh, "lxml")
+    words = []
+    coords = []
+    # "lxml" is an HTML parser and lower-cases tag names, hence the
+    # lower-case lookups.
+    for page in soup.find_all("hiddentext"):
+        page_words = page.find_all("word")  # queried once for both lists
+        words.append([word.text for word in page_words])
+        coords.append([tuple(map(int, word["coords"].split(",")))
+                       for word in page_words])
+    return {"words": words, "coords": coords}
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        sys.exit("usage: parsedjvutext.py BOOK.xml")
+    book = parse_book(sys.argv[1])