blob: d1af47e51e0a24502838fe1111cba4c604db845e (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
|
import sys
from xml.etree import ElementTree as ET
def parse_coords(word):
    """Return the coordinate attributes of *word* as a 4-tuple.

    The tuple is ordered ``(xMin, xMax, yMin, yMax)``. Values are returned
    exactly as stored in ``word.attrib`` (i.e. as strings); no numeric
    conversion is performed.

    Raises:
        KeyError: if any of the four attributes is missing on *word*.
    """
    # coordinates are in dpi, and computed from the top left corner
    return tuple(word.attrib[c] for c in ('xMin', 'xMax', 'yMin', 'yMax'))
def parse_book(book):
    """Parse a book XML file into per-page word texts and coordinates.

    *book* is handed directly to ``ElementTree.parse``. Every ``page``
    element in the XHTML namespace, at any depth of the document,
    contributes one entry to each of the returned lists.

    Returns:
        A dict with two parallel lists, one entry per page:
        ``"words"``  - list of the ``.text`` of each child of the page;
        ``"coords"`` - list of ``parse_coords(child)`` tuples for the
        same children, in the same order.

    Raises:
        KeyError: propagated from ``parse_coords`` when a child element
            lacks one of the coordinate attributes.
    """
    document = ET.parse(book)
    ns = 'http://www.w3.org/1999/xhtml'
    words = []
    coords = []
    for page in document.findall('.//{{{0}}}page'.format(ns)):
        # Materialise the children once by iterating the element itself:
        # Element.getchildren() is deprecated and was removed in Python 3.9.
        children = list(page)
        words.append([word.text for word in children])
        coords.append([parse_coords(word) for word in children])
    return {"words": words, "coords": coords}
if __name__ == "__main__":
    # Command-line entry point: parse the book given as the first argument
    # and print the words of the page at index 14.
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: {0} BOOK_XML".format(sys.argv[0]))
    book = parse_book(sys.argv[1])
    # Parenthesised single-argument form works as both the Python 2 print
    # statement and the Python 3 print function.
    print(book['words'][14])
|