blob: 8521d7a5e6b2fd731f956209ef04de55dd8e8b0b (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
|
import sys
from xml.etree import ElementTree as ET
def parse_coords(word):
    """Return the bounding box of a word element as a 4-tuple.

    Args:
        word: An element whose ``attrib`` mapping contains the keys
            ``xMin``, ``xMax``, ``yMin`` and ``yMax``.

    Returns:
        A tuple ``(xMin, xMax, yMin, yMax)``. The values are returned
        exactly as stored in ``word.attrib``; no numeric conversion is
        performed here.

    Raises:
        KeyError: If any of the four attributes is missing.
    """
    # coordinates are in dpi, and computed from the top left corner
    # NOTE(review): the unit above is the original author's statement;
    # confirm against the tool that produced the XML.
    return tuple(word.attrib[c] for c in ('xMin', 'xMax', 'yMin', 'yMax'))
def parse_book(book):
    """Parse an XML book file into per-page word texts and coordinates.

    Every ``page`` element in the XHTML namespace, at any depth, is
    visited in document order. For each page, the text and the bounding
    box of each direct child element are collected.

    Args:
        book: A filename or file object, passed straight to
            ``ElementTree.parse``.

    Returns:
        A dict with two parallel lists, one entry per page:
        ``"words"`` holds a list of each child's ``text`` (may be
        ``None`` for an empty element) and ``"coords"`` holds a list of
        the tuples returned by ``parse_coords`` for the same children.

    Raises:
        xml.etree.ElementTree.ParseError: If the input is not
            well-formed XML.
        KeyError: If a child of a page lacks one of the bounding-box
            attributes required by ``parse_coords``.
    """
    document = ET.parse(book)
    ns = 'http://www.w3.org/1999/xhtml'
    words = []
    coords = []
    for page in document.findall('.//{{{0}}}page'.format(ns)):
        # Element.getchildren() was removed in Python 3.9; iterating an
        # element yields its direct children. Materialise them once so
        # both lists are built from the same sequence.
        children = list(page)
        words.append([word.text for word in children])
        coords.append([parse_coords(word) for word in children])
    return {"words": words, "coords": coords}
if __name__ == "__main__":
    # Script entry point: parse the book whose path is the first
    # command-line argument. Exit with a usage message rather than a
    # bare IndexError when no path is supplied.
    if len(sys.argv) < 2:
        sys.exit("usage: {0} BOOK_XML".format(sys.argv[0]))
    book = parse_book(sys.argv[1])
|