aboutsummaryrefslogtreecommitdiffstats
path: root/parsepdftext.py
blob: 51e237cb94b070180c8a84e534eed2ef5d96f7c9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
import sys
from xml.etree import ElementTree as ET

# Parse the XML document whose path is given as the first command-line
# argument. Raises IndexError if no argument is supplied and
# xml.etree.ElementTree.ParseError if the file is not well-formed XML.
document = ET.parse(sys.argv[1])
# Namespace used to qualify the <page> element tag in the search below.
ns = 'http://www.w3.org/1999/xhtml'
# Attribute names read from every word element; this order is also the order
# of the values inside each coordinate tuple.
corners = ['xMin', 'xMax', 'yMin', 'yMax']
# coordinates are in dpi, and computed from the top left corner
# NOTE(review): the unit and origin above are the original author's note; the
# code only converts the attribute strings to float -- confirm against the
# tool that produced the input file.
words = []   # words[p]  -> list with the text of each child element of page p
coords = []  # coords[p] -> list of (xMin, xMax, yMin, yMax) float tuples
for page in document.findall('.//{{{0}}}page'.format(ns)):
    # Iterate the element directly: Element.getchildren() was removed in
    # Python 3.9, and iterating an Element yields the same direct children.
    words.append([word.text for word in page])
    # A child missing one of the `corners` attributes raises KeyError; a
    # non-numeric attribute value raises ValueError.
    coords.append([tuple(float(word.attrib[c]) for c in corners) for word in page])