aboutsummaryrefslogtreecommitdiffstats
path: root/parsepdftext.py
blob: d1af47e51e0a24502838fe1111cba4c604db845e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import sys
from xml.etree import ElementTree as ET

def parse_coords(word):
    """Return the bounding box of *word* as ``(xMin, xMax, yMin, yMax)``.

    *word* is an element whose ``attrib`` mapping carries the four
    bounding-box attributes. The values are returned exactly as stored in
    the attributes (i.e. as strings, no numeric conversion is done).
    A missing attribute raises ``KeyError``.

    Per the original author's note, the coordinates are expressed in dpi
    and measured from the top left corner.
    """
    box = word.attrib
    return (box['xMin'], box['xMax'], box['yMin'], box['yMax'])

def parse_book(book):
    """Extract the words and their bounding boxes from *book*, page by page.

    *book* is handed straight to ``ElementTree.parse``, so it may be a
    filename or a file object containing XML. Every ``page`` element in
    the XHTML namespace is visited, and each direct child of a page is
    treated as one word.
    NOTE(review): this looks like the output of ``pdftotext -bbox`` --
    confirm against the tool that produces the input.

    Returns a dict with two parallel lists, one entry per page:

    * ``"words"``  -- list of the text of each word on the page
    * ``"coords"`` -- list of the ``parse_coords`` tuple of each word

    so ``result["coords"][p][i]`` is the box of ``result["words"][p][i]``.
    """
    document = ET.parse(book)
    ns = 'http://www.w3.org/1999/xhtml'

    words = []
    coords = []
    for page in document.findall('.//{{{0}}}page'.format(ns)):
        # Element.getchildren() was removed in Python 3.9; iterating the
        # element yields the same direct children on every Python version.
        # Materialize once so both lists are built from the same sequence.
        children = list(page)
        words.append([word.text for word in children])
        coords.append([parse_coords(word) for word in children])
    return {"words": words, "coords": coords}

if __name__ == "__main__":
    # Usage: parsepdftext.py BOOK [PAGE_INDEX]
    # Prints the list of words found on one page of BOOK.
    if len(sys.argv) < 2:
        sys.exit("usage: {0} BOOK [PAGE_INDEX]".format(sys.argv[0]))

    book = parse_book(sys.argv[1])

    # The page index is 0-based; 14 stays the default so that running the
    # script with a single argument behaves as it always did.
    page_index = int(sys.argv[2]) if len(sys.argv) > 2 else 14
    pages = book['words']
    if not 0 <= page_index < len(pages):
        sys.exit("page index {0} out of range: book has {1} page(s)".format(
            page_index, len(pages)))

    # print(...) with a single argument gives the same output on Python 2
    # and Python 3, whereas the print statement is a SyntaxError on 3.
    print(pages[page_index])