"""Collect the words and their bounding boxes from an XHTML page dump.

Post-patch content of ``parsepdftext.py`` as of commit
21e329d4380c29825f36877e5f7c7fabc8ec067b ("working version of the parser",
Guillaume Horel, 2013-08-03).  That commit replaced the octal-escaped
printing of every word with the per-page collection of word texts and
corner coordinates implemented below.

Usage: python parsepdftext.py FILE
"""
import sys
from xml.etree import ElementTree as ET

ns = 'http://www.w3.org/1999/xhtml'
corners = ['xMin', 'xMax', 'yMin', 'yMax']
# coordinates are in dpi, and computed from the top left corner


def parse_pages(document):
    """Return ``(words, coords)`` for every ``page`` element of *document*.

    *document* is anything offering ``findall`` (an ``ElementTree`` or an
    ``Element``).  Pages are looked up in the XHTML namespace ``ns``.

    ``words[p]`` is the list of the ``.text`` of each direct child of page
    ``p``; ``coords[p]`` is the parallel list of float tuples ordered like
    ``corners``, i.e. ``(xMin, xMax, yMin, yMax)``.

    Raises ``KeyError`` if a child lacks one of the corner attributes and
    ``ValueError`` if an attribute is not a valid float.
    """
    words = []
    coords = []
    for page in document.findall('.//{{{0}}}page'.format(ns)):
        # list(page) replaces Element.getchildren(), which was removed in
        # Python 3.9; fetching the children once also keeps both lists in
        # lockstep.
        children = list(page)
        words.append([child.text for child in children])
        coords.append([tuple(float(child.attrib[c]) for c in corners)
                       for child in children])
    return words, coords


if __name__ == '__main__':
    # Same module-level names as before when run as a script.
    document = ET.parse(sys.argv[1])
    words, coords = parse_pages(document)