working version of the parser

author: Guillaume Horel <guillaume.horel@gmail.com> 2013-08-03 10:58:57 -0400
committer: Guillaume Horel <guillaume.horel@gmail.com> 2013-08-03 10:58:57 -0400
commit: 21e329d4380c29825f36877e5f7c7fabc8ec067b (patch)
tree: e03e3c7fd0f2e68d6d1a28073dfcf9b34a23359e
parent: 5588397ee5114b072b4d351747f9adbe3b0206f5 (diff)
download: ocr-layer-curation-21e329d4380c29825f36877e5f7c7fabc8ec067b.tar.gz
1 file changed, 7 insertions, 5 deletions
diff --git a/parsepdftext.py b/parsepdftext.py
index 778d30b..51e237c 100644
--- a/parsepdftext.py
+++ b/parsepdftext.py
@@ -3,8 +3,10 @@ from xml.etree import ElementTree as ET
 
 document = ET.parse(sys.argv[1])
 ns = 'http://www.w3.org/1999/xhtml'
-for page, i in enumerate(document.findall('.//{{{0}}}page'.format(ns))):
-    for word in page.getchildren():
-        octalescapedtext = ''.join(["\{0:o}".format(c) if c>127 else chr(c) for c in map(ord,word.text.encode('utf8'))])
-        #escape quote character
-        print octalescapedtext
+corners = ['xMin', 'xMax', 'yMin', 'yMax']
+# coordinates are in dpi, and computed from the top left corner
+words = []
+coords = []
+for i, page in enumerate(document.findall('.//{{{0}}}page'.format(ns))):
+    words.append([word.text for word in page.getchildren()])
+    coords.append([tuple([float(word.attrib[c]) for c in corners]) for word in page.getchildren()])
author	Guillaume Horel <guillaume.horel@gmail.com>	2013-08-03 10:58:57 -0400
committer	Guillaume Horel <guillaume.horel@gmail.com>	2013-08-03 10:58:57 -0400
commit	21e329d4380c29825f36877e5f7c7fabc8ec067b (patch)
tree	e03e3c7fd0f2e68d6d1a28073dfcf9b34a23359e
parent	5588397ee5114b072b4d351747f9adbe3b0206f5 (diff)
download	ocr-layer-curation-21e329d4380c29825f36877e5f7c7fabc8ec067b.tar.gz