add initial text file and parser

author: Guillaume Horel <guillaume.horel@gmail.com> 2013-08-03 10:39:10 -0400
committer: Guillaume Horel <guillaume.horel@gmail.com> 2013-08-03 10:39:10 -0400
commit: 5588397ee5114b072b4d351747f9adbe3b0206f5 (patch)
tree: 24feadd8bda7d4f301d899d250b22dc68a94bcad /parsepdftext.py
parent: a2bb39738f012993352fc984d24f9ec9b1494146 (diff)
download: ocr-layer-curation-5588397ee5114b072b4d351747f9adbe3b0206f5.tar.gz
1 files changed, 10 insertions, 0 deletions
diff --git a/parsepdftext.py b/parsepdftext.py
new file mode 100644
index 0000000..778d30b
--- /dev/null
+++ b/parsepdftext.py
@@ -0,0 +1,10 @@
+import sys
+from xml.etree import ElementTree as ET
+# Usage: parsepdftext.py FILE -- print the text of every child of each XHTML <page>, one per line.
+document = ET.parse(sys.argv[1])
+ns = 'http://www.w3.org/1999/xhtml'
+for i, page in enumerate(document.findall('.//{{{0}}}page'.format(ns))):  # enumerate yields (index, element)
+    for word in page:  # direct iteration replaces the deprecated getchildren()
+        octalescapedtext = ''.join(["\\{0:o}".format(c) if c>127 else chr(c) for c in map(ord,(word.text or u'').encode('utf8'))])
+        # UTF-8 bytes above 127 are emitted as \ooo octal escapes; TODO: escape quote character
+        print octalescapedtext
author	Guillaume Horel <guillaume.horel@gmail.com>	2013-08-03 10:39:10 -0400
committer	Guillaume Horel <guillaume.horel@gmail.com>	2013-08-03 10:39:10 -0400
commit	5588397ee5114b072b4d351747f9adbe3b0206f5 (patch)
tree	24feadd8bda7d4f301d899d250b22dc68a94bcad /parsepdftext.py
parent	a2bb39738f012993352fc984d24f9ec9b1494146 (diff)
download	ocr-layer-curation-5588397ee5114b072b4d351747f9adbe3b0206f5.tar.gz