diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-03 10:39:10 -0400 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-03 10:39:10 -0400 |
| commit | 5588397ee5114b072b4d351747f9adbe3b0206f5 (patch) | |
| tree | 24feadd8bda7d4f301d899d250b22dc68a94bcad /parsepdftext.py | |
| parent | a2bb39738f012993352fc984d24f9ec9b1494146 (diff) | |
| download | ocr-layer-curation-5588397ee5114b072b4d351747f9adbe3b0206f5.tar.gz | |
add initial text file and parser
Diffstat (limited to 'parsepdftext.py')
| -rw-r--r-- | parsepdftext.py | 10 |
1 files changed, 10 insertions, 0 deletions
"""Print the text of every word in an XML/XHTML file, one word per line.

Usage: parsepdftext.py FILE

FILE is parsed with ElementTree.  Every ``page`` element in the XHTML
namespace is located, and the text of each of its direct children is written
to standard output with all non-ASCII bytes of its UTF-8 encoding replaced by
backslash-octal escapes.

NOTE(review): the input is presumably the output of ``pdftotext -bbox``
(``page`` elements containing ``word`` elements) -- confirm against the data
file added in the same commit.
"""
import sys
from xml.etree import ElementTree as ET

# Namespace of the ``page`` elements searched for in the input document.
XHTML_NS = 'http://www.w3.org/1999/xhtml'


def octal_escape(text):
    r"""Return *text* with each non-ASCII UTF-8 byte written as backslash-octal.

    The text is encoded as UTF-8; bytes up to 127 are kept as the matching
    ASCII character and every larger byte becomes a backslash followed by
    its value in octal (always three digits, since the value is 128..255).

    ``None`` (the ``.text`` of an empty element) is treated as an empty
    string so that callers still emit one line per element.
    """
    if text is None:
        return ''
    # bytearray yields integers on both Python 2 and Python 3, unlike
    # iterating the encoded string directly.
    return ''.join(
        '\\{0:o}'.format(byte) if byte > 127 else chr(byte)
        for byte in bytearray(text.encode('utf8'))
    )


def iter_words(document):
    """Yield each direct child of every XHTML ``page`` element in *document*.

    *document* is anything with an ElementTree-style ``findall`` (an
    ``ElementTree`` or an ``Element``).  Pages are visited in document
    order, and so are the children within a page.
    """
    for page in document.findall('.//{{{0}}}page'.format(XHTML_NS)):
        # Iterating an element yields its children; Element.getchildren()
        # no longer exists on Python 3.9+.
        for word in page:
            yield word


def main(argv=None):
    """Parse the file named by ``argv[1]`` and print its escaped words.

    *argv* defaults to ``sys.argv``.  Returns 0 on success and 2 when the
    file argument is missing; parse errors from ElementTree propagate.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.stderr.write('usage: {0} FILE\n'.format(argv[0] if argv else 'parsepdftext.py'))
        return 2
    document = ET.parse(argv[1])
    for word in iter_words(document):
        # TODO: escape quote character (noted in the original, never implemented)
        sys.stdout.write(octal_escape(word.text) + '\n')
    return 0


if __name__ == '__main__':
    sys.exit(main())
