blob: 778d30ba181276c15c2277e791d5661498534471 (
plain)
1
2
3
4
5
6
7
8
9
10
|
import sys
from xml.etree import ElementTree as ET
document = ET.parse(sys.argv[1])
ns = 'http://www.w3.org/1999/xhtml'
for page, i in enumerate(document.findall('.//{{{0}}}page'.format(ns))):
for word in page.getchildren():
octalescapedtext = ''.join(["\{0:o}".format(c) if c>127 else chr(c) for c in map(ord,word.text.encode('utf8'))])
#escape quote character
print octalescapedtext
|