aboutsummaryrefslogtreecommitdiffstats
path: root/parsepdftext.py
diff options
context:
space:
mode:
authorGuillaume Horel <guillaume.horel@serenitascapital.com>2014-02-27 13:56:11 -0500
committerGuillaume Horel <guillaume.horel@serenitascapital.com>2014-02-27 13:56:11 -0500
commitf96752448a537bd6a3378a83ab0e8476653ec59c (patch)
tree14669d6d10e9d0343a33b32bf2ab7197fd25b84a /parsepdftext.py
parent0d583ea5f9873a5b2a22a89bbb979bf08dd05a90 (diff)
downloadocr-layer-curation-f96752448a537bd6a3378a83ab0e8476653ec59c.tar.gz
cleanup
Diffstat (limited to 'parsepdftext.py')
-rw-r--r--parsepdftext.py21
1 files changed, 0 insertions, 21 deletions
diff --git a/parsepdftext.py b/parsepdftext.py
deleted file mode 100644
index d1af47e..0000000
--- a/parsepdftext.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import sys
-from xml.etree import ElementTree as ET
-
def parse_coords(word):
    """Return the bounding box of *word* as ``(xMin, xMax, yMin, yMax)``.

    *word* is any object exposing an ``attrib`` mapping (e.g. an
    ``xml.etree.ElementTree.Element``).  The four values are returned exactly
    as stored in ``attrib`` -- no numeric conversion is performed.

    Raises ``KeyError`` if any of the four attributes is missing.
    """
    # Original author's note: coordinates are in dpi, and computed from the
    # top left corner.
    return tuple(word.attrib[c] for c in ('xMin', 'xMax', 'yMin', 'yMax'))
-
def parse_book(book):
    """Parse an XML document and collect the words of each page.

    *book* is passed straight to ``ET.parse`` (a file name or file object).
    Every ``page`` element in the XHTML namespace is located, and for each
    one the direct children are read.

    Returns a dict with two parallel lists, one entry per page:
      - ``"words"``: list of the ``.text`` of each child of the page;
      - ``"coords"``: list of ``parse_coords(child)`` for each child.
    """
    document = ET.parse(book)
    ns = 'http://www.w3.org/1999/xhtml'

    words = []
    coords = []
    for page in document.findall('.//{{{0}}}page'.format(ns)):
        # Element.getchildren() was removed in Python 3.9; iterating the
        # element yields the same direct children on every Python version.
        children = list(page)
        words.append([word.text for word in children])
        coords.append([parse_coords(word) for word in children])
    return {"words": words, "coords": coords}
-
if __name__ == "__main__":
    # Usage: parsepdftext.py <xml-file>
    book = parse_book(sys.argv[1])
    # Dump the words of the page at index 14 (hard-coded).  The call form of
    # print with a single argument behaves the same on Python 2 and Python 3,
    # whereas the print statement is a SyntaxError on Python 3.
    print(book['words'][14])