From f96752448a537bd6a3378a83ab0e8476653ec59c Mon Sep 17 00:00:00 2001
From: Guillaume Horel
Date: Thu, 27 Feb 2014 13:56:11 -0500
Subject: cleanup

---
 parsepdftext.py | 21 ---------------------
 1 file changed, 21 deletions(-)
 delete mode 100644 parsepdftext.py

(limited to 'parsepdftext.py')

diff --git a/parsepdftext.py b/parsepdftext.py
deleted file mode 100644
index d1af47e..0000000
--- a/parsepdftext.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import sys
-from xml.etree import ElementTree as ET
-
-def parse_coords(word):
-    # coordinates are in dpi, and computed from the top left corner
-    return tuple([word.attrib[c] for c in ['xMin', 'xMax', 'yMin', 'yMax']])
-
-def parse_book(book):
-    document = ET.parse(book)
-    ns = 'http://www.w3.org/1999/xhtml'
-
-    words = []
-    coords = []
-    for page in document.findall('.//{{{0}}}page'.format(ns)):
-        words.append([word.text for word in page.getchildren()])
-        coords.append([parse_coords(word) for word in page.getchildren()])
-    return {"words": words, "coords": coords}
-
-if __name__=="__main__":
-    book = parse_book(sys.argv[1])
-    print book['words'][14]
-- 
cgit v1.2.3-70-g09d2