From f96752448a537bd6a3378a83ab0e8476653ec59c Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@serenitascapital.com>
Date: Thu, 27 Feb 2014 13:56:11 -0500
Subject: cleanup

---
 parsepdftext.py | 21 ---------------------
 1 file changed, 21 deletions(-)
 delete mode 100644 parsepdftext.py

(limited to 'parsepdftext.py')

diff --git a/parsepdftext.py b/parsepdftext.py
deleted file mode 100644
index d1af47e..0000000
--- a/parsepdftext.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import sys
-from xml.etree import ElementTree as ET
-
-def parse_coords(word):
-    # coordinates are in dpi, and computed from the top left corner
-    return tuple([word.attrib[c] for c in ['xMin', 'xMax', 'yMin', 'yMax']])
-
-def parse_book(book):
-    document = ET.parse(book)
-    ns = 'http://www.w3.org/1999/xhtml'
-
-    words = []
-    coords = []
-    for page in document.findall('.//{{{0}}}page'.format(ns)):
-        words.append([word.text for word in page.getchildren()])
-        coords.append([parse_coords(word) for word in page.getchildren()])
-    return {"words": words, "coords": coords}
-
-if __name__=="__main__":
-    book = parse_book(sys.argv[1])
-    print book['words'][14]
-- 
cgit v1.2.3-70-g09d2