-rwxr-xr-x  extract_pages.sh   7
-rw-r--r--  parsedjvutext.py  28
-rw-r--r--  parsepdftext.py   21
3 files changed, 0 insertions, 56 deletions
diff --git a/extract_pages.sh b/extract_pages.sh
deleted file mode 100755
index c49a0c4..0000000
--- a/extract_pages.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-
-npages=$(djvused -e 'n' $1)
-
-for i in $(seq 1 $npages); do
-    djvused -e "select $i;output-txt" $1 >page${i}.djvutxt
-done
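
For reference, the deleted script only shells out to djvused twice: once to read the page count ("djvused -e n") and once per page to dump the hidden text layer into page<N>.djvutxt. A minimal Python sketch of the same loop is below; the helper name extract_pages is hypothetical, and it assumes the djvused binary from DjVuLibre is on PATH, exactly as the shell version did.

# Sketch of the deleted extract_pages.sh in Python (hypothetical helper).
# Assumes the djvused binary from DjVuLibre is available on PATH.
import subprocess
import sys

def extract_pages(djvubook):
    # "djvused -e n" prints the number of pages in the document
    npages = int(subprocess.check_output(["djvused", "-e", "n", djvubook]))
    for i in range(1, npages + 1):
        # "select <i>;output-txt" dumps the hidden text layer of page i
        text = subprocess.check_output(
            ["djvused", "-e", "select {0};output-txt".format(i), djvubook])
        with open("page{0}.djvutxt".format(i), "wb") as out:
            out.write(text)

if __name__ == "__main__":
    extract_pages(sys.argv[1])
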
diff --git a/parsedjvutext.py b/parsedjvutext.py
index 9855786..a13421a 100644
--- a/parsedjvutext.py
+++ b/parsedjvutext.py
@@ -6,34 +6,6 @@ from djvu.decode import Context
from itertools import chain
import collections
-
-def parse_book_xml(djvubook):
-    args = ["djvutoxml", djvubook]
-    soup = BeautifulSoup(subprocess.check_output(args), "lxml")
-    words = []
-    coords = []
-    for page in soup.find_all("hiddentext"):
-        words.append([word.text for word in page.find_all("word")])
-        coords.append([tuple(map(int, word["coords"].split(",")))
-                       for word in page.find_all("word")])
-    return {"words": words, "coords": coords}
-
-
-def get_npages(djvubook):
-    args = ["djvused", "-e", "n", djvubook]
-    return int(subprocess.check_output(args))
-
-
-def parse_page_xml(djvubook, pagenumber):
-    args = ["djvutoxml", "--page", str(pagenumber), djvubook]
-    soup = BeautifulSoup(subprocess.check_output(args), "lxml")
-    all_words = soup.find_all("word")
-    words = [word.text for word in all_words]
-    coords = [tuple(map(int, word["coords"].split(",")))
-              for word in all_words]
-    return {"words": words, "coords": coords}
-
-
def parse_page(page, html=False):
    s, page_size = page.text.sexpr, page.size[1]
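
The three removed helpers all shelled out (djvutoxml / djvused) and parsed the resulting XML with BeautifulSoup; what remains is the sexpr-based parse_page, fed by python-djvulibre (the "from djvu.decode import Context" import visible in the hunk header). A hedged sketch of how pages might be handed to parse_page without the subprocess helpers follows; the exact python-djvulibre calls (Context, FileUri, decoding_job.wait, document.pages, page.get_info) are assumptions, not something shown in this diff.

import djvu.decode

def iter_pages(path):
    # Assumed python-djvulibre usage: decode the document once, then walk
    # its pages; each page exposes .size and .text.sexpr, which is what the
    # retained parse_page(page) consumes.
    context = djvu.decode.Context()
    document = context.new_document(djvu.decode.FileUri(path))
    document.decoding_job.wait()
    for page in document.pages:
        page.get_info()
        yield page
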
diff --git a/parsepdftext.py b/parsepdftext.py
deleted file mode 100644
index d1af47e..0000000
--- a/parsepdftext.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import sys
-from xml.etree import ElementTree as ET
-
-def parse_coords(word):
-    # coordinates are in dpi, and computed from the top left corner
-    return tuple([word.attrib[c] for c in ['xMin', 'xMax', 'yMin', 'yMax']])
-
-def parse_book(book):
-    document = ET.parse(book)
-    ns = 'http://www.w3.org/1999/xhtml'
-
-    words = []
-    coords = []
-    for page in document.findall('.//{{{0}}}page'.format(ns)):
-        words.append([word.text for word in page.getchildren()])
-        coords.append([parse_coords(word) for word in page.getchildren()])
-    return {"words": words, "coords": coords}
-
-if __name__=="__main__":
-    book = parse_book(sys.argv[1])
-    print book['words'][14]
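
The deleted PDF parser reads what appears to be the XHTML that "pdftotext -bbox" emits: per-page <word> elements carrying xMin/yMin/xMax/yMax attributes in the http://www.w3.org/1999/xhtml namespace. It was still Python 2 (the bare print statement) and returned the coordinates as strings. A hedged Python 3 sketch of the same traversal, with the coordinates converted to float (a deviation from the original, which kept them as strings):

# Hypothetical Python 3 counterpart of the deleted parse_book; assumes the
# input XML comes from something like "pdftotext -bbox book.pdf book.xml".
import sys
from xml.etree import ElementTree as ET

NS = "{http://www.w3.org/1999/xhtml}"

def parse_coords(word):
    # same attribute order as the deleted code: xMin, xMax, yMin, yMax
    return tuple(float(word.attrib[c]) for c in ("xMin", "xMax", "yMin", "yMax"))

def parse_book(path):
    words, coords = [], []
    for page in ET.parse(path).getroot().iter(NS + "page"):
        page_words = list(page.iter(NS + "word"))
        words.append([w.text for w in page_words])
        coords.append([parse_coords(w) for w in page_words])
    return {"words": words, "coords": coords}

if __name__ == "__main__":
    book = parse_book(sys.argv[1])
    print(book["words"][0])   # words of the first page
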