about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- parsedjvutext.py 16
1 files changed, 16 insertions, 0 deletions
diff --git a/parsedjvutext.py b/parsedjvutext.py
new file mode 100644
index 0000000..41c4194
--- /dev/null
+++ b/parsedjvutext.py
@@ -0,0 +1,16 @@
+import sys
+from bs4 import BeautifulSoup
+
+def parse_book(book):
+    """Extract word texts and integer coordinates from the markup file at path `book`.
+
+    Returns {"words": [...], "coords": [...]} with one inner list per <hiddentext> element.
+    Raises KeyError if a <word> lacks "coords"; ValueError if a coordinate is not an integer."""
+    with open(book, "rb") as fh:  # bytes, so the parser detects the encoding, not the locale
+        soup = BeautifulSoup(fh, "lxml")
+    pages = [page.find_all("word") for page in soup.find_all("hiddentext")]  # one lookup each
+    coords = [[tuple(map(int, word["coords"].split(","))) for word in page] for page in pages]
+    return {"words": [[word.text for word in page] for page in pages], "coords": coords}
+
+if __name__ == "__main__":  # only when run as a script, not on import
+    book = parse_book(sys.argv[1])  # input path is the first CLI argument