diff options
| -rw-r--r-- | parsedjvutext.py | 16 |
1 files changed, 16 insertions, 0 deletions
"""Extract per-page words and their coordinates from a book's markup.

Command-line use: ``python parsedjvutext.py BOOK`` parses BOOK and keeps the
result in the module-level name ``book``; nothing is printed.
"""

import sys

from bs4 import BeautifulSoup


def parse_book(book):
    """Collect the text and coordinates of every word, grouped by page.

    Each ``hiddentext`` element in the file is treated as one page, and each
    ``word`` element inside it contributes one entry to that page.

    Args:
        book: Path of the markup file to parse.

    Returns:
        A dict with two parallel lists, one item per ``hiddentext`` element:

        * ``"words"``  -- list of the text content of each ``word`` element.
        * ``"coords"`` -- list of tuples of ints, parsed from the
          comma-separated ``coords`` attribute of each ``word`` element.

        Both lists are empty when the file has no ``hiddentext`` elements.

    Raises:
        KeyError: A ``word`` element has no ``coords`` attribute.
        ValueError: A ``coords`` attribute contains a non-integer field.
    """
    words = []
    coords = []
    # Open in binary mode so Beautiful Soup decodes the document itself
    # instead of Python decoding it with the platform's locale encoding
    # first, which garbles non-ASCII text on non-UTF-8 systems.
    with open(book, "rb") as fh:
        # "lxml" selects Beautiful Soup's lxml-based HTML parser, which
        # lower-cases tag names; hence the lower-case names searched below.
        soup = BeautifulSoup(fh, "lxml")
    for page in soup.find_all("hiddentext"):
        # Search the page once and reuse the result for both lists.
        page_words = page.find_all("word")
        words.append([word.text for word in page_words])
        coords.append([
            tuple(int(field) for field in word["coords"].split(","))
            for word in page_words
        ])
    return {"words": words, "coords": coords}


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit("usage: %s BOOK" % sys.argv[0])
    book = parse_book(sys.argv[1])
