aboutsummaryrefslogtreecommitdiffstats
path: root/parsedjvutext.py
blob: 41c41942de7c3960ac372ad63e3e9321d5aac764 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
import sys
from bs4 import BeautifulSoup

def parse_book(book):
    """Collect word texts and word coordinates from a hidden-text markup file.

    The file is parsed with BeautifulSoup's ``"lxml"`` parser. Every
    ``hiddentext`` element found contributes one entry to each result list,
    built from the ``word`` elements nested inside it.

    Args:
        book: Path of the file to parse; passed straight to ``open``.

    Returns:
        A dict with two parallel lists (same length, same nesting):

        * ``"words"``: for each ``hiddentext`` element, the list of the
          ``.text`` of its ``word`` elements.
        * ``"coords"``: for each ``hiddentext`` element, a list of tuples of
          ints, one tuple per ``word`` element, obtained by splitting that
          element's ``coords`` attribute on commas.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a ``word`` element has no ``coords`` attribute.
        ValueError: If a comma-separated ``coords`` field is not an integer.
    """
    words = []
    coords = []
    # NOTE(review): no encoding is passed, so the file is decoded with the
    # platform default encoding -- confirm this matches the input files.
    with open(book) as fh:
        soup = BeautifulSoup(fh, "lxml")
        for page in soup.find_all("hiddentext"):
            # Search the page once and build both lists from the same result,
            # which also guarantees words and coords stay index-aligned.
            page_words = page.find_all("word")
            words.append([word.text for word in page_words])
            coords.append([
                tuple(int(value) for value in word["coords"].split(","))
                for word in page_words
            ])
    return {"words": words, "coords": coords}

if __name__ == "__main__":
    # Script entry point: parse the file named by the first command-line
    # argument. Without an argument, exit with a usage message instead of
    # an IndexError traceback (sys.exit with a string prints it to stderr
    # and exits with status 1).
    if len(sys.argv) < 2:
        sys.exit("usage: {} BOOK".format(sys.argv[0]))
    # The result is kept in the module-level name `book`; nothing is printed.
    book = parse_book(sys.argv[1])