blob: 41c41942de7c3960ac372ad63e3e9321d5aac764 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
|
import sys
from bs4 import BeautifulSoup
def parse_book(book):
    """Extract the text and coordinates of every word in *book*.

    The file is parsed with BeautifulSoup's "lxml" parser. Each
    ``hiddentext`` element is treated as one page, and every ``word``
    element below it contributes its text and its ``coords`` attribute.

    Args:
        book: Path of the file to parse; passed straight to ``open``.

    Returns:
        A dict with two parallel lists, one entry per ``hiddentext``
        element in document order:

        * ``"words"``  -- list of the word texts found on that page.
        * ``"coords"`` -- list of int tuples, one per word, obtained by
          splitting the word's ``coords`` attribute on commas.

    Raises:
        KeyError: If a ``word`` element has no ``coords`` attribute.
        ValueError: If a component of ``coords`` is not an integer.
    """
    # The constructor consumes the whole file handle, so the file can be
    # closed before the tree is walked.
    with open(book) as fh:
        soup = BeautifulSoup(fh, "lxml")

    words = []
    coords = []
    for page in soup.find_all("hiddentext"):
        # Look the words up once per page and reuse the result for both
        # lists, so text and coordinates stay aligned index-for-index.
        page_words = page.find_all("word")
        words.append([word.text for word in page_words])
        coords.append(
            [tuple(map(int, word["coords"].split(","))) for word in page_words]
        )
    return {"words": words, "coords": coords}
if __name__ == "__main__":
    # Script entry point: expects the path of the book file as the first
    # command-line argument.
    if len(sys.argv) < 2:
        # Exit with a readable message (and non-zero status) instead of
        # letting sys.argv[1] fail with a bare IndexError traceback.
        sys.exit("usage: {} BOOK".format(sys.argv[0]))
    # The parsed result is bound to the module-level name `book`.
    book = parse_book(sys.argv[1])
|