diff options
| author | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-03 13:33:33 -0400 |
|---|---|---|
| committer | Guillaume Horel <guillaume.horel@gmail.com> | 2013-08-03 13:33:33 -0400 |
| commit | 41df9107d8f7ae19bbdcadee0f411e9763c6fbbc (patch) | |
| tree | 0b3cde92ba83436e315693446333d185761d92bd /parsedjvutext.py | |
| parent | d82fdab22ae3600fc24758a150fc23e2fec36921 (diff) | |
| download | ocr-layer-curation-41df9107d8f7ae19bbdcadee0f411e9763c6fbbc.tar.gz | |
script to parse djvu xml
Diffstat (limited to 'parsedjvutext.py')
| -rw-r--r-- | parsedjvutext.py | 16 |
1 files changed, 16 insertions, 0 deletions
```diff
diff --git a/parsedjvutext.py b/parsedjvutext.py
new file mode 100644
index 0000000..41c4194
--- /dev/null
+++ b/parsedjvutext.py
@@ -0,0 +1,16 @@
+import sys
+from bs4 import BeautifulSoup
+
+def parse_book(book):
+    words = []
+    coords = []
+    with open(book) as fh:
+        soup = BeautifulSoup(fh, "lxml")
+        for page in soup.find_all("hiddentext"):
+            words.append([word.text for word in page.find_all("word")])
+            coords.append([tuple(map(int, word["coords"].split(","))) \
+                           for word in page.find_all("word")])
+    return {"words": words, "coords": coords}
+
+if __name__=="__main__":
+    book = parse_book(sys.argv[1])
```
