import sys
from bs4 import BeautifulSoup
import subprocess
import djvu
from djvu.decode import Context
from itertools import chain


def parse_book_xml(djvubook):
    """Extract words and their coordinates for every page via ``djvutoxml``.

    Runs ``djvutoxml <djvubook>`` and parses its output with BeautifulSoup
    using the "lxml" parser.

    Args:
        djvubook: path of the DjVu file, passed as-is to ``djvutoxml``.

    Returns:
        A dict with two parallel lists, one entry per ``hiddentext`` element
        found in the output:
          * "words":  list of lists of word strings
          * "coords": list of lists of int tuples, parsed from each word's
            comma-separated ``coords`` attribute

    Raises:
        subprocess.CalledProcessError: if ``djvutoxml`` exits with a
            non-zero status.
    """
    args = ["djvutoxml", djvubook]
    soup = BeautifulSoup(subprocess.check_output(args), "lxml")
    words = []
    coords = []
    for page in soup.find_all("hiddentext"):
        # Search the page subtree once and reuse the result for both lists
        # (same approach as parse_page_xml).
        page_words = page.find_all("word")
        words.append([word.text for word in page_words])
        coords.append([tuple(map(int, word["coords"].split(",")))
                       for word in page_words])
    return {"words": words, "coords": coords}


def get_npages(djvubook):
    """Return the page count of ``djvubook`` as reported by ``djvused``.

    Runs ``djvused -e n <djvubook>`` and converts its output to an int.

    Raises:
        subprocess.CalledProcessError: if ``djvused`` exits with a non-zero
            status.
        ValueError: if the output cannot be converted to an integer.
    """
    command = ["djvused", "-e", "n", djvubook]
    output = subprocess.check_output(command)
    return int(output)


def parse_page_xml(djvubook, pagenumber):
    """Extract the words and coordinates of one page via ``djvutoxml``.

    Runs ``djvutoxml --page <pagenumber> <djvubook>`` and collects every
    ``word`` element of the output.

    Args:
        djvubook: path of the DjVu file.
        pagenumber: page to extract; converted with ``str`` and handed to
            the ``--page`` option unchanged.

    Returns:
        A dict with two parallel flat lists:
          * "words":  the text of each word element
          * "coords": a tuple of ints parsed from each word's
            comma-separated ``coords`` attribute
    """
    xml = subprocess.check_output(
        ["djvutoxml", "--page", str(pagenumber), djvubook])
    soup = BeautifulSoup(xml, "lxml")
    nodes = soup.find_all("word")

    words = []
    for node in nodes:
        words.append(node.text)

    coords = []
    for node in nodes:
        coords.append(tuple(int(value) for value in node["coords"].split(",")))

    return {"words": words, "coords": coords}


def parse_page_sexp(s, page_size=None):
    """Yield ``(word, coords)`` pairs found in a hidden-text S-expression.

    The expression is walked recursively. A list whose first element is the
    symbol ``word`` produces one pair; for any other list the elements from
    index 5 onwards are treated as children and searched in turn.

    Args:
        s: an S-expression object from ``djvu.sexpr``. Anything that is not
            a ``ListExpression`` (and any empty list) yields nothing.
        page_size: when truthy, elements 1 and 3 of each coordinate list are
            replaced by ``page_size - value``. parse_book passes
            ``page.size[1]`` here when ``html`` is set.

    Yields:
        ``(word, coords)`` where ``word`` is the value of element 5 of the
        word expression and ``coords`` is a list of the values of elements
        1-4 (presumably xmin, ymin, xmax, ymax per the DjVu hidden-text
        syntax -- confirm against the djvused documentation).
    """
    if not isinstance(s, djvu.sexpr.ListExpression):
        # Atoms (strings, numbers, symbols) carry no zone of their own.
        return
    if len(s) == 0:
        # The original guard was a no-op `pass` and then crashed on `s[0]`
        # with IndexError; an empty list simply contains no words.
        return

    if str(s[0].value) == "word":
        # `range` instead of `xrange`: identical here and valid on both
        # Python 2 and Python 3.
        coords = [s[i].value for i in range(1, 5)]
        if page_size:
            coords[1] = page_size - coords[1]
            coords[3] = page_size - coords[3]
        word = s[5].value
        yield (word, coords)
        return

    # Non-word zone: skip the zone type and its four coordinates, then
    # descend into whatever follows.
    for child in s[5:]:
        for found in parse_page_sexp(child, page_size):
            yield found


def parse_book(djvubook, page=None, html=False):
    """Return the words and coordinates stored in a DjVu book's text layer.

    The document is opened through python-djvulibre and each selected
    page's text S-expression is handed to parse_page_sexp.

    Args:
        djvubook: path of the DjVu file.
        page: 1-based page number to parse. Any falsy value (``None``, the
            default, but also ``0``) selects the whole book.
        html: if true, ``page.size[1]`` (the page height according to the
            python-djvulibre docs -- TODO confirm) is passed as
            ``page_size`` so that every y coordinate is replaced by
            ``height - y``. Otherwise coordinates are returned as stored.

    Returns:
        A list with one entry per parsed page *that has a non-empty text
        S-expression*; pages without text are skipped, so list positions do
        not necessarily match page numbers. Each entry is the transposed
        list of ``(word, coords)`` pairs, i.e. ``[words, coords]`` as two
        tuples of equal length, or an empty list when the page's expression
        contains no word.
    """
    context = Context()
    document = context.new_document(djvu.decode.FileURI(djvubook))
    document.decoding_job.wait()
    if page:
        toparse = [document.pages[page - 1]]
    else:
        toparse = document.pages

    result = []
    # `current` rather than `page`, so the parameter is not shadowed.
    for current in toparse:
        sexpr = current.text.sexpr  # fetched once and reused below
        if not sexpr:
            continue
        page_size = current.size[1] if html else None
        pairs = parse_page_sexp(sexpr, page_size)
        # list(...) keeps the Python 2 result shape on Python 3, where a
        # bare zip() would be a lazy, single-use iterator.
        result.append(list(zip(*pairs)))
    return result

if __name__ == "__main__":
    # Script entry point: parse the DjVu file named by the first
    # command-line argument.
    if len(sys.argv) < 2:
        # Exit with a usage message instead of an IndexError traceback.
        sys.exit("usage: %s BOOK.djvu" % sys.argv[0])
    # The parsed result is bound to `book`; nothing is printed here.
    book = parse_book(sys.argv[1])