#!/usr/bin/python2
# Copyright 2003-2008, Nick Mathewson.  See LICENSE for licensing info.

"""BibTeX.py -- parse and manipulate BibTeX files and entries.

   Based on perl code by Eddie Kohler; heavily modified.
"""

import cStringIO
import re
import sys

import config

from entry import BibTeXEntry

__all__ = ['ParseError', 'BibTeX', 'FileIter', 'Parser', 'parseFile']


class ParseError(Exception):
    """Signals that the input could not be parsed as valid BibTeX."""


class BibTeX:
    """A parsed BibTeX file.

    Entries live in ``self.entries``, a dict keyed by the lower-cased entry
    key, so every lookup made through this class ignores case.
    """

    def __init__(self):
        self.entries = {}

    def addEntry(self, ent):
        """Add a BibTeX entry to this file.

        If an entry with the same key (ignoring case) is already present, a
        message is written to stderr and ``ent`` is not added.
        """
        k = ent.key
        lowered = k.lower()
        if lowered in self.entries:
            sys.stderr.write("Already have an entry named %s\n" % k)
            return
        self.entries[lowered] = ent

    def __contains__(self, key):
        """Return true if an entry with this key (ignoring case) exists."""
        return key.lower() in self.entries

    def __getitem__(self, key):
        """Return the entry with this key (ignoring case); KeyError if none."""
        return self.entries[key.lower()]

    def __iter__(self):
        """Iterate over the entry objects (not their keys)."""
        return iter(self.entries.values())

    def resolve(self):
        """Validate all entries in this file, and resolve cross-references.

        For every entry, the chain of 'crossref' fields is followed: the
        'crossref' field is removed and each field the entry does not already
        have is copied from the referenced entry.  Then the entry's own
        resolve() is called.  Finally, entries whose type is listed in
        config.OMIT_ENTRIES, or which lack the required field
        (config.REQUIRE_KEY, or "title" when that is None), have check()
        called on them and are removed from this file.

        Diagnostics are written to stderr.

        Raises:
            ParseError: if a crossref chain is circular.
        """
        warn = sys.stderr.write
        for ent in self:
            # Keys already visited on this entry's crossref chain.  The entry
            # itself is included so that a chain leading back to it (or a
            # self-reference) is reported as circular.
            seen = set([ent.key])
            while ent.get('crossref'):
                try:
                    cr = self.entries[ent['crossref'].lower()]
                except KeyError:
                    warn("No such crossref: %s\n" % ent['crossref'])
                    break
                if cr.key in seen:
                    raise ParseError("Circular crossref at %s" % ent.key)
                seen.add(cr.key)
                # Drop the field now; if the referenced entry has a crossref
                # of its own it is copied below and the loop continues.
                del ent.entries['crossref']

                # The referenced entry appears earlier in the input than the
                # entry that refers to it.
                if cr.entryLine < ent.entryLine:
                    warn("Warning: crossref %s used after declaration\n"
                         % cr.key)

                for k in cr.entries.keys():
                    if k in ent.entries:
                        warn("ERROR: %s defined both in %s and in %s\n" % (
                            k, ent.key, cr.key))
                    else:
                        ent.entries[k] = cr.entries[k]

            ent.resolve()

        rk = config.REQUIRE_KEY
        if rk is None:
            # hack: if no key is required, require "title", since every
            # entry will have a title.
            rk = "title"

        # Iterate over a snapshot: entries are deleted inside the loop.
        for ent in list(self):
            if ent.type in config.OMIT_ENTRIES or not ent.has_key(rk):
                ent.check()
                del self.entries[ent.key.lower()]


class FileIter:
    """Line iterator that keeps track of the current line number.

    Attributes:
        iter: the underlying iterator producing lines.
        lineno: number of calls made to next() so far (0 before the first).
    """

    def __init__(self, fname=None, file=None, it=None, string=None):
        """Create an iterator over one line source.

        Args:
            fname: name of a file to open for reading.
            file: an iterable of lines (for example an open file object).
            it: an existing iterator of lines, used when no other source
                is given.
            string: a string holding the whole input.

        When several sources are given, ``string`` wins over ``fname``,
        which wins over ``file``, which wins over ``it``.

        Raises:
            ValueError: if no source is given.
        """
        # File object opened here (and therefore ours to close), if any.
        self._opened = None
        if string is not None:
            file = cStringIO.StringIO(string)
        elif fname:
            file = self._opened = open(fname, 'r')
        if file is not None:
            it = iter(file)
        if it is None:
            raise ValueError(
                "FileIter needs one of fname, file, it or string")
        self.iter = it
        self.lineno = 0

    def close(self):
        """Close the file opened by the constructor, if there is one."""
        if self._opened is not None:
            self._opened.close()
            self._opened = None

    def next(self):
        """Return the next line and advance the line counter.

        The counter is incremented before the read, so it is also bumped by
        the call that hits end of input.

        Raises:
            StopIteration: at end of input.  A file opened by the
                constructor is closed at that point.
        """
        self.lineno += 1
        try:
            return next(self.iter)
        except StopIteration:
            self.close()
            raise

    def advance(self, line):
        """Return the first line, starting with ``line``, that has content.

        Empty lines, whitespace-only lines and lines matching COMMENT_RE are
        skipped by reading further lines with next(); StopIteration from
        next() propagates at end of input.
        """
        while not line or line.isspace() or COMMENT_RE.match(line):
            line = self.next()
        return line

# Matches a comment line outside of an entry: optional leading whitespace
# followed by '#' or '%'.
OUTER_COMMENT_RE = re.compile(r'^\s*[\#\%]')
# Matches a comment line inside of an entry: optional leading whitespace
# followed by '%'.
COMMENT_RE = re.compile(r'^\s*\%')
# Matches the start of an entry ('@' followed by a type name).  group 1 is
# the type of the entry.  group 2 is the rest of the line.
ENTRY_BEGIN_RE = re.compile(r'''^\s*\@([^\s\"\%\'\(\)\,\=\{\}]+)(.*)''')
# Matches the brace that opens an entry body, after optional whitespace.
# group 1 is the rest of the line.
BRACE_BEGIN_RE = re.compile(r'\s*\{(.*)')
# Matches the brace that closes an entry body, after optional whitespace.
# group 1 is the rest of the line.
BRACE_END_RE = re.compile(r'\s*\}(.*)')
# Matches an entry key or field name: a run of characters that are neither
# whitespace nor any of " # % ' ( ) , = { }.  group 1 is the key.  group 2
# is the rest of the line.
KEY_RE = re.compile(r'''\s*([^\"\#\%\'\(\)\,\=\{\}\s]+)(.*)''')

# The remaining patterns are used while scanning a field value.
# Text free of braces and double quotes, then a closing double quote.
# group 1 is the text before the quote.  group 2 is the rest of the line.
STRING_CLOSE_RE = re.compile(r'^([^\{\}\"]*)\"(.*)')
# Text free of braces, then a closing brace.  group 1 is the text before
# the brace.  group 2 is the rest of the line.
BRACE_CLOSE_RE = re.compile(r'^([^\{\}]*)\}(.*)')
# Text free of braces, then an opening brace.  group 1 is the text up to and
# including the brace.  group 2 is the rest of the line.
BRACE_OPEN_RE = re.compile(r'^([^\{\}]*\{)(.*)')
# A bare (unquoted) token: a run of characters that are not whitespace, '}'
# or ','.  group 1 is the token.  group 2 is the rest of the line.
RAW_DATA_RE = re.compile(r'^([^\s\},]+)(.*)')


class Parser:
    """Parser class: reads BibTeX from a file and returns a BibTeX object.

    Construct it with a line iterator and call parse().
    """
    # Fields
    # strings: maps entry string keys to their values.  Keys are lower-case
    #     for strings defined by @string entries, because lookups lower-case
    #     the token first.
    # newStrings: all string definitions not in config.INITIAL_STRINGS
    # invStrings: map from string values to their keys.
    # fileiter: the line iterator we're parsing from.
    # result: the BibTeX object that we're parsing into
    # litStringLine: the line on which we started parsing a literal string;
    #     0 for none.
    # entryLine: the line on which the current entry started; 0 for none.
    #
    # curEntType: the type of the entry we're parsing now. (paper,article,etc)
    #     None until the first entry has been seen.
    def __init__(self, fileiter, initial_strings, result=None):
        """Set up a parser.

        Args:
            fileiter: line source; must provide next(), advance(line) and a
                lineno attribute (see FileIter).
            initial_strings: dict of string definitions added on top of
                config.INITIAL_STRINGS.
            result: BibTeX object to add entries to; a new one is created
                when this is None.
        """
        self.strings = config.INITIAL_STRINGS.copy()
        self.strings.update(initial_strings)
        self.newStrings = {}
        self.invStrings = {}
        for k, v in config.INITIAL_STRINGS.items():
            self.invStrings[v] = k
        self.fileiter = fileiter
        if result is None:
            result = BibTeX()
        self.result = result
        self.litStringLine = 0
        self.entryLine = 0
        self.curEntType = None

    def advance(self, line):
        """Skip blank and comment lines; see the file iterator's advance()."""
        return self.fileiter.advance(line)

    @property
    def lineno(self):
        """Current line number, as reported by the file iterator."""
        return self.fileiter.lineno

    def _parseKey(self, line):
        """Parse an entry key or field name from the front of ``line``.

        Returns:
            (key, rest): the key and the unparsed remainder of the line.

        Raises:
            ParseError: if the next non-blank line does not start with a key.
        """
        line = self.advance(line)
        m = KEY_RE.match(line)
        if not m:
            raise ParseError("Expected key at line %s" % self.fileiter.lineno)
        key, line = m.groups()
        return key, line

    def _parseValue(self, line):
        """Parse one field value starting at ``line``.

        A value is one or more pieces joined with '#'.  Each piece is a
        double-quoted string, a brace-delimited string, or a bare token.  A
        bare token is looked up, lower-cased, in self.strings and replaced by
        its definition when one exists.  More lines are read from the file
        iterator whenever a piece continues past the end of the line.

        Returns:
            (value, rest): the value with each run of whitespace collapsed to
            one space and surrounding whitespace removed, and the unparsed
            remainder of the current line.

        Raises:
            ParseError: if a piece cannot be recognised.
            StopIteration: propagated from the file iterator at end of input.
        """
        bracelevel = 0
        data = []
        while True:
            line = self.advance(line).strip()
            assert line

            # Literal string?
            if line[0] == '"':
                line = line[1:]
                # Remembered so parse() can report where an unterminated
                # string began.
                self.litStringLine = self.fileiter.lineno
                while True:
                    if bracelevel:
                        # Inside braces a double quote does not end the
                        # string; only look for the matching brace.
                        m = BRACE_CLOSE_RE.match(line)
                        if m:
                            data.append(m.group(1))
                            data.append('}')
                            line = m.group(2)
                            bracelevel -= 1
                            continue
                    else:
                        m = STRING_CLOSE_RE.match(line)
                        if m:
                            data.append(m.group(1))
                            line = m.group(2)
                            break
                    m = BRACE_OPEN_RE.match(line)
                    if m:
                        # group 1 includes the '{', so braces inside a quoted
                        # string are kept in the value.
                        data.append(m.group(1))
                        line = m.group(2)
                        bracelevel += 1
                        continue
                    # Nothing closes on this line: keep it and read on.
                    data.append(line)
                    data.append(" ")
                    line = self.fileiter.next()
                self.litStringLine = 0
            elif line[0] == '{':
                # Brace-delimited string.  The outermost pair of braces is
                # dropped; nested braces are kept.
                bracelevel += 1
                line = line[1:]
                while bracelevel:
                    m = BRACE_CLOSE_RE.match(line)
                    if m:
                        data.append(m.group(1))
                        bracelevel -= 1
                        if bracelevel > 0:
                            data.append('}')
                        line = m.group(2)
                        continue
                    m = BRACE_OPEN_RE.match(line)
                    if m:
                        bracelevel += 1
                        data.append(m.group(1))
                        line = m.group(2)
                        continue
                    else:
                        # No brace on the rest of this line: keep it and
                        # read on.
                        data.append(line)
                        data.append(" ")
                        line = self.fileiter.next()
            elif line[0] == '#':
                # A '#' where a piece was expected; it is consumed by the
                # concatenation check below.
                sys.stderr.write("Weird concat on line %s\n" % self.lineno)
            elif line[0] in "},":
                if not data:
                    sys.stderr.write("No data after field on line %s\n" % (
                        self.lineno))
            else:
                m = RAW_DATA_RE.match(line)
                if m:
                    s = self.strings.get(m.group(1).lower())
                    if s is not None:
                        data.append(s)
                    else:
                        data.append(m.group(1))
                    line = m.group(2)
                else:
                    raise ParseError("Questionable line at line %s"
                                     % self.lineno)

            # Got a string, check for concatenation.
            if line.isspace() or not line:
                data.append(" ")
            line = self.advance(line).strip()
            assert line
            if line[0] == '#':
                line = line[1:]
            else:
                data = "".join(data)
                data = re.sub(r'\s+', ' ', data)
                data = re.sub(r'^\s+', '', data)
                data = re.sub(r'\s+$', '', data)
                return data, line

    def _parseEntry(self, line):  # name, strings, entries
        """Parse the body of one entry; ``line`` is the text after '@type'.

        Depending on self.curEntType the result is recorded as a string
        definition ('string'), discarded ('preamble'), or added to
        self.result as a BibTeXEntry (anything else).

        Returns:
            The unparsed remainder of the line holding the closing brace.

        Raises:
            ParseError: if the opening brace is missing, or the entry has too
                many or too few arguments for its type.
            StopIteration: propagated from the file iterator at end of input.
        """
        self.entryLine = self.lineno
        line = self.advance(line)

        m = BRACE_BEGIN_RE.match(line)
        if not m:
            raise ParseError("Expected an opening brace at line %s"
                             % self.lineno)
        line = m.group(1)

        # Describes the expected contents, one character per item:
        # 'k' is a bare key, 'v' a value, 'p' a key=value pair.  A trailing
        # '*' means the item before it may repeat any number of times.
        proto = {'string': 'p', 'preamble': 'v'}.get(self.curEntType, 'kp*')

        v = []
        while True:
            line = self.advance(line)

            m = BRACE_END_RE.match(line)
            if m:
                line = m.group(1)
                break
            if not proto:
                raise ParseError("Overlong entry starting on line %s"
                                 % self.entryLine)
            elif proto[0] == 'k':
                key, line = self._parseKey(line)
                v.append(key)
            elif proto[0] == 'v':
                value, line = self._parseValue(line)
                v.append(value)
            elif proto[0] == 'p':
                key, line = self._parseKey(line)
                v.append(key)
                line = self.advance(line).strip()
                if line[0] == '=':
                    line = line[1:]
                value, line = self._parseValue(line)
                v.append(value)
            else:
                raise AssertionError("unexpected proto %r" % proto)
            line = line.strip()
            if line and line[0] == ',':
                line = line[1:]
            # Consume the item just parsed, unless it is the repeating one.
            if proto and proto[1:] != '*':
                proto = proto[1:]
        # Anything left other than a repeating item was required but absent.
        if proto and proto[1:] != '*':
            raise ParseError("Missing arguments to %s on line %s" % (
                             self.curEntType, self.entryLine))

        if self.curEntType == 'string':
            name, value = v[0], v[1]
            # _parseValue lower-cases a token before looking it up, so the
            # definition must be stored under the lower-cased name or a
            # mixed-case @string could never be expanded.
            self.strings[name.lower()] = value
            self.newStrings[name] = value
            self.invStrings[value] = name
        elif self.curEntType == 'preamble':
            pass
        else:
            # v is [entry key, field name, field value, field name, ...].
            key = v[0]
            d = {}
            for i in range(1, len(v), 2):
                d[v[i].lower()] = v[i+1]
            ent = BibTeXEntry(self.curEntType, key, d)
            ent.entryLine = self.entryLine
            self.result.addEntry(ent)

        return line

    def parse(self):
        """Parse the whole input and return the BibTeX result object.

        The result also gets ``invStrings`` and ``newStrings`` attributes
        copied from this parser.

        Raises:
            ParseError: on malformed input, including end of input inside a
                quoted string or inside an entry.
        """
        try:
            self._parse()
        except StopIteration:
            # End of input is the normal way out of _parse(); it is only an
            # error if a string or an entry was still open.
            if self.litStringLine:
                raise ParseError("Unexpected EOF in string (started on %s)" %
                                 self.litStringLine)
            elif self.entryLine:
                raise ParseError("Unexpected EOF at line %s (entry started "
                                 "on %s)" % (self.lineno, self.entryLine))

        self.result.invStrings = self.invStrings
        self.result.newStrings = self.newStrings

        return self.result

    def _parse(self):
        """Parse entries until the file iterator raises StopIteration.

        Raises:
            ParseError: if a non-blank, non-comment line outside an entry
                does not start a new entry.
        """
        line = self.fileiter.next()
        while True:
            # Skip blank lines.
            while not line or line.isspace() or OUTER_COMMENT_RE.match(line):
                line = self.fileiter.next()
            # Get the first line of an entry.
            m = ENTRY_BEGIN_RE.match(line)
            if m:
                self.curEntType = m.group(1).lower()
                line = m.group(2)
                line = self._parseEntry(line)
                self.entryLine = 0
            else:
                raise ParseError("Bad input at line %s (expected a new entry.)"
                                 % self.lineno)


def parseFile(filename, result=None):
    """Helper function: parse a single BibTeX file.

    Args:
        filename: path of the BibTeX file to read.
        result: BibTeX object to add entries to; a new one is created by the
            parser when this is None.

    Returns:
        The BibTeX object, after resolve() has been called on it and check()
        on each entry it still contains.
    """
    # Open the file here so that it is closed even when parsing fails.
    with open(filename, 'r') as fp:
        p = Parser(FileIter(file=fp), {}, result)
        r = p.parse()
    r.resolve()
    for e in r:
        e.check()
    return r


def parseString(string, result=None):
    """Helper function: parse BibTeX source held in a string.

    The text is parsed into ``result`` (or into a new BibTeX object when
    ``result`` is None); the parsed object then has resolve() called on it
    and check() called on each of its entries before it is returned.
    """
    parsed = Parser(FileIter(string=string), {}, result).parse()
    parsed.resolve()
    for entry in parsed:
        entry.check()
    return parsed

if __name__ == '__main__':
    if len(sys.argv) > 1:
        fname = sys.argv[1]
    else:
        fname = "testbib/pdos.bib"

    r = parseFile(fname)

    for e in r:
        if e.type in ("proceedings", "journal"):
            continue
        print e.to_html()