#!/usr/bin/python2
# Copyright 2003-2008, Nick Mathewson.  See LICENSE for licensing info.

"""BibTeX.py -- parse and manipulate BibTeX files and entries.

   Based on perl code by Eddie Kohler; heavily modified.
"""

import cStringIO
import re
import sys
import os
import copy

import config

from entry import BibTeXEntry, buildAuthorTable
from utils import txtize, url_untranslate, smartJoin

# Names exported by a star-import of this module.  Every name listed here
# must actually be bound in this module: a missing name makes
# "from <module> import *" fail with AttributeError.  'htmlize' and
# 'ParsedAuthor' used to be listed but are neither defined nor imported here.
__all__ = ['ParseError', 'BibTeX', 'BibTeXEntry',
           'FileIter', 'Parser', 'parseFile',
           'splitEntriesBy', 'sortEntriesBy']

# List: must map from month number to month name.  Index 0 is deliberately
# None: sortEntriesByDate() looks months up with MONTHS.index(), so an entry
# with no 'month' field (None) gets month number 0.
MONTHS = [None, "January", "February", "March", "April", "May", "June",
          "July", "August", "September", "October", "November", "December"]


class ParseError(Exception):
    """Signals that the input being parsed is not valid BibTeX."""


class BibTeX:
    """A parsed BibTeX file.

       Fields:
         entries -- list of entries, in the order they were added.
         byKey -- map from lower-cased BibTeX key to entry.
    """
    def __init__(self):
        self.entries = []  # List of BibTeXEntry
        self.byKey = {}  # Map from lower-cased BibTeX key to BibTeX entry.

    def addEntry(self, ent):
        """Add a BibTeX entry to this file.

           Keys are compared case-insensitively.  If an entry with the same
           key is already present, a message is written to stderr and 'ent'
           is not added.
        """
        lowered = ent.key.lower()
        # Test membership, not the truthiness of the stored entry.
        if lowered in self.byKey:
            sys.stderr.write("Already have an entry named %s\n" % ent.key)
            return
        self.entries.append(ent)
        self.byKey[lowered] = ent

    def resolve(self):
        """Validate all entries in this file, and resolve cross-references.

           For every entry with a 'crossref' field, the fields of the
           referenced entry that the entry does not define itself are copied
           into it; this is repeated while the entry still has a 'crossref'
           (chains of references).  Each entry's own resolve() is then
           called.  Finally, entries whose type is in config.OMIT_ENTRIES or
           that lack the field named by config.REQUIRE_KEY ("title" when
           that is None) are dropped from 'entries' and 'byKey'.

           Raises ParseError if a chain of cross-references is circular.
        """
        seen = {}
        for ent in self.entries:
            seen.clear()
            # Seed with the entry itself, so that a chain leading back to
            # this entry (A->A, A->B->A) is reported as circular instead of
            # merging the entry into itself.
            seen[ent.key] = 1
            while ent.get('crossref'):
                try:
                    cr = self.byKey[ent['crossref'].lower()]
                except KeyError:
                    print("No such crossref: %s" % ent['crossref'])
                    break
                if seen.get(cr.key):
                    raise ParseError("Circular crossref at %s" % ent.key)
                seen[cr.key] = 1
                # Drop our own crossref; if the referenced entry has one, it
                # is copied in below and followed on the next iteration.
                del ent.entries['crossref']

                # The referenced entry started on an earlier line than the
                # entry that refers to it.
                if cr.entryLine < ent.entryLine:
                    print("Warning: crossref %s used after declaration"
                          % cr.key)

                for k in cr.entries.keys():
                    if k in ent.entries:
                        print("ERROR: %s defined both in %s and in %s" % (
                            k, ent.key, cr.key))
                    else:
                        ent.entries[k] = cr.entries[k]

            ent.resolve()

        rk = config.REQUIRE_KEY
        if rk is None:
            # hack: if no key is required, require "title", since every
            # entry will have a title.
            rk = "title"

        newEntries = []
        for ent in self.entries:
            if ent.type in config.OMIT_ENTRIES or not ent.has_key(rk):
                # Omitted entries still get check() called before removal.
                ent.check()
                del self.byKey[ent.key.lower()]
            else:
                newEntries.append(ent)
        self.entries = newEntries


def splitEntriesBy(entries, field):
    """Take a list of BibTeX entries and the name of a bibtex field; return
       a map from field value to list of entry.

       If 'field' is listed in config.MULTI_VAL_FIELDS, its value is split
       on commas and the entry is filed once under each (stripped) part.
       Entries that lack the field are filed under the key None.
    """
    result = {}
    for ent in entries:
        value = ent.get(field)
        if field in config.MULTI_VAL_FIELDS and value is not None:
            keys = [part.strip() for part in value.split(',')]
        else:
            # Single-valued field, or a multi-valued field that is missing:
            # there is nothing to split.
            keys = [value]
        for k in keys:
            result.setdefault(k, []).append(ent)
    return result

def splitSortedEntriesBy(entries, field):
    """Take inputs as in splitEntriesBy, where 'entries' is sorted by 'field'.
       Return a list of (field-value, entry-list) tuples, in the order
       given in 'entries'.

       Only consecutive entries with equal values are grouped together;
       entries without the field are grouped under None.
    """
    result = []
    # A private sentinel can never equal a real field value, so the first
    # entry always starts a new group.
    unset = object()
    curVal = unset
    curList = None
    for ent in entries:
        key = ent.get(field)
        if curVal is not unset and key == curVal:
            curList.append(ent)
        else:
            curVal = key
            curList = [ent]
            result.append((curVal, curList))
    return result

def sortEntriesBy(entries, field, default):
    """Return the entries of 'entries' ordered by their value for 'field'.

       An entry with no value for 'field', or whose value starts with the
       "<span class='bad'>" marker, is ordered as if its value were
       'default'.  When 'field' is in config.MULTI_VAL_FIELDS, the value is
       split on commas and the result holds one deep copy of the entry per
       part, with 'field' set to that (stripped) part.  Ties are broken by
       position in 'entries'.
    """
    decorated = []
    for position, ent in enumerate(entries):
        rank = position + 1
        value = ent.get(field, default)
        if value.startswith("<span class='bad'>"):
            value = default
        if field not in config.MULTI_VAL_FIELDS:
            decorated.append((txtize(value), rank, ent))
            continue
        for piece in value.split(','):
            piece = piece.strip()
            clone = copy.deepcopy(ent)
            clone[field] = piece
            decorated.append((txtize(piece), rank, clone))
    decorated.sort()
    return [item[2] for item in decorated]

def splitEntriesByAuthor(entries):
    """Group a list of entries by author.  Returns a pair:
         a list of (section-name, bibtex-entry-list) tuples, ordered by
           the authors' sorting names, each entry list in date order;
         a map from section-name to homepage URL, for those authors whose
           canonical record has one.
       Entries with multiple authors appear once per author.
    """
    author_table = buildAuthorTable(entries)
    by_sort_name = {}   # sorting name -> entries
    section_for = {}    # sorting name -> section name
    homepages = {}      # section name -> URL
    for ent in sortEntriesByDate(entries):
        for author in ent.parsedAuthor:
            person = author_table[author]
            homepage = person.getHomepage()
            sort_name = person.getSortingName()
            section = person.getSectionName()
            if homepage:
                homepages[section] = homepage

            section_for[sort_name] = section
            if sort_name not in by_sort_name:
                by_sort_name[sort_name] = []
            by_sort_name[sort_name].append(ent)
    sections = [(section_for[name], by_sort_name[name])
                for name in sorted(by_sort_name)]
    return sections, homepages

## def sortEntriesByAuthor(entries):
##     tmp = []
##     i = 0
##     for ent in entries:
##         i += 1
##         authors = [ txtize(" ".join(a.von+a.last+a.first+a.jr))
##                     for a in ent.parsedAuthor ]
##         tmp.append((tuple(authors), i, ent))
##     tmp.sort()
##     return [ t[2] for t in tmp ]

def sortEntriesByDate(entries):
    """Return the entries of 'entries' ordered by publication date.

       The sort value is year*13 + month number.  A month written as a
       range ("May--June") counts as its first month; a missing month
       counts as 0 and an unrecognized one is reported and counts as 0.
       Entries whose month or year is "forthcoming" sort last; entries with
       a missing or non-numeric year sort after all ordinary years.  Ties
       keep their order from 'entries'.
    """
    forthcoming = 20000*13
    undated = 10000*13
    keyed = []
    for position, ent in enumerate(entries):
        rank = position + 1
        month_field = ent.get('month')
        year_field = ent.get('year')
        if month_field == "forthcoming" or year_field == "forthcoming":
            keyed.append((forthcoming, rank, ent))
            continue

        try:
            name = ent.get("month")
            if name is not None:
                span = re.match(r"(\w+)--\w+", name)
                if span:
                    name = span.group(1)
            # MONTHS[0] is None, so a missing month maps to 0 here.
            month_no = MONTHS.index(name)
        except ValueError:
            print("Unknown month %r in %s"%(ent.get("month"), ent.key))
            month_no = 0

        try:
            stamp = int(ent['year'])*13 + month_no
        except KeyError:
            print("ERROR: No year field in %s"%ent.key)
            stamp = undated
        except ValueError:
            stamp = undated
        keyed.append((stamp, rank, ent))
    keyed.sort()
    return [item[2] for item in keyed]


class FileIter:
    """Line iterator that counts how many times next() has been called.

       The source of lines is chosen from the constructor arguments:
         fname -- name of a file to open for reading;
         file -- an iterable file-like object;
         it -- an iterator yielding lines;
         string -- a string holding the whole input (may be empty).
       'string' takes precedence over 'fname'/'file', which take precedence
       over 'it'.  At least one must be given.

       Fields:
         iter -- the underlying line iterator.
         lineno -- number of next() calls so far (including the final call
             that raised StopIteration).
    """
    def __init__(self, fname=None, file=None, it=None, string=None):
        opened = None
        if fname:
            opened = file = open(fname, 'r')
        # Compare against None so that an empty string is an (empty) input
        # rather than "no input given".
        if string is not None:
            file = cStringIO.StringIO(string)
        if file is not None:
            # File objects iterate over their lines directly; xreadlines()
            # is deprecated.
            it = iter(file)
        assert it is not None, "FileIter needs fname, file, it, or string"
        self.iter = it
        self.lineno = 0
        self._opened = opened  # File we opened ourselves, to close at EOF.
        self._next = lambda: next(it)

    def next(self):
        """Return the next line; raise StopIteration at end of input.

           A file that was opened from 'fname' is closed once the input is
           exhausted.
        """
        self.lineno += 1
        try:
            return self._next()
        except StopIteration:
            if self._opened is not None:
                self._opened.close()
                self._opened = None
            raise

class Parser:
    """Parser class: reads BibTeX from a file and returns a BibTeX object.

       Construct it with a FileIter (or any object with a next() method and
       a 'lineno' attribute) and call parse().
    """
    ## Fields
    # strings: maps entry string keys to their values.  Values are looked
    #     up by lower-cased name, so @string definitions are stored under
    #     their lower-cased name as well.
    # newStrings: all string definitions not in config.INITIAL_STRINGS
    #     (that is, those made by @string entries), keyed by name as written.
    # invStrings: map from string values to their keys.
    # fileiter: the line iterator we're parsing from.
    # result: the BibTeX object that we're parsing into
    # litStringLine: the line on which we started parsing a literal string;
    #     0 for none.
    # entryLine: the line on which the current entry started; 0 for none.
    #
    # curEntType: the type of the entry we're parsing now. (paper,article,etc)
    #     None until the first entry is seen.
    def __init__(self, fileiter, initial_strings, result=None):
        """Create a parser reading lines from 'fileiter'.

           initial_strings -- map of extra string definitions, applied on
               top of config.INITIAL_STRINGS.
           result -- BibTeX object to add entries to; a new one is made if
               this is None.
        """
        self.strings = config.INITIAL_STRINGS.copy()
        self.strings.update(initial_strings)
        self.newStrings = {}
        self.invStrings = {}
        for k,v in config.INITIAL_STRINGS.items():
            self.invStrings[v]=k
        self.fileiter = fileiter
        if result is None:
            result = BibTeX()
        self.result = result
        self.litStringLine = 0
        self.entryLine = 0
        self.curEntType = None

    def _parseKey(self, line):
        """Parse a key (entry key or field name) from the start of 'line',
           reading more lines if 'line' is blank or a comment.  Return a
           (key, rest-of-line) tuple.  Raises ParseError if no key is found.
        """
        it = self.fileiter
        line = _advance(it, line)
        m = KEY_RE.match(line)
        if not m:
            raise ParseError("Expected key at line %s"%self.fileiter.lineno)
        key, line = m.groups()
        return key, line

    def _parseValue(self, line):
        """Parse a field value starting at 'line', reading more lines as
           needed.  A value is one or more pieces joined with '#'; each
           piece is a "quoted" string, a {braced} string, or a bare token,
           which is replaced by its definition in self.strings if it has
           one.  Return a (value, rest-of-line) tuple; runs of whitespace in
           the value are collapsed to single spaces and its ends stripped.

           Raises ParseError on input that fits none of these forms, and
           lets StopIteration from the line iterator propagate at EOF.
        """
        it = self.fileiter
        bracelevel = 0
        data = []
        while 1:
            line = _advance(it,line)
            line = line.strip()
            assert line

            # Literal string?
            if line[0] == '"':
                line=line[1:]
                # Remember where the string began, for EOF error reports.
                self.litStringLine = it.lineno
                while 1:
                    if bracelevel:
                        # Inside braces a '"' does not end the string.
                        m = BRACE_CLOSE_RE.match(line)
                        if m:
                            data.append(m.group(1))
                            data.append('}')
                            line = m.group(2)
                            bracelevel -= 1
                            continue
                    else:
                        m = STRING_CLOSE_RE.match(line)
                        if m:
                            data.append(m.group(1))
                            line = m.group(2)
                            break
                    m = BRACE_OPEN_RE.match(line)
                    if m:
                        data.append(m.group(1))
                        line = m.group(2)
                        bracelevel += 1
                        continue
                    # The string continues on the next line.
                    data.append(line)
                    data.append(" ")
                    line = it.next()
                self.litStringLine = 0
            elif line[0] == '{':
                bracelevel += 1
                line = line[1:]
                while bracelevel:
                    m = BRACE_CLOSE_RE.match(line)
                    if m:
                        data.append(m.group(1))
                        bracelevel -= 1
                        # The outermost closing brace is a delimiter, not
                        # part of the value.
                        if bracelevel > 0:
                            data.append('}')
                        line = m.group(2)
                        continue
                    m = BRACE_OPEN_RE.match(line)
                    if m:
                        bracelevel += 1
                        data.append(m.group(1))
                        line = m.group(2)
                        continue
                    else:
                        # The braced value continues on the next line.
                        data.append(line)
                        data.append(" ")
                        line = it.next()
            elif line[0] == '#':
                print >>sys.stderr, "Weird concat on line %s"%it.lineno
            elif line[0] in "},":
                if not data:
                    print >>sys.stderr, "No data after field on line %s"%(
                        it.lineno)
            else:
                m = RAW_DATA_RE.match(line)
                if m:
                    # Bare token: expand it if it names a defined string.
                    s = self.strings.get(m.group(1).lower())
                    if s is not None:
                        data.append(s)
                    else:
                        data.append(m.group(1))
                    line = m.group(2)
                else:
                    raise ParseError("Questionable line at line %s"%it.lineno)

            # Got a string, check for concatenation.
            if line.isspace() or not line:
                data.append(" ")
            line = _advance(it,line)
            line = line.strip()
            assert line
            if line[0] == '#':
                line = line[1:]
            else:
                data = "".join(data)
                data = re.sub(r'\s+', ' ', data)
                data = re.sub(r'^\s+', '', data)
                data = re.sub(r'\s+$', '', data)
                return data, line

    def _parseEntry(self, line): #name, strings, entries
        """Parse the brace-delimited body of an entry of type
           self.curEntType, starting at 'line' (the text after '@type').
           @string definitions are recorded in the string tables, @preamble
           is parsed and discarded, and anything else is added to
           self.result as a BibTeXEntry.  Return the text left on the line
           after the closing brace.

           Raises ParseError on a missing opening brace, or on too many or
           too few items for the entry type.
        """
        it = self.fileiter
        self.entryLine = it.lineno
        line = _advance(it,line)

        m = BRACE_BEGIN_RE.match(line)
        if not m:
            raise ParseError("Expected an opening brace at line %s"%it.lineno)
        line = m.group(1)

        # 'proto' spells out what the body must contain, one character per
        # item: 'k' is a key, 'v' a value, 'p' a key = value pair.  A
        # trailing '*' means the item before it may repeat.
        proto = { 'string' : 'p',
                  'preamble' : 'v',
                  }.get(self.curEntType, 'kp*')

        v = []
        while 1:
            line = _advance(it,line)

            m = BRACE_END_RE.match(line)
            if m:
                line = m.group(1)
                break
            if not proto:
                raise ParseError("Overlong entry starting on line %s"
                                 % self.entryLine)
            elif proto[0] == 'k':
                key, line = self._parseKey(line)
                v.append(key)
            elif proto[0] == 'v':
                value, line = self._parseValue(line)
                v.append(value)
            elif proto[0] == 'p':
                key, line = self._parseKey(line)
                v.append(key)
                line = _advance(it,line)
                line = line.lstrip()
                if line[0] == '=':
                    line = line[1:]
                value, line = self._parseValue(line)
                v.append(value)
            else:
                assert 0
            line = line.strip()
            if line and line[0] == ',':
                line = line[1:]
            # Move on to the next item, unless this one is repeatable.
            if proto and proto[1:] != '*':
                proto = proto[1:]
        # Anything other than a repeatable item left over was required.
        if proto and proto[1:] != '*':
            raise ParseError("Missing arguments to %s on line %s" % (
                             self.curEntType, self.entryLine))

        if self.curEntType == 'string':
            # _parseValue looks names up lower-cased, so store the
            # definition lower-cased too; otherwise a name written with
            # capitals could never be expanded.
            self.strings[v[0].lower()] = v[1]
            self.newStrings[v[0]] = v[1]
            self.invStrings[v[1]] = v[0]
        elif self.curEntType == 'preamble':
            pass
        else:
            # v is [key, field1, value1, field2, value2, ...].
            key = v[0]
            d = {}
            for i in xrange(1,len(v),2):
                d[v[i].lower()] = v[i+1]
            ent = BibTeXEntry(self.curEntType, key, d)
            ent.entryLine = self.entryLine
            self.result.addEntry(ent)

        return line

    def parse(self):
        """Parse all of the input and return the BibTeX object, with the
           'invStrings' and 'newStrings' tables attached as attributes.

           Raises ParseError on malformed input, including input that ends
           in the middle of a string or an entry.
        """
        try:
            self._parse()
        except StopIteration:
            # End of input.  That is only an error if we were in the
            # middle of something.
            if self.litStringLine:
                raise ParseError("Unexpected EOF in string (started on %s)" %
                                 self.litStringLine)
            elif self.entryLine:
                raise ParseError("Unexpected EOF at line %s (entry started "
                                 "on %s)" % (self.fileiter.lineno,
                                             self.entryLine))

        self.result.invStrings = self.invStrings
        self.result.newStrings = self.newStrings

        return self.result

    def _parse(self):
        """Parse entries until the line iterator raises StopIteration.
           Raises ParseError if something other than blank lines, comments
           or an '@type' entry start is found between entries.
        """
        it = self.fileiter
        line = it.next()
        while 1:
            # Skip blank lines.
            while not line or line.isspace() or OUTER_COMMENT_RE.match(line):
                line = it.next()
            # Get the first line of an entry.
            m = ENTRY_BEGIN_RE.match(line)
            if m:
                self.curEntType = m.group(1).lower()
                line = m.group(2)
                line = self._parseEntry(line)
                self.entryLine = 0
            else:
                raise ParseError("Bad input at line %s (expected a new entry.)"
                                 % it.lineno)

def _advance(it,line):
    """Return 'line' if it has content; otherwise keep reading from 'it'
       until a line that is neither blank nor a '%' comment turns up, and
       return that.  StopIteration from 'it' propagates to the caller.
    """
    while True:
        if line and not line.isspace() and not COMMENT_RE.match(line):
            return line
        line = it.next()

# Matches a comment line outside of an entry: the first non-blank character
# is '#' or '%'.
OUTER_COMMENT_RE = re.compile(r'^\s*[\#\%]')
# Matches a comment line inside of an entry: the first non-blank character
# is '%'.
COMMENT_RE = re.compile(r'^\s*\%')
# Matches the start of an entry ('@type').  group 1 is the type of the
# entry.  group 2 is the rest of the line.
ENTRY_BEGIN_RE = re.compile(r'''^\s*\@([^\s\"\%\'\(\)\,\=\{\}]+)(.*)''')
# Matches the brace that opens an entry's body.  group 1 is the rest of
# the line.
BRACE_BEGIN_RE = re.compile(r'\s*\{(.*)')
# Matches the brace that closes an entry's body.  group 1 is the rest of
# the line.
BRACE_END_RE = re.compile(r'\s*\}(.*)')
# Matches an entry key or field name.  group 1 is the key; group 2 is the
# rest of the line.
KEY_RE = re.compile(r'''\s*([^\"\#\%\'\(\)\,\=\{\}\s]+)(.*)''')

# The patterns below are used while reading a field value.
# Brace-free, quote-free text up to a closing '"'.  group 1 is the text;
# group 2 is the rest of the line after the quote.
STRING_CLOSE_RE = re.compile(r'^([^\{\}\"]*)\"(.*)')
# Brace-free text up to a closing '}'.  group 1 is the text; group 2 is the
# rest of the line after the brace.
BRACE_CLOSE_RE = re.compile(r'^([^\{\}]*)\}(.*)')
# Brace-free text up to and including an opening '{'.  group 1 is the text
# with the brace; group 2 is the rest of the line.
BRACE_OPEN_RE = re.compile(r'^([^\{\}]*\{)(.*)')
# A bare token: a run of characters other than whitespace, '}' and ','.
# group 1 is the token; group 2 is the rest of the line.
RAW_DATA_RE = re.compile(r'^([^\s\},]+)(.*)')

def parseFile(filename, result=None):
    """Helper function: parse a single BibTeX file.

       Parses 'filename' into 'result' (a new BibTeX object if None),
       resolves the result, calls check() on every remaining entry, and
       returns it.  The file is closed before returning, including when
       parsing raises.
    """
    # Open the file here rather than handing the name to FileIter, so that
    # we hold the handle and can close it.
    f = open(filename, 'r')
    try:
        p = Parser(FileIter(file=f), {}, result)
        r = p.parse()
    finally:
        f.close()
    r.resolve()
    for e in r.entries:
        e.check()
    return r

def parseString(string, result=None):
    """Helper function: parse BibTeX held in a string.

       Parses 'string' into 'result' (a new BibTeX object if None),
       resolves the result, calls check() on every remaining entry, and
       returns it.
    """
    parser = Parser(FileIter(string=string), {}, result)
    bib = parser.parse()
    bib.resolve()
    for entry in bib.entries:
        entry.check()
    return bib

if __name__ == '__main__':
    # Script mode: parse the file named on the command line (or the
    # bundled test file) and print every entry as HTML, except for
    # "proceedings" and "journal" entries.
    fname = sys.argv[1] if len(sys.argv) > 1 else "testbib/pdos.bib"

    r = parseFile(fname)

    for e in r.entries:
        if e.type not in ("proceedings", "journal"):
            print(e.to_html())