#!/usr/bin/python2 # Copyright 2003-2008, Nick Mathewson. See LICENSE for licensing info. """BibTeX.py -- parse and manipulate BibTeX files and entries. Based on perl code by Eddie Kohler; heavily modified. """ import cStringIO import re import sys import os import copy import config from entry import BibTeXEntry, buildAuthorTable from utils import txtize, url_untranslate, smartJoin __all__ = ['ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize', 'ParsedAuthor', 'FileIter', 'Parser', 'parseFile', 'splitEntriesBy', 'sortEntriesBy'] # List: must map from month number to month name. MONTHS = [None, "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"] class ParseError(Exception): """Raised on invalid BibTeX""" pass class BibTeX: """A parsed BibTeX file""" def __init__(self): self.entries = [] # List of BibTeXEntry self.byKey = {} # Map from BibTeX key to BibTeX entry. def addEntry(self, ent): """Add a BibTeX entry to this file.""" k = ent.key if self.byKey.get(ent.key.lower()): print >> sys.stderr, "Already have an entry named %s" % k return self.entries.append(ent) self.byKey[ent.key.lower()] = ent def resolve(self): """Validate all entries in this file, and resolve cross-references""" seen = {} for ent in self.entries: seen.clear() while ent.get('crossref'): try: cr = self.byKey[ent['crossref'].lower()] except KeyError: print "No such crossref: %s" % ent['crossref'] break if seen.get(cr.key): raise ParseError("Circular crossref at %s" % ent.key) seen[cr.key] = 1 del ent.entries['crossref'] if cr.entryLine < ent.entryLine: print "Warning: crossref %s used after declaration" % cr.key for k in cr.entries.keys(): if ent.entries.has_key(k): print "ERROR: %s defined both in %s and in %s"%( k,ent.key,cr.key) else: ent.entries[k] = cr.entries[k] ent.resolve() newEntries = [] rk = config.REQUIRE_KEY if rk is None: # hack: if no key is required, require "title", since every # entry will have a title. rk = "title" for ent in self.entries: if ent.type in config.OMIT_ENTRIES or not ent.has_key(rk): ent.check() del self.byKey[ent.key.lower()] else: newEntries.append(ent) self.entries = newEntries def splitEntriesBy(entries, field): """Take a list of BibTeX entries and the name of a bibtex field; return a map from vield value to list of entry.""" result = {} for ent in entries: key = ent.get(field) if field in config.MULTI_VAL_FIELDS: key = [k.strip() for k in key.split(',')] else: key = [key] for k in key: try: result[k].append(ent) except: result[k] = [ent] return result def splitSortedEntriesBy(entries, field): """Take inputs as in splitEntriesBy, where 'entries' is sorted by 'field'. Return a list of (field-value, entry-list) tuples, in the order given in 'entries'.""" result = [] curVal = "alskjdsakldj" curList = [] for ent in entries: key = ent.get(field) if key == curVal: curList.append(ent) else: curVal = key curList = [ent] result.append((curVal, curList)) return result def sortEntriesBy(entries, field, default): """Take inputs as in splitEntriesBy, and return a list of entries sorted by the value of 'field'. Entries without 'field' are sorted as if their value were 'default'. """ tmp = [] i = 0 for ent in entries: i += 1 v = ent.get(field, default) if v.startswith(""): v = default if field in config.MULTI_VAL_FIELDS: for v_j in v.split(','): ent_j = copy.deepcopy(ent) ent_j.__setitem__(field, v_j.strip()) tmp.append((txtize(v_j.strip()), i, ent_j)) else: tmp.append((txtize(v), i, ent)) tmp.sort() return [ t[2] for t in tmp ] def splitEntriesByAuthor(entries): """Take a list of entries, sort them by author names, and return: a sorted list of (authorname-in-html, bibtex-entry-list) tuples, a map from authorname-in-html to name-for-url. Entries with multiple authors appear once per author. """ collapsedAuthors = buildAuthorTable(entries) entries = sortEntriesByDate(entries) result = {} # Name in sorting order -> entries htmlResult = {} # name in sorting order -> Full name url_map = {} # Full name -> Url for ent in entries: for a in ent.parsedAuthor: canonical = collapsedAuthors[a] url = canonical.getHomepage() sortkey = canonical.getSortingName() secname = canonical.getSectionName() if url: url_map[secname] = url htmlResult[sortkey] = secname result.setdefault(sortkey, []).append(ent) sortnames = result.keys() sortnames.sort() sections = [ (htmlResult[n], result[n]) for n in sortnames ] return sections, url_map ## def sortEntriesByAuthor(entries): ## tmp = [] ## i = 0 ## for ent in entries: ## i += 1 ## authors = [ txtize(" ".join(a.von+a.last+a.first+a.jr)) ## for a in ent.parsedAuthor ] ## tmp.append((tuple(authors), i, ent)) ## tmp.sort() ## return [ t[2] for t in tmp ] def sortEntriesByDate(entries): """Sort a list of entries by their publication date.""" tmp = [] i = 0 for ent in entries: i += 1 if (ent.get('month') == "forthcoming" or ent.get('year') == "forthcoming"): tmp.append((20000*13, i, ent)) continue try: monthname = ent.get("month") if monthname is not None: match = re.match(r"(\w+)--\w+", monthname) if match: monthname = match.group(1) mon = MONTHS.index(monthname) except ValueError: print "Unknown month %r in %s"%(ent.get("month"), ent.key) mon = 0 try: date = int(ent['year'])*13 + mon except KeyError: print "ERROR: No year field in %s"%ent.key date = 10000*13 except ValueError: date = 10000*13 tmp.append((date, i, ent)) tmp.sort() return [ t[2] for t in tmp ] class FileIter: def __init__(self, fname=None, file=None, it=None, string=None): if fname: file = open(fname, 'r') if string: file = cStringIO.StringIO(string) if file: it = iter(file.xreadlines()) self.iter = it assert self.iter self.lineno = 0 self._next = it.next def next(self): self.lineno += 1 return self._next() class Parser: """Parser class: reads BibTeX from a file and returns a BibTeX object.""" ## Fields # strings: maps entry string keys to their values. # newStrings: all string definitions not in config.INITIAL_STRINGS # invStrings: map from string values to their keys. # fileiter: the line iterator we're parsing from. # result: the BibTeX object that we're parsing into # litStringLine: the line on which we started parsing a literal string; # 0 for none. # entryLine: the line on which the current entry started; 0 for none. # # curEntType: the type of the entry we're parsing now. (paper,article,etc) def __init__(self, fileiter, initial_strings, result=None): self.strings = config.INITIAL_STRINGS.copy() self.strings.update(initial_strings) self.newStrings = {} self.invStrings = {} for k,v in config.INITIAL_STRINGS.items(): self.invStrings[v]=k self.fileiter = fileiter if result is None: result = BibTeX() self.result = result self.litStringLine = 0 self.entryLine = 0 def _parseKey(self, line): it = self.fileiter line = _advance(it, line) m = KEY_RE.match(line) if not m: raise ParseError("Expected key at line %s"%self.fileiter.lineno) key, line = m.groups() return key, line def _parseValue(self, line): it = self.fileiter bracelevel = 0 data = [] while 1: line = _advance(it,line) line = line.strip() assert line # Literal string? if line[0] == '"': line=line[1:] self.litStringLine = it.lineno while 1: if bracelevel: m = BRACE_CLOSE_RE.match(line) if m: data.append(m.group(1)) data.append('}') line = m.group(2) bracelevel -= 1 continue else: m = STRING_CLOSE_RE.match(line) if m: data.append(m.group(1)) line = m.group(2) break m = BRACE_OPEN_RE.match(line) if m: data.append(m.group(1)) line = m.group(2) bracelevel += 1 continue data.append(line) data.append(" ") line = it.next() self.litStringLine = 0 elif line[0] == '{': bracelevel += 1 line = line[1:] while bracelevel: m = BRACE_CLOSE_RE.match(line) if m: #print bracelevel, "A", repr(m.group(1)) data.append(m.group(1)) bracelevel -= 1 if bracelevel > 0: #print bracelevel, "- '}'" data.append('}') line = m.group(2) continue m = BRACE_OPEN_RE.match(line) if m: bracelevel += 1 #print bracelevel, "B", repr(m.group(1)) data.append(m.group(1)) line = m.group(2) continue else: #print bracelevel, "C", repr(line) data.append(line) data.append(" ") line = it.next() elif line[0] == '#': print >>sys.stderr, "Weird concat on line %s"%it.lineno elif line[0] in "},": if not data: print >>sys.stderr, "No data after field on line %s"%( it.lineno) else: m = RAW_DATA_RE.match(line) if m: s = self.strings.get(m.group(1).lower()) if s is not None: data.append(s) else: data.append(m.group(1)) line = m.group(2) else: raise ParseError("Questionable line at line %s"%it.lineno) # Got a string, check for concatenation. if line.isspace() or not line: data.append(" ") line = _advance(it,line) line = line.strip() assert line if line[0] == '#': line = line[1:] else: data = "".join(data) data = re.sub(r'\s+', ' ', data) data = re.sub(r'^\s+', '', data) data = re.sub(r'\s+$', '', data) return data, line def _parseEntry(self, line): #name, strings, entries it = self.fileiter self.entryLine = it.lineno line = _advance(it,line) m = BRACE_BEGIN_RE.match(line) if not m: raise ParseError("Expected an opening brace at line %s"%it.lineno) line = m.group(1) proto = { 'string' : 'p', 'preamble' : 'v', }.get(self.curEntType, 'kp*') v = [] while 1: line = _advance(it,line) m = BRACE_END_RE.match(line) if m: line = m.group(1) break if not proto: raise ParseError("Overlong entry starting on line %s" % self.entryLine) elif proto[0] == 'k': key, line = self._parseKey(line) v.append(key) elif proto[0] == 'v': value, line = self._parseValue(line) v.append(value) elif proto[0] == 'p': key, line = self._parseKey(line) v.append(key) line = _advance(it,line) line = line.lstrip() if line[0] == '=': line = line[1:] value, line = self._parseValue(line) v.append(value) else: assert 0 line = line.strip() if line and line[0] == ',': line = line[1:] if proto and proto[1:] != '*': proto = proto[1:] if proto and proto[1:] != '*': raise ParseError("Missing arguments to %s on line %s" % ( self.curEntType, self.entryLine)) if self.curEntType == 'string': self.strings[v[0]] = v[1] self.newStrings[v[0]] = v[1] self.invStrings[v[1]] = v[0] elif self.curEntType == 'preamble': pass else: key = v[0] d = {} for i in xrange(1,len(v),2): d[v[i].lower()] = v[i+1] ent = BibTeXEntry(self.curEntType, key, d) ent.entryLine = self.entryLine self.result.addEntry(ent) return line def parse(self): try: self._parse() except StopIteration: if self.litStringLine: raise ParseError("Unexpected EOF in string (started on %s)" % self.litStringLine) elif self.entryLine: raise ParseError("Unexpected EOF at line %s (entry started " "on %s)" % (self.fileiter.lineno, self.entryLine)) self.result.invStrings = self.invStrings self.result.newStrings = self.newStrings return self.result def _parse(self): it = self.fileiter line = it.next() while 1: # Skip blank lines. while not line or line.isspace() or OUTER_COMMENT_RE.match(line): line = it.next() # Get the first line of an entry. m = ENTRY_BEGIN_RE.match(line) if m: self.curEntType = m.group(1).lower() line = m.group(2) line = self._parseEntry(line) self.entryLine = 0 else: raise ParseError("Bad input at line %s (expected a new entry.)" % it.lineno) def _advance(it,line): while not line or line.isspace() or COMMENT_RE.match(line): line = it.next() return line # Matches a comment line outside of an entry. OUTER_COMMENT_RE = re.compile(r'^\s*[\#\%]') # Matches a comment line inside of an entry. COMMENT_RE = re.compile(r'^\s*\%') # Matches the start of an entry. group 1 is the type of the entry. # group 2 is the rest of the line. ENTRY_BEGIN_RE = re.compile(r'''^\s*\@([^\s\"\%\'\(\)\,\=\{\}]+)(.*)''') # Start of an entry. group 1 is the keyword naming the entry. BRACE_BEGIN_RE = re.compile(r'\s*\{(.*)') BRACE_END_RE = re.compile(r'\s*\}(.*)') KEY_RE = re.compile(r'''\s*([^\"\#\%\'\(\)\,\=\{\}\s]+)(.*)''') STRING_CLOSE_RE = re.compile(r'^([^\{\}\"]*)\"(.*)') BRACE_CLOSE_RE = re.compile(r'^([^\{\}]*)\}(.*)') BRACE_OPEN_RE = re.compile(r'^([^\{\}]*\{)(.*)') RAW_DATA_RE = re.compile(r'^([^\s\},]+)(.*)') def parseFile(filename, result=None): """Helper function: parse a single BibTeX file""" f = FileIter(fname=filename) p = Parser(f, {}, result) r = p.parse() r.resolve() for e in r.entries: e.check() return r def parseString(string, result=None): """Helper function: parse BibTeX from a string""" f = FileIter(string=string) p = Parser(f, {}, result) r = p.parse() r.resolve() for e in r.entries: e.check() return r if __name__ == '__main__': if len(sys.argv)>1: fname=sys.argv[1] else: fname="testbib/pdos.bib" r = parseFile(fname) for e in r.entries: if e.type in ("proceedings", "journal"): continue print e.to_html()