#!/usr/bin/python2
# Copyright 2003-2008, Nick Mathewson.  See LICENSE for licensing info.

"""BibTeX.py -- parse and manipulate BibTeX files and entries.

   Based on perl code by Eddie Kohler; heavily modified.
"""

import cStringIO
import re
import sys

import config
from entry import BibTeXEntry

__all__ = ['ParseError', 'BibTeX', 'FileIter', 'Parser', 'parseFile',
           'parseString']


def _warn(msg):
    """Write a one-line diagnostic to standard error."""
    sys.stderr.write(msg + "\n")


class ParseError(Exception):
    """Raised on invalid BibTeX"""
    pass


class BibTeX:
    """A parsed BibTeX file.

    Entries are stored in self.entries, keyed by their lower-cased key, so
    membership tests and item lookup are case-insensitive.
    """

    def __init__(self):
        self.entries = {}

    def addEntry(self, ent):
        """Add a BibTeX entry to this file.

        If an entry with the same (case-insensitive) key is already present,
        the new one is dropped and a message is written to stderr.
        """
        k = ent.key
        if k.lower() in self.entries:
            _warn("Already have an entry named %s" % k)
            return
        self.entries[k.lower()] = ent

    def __contains__(self, key):
        return key.lower() in self.entries

    def __getitem__(self, key):
        return self.entries[key.lower()]

    def __iter__(self):
        return iter(self.entries.values())

    def resolve(self):
        """Validate all entries in this file, and resolve cross-references.

        For every entry, each 'crossref' field is followed: the field is
        removed and any field of the referenced entry that the referring
        entry lacks is copied in.  Afterwards every entry whose type is in
        config.OMIT_ENTRIES, or which lacks the required key, is removed.

        Raises ParseError if a chain of cross-references is circular.
        """
        seen = {}
        for ent in self:
            seen.clear()
            while ent.get('crossref'):
                try:
                    cr = self.entries[ent['crossref'].lower()]
                except KeyError:
                    print("No such crossref: %s" % ent['crossref'])
                    break
                if seen.get(cr.key):
                    raise ParseError("Circular crossref at %s" % ent.key)
                seen[cr.key] = 1
                # Drop the field now; if cr has a crossref of its own, it is
                # copied in below and the loop follows it next.
                del ent.entries['crossref']

                if cr.entryLine < ent.entryLine:
                    print("Warning: crossref %s used after declaration"
                          % cr.key)

                for k in cr.entries.keys():
                    if ent.entries.has_key(k):
                        print("ERROR: %s defined both in %s and in %s" % (
                            k, ent.key, cr.key))
                    else:
                        ent.entries[k] = cr.entries[k]

            ent.resolve()

        rk = config.REQUIRE_KEY
        if rk is None:
            # hack: if no key is required, require "title", since every
            # entry will have a title.
            rk = "title"

        # Iterate over a snapshot: entries are deleted from self.entries
        # inside the loop.
        for ent in list(self.entries.values()):
            if ent.type in config.OMIT_ENTRIES or not ent.has_key(rk):
                ent.check()
                del self.entries[ent.key.lower()]


class FileIter:
    """Line iterator that keeps track of the current line number.

    Exactly one source is normally given:
      fname:  name of a file to open and read.
      file:   an already-open file, or any iterable of lines.
      it:     an iterator yielding lines.
      string: a string holding the whole input.
    If several are given, string takes precedence over fname, fname over
    file, and any of those over it.

    Raises ValueError if no source is given.
    """

    def __init__(self, fname=None, file=None, it=None, string=None):
        # Remember a file we opened ourselves so that close() can release it.
        self._opened = None
        if fname:
            file = self._opened = open(fname, 'r')
        if string is not None:
            # "is not None" so that an empty string is a valid (empty) input.
            file = cStringIO.StringIO(string)
        if file is not None:
            it = iter(file)
        if it is None:
            raise ValueError("FileIter needs one of fname, file, it or string")
        self.iter = it
        self.lineno = 0

    def close(self):
        """Close the file opened for fname, if any.  Safe to call twice."""
        if self._opened is not None:
            self._opened.close()
            self._opened = None

    def next(self):
        """Return the next line; raises StopIteration at end of input."""
        self.lineno += 1
        return self.iter.next()

    def advance(self, line):
        """Return line if it has content, else the next line that does.

        Empty lines, whitespace-only lines and lines matching COMMENT_RE are
        skipped.  Raises StopIteration if the input runs out first.
        """
        while not line or line.isspace() or COMMENT_RE.match(line):
            line = self.next()
        return line


# Matches a comment line outside of an entry.
OUTER_COMMENT_RE = re.compile(r'^\s*[\#\%]')
# Matches a comment line inside of an entry.
COMMENT_RE = re.compile(r'^\s*\%')
# Matches the start of an entry.  group 1 is the type of the entry.
# group 2 is the rest of the line.
ENTRY_BEGIN_RE = re.compile(r'''^\s*\@([^\s\"\%\'\(\)\,\=\{\}]+)(.*)''')
# Opening brace of an entry body.  group 1 is the rest of the line.
BRACE_BEGIN_RE = re.compile(r'\s*\{(.*)')
# Closing brace of an entry body.  group 1 is the rest of the line.
BRACE_END_RE = re.compile(r'\s*\}(.*)')
# An entry key or field name.  group 1 is the name; group 2 is the rest.
KEY_RE = re.compile(r'''\s*([^\"\#\%\'\(\)\,\=\{\}\s]+)(.*)''')

# Text up to the closing quote of a string.  group 1 is the text before the
# quote; group 2 is the rest of the line.
STRING_CLOSE_RE = re.compile(r'^([^\{\}\"]*)\"(.*)')
# Text up to the next closing brace.  group 1 is the text before the brace;
# group 2 is the rest of the line.
BRACE_CLOSE_RE = re.compile(r'^([^\{\}]*)\}(.*)')
# Text up to and including the next opening brace (group 1); group 2 is the
# rest of the line.
BRACE_OPEN_RE = re.compile(r'^([^\{\}]*\{)(.*)')
# An unquoted value token (group 1); group 2 is the rest of the line.
RAW_DATA_RE = re.compile(r'^([^\s\},]+)(.*)')


class Parser:
    """Parser class: reads BibTeX from a file and returns a BibTeX object."""
    ## Fields
    # strings: maps entry string keys to their values.
    # newStrings: all string definitions not in config.INITIAL_STRINGS
    # invStrings: map from string values to their keys.
    # fileiter: the line iterator we're parsing from.
    # result: the BibTeX object that we're parsing into
    # litStringLine: the line on which we started parsing a literal string;
    #     0 for none.
    # entryLine: the line on which the current entry started; 0 for none.
    #
    # curEntType: the type of the entry we're parsing now. (paper,article,etc)
    #     None until the first entry is seen.

    def __init__(self, fileiter, initial_strings, result=None):
        """Set up a parser reading lines from fileiter.

        initial_strings is merged over config.INITIAL_STRINGS to form the
        macro table.  Entries are added to result; a fresh BibTeX object is
        created when result is None.
        """
        self.strings = config.INITIAL_STRINGS.copy()
        self.strings.update(initial_strings)
        self.newStrings = {}
        self.invStrings = {}
        for k, v in config.INITIAL_STRINGS.items():
            self.invStrings[v] = k
        self.fileiter = fileiter
        if result is None:
            result = BibTeX()
        self.result = result
        self.litStringLine = 0
        self.entryLine = 0
        self.curEntType = None

    def advance(self, line):
        """Skip blank and comment lines; see FileIter.advance."""
        return self.fileiter.advance(line)

    @property
    def lineno(self):
        return self.fileiter.lineno

    def _parseKey(self, line):
        """Parse an entry key or field name from the front of line.

        Returns (key, rest_of_line).  Raises ParseError if no key is found.
        """
        line = self.advance(line)
        m = KEY_RE.match(line)
        if not m:
            raise ParseError("Expected key at line %s" % self.fileiter.lineno)
        key, line = m.groups()
        return key, line

    def _parseValue(self, line):
        """Parse a field value, following '#' concatenations.

        A value is a sequence of quoted strings, braced strings and raw
        tokens joined by '#'.  Raw tokens that name a known string macro are
        replaced by the macro's value.  Returns (value, rest_of_line), with
        runs of whitespace in value collapsed to single spaces and leading
        and trailing whitespace removed.

        Raises ParseError on a line that cannot start a value.  End of input
        propagates as StopIteration, which parse() turns into a ParseError.
        """
        bracelevel = 0
        data = []
        while True:
            line = self.advance(line).strip()
            assert line

            # Literal string?
            if line[0] == '"':
                line = line[1:]
                # Recorded so that parse() can report where an unterminated
                # string began.
                self.litStringLine = self.fileiter.lineno
                while True:
                    if bracelevel:
                        m = BRACE_CLOSE_RE.match(line)
                        if m:
                            data.append(m.group(1))
                            data.append('}')
                            line = m.group(2)
                            bracelevel -= 1
                            continue
                    else:
                        # A quote only ends the string outside of braces.
                        m = STRING_CLOSE_RE.match(line)
                        if m:
                            data.append(m.group(1))
                            line = m.group(2)
                            break
                    m = BRACE_OPEN_RE.match(line)
                    if m:
                        data.append(m.group(1))
                        line = m.group(2)
                        bracelevel += 1
                        continue
                    # The string continues on the next line.
                    data.append(line)
                    data.append(" ")
                    line = self.fileiter.next()
                self.litStringLine = 0
            elif line[0] == '{':
                bracelevel += 1
                line = line[1:]
                while bracelevel:
                    m = BRACE_CLOSE_RE.match(line)
                    if m:
                        data.append(m.group(1))
                        bracelevel -= 1
                        # Inner braces are kept; the outermost pair is not.
                        if bracelevel > 0:
                            data.append('}')
                        line = m.group(2)
                        continue
                    m = BRACE_OPEN_RE.match(line)
                    if m:
                        bracelevel += 1
                        data.append(m.group(1))
                        line = m.group(2)
                        continue
                    else:
                        # The braced value continues on the next line.
                        data.append(line)
                        data.append(" ")
                        line = self.fileiter.next()
            elif line[0] == '#':
                # The '#' itself is consumed by the concatenation check below.
                _warn("Weird concat on line %s" % self.lineno)
            elif line[0] in "},":
                if not data:
                    _warn("No data after field on line %s" % (
                        self.lineno))
            else:
                m = RAW_DATA_RE.match(line)
                if m:
                    s = self.strings.get(m.group(1).lower())
                    if s is not None:
                        data.append(s)
                    else:
                        data.append(m.group(1))
                    line = m.group(2)
                else:
                    raise ParseError("Questionable line at line %s"
                                     % self.lineno)

            # Got a string, check for concatenation.
            if line.isspace() or not line:
                data.append(" ")
            line = self.advance(line).strip()
            assert line
            if line[0] == '#':
                line = line[1:]
            else:
                data = "".join(data)
                # Collapse whitespace runs, then trim both ends.
                data = re.sub(r'\s+', ' ', data).strip()
                return data, line

    def _parseEntry(self, line):
        """Parse the body of one entry; line is the text after '@type'.

        The entry type is read from self.curEntType.  '@string' entries
        define a macro, '@preamble' entries are parsed and discarded, and
        anything else becomes a BibTeXEntry added to self.result.

        Returns whatever follows the entry's closing brace on its last line.
        Raises ParseError on malformed input.
        """
        # name, strings, entries
        self.entryLine = self.lineno
        line = self.advance(line)

        m = BRACE_BEGIN_RE.match(line)
        if not m:
            raise ParseError("Expected an opening brace at line %s"
                             % self.lineno)
        line = m.group(1)

        # proto describes what the entry body must contain, one character per
        # item: 'k' a key, 'v' a bare value, 'p' a "name = value" pair.  A
        # trailing '*' means the preceding item may repeat.
        proto = {'string': 'p',
                 'preamble': 'v'}.get(self.curEntType, 'kp*')

        v = []
        while True:
            line = self.advance(line)

            m = BRACE_END_RE.match(line)
            if m:
                line = m.group(1)
                break
            if not proto:
                raise ParseError("Overlong entry starting on line %s"
                                 % self.entryLine)
            elif proto[0] == 'k':
                key, line = self._parseKey(line)
                v.append(key)
            elif proto[0] == 'v':
                value, line = self._parseValue(line)
                v.append(value)
            elif proto[0] == 'p':
                key, line = self._parseKey(line)
                line = self.advance(line).strip()
                if line[0] != '=':
                    # Without this check the name would be stored with no
                    # value, misaligning every later name/value pair.
                    raise ParseError("Expected '=' after %s on line %s" % (
                        key, self.lineno))
                v.append(key)
                value, line = self._parseValue(line[1:])
                v.append(value)
            else:
                assert 0
            line = line.strip()
            if line and line[0] == ',':
                line = line[1:]
            # Move on to the next item unless the current one may repeat.
            if proto and proto[1:] != '*':
                proto = proto[1:]
        if proto and proto[1:] != '*':
            raise ParseError("Missing arguments to %s on line %s" % (
                self.curEntType, self.entryLine))

        if self.curEntType == 'string':
            name, value = v[0], v[1]
            self.strings[name] = value
            # _parseValue looks macros up by lower-cased name, so register
            # that spelling too; otherwise a macro defined with capital
            # letters would never be expanded.
            self.strings[name.lower()] = value
            self.newStrings[name] = value
            self.invStrings[value] = name
        elif self.curEntType == 'preamble':
            pass
        else:
            key = v[0]
            d = {}
            # v is [key, name1, value1, name2, value2, ...].
            for i in xrange(1, len(v), 2):
                d[v[i].lower()] = v[i+1]
            ent = BibTeXEntry(self.curEntType, key, d)
            ent.entryLine = self.entryLine
            self.result.addEntry(ent)

        return line

    def parse(self):
        """Parse the whole input and return the resulting BibTeX object.

        The returned object also gets invStrings and newStrings attributes
        copied from this parser.  Raises ParseError on invalid input,
        including input that ends inside a string or an entry.
        """
        try:
            self._parse()
        except StopIteration:
            # End of input is only an error in the middle of something.
            if self.litStringLine:
                raise ParseError("Unexpected EOF in string (started on %s)" %
                                 self.litStringLine)
            elif self.entryLine:
                raise ParseError("Unexpected EOF at line %s (entry started "
                                 "on %s)" % (self.lineno, self.entryLine))

        self.result.invStrings = self.invStrings
        self.result.newStrings = self.newStrings

        return self.result

    def _parse(self):
        """Parse entries until the input is exhausted (StopIteration)."""
        line = self.fileiter.next()
        while True:
            # Skip blank lines.
            while (not line or line.isspace()
                   or OUTER_COMMENT_RE.match(line)):
                line = self.fileiter.next()
            # Get the first line of an entry.
            m = ENTRY_BEGIN_RE.match(line)
            if m:
                self.curEntType = m.group(1).lower()
                line = m.group(2)
                line = self._parseEntry(line)
                self.entryLine = 0
            else:
                raise ParseError("Bad input at line %s (expected a new entry.)"
                                 % self.lineno)


def _parseAndCheck(fileiter, result):
    """Parse fileiter into result, resolve cross-references, check entries."""
    try:
        r = Parser(fileiter, {}, result).parse()
    finally:
        # Release the file if fileiter opened one itself.
        fileiter.close()
    r.resolve()
    for e in r:
        e.check()
    return r


def parseFile(filename, result=None):
    """Helper function: parse a single BibTeX file"""
    return _parseAndCheck(FileIter(fname=filename), result)


def parseString(string, result=None):
    """Helper function: parse BibTeX from a string"""
    return _parseAndCheck(FileIter(string=string), result)


if __name__ == '__main__':
    if len(sys.argv) > 1:
        fname = sys.argv[1]
    else:
        fname = "testbib/pdos.bib"

    r = parseFile(fname)

    for e in r:
        if e.type in ("proceedings", "journal"):
            continue
        print(e.to_html())