| field | value | date |
|---|---|---|
| author | Nick Mathewson <nickm@torproject.org> | 2003-05-17 06:10:20 +0000 |
| committer | Nick Mathewson <nickm@torproject.org> | 2003-05-17 06:10:20 +0000 |
| commit | 1d07a97300d48872726edd989f53bf489dc00a41 (patch) | |
| tree | baa19dd713b183d6694364251b6e17afe9195b0f /BibTeX.py | |
| parent | 11fddbc0273c37d651399ec782c57065f8030a76 (diff) | |
| download | anonbib-1d07a97300d48872726edd989f53bf489dc00a41.tar.gz | |
Initial revision
svn:r2
Diffstat (limited to 'BibTeX.py')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | BibTeX.py | 637 |

1 file changed, 637 insertions, 0 deletions
diff --git a/BibTeX.py b/BibTeX.py
new file mode 100644
index 0000000..07e3b66
--- /dev/null
+++ b/BibTeX.py
@@ -0,0 +1,637 @@

#!/usr/bin/python

import cStringIO
import re
import sys

# Standard month-name abbreviations, used as predefined @string values.
INITIAL_STRINGS = {
    'jan' : 'January',   'feb' : 'February',
    'mar' : 'March',     'apr' : 'April',
    'may' : 'May',       'jun' : 'June',
    'jul' : 'July',      'aug' : 'August',
    'sep' : 'September', 'oct' : 'October',
    'nov' : 'November',  'dec' : 'December'
    }

class ParseError(Exception):
    pass

# A collection of BibTeX entries, indexed by cite key.
class BibTeX:
    def __init__(self):
        self.entries = []
        self.byKey = {}
    def addEntry(self, ent):
        k = ent.key
        if self.byKey.get(ent.key):
            print >> sys.stderr, "Already have an entry named %s"%k
            return
        self.entries.append(ent)
        self.byKey[ent.key] = ent
    def resolve(self):
        seen = {}
        for ent in self.entries:
            seen.clear()
            while ent.get('crossref'):
                try:
                    cr = self.byKey[ent['crossref'].lower()]
                except KeyError:
                    print "No such crossref: %s" % ent['crossref']
                    print ent
                    break
                if seen.get(cr.key):
                    raise ParseError("Circular crossref at %s" % ent.key)
                seen[cr.key] = 1
                del ent.entries['crossref']
                ent.entries.update(cr.entries)
            ent.resolve()

# Order in which fields are written out by BibTeXEntry.format().
DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle',
'school', 'institution', 'organization', 'volume', 'number', 'year',
'month', 'address', 'chapter', 'edition', 'pages', 'editor',
'howpublished', 'key', 'publisher', 'type', 'note' ]

class BibTeXEntry:
    def __init__(self, type, key, entries):
        self.type = type
        self.key = key
        self.entries = entries
        self._get = self.entries.__getitem__
    def get(self, k, v=None):
        return self.entries.get(k,v)
    def __getitem__(self, k):
        return self._get(k)
    def __str__(self):
        return self.format(70,1)
    def format(self, width=70,v=0):
        d = ["@%s{%s,\n" % (self.type, self.key)]
        if v:
            df = DISPLAYED_FIELDS[:]
            for k in self.entries.keys():
                if k not in df:
                    df.append(k)
        else:
            df = DISPLAYED_FIELDS
        for f in df:
            if not self.entries.has_key(f):
                continue
            v = self.entries[f]
            d.append(" ")
            s = "%s = {%s}\n" % (f, v)
            d.append(_split(s,width))
        d.append("}\n")
        return "".join(d)
    def resolve(self):
        a = self.get('author')
        if a:
            self.parsedAuthor = parseAuthor(a)
            #print a
            #print " => ",repr(self.parsedAuthor)
        else:
            self.parsedAuthor = None
    def check(self):
        ok = 1
        if self.type == 'inproceedings':
            fields = 'booktitle', 'month', 'year'
        elif self.type == 'article':
            fields = 'journal', 'month', 'year'
        elif self.type == 'techreport':
            fields = 'institution', 'number'
        elif self.type == 'misc':
            fields = 'howpublished',
        else:
            fields = ()
        fields += 'title', 'author'

        for field in fields:
            if not self.get(field):
                print "ERROR: %s has no %s field" % (self.key, field)
                self.entries[field] = "<b>???</b>"
                ok = 0

        return ok

    def biblio_to_html(self):
        if self.type == 'inproceedings':
            booktitle = self['booktitle']
            bookurl = self.get('bookurl')
            if bookurl:
                m = PROCEEDINGS_RE.match(booktitle)
                if m:
                    res = ["In the ", m.group(1),
                           '<a href="%s">'%bookurl, m.group(2), "</a>"]
                else:
                    res = ['In the <a href="%s">%s</a>' % (bookurl,booktitle)]
            else:
                res = ["In the ", booktitle ]

            if self.get("edition"):
                res.append(",")
                res.append(self['edition'])
            if self.get("address"):
                res.append(",")
                res.append(self['address'])
            res.append(", %s %s" % (self['month'], self['year']))
            if not self.get('pages'):
                pass
            elif "-" in self['pages']:
                res.append(", pages %s"%self['pages'])
            else:
                res.append(", page %s"%self['pages'])
        elif self.type == 'article':
            res = ["In "]
            if self.get('journalurl'):
                res.append('<a href="%s">%s</a>'%
                           (self['journalurl'],self['journal']))
            else:
                res.append(self['journal'])
            if self.get('volume'):
                res.append(" <b>%s</b>"%self['volume'])
            if self.get('number'):
                res.append("(%s)"%self['number'])
            res.append(", %s %s" % (self['month'], self['year']))
            if not self.get('pages'):
                pass
            elif "-" in self['pages']:
                res.append(", pages %s"%self['pages'])
            else:
                res.append(", page %s"%self['pages'])
        elif self.type == 'techreport':
            res = [ "%s %s %s" % (self['institution'],
                                  self.get('type', 'technical report'),
                                  self['number']) ]
            if self.get('month') or self.get('year'):
                res.append(", %s %s" % (self.get('month', ''),
                                        self.get('year', '')))
        elif self.type == 'mastersthesis' or self.type == 'phdthesis':
            if self.get('type'):
                res = [self['type']]
            elif self.type == 'mastersthesis':
                res = ["Master's thesis"]
            else:
                res = ["Ph.D. thesis"]
            if self.get('school'):
                res.append(", %s"%(self['school']))
            if self.get('month') or self.get('year'):
                res.append(", %s %s" % (self.get('month', ''),
                                        self.get('year', '')))
        elif self.type == 'misc':
            res = [self['howpublished']]
            if self.get('month') or self.get('year'):
                res.append(", %s %s" % (self.get('month', ''),
                                        self.get('year', '')))
            if not self.get('pages'):
                pass
            elif "-" in self['pages']:
                res.append(", pages %s"%self['pages'])
            else:
                res.append(", page %s"%self['pages'])
        else:
            res = ["<Odd type %s>"%self.type]

        res[0:0] = ["<span class='biblio'>"]
        res.append("</span> ")

        res.append("<span class='availability'>"
                   "(<a href='__'>BibTeX entry</a>)</span>")
        return htmlize("".join(res))

    def to_html(self):
        res = ["<li><p class='entry'><span class='title'>%s</span>"%(
            htmlize(self['title']))]
        availability = []
        for key, name in (('www_abstract_url', 'abstract'),
                          ('www_html_url', 'HTML'),
                          ('www_pdf_url', 'PDF'),
                          ('www_ps_url', 'PS'),
                          ('www_ps_gz_url', 'gzipped PS')):
            url = self.get(key)
            if not url: continue
            availability.append('<a href="%s">%s</a>' %(url,name))
        if availability:
            res.append(" <span class='availability'>(")
            res.append(", ".join(availability))
            res.append(")</span>")
        res.append("<br>")

        #res.append("\n<!-- %r -->\n" % self.parsedAuthor)
        htmlAuthors = []
        for author in self.parsedAuthor:
            f,v,l,j = author.first,author.von,author.last,author.jr
            a = " ".join(f+v+l)
            if j:
                a = "%s, %s" %(a,j)
            htmlAuthors.append(htmlize(a))
        if len(htmlAuthors) == 1:
            res.append(htmlAuthors[0])
        elif len(htmlAuthors) == 2:
            res.append(" and ".join(htmlAuthors))
        else:
            res.append(", ".join(htmlAuthors[:-1]))
            res.append(", and ")
            res.append(htmlAuthors[-1])

        if res[-1][-1] != '.':
            res.append(".")
        res.append("</span><br>\n")
        res.append(self.biblio_to_html())
        res.append("</p></li>\n\n")
        return "".join(res)

RE_LONE_AMP = re.compile(r'&([^a-z0-9])')
RE_LONE_I = re.compile(r'\\i([^a-z0-9])')
RE_ACCENT = re.compile(r'\\([\'`~^"])(.)')
ACCENT_MAP = { "'": 'acute', "`" : 'grave', "~": 'tilde',
               "^": 'circ', '"' : 'uml' }
RE_TEX_CMD = re.compile(r"(?:\\[a-zA-Z@]+|\\.)")
RE_PAGE_SPAN = re.compile(r"(\d)--(\d)")
# Convert a TeX-flavored string to rough HTML: escape lone ampersands,
# turn accent commands into entities, and strip TeX commands and braces.
def htmlize(s):
    s = RE_LONE_AMP.sub(lambda m: "&amp;%s" % m.group(1), s)
    s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
    s = RE_ACCENT.sub(lambda m: "&%s%s;" %(m.group(2),
                                           ACCENT_MAP[(m.group(1))]),
                      s)
    s = RE_TEX_CMD.sub("", s)
    s = s.translate(ALLCHARS, "{}")
    s = RE_PAGE_SPAN.sub(lambda m: "%s-%s"%(m.groups()), s)
    return s

PROCEEDINGS_RE = re.compile(
    r'((?:proceedings|workshop record) of(?: the)? )(.*)',
    re.I)


class ParsedAuthor:
    def __init__(self, first, von, last, jr):
        self.first = first
        self.von = von
        self.last = last
        self.jr = jr
    def __repr__(self):
        return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
                                            self.last,self.jr)
    def __str__(self):
        return " ".join(self.first+self.von+self.last+self.jr)

def _split(s,w=79):
    r = []
    s = s.replace("\n", " ")
    while len(s) > w:
        for i in xrange(w-1, 0, -1):
            if s[i] == ' ':
                r.append(s[:i])
                s = s[i+1:]
                break
        else:
            r.append(s[:w])
            s = s[w:]
    r.append(s)
    r.append("")
    return "\n".join(r)

# Line iterator over a file or string that keeps track of the line number.
class FileIter:
    def __init__(self, fname=None, file=None, it=None, string=None):
        if fname:
            file = open(fname, 'r')
        if string:
            file = cStringIO.StringIO(string)
        if file:
            it = iter(file.xreadlines())
        self.iter = it
        assert self.iter
        self.lineno = 0
        self._next = it.next
    def next(self):
        self.lineno += 1
        return self._next()


# Parse a BibTeX 'author' field into a list of ParsedAuthor objects.
def parseAuthor(s):
    items = []

    #print "A", `s`
    s = s.strip()
    while s:
        s = s.strip()
        bracelevel = 0
        for i in xrange(len(s)):
            if s[i] == '{':
                bracelevel += 1
            elif s[i] == '}':
                bracelevel -= 1
            elif bracelevel <= 0 and s[i] in " \t\n,":
                break
        if i+1 == len(s):
            items.append(s)
        else:
            items.append(s[0:i])
        if (s[i] == ','):
            items.append(',')
        s = s[i+1:]

    #print "B", items

    authors = [[]]
    for item in items:
        if item == 'and':
            authors.append([])
        else:
            authors[-1].append(item)

    #print "C", authors

    parsedAuthors = []
    # Split into first, von, last, jr
    for author in authors:
        #print author

        commas = 0
        fvl = []
        vl = []
        f = []
        v = []
        l = []
        j = []
        cur = fvl
        for item in author:
            if item == ',':
                if commas == 0:
                    vl = fvl
                    fvl = []
                    cur = f
                else:
                    j.extend(f)
                    f = []
                commas += 1
            else:
                cur.append(item)
        if commas == 0:
            split_von(f,v,l,fvl)
        else:
            split_von(None,v,l,vl)

        parsedAuthors.append(ParsedAuthor(f,v,l,j))
        #print " ====> ", parsedAuthors[-1]

    return parsedAuthors

ALLCHARS = "".join(map(chr,range(256)))
LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
               "abcdefghijklmnopqrstuvwxyz"
               "@")
RE_ESCAPED = re.compile(r'\\.')
# Distribute the name tokens in x among first (f), von (v), and last (l).
def split_von(f,v,l,x):
    in_von = 0
    while x:
        tt = t = x[0]
        del x[0]
        if tt[:2] == '{\\':
            tt = tt.translate(ALLCHARS, SV_DELCHARS)
            tt = RE_ESCAPED.sub("", tt)
            tt = tt.translate(ALLCHARS, "{}")
        if tt.translate(ALLCHARS, LC_CHARS) == "":
            v.append(t)
            in_von = 1
        elif in_von and f is not None:
            l.append(t)
            l.extend(x)
            return
        else:
            f.append(t)
    if not in_von:
        l.append(f[-1])
        del f[-1]

class Parser:
    def __init__(self, fileiter, initial_strings):
        self.strings = INITIAL_STRINGS.copy()
        self.strings.update(initial_strings)
        self.fileiter = fileiter
        self.entries = {}
        self.result = BibTeX()
        self.litStringLine = 0
        self.entryLine = 0

    def _parseKey(self, line):
        it = self.fileiter
        line = _advance(it,line)
        m = KEY_RE.match(line)
        if not m:
            raise ParseError("Expected key at line %s"%self.fileiter.lineno)
        key, line = m.groups()
        return key, line

    def _parseValue(self, line):
        it = self.fileiter
        bracelevel = 0
        data = []
        while 1:
            line = _advance(it,line)
            line = line.strip()
            assert line

            # Literal string?
            if line[0] == '"':
                line=line[1:]
                self.litStringLine = it.lineno
                while 1:
                    if bracelevel:
                        m = BRACE_CLOSE_RE.match(line)
                        if m:
                            data.append(m.group(1))
                            data.append('}')
                            line = m.group(2)
                            bracelevel -= 1
                            continue
                    else:
                        m = STRING_CLOSE_RE.match(line)
                        if m:
                            data.append(m.group(1))
                            line = m.group(2)
                            break
                    m = BRACE_OPEN_RE.match(line)
                    if m:
                        data.append(m.group(1))
                        line = m.group(2)
                        bracelevel += 1
                        continue
                    data.append(line)
                    line = it.next()
                self.litStringLine = 0
            elif line[0] == '{':
                bracelevel += 1
                line = line[1:]
                while bracelevel:
                    m = BRACE_CLOSE_RE.match(line)
                    if m:
                        #print bracelevel, "A", repr(m.group(1))
                        data.append(m.group(1))
                        bracelevel -= 1
                        if bracelevel > 0:
                            #print bracelevel, "- '}'"
                            data.append('}')
                        line = m.group(2)
                        continue
                    m = BRACE_OPEN_RE.match(line)
                    if m:
                        bracelevel += 1
                        #print bracelevel, "B", repr(m.group(1))
                        data.append(m.group(1))
                        line = m.group(2)
                        continue
                    else:
                        #print bracelevel, "C", repr(line)
                        data.append(line)
                        line = it.next()
            elif line[0] == '#':
                print >>sys.stderr, "Weird concat on line %s"%it.lineno
            elif line[0] in "},":
                if not data:
                    print >>sys.stderr, "No data after field on line %s"%(
                        it.lineno)
            else:
                m = RAW_DATA_RE.match(line)
                if m:
                    s = self.strings.get(m.group(1).lower())
                    if s is not None:
                        data.append(s)
                    else:
                        data.append(m.group(1))
                    line = m.group(2)
                else:
                    raise ParseError("Questionable line at line %s"%it.lineno)

            # Got a string, check for concatenation.
            line = _advance(it,line)
            line = line.strip()
            assert line
            if line[0] == '#':
                line = line[1:]
            else:
                return "".join(data), line

    def _parseEntry(self, line): #name, strings, entries
        it = self.fileiter
        self.entryLine = it.lineno
        line = _advance(it,line)
        m = BRACE_BEGIN_RE.match(line)
        if not m:
            raise ParseError("Expected an opening brace at line %s"%it.lineno)
        line = m.group(1)

        # 'k' = cite key, 'p' = key=value pair, 'v' = bare value;
        # a trailing '*' means the last element may repeat.
        proto = { 'string' : 'p',
                  'preamble' : 'v',
                  }.get(self.curEntType, 'kp*')

        v = []
        while 1:
            line = _advance(it,line)

            m = BRACE_END_RE.match(line)
            if m:
                line = m.group(1)
                break
            if not proto:
                raise ParseError("Overlong entry starting on line %s"
                                 % self.entryLine)
            elif proto[0] == 'k':
                key, line = self._parseKey(line)
                v.append(key)
            elif proto[0] == 'v':
                value, line = self._parseValue(line)
                v.append(value)
            elif proto[0] == 'p':
                key, line = self._parseKey(line)
                v.append(key)
                line = _advance(it,line)
                line = line.lstrip()
                if line[0] == '=':
                    line = line[1:]
                value, line = self._parseValue(line)
                v.append(value)
            else:
                assert 0
            line = line.strip()
            if line and line[0] == ',':
                line = line[1:]
            if proto and proto[1:] != '*':
                proto = proto[1:]
        if proto and proto[1:] != '*':
            raise ParseError("Missing arguments to %s on line %s" % (
                self.curEntType, self.entryLine))

        if self.curEntType == 'string':
            self.strings[v[0]] = v[1]
        elif self.curEntType == 'preamble':
            pass
        else:
            key = v[0]
            d = {}
            for i in xrange(1,len(v),2):
                d[v[i].lower()] = v[i+1]
            ent = BibTeXEntry(self.curEntType, key, d)
            self.result.addEntry(ent)

        return line

    def parse(self):
        try:
            self._parse()
        except StopIteration:
            if self.litStringLine:
                raise ParseError("Unexpected EOF in string (%s)" %
                                 self.litStringLine)
            elif self.entryLine:
                raise ParseError("Unexpected EOF at line %s (%s)" % (
                    self.fileiter.lineno, self.entryLine))

        return self.result

    def _parse(self):
        it = self.fileiter
        line = it.next()
        while 1:
            while not line or line.isspace() or OUTER_COMMENT_RE.match(line):
                line = it.next()
            m = ENTRY_BEGIN_RE.match(line)
            if m:
                self.curEntType = m.group(1).lower()
                line = m.group(2)
                line = self._parseEntry(line)
                self.entryLine = 0
            else:
                raise ParseError("Bad input at line %s (expected a new entry.)"
                                 % it.lineno)

# Skip blank and comment lines; return the next line with content.
def _advance(it,line):
    while not line or line.isspace() or COMMENT_RE.match(line):
        line = it.next()
    return line

OUTER_COMMENT_RE = re.compile(r'^\s*[\#\%]')
COMMENT_RE = re.compile(r'^\s*\%')
ENTRY_BEGIN_RE = re.compile(r'''^\s*\@([^\s\"\%\'\(\)\,\=\{\}]+)(.*)''')
BRACE_BEGIN_RE = re.compile(r'\s*\{(.*)')
BRACE_END_RE = re.compile(r'\s*\}(.*)')
KEY_RE = re.compile(r'''\s*([^\"\#\%\'\(\)\,\=\{\}\s]+)(.*)''')

STRING_CLOSE_RE = re.compile(r'^([^\{\}\"]*)\"(.*)')
BRACE_CLOSE_RE = re.compile(r'^([^\{\}]*)\}(.*)')
BRACE_OPEN_RE = re.compile(r'^([^\{\}]*\{)(.*)')
RAW_DATA_RE = re.compile(r'^([^\s\},]+)(.*)')

if __name__ == '__main__':
    f = FileIter(fname="testbib/pdos.bib")
    p = Parser(f, {})
    print p
    r = p.parse()
    r.resolve()
    for e in r.entries:
        print e
    for e in r.entries:
        if e.type in ("proceedings", "journal"): continue
        e.check()
    for e in r.entries:
        if e.type in ("proceedings", "journal"): continue
        print e.to_html()
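
For context, here is a minimal driver sketch (not part of this commit) showing how the classes above fit together, mirroring the module's own `__main__` block. It assumes Python 2 (to match the module) and a hypothetical input file `example.bib`; any entry without an `author` field would still trip up `to_html()`, just as in the `__main__` block.

```python
# Usage sketch for BibTeX.py (assumes Python 2 and a hypothetical example.bib).
import BibTeX

f = BibTeX.FileIter(fname="example.bib")     # hypothetical .bib input file
bib = BibTeX.Parser(f, {}).parse()           # parse into a BibTeX object
bib.resolve()                                # expand crossrefs, parse authors

for ent in bib.entries:
    if ent.type in ("proceedings", "journal"):
        continue                             # skip container entries
    ent.check()                              # warn about missing fields
    print ent.to_html()                      # emit one <li> block per entry
```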
