#!/usr/bin/python
import cStringIO
import re
import sys
import config
# Public names exported by ``from <module> import *``.  Every name listed
# here must actually be defined in this module, otherwise the star-import
# raises AttributeError; the two helper functions are therefore listed under
# the names they are defined with (splitEntriesBy / sortEntriesBy).
__all__ = (
    'ParseError',
    'BibTeX',
    'BibTeXEntry',
    'htmlize',
    'ParsedAuthor',
    'FileIter',
    'Parser',
    'parseFile',
    'splitEntriesBy',
    'sortEntriesBy',
)
class ParseError(Exception):
    """Error raised for malformed BibTeX input.

    Raised by the parser for syntax problems and by ``BibTeX.resolve`` when
    crossrefs form a cycle.
    """
class BibTeX:
    """A collection of BibTeX entries.

    Attributes:
        entries: list of entries in the order they were added.
        byKey:   dict mapping an entry's ``key`` to the entry.
    """

    def __init__(self):
        self.entries = []
        self.byKey = {}

    def addEntry(self, ent):
        """Add *ent* unless an entry with the same key already exists.

        A duplicate key is reported on stderr and the new entry is dropped.
        """
        k = ent.key
        if self.byKey.get(k):
            sys.stderr.write("Already have an entry named %s\n" % k)
            return
        self.entries.append(ent)
        self.byKey[k] = ent

    def _lookupCrossref(self, name):
        """Return the entry *name* refers to, or None.

        Keys are stored exactly as given to addEntry, so the lowercased
        name is tried first (the historical behaviour) and the verbatim
        name second; otherwise a crossref to a mixed-case key never
        resolves.
        """
        cr = self.byKey.get(name.lower())
        if cr is None:
            cr = self.byKey.get(name)
        return cr

    def resolve(self):
        """Expand crossrefs, resolve every entry, and drop omitted types.

        For each entry with a ``crossref`` field, fields of the referenced
        entry are copied in as defaults: a field the entry defines itself
        is never overwritten.  Chains of crossrefs are followed.  Finally,
        entries whose type is listed in ``config.OMIT_ENTRIES`` are removed.

        Raises:
            ParseError: if a crossref chain revisits an entry.
        """
        for ent in self.entries:
            seen = {}
            while ent.get('crossref'):
                name = ent['crossref']
                cr = self._lookupCrossref(name)
                if cr is None:
                    sys.stdout.write("No such crossref: %s\n" % name)
                    sys.stdout.write("%s\n" % ent)
                    break
                if seen.get(cr.key):
                    raise ParseError("Circular crossref at %s" % ent.key)
                seen[cr.key] = 1
                del ent.entries['crossref']
                # Inherit only the fields the entry does not define itself.
                # This also picks up the parent's own crossref (if any), so
                # the loop continues up the chain.
                for field, value in list(cr.entries.items()):
                    if field not in ent.entries:
                        ent.entries[field] = value
            ent.resolve()

        kept = []
        for ent in self.entries:
            if ent.type in config.OMIT_ENTRIES:
                self.byKey.pop(ent.key, None)
            else:
                kept.append(ent)
        self.entries = kept
def splitEntriesBy(entries, field):
    """Group *entries* by the value of *field*.

    Args:
        entries: iterable of objects providing ``get(field)``.
        field:   name of the field to group by.

    Returns:
        A dict mapping each field value (``None`` for entries lacking the
        field) to the list of entries having it, in input order.
    """
    result = {}
    for ent in entries:
        # setdefault replaces the former bare ``except:``, which would also
        # have swallowed unrelated errors.
        result.setdefault(ent.get(field), []).append(ent)
    return result
def sortEntriesBy(self, field):
    """Return entries sorted by the value of *field*.

    Args:
        self:  either an iterable of entries or an object exposing an
               ``entries`` attribute holding them (both call styles are
               accepted because the first parameter is named ``self``).
        field: name of the field to sort by; read with ``ent.get(field)``.

    Returns:
        A new list of the entries.  Entries lacking the field come first;
        entries with equal values keep their input order.
    """
    entries = getattr(self, 'entries', self)
    decorated = []
    for index, ent in enumerate(entries):
        value = ent.get(field)
        # The leading flag keeps None out of value comparisons, and the
        # index breaks ties so entries themselves are never compared.
        decorated.append((value is not None, value, index, ent))
    decorated.sort()
    return [t[-1] for t in decorated]
# Field names, in the order BibTeXEntry.format() writes them out.
DISPLAYED_FIELDS = [
    'title',
    'author',
    'journal',
    'booktitle',
    'school',
    'institution',
    'organization',
    'volume',
    'number',
    'year',
    'month',
    'address',
    'chapter',
    'edition',
    'pages',
    'editor',
    'howpublished',
    'key',
    'publisher',
    'type',
    'note',
]
class BibTeXEntry:
    """One BibTeX entry: its type, its citation key and its fields.

    Fields are kept in the ``entries`` dict and can be read and written
    with ``entry[name]`` or read with ``entry.get(name, default)``.

    NOTE(review): the HTML markup emitted by biblio_to_html() and to_html()
    (tags and class names) was reconstructed; the visible text is unchanged.
    Confirm the tag/class names against the site's stylesheet.
    """

    def __init__(self, type, key, entries):
        self.type = type
        self.key = key
        self.entries = entries
        self._get = self.entries.__getitem__

    def get(self, k, v=None):
        """Return field *k*, or *v* when the entry has no such field."""
        return self.entries.get(k, v)

    def __getitem__(self, k):
        return self._get(k)

    def __setitem__(self, k, v):
        self.entries[k] = v

    def __str__(self):
        return self.format(70, 1)

    def format(self, width=70, v=0):
        """Return the entry as BibTeX source text.

        Args:
            width: column at which long field lines are wrapped.
            v:     when true, fields not listed in DISPLAYED_FIELDS are
                   written too (after the listed ones, sorted by name).
        """
        if v:
            extra = [k for k in self.entries if k not in DISPLAYED_FIELDS]
            extra.sort()
            fields = DISPLAYED_FIELDS + extra
        else:
            fields = DISPLAYED_FIELDS
        d = ["@%s{%s,\n" % (self.type, self.key)]
        for f in fields:
            if f not in self.entries:
                continue
            d.append(" ")
            # Fields are comma-terminated so the output is valid BibTeX.
            d.append(_split("%s = {%s},\n" % (f, self.entries[f]), width))
        d.append("}\n")
        return "".join(d)

    def resolve(self):
        """Set ``parsedAuthor`` from the author field (None if absent)."""
        a = self.get('author')
        if a:
            self.parsedAuthor = parseAuthor(a)
        else:
            self.parsedAuthor = None

    def check(self):
        """Check that the fields required for this entry type are present.

        Each missing field is reported on stdout and filled with the
        placeholder ``"<field>:??"``.

        Returns:
            1 if nothing was missing, 0 otherwise.
        """
        if self.type == 'inproceedings':
            fields = 'booktitle', 'year'
        elif self.type == 'article':
            fields = 'journal', 'year'
        elif self.type == 'techreport':
            fields = 'institution', 'number'
        elif self.type == 'misc':
            fields = 'howpublished',
        else:
            fields = ()
        fields += 'title', 'author'
        ok = 1
        for field in fields:
            if not self.get(field):
                sys.stdout.write("ERROR: %s has no %s field\n"
                                 % (self.key, field))
                self.entries[field] = "%s:??" % field
                ok = 0
        return ok

    def _date_html(self):
        """Return [", <month> <year>"] built from whichever are present."""
        when = " ".join([x for x in (self.get('month'), self.get('year'))
                         if x])
        if when:
            return [", %s" % when]
        return []

    def _pages_html(self):
        """Return [", page(s) N"] for the pages field, or [] if absent."""
        pages = self.get('pages')
        if not pages:
            return []
        if "-" in pages:
            return [", pages %s" % pages]
        return [", page %s" % pages]

    def biblio_to_html(self):
        """Return the venue/publication line of the entry as HTML."""
        if self.type == 'inproceedings':
            booktitle = self['booktitle']
            bookurl = self.get('bookurl')
            if bookurl:
                m = PROCEEDINGS_RE.match(booktitle)
                if m:
                    # Link only the event name, not "Proceedings of the".
                    res = ["In the ", m.group(1),
                           '<a href="%s">' % bookurl, m.group(2), "</a>"]
                else:
                    res = ['In the <a href="%s">%s</a>'
                           % (bookurl, booktitle)]
            else:
                res = ["In the ", booktitle]
            if self.get("edition"):
                res.append(", ")
                res.append(self['edition'])
            if self.get("address"):
                res.append(", ")
                res.append(self['address'])
            res.extend(self._date_html())
            res.extend(self._pages_html())
        elif self.type == 'article':
            res = ["In "]
            if self.get('journalurl'):
                res.append('<a href="%s">%s</a>'
                           % (self['journalurl'], self['journal']))
            else:
                res.append(self['journal'])
            if self.get('volume'):
                res.append(" %s" % self['volume'])
            if self.get('number'):
                res.append("(%s)" % self['number'])
            res.extend(self._date_html())
            res.extend(self._pages_html())
        elif self.type == 'techreport':
            res = ["%s %s %s" % (self['institution'],
                                 self.get('type', 'technical report'),
                                 self['number'])]
            res.extend(self._date_html())
        elif self.type == 'mastersthesis' or self.type == 'phdthesis':
            if self.get('type'):
                res = [self['type']]
            elif self.type == 'mastersthesis':
                res = ["Master's thesis"]
            else:
                res = ["Ph.D. thesis"]
            if self.get('school'):
                res.append(", %s" % (self['school']))
            res.extend(self._date_html())
        elif self.type == 'misc':
            res = [self['howpublished']]
            res.extend(self._date_html())
            res.extend(self._pages_html())
        else:
            # Escaped so the browser shows the text instead of a bogus tag.
            res = ["&lt;Odd type %s&gt;" % self.type]

        res[0:0] = ["<span class='biblio'>"]
        res.append("</span>")
        res.append(" <span class='availability'>"
                   "(BibTeX entry)"
                   "</span>")
        return htmlize("".join(res))

    def to_html(self):
        """Return the whole entry (title, links, authors, venue) as HTML."""
        res = ["<li><p class='entry'><span class='title'>%s</span>" % (
            htmlize(self['title']))]
        availability = []
        for key, name in (('www_abstract_url', 'abstract'),
                          ('www_html_url', 'HTML'),
                          ('www_pdf_url', 'PDF'),
                          ('www_ps_url', 'PS'),
                          ('www_txt_url', 'TXT'),
                          ('www_ps_gz_url', 'gzipped PS')):
            url = self.get(key)
            if not url:
                continue
            availability.append('<a href="%s">%s</a>' % (url, name))
        if availability:
            res.append(" (")
            res.append(", ".join(availability))
            res.append(")")

        htmlAuthors = []
        # parsedAuthor is None when the entry has no author field, and is
        # only set once resolve() has run.
        for author in getattr(self, 'parsedAuthor', None) or []:
            f, v, l, j = author.first, author.von, author.last, author.jr
            a = " ".join(f + v + l)
            if j:
                # jr is a list of words, like the other name parts.
                a = "%s, %s" % (a, " ".join(j))
            a = htmlize(a)
            htmlAuthor = None
            for pat, url in config.AUTHOR_RE_LIST:
                if pat.search(a):
                    htmlAuthor = '<a href="%s">%s</a>' % (url, a)
                    break
            if not htmlAuthor:
                htmlAuthor = a
            htmlAuthors.append(htmlAuthor)

        if htmlAuthors:
            res.append("<br>by ")
            if len(htmlAuthors) == 1:
                res.append(htmlAuthors[0])
            elif len(htmlAuthors) == 2:
                res.append(" and ".join(htmlAuthors))
            else:
                res.append(", ".join(htmlAuthors[:-1]))
                res.append(", and ")
                res.append(htmlAuthors[-1])
        if not res[-1].endswith('.'):
            res.append(".")
        res.append("</p><p class='details'>\n")
        res.append(self.biblio_to_html())
        res.append("</p></li>\n\n")
        return "".join(res)
# An ampersand not followed by a lowercase letter or digit, i.e. one that
# does not start an entity such as "&eacute;".
RE_LONE_AMP = re.compile(r'&([^a-z0-9])')
# The TeX dotless-i command "\i" followed by a non-alphanumeric character.
RE_LONE_I = re.compile(r'\\i([^a-z0-9])')
# A TeX accent command applied to the following character, e.g. \'e.
RE_ACCENT = re.compile(r'\\([\'`~^"])(.)')
# TeX accent character -> suffix of the corresponding HTML entity name.
ACCENT_MAP = { "'": 'acute', "`" : 'grave', "~": 'tilde',
               "^": 'circ', '"' : 'uml' }
# Any remaining TeX command: a backslash plus a letter run or one character.
RE_TEX_CMD = re.compile(r"(?:\\[a-zA-Z@]+|\\.)")
# A BibTeX page range written with a double dash between digits.
RE_PAGE_SPAN = re.compile(r"(\d)--(\d)")

def htmlize(s):
    """Convert a string with simple TeX markup into HTML text.

    The steps are, in order: escape lone ampersands as ``&amp;``, replace
    ``\\i`` by ``i``, turn accent commands into character entities, drop
    any other TeX commands, remove braces, and collapse ``--`` between
    digits into a single ``-``.
    """
    s = RE_LONE_AMP.sub(lambda m: "&amp;%s" % m.group(1), s)
    s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
    s = RE_ACCENT.sub(lambda m: "&%s%s;" % (m.group(2),
                                            ACCENT_MAP[m.group(1)]),
                      s)
    s = RE_TEX_CMD.sub("", s)
    # Same effect as the old two-argument str.translate(), but works for
    # both byte and unicode strings.
    s = s.replace("{", "").replace("}", "")
    s = RE_PAGE_SPAN.sub(lambda m: "%s-%s" % m.groups(), s)
    return s
# Splits a booktitle into its "Proceedings of (the) " / "Workshop record of
# (the) " prefix (group 1) and the remainder (group 2), ignoring case.
PROCEEDINGS_RE = re.compile(
    r'((?:proceedings|workshop record) of(?: the)? )(.*)',
    re.IGNORECASE)
class ParsedAuthor:
    """An author name split into first, von, last and jr parts.

    Each part is stored as given; ``__str__`` concatenates the four parts
    with ``+``, so they are expected to be lists of words.
    """

    def __init__(self, first, von, last, jr):
        self.first, self.von = first, von
        self.last, self.jr = last, jr

    def __repr__(self):
        fields = (self.first, self.von, self.last, self.jr)
        return "ParsedAuthor(%r,%r,%r,%r)" % fields

    def __str__(self):
        words = self.first + self.von + self.last + self.jr
        return " ".join(words)
def _split(s,w=79):
    """Wrap *s* into lines of at most *w* characters.

    Newlines in *s* are first turned into spaces.  Each line is broken at
    the last space found at an index from 1 to w-1; if there is none, the
    line is cut after exactly *w* characters.  The result always ends with
    a newline.
    """
    text = s.replace("\n", " ")
    pieces = []
    while len(text) > w:
        # rfind over [1, w) picks the same break point as scanning the
        # indices w-1 down to 1; for w <= 1 there is nothing to scan.
        if w > 1:
            cut = text.rfind(' ', 1, w)
        else:
            cut = -1
        if cut == -1:
            pieces.append(text[:w])
            text = text[w:]
        else:
            pieces.append(text[:cut])
            text = text[cut+1:]
    pieces.extend([text, ""])
    return "\n".join(pieces)
class FileIter:
    """Line iterator that counts the lines it has handed out.

    Exactly one source is used, chosen in this order of precedence:
    ``string`` (text to read from), ``fname`` (path of a file to open),
    ``file`` (an open file-like object), ``it`` (any iterator of lines).

    Attributes:
        iter:   the underlying line iterator.
        lineno: number of calls made to next() so far (it is incremented
                before the underlying iterator is consulted).

    Raises:
        ValueError: if no source is given.
    """

    def __init__(self, fname=None, file=None, it=None, string=None):
        self._opened = None
        if fname:
            file = open(fname, 'r')
            # Remember files opened here so they can be closed when done.
            self._opened = file
        if string is not None:
            # "is not None" so that an empty string is a valid (empty) input.
            file = cStringIO.StringIO(string)
        if file is not None:
            it = iter(file)
        if it is None:
            raise ValueError(
                "FileIter needs one of fname, file, it or string")
        self.iter = it
        self.lineno = 0
        # Iterators expose next() on Python 2 and __next__() on Python 3.
        self._next = getattr(it, 'next', None) or it.__next__

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; raise StopIteration at end of input."""
        self.lineno += 1
        if self._next is None:
            raise StopIteration
        try:
            return self._next()
        except StopIteration:
            # Input exhausted: release a file opened from ``fname``.
            self._next = None
            if self._opened is not None:
                self._opened.close()
                self._opened = None
            raise

    __next__ = next
def parseAuthor(s):
    """Parse a BibTeX author field into a list of ParsedAuthor objects.

    The field is split into words at whitespace and commas that are not
    inside braces; the word ``and`` separates authors.  Within one author:

      * no comma:   ``First von Last``
      * one comma:  ``von Last, First``
      * two commas: ``von Last, Jr, First``

    Authors that contain no words at all (e.g. a trailing ``and``) are
    skipped.
    """
    # Step 1: tokenize into words and ',' markers.
    items = []
    s = s.strip()
    while s:
        bracelevel = 0
        cut = len(s)
        for i, ch in enumerate(s):
            if ch == '{':
                bracelevel += 1
            elif ch == '}':
                bracelevel -= 1
            elif bracelevel <= 0 and ch in " \t\n,":
                cut = i
                break
        if s[:cut]:
            items.append(s[:cut])
        if cut < len(s) and s[cut] == ',':
            items.append(',')
        s = s[cut+1:].strip()

    # Step 2: group the tokens by author.
    authors = [[]]
    for item in items:
        if item == 'and':
            authors.append([])
        else:
            authors[-1].append(item)

    # Step 3: split each author into first, von, last, jr.
    parsedAuthors = []
    for author in authors:
        commas = 0
        fvl = []        # words seen before the first comma
        vl = []         # "von Last" part of a comma-separated name
        f = []
        v = []
        l = []
        j = []
        cur = fvl
        for item in author:
            if item == ',':
                if commas == 0:
                    vl = fvl
                    fvl = []
                    cur = f
                else:
                    # What looked like the first name was the "Jr" part;
                    # the real first name follows this comma.
                    j.extend(f)
                    f = []
                    cur = f
                commas += 1
            else:
                cur.append(item)

        if commas == 0:
            if fvl:
                split_von(f, v, l, fvl)
        elif vl:
            # Before the comma there is no first name: whatever split_von
            # files as "first" is really the leading part of the last name.
            leading = []
            split_von(leading, v, l, vl)
            l[0:0] = leading

        if f or v or l or j:
            parsedAuthors.append(ParsedAuthor(f, v, l, j))
    return parsedAuthors
# Identity table formerly needed by two-argument str.translate(); kept
# because other code in this module may still refer to it.
ALLCHARS = "".join(map(chr,range(256)))
LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
# Characters removed from a token starting with "{\" before the von test.
SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
               "abcdefghijklmnopqrstuvwxyz"
               "@")
RE_ESCAPED = re.compile(r'\\.')

def _strip_chars(s, chars):
    """Return *s* without any of the characters in *chars*."""
    return "".join([c for c in s if c not in chars])

def split_von(f,v,l,x):
    """Distribute the name tokens in *x* over first, von and last parts.

    Tokens are consumed from the front of *x*.  A token made up only of
    lowercase ASCII letters is a "von" token and goes to *v*.  The first
    non-von token after a von token, and everything after it, goes to *l*
    (those later tokens are left in *x* as well).  Other tokens go to *f*;
    if no von token was found, the last token collected in *f* is moved to
    *l*.

    Args:
        f: list receiving first-name tokens, or None when the name has no
           first-name part; non-von tokens then go straight to *l*.
        v: list receiving von tokens.
        l: list receiving last-name tokens.
        x: list of tokens to classify; modified in place.

    NOTE(review): for a token starting with "{\\", all letters are removed
    before the lowercase test, so such a token counts as "von" whenever it
    consists only of letters, escapes and braces -- confirm this is
    intended.
    """
    in_von = 0
    while x:
        t = x.pop(0)
        tt = t
        if tt[:2] == '{\\':
            tt = _strip_chars(tt, SV_DELCHARS)
            tt = RE_ESCAPED.sub("", tt)
            tt = _strip_chars(tt, "{}")
        if _strip_chars(tt, LC_CHARS) == "":
            v.append(t)
            in_von = 1
        elif in_von:
            l.append(t)
            l.extend(x)
            return
        elif f is None:
            l.append(t)
        else:
            f.append(t)
    # Guarded so that an empty token list or f=None does not raise.
    if not in_von and f:
        l.append(f.pop())
class Parser:
    """Parser for BibTeX source read from a FileIter.

    Parsed entries are collected in ``self.result`` (a BibTeX object).
    ``@string`` definitions are stored in ``self.strings`` under their
    lowercased name, which is also how they are looked up.
    """

    def __init__(self, fileiter, initial_strings):
        """
        Args:
            fileiter:        line source providing ``next()`` and ``lineno``.
            initial_strings: dict of predefined @string values, merged over
                             ``config.INITIAL_STRINGS``.
        """
        self.strings = config.INITIAL_STRINGS.copy()
        self.strings.update(initial_strings)
        self.fileiter = fileiter
        self.entries = {}
        self.result = BibTeX()
        # Line where the string literal / entry being parsed started, or 0
        # when not inside one; used to report an unexpected end of file.
        self.litStringLine = 0
        self.entryLine = 0
        # Lowercased type of the entry being parsed (set by _parse).
        self.curEntType = None

    def _parseKey(self, line):
        """Parse a key or field name at the start of *line*.

        Returns:
            (key, rest_of_line)

        Raises:
            ParseError: if no key is found.
        """
        it = self.fileiter
        line = _advance(it,line)
        m = KEY_RE.match(line)
        if not m:
            raise ParseError("Expected key at line %s"%self.fileiter.lineno)
        key, line = m.groups()
        return key, line

    def _parseValue(self, line):
        """Parse a field value, following ``#`` concatenations.

        A value is a sequence of pieces joined by ``#``; each piece is a
        quoted string, a braced string, or a bare token (replaced by its
        @string definition when one exists).

        Returns:
            (value, rest_of_line)

        Raises:
            ParseError: if a piece cannot be recognized.
        """
        it = self.fileiter
        bracelevel = 0
        data = []
        while 1:
            line = _advance(it,line)
            line = line.strip()
            assert line
            # Literal string?
            if line[0] == '"':
                line=line[1:]
                self.litStringLine = it.lineno
                while 1:
                    if bracelevel:
                        # Inside braces a quote does not end the string.
                        m = BRACE_CLOSE_RE.match(line)
                        if m:
                            data.append(m.group(1))
                            data.append('}')
                            line = m.group(2)
                            bracelevel -= 1
                            continue
                    else:
                        m = STRING_CLOSE_RE.match(line)
                        if m:
                            data.append(m.group(1))
                            line = m.group(2)
                            break
                    m = BRACE_OPEN_RE.match(line)
                    if m:
                        data.append(m.group(1))
                        line = m.group(2)
                        bracelevel += 1
                        continue
                    # The string continues on the next line.
                    data.append(line)
                    line = it.next()
                self.litStringLine = 0
            elif line[0] == '{':
                bracelevel += 1
                line = line[1:]
                while bracelevel:
                    m = BRACE_CLOSE_RE.match(line)
                    if m:
                        data.append(m.group(1))
                        bracelevel -= 1
                        # The outermost closing brace is not part of the
                        # value; nested ones are.
                        if bracelevel > 0:
                            data.append('}')
                        line = m.group(2)
                        continue
                    m = BRACE_OPEN_RE.match(line)
                    if m:
                        bracelevel += 1
                        data.append(m.group(1))
                        line = m.group(2)
                        continue
                    else:
                        data.append(line)
                        line = it.next()
            elif line[0] == '#':
                sys.stderr.write("Weird concat on line %s\n"%it.lineno)
            elif line[0] in "},":
                if not data:
                    sys.stderr.write("No data after field on line %s\n"%(
                        it.lineno))
            else:
                m = RAW_DATA_RE.match(line)
                if m:
                    s = self.strings.get(m.group(1).lower())
                    if s is not None:
                        data.append(s)
                    else:
                        data.append(m.group(1))
                    line = m.group(2)
                else:
                    raise ParseError("Questionable line at line %s"%it.lineno)

            # Got a string, check for concatenation.
            line = _advance(it,line)
            line = line.strip()
            assert line
            if line[0] == '#':
                line = line[1:]
            else:
                return "".join(data), line

    def _parseEntry(self, line): #name, strings, entries
        """Parse the braced body of the entry whose type is curEntType.

        ``@string`` definitions are recorded in ``self.strings``,
        ``@preamble`` is ignored, and any other entry is added to
        ``self.result``.

        Returns:
            The text remaining on the line after the closing brace.

        Raises:
            ParseError: on a missing opening brace, or too few or too many
            items for the entry type.
        """
        it = self.fileiter
        self.entryLine = it.lineno
        line = _advance(it,line)

        m = BRACE_BEGIN_RE.match(line)
        if not m:
            raise ParseError("Expected an opening brace at line %s"%it.lineno)
        line = m.group(1)

        # What the body must contain: 'k' is a key, 'v' a value, 'p' a
        # "name = value" pair; a trailing '*' lets the last item repeat.
        proto = { 'string' : 'p',
                  'preamble' : 'v',
                  }.get(self.curEntType, 'kp*')

        v = []
        while 1:
            line = _advance(it,line)

            m = BRACE_END_RE.match(line)
            if m:
                line = m.group(1)
                break
            if not proto:
                raise ParseError("Overlong entry starting on line %s"
                                 % self.entryLine)
            elif proto[0] == 'k':
                key, line = self._parseKey(line)
                v.append(key)
            elif proto[0] == 'v':
                value, line = self._parseValue(line)
                v.append(value)
            elif proto[0] == 'p':
                key, line = self._parseKey(line)
                v.append(key)
                line = _advance(it,line)
                line = line.lstrip()
                if line[0] == '=':
                    line = line[1:]
                value, line = self._parseValue(line)
                v.append(value)
            else:
                assert 0
            line = line.strip()
            if line and line[0] == ',':
                line = line[1:]
            if proto and proto[1:] != '*':
                proto = proto[1:]
        if proto and proto[1:] != '*':
            raise ParseError("Missing arguments to %s on line %s" % (
                self.curEntType, self.entryLine))

        if self.curEntType == 'string':
            # Stored lowercased because _parseValue looks names up
            # lowercased; otherwise a macro with capitals is never found.
            self.strings[v[0].lower()] = v[1]
        elif self.curEntType == 'preamble':
            pass
        else:
            key = v[0]
            d = {}
            for i in range(1,len(v),2):
                d[v[i].lower()] = v[i+1]
            ent = BibTeXEntry(self.curEntType, key, d)
            self.result.addEntry(ent)

        return line

    def parse(self):
        """Parse the whole input and return the resulting BibTeX object.

        Raises:
            ParseError: on malformed input, including end of file inside a
            string literal or an entry.
        """
        try:
            self._parse()
        except StopIteration:
            # End of input is the normal way out of _parse(), unless it
            # happened in the middle of a string or an entry.
            if self.litStringLine:
                raise ParseError("Unexpected EOF in string (%s)" %
                                 self.litStringLine)
            elif self.entryLine:
                raise ParseError("Unexpected EOF at line %s (%s)" % (
                    self.fileiter.lineno, self.entryLine))

        return self.result

    def _parse(self):
        """Parse entries until the line source raises StopIteration."""
        it = self.fileiter
        line = it.next()
        while 1:
            # Skip blank lines and comments between entries.
            while not line or line.isspace() or OUTER_COMMENT_RE.match(line):
                line = it.next()
            m = ENTRY_BEGIN_RE.match(line)
            if m:
                self.curEntType = m.group(1).lower()
                line = m.group(2)
                line = self._parseEntry(line)
                self.entryLine = 0
            else:
                raise ParseError("Bad input at line %s (expected a new entry.)"
                                 % it.lineno)
def _advance(it,line):
    """Return the first line with content, starting with *line* itself.

    Lines that are empty, whitespace-only, or '%' comments are skipped by
    fetching replacements from ``it.next()``.
    """
    while True:
        if line and not line.isspace() and not COMMENT_RE.match(line):
            return line
        line = it.next()

# Regular expressions used by the parser.  Each "(.*)" group captures the
# rest of the line after the matched construct.

# Between entries, lines starting with '#' or '%' are comments ...
OUTER_COMMENT_RE = re.compile(r'^\s*[\#\%]')
# ... inside an entry, only lines starting with '%'.
COMMENT_RE = re.compile(r'^\s*\%')
# "@type" at the start of an entry.
ENTRY_BEGIN_RE = re.compile(r'''^\s*\@([^\s\"\%\'\(\)\,\=\{\}]+)(.*)''')
# The brace opening / closing an entry body.
BRACE_BEGIN_RE = re.compile(r'\s*\{(.*)')
BRACE_END_RE = re.compile(r'\s*\}(.*)')
# A citation key or field name.
KEY_RE = re.compile(r'''\s*([^\"\#\%\'\(\)\,\=\{\}\s]+)(.*)''')
# Brace-free text up to the closing quote of a string literal.
STRING_CLOSE_RE = re.compile(r'^([^\{\}\"]*)\"(.*)')
# Brace-free text up to the next closing brace (brace not captured).
BRACE_CLOSE_RE = re.compile(r'^([^\{\}]*)\}(.*)')
# Brace-free text up to and including the next opening brace.
BRACE_OPEN_RE = re.compile(r'^([^\{\}]*\{)(.*)')
# A bare value token: everything up to whitespace, '}' or ','.
RAW_DATA_RE = re.compile(r'^([^\s\},]+)(.*)')
def parseFile(filename):
    """Parse the BibTeX file *filename* and return the BibTeX object.

    After parsing, crossrefs are resolved and check() is run on every
    entry (its return value is not used here).
    """
    parser = Parser(FileIter(fname=filename), {})
    bib = parser.parse()
    bib.resolve()
    for entry in bib.entries:
        entry.check()
    return bib
# Script mode: write the HTML of every entry in the given file (default
# "testbib/pdos.bib") to stdout, skipping proceedings and journal entries.
if __name__ == '__main__':
    import sys
    fname = "testbib/pdos.bib"
    if len(sys.argv)>1:
        fname = sys.argv[1]
    r = parseFile(fname)
    for e in r.entries:
        if e.type not in ("proceedings", "journal"):
            sys.stdout.write("%s\n" % e.to_html())