author     Thibaut Horel <thibaut.horel@gmail.com>  2016-02-04 19:46:04 -0500
committer  Thibaut Horel <thibaut.horel@gmail.com>  2016-02-04 19:46:04 -0500
commit     871c61c6b4351d4a9dd78ba1d70d6e1af8ffe1e7 (patch)
tree       99bce3e74cbcff075dcb6bceacd0f2e1133bef4d
parent     fd20589a448cd19d036f18cabb1663c33a24375d (diff)
download   anonbib-871c61c6b4351d4a9dd78ba1d70d6e1af8ffe1e7.tar.gz
Start cleaning: PEP8 and split the BibTeX.py monster
-rw-r--r--  BibTeX.py  755
-rw-r--r--  entry.py   653
-rw-r--r--  utils.py   118
3 files changed, 789 insertions(+), 737 deletions(-)
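This commit splits the old BibTeX.py into three modules: BibTeX.py keeps the
parser and the file-level BibTeX container, entry.py takes BibTeXEntry,
ParsedAuthor and the author-name machinery, and utils.py takes the TeX-to-HTML
and path helpers. A minimal sketch of the post-split imports (module and
function names are taken from the diff below; no other renames are assumed):

    from BibTeX import BibTeX, Parser, parseFile    # parsing and containers
    from entry import BibTeXEntry, parseAuthor      # entries and author names
    from utils import htmlize, txtize, smartJoin    # TeX/HTML/path helpers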
diff --git a/BibTeX.py b/BibTeX.py
index e076200..d0f5624 100644
--- a/BibTeX.py
+++ b/BibTeX.py
@@ -14,63 +14,43 @@ import copy
import config
-import rank
+from entry import BibTeXEntry, buildAuthorTable
+from utils import txtize, url_untranslate, smartJoin
-__all__ = [ 'ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize',
- 'ParsedAuthor', 'FileIter', 'Parser', 'parseFile',
- 'splitBibTeXEntriesBy', 'sortBibTexEntriesBy', ]
+__all__ = ['ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize',
+ 'ParsedAuthor', 'FileIter', 'Parser', 'parseFile',
+ 'splitEntriesBy', 'sortEntriesBy']
# List: must map from month number to month name.
-MONTHS = [ None,
- "January", "February", "March", "April", "May", "June",
- "July", "August", "September", "October", "November", "December"]
+MONTHS = [None, "January", "February", "March", "April", "May", "June",
+ "July", "August", "September", "October", "November", "December"]
+
+
-# Fields that we only care about for making web pages (BibTeX doesn't
-# recognize them.)
-WWW_FIELDS = [ 'www_section', 'www_important', 'www_remarks',
- 'www_abstract_url', 'www_html_url', 'www_pdf_url', 'www_ps_url',
- 'www_txt_url', 'www_ps_gz_url', 'www_amazon_url',
- 'www_excerpt_url', 'www_publisher_url',
- 'www_cache_section', 'www_tags' ]
-def url_untranslate(s):
- """Change a BibTeX key into a string suitable for use in a URL."""
- s = re.sub(r'([%<>`#, &_\';])',
- lambda m: "_%02x"%ord(m.group(1)),
- s)
- s = s.replace("/",":")
- return s
class ParseError(Exception):
"""Raised on invalid BibTeX"""
pass
-def smartJoin(*lst):
- """Equivalent to os.path.join, but handle"." and ".." entries a bit better.
- """
- lst = [ item for item in lst if item != "." ]
- idx = 0
- while idx < len(lst):
- if idx > 0 and lst[idx] == "..":
- del lst[idx]
- else:
- idx += 1
- return os.path.join(*lst)
+
class BibTeX:
"""A parsed BibTeX file"""
def __init__(self):
- self.entries = [] # List of BibTeXEntry
- self.byKey = {} # Map from BibTeX key to BibTeX entry.
+ self.entries = [] # List of BibTeXEntry
+ self.byKey = {} # Map from BibTeX key to BibTeX entry.
+
def addEntry(self, ent):
"""Add a BibTeX entry to this file."""
k = ent.key
if self.byKey.get(ent.key.lower()):
- print >> sys.stderr, "Already have an entry named %s"%k
+ print >> sys.stderr, "Already have an entry named %s" % k
return
self.entries.append(ent)
self.byKey[ent.key.lower()] = ent
+
def resolve(self):
"""Validate all entries in this file, and resolve cross-references"""
seen = {}
@@ -80,7 +60,7 @@ class BibTeX:
try:
cr = self.byKey[ent['crossref'].lower()]
except KeyError:
- print "No such crossref: %s"% ent['crossref']
+ print "No such crossref: %s" % ent['crossref']
break
if seen.get(cr.key):
raise ParseError("Circular crossref at %s" % ent.key)
@@ -88,7 +68,7 @@ class BibTeX:
del ent.entries['crossref']
if cr.entryLine < ent.entryLine:
- print "Warning: crossref %s used after declaration"%cr.key
+ print "Warning: crossref %s used after declaration" % cr.key
for k in cr.entries.keys():
if ent.entries.has_key(k):
@@ -113,47 +93,7 @@ class BibTeX:
newEntries.append(ent)
self.entries = newEntries
-def buildAuthorTable(entries):
- """Given a list of BibTeXEntry, return a map from parsed author name to
- parsed canonical name.
- """
- authorsByLast = {}
- for e in entries:
- for a in e.parsedAuthor:
- authorsByLast.setdefault(tuple(a.last), []).append(a)
- # map from author to collapsed author.
- result = {}
- for k,v in config.COLLAPSE_AUTHORS.items():
- a = parseAuthor(k)[0]
- c = parseAuthor(v)[0]
- result[c] = c
- result[a] = c
-
- for e in entries:
- for author in e.parsedAuthor:
- if result.has_key(author):
- continue
-
- c = author
- for a in authorsByLast[tuple(author.last)]:
- if a is author:
- continue
- c = c.collapsesTo(a)
- result[author] = c
- if 0:
- for a,c in result.items():
- if a != c:
- print "Collapsing authors: %s => %s" % (a,c)
- if 0:
- print parseAuthor("Franz Kaashoek")[0].collapsesTo(
- parseAuthor("M. Franz Kaashoek")[0])
- print parseAuthor("Paul F. Syverson")[0].collapsesTo(
- parseAuthor("Paul Syverson")[0])
- print parseAuthor("Paul Syverson")[0].collapsesTo(
- parseAuthor("Paul F. Syverson")[0])
-
- return result
def splitEntriesBy(entries, field):
"""Take a list of BibTeX entries and the name of a bibtex field; return
@@ -281,570 +221,9 @@ def sortEntriesByDate(entries):
return [ t[2] for t in tmp ]
-# List of fields that appear when we display the entries as BibTeX.
-DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle',
-'school', 'institution', 'organization', 'volume', 'number', 'year',
-'month', 'address', 'location', 'chapter', 'edition', 'pages', 'editor',
-'howpublished', 'key', 'publisher', 'type', 'note', 'series' ]
-
-class BibTeXEntry:
- """A single BibTeX entry."""
- def __init__(self, type, key, entries):
- self.type = type # What kind of entry is it? (@book,@injournal,etc)
- self.key = key # What key does it have?
- self.entries = entries # Map from key to value.
- self.entryLine = 0 # Defined on this line number
- def get(self, k, v=None):
- return self.entries.get(k,v)
- def has_key(self, k):
- return self.entries.has_key(k)
- def __getitem__(self, k):
- return self.entries[k]
- def __setitem__(self, k, v):
- self.entries[k] = v
- def __str__(self):
- return self.format(70,1)
- def getURL(self):
- """Return the best URL to use for this paper, or None."""
- best = None
- for field in ['www_pdf_url', 'www_ps_gz_url', 'www_ps_url',
- 'www_html_url', 'www_txt_url', ]:
- u = self.get(field)
- if u:
- if not best:
- best = u
- elif (best.startswith("http://citeseer.nj.nec.com/")
- and not u.startswith("http://citeseer.nj.nec.com/")):
- best = u
- return best
-
- def format(self, width=70, indent=8, v=0, invStrings={}):
- """Format this entry as BibTeX."""
- d = ["@%s{%s,\n" % (self.type, self.key)]
- if v:
- df = DISPLAYED_FIELDS[:]
- for k in self.entries.keys():
- if k not in df:
- df.append(k)
- else:
- df = DISPLAYED_FIELDS
- for f in df:
- if not self.entries.has_key(f):
- continue
- v = self.entries[f]
- if v.startswith("<span class='bad'>"):
- d.append("%%%%% ERROR: Missing field\n")
- d.append("%% %s = {?????},\n"%f)
- continue
- np = v.translate(ALLCHARS, PRINTINGCHARS)
- if np:
- d.append("%%%%% "+("ERROR: Non-ASCII characters: '%r'\n"%np))
- d.append(" ")
- v = v.replace("&", "&amp;")
- if invStrings.has_key(v):
- s = "%s = %s,\n" %(f, invStrings[v])
- else:
- s = "%s = {%s},\n" % (f, v)
- d.append(_split(s,width,indent))
- d.append("}\n")
- return "".join(d)
- def resolve(self):
- """Handle post-processing for this entry"""
- a = self.get('author')
- if a:
- self.parsedAuthor = parseAuthor(a)
- #print a
- #print " => ",repr(self.parsedAuthor)
- else:
- self.parsedAuthor = None
-
- def isImportant(self):
- """Return 1 iff this entry is marked as important"""
- imp = self.get("www_important")
- if imp and imp.strip().lower() not in ("no", "false", "0"):
- return 1
- return 0
-
- def check(self):
- """Print any errors for this entry, and return true if there were
- none."""
- errs = self._check()
- for e in errs:
- print e
- return not errs
-
- def _check(self):
- errs = []
- if self.type == 'inproceedings':
- fields = 'booktitle', 'year'
- elif self.type == 'incollection':
- fields = 'booktitle', 'year'
- elif self.type == 'proceedings':
- fields = 'booktitle', 'editor'
- elif self.type == 'article':
- fields = 'journal', 'year'
- elif self.type == 'techreport':
- fields = 'institution',
- elif self.type == 'misc':
- fields = 'howpublished',
- elif self.type in ('mastersthesis', 'phdthesis'):
- fields = ()
- else:
- fields = ()
- errs.append("ERROR: odd type %s"%self.type)
- if self.type != 'proceedings':
- fields += 'title', 'author', 'www_section', 'year'
-
- for field in fields:
- if self.get(field) is None or \
- self.get(field).startswith("<span class='bad'>"):
- errs.append("ERROR: %s has no %s field" % (self.key, field))
- self.entries[field] = "<span class='bad'>%s:??</span>"%field
-
- if self.type == 'inproceedings':
- if self.get("booktitle"):
- if not self['booktitle'].startswith("Proceedings of") and \
- not self['booktitle'].startswith("{Proceedings of"):
- errs.append("ERROR: %s's booktitle (%r) doesn't start with 'Proceedings of'" % (self.key, self['booktitle']))
-
- if self.has_key("pages") and not re.search(r'\d+--\d+', self['pages']):
- errs.append("ERROR: Misformed pages in %s"%self.key)
-
- if self.type == 'proceedings':
- if self.get('title'):
- errs.append("ERROR: %s is a proceedings: it should have a booktitle, not a title." % self.key)
-
- for field, value in self.entries.items():
- if value.translate(ALLCHARS, PRINTINGCHARS):
- errs.append("ERROR: %s.%s has non-ASCII characters"%(
- self.key, field))
- if field.startswith("www_") and field not in WWW_FIELDS:
- errs.append("ERROR: unknown www field %s"% field)
- if value.strip()[-1:] == '.' and \
- field not in ("notes", "www_remarks", "author"):
- errs.append("ERROR: %s.%s has an extraneous period"%(self.key,
- field))
- return errs
-
- def biblio_to_html(self):
- """Return the HTML for the citation portion of entry."""
- if self.type in ('inproceedings', 'incollection'):
- booktitle = self['booktitle']
- bookurl = self.get('bookurl')
- if bookurl:
- m = PROCEEDINGS_RE.match(booktitle)
- if m:
- res = ["In the ", m.group(1),
- '<a href="%s">'%bookurl, m.group(2), "</a>"]
- else:
- res = ['In the <a href="%s">%s</a>' % (bookurl,booktitle)]
- else:
- res = ["In the ", booktitle ]
-
- if self.get("edition"):
- res.append(",")
- res.append(self['edition'])
- if self.get("location"):
- res.append(", ")
- res.append(self['location'])
- elif self.get("address"):
- res.append(", ")
- res.append(self['address'])
- res.append(", %s %s" % (self.get('month',""), self['year']))
- if not self.get('pages'):
- pass
- elif "-" in self['pages']:
- res.append(", pages&nbsp;%s"%self['pages'])
- else:
- res.append(", page&nbsp;%s"%self['pages'])
- elif self.type == 'article':
- res = ["In "]
- if self.get('journalurl'):
- res.append('<a href="%s">%s</a>'%
- (self['journalurl'],self['journal']))
- else:
- res.append(self['journal'])
- if self.get('volume'):
- res.append(" <b>%s</b>"%self['volume'])
- if self.get('number'):
- res.append("(%s)"%self['number'])
- res.append(", %s %s" % (self.get('month',""), self['year']))
- if not self.get('pages'):
- pass
- elif "-" in self['pages']:
- res.append(", pages&nbsp;%s"%self['pages'])
- else:
- res.append(", page&nbsp;%s"%self['pages'])
- elif self.type == 'techreport':
- res = [ "%s %s %s" % (self['institution'],
- self.get('type', 'technical report'),
- self.get('number', "")) ]
- if self.get('month') or self.get('year'):
- res.append(", %s %s" % (self.get('month', ''),
- self.get('year', '')))
- elif self.type == 'mastersthesis' or self.type == 'phdthesis':
- if self.get('type'):
- res = [self['type']]
- elif self.type == 'mastersthesis':
- res = ["Masters's thesis"]
- else:
- res = ["Ph.D. thesis"]
- if self.get('school'):
- res.append(", %s"%(self['school']))
- if self.get('month') or self.get('year'):
- res.append(", %s %s" % (self.get('month', ''),
- self.get('year', '')))
- elif self.type == 'book':
- res = [self['publisher']]
- if self.get('year'):
- res.append(" ");
- res.append(self.get('year'));
- # res.append(", %s"%(self.get('year')))
- if self.get('series'):
- res.append(",");
- res.append(self['series']);
- elif self.type == 'misc':
- res = [self['howpublished']]
- if self.get('month') or self.get('year'):
- res.append(", %s %s" % (self.get('month', ''),
- self.get('year', '')))
- if not self.get('pages'):
- pass
- elif "-" in self['pages']:
- res.append(", pages&nbsp;%s"%self['pages'])
- else:
- res.append(", page&nbsp;%s"%self['pages'])
- else:
- res = ["&lt;Odd type %s&gt;"%self.type]
-
- res[0:0] = ["<span class='biblio'>"]
- res.append(".</span>")
-
- bibtexurl = "./bibtex.html#%s"%url_untranslate(self.key)
- res.append((" <span class='availability'>"
- "(<a href='%s'>BibTeX&nbsp;entry</a>)"
- "</span>") %bibtexurl)
- return htmlize("".join(res))
-
- def to_html(self, cache_path="./cache", base_url="."):
- """Return the HTML for this entry."""
- imp = self.isImportant()
- draft = self.get('year') == 'forthcoming'
- if imp:
- res = ["<li><div class='impEntry'><p class='impEntry'>" ]
- elif draft:
- res = ["<li><div class='draftEntry'><p class='draftEntry'>" ]
- else:
- res = ["<li><p class='entry'>"]
-
- if imp or not draft:
- # Add a picture of the rank
- # Only if year is known or paper important!
- r = rank.get_rank_html(self['title'], self.get('year'),
- update=False, base_url=base_url)
- if r is not None:
- res.append(r)
-
- res.append("<span class='title'><a name='%s'>%s</a></span>"%(
- url_untranslate(self.key),htmlize(self['title'])))
-
- for cached in 0,1:
- availability = []
- if not cached:
- for which in [ "amazon", "excerpt", "publisher" ]:
- key = "www_%s_url"%which
- if self.get(key):
- url=self[key]
- url = unTeXescapeURL(url)
- availability.append('<a href="%s">%s</a>' %(url,which))
-
- cache_section = self.get('www_cache_section', ".")
- if cache_section not in config.CACHE_SECTIONS:
- if cache_section != ".":
- print >>sys.stderr, "Unrecognized cache section %s"%(
- cache_section)
- cache_section="."
-
- for key, name, ext in (('www_abstract_url', 'abstract','abstract'),
- ('www_html_url', 'HTML', 'html'),
- ('www_pdf_url', 'PDF', 'pdf'),
- ('www_ps_url', 'PS', 'ps'),
- ('www_txt_url', 'TXT', 'txt'),
- ('www_ps_gz_url', 'gzipped&nbsp;PS','ps.gz')
- ):
- if cached:
- #XXXX the URL needs to be relative to the absolute
- #XXXX cache path.
- url = smartJoin(cache_path,cache_section,
- "%s.%s"%(self.key,ext))
- fname = smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
- cache_section,
- "%s.%s"%(self.key,ext))
- if not os.path.exists(fname): continue
- else:
- url = self.get(key)
- if not url: continue
- url = unTeXescapeURL(url)
- url = url.replace('&', '&amp;')
- availability.append('<a href="%s">%s</a>' %(url,name))
-
- if availability:
- res.append([" ", "&nbsp;"][cached])
- res.append("<span class='availability'>(")
- if cached: res.append("Cached:&nbsp;")
- res.append(",&nbsp;".join(availability))
- res.append(")</span>")
-
- res.append("<br /><span class='author'>by ")
-
- #res.append("\n<!-- %r -->\n" % self.parsedAuthor)
- htmlAuthors = [ a.htmlizeWithLink() for a in self.parsedAuthor ]
-
- if len(htmlAuthors) == 1:
- res.append(htmlAuthors[0])
- elif len(htmlAuthors) == 2:
- res.append(" and ".join(htmlAuthors))
- else:
- res.append(", ".join(htmlAuthors[:-1]))
- res.append(", and ")
- res.append(htmlAuthors[-1])
- if res[-1][-1] != '.':
- res.append(".")
- res.append("</span><br />\n")
- res.append(self.biblio_to_html())
- res.append("<a href='#%s'>&middot;</a>"%url_untranslate(self.key))
- res.append("</p>")
- if self.get('www_remarks'):
- res.append("<p class='remarks'>%s</p>"%htmlize(
- self['www_remarks']))
- if imp or draft:
- res.append("</div>")
- res.append("</li>\n\n")
-
- return "".join(res)
-
-def unTeXescapeURL(s):
- """Turn a URL as formatted in TeX into a real URL."""
- s = s.replace("\\_", "_")
- s = s.replace("\\-", "")
- s = s.replace("\{}", "")
- s = s.replace("{}", "")
- return s
-
-def TeXescapeURL(s):
- """Escape a URL for use in TeX"""
- s = s.replace("_", "\\_")
- s = s.replace("~", "\{}~")
- return s
-
-RE_LONE_AMP = re.compile(r'&([^a-z0-9])')
-RE_LONE_I = re.compile(r'\\i([^a-z0-9])')
-RE_ACCENT = re.compile(r'\\([\'`~^"c])([^{]|{.})')
-RE_LIGATURE = re.compile(r'\\(AE|ae|OE|oe|AA|aa|O|o|ss)([^a-z0-9])')
-ACCENT_MAP = { "'" : 'acute',
- "`" : 'grave',
- "~" : 'tilde',
- "^" : 'circ',
- '"' : 'uml',
- "c" : 'cedil',
- }
-UNICODE_MAP = { '&nacute;' : '&#x0144;', }
-HTML_LIGATURE_MAP = {
- 'AE' : '&AElig;',
- 'ae' : '&aelig;',
- 'OE' : '&OElig;',
- 'oe' : '&oelig;',
- 'AA' : '&Aring;',
- 'aa' : '&aring;',
- 'O' : '&Oslash;',
- 'o' : '&oslash;',
- 'ss' : '&szlig;',
- }
-RE_TEX_CMD = re.compile(r"(?:\\[a-zA-Z@]+|\\.)")
-RE_PAGE_SPAN = re.compile(r"(\d)--(\d)")
-def _unaccent(m):
- accent,char = m.groups()
- if char[0] == '{':
- char = char[1]
- accented = "&%s%s;" % (char, ACCENT_MAP[accent])
- return UNICODE_MAP.get(accented, accented)
-def _unlig_html(m):
- return "%s%s"%(HTML_LIGATURE_MAP[m.group(1)],m.group(2))
-def htmlize(s):
- """Turn a TeX string into good-looking HTML."""
- s = RE_LONE_AMP.sub(lambda m: "&amp;%s" % m.group(1), s)
- s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
- s = RE_ACCENT.sub(_unaccent, s)
- s = unTeXescapeURL(s)
- s = RE_LIGATURE.sub(_unlig_html, s);
- s = RE_TEX_CMD.sub("", s)
- s = s.translate(ALLCHARS, "{}")
- s = RE_PAGE_SPAN.sub(lambda m: "%s-%s"%(m.groups()), s)
- s = s.replace("---", "&mdash;");
- s = s.replace("--", "&ndash;");
- return s
-
-def author_url(author):
- """Given an author's name, return a URL for his/her homepage."""
- for pat, url in config.AUTHOR_RE_LIST:
- if pat.search(author):
- return url
- return None
-
-def txtize(s):
- """Turn a TeX string into decnent plaintext."""
- s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
- s = RE_ACCENT.sub(lambda m: "%s" % m.group(2), s)
- s = RE_LIGATURE.sub(lambda m: "%s%s"%m.groups(), s)
- s = RE_TEX_CMD.sub("", s)
- s = s.translate(ALLCHARS, "{}")
- return s
-
-PROCEEDINGS_RE = re.compile(
- r'((?:proceedings|workshop record) of(?: the)? )(.*)',
- re.I)
-
-class ParsedAuthor:
- """The parsed name of an author.
-
- Eddie deserves credit for this incredibly hairy business.
- """
- def __init__(self, first, von, last, jr):
- self.first = first
- self.von = von
- self.last = last
- self.jr = jr
- self.collapsable = 1
-
- self.html = htmlize(str(self))
- self.txt = txtize(str(self))
-
- s = self.html
- for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
- if pat.search(s):
- self.collapsable = 0
- break
-
- def __eq__(self, o):
- return ((self.first == o.first) and
- (self.last == o.last) and
- (self.von == o.von) and
- (self.jr == o.jr))
-
- def __hash__(self):
- return hash(repr(self))
-
- def collapsesTo(self, o):
- """Return true iff 'o' could be a more canonical version of this author
- """
- if not self.collapsable or not o.collapsable:
- return self
-
- if self.last != o.last or self.von != o.von or self.jr != o.jr:
- return self
- if not self.first:
- return o
-
- if len(self.first) == len(o.first):
- n = []
- for a,b in zip(self.first, o.first):
- if a == b:
- n.append(a)
- elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
- n.append(b)
- elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
- n.append(a)
- else:
- return self
- if n == self.first:
- return self
- elif n == o.first:
- return o
- else:
- return self
- else:
- realname = max([len(n) for n in self.first+o.first])>2
- if not realname:
- return self
-
- if len(self.first) < len(o.first):
- short = self.first; long = o.first
- else:
- short = o.first; long = self.first
-
- initials_s = "".join([n[0] for n in short])
- initials_l = "".join([n[0] for n in long])
- idx = initials_l.find(initials_s)
- if idx < 0:
- return self
- n = long[:idx]
- for i in range(idx, idx+len(short)):
- a = long[i]; b = short[i-idx]
- if a == b:
- n.append(a)
- elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
- n.append(b)
- elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
- n.append(a)
- else:
- return self
- n += long[idx+len(short):]
-
- if n == self.first:
- return self
- elif n == o.first:
- return o
- else:
- return self
-
- def __repr__(self):
- return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
- self.last,self.jr)
- def __str__(self):
- a = " ".join(self.first+self.von+self.last)
- if self.jr:
- return "%s, %s" % (a,self.jr)
- return a
-
- def getHomepage(self):
- s = self.html
- for pat, url in config.AUTHOR_RE_LIST:
- if pat.search(s):
- return url
- return None
-
- def getSortingName(self):
- """Return a representation of this author's name in von-last-first-jr
- order, unless overridden by ALPH """
- s = self.html
- for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
- if pat.search(s):
- return v
-
- return txtize(" ".join(self.von+self.last+self.first+self.jr))
-
- def getSectionName(self):
- """Return a HTML representation of this author's name in
- last, first von, jr order"""
- secname = " ".join(self.last)
- more = self.first+self.von
- if more:
- secname += ", "+" ".join(more)
- if self.jr:
- secname += ", "+" ".join(self.jr)
- secname = htmlize(secname)
- return secname
-
- def htmlizeWithLink(self):
- a = self.html
- u = self.getHomepage()
- if u:
- return "<a href='%s'>%s</a>"%(u,a)
- else:
- return a
def _split(s,w=79,indent=8):
r = []
@@ -886,105 +265,7 @@ class FileIter:
return self._next()
-def parseAuthor(s):
- try:
- return _parseAuthor(s)
- except:
- print >>sys.stderr, "Internal error while parsing author %r"%s
- raise
-
-def _parseAuthor(s):
- """Take an author string and return a list of ParsedAuthor."""
- items = []
-
- s = s.strip()
- while s:
- s = s.strip()
- bracelevel = 0
- for i in xrange(len(s)):
- if s[i] == '{':
- bracelevel += 1
- elif s[i] == '}':
- bracelevel -= 1
- elif bracelevel <= 0 and s[i] in " \t\n,":
- break
- if i+1 == len(s):
- items.append(s)
- else:
- items.append(s[0:i])
- if (s[i] == ','):
- items.append(',')
- s = s[i+1:]
-
- authors = [[]]
- for item in items:
- if item == 'and':
- authors.append([])
- else:
- authors[-1].append(item)
-
- parsedAuthors = []
- # Split into first, von, last, jr
- for author in authors:
- commas = 0
- fvl = []
- vl = []
- f = []
- v = []
- l = []
- j = []
- cur = fvl
- for item in author:
- if item == ',':
- if commas == 0:
- vl = fvl
- fvl = []
- cur = f
- else:
- j.extend(f)
- cur = f = []
- commas += 1
- else:
- cur.append(item)
- if commas == 0:
- split_von(f,v,l,fvl)
- else:
- f_tmp = []
- split_von(f_tmp,v,l,vl)
-
- parsedAuthors.append(ParsedAuthor(f,v,l,j))
-
- return parsedAuthors
-
-ALLCHARS = "".join(map(chr,range(256)))
-PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127)))
-LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
-SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
- "abcdefghijklmnopqrstuvwxyz"
- "@")
-RE_ESCAPED = re.compile(r'\\.')
-def split_von(f,v,l,x):
- in_von = 0
- while x:
- tt = t = x[0]
- del x[0]
- if tt[:2] == '{\\':
- tt = tt.translate(ALLCHARS, SV_DELCHARS)
- tt = RE_ESCAPED.sub("", tt)
- tt = tt.translate(ALLCHARS, "{}")
- if tt.translate(ALLCHARS, LC_CHARS) == "":
- v.append(t)
- in_von = 1
- elif in_von and f is not None:
- l.append(t)
- l.extend(x)
- return
- else:
- f.append(t)
- if not in_von:
- l.append(f[-1])
- del f[-1]
class Parser:
@@ -1016,7 +297,7 @@ class Parser:
def _parseKey(self, line):
it = self.fileiter
- line = _advance(it,line)
+ line = _advance(it, line)
m = KEY_RE.match(line)
if not m:
raise ParseError("Expected key at line %s"%self.fileiter.lineno)
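With the entry and string helpers moved out, BibTeX.py is left with the
container, the parser, and the splitting/sorting helpers (renamed from
splitBibTeXEntriesBy/sortBibTexEntriesBy to splitEntriesBy/sortEntriesBy in
__all__ above). A hedged usage sketch: the input filename is hypothetical,
and parseFile is assumed to take a path and return a BibTeX instance, as its
presence in __all__ suggests.

    import BibTeX

    bib = BibTeX.parseFile("anonbib.bib")   # hypothetical .bib file
    bib.resolve()                           # validate entries, fix crossrefs
    by_date = BibTeX.sortEntriesByDate(bib.entries)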
diff --git a/entry.py b/entry.py
new file mode 100644
index 0000000..9846e32
--- /dev/null
+++ b/entry.py
@@ -0,0 +1,653 @@
+import rank
+import sys
+import re
+import config
+import os
+from utils import htmlize, txtize, url_untranslate, unTeXescapeURL, smartJoin,\
+ _split
+
+# Fields that we only care about for making web pages (BibTeX doesn't
+# recognize them.)
+WWW_FIELDS = ['www_section', 'www_important', 'www_remarks',
+ 'www_abstract_url', 'www_html_url', 'www_pdf_url', 'www_ps_url',
+ 'www_txt_url', 'www_ps_gz_url', 'www_amazon_url',
+ 'www_excerpt_url', 'www_publisher_url',
+ 'www_cache_section', 'www_tags']
+
+def author_url(author):
+ """Given an author's name, return a URL for his/her homepage."""
+ for pat, url in config.AUTHOR_RE_LIST:
+ if pat.search(author):
+ return url
+ return None
+ALLCHARS = "".join(map(chr,range(256)))
+PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127)))
+LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
+SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "@")
+RE_ESCAPED = re.compile(r'\\.')
+PROCEEDINGS_RE = re.compile(
+ r'((?:proceedings|workshop record) of(?: the)? )(.*)',
+ re.I)
+
+def split_von(f,v,l,x):
+ in_von = 0
+ while x:
+ tt = t = x[0]
+ del x[0]
+ if tt[:2] == '{\\':
+ tt = tt.translate(ALLCHARS, SV_DELCHARS)
+ tt = RE_ESCAPED.sub("", tt)
+ tt = tt.translate(ALLCHARS, "{}")
+ if tt.translate(ALLCHARS, LC_CHARS) == "":
+ v.append(t)
+ in_von = 1
+ elif in_von and f is not None:
+ l.append(t)
+ l.extend(x)
+ return
+ else:
+ f.append(t)
+ if not in_von:
+ l.append(f[-1])
+ del f[-1]
+
+def buildAuthorTable(entries):
+ """Given a list of BibTeXEntry, return a map from parsed author name to
+ parsed canonical name.
+ """
+ authorsByLast = {}
+ for e in entries:
+ for a in e.parsedAuthor:
+ authorsByLast.setdefault(tuple(a.last), []).append(a)
+ # map from author to collapsed author.
+ result = {}
+ for k,v in config.COLLAPSE_AUTHORS.items():
+ a = parseAuthor(k)[0]
+ c = parseAuthor(v)[0]
+ result[c] = c
+ result[a] = c
+
+ for e in entries:
+ for author in e.parsedAuthor:
+ if result.has_key(author):
+ continue
+
+ c = author
+ for a in authorsByLast[tuple(author.last)]:
+ if a is author:
+ continue
+ c = c.collapsesTo(a)
+ result[author] = c
+
+ if 0:
+ for a,c in result.items():
+ if a != c:
+ print "Collapsing authors: %s => %s" % (a,c)
+ if 0:
+ print parseAuthor("Franz Kaashoek")[0].collapsesTo(
+ parseAuthor("M. Franz Kaashoek")[0])
+ print parseAuthor("Paul F. Syverson")[0].collapsesTo(
+ parseAuthor("Paul Syverson")[0])
+ print parseAuthor("Paul Syverson")[0].collapsesTo(
+ parseAuthor("Paul F. Syverson")[0])
+
+ return result
+
+# List of fields that appear when we display the entries as BibTeX.
+DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle',
+'school', 'institution', 'organization', 'volume', 'number', 'year',
+'month', 'address', 'location', 'chapter', 'edition', 'pages', 'editor',
+'howpublished', 'key', 'publisher', 'type', 'note', 'series' ]
+
+class BibTeXEntry:
+ """A single BibTeX entry."""
+ def __init__(self, type, key, entries):
+ self.type = type # What kind of entry is it? (@book,@injournal,etc)
+ self.key = key # What key does it have?
+ self.entries = entries # Map from key to value.
+ self.entryLine = 0 # Defined on this line number
+ def get(self, k, v=None):
+ return self.entries.get(k,v)
+ def has_key(self, k):
+ return self.entries.has_key(k)
+ def __getitem__(self, k):
+ return self.entries[k]
+ def __setitem__(self, k, v):
+ self.entries[k] = v
+ def __str__(self):
+ return self.format(70,1)
+ def getURL(self):
+ """Return the best URL to use for this paper, or None."""
+ best = None
+ for field in ['www_pdf_url', 'www_ps_gz_url', 'www_ps_url',
+ 'www_html_url', 'www_txt_url', ]:
+ u = self.get(field)
+ if u:
+ if not best:
+ best = u
+ elif (best.startswith("http://citeseer.nj.nec.com/")
+ and not u.startswith("http://citeseer.nj.nec.com/")):
+ best = u
+ return best
+
+ def format(self, width=70, indent=8, v=0, invStrings={}):
+ """Format this entry as BibTeX."""
+ d = ["@%s{%s,\n" % (self.type, self.key)]
+ if v:
+ df = DISPLAYED_FIELDS[:]
+ for k in self.entries.keys():
+ if k not in df:
+ df.append(k)
+ else:
+ df = DISPLAYED_FIELDS
+ for f in df:
+ if not self.entries.has_key(f):
+ continue
+ v = self.entries[f]
+ if v.startswith("<span class='bad'>"):
+ d.append("%%%%% ERROR: Missing field\n")
+ d.append("%% %s = {?????},\n"%f)
+ continue
+ np = v.translate(ALLCHARS, PRINTINGCHARS)
+ if np:
+ d.append("%%%%% "+("ERROR: Non-ASCII characters: '%r'\n"%np))
+ d.append(" ")
+ v = v.replace("&", "&amp;")
+ if invStrings.has_key(v):
+ s = "%s = %s,\n" %(f, invStrings[v])
+ else:
+ s = "%s = {%s},\n" % (f, v)
+ d.append(_split(s,width,indent))
+ d.append("}\n")
+ return "".join(d)
+ def resolve(self):
+ """Handle post-processing for this entry"""
+ a = self.get('author')
+ if a:
+ self.parsedAuthor = parseAuthor(a)
+ #print a
+ #print " => ",repr(self.parsedAuthor)
+ else:
+ self.parsedAuthor = None
+
+ def isImportant(self):
+ """Return 1 iff this entry is marked as important"""
+ imp = self.get("www_important")
+ if imp and imp.strip().lower() not in ("no", "false", "0"):
+ return 1
+ return 0
+
+ def check(self):
+ """Print any errors for this entry, and return true if there were
+ none."""
+ errs = self._check()
+ for e in errs:
+ print e
+ return not errs
+
+ def _check(self):
+ errs = []
+ if self.type == 'inproceedings':
+ fields = 'booktitle', 'year'
+ elif self.type == 'incollection':
+ fields = 'booktitle', 'year'
+ elif self.type == 'proceedings':
+ fields = 'booktitle', 'editor'
+ elif self.type == 'article':
+ fields = 'journal', 'year'
+ elif self.type == 'techreport':
+ fields = 'institution',
+ elif self.type == 'misc':
+ fields = 'howpublished',
+ elif self.type in ('mastersthesis', 'phdthesis'):
+ fields = ()
+ else:
+ fields = ()
+ errs.append("ERROR: odd type %s"%self.type)
+ if self.type != 'proceedings':
+ fields += 'title', 'author', 'www_section', 'year'
+
+ for field in fields:
+ if self.get(field) is None or \
+ self.get(field).startswith("<span class='bad'>"):
+ errs.append("ERROR: %s has no %s field" % (self.key, field))
+ self.entries[field] = "<span class='bad'>%s:??</span>"%field
+
+ if self.type == 'inproceedings':
+ if self.get("booktitle"):
+ if not self['booktitle'].startswith("Proceedings of") and \
+ not self['booktitle'].startswith("{Proceedings of"):
+ errs.append("ERROR: %s's booktitle (%r) doesn't start with 'Proceedings of'" % (self.key, self['booktitle']))
+
+ if self.has_key("pages") and not re.search(r'\d+--\d+', self['pages']):
+ errs.append("ERROR: Misformed pages in %s"%self.key)
+
+ if self.type == 'proceedings':
+ if self.get('title'):
+ errs.append("ERROR: %s is a proceedings: it should have a booktitle, not a title." % self.key)
+
+ for field, value in self.entries.items():
+ if value.translate(ALLCHARS, PRINTINGCHARS):
+ errs.append("ERROR: %s.%s has non-ASCII characters"%(
+ self.key, field))
+ if field.startswith("www_") and field not in WWW_FIELDS:
+ errs.append("ERROR: unknown www field %s"% field)
+ if value.strip()[-1:] == '.' and \
+ field not in ("notes", "www_remarks", "author"):
+ errs.append("ERROR: %s.%s has an extraneous period"%(self.key,
+ field))
+ return errs
+
+ def biblio_to_html(self):
+ """Return the HTML for the citation portion of entry."""
+ if self.type in ('inproceedings', 'incollection'):
+ booktitle = self['booktitle']
+ bookurl = self.get('bookurl')
+ if bookurl:
+ m = PROCEEDINGS_RE.match(booktitle)
+ if m:
+ res = ["In the ", m.group(1),
+ '<a href="%s">' % bookurl, m.group(2), "</a>"]
+ else:
+ res = ['In the <a href="%s">%s</a>' % (bookurl, booktitle)]
+ else:
+ res = ["In the ", booktitle]
+
+ if self.get("edition"):
+ res.append(",")
+ res.append(self['edition'])
+ if self.get("location"):
+ res.append(", ")
+ res.append(self['location'])
+ elif self.get("address"):
+ res.append(", ")
+ res.append(self['address'])
+ res.append(", %s %s" % (self.get('month', ""), self['year']))
+ if not self.get('pages'):
+ pass
+ elif "-" in self['pages']:
+ res.append(", pages&nbsp;%s" % self['pages'])
+ else:
+ res.append(", page&nbsp;%s" % self['pages'])
+ elif self.type == 'article':
+ res = ["In "]
+ if self.get('journalurl'):
+ res.append('<a href="%s">%s</a>' % (self['journalurl'],
+ self['journal']))
+ else:
+ res.append(self['journal'])
+ if self.get('volume'):
+ res.append(" <b>%s</b>" % self['volume'])
+ if self.get('number'):
+ res.append("(%s)" % self['number'])
+ res.append(", %s %s" % (self.get('month', ""), self['year']))
+ if not self.get('pages'):
+ pass
+ elif "-" in self['pages']:
+ res.append(", pages&nbsp;%s" % self['pages'])
+ else:
+ res.append(", page&nbsp;%s" % self['pages'])
+ elif self.type == 'techreport':
+ res = ["%s %s %s" % (self['institution'],
+ self.get('type', 'technical report'),
+ self.get('number', ""))]
+ if self.get('month') or self.get('year'):
+ res.append(", %s %s" % (self.get('month', ''),
+ self.get('year', '')))
+ elif self.type == 'mastersthesis' or self.type == 'phdthesis':
+ if self.get('type'):
+ res = [self['type']]
+ elif self.type == 'mastersthesis':
+ res = ["Masters's thesis"]
+ else:
+ res = ["Ph.D. thesis"]
+ if self.get('school'):
+ res.append(", %s" % (self['school']))
+ if self.get('month') or self.get('year'):
+ res.append(", %s %s" % (self.get('month', ''),
+ self.get('year', '')))
+ elif self.type == 'book':
+ res = [self['publisher']]
+ if self.get('year'):
+ res.append(" ")
+ res.append(self.get('year'))
+ # res.append(", %s"%(self.get('year')))
+ if self.get('series'):
+ res.append(",")
+ res.append(self['series'])
+ elif self.type == 'misc':
+ res = [self['howpublished']]
+ if self.get('month') or self.get('year'):
+ res.append(", %s %s" % (self.get('month', ''),
+ self.get('year', '')))
+ if not self.get('pages'):
+ pass
+ elif "-" in self['pages']:
+ res.append(", pages&nbsp;%s" % self['pages'])
+ else:
+ res.append(", page&nbsp;%s" % self['pages'])
+ else:
+ res = ["&lt;Odd type %s&gt;" % self.type]
+
+ res[0:0] = ["<span class='biblio'>"]
+ res.append(".</span>")
+
+ bibtexurl = "./bibtex.html#%s" % url_untranslate(self.key)
+ res.append((" <span class='availability'>"
+ "(<a href='%s'>BibTeX&nbsp;entry</a>)"
+ "</span>") % bibtexurl)
+ return htmlize("".join(res))
+
+ def to_html(self, cache_path="./cache", base_url="."):
+ """Return the HTML for this entry."""
+ imp = self.isImportant()
+ draft = self.get('year') == 'forthcoming'
+ if imp:
+ res = ["<li><div class='impEntry'><p class='impEntry'>"]
+ elif draft:
+ res = ["<li><div class='draftEntry'><p class='draftEntry'>"]
+ else:
+ res = ["<li><p class='entry'>"]
+
+ if imp or not draft:
+ # Add a picture of the rank
+ # Only if year is known or paper important!
+ r = rank.get_rank_html(self['title'], self.get('year'),
+ update=False, base_url=base_url)
+ if r is not None:
+ res.append(r)
+
+ res.append("<span class='title'><a name='%s'>%s</a></span>"%(
+ url_untranslate(self.key),htmlize(self['title'])))
+
+ for cached in 0,1:
+ availability = []
+ if not cached:
+ for which in [ "amazon", "excerpt", "publisher" ]:
+ key = "www_%s_url"%which
+ if self.get(key):
+ url=self[key]
+ url = unTeXescapeURL(url)
+ availability.append('<a href="%s">%s</a>' %(url,which))
+
+ cache_section = self.get('www_cache_section', ".")
+ if cache_section not in config.CACHE_SECTIONS:
+ if cache_section != ".":
+ print >>sys.stderr, "Unrecognized cache section %s"%(
+ cache_section)
+ cache_section="."
+
+ for key, name, ext in (('www_abstract_url', 'abstract','abstract'),
+ ('www_html_url', 'HTML', 'html'),
+ ('www_pdf_url', 'PDF', 'pdf'),
+ ('www_ps_url', 'PS', 'ps'),
+ ('www_txt_url', 'TXT', 'txt'),
+ ('www_ps_gz_url', 'gzipped&nbsp;PS','ps.gz')
+ ):
+ if cached:
+ #XXXX the URL needs to be relative to the absolute
+ #XXXX cache path.
+ url = smartJoin(cache_path,cache_section,
+ "%s.%s"%(self.key,ext))
+ fname = smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
+ cache_section,
+ "%s.%s"%(self.key,ext))
+ if not os.path.exists(fname): continue
+ else:
+ url = self.get(key)
+ if not url: continue
+ url = unTeXescapeURL(url)
+ url = url.replace('&', '&amp;')
+ availability.append('<a href="%s">%s</a>' %(url,name))
+
+ if availability:
+ res.append([" ", "&nbsp;"][cached])
+ res.append("<span class='availability'>(")
+ if cached: res.append("Cached:&nbsp;")
+ res.append(",&nbsp;".join(availability))
+ res.append(")</span>")
+
+ res.append("<br /><span class='author'>by ")
+
+ #res.append("\n<!-- %r -->\n" % self.parsedAuthor)
+ htmlAuthors = [ a.htmlizeWithLink() for a in self.parsedAuthor ]
+
+ if len(htmlAuthors) == 1:
+ res.append(htmlAuthors[0])
+ elif len(htmlAuthors) == 2:
+ res.append(" and ".join(htmlAuthors))
+ else:
+ res.append(", ".join(htmlAuthors[:-1]))
+ res.append(", and ")
+ res.append(htmlAuthors[-1])
+
+ if res[-1][-1] != '.':
+ res.append(".")
+ res.append("</span><br />\n")
+ res.append(self.biblio_to_html())
+ res.append("<a href='#%s'>&middot;</a>"%url_untranslate(self.key))
+ res.append("</p>")
+
+ if self.get('www_remarks'):
+ res.append("<p class='remarks'>%s</p>"%htmlize(
+ self['www_remarks']))
+
+ if imp or draft:
+ res.append("</div>")
+ res.append("</li>\n\n")
+
+ return "".join(res)
+
+
+class ParsedAuthor:
+ """The parsed name of an author.
+
+ Eddie deserves credit for this incredibly hairy business.
+ """
+ def __init__(self, first, von, last, jr):
+ self.first = first
+ self.von = von
+ self.last = last
+ self.jr = jr
+ self.collapsable = 1
+
+ self.html = htmlize(str(self))
+ self.txt = txtize(str(self))
+
+ s = self.html
+ for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
+ if pat.search(s):
+ self.collapsable = 0
+ break
+
+ def __eq__(self, o):
+ return ((self.first == o.first) and
+ (self.last == o.last) and
+ (self.von == o.von) and
+ (self.jr == o.jr))
+
+ def __hash__(self):
+ return hash(repr(self))
+
+ def collapsesTo(self, o):
+ """Return true iff 'o' could be a more canonical version of this author
+ """
+ if not self.collapsable or not o.collapsable:
+ return self
+
+ if self.last != o.last or self.von != o.von or self.jr != o.jr:
+ return self
+ if not self.first:
+ return o
+
+ if len(self.first) == len(o.first):
+ n = []
+ for a,b in zip(self.first, o.first):
+ if a == b:
+ n.append(a)
+ elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+ n.append(b)
+ elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+ n.append(a)
+ else:
+ return self
+ if n == self.first:
+ return self
+ elif n == o.first:
+ return o
+ else:
+ return self
+ else:
+ realname = max([len(n) for n in self.first+o.first])>2
+ if not realname:
+ return self
+
+ if len(self.first) < len(o.first):
+ short = self.first; long = o.first
+ else:
+ short = o.first; long = self.first
+
+ initials_s = "".join([n[0] for n in short])
+ initials_l = "".join([n[0] for n in long])
+ idx = initials_l.find(initials_s)
+ if idx < 0:
+ return self
+ n = long[:idx]
+ for i in range(idx, idx+len(short)):
+ a = long[i]; b = short[i-idx]
+ if a == b:
+ n.append(a)
+ elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+ n.append(b)
+ elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+ n.append(a)
+ else:
+ return self
+ n += long[idx+len(short):]
+
+ if n == self.first:
+ return self
+ elif n == o.first:
+ return o
+ else:
+ return self
+
+ def __repr__(self):
+ return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
+ self.last,self.jr)
+ def __str__(self):
+ a = " ".join(self.first+self.von+self.last)
+ if self.jr:
+ return "%s, %s" % (a,self.jr)
+ return a
+
+ def getHomepage(self):
+ s = self.html
+ for pat, url in config.AUTHOR_RE_LIST:
+ if pat.search(s):
+ return url
+ return None
+
+ def getSortingName(self):
+ """Return a representation of this author's name in von-last-first-jr
+ order, unless overridden by ALPH """
+ s = self.html
+ for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
+ if pat.search(s):
+ return v
+
+ return txtize(" ".join(self.von+self.last+self.first+self.jr))
+
+ def getSectionName(self):
+ """Return a HTML representation of this author's name in
+ last, first von, jr order"""
+ secname = " ".join(self.last)
+ more = self.first+self.von
+ if more:
+ secname += ", "+" ".join(more)
+ if self.jr:
+ secname += ", "+" ".join(self.jr)
+ secname = htmlize(secname)
+ return secname
+
+ def htmlizeWithLink(self):
+ a = self.html
+ u = self.getHomepage()
+ if u:
+ return "<a href='%s'>%s</a>"%(u,a)
+ else:
+ return a
+
+
+def parseAuthor(s):
+ try:
+ return _parseAuthor(s)
+ except:
+ print >>sys.stderr, "Internal error while parsing author %r"%s
+ raise
+
+def _parseAuthor(s):
+ """Take an author string and return a list of ParsedAuthor."""
+ items = []
+
+ s = s.strip()
+ while s:
+ s = s.strip()
+ bracelevel = 0
+ for i in xrange(len(s)):
+ if s[i] == '{':
+ bracelevel += 1
+ elif s[i] == '}':
+ bracelevel -= 1
+ elif bracelevel <= 0 and s[i] in " \t\n,":
+ break
+ if i+1 == len(s):
+ items.append(s)
+ else:
+ items.append(s[0:i])
+ if (s[i] == ','):
+ items.append(',')
+ s = s[i+1:]
+
+ authors = [[]]
+ for item in items:
+ if item == 'and':
+ authors.append([])
+ else:
+ authors[-1].append(item)
+
+ parsedAuthors = []
+ # Split into first, von, last, jr
+ for author in authors:
+ commas = 0
+ fvl = []
+ vl = []
+ f = []
+ v = []
+ l = []
+ j = []
+ cur = fvl
+ for item in author:
+ if item == ',':
+ if commas == 0:
+ vl = fvl
+ fvl = []
+ cur = f
+ else:
+ j.extend(f)
+ cur = f = []
+ commas += 1
+ else:
+ cur.append(item)
+
+ if commas == 0:
+ split_von(f,v,l,fvl)
+ else:
+ f_tmp = []
+ split_von(f_tmp,v,l,vl)
+
+ parsedAuthors.append(ParsedAuthor(f,v,l,j))
+
+ return parsedAuthors
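The trickiest code that moved is ParsedAuthor.collapsesTo, which merges
initials with fuller name forms. A sketch using the examples from the
disabled debug block in buildAuthorTable (entry.py needs the same config
module the real code imports):

    from entry import parseAuthor

    full = parseAuthor("Paul F. Syverson")[0]
    abbr = parseAuthor("Paul Syverson")[0]
    # collapsesTo returns the more canonical ParsedAuthor, not a boolean:
    print abbr.collapsesTo(full)   # Paul F. Syverson
    print full.collapsesTo(abbr)   # Paul F. Syverson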
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..4d4b583
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,118 @@
+import re
+import os
+
+ALLCHARS = "".join(map(chr,range(256)))
+RE_LONE_AMP = re.compile(r'&([^a-z0-9])')
+RE_LONE_I = re.compile(r'\\i([^a-z0-9])')
+RE_ACCENT = re.compile(r'\\([\'`~^"c])([^{]|{.})')
+RE_LIGATURE = re.compile(r'\\(AE|ae|OE|oe|AA|aa|O|o|ss)([^a-z0-9])')
+ACCENT_MAP = { "'" : 'acute',
+ "`" : 'grave',
+ "~" : 'tilde',
+ "^" : 'circ',
+ '"' : 'uml',
+ "c" : 'cedil',
+ }
+
+UNICODE_MAP = { '&nacute;' : '&#x0144;', }
+HTML_LIGATURE_MAP = {
+ 'AE' : '&AElig;',
+ 'ae' : '&aelig;',
+ 'OE' : '&OElig;',
+ 'oe' : '&oelig;',
+ 'AA' : '&Aring;',
+ 'aa' : '&aring;',
+ 'O' : '&Oslash;',
+ 'o' : '&oslash;',
+ 'ss' : '&szlig;',
+ }
+RE_TEX_CMD = re.compile(r"(?:\\[a-zA-Z@]+|\\.)")
+RE_PAGE_SPAN = re.compile(r"(\d)--(\d)")
+
+def url_untranslate(s):
+ """Change a BibTeX key into a string suitable for use in a URL."""
+ s = re.sub(r'([%<>`#, &_\';])', lambda m: "_%02x" % ord(m.group(1)), s)
+ s = s.replace("/", ":")
+ return s
+
+def txtize(s):
+ """Turn a TeX string into decnent plaintext."""
+ s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
+ s = RE_ACCENT.sub(lambda m: "%s" % m.group(2), s)
+ s = RE_LIGATURE.sub(lambda m: "%s%s"%m.groups(), s)
+ s = RE_TEX_CMD.sub("", s)
+ s = s.translate(ALLCHARS, "{}")
+ return s
+
+def unTeXescapeURL(s):
+ """Turn a URL as formatted in TeX into a real URL."""
+ s = s.replace("\\_", "_")
+ s = s.replace("\\-", "")
+ s = s.replace("\{}", "")
+ s = s.replace("{}", "")
+ return s
+
+def TeXescapeURL(s):
+ """Escape a URL for use in TeX"""
+ s = s.replace("_", "\\_")
+ s = s.replace("~", "\{}~")
+ return s
+
+def _unaccent(m):
+ accent,char = m.groups()
+ if char[0] == '{':
+ char = char[1]
+ accented = "&%s%s;" % (char, ACCENT_MAP[accent])
+ return UNICODE_MAP.get(accented, accented)
+
+def _unlig_html(m):
+ return "%s%s"%(HTML_LIGATURE_MAP[m.group(1)],m.group(2))
+
+def htmlize(s):
+ """Turn a TeX string into good-looking HTML."""
+ s = RE_LONE_AMP.sub(lambda m: "&amp;%s" % m.group(1), s)
+ s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
+ s = RE_ACCENT.sub(_unaccent, s)
+ s = unTeXescapeURL(s)
+    s = RE_LIGATURE.sub(_unlig_html, s)
+ s = RE_TEX_CMD.sub("", s)
+ s = s.translate(ALLCHARS, "{}")
+ s = RE_PAGE_SPAN.sub(lambda m: "%s-%s"%(m.groups()), s)
+ s = s.replace("---", "&mdash;");
+ s = s.replace("--", "&ndash;");
+ return s
+
+def smartJoin(*lst):
+ """Equivalent to os.path.join, but handle"." and ".." entries a bit better.
+ """
+ lst = [item for item in lst if item != "."]
+ idx = 0
+ while idx < len(lst):
+ if idx > 0 and lst[idx] == "..":
+ del lst[idx]
+ else:
+ idx += 1
+ return os.path.join(*lst)
+
+def _split(s,w=79,indent=8):
+ r = []
+ s = re.sub(r"\s+", " ", s)
+ first = 1
+ indentation = ""
+ while len(s) > w:
+ for i in xrange(w-1, 20, -1):
+ if s[i] == ' ':
+ r.append(indentation+s[:i])
+ s = s[i+1:]
+ break
+ else:
+ r.append(indentation+s.strip())
+ s = ""
+ if first:
+ first = 0
+ w -= indent
+ indentation = " "*indent
+ if (s):
+ r.append(indentation+s)
+ r.append("")
+ return "\n".join(r)