author     Thibaut Horel <thibaut.horel@gmail.com>  2016-02-04 19:46:04 -0500
committer  Thibaut Horel <thibaut.horel@gmail.com>  2016-02-04 19:46:04 -0500
commit     871c61c6b4351d4a9dd78ba1d70d6e1af8ffe1e7 (patch)
tree       99bce3e74cbcff075dcb6bceacd0f2e1133bef4d
parent     fd20589a448cd19d036f18cabb1663c33a24375d (diff)
download   anonbib-871c61c6b4351d4a9dd78ba1d70d6e1af8ffe1e7.tar.gz
Start cleaning: PEP8 and split the BibTeX.py monster
-rw-r--r--  BibTeX.py  755
-rw-r--r--  entry.py   653
-rw-r--r--  utils.py   118
3 files changed, 789 insertions(+), 737 deletions(-)
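This commit splits the old BibTeX.py into three modules: BibTeX.py keeps the
parser and the file-level BibTeX container, entry.py takes BibTeXEntry,
ParsedAuthor and the author-name machinery, and utils.py takes the TeX-to-HTML
and path helpers. A minimal sketch of the post-split imports (module and
function names are taken from the diff below; no other renames are assumed):

    from BibTeX import BibTeX, Parser, parseFile    # parsing and containers
    from entry import BibTeXEntry, parseAuthor      # entries and author names
    from utils import htmlize, txtize, smartJoin    # TeX/HTML/path helpers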
diff --git a/BibTeX.py b/BibTeX.py
index e076200..d0f5624 100644
--- a/BibTeX.py
+++ b/BibTeX.py
@@ -14,63 +14,43 @@ import copy
import config
-import rank
+from entry import BibTeXEntry, buildAuthorTable
+from utils import txtize, url_untranslate, smartJoin
-__all__ = [ 'ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize',
- 'ParsedAuthor', 'FileIter', 'Parser', 'parseFile',
- 'splitBibTeXEntriesBy', 'sortBibTexEntriesBy', ]
+__all__ = ['ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize',
+ 'ParsedAuthor', 'FileIter', 'Parser', 'parseFile',
+ 'splitEntriesBy', 'sortEntriesBy']
# List: must map from month number to month name.
-MONTHS = [ None,
- "January", "February", "March", "April", "May", "June",
- "July", "August", "September", "October", "November", "December"]
+MONTHS = [None, "January", "February", "March", "April", "May", "June",
+ "July", "August", "September", "October", "November", "December"]
+
+
-# Fields that we only care about for making web pages (BibTeX doesn't
-# recognize them.)
-WWW_FIELDS = [ 'www_section', 'www_important', 'www_remarks',
- 'www_abstract_url', 'www_html_url', 'www_pdf_url', 'www_ps_url',
- 'www_txt_url', 'www_ps_gz_url', 'www_amazon_url',
- 'www_excerpt_url', 'www_publisher_url',
- 'www_cache_section', 'www_tags' ]
-def url_untranslate(s):
- """Change a BibTeX key into a string suitable for use in a URL."""
- s = re.sub(r'([%<>`#, &_\';])',
- lambda m: "_%02x"%ord(m.group(1)),
- s)
- s = s.replace("/",":")
- return s
class ParseError(Exception):
"""Raised on invalid BibTeX"""
pass
-def smartJoin(*lst):
- """Equivalent to os.path.join, but handle"." and ".." entries a bit better.
- """
- lst = [ item for item in lst if item != "." ]
- idx = 0
- while idx < len(lst):
- if idx > 0 and lst[idx] == "..":
- del lst[idx]
- else:
- idx += 1
- return os.path.join(*lst)
+
class BibTeX:
"""A parsed BibTeX file"""
def __init__(self):
- self.entries = [] # List of BibTeXEntry
- self.byKey = {} # Map from BibTeX key to BibTeX entry.
+ self.entries = [] # List of BibTeXEntry
+ self.byKey = {} # Map from BibTeX key to BibTeX entry.
+
def addEntry(self, ent):
"""Add a BibTeX entry to this file."""
k = ent.key
if self.byKey.get(ent.key.lower()):
- print >> sys.stderr, "Already have an entry named %s"%k
+ print >> sys.stderr, "Already have an entry named %s" % k
return
self.entries.append(ent)
self.byKey[ent.key.lower()] = ent
+
def resolve(self):
"""Validate all entries in this file, and resolve cross-references"""
seen = {}
@@ -80,7 +60,7 @@ class BibTeX:
try:
cr = self.byKey[ent['crossref'].lower()]
except KeyError:
- print "No such crossref: %s"% ent['crossref']
+ print "No such crossref: %s" % ent['crossref']
break
if seen.get(cr.key):
raise ParseError("Circular crossref at %s" % ent.key)
@@ -88,7 +68,7 @@ class BibTeX:
del ent.entries['crossref']
if cr.entryLine < ent.entryLine:
- print "Warning: crossref %s used after declaration"%cr.key
+ print "Warning: crossref %s used after declaration" % cr.key
for k in cr.entries.keys():
if ent.entries.has_key(k):
@@ -113,47 +93,7 @@ class BibTeX:
newEntries.append(ent)
self.entries = newEntries
-def buildAuthorTable(entries):
- """Given a list of BibTeXEntry, return a map from parsed author name to
- parsed canonical name.
- """
- authorsByLast = {}
- for e in entries:
- for a in e.parsedAuthor:
- authorsByLast.setdefault(tuple(a.last), []).append(a)
- # map from author to collapsed author.
- result = {}
- for k,v in config.COLLAPSE_AUTHORS.items():
- a = parseAuthor(k)[0]
- c = parseAuthor(v)[0]
- result[c] = c
- result[a] = c
-
- for e in entries:
- for author in e.parsedAuthor:
- if result.has_key(author):
- continue
-
- c = author
- for a in authorsByLast[tuple(author.last)]:
- if a is author:
- continue
- c = c.collapsesTo(a)
- result[author] = c
- if 0:
- for a,c in result.items():
- if a != c:
- print "Collapsing authors: %s => %s" % (a,c)
- if 0:
- print parseAuthor("Franz Kaashoek")[0].collapsesTo(
- parseAuthor("M. Franz Kaashoek")[0])
- print parseAuthor("Paul F. Syverson")[0].collapsesTo(
- parseAuthor("Paul Syverson")[0])
- print parseAuthor("Paul Syverson")[0].collapsesTo(
- parseAuthor("Paul F. Syverson")[0])
-
- return result
def splitEntriesBy(entries, field):
"""Take a list of BibTeX entries and the name of a bibtex field; return
@@ -281,570 +221,9 @@ def sortEntriesByDate(entries):
return [ t[2] for t in tmp ]
-# List of fields that appear when we display the entries as BibTeX.
-DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle',
-'school', 'institution', 'organization', 'volume', 'number', 'year',
-'month', 'address', 'location', 'chapter', 'edition', 'pages', 'editor',
-'howpublished', 'key', 'publisher', 'type', 'note', 'series' ]
-
-class BibTeXEntry:
- """A single BibTeX entry."""
- def __init__(self, type, key, entries):
- self.type = type # What kind of entry is it? (@book,@injournal,etc)
- self.key = key # What key does it have?
- self.entries = entries # Map from key to value.
- self.entryLine = 0 # Defined on this line number
- def get(self, k, v=None):
- return self.entries.get(k,v)
- def has_key(self, k):
- return self.entries.has_key(k)
- def __getitem__(self, k):
- return self.entries[k]
- def __setitem__(self, k, v):
- self.entries[k] = v
- def __str__(self):
- return self.format(70,1)
- def getURL(self):
- """Return the best URL to use for this paper, or None."""
- best = None
- for field in ['www_pdf_url', 'www_ps_gz_url', 'www_ps_url',
- 'www_html_url', 'www_txt_url', ]:
- u = self.get(field)
- if u:
- if not best:
- best = u
- elif (best.startswith("http://citeseer.nj.nec.com/")
- and not u.startswith("http://citeseer.nj.nec.com/")):
- best = u
- return best
-
- def format(self, width=70, indent=8, v=0, invStrings={}):
- """Format this entry as BibTeX."""
- d = ["@%s{%s,\n" % (self.type, self.key)]
- if v:
- df = DISPLAYED_FIELDS[:]
- for k in self.entries.keys():
- if k not in df:
- df.append(k)
- else:
- df = DISPLAYED_FIELDS
- for f in df:
- if not self.entries.has_key(f):
- continue
- v = self.entries[f]
- if v.startswith("<span class='bad'>"):
- d.append("%%%%% ERROR: Missing field\n")
- d.append("%% %s = {?????},\n"%f)
- continue
- np = v.translate(ALLCHARS, PRINTINGCHARS)
- if np:
- d.append("%%%%% "+("ERROR: Non-ASCII characters: '%r'\n"%np))
- d.append(" ")
- v = v.replace("&", "&amp;")
- if invStrings.has_key(v):
- s = "%s = %s,\n" %(f, invStrings[v])
- else:
- s = "%s = {%s},\n" % (f, v)
- d.append(_split(s,width,indent))
- d.append("}\n")
- return "".join(d)
- def resolve(self):
- """Handle post-processing for this entry"""
- a = self.get('author')
- if a:
- self.parsedAuthor = parseAuthor(a)
- #print a
- #print " => ",repr(self.parsedAuthor)
- else:
- self.parsedAuthor = None
-
- def isImportant(self):
- """Return 1 iff this entry is marked as important"""
- imp = self.get("www_important")
- if imp and imp.strip().lower() not in ("no", "false", "0"):
- return 1
- return 0
-
- def check(self):
- """Print any errors for this entry, and return true if there were
- none."""
- errs = self._check()
- for e in errs:
- print e
- return not errs
-
- def _check(self):
- errs = []
- if self.type == 'inproceedings':
- fields = 'booktitle', 'year'
- elif self.type == 'incollection':
- fields = 'booktitle', 'year'
- elif self.type == 'proceedings':
- fields = 'booktitle', 'editor'
- elif self.type == 'article':
- fields = 'journal', 'year'
- elif self.type == 'techreport':
- fields = 'institution',
- elif self.type == 'misc':
- fields = 'howpublished',
- elif self.type in ('mastersthesis', 'phdthesis'):
- fields = ()
- else:
- fields = ()
- errs.append("ERROR: odd type %s"%self.type)
- if self.type != 'proceedings':
- fields += 'title', 'author', 'www_section', 'year'
-
- for field in fields:
- if self.get(field) is None or \
- self.get(field).startswith("<span class='bad'>"):
- errs.append("ERROR: %s has no %s field" % (self.key, field))
- self.entries[field] = "<span class='bad'>%s:??</span>"%field
-
- if self.type == 'inproceedings':
- if self.get("booktitle"):
- if not self['booktitle'].startswith("Proceedings of") and \
- not self['booktitle'].startswith("{Proceedings of"):
- errs.append("ERROR: %s's booktitle (%r) doesn't start with 'Proceedings of'" % (self.key, self['booktitle']))
-
- if self.has_key("pages") and not re.search(r'\d+--\d+', self['pages']):
- errs.append("ERROR: Misformed pages in %s"%self.key)
-
- if self.type == 'proceedings':
- if self.get('title'):
- errs.append("ERROR: %s is a proceedings: it should have a booktitle, not a title." % self.key)
-
- for field, value in self.entries.items():
- if value.translate(ALLCHARS, PRINTINGCHARS):
- errs.append("ERROR: %s.%s has non-ASCII characters"%(
- self.key, field))
- if field.startswith("www_") and field not in WWW_FIELDS:
- errs.append("ERROR: unknown www field %s"% field)
- if value.strip()[-1:] == '.' and \
- field not in ("notes", "www_remarks", "author"):
- errs.append("ERROR: %s.%s has an extraneous period"%(self.key,
- field))
- return errs
-
- def biblio_to_html(self):
- """Return the HTML for the citation portion of entry."""
- if self.type in ('inproceedings', 'incollection'):
- booktitle = self['booktitle']
- bookurl = self.get('bookurl')
- if bookurl:
- m = PROCEEDINGS_RE.match(booktitle)
- if m:
- res = ["In the ", m.group(1),
- '<a href="%s">'%bookurl, m.group(2), "</a>"]
- else:
- res = ['In the <a href="%s">%s</a>' % (bookurl,booktitle)]
- else:
- res = ["In the ", booktitle ]
-
- if self.get("edition"):
- res.append(",")
- res.append(self['edition'])
- if self.get("location"):
- res.append(", ")
- res.append(self['location'])
- elif self.get("address"):
- res.append(", ")
- res.append(self['address'])
- res.append(", %s %s" % (self.get('month',""), self['year']))
- if not self.get('pages'):
- pass
- elif "-" in self['pages']:
- res.append(", pages&nbsp;%s"%self['pages'])
- else:
- res.append(", page&nbsp;%s"%self['pages'])
- elif self.type == 'article':
- res = ["In "]
- if self.get('journalurl'):
- res.append('<a href="%s">%s</a>'%
- (self['journalurl'],self['journal']))
- else:
- res.append(self['journal'])
- if self.get('volume'):
- res.append(" <b>%s</b>"%self['volume'])
- if self.get('number'):
- res.append("(%s)"%self['number'])
- res.append(", %s %s" % (self.get('month',""), self['year']))
- if not self.get('pages'):
- pass
- elif "-" in self['pages']:
- res.append(", pages&nbsp;%s"%self['pages'])
- else:
- res.append(", page&nbsp;%s"%self['pages'])
- elif self.type == 'techreport':
- res = [ "%s %s %s" % (self['institution'],
- self.get('type', 'technical report'),
- self.get('number', "")) ]
- if self.get('month') or self.get('year'):
- res.append(", %s %s" % (self.get('month', ''),
- self.get('year', '')))
- elif self.type == 'mastersthesis' or self.type == 'phdthesis':
- if self.get('type'):
- res = [self['type']]
- elif self.type == 'mastersthesis':
- res = ["Masters's thesis"]
- else:
- res = ["Ph.D. thesis"]
- if self.get('school'):
- res.append(", %s"%(self['school']))
- if self.get('month') or self.get('year'):
- res.append(", %s %s" % (self.get('month', ''),
- self.get('year', '')))
- elif self.type == 'book':
- res = [self['publisher']]
- if self.get('year'):
- res.append(" ");
- res.append(self.get('year'));
- # res.append(", %s"%(self.get('year')))
- if self.get('series'):
- res.append(",");
- res.append(self['series']);
- elif self.type == 'misc':
- res = [self['howpublished']]
- if self.get('month') or self.get('year'):
- res.append(", %s %s" % (self.get('month', ''),
- self.get('year', '')))
- if not self.get('pages'):
- pass
- elif "-" in self['pages']:
- res.append(", pages&nbsp;%s"%self['pages'])
- else:
- res.append(", page&nbsp;%s"%self['pages'])
- else:
- res = ["&lt;Odd type %s&gt;"%self.type]
-
- res[0:0] = ["<span class='biblio'>"]
- res.append(".</span>")
-
- bibtexurl = "./bibtex.html#%s"%url_untranslate(self.key)
- res.append((" <span class='availability'>"
- "(<a href='%s'>BibTeX&nbsp;entry</a>)"
- "</span>") %bibtexurl)
- return htmlize("".join(res))
-
- def to_html(self, cache_path="./cache", base_url="."):
- """Return the HTML for this entry."""
- imp = self.isImportant()
- draft = self.get('year') == 'forthcoming'
- if imp:
- res = ["<li><div class='impEntry'><p class='impEntry'>" ]
- elif draft:
- res = ["<li><div class='draftEntry'><p class='draftEntry'>" ]
- else:
- res = ["<li><p class='entry'>"]
-
- if imp or not draft:
- # Add a picture of the rank
- # Only if year is known or paper important!
- r = rank.get_rank_html(self['title'], self.get('year'),
- update=False, base_url=base_url)
- if r is not None:
- res.append(r)
-
- res.append("<span class='title'><a name='%s'>%s</a></span>"%(
- url_untranslate(self.key),htmlize(self['title'])))
-
- for cached in 0,1:
- availability = []
- if not cached:
- for which in [ "amazon", "excerpt", "publisher" ]:
- key = "www_%s_url"%which
- if self.get(key):
- url=self[key]
- url = unTeXescapeURL(url)
- availability.append('<a href="%s">%s</a>' %(url,which))
-
- cache_section = self.get('www_cache_section', ".")
- if cache_section not in config.CACHE_SECTIONS:
- if cache_section != ".":
- print >>sys.stderr, "Unrecognized cache section %s"%(
- cache_section)
- cache_section="."
-
- for key, name, ext in (('www_abstract_url', 'abstract','abstract'),
- ('www_html_url', 'HTML', 'html'),
- ('www_pdf_url', 'PDF', 'pdf'),
- ('www_ps_url', 'PS', 'ps'),
- ('www_txt_url', 'TXT', 'txt'),
- ('www_ps_gz_url', 'gzipped&nbsp;PS','ps.gz')
- ):
- if cached:
- #XXXX the URL needs to be relative to the absolute
- #XXXX cache path.
- url = smartJoin(cache_path,cache_section,
- "%s.%s"%(self.key,ext))
- fname = smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
- cache_section,
- "%s.%s"%(self.key,ext))
- if not os.path.exists(fname): continue
- else:
- url = self.get(key)
- if not url: continue
- url = unTeXescapeURL(url)
- url = url.replace('&', '&amp;')
- availability.append('<a href="%s">%s</a>' %(url,name))
-
- if availability:
- res.append([" ", "&nbsp;"][cached])
- res.append("<span class='availability'>(")
- if cached: res.append("Cached:&nbsp;")
- res.append(",&nbsp;".join(availability))
- res.append(")</span>")
-
- res.append("<br /><span class='author'>by ")
-
- #res.append("\n<!-- %r -->\n" % self.parsedAuthor)
- htmlAuthors = [ a.htmlizeWithLink() for a in self.parsedAuthor ]
-
- if len(htmlAuthors) == 1:
- res.append(htmlAuthors[0])
- elif len(htmlAuthors) == 2:
- res.append(" and ".join(htmlAuthors))
- else:
- res.append(", ".join(htmlAuthors[:-1]))
- res.append(", and ")
- res.append(htmlAuthors[-1])
- if res[-1][-1] != '.':
- res.append(".")
- res.append("</span><br />\n")
- res.append(self.biblio_to_html())
- res.append("<a href='#%s'>&middot;</a>"%url_untranslate(self.key))
- res.append("</p>")
- if self.get('www_remarks'):
- res.append("<p class='remarks'>%s</p>"%htmlize(
- self['www_remarks']))
- if imp or draft:
- res.append("</div>")
- res.append("</li>\n\n")
-
- return "".join(res)
-
-def unTeXescapeURL(s):
- """Turn a URL as formatted in TeX into a real URL."""
- s = s.replace("\\_", "_")
- s = s.replace("\\-", "")
- s = s.replace("\{}", "")
- s = s.replace("{}", "")
- return s
-
-def TeXescapeURL(s):
- """Escape a URL for use in TeX"""
- s = s.replace("_", "\\_")
- s = s.replace("~", "\{}~")
- return s
-
-RE_LONE_AMP = re.compile(r'&([^a-z0-9])')
-RE_LONE_I = re.compile(r'\\i([^a-z0-9])')
-RE_ACCENT = re.compile(r'\\([\'`~^"c])([^{]|{.})')
-RE_LIGATURE = re.compile(r'\\(AE|ae|OE|oe|AA|aa|O|o|ss)([^a-z0-9])')
-ACCENT_MAP = { "'" : 'acute',
- "`" : 'grave',
- "~" : 'tilde',
- "^" : 'circ',
- '"' : 'uml',
- "c" : 'cedil',
- }
-UNICODE_MAP = { '&nacute;' : '&#x0144;', }
-HTML_LIGATURE_MAP = {
- 'AE' : '&AElig;',
- 'ae' : '&aelig;',
- 'OE' : '&OElig;',
- 'oe' : '&oelig;',
- 'AA' : '&Aring;',
- 'aa' : '&aring;',
- 'O' : '&Oslash;',
- 'o' : '&oslash;',
- 'ss' : '&szlig;',
- }
-RE_TEX_CMD = re.compile(r"(?:\\[a-zA-Z@]+|\\.)")
-RE_PAGE_SPAN = re.compile(r"(\d)--(\d)")
-def _unaccent(m):
- accent,char = m.groups()
- if char[0] == '{':
- char = char[1]
- accented = "&%s%s;" % (char, ACCENT_MAP[accent])
- return UNICODE_MAP.get(accented, accented)
-def _unlig_html(m):
- return "%s%s"%(HTML_LIGATURE_MAP[m.group(1)],m.group(2))
-def htmlize(s):
- """Turn a TeX string into good-looking HTML."""
- s = RE_LONE_AMP.sub(lambda m: "&amp;%s" % m.group(1), s)
- s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
- s = RE_ACCENT.sub(_unaccent, s)
- s = unTeXescapeURL(s)
- s = RE_LIGATURE.sub(_unlig_html, s);
- s = RE_TEX_CMD.sub("", s)
- s = s.translate(ALLCHARS, "{}")
- s = RE_PAGE_SPAN.sub(lambda m: "%s-%s"%(m.groups()), s)
- s = s.replace("---", "&mdash;");
- s = s.replace("--", "&ndash;");
- return s
-
-def author_url(author):
- """Given an author's name, return a URL for his/her homepage."""
- for pat, url in config.AUTHOR_RE_LIST:
- if pat.search(author):
- return url
- return None
-
-def txtize(s):
- """Turn a TeX string into decnent plaintext."""
- s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
- s = RE_ACCENT.sub(lambda m: "%s" % m.group(2), s)
- s = RE_LIGATURE.sub(lambda m: "%s%s"%m.groups(), s)
- s = RE_TEX_CMD.sub("", s)
- s = s.translate(ALLCHARS, "{}")
- return s
-
-PROCEEDINGS_RE = re.compile(
- r'((?:proceedings|workshop record) of(?: the)? )(.*)',
- re.I)
-
-class ParsedAuthor:
- """The parsed name of an author.
-
- Eddie deserves credit for this incredibly hairy business.
- """
- def __init__(self, first, von, last, jr):
- self.first = first
- self.von = von
- self.last = last
- self.jr = jr
- self.collapsable = 1
-
- self.html = htmlize(str(self))
- self.txt = txtize(str(self))
-
- s = self.html
- for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
- if pat.search(s):
- self.collapsable = 0
- break
-
- def __eq__(self, o):
- return ((self.first == o.first) and
- (self.last == o.last) and
- (self.von == o.von) and
- (self.jr == o.jr))
-
- def __hash__(self):
- return hash(repr(self))
-
- def collapsesTo(self, o):
- """Return true iff 'o' could be a more canonical version of this author
- """
- if not self.collapsable or not o.collapsable:
- return self
-
- if self.last != o.last or self.von != o.von or self.jr != o.jr:
- return self
- if not self.first:
- return o
-
- if len(self.first) == len(o.first):
- n = []
- for a,b in zip(self.first, o.first):
- if a == b:
- n.append(a)
- elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
- n.append(b)
- elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
- n.append(a)
- else:
- return self
- if n == self.first:
- return self
- elif n == o.first:
- return o
- else:
- return self
- else:
- realname = max([len(n) for n in self.first+o.first])>2
- if not realname:
- return self
-
- if len(self.first) < len(o.first):
- short = self.first; long = o.first
- else:
- short = o.first; long = self.first
-
- initials_s = "".join([n[0] for n in short])
- initials_l = "".join([n[0] for n in long])
- idx = initials_l.find(initials_s)
- if idx < 0:
- return self
- n = long[:idx]
- for i in range(idx, idx+len(short)):
- a = long[i]; b = short[i-idx]
- if a == b:
- n.append(a)
- elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
- n.append(b)
- elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
- n.append(a)
- else:
- return self
- n += long[idx+len(short):]
-
- if n == self.first:
- return self
- elif n == o.first:
- return o
- else:
- return self
-
- def __repr__(self):
- return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
- self.last,self.jr)
- def __str__(self):
- a = " ".join(self.first+self.von+self.last)
- if self.jr:
- return "%s, %s" % (a,self.jr)
- return a
-
- def getHomepage(self):
- s = self.html
- for pat, url in config.AUTHOR_RE_LIST:
- if pat.search(s):
- return url
- return None
-
- def getSortingName(self):
- """Return a representation of this author's name in von-last-first-jr
- order, unless overridden by ALPH """
- s = self.html
- for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
- if pat.search(s):
- return v
-
- return txtize(" ".join(self.von+self.last+self.first+self.jr))
-
- def getSectionName(self):
- """Return a HTML representation of this author's name in
- last, first von, jr order"""
- secname = " ".join(self.last)
- more = self.first+self.von
- if more:
- secname += ", "+" ".join(more)
- if self.jr:
- secname += ", "+" ".join(self.jr)
- secname = htmlize(secname)
- return secname
-
- def htmlizeWithLink(self):
- a = self.html
- u = self.getHomepage()
- if u:
- return "<a href='%s'>%s</a>"%(u,a)
- else:
- return a
def _split(s,w=79,indent=8):
r = []
@@ -886,105 +265,7 @@ class FileIter:
return self._next()
-def parseAuthor(s):
- try:
- return _parseAuthor(s)
- except:
- print >>sys.stderr, "Internal error while parsing author %r"%s
- raise
-
-def _parseAuthor(s):
- """Take an author string and return a list of ParsedAuthor."""
- items = []
-
- s = s.strip()
- while s:
- s = s.strip()
- bracelevel = 0
- for i in xrange(len(s)):
- if s[i] == '{':
- bracelevel += 1
- elif s[i] == '}':
- bracelevel -= 1
- elif bracelevel <= 0 and s[i] in " \t\n,":
- break
- if i+1 == len(s):
- items.append(s)
- else:
- items.append(s[0:i])
- if (s[i] == ','):
- items.append(',')
- s = s[i+1:]
-
- authors = [[]]
- for item in items:
- if item == 'and':
- authors.append([])
- else:
- authors[-1].append(item)
-
- parsedAuthors = []
- # Split into first, von, last, jr
- for author in authors:
- commas = 0
- fvl = []
- vl = []
- f = []
- v = []
- l = []
- j = []
- cur = fvl
- for item in author:
- if item == ',':
- if commas == 0:
- vl = fvl
- fvl = []
- cur = f
- else:
- j.extend(f)
- cur = f = []
- commas += 1
- else:
- cur.append(item)
- if commas == 0:
- split_von(f,v,l,fvl)
- else:
- f_tmp = []
- split_von(f_tmp,v,l,vl)
-
- parsedAuthors.append(ParsedAuthor(f,v,l,j))
-
- return parsedAuthors
-
-ALLCHARS = "".join(map(chr,range(256)))
-PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127)))
-LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
-SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
- "abcdefghijklmnopqrstuvwxyz"
- "@")
-RE_ESCAPED = re.compile(r'\\.')
-def split_von(f,v,l,x):
- in_von = 0
- while x:
- tt = t = x[0]
- del x[0]
- if tt[:2] == '{\\':
- tt = tt.translate(ALLCHARS, SV_DELCHARS)
- tt = RE_ESCAPED.sub("", tt)
- tt = tt.translate(ALLCHARS, "{}")
- if tt.translate(ALLCHARS, LC_CHARS) == "":
- v.append(t)
- in_von = 1
- elif in_von and f is not None:
- l.append(t)
- l.extend(x)
- return
- else:
- f.append(t)
- if not in_von:
- l.append(f[-1])
- del f[-1]
class Parser:
@@ -1016,7 +297,7 @@ class Parser:
def _parseKey(self, line):
it = self.fileiter
- line = _advance(it,line)
+ line = _advance(it, line)
m = KEY_RE.match(line)
if not m:
raise ParseError("Expected key at line %s"%self.fileiter.lineno)
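With the entry and string helpers moved out, BibTeX.py is left with the
container, the parser, and the splitting/sorting helpers (renamed from
splitBibTeXEntriesBy/sortBibTexEntriesBy to splitEntriesBy/sortEntriesBy in
__all__ above). A hedged usage sketch: the input filename is hypothetical,
and parseFile is assumed to take a path and return a BibTeX instance, as its
presence in __all__ suggests.

    import BibTeX

    bib = BibTeX.parseFile("anonbib.bib")   # hypothetical .bib file
    bib.resolve()                           # validate entries, fix crossrefs
    by_date = BibTeX.sortEntriesByDate(bib.entries)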
diff --git a/entry.py b/entry.py
new file mode 100644
index 0000000..9846e32
--- /dev/null
+++ b/entry.py
@@ -0,0 +1,653 @@
+import rank
+import sys
+import re
+import config
+import os
+from utils import htmlize, txtize, url_untranslate, unTeXescapeURL, smartJoin,\
+ _split
+
+# Fields that we only care about for making web pages (BibTeX doesn't
+# recognize them.)
+WWW_FIELDS = ['www_section', 'www_important', 'www_remarks',
+ 'www_abstract_url', 'www_html_url', 'www_pdf_url', 'www_ps_url',
+ 'www_txt_url', 'www_ps_gz_url', 'www_amazon_url',
+ 'www_excerpt_url', 'www_publisher_url',
+ 'www_cache_section', 'www_tags']
+
+def author_url(author):
+ """Given an author's name, return a URL for his/her homepage."""
+ for pat, url in config.AUTHOR_RE_LIST:
+ if pat.search(author):
+ return url
+ return None
+ALLCHARS = "".join(map(chr,range(256)))
+PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127)))
+LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
+SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "@")
+RE_ESCAPED = re.compile(r'\\.')
+PROCEEDINGS_RE = re.compile(
+ r'((?:proceedings|workshop record) of(?: the)? )(.*)',
+ re.I)
+
+def split_von(f,v,l,x):
+ in_von = 0
+ while x:
+ tt = t = x[0]
+ del x[0]
+ if tt[:2] == '{\\':
+ tt = tt.translate(ALLCHARS, SV_DELCHARS)
+ tt = RE_ESCAPED.sub("", tt)
+ tt = tt.translate(ALLCHARS, "{}")
+ if tt.translate(ALLCHARS, LC_CHARS) == "":
+ v.append(t)
+ in_von = 1
+ elif in_von and f is not None:
+ l.append(t)
+ l.extend(x)
+ return
+ else:
+ f.append(t)
+ if not in_von:
+ l.append(f[-1])
+ del f[-1]
+
+def buildAuthorTable(entries):
+ """Given a list of BibTeXEntry, return a map from parsed author name to
+ parsed canonical name.
+ """
+ authorsByLast = {}
+ for e in entries:
+ for a in e.parsedAuthor:
+ authorsByLast.setdefault(tuple(a.last), []).append(a)
+ # map from author to collapsed author.
+ result = {}
+ for k,v in config.COLLAPSE_AUTHORS.items():
+ a = parseAuthor(k)[0]
+ c = parseAuthor(v)[0]
+ result[c] = c
+ result[a] = c
+
+ for e in entries:
+ for author in e.parsedAuthor:
+ if result.has_key(author):
+ continue
+
+ c = author
+ for a in authorsByLast[tuple(author.last)]:
+ if a is author:
+ continue
+ c = c.collapsesTo(a)
+ result[author] = c
+
+ if 0:
+ for a,c in result.items():
+ if a != c:
+ print "Collapsing authors: %s => %s" % (a,c)
+ if 0:
+ print parseAuthor("Franz Kaashoek")[0].collapsesTo(
+ parseAuthor("M. Franz Kaashoek")[0])
+ print parseAuthor("Paul F. Syverson")[0].collapsesTo(
+ parseAuthor("Paul Syverson")[0])
+ print parseAuthor("Paul Syverson")[0].collapsesTo(
+ parseAuthor("Paul F. Syverson")[0])
+
+ return result
+
+# List of fields that appear when we display the entries as BibTeX.
+DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle',
+'school', 'institution', 'organization', 'volume', 'number', 'year',
+'month', 'address', 'location', 'chapter', 'edition', 'pages', 'editor',
+'howpublished', 'key', 'publisher', 'type', 'note', 'series' ]
+
+class BibTeXEntry:
+ """A single BibTeX entry."""
+ def __init__(self, type, key, entries):
+ self.type = type # What kind of entry is it? (@book,@injournal,etc)
+ self.key = key # What key does it have?
+ self.entries = entries # Map from key to value.
+ self.entryLine = 0 # Defined on this line number
+ def get(self, k, v=None):
+ return self.entries.get(k,v)
+ def has_key(self, k):
+ return self.entries.has_key(k)
+ def __getitem__(self, k):
+ return self.entries[k]
+ def __setitem__(self, k, v):
+ self.entries[k] = v
+ def __str__(self):
+ return self.format(70,1)
+ def getURL(self):
+ """Return the best URL to use for this paper, or None."""
+ best = None
+ for field in ['www_pdf_url', 'www_ps_gz_url', 'www_ps_url',
+ 'www_html_url', 'www_txt_url', ]:
+ u = self.get(field)
+ if u:
+ if not best:
+ best = u
+ elif (best.startswith("http://citeseer.nj.nec.com/")
+ and not u.startswith("http://citeseer.nj.nec.com/")):
+ best = u
+ return best
+
+ def format(self, width=70, indent=8, v=0, invStrings={}):
+ """Format this entry as BibTeX."""
+ d = ["@%s{%s,\n" % (self.type, self.key)]
+ if v:
+ df = DISPLAYED_FIELDS[:]
+ for k in self.entries.keys():
+ if k not in df:
+ df.append(k)
+ else:
+ df = DISPLAYED_FIELDS
+ for f in df:
+ if not self.entries.has_key(f):
+ continue
+ v = self.entries[f]
+ if v.startswith("<span class='bad'>"):
+ d.append("%%%%% ERROR: Missing field\n")
+ d.append("%% %s = {?????},\n"%f)
+ continue
+ np = v.translate(ALLCHARS, PRINTINGCHARS)
+ if np:
+ d.append("%%%%% "+("ERROR: Non-ASCII characters: '%r'\n"%np))
+ d.append(" ")
+ v = v.replace("&", "&amp;")
+ if invStrings.has_key(v):
+ s = "%s = %s,\n" %(f, invStrings[v])
+ else:
+ s = "%s = {%s},\n" % (f, v)
+ d.append(_split(s,width,indent))
+ d.append("}\n")
+ return "".join(d)
+ def resolve(self):
+ """Handle post-processing for this entry"""
+ a = self.get('author')
+ if a:
+ self.parsedAuthor = parseAuthor(a)
+ #print a
+ #print " => ",repr(self.parsedAuthor)
+ else:
+ self.parsedAuthor = None
+
+ def isImportant(self):
+ """Return 1 iff this entry is marked as important"""
+ imp = self.get("www_important")
+ if imp and imp.strip().lower() not in ("no", "false", "0"):
+ return 1
+ return 0
+
+ def check(self):
+ """Print any errors for this entry, and return true if there were
+ none."""
+ errs = self._check()
+ for e in errs:
+ print e
+ return not errs
+
+ def _check(self):
+ errs = []
+ if self.type == 'inproceedings':
+ fields = 'booktitle', 'year'
+ elif self.type == 'incollection':
+ fields = 'booktitle', 'year'
+ elif self.type == 'proceedings':
+ fields = 'booktitle', 'editor'
+ elif self.type == 'article':
+ fields = 'journal', 'year'
+ elif self.type == 'techreport':
+ fields = 'institution',
+ elif self.type == 'misc':
+ fields = 'howpublished',
+ elif self.type in ('mastersthesis', 'phdthesis'):
+ fields = ()
+ else:
+ fields = ()
+ errs.append("ERROR: odd type %s"%self.type)
+ if self.type != 'proceedings':
+ fields += 'title', 'author', 'www_section', 'year'
+
+ for field in fields:
+ if self.get(field) is None or \
+ self.get(field).startswith("<span class='bad'>"):
+ errs.append("ERROR: %s has no %s field" % (self.key, field))
+ self.entries[field] = "<span class='bad'>%s:??</span>"%field
+
+ if self.type == 'inproceedings':
+ if self.get("booktitle"):
+ if not self['booktitle'].startswith("Proceedings of") and \
+ not self['booktitle'].startswith("{Proceedings of"):
+ errs.append("ERROR: %s's booktitle (%r) doesn't start with 'Proceedings of'" % (self.key, self['booktitle']))
+
+ if self.has_key("pages") and not re.search(r'\d+--\d+', self['pages']):
+ errs.append("ERROR: Misformed pages in %s"%self.key)
+
+ if self.type == 'proceedings':
+ if self.get('title'):
+ errs.append("ERROR: %s is a proceedings: it should have a booktitle, not a title." % self.key)
+
+ for field, value in self.entries.items():
+ if value.translate(ALLCHARS, PRINTINGCHARS):
+ errs.append("ERROR: %s.%s has non-ASCII characters"%(
+ self.key, field))
+ if field.startswith("www_") and field not in WWW_FIELDS:
+ errs.append("ERROR: unknown www field %s"% field)
+ if value.strip()[-1:] == '.' and \
+ field not in ("notes", "www_remarks", "author"):
+ errs.append("ERROR: %s.%s has an extraneous period"%(self.key,
+ field))
+ return errs
+
+ def biblio_to_html(self):
+ """Return the HTML for the citation portion of entry."""
+ if self.type in ('inproceedings', 'incollection'):
+ booktitle = self['booktitle']
+ bookurl = self.get('bookurl')
+ if bookurl:
+ m = PROCEEDINGS_RE.match(booktitle)
+ if m:
+ res = ["In the ", m.group(1),
+ '<a href="%s">' % bookurl, m.group(2), "</a>"]
+ else:
+ res = ['In the <a href="%s">%s</a>' % (bookurl, booktitle)]
+ else:
+ res = ["In the ", booktitle]
+
+ if self.get("edition"):
+ res.append(",")
+ res.append(self['edition'])
+ if self.get("location"):
+ res.append(", ")
+ res.append(self['location'])
+ elif self.get("address"):
+ res.append(", ")
+ res.append(self['address'])
+ res.append(", %s %s" % (self.get('month', ""), self['year']))
+ if not self.get('pages'):
+ pass
+ elif "-" in self['pages']:
+ res.append(", pages&nbsp;%s" % self['pages'])
+ else:
+ res.append(", page&nbsp;%s" % self['pages'])
+ elif self.type == 'article':
+ res = ["In "]
+ if self.get('journalurl'):
+ res.append('<a href="%s">%s</a>' % (self['journalurl'],
+ self['journal']))
+ else:
+ res.append(self['journal'])
+ if self.get('volume'):
+ res.append(" <b>%s</b>" % self['volume'])
+ if self.get('number'):
+ res.append("(%s)" % self['number'])
+ res.append(", %s %s" % (self.get('month', ""), self['year']))
+ if not self.get('pages'):
+ pass
+ elif "-" in self['pages']:
+ res.append(", pages&nbsp;%s" % self['pages'])
+ else:
+ res.append(", page&nbsp;%s" % self['pages'])
+ elif self.type == 'techreport':
+ res = ["%s %s %s" % (self['institution'],
+ self.get('type', 'technical report'),
+ self.get('number', ""))]
+ if self.get('month') or self.get('year'):
+ res.append(", %s %s" % (self.get('month', ''),
+ self.get('year', '')))
+ elif self.type == 'mastersthesis' or self.type == 'phdthesis':
+ if self.get('type'):
+ res = [self['type']]
+ elif self.type == 'mastersthesis':
+ res = ["Masters's thesis"]
+ else:
+ res = ["Ph.D. thesis"]
+ if self.get('school'):
+ res.append(", %s" % (self['school']))
+ if self.get('month') or self.get('year'):
+ res.append(", %s %s" % (self.get('month', ''),
+ self.get('year', '')))
+ elif self.type == 'book':
+ res = [self['publisher']]
+ if self.get('year'):
+ res.append(" ")
+ res.append(self.get('year'))
+ # res.append(", %s"%(self.get('year')))
+ if self.get('series'):
+ res.append(",")
+ res.append(self['series'])
+ elif self.type == 'misc':
+ res = [self['howpublished']]
+ if self.get('month') or self.get('year'):
+ res.append(", %s %s" % (self.get('month', ''),
+ self.get('year', '')))
+ if not self.get('pages'):
+ pass
+ elif "-" in self['pages']:
+ res.append(", pages&nbsp;%s" % self['pages'])
+ else:
+ res.append(", page&nbsp;%s" % self['pages'])
+ else:
+ res = ["&lt;Odd type %s&gt;" % self.type]
+
+ res[0:0] = ["<span class='biblio'>"]
+ res.append(".</span>")
+
+ bibtexurl = "./bibtex.html#%s" % url_untranslate(self.key)
+ res.append((" <span class='availability'>"
+ "(<a href='%s'>BibTeX&nbsp;entry</a>)"
+ "</span>") % bibtexurl)
+ return htmlize("".join(res))
+
+ def to_html(self, cache_path="./cache", base_url="."):
+ """Return the HTML for this entry."""
+ imp = self.isImportant()
+ draft = self.get('year') == 'forthcoming'
+ if imp:
+ res = ["<li><div class='impEntry'><p class='impEntry'>"]
+ elif draft:
+ res = ["<li><div class='draftEntry'><p class='draftEntry'>"]
+ else:
+ res = ["<li><p class='entry'>"]
+
+ if imp or not draft:
+ # Add a picture of the rank
+ # Only if year is known or paper important!
+ r = rank.get_rank_html(self['title'], self.get('year'),
+ update=False, base_url=base_url)
+ if r is not None:
+ res.append(r)
+
+ res.append("<span class='title'><a name='%s'>%s</a></span>"%(
+ url_untranslate(self.key),htmlize(self['title'])))
+
+ for cached in 0,1:
+ availability = []
+ if not cached:
+ for which in [ "amazon", "excerpt", "publisher" ]:
+ key = "www_%s_url"%which
+ if self.get(key):
+ url=self[key]
+ url = unTeXescapeURL(url)
+ availability.append('<a href="%s">%s</a>' %(url,which))
+
+ cache_section = self.get('www_cache_section', ".")
+ if cache_section not in config.CACHE_SECTIONS:
+ if cache_section != ".":
+ print >>sys.stderr, "Unrecognized cache section %s"%(
+ cache_section)
+ cache_section="."
+
+ for key, name, ext in (('www_abstract_url', 'abstract','abstract'),
+ ('www_html_url', 'HTML', 'html'),
+ ('www_pdf_url', 'PDF', 'pdf'),
+ ('www_ps_url', 'PS', 'ps'),
+ ('www_txt_url', 'TXT', 'txt'),
+ ('www_ps_gz_url', 'gzipped&nbsp;PS','ps.gz')
+ ):
+ if cached:
+ #XXXX the URL needs to be relative to the absolute
+ #XXXX cache path.
+ url = smartJoin(cache_path,cache_section,
+ "%s.%s"%(self.key,ext))
+ fname = smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
+ cache_section,
+ "%s.%s"%(self.key,ext))
+ if not os.path.exists(fname): continue
+ else:
+ url = self.get(key)
+ if not url: continue
+ url = unTeXescapeURL(url)
+ url = url.replace('&', '&amp;')
+ availability.append('<a href="%s">%s</a>' %(url,name))
+
+ if availability:
+ res.append([" ", "&nbsp;"][cached])
+ res.append("<span class='availability'>(")
+ if cached: res.append("Cached:&nbsp;")
+ res.append(",&nbsp;".join(availability))
+ res.append(")</span>")
+
+ res.append("<br /><span class='author'>by ")
+
+ #res.append("\n<!-- %r -->\n" % self.parsedAuthor)
+ htmlAuthors = [ a.htmlizeWithLink() for a in self.parsedAuthor ]
+
+ if len(htmlAuthors) == 1:
+ res.append(htmlAuthors[0])
+ elif len(htmlAuthors) == 2:
+ res.append(" and ".join(htmlAuthors))
+ else:
+ res.append(", ".join(htmlAuthors[:-1]))
+ res.append(", and ")
+ res.append(htmlAuthors[-1])
+
+ if res[-1][-1] != '.':
+ res.append(".")
+ res.append("</span><br />\n")
+ res.append(self.biblio_to_html())
+ res.append("<a href='#%s'>&middot;</a>"%url_untranslate(self.key))
+ res.append("</p>")
+
+ if self.get('www_remarks'):
+ res.append("<p class='remarks'>%s</p>"%htmlize(
+ self['www_remarks']))
+
+ if imp or draft:
+ res.append("</div>")
+ res.append("</li>\n\n")
+
+ return "".join(res)
+
+
+class ParsedAuthor:
+ """The parsed name of an author.
+
+ Eddie deserves credit for this incredibly hairy business.
+ """
+ def __init__(self, first, von, last, jr):
+ self.first = first
+ self.von = von
+ self.last = last
+ self.jr = jr
+ self.collapsable = 1
+
+ self.html = htmlize(str(self))
+ self.txt = txtize(str(self))
+
+ s = self.html
+ for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
+ if pat.search(s):
+ self.collapsable = 0
+ break
+
+ def __eq__(self, o):
+ return ((self.first == o.first) and
+ (self.last == o.last) and
+ (self.von == o.von) and
+ (self.jr == o.jr))
+
+ def __hash__(self):
+ return hash(repr(self))
+
+ def collapsesTo(self, o):
+ """Return true iff 'o' could be a more canonical version of this author
+ """
+ if not self.collapsable or not o.collapsable:
+ return self
+
+ if self.last != o.last or self.von != o.von or self.jr != o.jr:
+ return self
+ if not self.first:
+ return o
+
+ if len(self.first) == len(o.first):
+ n = []
+ for a,b in zip(self.first, o.first):
+ if a == b:
+ n.append(a)
+ elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+ n.append(b)
+ elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+ n.append(a)
+ else:
+ return self
+ if n == self.first:
+ return self
+ elif n == o.first:
+ return o
+ else:
+ return self
+ else:
+ realname = max([len(n) for n in self.first+o.first])>2
+ if not realname:
+ return self
+
+ if len(self.first) < len(o.first):
+ short = self.first; long = o.first
+ else:
+ short = o.first; long = self.first
+
+ initials_s = "".join([n[0] for n in short])
+ initials_l = "".join([n[0] for n in long])
+ idx = initials_l.find(initials_s)
+ if idx < 0:
+ return self
+ n = long[:idx]
+ for i in range(idx, idx+len(short)):
+ a = long[i]; b = short[i-idx]
+ if a == b:
+ n.append(a)
+ elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+ n.append(b)
+ elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+ n.append(a)
+ else:
+ return self
+ n += long[idx+len(short):]
+
+ if n == self.first:
+ return self
+ elif n == o.first:
+ return o
+ else:
+ return self
+
+ def __repr__(self):
+ return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
+ self.last,self.jr)
+ def __str__(self):
+ a = " ".join(self.first+self.von+self.last)
+ if self.jr:
+ return "%s, %s" % (a,self.jr)
+ return a
+
+ def getHomepage(self):
+ s = self.html
+ for pat, url in config.AUTHOR_RE_LIST:
+ if pat.search(s):
+ return url
+ return None
+
+ def getSortingName(self):
+ """Return a representation of this author's name in von-last-first-jr
+ order, unless overridden by ALPH """
+ s = self.html
+ for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
+ if pat.search(s):
+ return v
+
+ return txtize(" ".join(self.von+self.last+self.first+self.jr))
+
+ def getSectionName(self):
+ """Return a HTML representation of this author's name in
+ last, first von, jr order"""
+ secname = " ".join(self.last)
+ more = self.first+self.von
+ if more:
+ secname += ", "+" ".join(more)
+ if self.jr:
+ secname += ", "+" ".join(self.jr)
+ secname = htmlize(secname)
+ return secname
+
+ def htmlizeWithLink(self):
+ a = self.html
+ u = self.getHomepage()
+ if u:
+ return "<a href='%s'>%s</a>"%(u,a)
+ else:
+ return a
+
+
+def parseAuthor(s):
+ try:
+ return _parseAuthor(s)
+ except:
+ print >>sys.stderr, "Internal error while parsing author %r"%s
+ raise
+
+def _parseAuthor(s):
+ """Take an author string and return a list of ParsedAuthor."""
+ items = []
+
+ s = s.strip()
+ while s:
+ s = s.strip()
+ bracelevel = 0
+ for i in xrange(len(s)):
+ if s[i] == '{':
+ bracelevel += 1
+ elif s[i] == '}':
+ bracelevel -= 1
+ elif bracelevel <= 0 and s[i] in " \t\n,":
+ break
+ if i+1 == len(s):
+ items.append(s)
+ else:
+ items.append(s[0:i])
+ if (s[i] == ','):
+ items.append(',')
+ s = s[i+1:]
+
+ authors = [[]]
+ for item in items:
+ if item == 'and':
+ authors.append([])
+ else:
+ authors[-1].append(item)
+
+ parsedAuthors = []
+ # Split into first, von, last, jr
+ for author in authors:
+ commas = 0
+ fvl = []
+ vl = []
+ f = []
+ v = []
+ l = []
+ j = []
+ cur = fvl
+ for item in author:
+ if item == ',':
+ if commas == 0:
+ vl = fvl
+ fvl = []
+ cur = f
+ else:
+ j.extend(f)
+ cur = f = []
+ commas += 1
+ else:
+ cur.append(item)
+
+ if commas == 0:
+ split_von(f,v,l,fvl)
+ else:
+ f_tmp = []
+ split_von(f_tmp,v,l,vl)
+
+ parsedAuthors.append(ParsedAuthor(f,v,l,j))
+
+ return parsedAuthors
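The trickiest code that moved is ParsedAuthor.collapsesTo, which merges
initials with fuller name forms. A sketch using the examples from the
disabled debug block in buildAuthorTable (entry.py needs the same config
module the real code imports):

    from entry import parseAuthor

    full = parseAuthor("Paul F. Syverson")[0]
    abbr = parseAuthor("Paul Syverson")[0]
    # collapsesTo returns the more canonical ParsedAuthor, not a boolean:
    print abbr.collapsesTo(full)   # Paul F. Syverson
    print full.collapsesTo(abbr)   # Paul F. Syverson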
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..4d4b583
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,118 @@
+import re
+import os
+
+ALLCHARS = "".join(map(chr,range(256)))
+RE_LONE_AMP = re.compile(r'&([^a-z0-9])')
+RE_LONE_I = re.compile(r'\\i([^a-z0-9])')
+RE_ACCENT = re.compile(r'\\([\'`~^"c])([^{]|{.})')
+RE_LIGATURE = re.compile(r'\\(AE|ae|OE|oe|AA|aa|O|o|ss)([^a-z0-9])')
+ACCENT_MAP = { "'" : 'acute',
+ "`" : 'grave',
+ "~" : 'tilde',
+ "^" : 'circ',
+ '"' : 'uml',
+ "c" : 'cedil',
+ }
+
+UNICODE_MAP = { '&nacute;' : '&#x0144;', }
+HTML_LIGATURE_MAP = {
+ 'AE' : '&AElig;',
+ 'ae' : '&aelig;',
+ 'OE' : '&OElig;',
+ 'oe' : '&oelig;',
+ 'AA' : '&Aring;',
+ 'aa' : '&aring;',
+ 'O' : '&Oslash;',
+ 'o' : '&oslash;',
+ 'ss' : '&szlig;',
+ }
+RE_TEX_CMD = re.compile(r"(?:\\[a-zA-Z@]+|\\.)")
+RE_PAGE_SPAN = re.compile(r"(\d)--(\d)")
+
+def url_untranslate(s):
+ """Change a BibTeX key into a string suitable for use in a URL."""
+ s = re.sub(r'([%<>`#, &_\';])', lambda m: "_%02x" % ord(m.group(1)), s)
+ s = s.replace("/", ":")
+ return s
+
+def txtize(s):
+ """Turn a TeX string into decnent plaintext."""
+ s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
+ s = RE_ACCENT.sub(lambda m: "%s" % m.group(2), s)
+ s = RE_LIGATURE.sub(lambda m: "%s%s"%m.groups(), s)
+ s = RE_TEX_CMD.sub("", s)
+ s = s.translate(ALLCHARS, "{}")
+ return s
+
+def unTeXescapeURL(s):
+ """Turn a URL as formatted in TeX into a real URL."""
+ s = s.replace("\\_", "_")
+ s = s.replace("\\-", "")
+ s = s.replace("\{}", "")
+ s = s.replace("{}", "")
+ return s
+
+def TeXescapeURL(s):
+ """Escape a URL for use in TeX"""
+ s = s.replace("_", "\\_")
+ s = s.replace("~", "\{}~")
+ return s
+
+def _unaccent(m):
+ accent,char = m.groups()
+ if char[0] == '{':
+ char = char[1]
+ accented = "&%s%s;" % (char, ACCENT_MAP[accent])
+ return UNICODE_MAP.get(accented, accented)
+
+def _unlig_html(m):
+ return "%s%s"%(HTML_LIGATURE_MAP[m.group(1)],m.group(2))
+
+def htmlize(s):
+ """Turn a TeX string into good-looking HTML."""
+ s = RE_LONE_AMP.sub(lambda m: "&amp;%s" % m.group(1), s)
+ s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
+ s = RE_ACCENT.sub(_unaccent, s)
+ s = unTeXescapeURL(s)
+    s = RE_LIGATURE.sub(_unlig_html, s)
+ s = RE_TEX_CMD.sub("", s)
+ s = s.translate(ALLCHARS, "{}")
+ s = RE_PAGE_SPAN.sub(lambda m: "%s-%s"%(m.groups()), s)
+ s = s.replace("---", "&mdash;");
+ s = s.replace("--", "&ndash;");
+ return s
+
+def smartJoin(*lst):
+ """Equivalent to os.path.join, but handle"." and ".." entries a bit better.
+ """
+ lst = [item for item in lst if item != "."]
+ idx = 0
+ while idx < len(lst):
+ if idx > 0 and lst[idx] == "..":
+ del lst[idx]
+ else:
+ idx += 1
+ return os.path.join(*lst)
+
+def _split(s,w=79,indent=8):
+ r = []
+ s = re.sub(r"\s+", " ", s)
+ first = 1
+ indentation = ""
+ while len(s) > w:
+ for i in xrange(w-1, 20, -1):
+ if s[i] == ' ':
+ r.append(indentation+s[:i])
+ s = s[i+1:]
+ break
+ else:
+ r.append(indentation+s.strip())
+ s = ""
+ if first:
+ first = 0
+ w -= indent
+ indentation = " "*indent
+ if (s):
+ r.append(indentation+s)
+ r.append("")
+ return "\n".join(r)