Start cleaning: PEP8 and split the BibTeX.py monster

author: Thibaut Horel <thibaut.horel@gmail.com> 2016-02-04 19:46:04 -0500
committer: Thibaut Horel <thibaut.horel@gmail.com> 2016-02-04 19:46:04 -0500
commit: 871c61c6b4351d4a9dd78ba1d70d6e1af8ffe1e7 (patch)
tree: 99bce3e74cbcff075dcb6bceacd0f2e1133bef4d /entry.py
parent: fd20589a448cd19d036f18cabb1663c33a24375d (diff)
download: anonbib-871c61c6b4351d4a9dd78ba1d70d6e1af8ffe1e7.tar.gz
1 files changed, 653 insertions, 0 deletions
diff --git a/entry.py b/entry.py
new file mode 100644
index 0000000..9846e32
--- /dev/null
+++ b/entry.py
@@ -0,0 +1,653 @@
+import rank
+import sys
+import re
+import config
+import os
+from utils import htmlize, txtize, url_untranslate, unTeXescapeURL, smartJoin,\
+    _split
+
+# Fields that we only care about for making web pages (BibTeX doesn't
+# recognize them.)
+WWW_FIELDS = ['www_section', 'www_important', 'www_remarks',
+              'www_abstract_url', 'www_html_url', 'www_pdf_url', 'www_ps_url',
+              'www_txt_url', 'www_ps_gz_url', 'www_amazon_url',
+              'www_excerpt_url', 'www_publisher_url',
+              'www_cache_section', 'www_tags']
+
+def author_url(author):
+    """Given an author's name, return a URL for his/her homepage."""
+    for pat, url in config.AUTHOR_RE_LIST:
+        if pat.search(author):
+            return url
+    return None
+ALLCHARS = "".join(map(chr,range(256)))
+PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127)))
+LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
+SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+               "abcdefghijklmnopqrstuvwxyz"
+               "@")
+RE_ESCAPED = re.compile(r'\\.')
+PROCEEDINGS_RE = re.compile(
+                        r'((?:proceedings|workshop record) of(?: the)? )(.*)',
+                        re.I)
+
+def split_von(f,v,l,x):
+    in_von = 0
+    while x:
+        tt = t = x[0]
+        del x[0]
+        if tt[:2] == '{\\':
+            tt = tt.translate(ALLCHARS, SV_DELCHARS)
+            tt = RE_ESCAPED.sub("", tt)
+            tt = tt.translate(ALLCHARS, "{}")
+        if tt.translate(ALLCHARS, LC_CHARS) == "":
+            v.append(t)
+            in_von = 1
+        elif in_von and f is not None:
+            l.append(t)
+            l.extend(x)
+            return
+        else:
+            f.append(t)
+    if not in_von:
+        l.append(f[-1])
+        del f[-1]
+
+def buildAuthorTable(entries):
+    """Given a list of BibTeXEntry, return a map from parsed author name to
+       parsed canonical name.
+    """
+    authorsByLast = {}
+    for e in entries:
+        for a in e.parsedAuthor:
+            authorsByLast.setdefault(tuple(a.last), []).append(a)
+    # map from author to collapsed author.
+    result = {}
+    for k,v in config.COLLAPSE_AUTHORS.items():
+        a = parseAuthor(k)[0]
+        c = parseAuthor(v)[0]
+        result[c] = c
+        result[a] = c
+
+    for e in entries:
+        for author in e.parsedAuthor:
+            if result.has_key(author):
+                continue
+
+            c = author
+            for a in authorsByLast[tuple(author.last)]:
+                if a is author:
+                    continue
+                c = c.collapsesTo(a)
+            result[author] = c
+
+    if 0:
+        for a,c in result.items():
+            if a != c:
+                print "Collapsing authors: %s => %s" % (a,c)
+    if 0:
+        print parseAuthor("Franz Kaashoek")[0].collapsesTo(
+            parseAuthor("M. Franz Kaashoek")[0])
+        print parseAuthor("Paul F. Syverson")[0].collapsesTo(
+            parseAuthor("Paul Syverson")[0])
+        print parseAuthor("Paul Syverson")[0].collapsesTo(
+            parseAuthor("Paul F. Syverson")[0])
+
+    return result
+
+# List of fields that appear when we display the entries as BibTeX.
+DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle',
+'school', 'institution', 'organization', 'volume', 'number', 'year',
+'month', 'address', 'location', 'chapter', 'edition', 'pages', 'editor',
+'howpublished', 'key', 'publisher', 'type', 'note', 'series' ]
+
+class BibTeXEntry:
+    """A single BibTeX entry."""
+    def __init__(self, type, key, entries):
+        self.type = type  # What kind of entry is it?  (@book,@injournal,etc)
+        self.key = key # What key does it have?
+        self.entries = entries # Map from key to value.
+        self.entryLine = 0 # Defined on this line number
+    def get(self, k, v=None):
+        return self.entries.get(k,v)
+    def has_key(self, k):
+        return self.entries.has_key(k)
+    def __getitem__(self, k):
+        return self.entries[k]
+    def __setitem__(self, k, v):
+        self.entries[k] = v
+    def __str__(self):
+        return self.format(70,1)
+    def getURL(self):
+        """Return the best URL to use for this paper, or None."""
+        best = None
+        for field in ['www_pdf_url', 'www_ps_gz_url', 'www_ps_url',
+                      'www_html_url', 'www_txt_url', ]:
+            u = self.get(field)
+            if u:
+                if not best:
+                    best = u
+                elif (best.startswith("http://citeseer.nj.nec.com/")
+                      and not u.startswith("http://citeseer.nj.nec.com/")):
+                    best = u
+        return best
+
+    def format(self, width=70, indent=8, v=0, invStrings={}):
+        """Format this entry as BibTeX."""
+        d = ["@%s{%s,\n" % (self.type, self.key)]
+        if v:
+            df = DISPLAYED_FIELDS[:]
+            for k in self.entries.keys():
+                if k not in df:
+                    df.append(k)
+        else:
+            df = DISPLAYED_FIELDS
+        for f in df:
+            if not self.entries.has_key(f):
+                continue
+            v = self.entries[f]
+            if v.startswith("<span class='bad'>"):
+                d.append("%%%%% ERROR: Missing field\n")
+                d.append("%% %s = {?????},\n"%f)
+                continue
+            np = v.translate(ALLCHARS, PRINTINGCHARS)
+            if np:
+                d.append("%%%%% "+("ERROR: Non-ASCII characters: '%r'\n"%np))
+            d.append("  ")
+            v = v.replace("&", "&amp;")
+            if invStrings.has_key(v):
+                s = "%s = %s,\n" %(f, invStrings[v])
+            else:
+                s = "%s = {%s},\n" % (f, v)
+            d.append(_split(s,width,indent))
+        d.append("}\n")
+        return "".join(d)
+    def resolve(self):
+        """Handle post-processing for this entry"""
+        a = self.get('author')
+        if a:
+            self.parsedAuthor = parseAuthor(a)
+            #print a
+            #print "   => ",repr(self.parsedAuthor)
+        else:
+            self.parsedAuthor = None
+
+    def isImportant(self):
+        """Return 1 iff this entry is marked as important"""
+        imp = self.get("www_important")
+        if imp and imp.strip().lower() not in ("no", "false", "0"):
+            return 1
+        return 0
+
+    def check(self):
+        """Print any errors for this entry, and return true if there were
+           none."""
+        errs = self._check()
+        for e in errs:
+            print e
+        return not errs
+
+    def _check(self):
+        errs = []
+        if self.type == 'inproceedings':
+            fields = 'booktitle', 'year'
+        elif self.type == 'incollection':
+            fields = 'booktitle', 'year'
+        elif self.type == 'proceedings':
+            fields = 'booktitle', 'editor'
+        elif self.type == 'article':
+            fields = 'journal', 'year'
+        elif self.type == 'techreport':
+            fields = 'institution',
+        elif self.type == 'misc':
+            fields = 'howpublished',
+        elif self.type in ('mastersthesis', 'phdthesis'):
+            fields = ()
+        else:
+            fields = ()
+            errs.append("ERROR: odd type %s"%self.type)
+        if self.type != 'proceedings':
+            fields += 'title', 'author', 'www_section', 'year'
+
+        for field in fields:
+            if self.get(field) is None or \
+                   self.get(field).startswith("<span class='bad'>"):
+                errs.append("ERROR: %s has no %s field" % (self.key, field))
+                self.entries[field] = "<span class='bad'>%s:??</span>"%field
+
+        if self.type == 'inproceedings':
+            if self.get("booktitle"):
+                if not self['booktitle'].startswith("Proceedings of") and \
+                   not self['booktitle'].startswith("{Proceedings of"):
+                    errs.append("ERROR: %s's booktitle (%r) doesn't start with 'Proceedings of'" % (self.key, self['booktitle']))
+
+        if self.has_key("pages") and not re.search(r'\d+--\d+', self['pages']):
+            errs.append("ERROR: Misformed pages in %s"%self.key)
+
+        if self.type == 'proceedings':
+            if self.get('title'):
+                errs.append("ERROR: %s is a proceedings: it should have a booktitle, not a title." % self.key)
+
+        for field, value in self.entries.items():
+            if value.translate(ALLCHARS, PRINTINGCHARS):
+                errs.append("ERROR: %s.%s has non-ASCII characters"%(
+                    self.key, field))
+            if field.startswith("www_") and field not in WWW_FIELDS:
+                errs.append("ERROR: unknown www field %s"% field)
+            if value.strip()[-1:] == '.' and \
+                field not in ("notes", "www_remarks", "author"):
+                errs.append("ERROR: %s.%s has an extraneous period"%(self.key,
+                            field))
+        return errs
+
+    def biblio_to_html(self):
+        """Return the HTML for the citation portion of entry."""
+        if self.type in ('inproceedings', 'incollection'):
+            booktitle = self['booktitle']
+            bookurl = self.get('bookurl')
+            if bookurl:
+                m = PROCEEDINGS_RE.match(booktitle)
+                if m:
+                    res = ["In the ", m.group(1),
+                           '<a href="%s">' % bookurl, m.group(2), "</a>"]
+                else:
+                    res = ['In the <a href="%s">%s</a>' % (bookurl, booktitle)]
+            else:
+                res = ["In the ", booktitle]
+
+            if self.get("edition"):
+                res.append(",")
+                res.append(self['edition'])
+            if self.get("location"):
+                res.append(", ")
+                res.append(self['location'])
+            elif self.get("address"):
+                res.append(", ")
+                res.append(self['address'])
+            res.append(", %s %s" % (self.get('month', ""), self['year']))
+            if not self.get('pages'):
+                pass
+            elif "-" in self['pages']:
+                res.append(", pages&nbsp;%s" % self['pages'])
+            else:
+                res.append(", page&nbsp;%s" % self['pages'])
+        elif self.type == 'article':
+            res = ["In "]
+            if self.get('journalurl'):
+                res.append('<a href="%s">%s</a>' % (self['journalurl'],
+                                                    self['journal']))
+            else:
+                res.append(self['journal'])
+            if self.get('volume'):
+                res.append(" <b>%s</b>" % self['volume'])
+            if self.get('number'):
+                res.append("(%s)" % self['number'])
+            res.append(", %s %s" % (self.get('month', ""), self['year']))
+            if not self.get('pages'):
+                pass
+            elif "-" in self['pages']:
+                res.append(", pages&nbsp;%s" % self['pages'])
+            else:
+                res.append(", page&nbsp;%s" % self['pages'])
+        elif self.type == 'techreport':
+            res = ["%s %s %s" % (self['institution'],
+                                 self.get('type', 'technical report'),
+                                 self.get('number', ""))]
+            if self.get('month') or self.get('year'):
+                res.append(", %s %s" % (self.get('month', ''),
+                                        self.get('year', '')))
+        elif self.type == 'mastersthesis' or self.type == 'phdthesis':
+            if self.get('type'):
+                res = [self['type']]
+            elif self.type == 'mastersthesis':
+                res = ["Masters's thesis"]
+            else:
+                res = ["Ph.D. thesis"]
+            if self.get('school'):
+                res.append(", %s" % (self['school']))
+            if self.get('month') or self.get('year'):
+                res.append(", %s %s" % (self.get('month', ''),
+                                        self.get('year', '')))
+        elif self.type == 'book':
+            res = [self['publisher']]
+            if self.get('year'):
+                res.append(" ")
+                res.append(self.get('year'))
+                # res.append(", %s"%(self.get('year')))
+            if self.get('series'):
+                res.append(",")
+                res.append(self['series'])
+        elif self.type == 'misc':
+            res = [self['howpublished']]
+            if self.get('month') or self.get('year'):
+                res.append(", %s %s" % (self.get('month', ''),
+                                        self.get('year', '')))
+            if not self.get('pages'):
+                pass
+            elif "-" in self['pages']:
+                res.append(", pages&nbsp;%s" % self['pages'])
+            else:
+                res.append(", page&nbsp;%s" % self['pages'])
+        else:
+            res = ["&lt;Odd type %s&gt;" % self.type]
+
+        res[0:0] = ["<span class='biblio'>"]
+        res.append(".</span>")
+
+        bibtexurl = "./bibtex.html#%s" % url_untranslate(self.key)
+        res.append((" <span class='availability'>"
+                    "(<a href='%s'>BibTeX&nbsp;entry</a>)"
+                    "</span>") % bibtexurl)
+        return htmlize("".join(res))
+
+    def to_html(self, cache_path="./cache", base_url="."):
+        """Return the HTML for this entry."""
+        imp = self.isImportant()
+        draft = self.get('year') == 'forthcoming'
+        if imp:
+            res = ["<li><div class='impEntry'><p class='impEntry'>"]
+        elif draft:
+            res = ["<li><div class='draftEntry'><p class='draftEntry'>"]
+        else:
+            res = ["<li><p class='entry'>"]
+
+        if imp or not draft:
+            # Add a picture of the rank
+            # Only if year is known or paper important!
+            r = rank.get_rank_html(self['title'], self.get('year'),
+                                   update=False, base_url=base_url)
+            if r is not None:
+                res.append(r)
+
+        res.append("<span class='title'><a name='%s'>%s</a></span>"%(
+            url_untranslate(self.key),htmlize(self['title'])))
+
+        for cached in 0,1:
+            availability = []
+            if not cached:
+                for which in [ "amazon", "excerpt", "publisher" ]:
+                    key = "www_%s_url"%which
+                    if self.get(key):
+                        url=self[key]
+                        url = unTeXescapeURL(url)
+                        availability.append('<a href="%s">%s</a>' %(url,which))
+
+            cache_section = self.get('www_cache_section', ".")
+            if cache_section not in config.CACHE_SECTIONS:
+                if cache_section != ".":
+                    print >>sys.stderr, "Unrecognized cache section %s"%(
+                        cache_section)
+                    cache_section="."
+
+            for key, name, ext in (('www_abstract_url', 'abstract','abstract'),
+                                   ('www_html_url', 'HTML', 'html'),
+                                   ('www_pdf_url', 'PDF', 'pdf'),
+                                   ('www_ps_url', 'PS', 'ps'),
+                                   ('www_txt_url', 'TXT', 'txt'),
+                                   ('www_ps_gz_url', 'gzipped&nbsp;PS','ps.gz')
+                                   ):
+                if cached:
+                    #XXXX the URL needs to be relative to the absolute
+                    #XXXX cache path.
+                    url = smartJoin(cache_path,cache_section,
+                                    "%s.%s"%(self.key,ext))
+                    fname = smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
+                                      cache_section,
+                                      "%s.%s"%(self.key,ext))
+                    if not os.path.exists(fname): continue
+                else:
+                    url = self.get(key)
+                    if not url: continue
+                url = unTeXescapeURL(url)
+                url = url.replace('&', '&amp;')
+                availability.append('<a href="%s">%s</a>' %(url,name))
+
+            if availability:
+                res.append([" ", "&nbsp;"][cached])
+                res.append("<span class='availability'>(")
+                if cached: res.append("Cached:&nbsp;")
+                res.append(",&nbsp;".join(availability))
+                res.append(")</span>")
+
+        res.append("<br /><span class='author'>by ")
+
+        #res.append("\n<!-- %r -->\n" % self.parsedAuthor)
+        htmlAuthors = [ a.htmlizeWithLink() for a in self.parsedAuthor ]
+
+        if len(htmlAuthors) == 1:
+            res.append(htmlAuthors[0])
+        elif len(htmlAuthors) == 2:
+            res.append(" and ".join(htmlAuthors))
+        else:
+            res.append(", ".join(htmlAuthors[:-1]))
+            res.append(", and ")
+            res.append(htmlAuthors[-1])
+
+        if res[-1][-1] != '.':
+            res.append(".")
+        res.append("</span><br />\n")
+        res.append(self.biblio_to_html())
+        res.append("<a href='#%s'>&middot;</a>"%url_untranslate(self.key))
+        res.append("</p>")
+
+        if self.get('www_remarks'):
+            res.append("<p class='remarks'>%s</p>"%htmlize(
+                self['www_remarks']))
+
+        if imp or draft:
+            res.append("</div>")
+        res.append("</li>\n\n")
+
+        return "".join(res)
+
+
+class ParsedAuthor:
+    """The parsed name of an author.
+
+       Eddie deserves credit for this incredibly hairy business.
+    """
+    def __init__(self, first, von, last, jr):
+        self.first = first
+        self.von = von
+        self.last = last
+        self.jr = jr
+        self.collapsable = 1
+
+        self.html = htmlize(str(self))
+        self.txt = txtize(str(self))
+
+        s = self.html
+        for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
+            if pat.search(s):
+                self.collapsable = 0
+                break
+
+    def __eq__(self, o):
+        return ((self.first == o.first) and
+                (self.last  == o.last) and
+                (self.von   == o.von) and
+                (self.jr    == o.jr))
+
+    def __hash__(self):
+        return hash(repr(self))
+
+    def collapsesTo(self, o):
+        """Return true iff 'o' could be a more canonical version of this author
+        """
+        if not self.collapsable or not o.collapsable:
+            return self
+
+        if self.last != o.last or self.von != o.von or self.jr != o.jr:
+            return self
+        if not self.first:
+            return o
+
+        if len(self.first) == len(o.first):
+            n = []
+            for a,b in zip(self.first, o.first):
+                if a == b:
+                    n.append(a)
+                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+                    n.append(b)
+                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+                    n.append(a)
+                else:
+                    return self
+            if n == self.first:
+                return self
+            elif n == o.first:
+                return o
+            else:
+                return self
+        else:
+            realname = max([len(n) for n in self.first+o.first])>2
+            if not realname:
+                return self
+
+            if len(self.first) < len(o.first):
+                short = self.first; long = o.first
+            else:
+                short = o.first; long = self.first
+
+            initials_s = "".join([n[0] for n in short])
+            initials_l = "".join([n[0] for n in long])
+            idx = initials_l.find(initials_s)
+            if idx < 0:
+                return self
+            n = long[:idx]
+            for i in range(idx, idx+len(short)):
+                a = long[i]; b = short[i-idx]
+                if a == b:
+                    n.append(a)
+                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+                    n.append(b)
+                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+                    n.append(a)
+                else:
+                    return self
+            n += long[idx+len(short):]
+
+            if n == self.first:
+                return self
+            elif n == o.first:
+                return o
+            else:
+                return self
+
+    def __repr__(self):
+        return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
+                                            self.last,self.jr)
+    def __str__(self):
+        a = " ".join(self.first+self.von+self.last)
+        if self.jr:
+            return "%s, %s" % (a,self.jr)
+        return a
+
+    def getHomepage(self):
+        s = self.html
+        for pat, url in config.AUTHOR_RE_LIST:
+            if pat.search(s):
+                return url
+        return None
+
+    def getSortingName(self):
+        """Return a representation of this author's name in von-last-first-jr
+           order, unless overridden by ALPH """
+        s = self.html
+        for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
+            if pat.search(s):
+                return v
+
+        return txtize(" ".join(self.von+self.last+self.first+self.jr))
+
+    def getSectionName(self):
+        """Return a HTML representation of this author's name in
+           last, first von, jr order"""
+        secname = " ".join(self.last)
+        more = self.first+self.von
+        if more:
+            secname += ", "+" ".join(more)
+        if self.jr:
+            secname += ", "+" ".join(self.jr)
+        secname = htmlize(secname)
+        return secname
+
+    def htmlizeWithLink(self):
+        a = self.html
+        u = self.getHomepage()
+        if u:
+            return "<a href='%s'>%s</a>"%(u,a)
+        else:
+            return a
+
+
+def parseAuthor(s):
+    try:
+        return _parseAuthor(s)
+    except:
+        print >>sys.stderr, "Internal error while parsing author %r"%s
+        raise
+
+def _parseAuthor(s):
+    """Take an author string and return a list of ParsedAuthor."""
+    items = []
+
+    s = s.strip()
+    while s:
+        s = s.strip()
+        bracelevel = 0
+        for i in xrange(len(s)):
+            if s[i] == '{':
+                bracelevel += 1
+            elif s[i] == '}':
+                bracelevel -= 1
+            elif bracelevel <= 0 and s[i] in " \t\n,":
+                break
+        if i+1 == len(s):
+            items.append(s)
+        else:
+            items.append(s[0:i])
+        if (s[i] == ','):
+            items.append(',')
+        s = s[i+1:]
+
+    authors = [[]]
+    for item in items:
+        if item == 'and':
+            authors.append([])
+        else:
+            authors[-1].append(item)
+
+    parsedAuthors = []
+    # Split into first, von, last, jr
+    for author in authors:
+        commas = 0
+        fvl = []
+        vl = []
+        f = []
+        v = []
+        l = []
+        j = []
+        cur = fvl
+        for item in author:
+            if item == ',':
+                if commas == 0:
+                    vl = fvl
+                    fvl = []
+                    cur = f
+                else:
+                    j.extend(f)
+                    cur = f = []
+                commas += 1
+            else:
+                cur.append(item)
+
+        if commas == 0:
+            split_von(f,v,l,fvl)
+        else:
+            f_tmp = []
+            split_von(f_tmp,v,l,vl)
+
+        parsedAuthors.append(ParsedAuthor(f,v,l,j))
+
+    return parsedAuthors
author	Thibaut Horel <thibaut.horel@gmail.com>	2016-02-04 19:46:04 -0500
committer	Thibaut Horel <thibaut.horel@gmail.com>	2016-02-04 19:46:04 -0500
commit	871c61c6b4351d4a9dd78ba1d70d6e1af8ffe1e7 (patch)
tree	99bce3e74cbcff075dcb6bceacd0f2e1133bef4d /entry.py
parent	fd20589a448cd19d036f18cabb1663c33a24375d (diff)
download	anonbib-871c61c6b4351d4a9dd78ba1d70d6e1af8ffe1e7.tar.gz