diff options
| author | Thibaut Horel <thibaut.horel@gmail.com> | 2016-02-04 19:46:04 -0500 |
|---|---|---|
| committer | Thibaut Horel <thibaut.horel@gmail.com> | 2016-02-04 19:46:04 -0500 |
| commit | 871c61c6b4351d4a9dd78ba1d70d6e1af8ffe1e7 (patch) | |
| tree | 99bce3e74cbcff075dcb6bceacd0f2e1133bef4d /entry.py | |
| parent | fd20589a448cd19d036f18cabb1663c33a24375d (diff) | |
| download | anonbib-871c61c6b4351d4a9dd78ba1d70d6e1af8ffe1e7.tar.gz | |
Start cleaning: PEP8 and split the BibTeX.py monster
Diffstat (limited to 'entry.py')
| -rw-r--r-- | entry.py | 653 |
1 files changed, 653 insertions, 0 deletions
# Contents of entry.py as added by:
#   diff --git a/entry.py b/entry.py (new file mode 100644, index 0000000..9846e32)
"""BibTeX entries and author names for the bibliography web pages.

This module defines:

* ``BibTeXEntry`` -- one parsed BibTeX record, with validation and HTML
  rendering.
* ``ParsedAuthor`` -- an author name split into first/von/last/jr parts.
* ``parseAuthor`` / ``buildAuthorTable`` -- author-string parsing and the
  mapping from each author to a canonical spelling.

The code is written so that it is syntactically valid on both Python 2 and
Python 3 (no print statements, ``has_key``, ``xrange`` or two-argument
``str.translate``).
"""
import os
import re
import sys

import config
import rank
from utils import (htmlize, txtize, url_untranslate, unTeXescapeURL,
                   smartJoin, _split)

# Fields that we only care about for making web pages (BibTeX doesn't
# recognize them.)
WWW_FIELDS = ['www_section', 'www_important', 'www_remarks',
              'www_abstract_url', 'www_html_url', 'www_pdf_url', 'www_ps_url',
              'www_txt_url', 'www_ps_gz_url', 'www_amazon_url',
              'www_excerpt_url', 'www_publisher_url',
              'www_cache_section', 'www_tags']


def author_url(author):
    """Given an author's name, return a URL for his/her homepage.

    Returns the URL paired with the first pattern in config.AUTHOR_RE_LIST
    that matches `author`, or None when no pattern matches.
    """
    for pat, url in config.AUTHOR_RE_LIST:
        if pat.search(author):
            return url
    return None


# ALLCHARS is no longer used inside this module; it is kept because other
# modules may import it.
ALLCHARS = "".join(map(chr, range(256)))
PRINTINGCHARS = "\t\n\r" + "".join(map(chr, range(32, 127)))
LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
               "abcdefghijklmnopqrstuvwxyz"
               "@")
RE_ESCAPED = re.compile(r'\\.')
PROCEEDINGS_RE = re.compile(
    r'((?:proceedings|workshop record) of(?: the)? )(.*)',
    re.I)

# Prefix of URLs that getURL() treats as a last resort.
_CITESEER_PREFIX = "http://citeseer.nj.nec.com/"


def _delete_chars(s, chars):
    """Return `s` with every character that occurs in `chars` removed.

    Replacement for the Python 2 only ``s.translate(ALLCHARS, chars)``.
    """
    return "".join([c for c in s if c not in chars])


def split_von(f, v, l, x):
    """Distribute the name words in `x` over the lists `f`, `v` and `l`.

    `x` is consumed from the front.  A word that is empty once lower-case
    ASCII letters are removed counts as a "von" word and is appended to `v`.
    For a word starting with ``{\\``, all ASCII letters, ``@``, backslash
    escapes and braces are removed before that test.  Words seen before the
    first von word go to `f`; once a von word has been seen, the first
    non-von word and everything after it go to `l`.  If no von word is found,
    the final word is moved from `f` to `l`.

    All four lists are modified in place and nothing is returned.  Raises
    IndexError when `x` is empty.
    """
    in_von = 0
    while x:
        word = x.pop(0)
        stripped = word
        if stripped[:2] == '{\\':
            stripped = _delete_chars(stripped, SV_DELCHARS)
            stripped = RE_ESCAPED.sub("", stripped)
            stripped = _delete_chars(stripped, "{}")
        if _delete_chars(stripped, LC_CHARS) == "":
            v.append(word)
            in_von = 1
        elif in_von and f is not None:
            l.append(word)
            l.extend(x)
            return
        else:
            f.append(word)
    if not in_von:
        l.append(f[-1])
        del f[-1]


def buildAuthorTable(entries):
    """Given a list of BibTeXEntry, return a map from parsed author name to
       parsed canonical name.

    Pairs listed in config.COLLAPSE_AUTHORS are applied first.  Every other
    author is collapsed, via ParsedAuthor.collapsesTo, against the other
    authors that share the same last name.  Entries whose parsedAuthor is
    None are skipped.
    """
    authorsByLast = {}
    for e in entries:
        for a in e.parsedAuthor or ():
            authorsByLast.setdefault(tuple(a.last), []).append(a)

    # map from author to collapsed author.
    result = {}
    for k, v in config.COLLAPSE_AUTHORS.items():
        a = parseAuthor(k)[0]
        c = parseAuthor(v)[0]
        result[c] = c
        result[a] = c

    for e in entries:
        for author in e.parsedAuthor or ():
            if author in result:
                continue

            c = author
            for a in authorsByLast[tuple(author.last)]:
                if a is author:
                    continue
                c = c.collapsesTo(a)
            result[author] = c

    return result


# List of fields that appear when we display the entries as BibTeX.
DISPLAYED_FIELDS = ['title', 'author', 'journal', 'booktitle',
                    'school', 'institution', 'organization', 'volume',
                    'number', 'year', 'month', 'address', 'location',
                    'chapter', 'edition', 'pages', 'editor',
                    'howpublished', 'key', 'publisher', 'type', 'note',
                    'series']


class BibTeXEntry:
    """A single BibTeX entry."""

    def __init__(self, type, key, entries):
        self.type = type  # What kind of entry is it?  (@book,@injournal,etc)
        self.key = key  # What key does it have?
        self.entries = entries  # Map from key to value.
        self.entryLine = 0  # Defined on this line number

    def get(self, k, v=None):
        """Return the value of field `k`, or `v` if the field is absent."""
        return self.entries.get(k, v)

    def has_key(self, k):
        """Return true iff this entry has a field named `k`."""
        return k in self.entries

    def __contains__(self, k):
        return k in self.entries

    def __getitem__(self, k):
        return self.entries[k]

    def __setitem__(self, k, v):
        self.entries[k] = v

    def __str__(self):
        return self.format(70, 1)

    def getURL(self):
        """Return the best URL to use for this paper, or None.

        The first non-empty URL field (PDF, gzipped PS, PS, HTML, TXT, in
        that order) wins, except that a citeseer.nj.nec.com URL is replaced
        by a later URL from a different host.
        """
        best = None
        for field in ['www_pdf_url', 'www_ps_gz_url', 'www_ps_url',
                      'www_html_url', 'www_txt_url', ]:
            u = self.get(field)
            if u:
                if not best:
                    best = u
                elif (best.startswith(_CITESEER_PREFIX)
                      and not u.startswith(_CITESEER_PREFIX)):
                    best = u
        return best

    def format(self, width=70, indent=8, v=0, invStrings=None):
        """Format this entry as BibTeX.

        `width` and `indent` are passed through to utils._split for each
        field line.  If `v` is true, fields outside DISPLAYED_FIELDS are
        emitted too.  `invStrings`, if given, maps a field value to a name
        that is written (unbraced) in place of the value.
        """
        if invStrings is None:
            invStrings = {}
        d = ["@%s{%s,\n" % (self.type, self.key)]
        if v:
            df = DISPLAYED_FIELDS[:]
            for k in self.entries.keys():
                if k not in df:
                    df.append(k)
        else:
            df = DISPLAYED_FIELDS
        for f in df:
            if f not in self.entries:
                continue
            value = self.entries[f]
            if value.startswith("<span class='bad'>"):
                # _check() stores this marker for required fields that are
                # missing.
                d.append("%%%%% ERROR: Missing field\n")
                d.append("%% %s = {?????},\n" % f)
                continue
            np = _delete_chars(value, PRINTINGCHARS)
            if np:
                d.append("%%%%% " +
                         ("ERROR: Non-ASCII characters: '%r'\n" % np))
            d.append(" ")
            # NOTE(review): this line previously read replace("&", "&"), a
            # no-op that looks like an HTML-decoded entity; restored as an
            # escape of '&' -- confirm against the upstream repository.
            value = value.replace("&", "&amp;")
            if value in invStrings:
                s = "%s = %s,\n" % (f, invStrings[value])
            else:
                s = "%s = {%s},\n" % (f, value)
            d.append(_split(s, width, indent))
        d.append("}\n")
        return "".join(d)

    def resolve(self):
        """Handle post-processing for this entry.

        Sets self.parsedAuthor to the parsed 'author' field, or to None when
        the entry has no (non-empty) author.
        """
        a = self.get('author')
        if a:
            self.parsedAuthor = parseAuthor(a)
        else:
            self.parsedAuthor = None

    def isImportant(self):
        """Return 1 iff this entry is marked as important"""
        imp = self.get("www_important")
        if imp and imp.strip().lower() not in ("no", "false", "0"):
            return 1
        return 0

    def check(self):
        """Print any errors for this entry, and return true if there were
           none."""
        errs = self._check()
        for e in errs:
            print(e)
        return not errs

    def _check(self):
        """Return a list of error messages for this entry.

        Side effect: every required field that is missing is set to a
        "<span class='bad'>" placeholder value in self.entries.
        """
        errs = []
        if self.type == 'inproceedings':
            fields = 'booktitle', 'year'
        elif self.type == 'incollection':
            fields = 'booktitle', 'year'
        elif self.type == 'proceedings':
            fields = 'booktitle', 'editor'
        elif self.type == 'article':
            fields = 'journal', 'year'
        elif self.type == 'techreport':
            fields = 'institution',
        elif self.type == 'misc':
            fields = 'howpublished',
        elif self.type in ('mastersthesis', 'phdthesis'):
            fields = ()
        else:
            fields = ()
            errs.append("ERROR: odd type %s" % self.type)

        required = list(fields)
        if self.type != 'proceedings':
            # Skip names that are already required so that a missing field
            # is reported only once.
            for extra in ('title', 'author', 'www_section', 'year'):
                if extra not in required:
                    required.append(extra)

        for field in required:
            value = self.get(field)
            if value is None or value.startswith("<span class='bad'>"):
                errs.append("ERROR: %s has no %s field" % (self.key, field))
                self.entries[field] = "<span class='bad'>%s:??</span>" % field

        if self.type == 'inproceedings':
            if self.get("booktitle"):
                if not self['booktitle'].startswith("Proceedings of") and \
                   not self['booktitle'].startswith("{Proceedings of"):
                    errs.append("ERROR: %s's booktitle (%r) doesn't start with 'Proceedings of'" % (self.key, self['booktitle']))

        if "pages" in self.entries and \
           not re.search(r'\d+--\d+', self['pages']):
            errs.append("ERROR: Misformed pages in %s" % self.key)

        if self.type == 'proceedings':
            if self.get('title'):
                errs.append("ERROR: %s is a proceedings: it should have a booktitle, not a title." % self.key)

        for field, value in self.entries.items():
            if _delete_chars(value, PRINTINGCHARS):
                errs.append("ERROR: %s.%s has non-ASCII characters" % (
                    self.key, field))
            if field.startswith("www_") and field not in WWW_FIELDS:
                errs.append("ERROR: unknown www field %s" % field)
            if value.strip()[-1:] == '.' and \
               field not in ("notes", "www_remarks", "author"):
                errs.append("ERROR: %s.%s has an extraneous period" % (
                    self.key, field))
        return errs

    def _pages_to_html(self):
        """Return ", pages X" / ", page X" for this entry, or "" if the
           entry has no pages field."""
        pages = self.get('pages')
        if not pages:
            return ""
        if "-" in pages:
            return ", pages %s" % pages
        return ", page %s" % pages

    def _month_year_to_html(self):
        """Return ", <month> <year>" if either field is set, else ""."""
        if self.get('month') or self.get('year'):
            return ", %s %s" % (self.get('month', ''), self.get('year', ''))
        return ""

    def biblio_to_html(self):
        """Return the HTML for the citation portion of entry."""
        if self.type in ('inproceedings', 'incollection'):
            booktitle = self['booktitle']
            bookurl = self.get('bookurl')
            if bookurl:
                m = PROCEEDINGS_RE.match(booktitle)
                if m:
                    # Link only the conference name, not the leading
                    # "Proceedings of the ".
                    res = ["In the ", m.group(1),
                           '<a href="%s">' % bookurl, m.group(2), "</a>"]
                else:
                    res = ['In the <a href="%s">%s</a>' % (bookurl, booktitle)]
            else:
                res = ["In the ", booktitle]

            if self.get("edition"):
                res.append(", ")
                res.append(self['edition'])
            if self.get("location"):
                res.append(", ")
                res.append(self['location'])
            elif self.get("address"):
                res.append(", ")
                res.append(self['address'])
            res.append(", %s %s" % (self.get('month', ""), self['year']))
            res.append(self._pages_to_html())
        elif self.type == 'article':
            res = ["In "]
            if self.get('journalurl'):
                res.append('<a href="%s">%s</a>' % (self['journalurl'],
                                                    self['journal']))
            else:
                res.append(self['journal'])
            if self.get('volume'):
                res.append(" <b>%s</b>" % self['volume'])
            if self.get('number'):
                res.append("(%s)" % self['number'])
            res.append(", %s %s" % (self.get('month', ""), self['year']))
            res.append(self._pages_to_html())
        elif self.type == 'techreport':
            res = ["%s %s %s" % (self['institution'],
                                 self.get('type', 'technical report'),
                                 self.get('number', ""))]
            res.append(self._month_year_to_html())
        elif self.type == 'mastersthesis' or self.type == 'phdthesis':
            if self.get('type'):
                res = [self['type']]
            elif self.type == 'mastersthesis':
                res = ["Master's thesis"]
            else:
                res = ["Ph.D. thesis"]
            if self.get('school'):
                res.append(", %s" % (self['school']))
            res.append(self._month_year_to_html())
        elif self.type == 'book':
            res = [self['publisher']]
            if self.get('year'):
                res.append(" ")
                res.append(self.get('year'))
            if self.get('series'):
                res.append(", ")
                res.append(self['series'])
        elif self.type == 'misc':
            res = [self['howpublished']]
            res.append(self._month_year_to_html())
            res.append(self._pages_to_html())
        else:
            res = ["<Odd type %s>" % self.type]

        res[0:0] = ["<span class='biblio'>"]
        res.append(".</span>")

        bibtexurl = "./bibtex.html#%s" % url_untranslate(self.key)
        res.append((" <span class='availability'>"
                    "(<a href='%s'>BibTeX entry</a>)"
                    "</span>") % bibtexurl)
        return htmlize("".join(res))

    def to_html(self, cache_path="./cache", base_url="."):
        """Return the HTML for this entry.

        `cache_path` is the URL prefix used for links to cached copies and
        `base_url` is passed on to rank.get_rank_html.  resolve() must have
        been called first; an entry without authors gets no "by ..." line.
        """
        imp = self.isImportant()
        draft = self.get('year') == 'forthcoming'
        if imp:
            res = ["<li><div class='impEntry'><p class='impEntry'>"]
        elif draft:
            res = ["<li><div class='draftEntry'><p class='draftEntry'>"]
        else:
            res = ["<li><p class='entry'>"]

        if imp or not draft:
            # Add a picture of the rank
            # Only if year is known or paper important!
            r = rank.get_rank_html(self['title'], self.get('year'),
                                   update=False, base_url=base_url)
            if r is not None:
                res.append(r)

        res.append("<span class='title'><a name='%s'>%s</a></span>" % (
            url_untranslate(self.key), htmlize(self['title'])))

        # Resolved once, outside the loop below, so that an unrecognized
        # section is reported a single time.
        cache_section = self.get('www_cache_section', ".")
        if cache_section not in config.CACHE_SECTIONS:
            if cache_section != ".":
                sys.stderr.write("Unrecognized cache section %s\n" % (
                    cache_section,))
                cache_section = "."

        # First pass (cached == 0) lists the original URLs, second pass
        # (cached == 1) lists the locally cached copies that exist on disk.
        for cached in 0, 1:
            availability = []
            if not cached:
                for which in ["amazon", "excerpt", "publisher"]:
                    key = "www_%s_url" % which
                    if self.get(key):
                        url = unTeXescapeURL(self[key])
                        availability.append(
                            '<a href="%s">%s</a>' % (url, which))

            for key, name, ext in (('www_abstract_url', 'abstract', 'abstract'),
                                   ('www_html_url', 'HTML', 'html'),
                                   ('www_pdf_url', 'PDF', 'pdf'),
                                   ('www_ps_url', 'PS', 'ps'),
                                   ('www_txt_url', 'TXT', 'txt'),
                                   ('www_ps_gz_url', 'gzipped PS', 'ps.gz')):
                if cached:
                    # XXXX the URL needs to be relative to the absolute
                    # XXXX cache path.
                    url = smartJoin(cache_path, cache_section,
                                    "%s.%s" % (self.key, ext))
                    fname = smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
                                      cache_section,
                                      "%s.%s" % (self.key, ext))
                    if not os.path.exists(fname):
                        continue
                else:
                    url = self.get(key)
                    if not url:
                        continue
                    url = unTeXescapeURL(url)
                    # NOTE(review): previously a no-op replace('&', '&');
                    # restored as the HTML escape needed inside href="...".
                    url = url.replace('&', '&amp;')
                availability.append('<a href="%s">%s</a>' % (url, name))

            if availability:
                # NOTE(review): both alternatives previously showed as " ";
                # the second is presumed to have been "&nbsp;" -- confirm.
                res.append([" ", "&nbsp;"][cached])
                res.append("<span class='availability'>(")
                if cached:
                    res.append("Cached: ")
                res.append(", ".join(availability))
                res.append(")</span>")

        htmlAuthors = [a.htmlizeWithLink() for a in (self.parsedAuthor or [])]
        if htmlAuthors:
            res.append("<br /><span class='author'>by ")
            if len(htmlAuthors) == 1:
                res.append(htmlAuthors[0])
            elif len(htmlAuthors) == 2:
                res.append(" and ".join(htmlAuthors))
            else:
                res.append(", ".join(htmlAuthors[:-1]))
                res.append(", and ")
                res.append(htmlAuthors[-1])

            # Avoid a doubled period after names such as "... Jr."
            if not res[-1].endswith('.'):
                res.append(".")
            res.append("</span><br />\n")
        else:
            res.append("<br />\n")

        res.append(self.biblio_to_html())
        # An HTML entity keeps this source file pure ASCII.
        res.append("<a href='#%s'>&middot;</a>" % url_untranslate(self.key))
        res.append("</p>")

        if self.get('www_remarks'):
            res.append("<p class='remarks'>%s</p>" % htmlize(
                self['www_remarks']))

        if imp or draft:
            res.append("</div>")
        res.append("</li>\n\n")

        return "".join(res)


def _merge_names(names_a, names_b):
    """Merge two equally long lists of given names position by position.

    At each position the two names must be equal, or one must be a
    two-character initial ("J.") whose letter starts the other; the fuller
    form is kept.  Returns the merged list, or None on the first conflict.
    """
    merged = []
    for a, b in zip(names_a, names_b):
        if a == b:
            merged.append(a)
        elif len(a) == 2 and a[1] == '.' and a[:1] == b[:1]:
            merged.append(b)
        elif len(b) == 2 and b[1] == '.' and a[:1] == b[:1]:
            merged.append(a)
        else:
            return None
    return merged


class ParsedAuthor:
    """The parsed name of an author.

    Eddie deserves credit for this incredibly hairy business.
    """

    def __init__(self, first, von, last, jr):
        # Each of the four parts is a list of words.
        self.first = first
        self.von = von
        self.last = last
        self.jr = jr
        self.collapsable = 1

        self.html = htmlize(str(self))
        self.txt = txtize(str(self))

        s = self.html
        for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
            if pat.search(s):
                self.collapsable = 0
                break

    def __eq__(self, o):
        if not isinstance(o, ParsedAuthor):
            return NotImplemented
        return ((self.first == o.first) and
                (self.last == o.last) and
                (self.von == o.von) and
                (self.jr == o.jr))

    def __ne__(self, o):
        # Python 2 does not derive != from ==.
        equal = self.__eq__(o)
        if equal is NotImplemented:
            return equal
        return not equal

    def __hash__(self):
        return hash(repr(self))

    def collapsesTo(self, o):
        """Return `o` if it could be a more canonical version of this
           author, and `self` otherwise.

        `o` is returned only when both authors are collapsable, the
        von/last/jr parts are identical, and merging the given names
        (expanding initials, or filling in names the shorter list omits)
        yields exactly o.first and not self.first.
        """
        if not self.collapsable or not o.collapsable:
            return self

        if self.last != o.last or self.von != o.von or self.jr != o.jr:
            return self
        if not self.first:
            return o

        if len(self.first) == len(o.first):
            merged = _merge_names(self.first, o.first)
            if merged is None:
                return self
        else:
            # Different numbers of given names: require at least one name
            # longer than two characters (i.e. not just initials).
            if max([len(n) for n in self.first + o.first]) <= 2:
                return self

            if len(self.first) < len(o.first):
                shorter, longer = self.first, o.first
            else:
                shorter, longer = o.first, self.first

            # Locate the shorter list inside the longer one by initials.
            initials_s = "".join([n[0] for n in shorter])
            initials_l = "".join([n[0] for n in longer])
            idx = initials_l.find(initials_s)
            if idx < 0:
                return self
            end = idx + len(shorter)
            middle = _merge_names(longer[idx:end], shorter)
            if middle is None:
                return self
            merged = longer[:idx] + middle + longer[end:]

        if merged != self.first and merged == o.first:
            return o
        return self

    def __repr__(self):
        return "ParsedAuthor(%r,%r,%r,%r)" % (self.first, self.von,
                                              self.last, self.jr)

    def __str__(self):
        a = " ".join(self.first + self.von + self.last)
        if self.jr:
            return "%s, %s" % (a, self.jr)
        return a

    def getHomepage(self):
        """Return the URL paired with the first config.AUTHOR_RE_LIST
           pattern matching this author's HTML name, or None."""
        s = self.html
        for pat, url in config.AUTHOR_RE_LIST:
            if pat.search(s):
                return url
        return None

    def getSortingName(self):
        """Return a representation of this author's name in von-last-first-jr
           order, unless overridden by a pattern in
           config.ALPHABETIZE_AUTHOR_AS_RE_LIST."""
        s = self.html
        for pat, v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
            if pat.search(s):
                return v

        return txtize(" ".join(self.von + self.last + self.first + self.jr))

    def getSectionName(self):
        """Return a HTML representation of this author's name in
           last, first von, jr order"""
        secname = " ".join(self.last)
        more = self.first + self.von
        if more:
            secname += ", " + " ".join(more)
        if self.jr:
            secname += ", " + " ".join(self.jr)
        secname = htmlize(secname)
        return secname

    def htmlizeWithLink(self):
        """Return this author's HTML name, wrapped in a link to the homepage
           when one is configured."""
        a = self.html
        u = self.getHomepage()
        if u:
            return "<a href='%s'>%s</a>" % (u, a)
        else:
            return a


def parseAuthor(s):
    """Parse the author string `s` into a list of ParsedAuthor.

    Any exception raised while parsing is reported on stderr and re-raised.
    """
    try:
        return _parseAuthor(s)
    except Exception:
        sys.stderr.write("Internal error while parsing author %r\n" % (s,))
        raise


def _tokenize_authors(s):
    """Split an author string into words and ',' tokens.

    Words are separated by spaces, tabs, newlines and commas that occur
    outside of braces.  Each comma is kept as its own ',' token; empty words
    are dropped.
    """
    items = []
    s = s.strip()
    while s:
        bracelevel = 0
        for i, ch in enumerate(s):
            if ch == '{':
                bracelevel += 1
            elif ch == '}':
                bracelevel -= 1
            elif bracelevel <= 0 and ch in " \t\n,":
                break
        else:
            # No separator left: the remainder is the last word.
            items.append(s)
            break
        if i:
            items.append(s[:i])
        if ch == ',':
            items.append(',')
        s = s[i + 1:].strip()
    return items


def _parseAuthor(s):
    """Take an author string and return a list of ParsedAuthor."""
    # Group the tokens into one list per author, splitting on the word 'and'.
    authors = [[]]
    for item in _tokenize_authors(s):
        if item == 'and':
            authors.append([])
        else:
            authors[-1].append(item)

    parsedAuthors = []
    # Split into first, von, last, jr
    for author in authors:
        commas = 0
        fvl = []  # words of a comma-free "First von Last" name
        vl = []   # words before the first comma ("von Last")
        f = []
        v = []
        l = []
        j = []
        cur = fvl
        for item in author:
            if item == ',':
                if commas == 0:
                    vl = fvl
                    fvl = []
                    cur = f
                else:
                    # A further comma: what was collected after the previous
                    # comma is the jr part, and the first name starts over.
                    j.extend(f)
                    cur = f = []
                commas += 1
            else:
                cur.append(item)

        if commas == 0:
            split_von(f, v, l, fvl)
        else:
            # NOTE(review): words that split_von leaves in f_tmp (anything
            # before the final word when there is no von part, e.g. "Garcia"
            # in "Garcia Molina, Hector") are discarded here.  Left unchanged
            # because it affects which authors collapse together -- confirm
            # the intended behaviour.
            f_tmp = []
            split_von(f_tmp, v, l, vl)

        parsedAuthors.append(ParsedAuthor(f, v, l, j))

    return parsedAuthors
