# NOTE(review): Python 2 source (print statements, has_key, xrange, the
# two-argument form of str.translate).  Several runtime string literals
# below appear to have had HTML markup stripped by whatever extracted this
# copy -- e.g. "%s" % (a, b) with more arguments than conversions, and
# "" % x applied to a string with no conversions, both of which raise
# TypeError when executed.  Those spots are flagged inline with
# NOTE(review) comments; confirm against the upstream source.

import rank
import sys
import re
import config
import os
from utils import htmlize, txtize, url_untranslate, unTeXescapeURL, smartJoin,\
     _split

# Fields that we only care about for making web pages (BibTeX doesn't
# recognize them.)
WWW_FIELDS = ['www_section', 'www_important', 'www_remarks',
              'www_abstract_url', 'www_html_url', 'www_pdf_url',
              'www_ps_url', 'www_txt_url', 'www_ps_gz_url',
              'www_amazon_url', 'www_excerpt_url', 'www_publisher_url',
              'www_cache_section', 'www_tags']

def author_url(author):
    """Given an author's name, return a URL for his/her homepage, or None.

       config.AUTHOR_RE_LIST is scanned in order; the first
       (compiled-regex, url) pair whose regex matches wins.
    """
    for pat, url in config.AUTHOR_RE_LIST:
        if pat.search(author):
            return url
    return None

# Identity table for the Python 2 two-argument str.translate(table,
# deletechars) -- passing ALLCHARS as the table makes translate a pure
# character-deletion operation.
ALLCHARS = "".join(map(chr,range(256)))
# Tab/newline/CR plus all printable ASCII; deleting these from a string
# leaves exactly its non-ASCII residue.
PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127)))
LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
# Characters deleted when deciding whether a braced TeX token (e.g. an
# accented word) counts as all-lowercase for "von"-particle detection.
SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
               "abcdefghijklmnopqrstuvwxyz"
               "@")
RE_ESCAPED = re.compile(r'\\.')   # one TeX backslash escape, e.g. \'
# Splits "[Proceedings|Workshop record] of [the] X" into the leading
# phrase (group 1) and the venue name proper (group 2).
PROCEEDINGS_RE = re.compile(
    r'((?:proceedings|workshop record) of(?: the)? )(.*)', re.I)

def split_von(f,v,l,x):
    """Split the name tokens in list 'x' into first (f), von (v), and
       last (l) name parts, BibTeX-style: a run of lowercase tokens is the
       "von" particle, tokens before it are first names, and tokens after
       it form the last name.  Appends to the caller's lists in place and
       consumes 'x'; returns None.
    """
    in_von = 0
    while x:
        tt = t = x[0]
        del x[0]
        if tt[:2] == '{\\':
            # Braced TeX token: judge its case by what survives after
            # stripping ASCII letters/@, backslash escapes, and braces.
            tt = tt.translate(ALLCHARS, SV_DELCHARS)
            tt = RE_ESCAPED.sub("", tt)
            tt = tt.translate(ALLCHARS, "{}")
        if tt.translate(ALLCHARS, LC_CHARS) == "":
            # Token is entirely lowercase: part of the "von" particle.
            v.append(t)
            in_von = 1
        elif in_von and f is not None:
            # First non-lowercase token after the particle: this token and
            # everything remaining is the last name.
            l.append(t)
            l.extend(x)
            return
        else:
            f.append(t)
    if not in_von:
        # No von particle seen: the final "first name" token is really
        # the last name.  (Assumes f is non-empty here -- TODO confirm
        # callers never pass an empty name.)
        l.append(f[-1])
        del f[-1]

def buildAuthorTable(entries):
    """Given a list of BibTeXEntry, return a map from parsed author name to
       parsed canonical name.
    """
    # Bucket every parsed author by last-name tuple so we only attempt to
    # collapse plausible duplicates.
    authorsByLast = {}
    for e in entries:
        for a in e.parsedAuthor:
            authorsByLast.setdefault(tuple(a.last), []).append(a)
    # map from author to collapsed author.
    result = {}
    # Explicit overrides from config.COLLAPSE_AUTHORS take precedence:
    # both the alias and the canonical form map to the canonical form.
    for k,v in config.COLLAPSE_AUTHORS.items():
        a = parseAuthor(k)[0]
        c = parseAuthor(v)[0]
        result[c] = c
        result[a] = c
    # Everyone else: fold each author toward the most canonical variant
    # among the authors sharing the same last name.
    for e in entries:
        for author in e.parsedAuthor:
            if result.has_key(author):
                continue
            c = author
            for a in authorsByLast[tuple(author.last)]:
                if a is author:
                    continue
                c = c.collapsesTo(a)
            result[author] = c
    if 0:   # disabled debug dump of the collapse mapping
        for a,c in result.items():
            if a != c:
                print "Collapsing authors: %s => %s" % (a,c)
    if 0:   # disabled ad-hoc self-test of collapsesTo
        print parseAuthor("Franz Kaashoek")[0].collapsesTo(
            parseAuthor("M. Franz Kaashoek")[0])
        print parseAuthor("Paul F. Syverson")[0].collapsesTo(
            parseAuthor("Paul Syverson")[0])
        print parseAuthor("Paul Syverson")[0].collapsesTo(
            parseAuthor("Paul F. Syverson")[0])
    return result

# List of fields that appear when we display the entries as BibTeX.
DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle',
    'school', 'institution', 'organization', 'volume', 'number', 'year',
    'month', 'address', 'location', 'chapter', 'edition', 'pages',
    'editor', 'howpublished', 'key', 'publisher', 'type', 'note',
    'series' ]

class BibTeXEntry:
    """A single BibTeX entry."""
    def __init__(self, type, key, entries):
        self.type = type # What kind of entry is it? (@book,@injournal,etc)
        self.key = key # What key does it have?
        self.entries = entries # Map from key to value.
        self.entryLine = 0 # Defined on this line number

    def get(self, k, v=None):
        """Dict-style get with default over this entry's fields."""
        return self.entries.get(k,v)
    def has_key(self, k):
        return self.entries.has_key(k)
    def __getitem__(self, k):
        return self.entries[k]
    def __setitem__(self, k, v):
        self.entries[k] = v
    def __str__(self):
        return self.format(70,1)

    def getURL(self):
        """Return the best URL to use for this paper, or None.

           Fields are consulted in preference order; a non-citeseer URL
           always replaces a citeseer one.
        """
        best = None
        for field in ['www_pdf_url', 'www_ps_gz_url', 'www_ps_url',
                      'www_html_url', 'www_txt_url', ]:
            u = self.get(field)
            if u:
                if not best:
                    best = u
                elif (best.startswith("http://citeseer.nj.nec.com/") and
                      not u.startswith("http://citeseer.nj.nec.com/")):
                    best = u
        return best

    def format(self, width=70, indent=8, v=0, invStrings={}):
        """Format this entry as BibTeX.

           width/indent are passed to utils._split for line wrapping.
           v is a verbosity flag: when true, fields outside
           DISPLAYED_FIELDS are emitted too.  invStrings maps field values
           back to @string names so they are emitted unquoted.
           (NOTE(review): mutable default dict for invStrings -- read-only
           here, so shared state is harmless but fragile.  Also note the
           parameter 'v' is shadowed below as the per-field value.)
        """
        d = ["@%s{%s,\n" % (self.type, self.key)]
        if v:
            df = DISPLAYED_FIELDS[:]
            for k in self.entries.keys():
                if k not in df:
                    df.append(k)
        else:
            df = DISPLAYED_FIELDS
        for f in df:
            if not self.entries.has_key(f):
                continue
            v = self.entries[f]
            # NOTE(review): startswith("") is always true, which would
            # route every field into this "missing field" branch.  The
            # marker string (presumably an HTML bad-value tag, matching
            # the placeholder assigned in _check) looks lost in
            # extraction -- confirm against upstream.
            if v.startswith(""):
                d.append("%%%%% ERROR: Missing field\n")
                d.append("%% %s = {?????},\n"%f)
                continue
            # Deleting all printable ASCII leaves the non-ASCII residue.
            np = v.translate(ALLCHARS, PRINTINGCHARS)
            if np:
                d.append("%%%%% "+("ERROR: Non-ASCII characters: '%r'\n"%np))
            d.append(" ")
            # NOTE(review): replacing "&" with "&" is a no-op as written;
            # presumably an HTML-entity unescape originally -- confirm.
            v = v.replace("&", "&")
            if invStrings.has_key(v):
                # Value matches a known @string; emit the symbolic name.
                s = "%s = %s,\n" %(f, invStrings[v])
            else:
                s = "%s = {%s},\n" % (f, v)
            d.append(_split(s,width,indent))
        d.append("}\n")
        return "".join(d)

    def resolve(self):
        """Handle post-processing for this entry: parse the author field
           into self.parsedAuthor (a list of ParsedAuthor, or None)."""
        a = self.get('author')
        if a:
            self.parsedAuthor = parseAuthor(a)
            #print a
            #print " => ",repr(self.parsedAuthor)
        else:
            self.parsedAuthor = None

    def isImportant(self):
        """Return 1 iff this entry is marked as important"""
        imp = self.get("www_important")
        if imp and imp.strip().lower() not in ("no", "false", "0"):
            return 1
        return 0

    def check(self):
        """Print any errors for this entry, and return true if there were
           none."""
        errs = self._check()
        for e in errs:
            print e
        return not errs

    def _check(self):
        """Collect and return a list of error strings for this entry.
           Also patches missing required fields with a placeholder value
           as a side effect."""
        # Required fields per entry type.
        if self.type == 'inproceedings':
            fields = 'booktitle', 'year'
        elif self.type == 'incollection':
            fields = 'booktitle', 'year'
        elif self.type == 'proceedings':
            fields = 'booktitle', 'editor'
        elif self.type == 'article':
            fields = 'journal', 'year'
        elif self.type == 'techreport':
            fields = 'institution',
        elif self.type == 'misc':
            fields = 'howpublished',
        elif self.type in ('mastersthesis', 'phdthesis'):
            fields = ()
        else:
            fields = ()
            errs.append("ERROR: odd type %s"%self.type)
        errs = []
        if self.type != 'proceedings':
            fields += 'title', 'author', 'www_section', 'year'
        for field in fields:
            # NOTE(review): startswith("") is always true, so any present
            # field would be re-flagged as missing; like format(), the
            # bad-value marker string appears stripped -- confirm.
            if self.get(field) is None or \
               self.get(field).startswith(""):
                errs.append("ERROR: %s has no %s field" % (self.key, field))
                # Patch in a recognizable placeholder so later stages run.
                self.entries[field] = "%s:??"%field
        if self.type == 'inproceedings':
            if self.get("booktitle"):
                if not self['booktitle'].startswith("Proceedings of") and \
                   not self['booktitle'].startswith("{Proceedings of"):
                    errs.append("ERROR: %s's booktitle (%r) doesn't start with 'Proceedings of'" % (self.key, self['booktitle']))
        # Pages must be an en-dash style range, e.g. 1--10.
        if self.has_key("pages") and not re.search(r'\d+--\d+',
                                                   self['pages']):
            errs.append("ERROR: Misformed pages in %s"%self.key)
        if self.type == 'proceedings':
            if self.get('title'):
                errs.append("ERROR: %s is a proceedings: it should have a booktitle, not a title." % self.key)
        for field, value in self.entries.items():
            # Non-empty residue after deleting printable ASCII means the
            # value contains non-ASCII characters.
            if value.translate(ALLCHARS, PRINTINGCHARS):
                errs.append("ERROR: %s.%s has non-ASCII characters"%(
                    self.key, field))
            if field.startswith("www_") and field not in WWW_FIELDS:
                errs.append("ERROR: unknown www field %s"% field)
            # Trailing periods are disallowed except in free-text fields.
            if value.strip()[-1:] == '.' and \
               field not in ("notes", "www_remarks", "author"):
                errs.append("ERROR: %s.%s has an extraneous period"%(
                    self.key, field))
        return errs

    def biblio_to_html(self):
        """Return the HTML for the citation portion of entry.

           NOTE(review): several literals in this method format two
           arguments into a string with at most one '%s' (or none), which
           raises TypeError at runtime.  The anchor markup around the
           venue/journal links appears to have been stripped from this
           copy -- confirm against upstream before use.
        """
        if self.type in ('inproceedings', 'incollection'):
            booktitle = self['booktitle']
            bookurl = self.get('bookurl')
            if bookurl:
                m = PROCEEDINGS_RE.match(booktitle)
                if m:
                    # Link only the venue name, not "Proceedings of the".
                    # NOTE(review): '' % bookurl has no conversion -- the
                    # opening anchor tag looks lost in extraction.
                    res = ["In the ", m.group(1),
                           '' % bookurl,
                           m.group(2), ""]
                else:
                    # NOTE(review): one '%s', two arguments -- the link
                    # markup looks lost in extraction.
                    res = ['In the %s' % (bookurl, booktitle)]
            else:
                res = ["In the ", booktitle]
            if self.get("edition"):
                res.append(",")
                res.append(self['edition'])
            # Prefer 'location' over 'address' for the venue place.
            if self.get("location"):
                res.append(", ")
                res.append(self['location'])
            elif self.get("address"):
                res.append(", ")
                res.append(self['address'])
            res.append(", %s %s" % (self.get('month', ""), self['year']))
            if not self.get('pages'):
                pass
            elif "-" in self['pages']:
                res.append(", pages %s" % self['pages'])
            else:
                res.append(", page %s" % self['pages'])
        elif self.type == 'article':
            res = ["In "]
            if self.get('journalurl'):
                # NOTE(review): one '%s', two arguments -- link markup
                # appears stripped.
                res.append('%s' % (self['journalurl'], self['journal']))
            else:
                res.append(self['journal'])
            if self.get('volume'):
                res.append(" %s" % self['volume'])
            if self.get('number'):
                res.append("(%s)" % self['number'])
            res.append(", %s %s" % (self.get('month', ""), self['year']))
            if not self.get('pages'):
                pass
            elif "-" in self['pages']:
                res.append(", pages %s" % self['pages'])
            else:
                res.append(", page %s" % self['pages'])
        elif self.type == 'techreport':
            res = ["%s %s %s" % (self['institution'],
                                 self.get('type', 'technical report'),
                                 self.get('number', ""))]
            if self.get('month') or self.get('year'):
                res.append(", %s %s" % (self.get('month', ''),
                                        self.get('year', '')))
        elif self.type == 'mastersthesis' or self.type == 'phdthesis':
            # An explicit 'type' field overrides the default thesis label.
            if self.get('type'):
                res = [self['type']]
            elif self.type == 'mastersthesis':
                res = ["Masters's thesis"]
            else:
                res = ["Ph.D. thesis"]
            if self.get('school'):
                res.append(", %s" % (self['school']))
            if self.get('month') or self.get('year'):
                res.append(", %s %s" % (self.get('month', ''),
                                        self.get('year', '')))
        elif self.type == 'book':
            res = [self['publisher']]
            if self.get('year'):
                res.append(" ")
                res.append(self.get('year'))
            # res.append(", %s"%(self.get('year')))
            if self.get('series'):
                res.append(",")
                res.append(self['series'])
        elif self.type == 'misc':
            res = [self['howpublished']]
            if self.get('month') or self.get('year'):
                res.append(", %s %s" % (self.get('month', ''),
                                        self.get('year', '')))
            if not self.get('pages'):
                pass
            elif "-" in self['pages']:
                res.append(", pages %s" % self['pages'])
            else:
                res.append(", page %s" % self['pages'])
        else:
            res = ["<Odd type %s>" % self.type]
        # NOTE(review): inserting an empty string at the front is a no-op;
        # presumably a wrapping tag was stripped here.
        res[0:0] = [""]
        res.append(".")
        bibtexurl = "./bibtex.html#%s" % url_untranslate(self.key)
        # NOTE(review): the concatenated literal below has no '%s' yet is
        # %-formatted with bibtexurl, which raises TypeError -- the anchor
        # markup around "(BibTeX entry)" appears stripped.
        res.append((" "
                    "(BibTeX entry)"
                    "") % bibtexurl)
        return htmlize("".join(res))

    def to_html(self, cache_path="./cache", base_url="."):
        """Return the HTML for this entry.

           cache_path is the URL prefix for locally cached copies of the
           paper; base_url is passed through to rank.get_rank_html.

           NOTE(review): the list-item / span markup that once wrapped
           this output appears stripped from several literals below (the
           bullet characters are likely rendered leftovers of <li> tags)
           -- confirm all flagged literals against upstream.
        """
        imp = self.isImportant()
        draft = self.get('year') == 'forthcoming'
        # Choose the opening wrapper by importance/draft status.
        # NOTE(review): mangled literals; markup presumed lost.
        if imp:
            res = ["\n  • "]
        elif draft:
            res = ["\n\n  • "]
        else:
            res = ["\n\n  • "]
        if imp or not draft:
            # Add a picture of the rank
            # Only if year is known or paper important!
            r = rank.get_rank_html(self['title'], self.get('year'),
                                   update=False, base_url=base_url)
            if r is not None:
                res.append(r)
        # NOTE(review): one '%s', two arguments (anchor name + title);
        # markup presumed lost -- raises TypeError as written.
        res.append("%s"%(
            url_untranslate(self.key),htmlize(self['title'])))
        # First pass (cached=0): remote links; second pass (cached=1):
        # links into the local cache directory.
        for cached in 0,1:
            availability = []
            if not cached:
                for which in [ "amazon", "excerpt", "publisher" ]:
                    key = "www_%s_url"%which
                    if self.get(key):
                        url=self[key]
                        url = unTeXescapeURL(url)
                        # NOTE(review): one '%s', two arguments; anchor
                        # markup presumed lost.
                        availability.append('%s' %(url,which))
            cache_section = self.get('www_cache_section', ".")
            if cache_section not in config.CACHE_SECTIONS:
                if cache_section != ".":
                    print >>sys.stderr, "Unrecognized cache section %s"%(
                        cache_section)
                    cache_section="."
            for key, name, ext in (('www_abstract_url', 'abstract',
                                    'abstract'),
                                   ('www_html_url', 'HTML', 'html'),
                                   ('www_pdf_url', 'PDF', 'pdf'),
                                   ('www_ps_url', 'PS', 'ps'),
                                   ('www_txt_url', 'TXT', 'txt'),
                                   ('www_ps_gz_url', 'gzipped PS','ps.gz')
                                   ):
                if cached:
                    #XXXX the URL needs to be relative to the absolute
                    #XXXX cache path.
                    url = smartJoin(cache_path,cache_section,
                                    "%s.%s"%(self.key,ext))
                    fname = smartJoin(config.OUTPUT_DIR,
                                      config.CACHE_DIR,
                                      cache_section,
                                      "%s.%s"%(self.key,ext))
                    # Only link cached copies that actually exist on disk.
                    if not os.path.exists(fname):
                        continue
                else:
                    url = self.get(key)
                    if not url:
                        continue
                    url = unTeXescapeURL(url)
                # NOTE(review): '&' -> '&' is a no-op as written;
                # presumably an entity escape originally -- confirm.
                url = url.replace('&', '&')
                # NOTE(review): one '%s', two arguments; anchor markup
                # presumed lost.
                availability.append('%s' %(url,name))
            if availability:
                # NOTE(review): both separator alternatives are plain
                # spaces here; original markup presumed lost.
                res.append([" ", " "][cached])
                res.append("(")
                if cached:
                    res.append("Cached: ")
                res.append(", ".join(availability))
                res.append(")")
        # NOTE(review): mangled literal; markup presumed lost.
        res.append("\n    by ")
        #res.append("\n\n" % self.parsedAuthor)
        htmlAuthors = [ a.htmlizeWithLink() for a in self.parsedAuthor ]
        # English-style author list: "A", "A and B", "A, B, and C".
        if len(htmlAuthors) == 1:
            res.append(htmlAuthors[0])
        elif len(htmlAuthors) == 2:
            res.append(" and ".join(htmlAuthors))
        else:
            res.append(", ".join(htmlAuthors[:-1]))
            res.append(", and ")
            res.append(htmlAuthors[-1])
        if res[-1][-1] != '.':
            res.append(".")
        # NOTE(review): mangled literal; markup presumed lost.
        res.append("\n    \n")
        res.append(self.biblio_to_html())
        # NOTE(review): no '%s' in the literal yet it is %-formatted --
        # raises TypeError as written; anchor markup presumed lost.
        res.append("·"%url_untranslate(self.key))
        # NOTE(review): mangled literal; markup presumed lost.
        res.append("\n\n    ")
        if self.get('www_remarks'):
            # NOTE(review): mangled literal; markup presumed lost.
            res.append("\n\n    %s\n\n    "%htmlize(
                self['www_remarks']))
        if imp or draft:
            # NOTE(review): empty append; closing tag presumed lost.
            res.append("")
        # NOTE(review): mangled literal; markup presumed lost.
        res.append("\n  • \n\n")
        return "".join(res)

class ParsedAuthor:
    """The parsed name of an author.

       Eddie deserves credit for this incredibly hairy business.

       Attributes first/von/last/jr are each lists of name tokens;
       html and txt cache the rendered forms; collapsable is cleared for
       names matching config.NO_COLLAPSE_AUTHORS_RE_LIST.
    """
    def __init__(self, first, von, last, jr):
        self.first = first
        self.von = von
        self.last = last
        self.jr = jr
        self.collapsable = 1
        self.html = htmlize(str(self))
        self.txt = txtize(str(self))
        s = self.html
        for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
            if pat.search(s):
                self.collapsable = 0
                break

    def __eq__(self, o):
        return ((self.first == o.first) and
                (self.last == o.last) and
                (self.von == o.von) and
                (self.jr == o.jr))

    def __hash__(self):
        return hash(repr(self))

    def collapsesTo(self, o):
        """Return the more canonical of self and o if one can stand in
           for the other (same von/last/jr, compatible first names or
           initials); otherwise return self unchanged.
        """
        if not self.collapsable or not o.collapsable:
            return self
        if self.last != o.last or self.von != o.von or self.jr != o.jr:
            return self
        if not self.first:
            return o
        if len(self.first) == len(o.first):
            # Same number of first-name tokens: merge token-by-token,
            # letting a full name absorb a matching initial.
            n = []
            for a,b in zip(self.first, o.first):
                if a == b:
                    n.append(a)
                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
                    n.append(b)
                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
                    n.append(a)
                else:
                    return self
            if n == self.first:
                return self
            elif n == o.first:
                return o
            else:
                return self
        else:
            # Different token counts: only collapse when at least one
            # token is a real (3+ char) name, and the shorter token list
            # matches a contiguous run of the longer one by initials.
            realname = max([len(n) for n in self.first+o.first])>2
            if not realname:
                return self
            if len(self.first) < len(o.first):
                short = self.first; long = o.first
            else:
                short = o.first; long = self.first
            initials_s = "".join([n[0] for n in short])
            initials_l = "".join([n[0] for n in long])
            idx = initials_l.find(initials_s)
            if idx < 0:
                return self
            n = long[:idx]
            for i in range(idx, idx+len(short)):
                a = long[i]; b = short[i-idx]
                if a == b:
                    n.append(a)
                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
                    n.append(b)
                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
                    n.append(a)
                else:
                    return self
            n += long[idx+len(short):]
            if n == self.first:
                return self
            elif n == o.first:
                return o
            else:
                return self

    def __repr__(self):
        return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
                                            self.last,self.jr)
    def __str__(self):
        a = " ".join(self.first+self.von+self.last)
        if self.jr:
            return "%s, %s" % (a,self.jr)
        return a

    def getHomepage(self):
        """Return the configured homepage URL for this author, or None."""
        s = self.html
        for pat, url in config.AUTHOR_RE_LIST:
            if pat.search(s):
                return url
        return None

    def getSortingName(self):
        """Return a representation of this author's name in
           von-last-first-jr order, unless overridden by ALPH
        """
        s = self.html
        for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
            if pat.search(s):
                return v
        return txtize(" ".join(self.von+self.last+self.first+self.jr))

    def getSectionName(self):
        """Return a HTML representation of this author's name in
           last, first von, jr order"""
        secname = " ".join(self.last)
        more = self.first+self.von
        if more:
            secname += ", "+" ".join(more)
        if self.jr:
            secname += ", "+" ".join(self.jr)
        secname = htmlize(secname)
        return secname

    def htmlizeWithLink(self):
        """Return the HTML name, wrapped in a homepage link if known."""
        a = self.html
        u = self.getHomepage()
        if u:
            # NOTE(review): one '%s', two arguments -- the anchor markup
            # appears stripped; raises TypeError as written.
            return "%s"%(u,a)
        else:
            return a

def parseAuthor(s):
    """Parse a BibTeX author field into a list of ParsedAuthor,
       logging the offending input to stderr before re-raising on error."""
    try:
        return _parseAuthor(s)
    except:
        print >>sys.stderr, "Internal error while parsing author %r"%s
        raise

def _parseAuthor(s):
    """Take an author string and return a list of ParsedAuthor."""
    # Tokenize: split on whitespace and commas, but never inside braces,
    # so TeX groups like {Foo Bar} stay single tokens.  Commas become
    # their own ',' tokens.
    items = []
    s = s.strip()
    while s:
        s = s.strip()
        bracelevel = 0
        for i in xrange(len(s)):
            if s[i] == '{':
                bracelevel += 1
            elif s[i] == '}':
                bracelevel -= 1
            elif bracelevel <= 0 and s[i] in " \t\n,":
                break
        if i+1 == len(s):
            items.append(s)
        else:
            items.append(s[0:i])
        if (s[i] == ','):
            items.append(',')
        s = s[i+1:]
    # Split the token stream into per-author lists on the 'and' keyword.
    authors = [[]]
    for item in items:
        if item == 'and':
            authors.append([])
        else:
            authors[-1].append(item)
    parsedAuthors = []
    # Split into first, von, last, jr
    for author in authors:
        # BibTeX name grammar: "First von Last" (no comma),
        # "von Last, First" (one comma), "von Last, Jr, First" (two).
        commas = 0
        fvl = []
        vl = []
        f = []
        v = []
        l = []
        j = []
        cur = fvl
        for item in author:
            if item == ',':
                if commas == 0:
                    # Everything so far was von+last; first names follow.
                    vl = fvl
                    fvl = []
                    cur = f
                else:
                    # Second comma: what we thought was first names was
                    # really the jr part.
                    j.extend(f)
                    cur = f = []
                commas += 1
            else:
                cur.append(item)
        if commas == 0:
            split_von(f,v,l,fvl)
        else:
            # First names already collected in f; only von/last need
            # splitting (the throwaway list absorbs split_von's fix-up).
            f_tmp = []
            split_von(f_tmp,v,l,vl)
        parsedAuthors.append(ParsedAuthor(f,v,l,j))
    return parsedAuthors