From 871c61c6b4351d4a9dd78ba1d70d6e1af8ffe1e7 Mon Sep 17 00:00:00 2001
From: Thibaut Horel
Date: Thu, 4 Feb 2016 19:46:04 -0500
Subject: Start cleaning: PEP8 and split the BibTeX.py monster

---
 BibTeX.py | 759 ++------------------------------------------------------------
 entry.py  | 653 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 utils.py  | 118 ++++++++++
 3 files changed, 791 insertions(+), 739 deletions(-)
 create mode 100644 entry.py
 create mode 100644 utils.py

diff --git a/BibTeX.py b/BibTeX.py
index e076200..d0f5624 100644
--- a/BibTeX.py
+++ b/BibTeX.py
@@ -14,63 +14,43 @@
 import copy
 import config
-import rank
+from entry import BibTeXEntry, buildAuthorTable
+from utils import txtize, url_untranslate, smartJoin
 
-__all__ = [ 'ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize',
-            'ParsedAuthor', 'FileIter', 'Parser', 'parseFile',
-            'splitBibTeXEntriesBy', 'sortBibTexEntriesBy', ]
+__all__ = ['ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize',
+           'ParsedAuthor', 'FileIter', 'Parser', 'parseFile',
+           'splitEntriesBy', 'sortEntriesBy']
 
 # List: must map from month number to month name.
-MONTHS = [ None,
-           "January", "February", "March", "April", "May", "June",
-           "July", "August", "September", "October", "November", "December"]
-
-# Fields that we only care about for making web pages (BibTeX doesn't
-# recognize them.)
-WWW_FIELDS = [ 'www_section', 'www_important', 'www_remarks',
-               'www_abstract_url', 'www_html_url', 'www_pdf_url', 'www_ps_url',
-               'www_txt_url', 'www_ps_gz_url', 'www_amazon_url',
-               'www_excerpt_url', 'www_publisher_url',
-               'www_cache_section', 'www_tags' ]
-
-def url_untranslate(s):
-    """Change a BibTeX key into a string suitable for use in a URL."""
-    s = re.sub(r'([%<>`#, &_\';])',
-               lambda m: "_%02x"%ord(m.group(1)),
-               s)
-    s = s.replace("/",":")
-    return s
+MONTHS = [None, "January", "February", "March", "April", "May", "June",
+          "July", "August", "September", "October", "November", "December"]
+
+
+
 
 class ParseError(Exception):
     """Raised on invalid BibTeX"""
     pass
 
-def smartJoin(*lst):
-    """Equivalent to os.path.join, but handle "." and ".." entries a bit better.
-    """
-    lst = [ item for item in lst if item != "." ]
-    idx = 0
-    while idx < len(lst):
-        if idx > 0 and lst[idx] == "..":
-            del lst[idx]
-        else:
-            idx += 1
-    return os.path.join(*lst)
+
 
 class BibTeX:
     """A parsed BibTeX file"""
     def __init__(self):
-        self.entries = [] # List of BibTeXEntry
-        self.byKey = {} # Map from BibTeX key to BibTeX entry.
+        self.entries = []  # List of BibTeXEntry
+        self.byKey = {}  # Map from BibTeX key to BibTeX entry.
+
     def addEntry(self, ent):
         """Add a BibTeX entry to this file."""
         k = ent.key
         if self.byKey.get(ent.key.lower()):
-            print >> sys.stderr, "Already have an entry named %s"%k
+            print >> sys.stderr, "Already have an entry named %s" % k
             return
         self.entries.append(ent)
         self.byKey[ent.key.lower()] = ent
+
     def resolve(self):
         """Validate all entries in this file, and resolve cross-references"""
         seen = {}
@@ -80,7 +60,7 @@ class BibTeX:
             try:
                 cr = self.byKey[ent['crossref'].lower()]
             except KeyError:
-                print "No such crossref: %s"% ent['crossref']
+                print "No such crossref: %s" % ent['crossref']
                 break
             if seen.get(cr.key):
                 raise ParseError("Circular crossref at %s" % ent.key)
@@ -88,7 +68,7 @@ class BibTeX:
             del ent.entries['crossref']
 
             if cr.entryLine < ent.entryLine:
-                print "Warning: crossref %s used after declaration"%cr.key
+                print "Warning: crossref %s used after declaration" % cr.key
 
             for k in cr.entries.keys():
                 if ent.entries.has_key(k):
@@ -113,47 +93,7 @@ class BibTeX:
             newEntries.append(ent)
         self.entries = newEntries
 
-def buildAuthorTable(entries):
-    """Given a list of BibTeXEntry, return a map from parsed author name to
-    parsed canonical name.
-    """
-    authorsByLast = {}
-    for e in entries:
-        for a in e.parsedAuthor:
-            authorsByLast.setdefault(tuple(a.last), []).append(a)
-    # map from author to collapsed author.
-    result = {}
-    for k,v in config.COLLAPSE_AUTHORS.items():
-        a = parseAuthor(k)[0]
-        c = parseAuthor(v)[0]
-        result[c] = c
-        result[a] = c
-
-    for e in entries:
-        for author in e.parsedAuthor:
-            if result.has_key(author):
-                continue
-
-            c = author
-            for a in authorsByLast[tuple(author.last)]:
-                if a is author:
-                    continue
-                c = c.collapsesTo(a)
-            result[author] = c
-
-    if 0:
-        for a,c in result.items():
-            if a != c:
-                print "Collapsing authors: %s => %s" % (a,c)
-    if 0:
-        print parseAuthor("Franz Kaashoek")[0].collapsesTo(
-            parseAuthor("M. Franz Kaashoek")[0])
-        print parseAuthor("Paul F. Syverson")[0].collapsesTo(
-            parseAuthor("Paul Syverson")[0])
-        print parseAuthor("Paul Syverson")[0].collapsesTo(
-            parseAuthor("Paul F. Syverson")[0])
-    return result
 
 
 def splitEntriesBy(entries, field):
     """Take a list of BibTeX entries and the name of a bibtex field; return
@@ -281,570 +221,9 @@ def sortEntriesByDate(entries):
     return [ t[2] for t in tmp ]
 
-# List of fields that appear when we display the entries as BibTeX.
-DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle',
-'school', 'institution', 'organization', 'volume', 'number', 'year',
-'month', 'address', 'location', 'chapter', 'edition', 'pages', 'editor',
-'howpublished', 'key', 'publisher', 'type', 'note', 'series' ]
-
-class BibTeXEntry:
-    """A single BibTeX entry."""
-    def __init__(self, type, key, entries):
-        self.type = type # What kind of entry is it? (@book,@injournal,etc)
-        self.key = key # What key does it have?
-        self.entries = entries # Map from key to value.
-        self.entryLine = 0 # Defined on this line number
-    def get(self, k, v=None):
-        return self.entries.get(k,v)
-    def has_key(self, k):
-        return self.entries.has_key(k)
-    def __getitem__(self, k):
-        return self.entries[k]
-    def __setitem__(self, k, v):
-        self.entries[k] = v
-    def __str__(self):
-        return self.format(70,1)
-    def getURL(self):
-        """Return the best URL to use for this paper, or None."""
-        best = None
-        for field in ['www_pdf_url', 'www_ps_gz_url', 'www_ps_url',
-                      'www_html_url', 'www_txt_url', ]:
-            u = self.get(field)
-            if u:
-                if not best:
-                    best = u
-                elif (best.startswith("http://citeseer.nj.nec.com/")
-                      and not u.startswith("http://citeseer.nj.nec.com/")):
-                    best = u
-        return best
-
-    def format(self, width=70, indent=8, v=0, invStrings={}):
-        """Format this entry as BibTeX."""
-        d = ["@%s{%s,\n" % (self.type, self.key)]
-        if v:
-            df = DISPLAYED_FIELDS[:]
-            for k in self.entries.keys():
-                if k not in df:
-                    df.append(k)
-        else:
-            df = DISPLAYED_FIELDS
-        for f in df:
-            if not self.entries.has_key(f):
-                continue
-            v = self.entries[f]
-            if v.startswith("<span class='bad'>"):
-                d.append("%%%%% ERROR: Missing field\n")
-                d.append("%% %s = {?????},\n"%f)
-                continue
-            np = v.translate(ALLCHARS, PRINTINGCHARS)
-            if np:
-                d.append("%%%%% "+("ERROR: Non-ASCII characters: '%r'\n"%np))
-            d.append(" ")
-            v = v.replace("&", "&amp;")
-            if invStrings.has_key(v):
-                s = "%s = %s,\n" %(f, invStrings[v])
-            else:
-                s = "%s = {%s},\n" % (f, v)
-            d.append(_split(s,width,indent))
-        d.append("}\n")
-        return "".join(d)
-    def resolve(self):
-        """Handle post-processing for this entry"""
-        a = self.get('author')
-        if a:
-            self.parsedAuthor = parseAuthor(a)
-            #print a
-            #print " => ",repr(self.parsedAuthor)
-        else:
-            self.parsedAuthor = None
-
-    def isImportant(self):
-        """Return 1 iff this entry is marked as important"""
-        imp = self.get("www_important")
-        if imp and imp.strip().lower() not in ("no", "false", "0"):
-            return 1
-        return 0
-
-    def check(self):
-        """Print any errors for this entry, and return true if there were
-           none."""
-        errs = self._check()
-        for e in errs:
-            print e
-        return not errs
-
-    def _check(self):
-        errs = []
-        if self.type == 'inproceedings':
-            fields = 'booktitle', 'year'
-        elif self.type == 'incollection':
-            fields = 'booktitle', 'year'
-        elif self.type == 'proceedings':
-            fields = 'booktitle', 'editor'
-        elif self.type == 'article':
-            fields = 'journal', 'year'
-        elif self.type == 'techreport':
-            fields = 'institution',
-        elif self.type == 'misc':
-            fields = 'howpublished',
-        elif self.type in ('mastersthesis', 'phdthesis'):
-            fields = ()
-        else:
-            fields = ()
-            errs.append("ERROR: odd type %s"%self.type)
-        if self.type != 'proceedings':
-            fields += 'title', 'author', 'www_section', 'year'
-
-        for field in fields:
-            if self.get(field) is None or \
-               self.get(field).startswith("<span class='bad'>"):
-                errs.append("ERROR: %s has no %s field" % (self.key, field))
-                self.entries[field] = "<span class='bad'>%s:??</span>"%field
-
-        if self.type == 'inproceedings':
-            if self.get("booktitle"):
-                if not self['booktitle'].startswith("Proceedings of") and \
-                   not self['booktitle'].startswith("{Proceedings of"):
-                    errs.append("ERROR: %s's booktitle (%r) doesn't start with 'Proceedings of'" % (self.key, self['booktitle']))
-
-        if self.has_key("pages") and not re.search(r'\d+--\d+', self['pages']):
-            errs.append("ERROR: Malformed pages in %s"%self.key)
-
-        if self.type == 'proceedings':
-            if self.get('title'):
-                errs.append("ERROR: %s is a proceedings: it should have a booktitle, not a title."
-                            % self.key)
-
-        for field, value in self.entries.items():
-            if value.translate(ALLCHARS, PRINTINGCHARS):
-                errs.append("ERROR: %s.%s has non-ASCII characters"%(
-                    self.key, field))
-            if field.startswith("www_") and field not in WWW_FIELDS:
-                errs.append("ERROR: unknown www field %s"% field)
-            if value.strip()[-1:] == '.' and \
-               field not in ("notes", "www_remarks", "author"):
-                errs.append("ERROR: %s.%s has an extraneous period"%(self.key,
-                    field))
-        return errs
-
-    def biblio_to_html(self):
-        """Return the HTML for the citation portion of entry."""
-        if self.type in ('inproceedings', 'incollection'):
-            booktitle = self['booktitle']
-            bookurl = self.get('bookurl')
-            if bookurl:
-                m = PROCEEDINGS_RE.match(booktitle)
-                if m:
-                    res = ["In the ", m.group(1),
-                           '<a href="%s">'%bookurl, m.group(2), "</a>"]
-                else:
-                    res = ['In the <a href="%s">%s</a>' % (bookurl,booktitle)]
-            else:
-                res = ["In the ", booktitle ]
-
-            if self.get("edition"):
-                res.append(",")
-                res.append(self['edition'])
-            if self.get("location"):
-                res.append(", ")
-                res.append(self['location'])
-            elif self.get("address"):
-                res.append(", ")
-                res.append(self['address'])
-            res.append(", %s %s" % (self.get('month',""), self['year']))
-            if not self.get('pages'):
-                pass
-            elif "-" in self['pages']:
-                res.append(", pages %s"%self['pages'])
-            else:
-                res.append(", page %s"%self['pages'])
-        elif self.type == 'article':
-            res = ["In "]
-            if self.get('journalurl'):
-                res.append('<a href="%s">%s</a>'%
-                           (self['journalurl'],self['journal']))
-            else:
-                res.append(self['journal'])
-            if self.get('volume'):
-                res.append(" %s"%self['volume'])
-            if self.get('number'):
-                res.append("(%s)"%self['number'])
-            res.append(", %s %s" % (self.get('month',""), self['year']))
-            if not self.get('pages'):
-                pass
-            elif "-" in self['pages']:
-                res.append(", pages %s"%self['pages'])
-            else:
-                res.append(", page %s"%self['pages'])
-        elif self.type == 'techreport':
-            res = [ "%s %s %s" % (self['institution'],
-                                  self.get('type', 'technical report'),
-                                  self.get('number', "")) ]
-            if self.get('month') or self.get('year'):
-                res.append(", %s %s" % (self.get('month', ''),
-                                        self.get('year', '')))
-        elif self.type == 'mastersthesis' or self.type == 'phdthesis':
-            if self.get('type'):
-                res = [self['type']]
-            elif self.type == 'mastersthesis':
-                res = ["Master's thesis"]
-            else:
-                res = ["Ph.D. thesis"]
-            if self.get('school'):
-                res.append(", %s"%(self['school']))
-            if self.get('month') or self.get('year'):
-                res.append(", %s %s" % (self.get('month', ''),
-                                        self.get('year', '')))
-        elif self.type == 'book':
-            res = [self['publisher']]
-            if self.get('year'):
-                res.append(" ");
-                res.append(self.get('year'));
-            # res.append(", %s"%(self.get('year')))
-            if self.get('series'):
-                res.append(",");
-                res.append(self['series']);
-        elif self.type == 'misc':
-            res = [self['howpublished']]
-            if self.get('month') or self.get('year'):
-                res.append(", %s %s" % (self.get('month', ''),
-                                        self.get('year', '')))
-            if not self.get('pages'):
-                pass
-            elif "-" in self['pages']:
-                res.append(", pages %s"%self['pages'])
-            else:
-                res.append(", page %s"%self['pages'])
-        else:
-            res = ["&lt;Odd type %s&gt;"%self.type]
-
-        res[0:0] = ["<span class='biblio'>"]
-        res.append(".</span>")
-
-        bibtexurl = "./bibtex.html#%s"%url_untranslate(self.key)
-        res.append((" <a href='%s'>"
-                    "(BibTeX&nbsp;entry)"
-                    "</a>") %bibtexurl)
-        return htmlize("".join(res))
-
-    def to_html(self, cache_path="./cache", base_url="."):
-        """Return the HTML for this entry."""
-        imp = self.isImportant()
-        draft = self.get('year') == 'forthcoming'
-        if imp:
-            res = ["<li><div class='impEntry'>&nbsp;&nbsp;&bull; " ]
-        elif draft:
-            res = ["<li><div class='draftEntry'>&nbsp;&nbsp;&bull; " ]
-        else:
-            res = ["<li>&nbsp;&nbsp;&bull; "]
-
-        if imp or not draft:
-            # Add a picture of the rank
-            # Only if year is known or paper important!
-            r = rank.get_rank_html(self['title'], self.get('year'),
-                                   update=False, base_url=base_url)
-            if r is not None:
-                res.append(r)
-
-        res.append("<a name='%s'>%s</a>"%(
-            url_untranslate(self.key),htmlize(self['title'])))
-
-        for cached in 0,1:
-            availability = []
-            if not cached:
-                for which in [ "amazon", "excerpt", "publisher" ]:
-                    key = "www_%s_url"%which
-                    if self.get(key):
-                        url=self[key]
-                        url = unTeXescapeURL(url)
-                        availability.append('<a href="%s">%s</a>' %(url,which))
-
-            cache_section = self.get('www_cache_section', ".")
-            if cache_section not in config.CACHE_SECTIONS:
-                if cache_section != ".":
-                    print >>sys.stderr, "Unrecognized cache section %s"%(
-                        cache_section)
-                cache_section="."
-
-            for key, name, ext in (('www_abstract_url', 'abstract','abstract'),
-                                   ('www_html_url', 'HTML', 'html'),
-                                   ('www_pdf_url', 'PDF', 'pdf'),
-                                   ('www_ps_url', 'PS', 'ps'),
-                                   ('www_txt_url', 'TXT', 'txt'),
-                                   ('www_ps_gz_url', 'gzipped PS','ps.gz')
-                                   ):
-                if cached:
-                    #XXXX the URL needs to be relative to the absolute
-                    #XXXX cache path.
-                    url = smartJoin(cache_path,cache_section,
-                                    "%s.%s"%(self.key,ext))
-                    fname = smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
-                                      cache_section,
-                                      "%s.%s"%(self.key,ext))
-                    if not os.path.exists(fname): continue
-                else:
-                    url = self.get(key)
-                    if not url: continue
-                    url = unTeXescapeURL(url)
-                url = url.replace('&', '&amp;')
-                availability.append('<a href="%s">%s</a>' %(url,name))
-
-            if availability:
-                res.append([" ", " "][cached])
-                res.append("(")
-                if cached: res.append("Cached: ")
-                res.append(", ".join(availability))
-                res.append(")")
-
-        res.append("<br />&nbsp;&nbsp;&nbsp;&nbsp;by ")
-
-        #res.append("<!-- %r -->\n\n" % self.parsedAuthor)
-        htmlAuthors = [ a.htmlizeWithLink() for a in self.parsedAuthor ]
-
-        if len(htmlAuthors) == 1:
-            res.append(htmlAuthors[0])
-        elif len(htmlAuthors) == 2:
-            res.append(" and ".join(htmlAuthors))
-        else:
-            res.append(", ".join(htmlAuthors[:-1]))
-            res.append(", and ")
-            res.append(htmlAuthors[-1])
-
-        if res[-1][-1] != '.':
-            res.append(".")
-        res.append("<br />&nbsp;&nbsp;&nbsp;&nbsp;\n")
-        res.append(self.biblio_to_html())
-        res.append("<a href='#%s'>&middot;</a>"%url_untranslate(self.key))
-        res.append("<br /><br />&nbsp;&nbsp;&nbsp;&nbsp;")
-
-        if self.get('www_remarks'):
-            res.append("<br /><br />&nbsp;&nbsp;&nbsp;&nbsp;%s<br /><br />&nbsp;&nbsp;&nbsp;&nbsp;"%htmlize(
-                self['www_remarks']))
-
-        if imp or draft:
-            res.append("</div>")
-        res.append("</li>\n\n")
-
-        return "".join(res)
-
-def unTeXescapeURL(s):
-    """Turn a URL as formatted in TeX into a real URL."""
-    s = s.replace("\\_", "_")
-    s = s.replace("\\-", "")
-    s = s.replace("\{}", "")
-    s = s.replace("{}", "")
-    return s
-
-def TeXescapeURL(s):
-    """Escape a URL for use in TeX"""
-    s = s.replace("_", "\\_")
-    s = s.replace("~", "\{}~")
-    return s
-
-RE_LONE_AMP = re.compile(r'&([^a-z0-9])')
-RE_LONE_I = re.compile(r'\\i([^a-z0-9])')
-RE_ACCENT = re.compile(r'\\([\'`~^"c])([^{]|{.})')
-RE_LIGATURE = re.compile(r'\\(AE|ae|OE|oe|AA|aa|O|o|ss)([^a-z0-9])')
-ACCENT_MAP = { "'" : 'acute',
-               "`" : 'grave',
-               "~" : 'tilde',
-               "^" : 'circ',
-               '"' : 'uml',
-               "c" : 'cedil',
-               }
-UNICODE_MAP = { '&nacute;' : '&#324;', }
-HTML_LIGATURE_MAP = {
-    'AE' : '&AElig;',
-    'ae' : '&aelig;',
-    'OE' : '&OElig;',
-    'oe' : '&oelig;',
-    'AA' : '&Aring;',
-    'aa' : '&aring;',
-    'O' : '&Oslash;',
-    'o' : '&oslash;',
-    'ss' : '&szlig;',
-    }
-RE_TEX_CMD = re.compile(r"(?:\\[a-zA-Z@]+|\\.)")
-RE_PAGE_SPAN = re.compile(r"(\d)--(\d)")
-def _unaccent(m):
-    accent,char = m.groups()
-    if char[0] == '{':
-        char = char[1]
-    accented = "&%s%s;" % (char, ACCENT_MAP[accent])
-    return UNICODE_MAP.get(accented, accented)
-def _unlig_html(m):
-    return "%s%s"%(HTML_LIGATURE_MAP[m.group(1)],m.group(2))
-def htmlize(s):
-    """Turn a TeX string into good-looking HTML."""
-    s = RE_LONE_AMP.sub(lambda m: "&amp;%s" % m.group(1), s)
-    s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
-    s = RE_ACCENT.sub(_unaccent, s)
-    s = unTeXescapeURL(s)
-    s = RE_LIGATURE.sub(_unlig_html, s);
-    s = RE_TEX_CMD.sub("", s)
-    s = s.translate(ALLCHARS, "{}")
-    s = RE_PAGE_SPAN.sub(lambda m: "%s-%s"%(m.groups()), s)
-    s = s.replace("---", "&mdash;");
-    s = s.replace("--", "&ndash;");
-    return s
-
-def author_url(author):
-    """Given an author's name, return a URL for his/her homepage."""
-    for pat, url in config.AUTHOR_RE_LIST:
-        if pat.search(author):
-            return url
-    return None
-
-def txtize(s):
-    """Turn a TeX string into decent plaintext."""
-    s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
-    s = RE_ACCENT.sub(lambda m: "%s" % m.group(2), s)
-    s = RE_LIGATURE.sub(lambda m: "%s%s"%m.groups(), s)
-    s = RE_TEX_CMD.sub("", s)
-    s = s.translate(ALLCHARS, "{}")
-    return s
-
-PROCEEDINGS_RE = re.compile(
-    r'((?:proceedings|workshop record) of(?: the)? )(.*)',
-    re.I)
-
-class ParsedAuthor:
-    """The parsed name of an author.
-
-       Eddie deserves credit for this incredibly hairy business.
-    """
-    def __init__(self, first, von, last, jr):
-        self.first = first
-        self.von = von
-        self.last = last
-        self.jr = jr
-        self.collapsable = 1
-
-        self.html = htmlize(str(self))
-        self.txt = txtize(str(self))
-
-        s = self.html
-        for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
-            if pat.search(s):
-                self.collapsable = 0
-                break
-    def __eq__(self, o):
-        return ((self.first == o.first) and
-                (self.last == o.last) and
-                (self.von == o.von) and
-                (self.jr == o.jr))
-
-    def __hash__(self):
-        return hash(repr(self))
-
-    def collapsesTo(self, o):
-        """Return true iff 'o' could be a more canonical version of this author
-        """
-        if not self.collapsable or not o.collapsable:
-            return self
-
-        if self.last != o.last or self.von != o.von or self.jr != o.jr:
-            return self
-        if not self.first:
-            return o
-
-        if len(self.first) == len(o.first):
-            n = []
-            for a,b in zip(self.first, o.first):
-                if a == b:
-                    n.append(a)
-                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
-                    n.append(b)
-                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
-                    n.append(a)
-                else:
-                    return self
-            if n == self.first:
-                return self
-            elif n == o.first:
-                return o
-            else:
-                return self
-        else:
-            realname = max([len(n) for n in self.first+o.first])>2
-            if not realname:
-                return self
-            if len(self.first) < len(o.first):
-                short = self.first; long = o.first
-            else:
-                short = o.first; long = self.first
-
-            initials_s = "".join([n[0] for n in short])
-            initials_l = "".join([n[0] for n in long])
-            idx = initials_l.find(initials_s)
-            if idx < 0:
-                return self
-            n = long[:idx]
-            for i in range(idx, idx+len(short)):
-                a = long[i]; b = short[i-idx]
-                if a == b:
-                    n.append(a)
-                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
-                    n.append(b)
-                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
-                    n.append(a)
-                else:
-                    return self
-            n += long[idx+len(short):]
-            if n == self.first:
-                return self
-            elif n == o.first:
-                return o
-            else:
-                return self
-
-    def __repr__(self):
-        return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
-                                            self.last,self.jr)
-    def __str__(self):
-        a = " ".join(self.first+self.von+self.last)
-        if self.jr:
-            return "%s, %s" % (a,self.jr)
-        return a
-
-    def getHomepage(self):
-        s = self.html
-        for pat, url in config.AUTHOR_RE_LIST:
-            if pat.search(s):
-                return url
-        return None
-
-    def getSortingName(self):
-        """Return a representation of this author's name in von-last-first-jr
-           order, unless overridden by ALPH """
-        s = self.html
-        for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
-            if pat.search(s):
-                return v
-
-        return txtize(" ".join(self.von+self.last+self.first+self.jr))
-
-    def getSectionName(self):
-        """Return a HTML representation of this author's name in
-           last, first von, jr order"""
-        secname = " ".join(self.last)
-        more = self.first+self.von
-        if more:
-            secname += ", "+" ".join(more)
-        if self.jr:
-            secname += ", "+" ".join(self.jr)
-        secname = htmlize(secname)
-        return secname
-
-    def htmlizeWithLink(self):
-        a = self.html
-        u = self.getHomepage()
-        if u:
-            return "<a href='%s'>%s</a>"%(u,a)
-        else:
-            return a
 
 def _split(s,w=79,indent=8):
     r = []
@@ -886,105 +265,7 @@ class FileIter:
             return self._next()
 
-def parseAuthor(s):
-    try:
-        return _parseAuthor(s)
-    except:
-        print >>sys.stderr, "Internal error while parsing author %r"%s
-        raise
-
-def _parseAuthor(s):
-    """Take an author string and return a list of ParsedAuthor."""
-    items = []
-    s = s.strip()
-    while s:
-        s = s.strip()
-        bracelevel = 0
-        for i in xrange(len(s)):
-            if s[i] == '{':
-                bracelevel += 1
-            elif s[i] == '}':
-                bracelevel -= 1
-            elif bracelevel <= 0 and s[i] in " \t\n,":
-                break
-        if i+1 == len(s):
-            items.append(s)
-        else:
-            items.append(s[0:i])
-            if (s[i] == ','):
-                items.append(',')
-        s = s[i+1:]
-
-    authors = [[]]
-    for item in items:
-        if item == 'and':
-            authors.append([])
-        else:
-            authors[-1].append(item)
-
-    parsedAuthors = []
-    # Split into first, von, last, jr
-    for author in authors:
-        commas = 0
-        fvl = []
-        vl = []
-        f = []
-        v = []
-        l = []
-        j = []
-        cur = fvl
-        for item in author:
-            if item == ',':
-                if commas == 0:
-                    vl = fvl
-                    fvl = []
-                    cur = f
-                else:
-                    j.extend(f)
-                    cur = f = []
-                commas += 1
-            else:
-                cur.append(item)
-
-        if commas == 0:
-            split_von(f,v,l,fvl)
-        else:
-            f_tmp = []
-            split_von(f_tmp,v,l,vl)
-
-        parsedAuthors.append(ParsedAuthor(f,v,l,j))
-
-    return parsedAuthors
-
-ALLCHARS = "".join(map(chr,range(256)))
-PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127)))
-LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
-SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-               "abcdefghijklmnopqrstuvwxyz"
-               "@")
-RE_ESCAPED = re.compile(r'\\.')
-def split_von(f,v,l,x):
-    in_von = 0
-    while x:
-        tt = t = x[0]
-        del x[0]
-        if tt[:2] == '{\\':
-            tt = tt.translate(ALLCHARS, SV_DELCHARS)
-            tt = RE_ESCAPED.sub("", tt)
-            tt = tt.translate(ALLCHARS, "{}")
-        if tt.translate(ALLCHARS, LC_CHARS) == "":
-            v.append(t)
-            in_von = 1
-        elif in_von and f is not None:
-            l.append(t)
-            l.extend(x)
-            return
-        else:
-            f.append(t)
-    if not in_von:
-        l.append(f[-1])
-        del f[-1]
 
 class Parser:
@@ -1016,7 +297,7 @@ class Parser:
     def _parseKey(self, line):
         it = self.fileiter
-        line = _advance(it,line)
+        line = _advance(it, line)
         m = KEY_RE.match(line)
         if not m:
             raise ParseError("Expected key at line %s"%self.fileiter.lineno)
diff --git a/entry.py b/entry.py
new file mode 100644
index 0000000..9846e32
--- /dev/null
+++ b/entry.py
@@ -0,0 +1,653 @@
+import rank
+import sys
+import re
+import config
+import os
+from utils import htmlize, txtize, url_untranslate, unTeXescapeURL, smartJoin,\
+    _split
+
+# Fields that we only care about for making web pages (BibTeX doesn't
+# recognize them.)
+WWW_FIELDS = ['www_section', 'www_important', 'www_remarks',
+              'www_abstract_url', 'www_html_url', 'www_pdf_url', 'www_ps_url',
+              'www_txt_url', 'www_ps_gz_url', 'www_amazon_url',
+              'www_excerpt_url', 'www_publisher_url',
+              'www_cache_section', 'www_tags']
+
+def author_url(author):
+    """Given an author's name, return a URL for his/her homepage."""
+    for pat, url in config.AUTHOR_RE_LIST:
+        if pat.search(author):
+            return url
+    return None
+ALLCHARS = "".join(map(chr,range(256)))
+PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127)))
+LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
+SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+               "abcdefghijklmnopqrstuvwxyz"
+               "@")
+RE_ESCAPED = re.compile(r'\\.')
+PROCEEDINGS_RE = re.compile(
+    r'((?:proceedings|workshop record) of(?: the)? )(.*)',
+    re.I)
+
+def split_von(f,v,l,x):
+    in_von = 0
+    while x:
+        tt = t = x[0]
+        del x[0]
+        if tt[:2] == '{\\':
+            tt = tt.translate(ALLCHARS, SV_DELCHARS)
+            tt = RE_ESCAPED.sub("", tt)
+            tt = tt.translate(ALLCHARS, "{}")
+        if tt.translate(ALLCHARS, LC_CHARS) == "":
+            v.append(t)
+            in_von = 1
+        elif in_von and f is not None:
+            l.append(t)
+            l.extend(x)
+            return
+        else:
+            f.append(t)
+    if not in_von:
+        l.append(f[-1])
+        del f[-1]
+
+def buildAuthorTable(entries):
+    """Given a list of BibTeXEntry, return a map from parsed author name to
+    parsed canonical name.
+    """
+    authorsByLast = {}
+    for e in entries:
+        for a in e.parsedAuthor:
+            authorsByLast.setdefault(tuple(a.last), []).append(a)
+    # map from author to collapsed author.
+    result = {}
+    for k,v in config.COLLAPSE_AUTHORS.items():
+        a = parseAuthor(k)[0]
+        c = parseAuthor(v)[0]
+        result[c] = c
+        result[a] = c
+
+    for e in entries:
+        for author in e.parsedAuthor:
+            if result.has_key(author):
+                continue
+
+            c = author
+            for a in authorsByLast[tuple(author.last)]:
+                if a is author:
+                    continue
+                c = c.collapsesTo(a)
+            result[author] = c
+
+    if 0:
+        for a,c in result.items():
+            if a != c:
+                print "Collapsing authors: %s => %s" % (a,c)
+    if 0:
+        print parseAuthor("Franz Kaashoek")[0].collapsesTo(
+            parseAuthor("M. Franz Kaashoek")[0])
+        print parseAuthor("Paul F. Syverson")[0].collapsesTo(
+            parseAuthor("Paul Syverson")[0])
+        print parseAuthor("Paul Syverson")[0].collapsesTo(
+            parseAuthor("Paul F. Syverson")[0])
+
+    return result
+
+# List of fields that appear when we display the entries as BibTeX.
+
+DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle',
+'school', 'institution', 'organization', 'volume', 'number', 'year',
+'month', 'address', 'location', 'chapter', 'edition', 'pages', 'editor',
+'howpublished', 'key', 'publisher', 'type', 'note', 'series' ]
+
+class BibTeXEntry:
+    """A single BibTeX entry."""
+    def __init__(self, type, key, entries):
+        self.type = type  # What kind of entry is it? (@book,@injournal,etc)
+        self.key = key  # What key does it have?
+        self.entries = entries  # Map from key to value.
+        self.entryLine = 0  # Defined on this line number
+    def get(self, k, v=None):
+        return self.entries.get(k,v)
+    def has_key(self, k):
+        return self.entries.has_key(k)
+    def __getitem__(self, k):
+        return self.entries[k]
+    def __setitem__(self, k, v):
+        self.entries[k] = v
+    def __str__(self):
+        return self.format(70,1)
+    def getURL(self):
+        """Return the best URL to use for this paper, or None."""
+        best = None
+        for field in ['www_pdf_url', 'www_ps_gz_url', 'www_ps_url',
+                      'www_html_url', 'www_txt_url', ]:
+            u = self.get(field)
+            if u:
+                if not best:
+                    best = u
+                elif (best.startswith("http://citeseer.nj.nec.com/")
+                      and not u.startswith("http://citeseer.nj.nec.com/")):
+                    best = u
+        return best
+
+    def format(self, width=70, indent=8, v=0, invStrings={}):
+        """Format this entry as BibTeX."""
+        d = ["@%s{%s,\n" % (self.type, self.key)]
+        if v:
+            df = DISPLAYED_FIELDS[:]
+            for k in self.entries.keys():
+                if k not in df:
+                    df.append(k)
+        else:
+            df = DISPLAYED_FIELDS
+        for f in df:
+            if not self.entries.has_key(f):
+                continue
+            v = self.entries[f]
+            if v.startswith("<span class='bad'>"):
+                d.append("%%%%% ERROR: Missing field\n")
+                d.append("%% %s = {?????},\n"%f)
+                continue
+            np = v.translate(ALLCHARS, PRINTINGCHARS)
+            if np:
+                d.append("%%%%% "+("ERROR: Non-ASCII characters: '%r'\n"%np))
+            d.append(" ")
+            v = v.replace("&", "&amp;")
+            if invStrings.has_key(v):
+                s = "%s = %s,\n" %(f, invStrings[v])
+            else:
+                s = "%s = {%s},\n" % (f, v)
+            d.append(_split(s,width,indent))
+        d.append("}\n")
+        return "".join(d)
+    def resolve(self):
+        """Handle post-processing for this entry"""
+        a = self.get('author')
+        if a:
+            self.parsedAuthor = parseAuthor(a)
+            #print a
+            #print " => ",repr(self.parsedAuthor)
+        else:
+            self.parsedAuthor = None
+
+    def isImportant(self):
+        """Return 1 iff this entry is marked as important"""
+        imp = self.get("www_important")
+        if imp and imp.strip().lower() not in ("no", "false", "0"):
+            return 1
+        return 0
+
+    def check(self):
+        """Print any errors for this entry, and return true if there were
+           none."""
+        errs = self._check()
+        for e in errs:
+            print e
+        return not errs
+
+    def _check(self):
+        errs = []
+        if self.type == 'inproceedings':
+            fields = 'booktitle', 'year'
+        elif self.type == 'incollection':
+            fields = 'booktitle', 'year'
+        elif self.type == 'proceedings':
+            fields = 'booktitle', 'editor'
+        elif self.type == 'article':
+            fields = 'journal', 'year'
+        elif self.type == 'techreport':
+            fields = 'institution',
+        elif self.type == 'misc':
+            fields = 'howpublished',
+        elif self.type in ('mastersthesis', 'phdthesis'):
+            fields = ()
+        else:
+            fields = ()
+            errs.append("ERROR: odd type %s"%self.type)
+        if self.type != 'proceedings':
+            fields += 'title', 'author', 'www_section', 'year'
+
+        for field in fields:
+            if self.get(field) is None or \
+               self.get(field).startswith("<span class='bad'>"):
+                errs.append("ERROR: %s has no %s field" % (self.key, field))
+                self.entries[field] = "<span class='bad'>%s:??</span>"%field
+
+        if self.type == 'inproceedings':
+            if self.get("booktitle"):
+                if not self['booktitle'].startswith("Proceedings of") and \
+                   not self['booktitle'].startswith("{Proceedings of"):
+                    errs.append("ERROR: %s's booktitle (%r) doesn't start with 'Proceedings of'" % (self.key, self['booktitle']))
+
+        if self.has_key("pages") and not re.search(r'\d+--\d+', self['pages']):
+            errs.append("ERROR: Malformed pages in %s"%self.key)
+
+        if self.type == 'proceedings':
+            if self.get('title'):
+                errs.append("ERROR: %s is a proceedings: it should have a booktitle, not a title." % self.key)
+
+        for field, value in self.entries.items():
+            if value.translate(ALLCHARS, PRINTINGCHARS):
+                errs.append("ERROR: %s.%s has non-ASCII characters"%(
+                    self.key, field))
+            if field.startswith("www_") and field not in WWW_FIELDS:
+                errs.append("ERROR: unknown www field %s"% field)
+            if value.strip()[-1:] == '.' and \
+               field not in ("notes", "www_remarks", "author"):
+                errs.append("ERROR: %s.%s has an extraneous period"%(self.key,
+                    field))
+        return errs
+
+    def biblio_to_html(self):
+        """Return the HTML for the citation portion of entry."""
+        if self.type in ('inproceedings', 'incollection'):
+            booktitle = self['booktitle']
+            bookurl = self.get('bookurl')
+            if bookurl:
+                m = PROCEEDINGS_RE.match(booktitle)
+                if m:
+                    res = ["In the ", m.group(1),
+                           '<a href="%s">' % bookurl, m.group(2), "</a>"]
+                else:
+                    res = ['In the <a href="%s">%s</a>' % (bookurl, booktitle)]
+            else:
+                res = ["In the ", booktitle]
+
+            if self.get("edition"):
+                res.append(",")
+                res.append(self['edition'])
+            if self.get("location"):
+                res.append(", ")
+                res.append(self['location'])
+            elif self.get("address"):
+                res.append(", ")
+                res.append(self['address'])
+            res.append(", %s %s" % (self.get('month', ""), self['year']))
+            if not self.get('pages'):
+                pass
+            elif "-" in self['pages']:
+                res.append(", pages %s" % self['pages'])
+            else:
+                res.append(", page %s" % self['pages'])
+        elif self.type == 'article':
+            res = ["In "]
+            if self.get('journalurl'):
+                res.append('<a href="%s">%s</a>' % (self['journalurl'],
+                                                    self['journal']))
+            else:
+                res.append(self['journal'])
+            if self.get('volume'):
+                res.append(" %s" % self['volume'])
+            if self.get('number'):
+                res.append("(%s)" % self['number'])
+            res.append(", %s %s" % (self.get('month', ""), self['year']))
+            if not self.get('pages'):
+                pass
+            elif "-" in self['pages']:
+                res.append(", pages %s" % self['pages'])
+            else:
+                res.append(", page %s" % self['pages'])
+        elif self.type == 'techreport':
+            res = ["%s %s %s" % (self['institution'],
+                                 self.get('type', 'technical report'),
+                                 self.get('number', ""))]
+            if self.get('month') or self.get('year'):
+                res.append(", %s %s" % (self.get('month', ''),
+                                        self.get('year', '')))
+        elif self.type == 'mastersthesis' or self.type == 'phdthesis':
+            if self.get('type'):
+                res = [self['type']]
+            elif self.type == 'mastersthesis':
+                res = ["Master's thesis"]
+            else:
+                res = ["Ph.D. thesis"]
+            if self.get('school'):
+                res.append(", %s" % (self['school']))
+            if self.get('month') or self.get('year'):
+                res.append(", %s %s" % (self.get('month', ''),
+                                        self.get('year', '')))
+        elif self.type == 'book':
+            res = [self['publisher']]
+            if self.get('year'):
+                res.append(" ")
+                res.append(self.get('year'))
+            # res.append(", %s"%(self.get('year')))
+            if self.get('series'):
+                res.append(",")
+                res.append(self['series'])
+        elif self.type == 'misc':
+            res = [self['howpublished']]
+            if self.get('month') or self.get('year'):
+                res.append(", %s %s" % (self.get('month', ''),
+                                        self.get('year', '')))
+            if not self.get('pages'):
+                pass
+            elif "-" in self['pages']:
+                res.append(", pages %s" % self['pages'])
+            else:
+                res.append(", page %s" % self['pages'])
+        else:
+            res = ["&lt;Odd type %s&gt;" % self.type]
+
+        res[0:0] = ["<span class='biblio'>"]
+        res.append(".</span>")
+
+        bibtexurl = "./bibtex.html#%s" % url_untranslate(self.key)
+        res.append((" <a href='%s'>"
+                    "(BibTeX&nbsp;entry)"
+                    "</a>") % bibtexurl)
+        return htmlize("".join(res))
+
+    def to_html(self, cache_path="./cache", base_url="."):
+        """Return the HTML for this entry."""
+        imp = self.isImportant()
+        draft = self.get('year') == 'forthcoming'
+        if imp:
+            res = ["<li><div class='impEntry'>&nbsp;&nbsp;&bull; "]
+        elif draft:
+            res = ["<li><div class='draftEntry'>&nbsp;&nbsp;&bull; "]
+        else:
+            res = ["<li>&nbsp;&nbsp;&bull; "]
+
+        if imp or not draft:
+            # Add a picture of the rank
+            # Only if year is known or paper important!
+            r = rank.get_rank_html(self['title'], self.get('year'),
+                                   update=False, base_url=base_url)
+            if r is not None:
+                res.append(r)
+
+        res.append("<a name='%s'>%s</a>"%(
+            url_untranslate(self.key),htmlize(self['title'])))
+
+        for cached in 0,1:
+            availability = []
+            if not cached:
+                for which in [ "amazon", "excerpt", "publisher" ]:
+                    key = "www_%s_url"%which
+                    if self.get(key):
+                        url=self[key]
+                        url = unTeXescapeURL(url)
+                        availability.append('<a href="%s">%s</a>' %(url,which))
+
+            cache_section = self.get('www_cache_section', ".")
+            if cache_section not in config.CACHE_SECTIONS:
+                if cache_section != ".":
+                    print >>sys.stderr, "Unrecognized cache section %s"%(
+                        cache_section)
+                cache_section="."
+
+            for key, name, ext in (('www_abstract_url', 'abstract','abstract'),
+                                   ('www_html_url', 'HTML', 'html'),
+                                   ('www_pdf_url', 'PDF', 'pdf'),
+                                   ('www_ps_url', 'PS', 'ps'),
+                                   ('www_txt_url', 'TXT', 'txt'),
+                                   ('www_ps_gz_url', 'gzipped PS','ps.gz')
+                                   ):
+                if cached:
+                    #XXXX the URL needs to be relative to the absolute
+                    #XXXX cache path.
+                    url = smartJoin(cache_path,cache_section,
+                                    "%s.%s"%(self.key,ext))
+                    fname = smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
+                                      cache_section,
+                                      "%s.%s"%(self.key,ext))
+                    if not os.path.exists(fname): continue
+                else:
+                    url = self.get(key)
+                    if not url: continue
+                    url = unTeXescapeURL(url)
+                url = url.replace('&', '&amp;')
+                availability.append('<a href="%s">%s</a>' %(url,name))
+
+            if availability:
+                res.append([" ", " "][cached])
+                res.append("(")
+                if cached: res.append("Cached: ")
+                res.append(", ".join(availability))
+                res.append(")")
+
+        res.append("<br />&nbsp;&nbsp;&nbsp;&nbsp;by ")
+
+        #res.append("<!-- %r -->\n\n" % self.parsedAuthor)
+        htmlAuthors = [ a.htmlizeWithLink() for a in self.parsedAuthor ]
+
+        if len(htmlAuthors) == 1:
+            res.append(htmlAuthors[0])
+        elif len(htmlAuthors) == 2:
+            res.append(" and ".join(htmlAuthors))
+        else:
+            res.append(", ".join(htmlAuthors[:-1]))
+            res.append(", and ")
+            res.append(htmlAuthors[-1])
+
+        if res[-1][-1] != '.':
+            res.append(".")
+        res.append("<br />&nbsp;&nbsp;&nbsp;&nbsp;\n")
+        res.append(self.biblio_to_html())
+        res.append("<a href='#%s'>&middot;</a>"%url_untranslate(self.key))
+        res.append("<br /><br />&nbsp;&nbsp;&nbsp;&nbsp;")
+
+        if self.get('www_remarks'):
+            res.append("<br /><br />&nbsp;&nbsp;&nbsp;&nbsp;%s<br /><br />&nbsp;&nbsp;&nbsp;&nbsp;"%htmlize(
+                self['www_remarks']))
+
+        if imp or draft:
+            res.append("</div>")
+        res.append("</li>\n\n")
+
+        return "".join(res)
+
+
+class ParsedAuthor:
+    """The parsed name of an author.
+
+       Eddie deserves credit for this incredibly hairy business.
+    """
+    def __init__(self, first, von, last, jr):
+        self.first = first
+        self.von = von
+        self.last = last
+        self.jr = jr
+        self.collapsable = 1
+
+        self.html = htmlize(str(self))
+        self.txt = txtize(str(self))
+
+        s = self.html
+        for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
+            if pat.search(s):
+                self.collapsable = 0
+                break
+
+    def __eq__(self, o):
+        return ((self.first == o.first) and
+                (self.last == o.last) and
+                (self.von == o.von) and
+                (self.jr == o.jr))
+
+    def __hash__(self):
+        return hash(repr(self))
+
+    def collapsesTo(self, o):
+        """Return true iff 'o' could be a more canonical version of this author
+        """
+        if not self.collapsable or not o.collapsable:
+            return self
+
+        if self.last != o.last or self.von != o.von or self.jr != o.jr:
+            return self
+        if not self.first:
+            return o
+
+        if len(self.first) == len(o.first):
+            n = []
+            for a,b in zip(self.first, o.first):
+                if a == b:
+                    n.append(a)
+                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+                    n.append(b)
+                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+                    n.append(a)
+                else:
+                    return self
+            if n == self.first:
+                return self
+            elif n == o.first:
+                return o
+            else:
+                return self
+        else:
+            realname = max([len(n) for n in self.first+o.first])>2
+            if not realname:
+                return self
+
+            if len(self.first) < len(o.first):
+                short = self.first; long = o.first
+            else:
+                short = o.first; long = self.first
+
+            initials_s = "".join([n[0] for n in short])
+            initials_l = "".join([n[0] for n in long])
+            idx = initials_l.find(initials_s)
+            if idx < 0:
+                return self
+            n = long[:idx]
+            for i in range(idx, idx+len(short)):
+                a = long[i]; b = short[i-idx]
+                if a == b:
+                    n.append(a)
+                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+                    n.append(b)
+                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+                    n.append(a)
+                else:
+                    return self
+            n += long[idx+len(short):]
+
+            if n == self.first:
+                return self
+            elif n == o.first:
+                return o
+            else:
+                return self
+
+    def __repr__(self):
+        return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
+                                            self.last,self.jr)
+    def __str__(self):
+        a = " ".join(self.first+self.von+self.last)
+        if self.jr:
+            return "%s, %s" % (a,self.jr)
+        return a
+
+    def getHomepage(self):
+        s = self.html
+        for pat, url in config.AUTHOR_RE_LIST:
+            if pat.search(s):
+                return url
+        return None
+
+    def getSortingName(self):
+        """Return a representation of this author's name in von-last-first-jr
+           order, unless overridden by ALPH """
+        s = self.html
+        for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
+            if pat.search(s):
+                return v
+
+        return txtize(" ".join(self.von+self.last+self.first+self.jr))
+
+    def getSectionName(self):
+        """Return a HTML representation of this author's name in
+           last, first von, jr order"""
+        secname = " ".join(self.last)
+        more = self.first+self.von
+        if more:
+            secname += ", "+" ".join(more)
+        if self.jr:
+            secname += ", "+" ".join(self.jr)
+        secname = htmlize(secname)
+        return secname
+
+    def htmlizeWithLink(self):
+        a = self.html
+        u = self.getHomepage()
+        if u:
+            return "<a href='%s'>%s</a>"%(u,a)
+        else:
+            return a
+
+
+def parseAuthor(s):
+    try:
+        return _parseAuthor(s)
+    except:
+        print >>sys.stderr, "Internal error while parsing author %r"%s
+        raise
+
+def _parseAuthor(s):
+    """Take an author string and return a list of ParsedAuthor."""
+    items = []
+
+    s = s.strip()
+    while s:
+        s = s.strip()
+        bracelevel = 0
+        for i in xrange(len(s)):
+            if s[i] == '{':
+                bracelevel += 1
+            elif s[i] == '}':
+                bracelevel -= 1
+            elif bracelevel <= 0 and s[i] in " \t\n,":
+                break
+        if i+1 == len(s):
+            items.append(s)
+        else:
+            items.append(s[0:i])
+            if (s[i] == ','):
+                items.append(',')
+        s = s[i+1:]
+
+    authors = [[]]
+    for item in items:
+        if item == 'and':
+            authors.append([])
+        else:
+            authors[-1].append(item)
+
+    parsedAuthors = []
+    # Split into first, von, last, jr
+    for author in authors:
+        commas = 0
+        fvl = []
+        vl = []
+        f = []
+        v = []
+        l = []
+        j = []
+        cur = fvl
+        for item in author:
+            if item == ',':
+                if commas == 0:
+                    vl = fvl
+                    fvl = []
+                    cur = f
+                else:
+                    j.extend(f)
+                    cur = f = []
+                commas += 1
+            else:
+                cur.append(item)
+
+        if commas == 0:
+            split_von(f,v,l,fvl)
+        else:
+            f_tmp = []
+            split_von(f_tmp,v,l,vl)
+
+        parsedAuthors.append(ParsedAuthor(f,v,l,j))
+
+    return parsedAuthors
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..4d4b583
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,118 @@
+import re
+import os
+
+ALLCHARS = "".join(map(chr,range(256)))
+RE_LONE_AMP = re.compile(r'&([^a-z0-9])')
+RE_LONE_I = re.compile(r'\\i([^a-z0-9])')
+RE_ACCENT = re.compile(r'\\([\'`~^"c])([^{]|{.})')
+RE_LIGATURE = re.compile(r'\\(AE|ae|OE|oe|AA|aa|O|o|ss)([^a-z0-9])')
+ACCENT_MAP = { "'" : 'acute',
+               "`" : 'grave',
+               "~" : 'tilde',
+               "^" : 'circ',
+               '"' : 'uml',
+               "c" : 'cedil',
+               }
+
+UNICODE_MAP = { '&nacute;' : '&#324;', }
+HTML_LIGATURE_MAP = {
+    'AE' : '&AElig;',
+    'ae' : '&aelig;',
+    'OE' : '&OElig;',
+    'oe' : '&oelig;',
+    'AA' : '&Aring;',
+    'aa' : '&aring;',
+    'O' : '&Oslash;',
+    'o' : '&oslash;',
+    'ss' : '&szlig;',
+    }
+RE_TEX_CMD = re.compile(r"(?:\\[a-zA-Z@]+|\\.)")
+RE_PAGE_SPAN = re.compile(r"(\d)--(\d)")
+
+def url_untranslate(s):
+    """Change a BibTeX key into a string suitable for use in a URL."""
+    s = re.sub(r'([%<>`#, &_\';])', lambda m: "_%02x" % ord(m.group(1)), s)
+    s = s.replace("/", ":")
+    return s
+
+def txtize(s):
+    """Turn a TeX string into decent plaintext."""
+    s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
+    s = RE_ACCENT.sub(lambda m: "%s" % m.group(2), s)
+    s = RE_LIGATURE.sub(lambda m: "%s%s"%m.groups(), s)
+    s = RE_TEX_CMD.sub("", s)
+    s = s.translate(ALLCHARS, "{}")
+    return s
+
+def unTeXescapeURL(s):
+    """Turn a URL as formatted in TeX into a real URL."""
+    s = s.replace("\\_", "_")
+    s = s.replace("\\-", "")
+    s = s.replace("\{}", "")
+    s = s.replace("{}", "")
+    return s
+
+def TeXescapeURL(s):
+    """Escape a URL for use in TeX"""
+    s = s.replace("_", "\\_")
+    s = s.replace("~", "\{}~")
+    return s
+
+def _unaccent(m):
+    accent,char = m.groups()
+    if char[0] == '{':
+        char = char[1]
+    accented = "&%s%s;" % (char, ACCENT_MAP[accent])
+    return UNICODE_MAP.get(accented, accented)
+
+def _unlig_html(m):
+    return "%s%s"%(HTML_LIGATURE_MAP[m.group(1)],m.group(2))
+
+def htmlize(s):
+    """Turn a TeX string into good-looking HTML."""
+    s = RE_LONE_AMP.sub(lambda m: "&amp;%s" % m.group(1), s)
+    s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
+    s = RE_ACCENT.sub(_unaccent, s)
+    s = unTeXescapeURL(s)
+    s = RE_LIGATURE.sub(_unlig_html, s);
+    s = RE_TEX_CMD.sub("", s)
+    s = s.translate(ALLCHARS, "{}")
+    s = RE_PAGE_SPAN.sub(lambda m: "%s-%s"%(m.groups()), s)
+    s = s.replace("---", "&mdash;");
+    s = s.replace("--", "&ndash;");
+    return s
+
+def smartJoin(*lst):
+    """Equivalent to os.path.join, but handle "." and ".." entries a bit better.
+    """
+    lst = [item for item in lst if item != "."]
+    idx = 0
+    while idx < len(lst):
+        if idx > 0 and lst[idx] == "..":
+            del lst[idx]
+        else:
+            idx += 1
+    return os.path.join(*lst)
+
+def _split(s,w=79,indent=8):
+    r = []
+    s = re.sub(r"\s+", " ", s)
+    first = 1
+    indentation = ""
+    while len(s) > w:
+        for i in xrange(w-1, 20, -1):
+            if s[i] == ' ':
+                r.append(indentation+s[:i])
+                s = s[i+1:]
+                break
+        else:
+            r.append(indentation+s.strip())
+            s = ""
+        if first:
+            first = 0
+            w -= indent
+            indentation = " "*indent
+    if (s):
+        r.append(indentation+s)
+    r.append("")
+    return "\n".join(r)
-- 
cgit v1.2.3-70-g09d2
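
For anyone picking up these new modules, here is a minimal usage sketch (not
part of the patch itself). The module and symbol names are taken from the diff
above; parseFile is the existing entry point already listed in BibTeX.py's
__all__, and the "anonbib.bib" input path is a hypothetical placeholder.

    # Python 2, matching the codebase: the parser stays in BibTeX.py, while
    # BibTeXEntry now lives in entry.py and the TeX/HTML string helpers in
    # utils.py.
    import BibTeX                      # parser and BibTeX container
    from entry import BibTeXEntry      # moved out of BibTeX.py by this patch
    from utils import txtize, htmlize  # moved out of BibTeX.py by this patch

    bib = BibTeX.parseFile("anonbib.bib")   # hypothetical .bib file
    for ent in bib.entries:
        assert isinstance(ent, BibTeXEntry)
        print txtize(ent.get('title', ''))  # plain-text titles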