import sys
import config
import re
from utils import htmlize, txtize, ALLCHARS, PRINTINGCHARS

# Lowercase ASCII letters: a token that is empty once these are deleted is
# treated as a "von" particle by split_von().
LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
# Characters deleted from a token that starts with a brace-backslash sequence
# before split_von() classifies it.
SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
               "abcdefghijklmnopqrstuvwxyz"
               "@")
# A backslash followed by any single character.
RE_ESCAPED = re.compile(r'\\.')


def split_von(f, v, l, x):
    """Distribute the name tokens in 'x' over the lists 'f', 'v' and 'l'.

    f, v, l -- output lists (first, von, last); tokens are appended in place.
    x       -- list of tokens; it is consumed (emptied from the front) as
               tokens are processed.

    A token counts as a "von" token when nothing is left of it after its
    lowercase ASCII letters are deleted.  Tokens that start with '{\\' are
    first reduced by deleting SV_DELCHARS, backslash-escaped characters and
    braces.  (This relies on the Python 2 two-argument str.translate and
    presumably on ALLCHARS being an identity table -- confirm in utils.)

    Tokens seen before any von token go to 'f'.  The first non-von token
    after a von token, and everything following it, goes to 'l'.  If no von
    token was found, the final token of 'f' is moved to 'l'.

    Raises IndexError when no von token is found and 'f' ends up empty
    (for example when 'x' and 'f' are both empty on entry).
    """
    in_von = 0
    while x:
        tt = t = x[0]
        del x[0]
        if tt[:2] == '{\\':
            tt = tt.translate(ALLCHARS, SV_DELCHARS)
            tt = RE_ESCAPED.sub("", tt)
            tt = tt.translate(ALLCHARS, "{}")
        if tt.translate(ALLCHARS, LC_CHARS) == "":
            v.append(t)
            in_von = 1
        elif in_von and f is not None:
            # First non-von token after the von part: it and the rest of
            # the tokens form the last name.
            l.append(t)
            l.extend(x)
            return
        else:
            f.append(t)
    if not in_von:
        # No von part at all: the final token is the last name.
        l.append(f[-1])
        del f[-1]


def buildAuthorTable(entries):
    """Given a list of BibTeXEntry, return a map from parsed author name
       to parsed canonical name.

       Explicit mappings from config.COLLAPSE_AUTHORS are installed first
       and are never overridden.  Every other author is folded, via
       ParsedAuthor.collapsesTo, against each other author object that
       shares the same last name.
    """
    authorsByLast = {}
    for e in entries:
        for a in e.parsedAuthor:
            authorsByLast.setdefault(tuple(a.last), []).append(a)

    # map from author to collapsed author.
    result = {}
    for k, v in config.COLLAPSE_AUTHORS.items():
        a = parseAuthor(k)[0]
        c = parseAuthor(v)[0]
        result[c] = c
        result[a] = c

    for e in entries:
        for author in e.parsedAuthor:
            if author in result:
                continue

            c = author
            for a in authorsByLast[tuple(author.last)]:
                if a is author:
                    continue
                c = c.collapsesTo(a)
            result[author] = c

    # Debugging aids; change 0 to 1 to enable.  Each print takes a single
    # parenthesized argument so it behaves the same as a Python 2 print
    # statement.
    if 0:
        for a, c in result.items():
            if a != c:
                print("Collapsing authors: %s => %s" % (a, c))
    if 0:
        print(parseAuthor("Franz Kaashoek")[0].collapsesTo(
            parseAuthor("M. Franz Kaashoek")[0]))
        print(parseAuthor("Paul F. Syverson")[0].collapsesTo(
            parseAuthor("Paul Syverson")[0]))
        print(parseAuthor("Paul Syverson")[0].collapsesTo(
            parseAuthor("Paul F. Syverson")[0]))

    return result


def _mergeGivenName(a, b):
    """Return the fuller of two given-name tokens, or None on conflict.

    Equal tokens merge to themselves; a two-character initial such as "J."
    merges with any token starting with the same character, yielding that
    other token.  The result may be an empty string, so callers must test
    for None explicitly.
    """
    if a == b:
        return a
    if len(a) == 2 and a[1] == '.' and a[0] == b[0]:
        return b
    if len(b) == 2 and b[1] == '.' and a[0] == b[0]:
        return a
    return None


class ParsedAuthor:
    """The parsed name of an author.

       Eddie deserves credit for this incredibly hairy business.

       Attributes:
         first, von, last, jr -- lists of name tokens.
         collapsable -- 0 if the HTML form of the name matches one of
             config.NO_COLLAPSE_AUTHORS_RE_LIST, else 1.
         html, txt -- htmlize()/txtize() renderings of str(self).
    """
    def __init__(self, first, von, last, jr):
        self.first = first
        self.von = von
        self.last = last
        self.jr = jr
        self.collapsable = 1

        self.html = htmlize(str(self))
        self.txt = txtize(str(self))

        s = self.html
        for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
            if pat.search(s):
                self.collapsable = 0
                break

    def __eq__(self, o):
        return ((self.first == o.first) and
                (self.last == o.last) and
                (self.von == o.von) and
                (self.jr == o.jr))

    def __ne__(self, o):
        # Python 2 does not derive != from __eq__; without this, != would
        # compare by identity.
        return not self.__eq__(o)

    def __hash__(self):
        # repr() covers exactly the fields compared by __eq__.
        return hash(repr(self))

    def collapsesTo(self, o):
        """Return the author that this one should be canonicalized to:
           'o' if 'o' could be a more canonical (fuller) version of this
           author, and 'self' otherwise.
        """
        if not self.collapsable or not o.collapsable:
            return self

        if self.last != o.last or self.von != o.von or self.jr != o.jr:
            return self

        if not self.first:
            return o

        if len(self.first) == len(o.first):
            # Same number of given names: merge them position by position.
            merged = []
            for a, b in zip(self.first, o.first):
                name = _mergeGivenName(a, b)
                if name is None:
                    return self
                merged.append(name)
        else:
            # Different number of given names: only attempt a match if at
            # least one token is longer than two characters.
            realname = max([len(n) for n in self.first+o.first]) > 2
            if not realname:
                return self

            if len(self.first) < len(o.first):
                shorter = self.first
                longer = o.first
            else:
                shorter = o.first
                longer = self.first

            # The shorter list's initials must occur as a contiguous run
            # inside the longer list's initials.
            initials_s = "".join([n[0] for n in shorter])
            initials_l = "".join([n[0] for n in longer])
            idx = initials_l.find(initials_s)
            if idx < 0:
                return self

            merged = longer[:idx]
            for i in range(idx, idx+len(shorter)):
                name = _mergeGivenName(longer[i], shorter[i-idx])
                if name is None:
                    return self
                merged.append(name)
            merged += longer[idx+len(shorter):]

        # Collapse only when the merged name is exactly o's name and not
        # our own.
        if merged != self.first and merged == o.first:
            return o
        return self

    def __repr__(self):
        return "ParsedAuthor(%r,%r,%r,%r)" % (self.first, self.von,
                                              self.last, self.jr)

    def __str__(self):
        a = " ".join(self.first+self.von+self.last)
        if self.jr:
            # self.jr is a list of tokens; join it rather than formatting
            # the list object itself.
            return "%s, %s" % (a, " ".join(self.jr))
        return a

    def getHomepage(self):
        """Return the URL paired with the first pattern in
           config.AUTHOR_RE_LIST that matches this author's HTML name,
           or None if no pattern matches.
        """
        s = self.html
        for pat, url in config.AUTHOR_RE_LIST:
            if pat.search(s):
                return url
        return None

    def getSortingName(self):
        """Return a representation of this author's name in von-last-first-jr
           order, unless overridden by ALPH """
        s = self.html
        for pat, v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
            if pat.search(s):
                return v

        return txtize(" ".join(self.von+self.last+self.first+self.jr))

    def getSectionName(self):
        """Return a HTML representation of this author's name in
           last, first von, jr order"""
        secname = " ".join(self.last)
        more = self.first+self.von
        if more:
            secname += ", "+" ".join(more)
        if self.jr:
            secname += ", "+" ".join(self.jr)
        secname = htmlize(secname)
        return secname

    def htmlizeWithLink(self):
        """Return this author's HTML name, wrapped in a link to the
           author's homepage when one is configured."""
        a = self.html
        u = self.getHomepage()
        if u:
            # Two values need two placeholders; a lone "%s" raises
            # TypeError here.
            return "<a href='%s'>%s</a>" % (u, a)
        else:
            return a


def parseAuthor(s):
    """Parse the author string 's' into a list of ParsedAuthor.

    On failure, report the offending string on stderr and re-raise the
    original exception.
    """
    try:
        return _parseAuthor(s)
    except Exception:
        sys.stderr.write("Internal error while parsing author %r\n" % (s,))
        raise


def _parseAuthor(s):
    """Take an author string and return a list of ParsedAuthor."""
    # Pass 1: split into tokens on whitespace and commas that are not
    # inside braces; each comma becomes its own ',' token.
    items = []

    s = s.strip()
    while s:
        s = s.strip()
        bracelevel = 0
        for i, ch in enumerate(s):
            if ch == '{':
                bracelevel += 1
            elif ch == '}':
                bracelevel -= 1
            elif bracelevel <= 0 and ch in " \t\n,":
                break
        # NOTE(review): 'i' is also len(s)-1 when the loop breaks on the
        # final character, so a trailing comma stays attached to its token;
        # and a comma at index 0 (e.g. after "Last ,") yields an empty
        # token.  Behavior is kept as-is; confirm before changing.
        if i+1 == len(s):
            items.append(s)
        else:
            items.append(s[0:i])
        if s[i] == ',':
            items.append(',')
        s = s[i+1:]

    # Pass 2: group tokens into authors, separated by the word 'and'.
    authors = [[]]
    for item in items:
        if item == 'and':
            authors.append([])
        else:
            authors[-1].append(item)

    parsedAuthors = []
    # Split into first, von, last, jr
    for author in authors:
        commas = 0
        fvl = []
        vl = []
        f = []
        v = []
        l = []
        j = []
        cur = fvl
        for item in author:
            if item == ',':
                if commas == 0:
                    # Tokens before the first comma are "von Last".
                    vl = fvl
                    fvl = []
                    cur = f
                else:
                    # What was collected since the previous comma was the
                    # jr part; start collecting first names afresh.
                    j.extend(f)
                    cur = f = []
                commas += 1
            else:
                cur.append(item)

        if commas == 0:
            split_von(f, v, l, fvl)
        else:
            # NOTE(review): non-von tokens that split_von leaves in f_tmp
            # (all but the last capitalized token before the comma) are
            # discarded -- confirm this is intended.
            f_tmp = []
            split_von(f_tmp, v, l, vl)

        parsedAuthors.append(ParsedAuthor(f, v, l, j))

    return parsedAuthors