# Provenance: reconstructed from the patch of commit
# 5af5043ac67529aa2ecc05c6a3bbc22a4419b9cb ("Split author", Thibaut Horel,
# Thu, 4 Feb 2016 20:06:49 -0500), which created author.py (286 insertions).
# Patch rendered by cgit v1.2.3-70-g09d2.
"""Parsing and canonicalization of BibTeX author names.

An author string is split into tokens, grouped into individual authors on the
word 'and', and each author is divided into first / von / last / jr parts,
held by ParsedAuthor.  buildAuthorTable() maps every parsed author to the
fullest compatible spelling of the same name.

This module targets Python 2: split_von() relies on the two-argument form of
str.translate().
"""

import re
import sys

import config
from utils import htmlize, txtize, ALLCHARS, PRINTINGCHARS


LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
               "abcdefghijklmnopqrstuvwxyz"
               "@")
# A backslash followed by any single character.
RE_ESCAPED = re.compile(r'\\.')


def split_von(f, v, l, x):
    """Distribute the name tokens of 'x' over the lists 'f', 'v' and 'l'.

    'x' is consumed (tokens are deleted from it) and the other three lists
    are appended to in place; nothing is returned.

    A token counts as a "von" word when nothing is left of it after deleting
    the lowercase ASCII letters.  Tokens seen before the first von word go to
    'f', von words go to 'v', and the first non-von token after a von word
    goes to 'l' together with everything that follows it.  If no von word is
    found at all, the final token of 'f' is moved to 'l'.
    """
    # assumes ALLCHARS is the identity translation table, so that
    # s.translate(ALLCHARS, chars) only deletes 'chars' -- TODO confirm in utils
    in_von = 0
    while x:
        tt = t = x[0]
        del x[0]
        if tt[:2] == '{\\':
            # NOTE(review): for tokens starting with a braced escape, all
            # letters, '@', backslash escapes and braces are removed before
            # the test below, so such a token is classified as a von word
            # whenever nothing else remains -- confirm this is intended.
            tt = tt.translate(ALLCHARS, SV_DELCHARS)
            tt = RE_ESCAPED.sub("", tt)
            tt = tt.translate(ALLCHARS, "{}")
        if tt.translate(ALLCHARS, LC_CHARS) == "":
            v.append(t)
            in_von = 1
        elif in_von and f is not None:
            l.append(t)
            l.extend(x)
            return
        else:
            f.append(t)
    # Without a von word the last collected token is the last name.  The
    # 'and f' guard keeps an empty token list from raising IndexError.
    if not in_von and f:
        l.append(f[-1])
        del f[-1]


def buildAuthorTable(entries):
    """Given a list of BibTeXEntry, return a map from parsed author name to
       parsed canonical name.

       Each entry must provide a 'parsedAuthor' sequence of ParsedAuthor.
       The pairs in config.COLLAPSE_AUTHORS are applied first; every other
       author is folded, via ParsedAuthor.collapsesTo(), against all other
       authors sharing the same last name.
    """
    authorsByLast = {}
    for e in entries:
        for a in e.parsedAuthor:
            authorsByLast.setdefault(tuple(a.last), []).append(a)
    # map from author to collapsed author.
    result = {}
    for k, v in config.COLLAPSE_AUTHORS.items():
        a = parseAuthor(k)[0]
        c = parseAuthor(v)[0]
        result[c] = c
        result[a] = c

    for e in entries:
        for author in e.parsedAuthor:
            if author in result:
                continue

            c = author
            for a in authorsByLast[tuple(author.last)]:
                if a is author:
                    continue
                c = c.collapsesTo(a)
            result[author] = c

    # Disabled debugging aids; flip the 0 to 1 to enable them by hand.
    if 0:
        for a, c in result.items():
            if a != c:
                sys.stdout.write("Collapsing authors: %s => %s\n" % (a, c))
    if 0:
        sys.stdout.write("%s\n" % parseAuthor("Franz Kaashoek")[0].collapsesTo(
            parseAuthor("M. Franz Kaashoek")[0]))
        sys.stdout.write("%s\n" % parseAuthor("Paul F. Syverson")[0].collapsesTo(
            parseAuthor("Paul Syverson")[0]))
        sys.stdout.write("%s\n" % parseAuthor("Paul Syverson")[0].collapsesTo(
            parseAuthor("Paul F. Syverson")[0]))

    return result


def _mergeGivenName(a, b):
    """Merge two given-name tokens.

    Return the token itself when both are equal, the longer token when the
    other one is its initial followed by '.', and None when they conflict.
    """
    if a == b:
        return a
    if len(a) == 2 and a[1] == '.' and a[0] == b[0]:
        return b
    if len(b) == 2 and b[1] == '.' and a[0] == b[0]:
        return a
    return None


class ParsedAuthor:
    """The parsed name of an author.

    'first', 'von', 'last' and 'jr' are lists of name tokens.  'html' and
    'txt' hold the full name as rendered by htmlize() and txtize(), and
    'collapsable' is 0 when the name matches one of the patterns in
    config.NO_COLLAPSE_AUTHORS_RE_LIST.

    Eddie deserves credit for this incredibly hairy business.
    """
    def __init__(self, first, von, last, jr):
        self.first = first
        self.von = von
        self.last = last
        self.jr = jr
        self.collapsable = 1

        self.html = htmlize(str(self))
        self.txt = txtize(str(self))

        s = self.html
        for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
            if pat.search(s):
                self.collapsable = 0
                break

    def __eq__(self, o):
        # Comparing against something that is not an author must not raise.
        if not isinstance(o, ParsedAuthor):
            return NotImplemented
        return ((self.first == o.first) and
                (self.last == o.last) and
                (self.von == o.von) and
                (self.jr == o.jr))

    def __ne__(self, o):
        # Python 2 does not derive != from ==, so define it explicitly.
        equal = self.__eq__(o)
        if equal is NotImplemented:
            return equal
        return not equal

    def __hash__(self):
        # repr() covers exactly the four fields compared by __eq__.
        return hash(repr(self))

    def collapsesTo(self, o):
        """Return 'o' if it is a more canonical version of this author,
           and this author itself otherwise.

           'o' wins only when both authors are collapsable, their von, last
           and jr parts are equal, and merging the given names (expanding
           initials such as 'P.' to the matching full name) yields exactly
           the given names of 'o'.
        """
        if not self.collapsable or not o.collapsable:
            return self

        if self.last != o.last or self.von != o.von or self.jr != o.jr:
            return self
        if not self.first:
            return o

        if len(self.first) == len(o.first):
            n = []
            for a, b in zip(self.first, o.first):
                merged = _mergeGivenName(a, b)
                if merged is None:
                    return self
                n.append(merged)
        else:
            # Only line up name lists of different length when at least one
            # token is longer than a bare initial.
            realname = max([len(name) for name in self.first + o.first]) > 2
            if not realname:
                return self

            if len(self.first) < len(o.first):
                shorter = self.first
                longer = o.first
            else:
                shorter = o.first
                longer = self.first

            # Locate the shorter list inside the longer one by initials.
            initials_s = "".join([name[0] for name in shorter])
            initials_l = "".join([name[0] for name in longer])
            idx = initials_l.find(initials_s)
            if idx < 0:
                return self
            n = longer[:idx]
            for i in range(idx, idx + len(shorter)):
                merged = _mergeGivenName(longer[i], shorter[i - idx])
                if merged is None:
                    return self
                n.append(merged)
            n += longer[idx + len(shorter):]

        if n != self.first and n == o.first:
            return o
        return self

    def __repr__(self):
        return "ParsedAuthor(%r,%r,%r,%r)" % (self.first, self.von,
                                              self.last, self.jr)

    def __str__(self):
        a = " ".join(self.first + self.von + self.last)
        if self.jr:
            # 'jr' is a list of tokens; join it instead of formatting the
            # list itself, which would render as "['Jr.']".
            return "%s, %s" % (a, " ".join(self.jr))
        return a

    def getHomepage(self):
        """Return the URL paired with the first pattern of
           config.AUTHOR_RE_LIST that matches this author's HTML name,
           or None if no pattern matches."""
        s = self.html
        for pat, url in config.AUTHOR_RE_LIST:
            if pat.search(s):
                return url
        return None

    def getSortingName(self):
        """Return a representation of this author's name in von-last-first-jr
           order, unless overridden by a matching pattern in
           config.ALPHABETIZE_AUTHOR_AS_RE_LIST."""
        s = self.html
        for pat, v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
            if pat.search(s):
                return v

        return txtize(" ".join(self.von + self.last + self.first + self.jr))

    def getSectionName(self):
        """Return a HTML representation of this author's name in
           last, first von, jr order"""
        secname = " ".join(self.last)
        more = self.first + self.von
        if more:
            secname += ", " + " ".join(more)
        if self.jr:
            secname += ", " + " ".join(self.jr)
        secname = htmlize(secname)
        return secname

    def htmlizeWithLink(self):
        """Return this author's HTML name, wrapped in a link to the author's
           homepage when getHomepage() knows one."""
        a = self.html
        u = self.getHomepage()
        if u:
            # Two values need two placeholders; a lone "%s" raises TypeError.
            return "<a href='%s'>%s</a>" % (u, a)
        else:
            return a


def parseAuthor(s):
    """Take an author string and return a list of ParsedAuthor.

    Any exception raised while parsing is reported on stderr and re-raised.
    """
    try:
        return _parseAuthor(s)
    except Exception:
        sys.stderr.write("Internal error while parsing author %r\n" % (s,))
        raise


def _tokenizeAuthor(s):
    """Split an author string into tokens.

    Tokens are separated by spaces, tabs, newlines and commas that occur
    outside of braces.  Every separating comma is kept as a ',' token of its
    own; empty tokens are never produced.
    """
    items = []
    s = s.strip()
    while s:
        bracelevel = 0
        cut = None
        for i, ch in enumerate(s):
            if ch == '{':
                bracelevel += 1
            elif ch == '}':
                bracelevel -= 1
            elif bracelevel <= 0 and ch in " \t\n,":
                cut = i
                break
        if cut is None:
            # No separator left: the remainder is the final token.
            items.append(s)
            break
        if cut:
            items.append(s[:cut])
        if s[cut] == ',':
            items.append(',')
        s = s[cut + 1:].strip()
    return items


def _parseAuthor(s):
    """Take an author string and return a list of ParsedAuthor."""
    items = _tokenizeAuthor(s)

    # Group the tokens into authors, which are separated by the word 'and'.
    authors = [[]]
    for item in items:
        if item == 'and':
            authors.append([])
        else:
            authors[-1].append(item)

    parsedAuthors = []
    # Split into first, von, last, jr
    for author in authors:
        if not author:
            # Nothing between two 'and's (or an empty string): no author.
            continue
        commas = 0
        fvl = []
        vl = []
        f = []
        v = []
        l = []
        j = []
        cur = fvl
        for item in author:
            if item == ',':
                if commas == 0:
                    # "von Last, ..." form: what came so far is von + last.
                    vl = fvl
                    fvl = []
                    cur = f
                else:
                    # A further comma: the previous group was the jr part.
                    j.extend(f)
                    cur = f = []
                commas += 1
            else:
                cur.append(item)

        if commas == 0:
            split_von(f, v, l, fvl)
        else:
            lead = []
            split_von(lead, v, l, vl)
            # Tokens in front of the comma that split_von() put aside as
            # "first" names belong to the surname part.  Keep them instead
            # of dropping them: ahead of the von words when there are any,
            # otherwise as leading words of a multi-word last name.
            if lead:
                if v:
                    v[:0] = lead
                else:
                    l[:0] = lead

        parsedAuthors.append(ParsedAuthor(f, v, l, j))

    return parsedAuthors