From 5af5043ac67529aa2ecc05c6a3bbc22a4419b9cb Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Thu, 4 Feb 2016 20:06:49 -0500 Subject: Split author --- entry.py | 287 +-------------------------------------------------------------- 1 file changed, 3 insertions(+), 284 deletions(-) (limited to 'entry.py') diff --git a/entry.py b/entry.py index 9846e32..4be2bc2 100644 --- a/entry.py +++ b/entry.py @@ -3,8 +3,9 @@ import sys import re import config import os -from utils import htmlize, txtize, url_untranslate, unTeXescapeURL, smartJoin,\ - _split +from utils import htmlize, url_untranslate, unTeXescapeURL, smartJoin,\ + _split, ALLCHARS, PRINTINGCHARS +from author import parseAuthor # Fields that we only care about for making web pages (BibTeX doesn't # recognize them.) @@ -20,80 +21,10 @@ def author_url(author): if pat.search(author): return url return None -ALLCHARS = "".join(map(chr,range(256))) -PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127))) -LC_CHARS = "abcdefghijklmnopqrstuvwxyz" -SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "@") -RE_ESCAPED = re.compile(r'\\.') PROCEEDINGS_RE = re.compile( r'((?:proceedings|workshop record) of(?: the)? )(.*)', re.I) -def split_von(f,v,l,x): - in_von = 0 - while x: - tt = t = x[0] - del x[0] - if tt[:2] == '{\\': - tt = tt.translate(ALLCHARS, SV_DELCHARS) - tt = RE_ESCAPED.sub("", tt) - tt = tt.translate(ALLCHARS, "{}") - if tt.translate(ALLCHARS, LC_CHARS) == "": - v.append(t) - in_von = 1 - elif in_von and f is not None: - l.append(t) - l.extend(x) - return - else: - f.append(t) - if not in_von: - l.append(f[-1]) - del f[-1] - -def buildAuthorTable(entries): - """Given a list of BibTeXEntry, return a map from parsed author name to - parsed canonical name. - """ - authorsByLast = {} - for e in entries: - for a in e.parsedAuthor: - authorsByLast.setdefault(tuple(a.last), []).append(a) - # map from author to collapsed author. - result = {} - for k,v in config.COLLAPSE_AUTHORS.items(): - a = parseAuthor(k)[0] - c = parseAuthor(v)[0] - result[c] = c - result[a] = c - - for e in entries: - for author in e.parsedAuthor: - if result.has_key(author): - continue - - c = author - for a in authorsByLast[tuple(author.last)]: - if a is author: - continue - c = c.collapsesTo(a) - result[author] = c - - if 0: - for a,c in result.items(): - if a != c: - print "Collapsing authors: %s => %s" % (a,c) - if 0: - print parseAuthor("Franz Kaashoek")[0].collapsesTo( - parseAuthor("M. Franz Kaashoek")[0]) - print parseAuthor("Paul F. Syverson")[0].collapsesTo( - parseAuthor("Paul Syverson")[0]) - print parseAuthor("Paul Syverson")[0].collapsesTo( - parseAuthor("Paul F. Syverson")[0]) - - return result # List of fields that appear when we display the entries as BibTeX. DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle', @@ -439,215 +370,3 @@ class BibTeXEntry: res.append("\n\n") return "".join(res) - - -class ParsedAuthor: - """The parsed name of an author. - - Eddie deserves credit for this incredibly hairy business. - """ - def __init__(self, first, von, last, jr): - self.first = first - self.von = von - self.last = last - self.jr = jr - self.collapsable = 1 - - self.html = htmlize(str(self)) - self.txt = txtize(str(self)) - - s = self.html - for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST: - if pat.search(s): - self.collapsable = 0 - break - - def __eq__(self, o): - return ((self.first == o.first) and - (self.last == o.last) and - (self.von == o.von) and - (self.jr == o.jr)) - - def __hash__(self): - return hash(repr(self)) - - def collapsesTo(self, o): - """Return true iff 'o' could be a more canonical version of this author - """ - if not self.collapsable or not o.collapsable: - return self - - if self.last != o.last or self.von != o.von or self.jr != o.jr: - return self - if not self.first: - return o - - if len(self.first) == len(o.first): - n = [] - for a,b in zip(self.first, o.first): - if a == b: - n.append(a) - elif len(a) == 2 and a[1] == '.' and a[0] == b[0]: - n.append(b) - elif len(b) == 2 and b[1] == '.' and a[0] == b[0]: - n.append(a) - else: - return self - if n == self.first: - return self - elif n == o.first: - return o - else: - return self - else: - realname = max([len(n) for n in self.first+o.first])>2 - if not realname: - return self - - if len(self.first) < len(o.first): - short = self.first; long = o.first - else: - short = o.first; long = self.first - - initials_s = "".join([n[0] for n in short]) - initials_l = "".join([n[0] for n in long]) - idx = initials_l.find(initials_s) - if idx < 0: - return self - n = long[:idx] - for i in range(idx, idx+len(short)): - a = long[i]; b = short[i-idx] - if a == b: - n.append(a) - elif len(a) == 2 and a[1] == '.' and a[0] == b[0]: - n.append(b) - elif len(b) == 2 and b[1] == '.' and a[0] == b[0]: - n.append(a) - else: - return self - n += long[idx+len(short):] - - if n == self.first: - return self - elif n == o.first: - return o - else: - return self - - def __repr__(self): - return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von, - self.last,self.jr) - def __str__(self): - a = " ".join(self.first+self.von+self.last) - if self.jr: - return "%s, %s" % (a,self.jr) - return a - - def getHomepage(self): - s = self.html - for pat, url in config.AUTHOR_RE_LIST: - if pat.search(s): - return url - return None - - def getSortingName(self): - """Return a representation of this author's name in von-last-first-jr - order, unless overridden by ALPH """ - s = self.html - for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST: - if pat.search(s): - return v - - return txtize(" ".join(self.von+self.last+self.first+self.jr)) - - def getSectionName(self): - """Return a HTML representation of this author's name in - last, first von, jr order""" - secname = " ".join(self.last) - more = self.first+self.von - if more: - secname += ", "+" ".join(more) - if self.jr: - secname += ", "+" ".join(self.jr) - secname = htmlize(secname) - return secname - - def htmlizeWithLink(self): - a = self.html - u = self.getHomepage() - if u: - return "%s"%(u,a) - else: - return a - - -def parseAuthor(s): - try: - return _parseAuthor(s) - except: - print >>sys.stderr, "Internal error while parsing author %r"%s - raise - -def _parseAuthor(s): - """Take an author string and return a list of ParsedAuthor.""" - items = [] - - s = s.strip() - while s: - s = s.strip() - bracelevel = 0 - for i in xrange(len(s)): - if s[i] == '{': - bracelevel += 1 - elif s[i] == '}': - bracelevel -= 1 - elif bracelevel <= 0 and s[i] in " \t\n,": - break - if i+1 == len(s): - items.append(s) - else: - items.append(s[0:i]) - if (s[i] == ','): - items.append(',') - s = s[i+1:] - - authors = [[]] - for item in items: - if item == 'and': - authors.append([]) - else: - authors[-1].append(item) - - parsedAuthors = [] - # Split into first, von, last, jr - for author in authors: - commas = 0 - fvl = [] - vl = [] - f = [] - v = [] - l = [] - j = [] - cur = fvl - for item in author: - if item == ',': - if commas == 0: - vl = fvl - fvl = [] - cur = f - else: - j.extend(f) - cur = f = [] - commas += 1 - else: - cur.append(item) - - if commas == 0: - split_von(f,v,l,fvl) - else: - f_tmp = [] - split_von(f_tmp,v,l,vl) - - parsedAuthors.append(ParsedAuthor(f,v,l,j)) - - return parsedAuthors -- cgit v1.2.3-70-g09d2