From 5af5043ac67529aa2ecc05c6a3bbc22a4419b9cb Mon Sep 17 00:00:00 2001 From: Thibaut Horel Date: Thu, 4 Feb 2016 20:06:49 -0500 Subject: Split author --- BibTeX.py | 4 - author.py | 286 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ entry.py | 287 +---------------------------------------------------------- sortutils.py | 2 +- utils.py | 1 + writeHTML.py | 1 - 6 files changed, 291 insertions(+), 290 deletions(-) create mode 100644 author.py diff --git a/BibTeX.py b/BibTeX.py index 85228a1..6831929 100644 --- a/BibTeX.py +++ b/BibTeX.py @@ -80,9 +80,6 @@ class BibTeX: self.entries = newEntries - - - class FileIter: def __init__(self, fname=None, file=None, it=None, string=None): if fname: @@ -392,4 +389,3 @@ if __name__ == '__main__': for e in r.entries: if e.type in ("proceedings", "journal"): continue print e.to_html() - diff --git a/author.py b/author.py new file mode 100644 index 0000000..44319e7 --- /dev/null +++ b/author.py @@ -0,0 +1,286 @@ +import sys +import config +import re +from utils import htmlize, txtize, ALLCHARS, PRINTINGCHARS + + +LC_CHARS = "abcdefghijklmnopqrstuvwxyz" +SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "@") +RE_ESCAPED = re.compile(r'\\.') + +def split_von(f,v,l,x): + in_von = 0 + while x: + tt = t = x[0] + del x[0] + if tt[:2] == '{\\': + tt = tt.translate(ALLCHARS, SV_DELCHARS) + tt = RE_ESCAPED.sub("", tt) + tt = tt.translate(ALLCHARS, "{}") + if tt.translate(ALLCHARS, LC_CHARS) == "": + v.append(t) + in_von = 1 + elif in_von and f is not None: + l.append(t) + l.extend(x) + return + else: + f.append(t) + if not in_von: + l.append(f[-1]) + del f[-1] + +def buildAuthorTable(entries): + """Given a list of BibTeXEntry, return a map from parsed author name to + parsed canonical name. + """ + authorsByLast = {} + for e in entries: + for a in e.parsedAuthor: + authorsByLast.setdefault(tuple(a.last), []).append(a) + # map from author to collapsed author. + result = {} + for k,v in config.COLLAPSE_AUTHORS.items(): + a = parseAuthor(k)[0] + c = parseAuthor(v)[0] + result[c] = c + result[a] = c + + for e in entries: + for author in e.parsedAuthor: + if result.has_key(author): + continue + + c = author + for a in authorsByLast[tuple(author.last)]: + if a is author: + continue + c = c.collapsesTo(a) + result[author] = c + + if 0: + for a,c in result.items(): + if a != c: + print "Collapsing authors: %s => %s" % (a,c) + if 0: + print parseAuthor("Franz Kaashoek")[0].collapsesTo( + parseAuthor("M. Franz Kaashoek")[0]) + print parseAuthor("Paul F. Syverson")[0].collapsesTo( + parseAuthor("Paul Syverson")[0]) + print parseAuthor("Paul Syverson")[0].collapsesTo( + parseAuthor("Paul F. Syverson")[0]) + + return result + +class ParsedAuthor: + """The parsed name of an author. + + Eddie deserves credit for this incredibly hairy business. + """ + def __init__(self, first, von, last, jr): + self.first = first + self.von = von + self.last = last + self.jr = jr + self.collapsable = 1 + + self.html = htmlize(str(self)) + self.txt = txtize(str(self)) + + s = self.html + for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST: + if pat.search(s): + self.collapsable = 0 + break + + def __eq__(self, o): + return ((self.first == o.first) and + (self.last == o.last) and + (self.von == o.von) and + (self.jr == o.jr)) + + def __hash__(self): + return hash(repr(self)) + + def collapsesTo(self, o): + """Return true iff 'o' could be a more canonical version of this author + """ + if not self.collapsable or not o.collapsable: + return self + + if self.last != o.last or self.von != o.von or self.jr != o.jr: + return self + if not self.first: + return o + + if len(self.first) == len(o.first): + n = [] + for a,b in zip(self.first, o.first): + if a == b: + n.append(a) + elif len(a) == 2 and a[1] == '.' and a[0] == b[0]: + n.append(b) + elif len(b) == 2 and b[1] == '.' and a[0] == b[0]: + n.append(a) + else: + return self + if n == self.first: + return self + elif n == o.first: + return o + else: + return self + else: + realname = max([len(n) for n in self.first+o.first])>2 + if not realname: + return self + + if len(self.first) < len(o.first): + short = self.first; long = o.first + else: + short = o.first; long = self.first + + initials_s = "".join([n[0] for n in short]) + initials_l = "".join([n[0] for n in long]) + idx = initials_l.find(initials_s) + if idx < 0: + return self + n = long[:idx] + for i in range(idx, idx+len(short)): + a = long[i]; b = short[i-idx] + if a == b: + n.append(a) + elif len(a) == 2 and a[1] == '.' and a[0] == b[0]: + n.append(b) + elif len(b) == 2 and b[1] == '.' and a[0] == b[0]: + n.append(a) + else: + return self + n += long[idx+len(short):] + + if n == self.first: + return self + elif n == o.first: + return o + else: + return self + + def __repr__(self): + return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von, + self.last,self.jr) + def __str__(self): + a = " ".join(self.first+self.von+self.last) + if self.jr: + return "%s, %s" % (a,self.jr) + return a + + def getHomepage(self): + s = self.html + for pat, url in config.AUTHOR_RE_LIST: + if pat.search(s): + return url + return None + + def getSortingName(self): + """Return a representation of this author's name in von-last-first-jr + order, unless overridden by ALPH """ + s = self.html + for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST: + if pat.search(s): + return v + + return txtize(" ".join(self.von+self.last+self.first+self.jr)) + + def getSectionName(self): + """Return a HTML representation of this author's name in + last, first von, jr order""" + secname = " ".join(self.last) + more = self.first+self.von + if more: + secname += ", "+" ".join(more) + if self.jr: + secname += ", "+" ".join(self.jr) + secname = htmlize(secname) + return secname + + def htmlizeWithLink(self): + a = self.html + u = self.getHomepage() + if u: + return "%s"%(u,a) + else: + return a + + +def parseAuthor(s): + try: + return _parseAuthor(s) + except: + print >>sys.stderr, "Internal error while parsing author %r"%s + raise + +def _parseAuthor(s): + """Take an author string and return a list of ParsedAuthor.""" + items = [] + + s = s.strip() + while s: + s = s.strip() + bracelevel = 0 + for i in xrange(len(s)): + if s[i] == '{': + bracelevel += 1 + elif s[i] == '}': + bracelevel -= 1 + elif bracelevel <= 0 and s[i] in " \t\n,": + break + if i+1 == len(s): + items.append(s) + else: + items.append(s[0:i]) + if (s[i] == ','): + items.append(',') + s = s[i+1:] + + authors = [[]] + for item in items: + if item == 'and': + authors.append([]) + else: + authors[-1].append(item) + + parsedAuthors = [] + # Split into first, von, last, jr + for author in authors: + commas = 0 + fvl = [] + vl = [] + f = [] + v = [] + l = [] + j = [] + cur = fvl + for item in author: + if item == ',': + if commas == 0: + vl = fvl + fvl = [] + cur = f + else: + j.extend(f) + cur = f = [] + commas += 1 + else: + cur.append(item) + + if commas == 0: + split_von(f,v,l,fvl) + else: + f_tmp = [] + split_von(f_tmp,v,l,vl) + + parsedAuthors.append(ParsedAuthor(f,v,l,j)) + + return parsedAuthors diff --git a/entry.py b/entry.py index 9846e32..4be2bc2 100644 --- a/entry.py +++ b/entry.py @@ -3,8 +3,9 @@ import sys import re import config import os -from utils import htmlize, txtize, url_untranslate, unTeXescapeURL, smartJoin,\ - _split +from utils import htmlize, url_untranslate, unTeXescapeURL, smartJoin,\ + _split, ALLCHARS, PRINTINGCHARS +from author import parseAuthor # Fields that we only care about for making web pages (BibTeX doesn't # recognize them.) @@ -20,80 +21,10 @@ def author_url(author): if pat.search(author): return url return None -ALLCHARS = "".join(map(chr,range(256))) -PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127))) -LC_CHARS = "abcdefghijklmnopqrstuvwxyz" -SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "@") -RE_ESCAPED = re.compile(r'\\.') PROCEEDINGS_RE = re.compile( r'((?:proceedings|workshop record) of(?: the)? )(.*)', re.I) -def split_von(f,v,l,x): - in_von = 0 - while x: - tt = t = x[0] - del x[0] - if tt[:2] == '{\\': - tt = tt.translate(ALLCHARS, SV_DELCHARS) - tt = RE_ESCAPED.sub("", tt) - tt = tt.translate(ALLCHARS, "{}") - if tt.translate(ALLCHARS, LC_CHARS) == "": - v.append(t) - in_von = 1 - elif in_von and f is not None: - l.append(t) - l.extend(x) - return - else: - f.append(t) - if not in_von: - l.append(f[-1]) - del f[-1] - -def buildAuthorTable(entries): - """Given a list of BibTeXEntry, return a map from parsed author name to - parsed canonical name. - """ - authorsByLast = {} - for e in entries: - for a in e.parsedAuthor: - authorsByLast.setdefault(tuple(a.last), []).append(a) - # map from author to collapsed author. - result = {} - for k,v in config.COLLAPSE_AUTHORS.items(): - a = parseAuthor(k)[0] - c = parseAuthor(v)[0] - result[c] = c - result[a] = c - - for e in entries: - for author in e.parsedAuthor: - if result.has_key(author): - continue - - c = author - for a in authorsByLast[tuple(author.last)]: - if a is author: - continue - c = c.collapsesTo(a) - result[author] = c - - if 0: - for a,c in result.items(): - if a != c: - print "Collapsing authors: %s => %s" % (a,c) - if 0: - print parseAuthor("Franz Kaashoek")[0].collapsesTo( - parseAuthor("M. Franz Kaashoek")[0]) - print parseAuthor("Paul F. Syverson")[0].collapsesTo( - parseAuthor("Paul Syverson")[0]) - print parseAuthor("Paul Syverson")[0].collapsesTo( - parseAuthor("Paul F. Syverson")[0]) - - return result # List of fields that appear when we display the entries as BibTeX. DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle', @@ -439,215 +370,3 @@ class BibTeXEntry: res.append("\n\n") return "".join(res) - - -class ParsedAuthor: - """The parsed name of an author. - - Eddie deserves credit for this incredibly hairy business. - """ - def __init__(self, first, von, last, jr): - self.first = first - self.von = von - self.last = last - self.jr = jr - self.collapsable = 1 - - self.html = htmlize(str(self)) - self.txt = txtize(str(self)) - - s = self.html - for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST: - if pat.search(s): - self.collapsable = 0 - break - - def __eq__(self, o): - return ((self.first == o.first) and - (self.last == o.last) and - (self.von == o.von) and - (self.jr == o.jr)) - - def __hash__(self): - return hash(repr(self)) - - def collapsesTo(self, o): - """Return true iff 'o' could be a more canonical version of this author - """ - if not self.collapsable or not o.collapsable: - return self - - if self.last != o.last or self.von != o.von or self.jr != o.jr: - return self - if not self.first: - return o - - if len(self.first) == len(o.first): - n = [] - for a,b in zip(self.first, o.first): - if a == b: - n.append(a) - elif len(a) == 2 and a[1] == '.' and a[0] == b[0]: - n.append(b) - elif len(b) == 2 and b[1] == '.' and a[0] == b[0]: - n.append(a) - else: - return self - if n == self.first: - return self - elif n == o.first: - return o - else: - return self - else: - realname = max([len(n) for n in self.first+o.first])>2 - if not realname: - return self - - if len(self.first) < len(o.first): - short = self.first; long = o.first - else: - short = o.first; long = self.first - - initials_s = "".join([n[0] for n in short]) - initials_l = "".join([n[0] for n in long]) - idx = initials_l.find(initials_s) - if idx < 0: - return self - n = long[:idx] - for i in range(idx, idx+len(short)): - a = long[i]; b = short[i-idx] - if a == b: - n.append(a) - elif len(a) == 2 and a[1] == '.' and a[0] == b[0]: - n.append(b) - elif len(b) == 2 and b[1] == '.' and a[0] == b[0]: - n.append(a) - else: - return self - n += long[idx+len(short):] - - if n == self.first: - return self - elif n == o.first: - return o - else: - return self - - def __repr__(self): - return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von, - self.last,self.jr) - def __str__(self): - a = " ".join(self.first+self.von+self.last) - if self.jr: - return "%s, %s" % (a,self.jr) - return a - - def getHomepage(self): - s = self.html - for pat, url in config.AUTHOR_RE_LIST: - if pat.search(s): - return url - return None - - def getSortingName(self): - """Return a representation of this author's name in von-last-first-jr - order, unless overridden by ALPH """ - s = self.html - for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST: - if pat.search(s): - return v - - return txtize(" ".join(self.von+self.last+self.first+self.jr)) - - def getSectionName(self): - """Return a HTML representation of this author's name in - last, first von, jr order""" - secname = " ".join(self.last) - more = self.first+self.von - if more: - secname += ", "+" ".join(more) - if self.jr: - secname += ", "+" ".join(self.jr) - secname = htmlize(secname) - return secname - - def htmlizeWithLink(self): - a = self.html - u = self.getHomepage() - if u: - return "%s"%(u,a) - else: - return a - - -def parseAuthor(s): - try: - return _parseAuthor(s) - except: - print >>sys.stderr, "Internal error while parsing author %r"%s - raise - -def _parseAuthor(s): - """Take an author string and return a list of ParsedAuthor.""" - items = [] - - s = s.strip() - while s: - s = s.strip() - bracelevel = 0 - for i in xrange(len(s)): - if s[i] == '{': - bracelevel += 1 - elif s[i] == '}': - bracelevel -= 1 - elif bracelevel <= 0 and s[i] in " \t\n,": - break - if i+1 == len(s): - items.append(s) - else: - items.append(s[0:i]) - if (s[i] == ','): - items.append(',') - s = s[i+1:] - - authors = [[]] - for item in items: - if item == 'and': - authors.append([]) - else: - authors[-1].append(item) - - parsedAuthors = [] - # Split into first, von, last, jr - for author in authors: - commas = 0 - fvl = [] - vl = [] - f = [] - v = [] - l = [] - j = [] - cur = fvl - for item in author: - if item == ',': - if commas == 0: - vl = fvl - fvl = [] - cur = f - else: - j.extend(f) - cur = f = [] - commas += 1 - else: - cur.append(item) - - if commas == 0: - split_von(f,v,l,fvl) - else: - f_tmp = [] - split_von(f_tmp,v,l,vl) - - parsedAuthors.append(ParsedAuthor(f,v,l,j)) - - return parsedAuthors diff --git a/sortutils.py b/sortutils.py index 419fe03..d86a299 100644 --- a/sortutils.py +++ b/sortutils.py @@ -1,7 +1,7 @@ import config import copy from utils import txtize -from entry import buildAuthorTable +from author import buildAuthorTable import re # List: must map from month number to month name. diff --git a/utils.py b/utils.py index 4d4b583..e62c446 100644 --- a/utils.py +++ b/utils.py @@ -1,6 +1,7 @@ import re import os +PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127))) ALLCHARS = "".join(map(chr,range(256))) RE_LONE_AMP = re.compile(r'&([^a-z0-9])') RE_LONE_I = re.compile(r'\\i([^a-z0-9])') diff --git a/writeHTML.py b/writeHTML.py index d4e11a0..9e7ddd7 100755 --- a/writeHTML.py +++ b/writeHTML.py @@ -15,7 +15,6 @@ import BibTeX from sortutils import sortEntriesBy, splitSortedEntriesBy, sortEntriesByDate,\ splitEntriesByAuthor from utils import smartJoin, url_untranslate -from entry import buildAuthorTable import config def getTemplate(name): -- cgit v1.2.3-70-g09d2