about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Thibaut Horel <thibaut.horel@gmail.com> 2016-02-04 20:06:49 -0500
committer Thibaut Horel <thibaut.horel@gmail.com> 2016-02-04 20:06:49 -0500
commit 5af5043ac67529aa2ecc05c6a3bbc22a4419b9cb (patch)
tree 6d73e11141cf2ffbec11561e44d5c60f0dc75131
parent da7359cd452f2ded9e05e753fb125508343b8587 (diff)
download anonbib-5af5043ac67529aa2ecc05c6a3bbc22a4419b9cb.tar.gz
Split author
-rw-r--r-- BibTeX.py 4
-rw-r--r-- author.py 286
-rw-r--r-- entry.py 287
-rw-r--r-- sortutils.py 2
-rw-r--r-- utils.py 1
-rwxr-xr-x writeHTML.py 1
6 files changed, 291 insertions, 290 deletions
diff --git a/BibTeX.py b/BibTeX.py
index 85228a1..6831929 100644
--- a/BibTeX.py
+++ b/BibTeX.py
@@ -80,9 +80,6 @@ class BibTeX:
self.entries = newEntries
-
-
-
class FileIter:
def __init__(self, fname=None, file=None, it=None, string=None):
if fname:
@@ -392,4 +389,3 @@ if __name__ == '__main__':
for e in r.entries:
if e.type in ("proceedings", "journal"): continue
print e.to_html()
-
diff --git a/author.py b/author.py
new file mode 100644
index 0000000..44319e7
--- /dev/null
+++ b/author.py
@@ -0,0 +1,286 @@
+import sys
+import config
+import re
+from utils import htmlize, txtize, ALLCHARS, PRINTINGCHARS
+
+
+LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
+SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "@")
+RE_ESCAPED = re.compile(r'\\.')
+
+def split_von(f,v,l,x):
+ in_von = 0
+ while x:
+ tt = t = x[0]
+ del x[0]
+ if tt[:2] == '{\\':
+ tt = tt.translate(ALLCHARS, SV_DELCHARS)
+ tt = RE_ESCAPED.sub("", tt)
+ tt = tt.translate(ALLCHARS, "{}")
+ if tt.translate(ALLCHARS, LC_CHARS) == "":
+ v.append(t)
+ in_von = 1
+ elif in_von and f is not None:
+ l.append(t)
+ l.extend(x)
+ return
+ else:
+ f.append(t)
+ if not in_von:
+ l.append(f[-1])
+ del f[-1]
+
+def buildAuthorTable(entries):
+ """Given a list of BibTeXEntry, return a map from parsed author name to
+ parsed canonical name.
+ """
+ authorsByLast = {}
+ for e in entries:
+ for a in e.parsedAuthor:
+ authorsByLast.setdefault(tuple(a.last), []).append(a)
+ # map from author to collapsed author.
+ result = {}
+ for k,v in config.COLLAPSE_AUTHORS.items():
+ a = parseAuthor(k)[0]
+ c = parseAuthor(v)[0]
+ result[c] = c
+ result[a] = c
+
+ for e in entries:
+ for author in e.parsedAuthor:
+ if result.has_key(author):
+ continue
+
+ c = author
+ for a in authorsByLast[tuple(author.last)]:
+ if a is author:
+ continue
+ c = c.collapsesTo(a)
+ result[author] = c
+
+ if 0:
+ for a,c in result.items():
+ if a != c:
+ print "Collapsing authors: %s => %s" % (a,c)
+ if 0:
+ print parseAuthor("Franz Kaashoek")[0].collapsesTo(
+ parseAuthor("M. Franz Kaashoek")[0])
+ print parseAuthor("Paul F. Syverson")[0].collapsesTo(
+ parseAuthor("Paul Syverson")[0])
+ print parseAuthor("Paul Syverson")[0].collapsesTo(
+ parseAuthor("Paul F. Syverson")[0])
+
+ return result
+
+class ParsedAuthor:
+ """The parsed name of an author.
+
+ Eddie deserves credit for this incredibly hairy business.
+ """
+ def __init__(self, first, von, last, jr):
+ self.first = first
+ self.von = von
+ self.last = last
+ self.jr = jr
+ self.collapsable = 1
+
+ self.html = htmlize(str(self))
+ self.txt = txtize(str(self))
+
+ s = self.html
+ for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
+ if pat.search(s):
+ self.collapsable = 0
+ break
+
+ def __eq__(self, o):
+ return ((self.first == o.first) and
+ (self.last == o.last) and
+ (self.von == o.von) and
+ (self.jr == o.jr))
+
+ def __hash__(self):
+ return hash(repr(self))
+
+ def collapsesTo(self, o):
+ """Return true iff 'o' could be a more canonical version of this author
+ """
+ if not self.collapsable or not o.collapsable:
+ return self
+
+ if self.last != o.last or self.von != o.von or self.jr != o.jr:
+ return self
+ if not self.first:
+ return o
+
+ if len(self.first) == len(o.first):
+ n = []
+ for a,b in zip(self.first, o.first):
+ if a == b:
+ n.append(a)
+ elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+ n.append(b)
+ elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+ n.append(a)
+ else:
+ return self
+ if n == self.first:
+ return self
+ elif n == o.first:
+ return o
+ else:
+ return self
+ else:
+ realname = max([len(n) for n in self.first+o.first])>2
+ if not realname:
+ return self
+
+ if len(self.first) < len(o.first):
+ short = self.first; long = o.first
+ else:
+ short = o.first; long = self.first
+
+ initials_s = "".join([n[0] for n in short])
+ initials_l = "".join([n[0] for n in long])
+ idx = initials_l.find(initials_s)
+ if idx < 0:
+ return self
+ n = long[:idx]
+ for i in range(idx, idx+len(short)):
+ a = long[i]; b = short[i-idx]
+ if a == b:
+ n.append(a)
+ elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+ n.append(b)
+ elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+ n.append(a)
+ else:
+ return self
+ n += long[idx+len(short):]
+
+ if n == self.first:
+ return self
+ elif n == o.first:
+ return o
+ else:
+ return self
+
+ def __repr__(self):
+ return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
+ self.last,self.jr)
+ def __str__(self):
+ a = " ".join(self.first+self.von+self.last)
+ if self.jr:
+ return "%s, %s" % (a,self.jr)
+ return a
+
+ def getHomepage(self):
+ s = self.html
+ for pat, url in config.AUTHOR_RE_LIST:
+ if pat.search(s):
+ return url
+ return None
+
+ def getSortingName(self):
+ """Return a representation of this author's name in von-last-first-jr
+ order, unless overridden by ALPH """
+ s = self.html
+ for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
+ if pat.search(s):
+ return v
+
+ return txtize(" ".join(self.von+self.last+self.first+self.jr))
+
+ def getSectionName(self):
+ """Return a HTML representation of this author's name in
+ last, first von, jr order"""
+ secname = " ".join(self.last)
+ more = self.first+self.von
+ if more:
+ secname += ", "+" ".join(more)
+ if self.jr:
+ secname += ", "+" ".join(self.jr)
+ secname = htmlize(secname)
+ return secname
+
+ def htmlizeWithLink(self):
+ a = self.html
+ u = self.getHomepage()
+ if u:
+ return "<a href='%s'>%s</a>"%(u,a)
+ else:
+ return a
+
+
+def parseAuthor(s):
+ try:
+ return _parseAuthor(s)
+ except:
+ print >>sys.stderr, "Internal error while parsing author %r"%s
+ raise
+
+def _parseAuthor(s):
+ """Take an author string and return a list of ParsedAuthor."""
+ items = []
+
+ s = s.strip()
+ while s:
+ s = s.strip()
+ bracelevel = 0
+ for i in xrange(len(s)):
+ if s[i] == '{':
+ bracelevel += 1
+ elif s[i] == '}':
+ bracelevel -= 1
+ elif bracelevel <= 0 and s[i] in " \t\n,":
+ break
+ if i+1 == len(s):
+ items.append(s)
+ else:
+ items.append(s[0:i])
+ if (s[i] == ','):
+ items.append(',')
+ s = s[i+1:]
+
+ authors = [[]]
+ for item in items:
+ if item == 'and':
+ authors.append([])
+ else:
+ authors[-1].append(item)
+
+ parsedAuthors = []
+ # Split into first, von, last, jr
+ for author in authors:
+ commas = 0
+ fvl = []
+ vl = []
+ f = []
+ v = []
+ l = []
+ j = []
+ cur = fvl
+ for item in author:
+ if item == ',':
+ if commas == 0:
+ vl = fvl
+ fvl = []
+ cur = f
+ else:
+ j.extend(f)
+ cur = f = []
+ commas += 1
+ else:
+ cur.append(item)
+
+ if commas == 0:
+ split_von(f,v,l,fvl)
+ else:
+ f_tmp = []
+ split_von(f_tmp,v,l,vl)
+
+ parsedAuthors.append(ParsedAuthor(f,v,l,j))
+
+ return parsedAuthors
diff --git a/entry.py b/entry.py
index 9846e32..4be2bc2 100644
--- a/entry.py
+++ b/entry.py
@@ -3,8 +3,9 @@ import sys
import re
import config
import os
-from utils import htmlize, txtize, url_untranslate, unTeXescapeURL, smartJoin,\
- _split
+from utils import htmlize, url_untranslate, unTeXescapeURL, smartJoin,\
+ _split, ALLCHARS, PRINTINGCHARS
+from author import parseAuthor
# Fields that we only care about for making web pages (BibTeX doesn't
# recognize them.)
@@ -20,80 +21,10 @@ def author_url(author):
if pat.search(author):
return url
return None
-ALLCHARS = "".join(map(chr,range(256)))
-PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127)))
-LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
-SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
- "abcdefghijklmnopqrstuvwxyz"
- "@")
-RE_ESCAPED = re.compile(r'\\.')
PROCEEDINGS_RE = re.compile(
r'((?:proceedings|workshop record) of(?: the)? )(.*)',
re.I)
-def split_von(f,v,l,x):
- in_von = 0
- while x:
- tt = t = x[0]
- del x[0]
- if tt[:2] == '{\\':
- tt = tt.translate(ALLCHARS, SV_DELCHARS)
- tt = RE_ESCAPED.sub("", tt)
- tt = tt.translate(ALLCHARS, "{}")
- if tt.translate(ALLCHARS, LC_CHARS) == "":
- v.append(t)
- in_von = 1
- elif in_von and f is not None:
- l.append(t)
- l.extend(x)
- return
- else:
- f.append(t)
- if not in_von:
- l.append(f[-1])
- del f[-1]
-
-def buildAuthorTable(entries):
- """Given a list of BibTeXEntry, return a map from parsed author name to
- parsed canonical name.
- """
- authorsByLast = {}
- for e in entries:
- for a in e.parsedAuthor:
- authorsByLast.setdefault(tuple(a.last), []).append(a)
- # map from author to collapsed author.
- result = {}
- for k,v in config.COLLAPSE_AUTHORS.items():
- a = parseAuthor(k)[0]
- c = parseAuthor(v)[0]
- result[c] = c
- result[a] = c
-
- for e in entries:
- for author in e.parsedAuthor:
- if result.has_key(author):
- continue
-
- c = author
- for a in authorsByLast[tuple(author.last)]:
- if a is author:
- continue
- c = c.collapsesTo(a)
- result[author] = c
-
- if 0:
- for a,c in result.items():
- if a != c:
- print "Collapsing authors: %s => %s" % (a,c)
- if 0:
- print parseAuthor("Franz Kaashoek")[0].collapsesTo(
- parseAuthor("M. Franz Kaashoek")[0])
- print parseAuthor("Paul F. Syverson")[0].collapsesTo(
- parseAuthor("Paul Syverson")[0])
- print parseAuthor("Paul Syverson")[0].collapsesTo(
- parseAuthor("Paul F. Syverson")[0])
-
- return result
# List of fields that appear when we display the entries as BibTeX.
DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle',
@@ -439,215 +370,3 @@ class BibTeXEntry:
res.append("</li>\n\n")
return "".join(res)
-
-
-class ParsedAuthor:
- """The parsed name of an author.
-
- Eddie deserves credit for this incredibly hairy business.
- """
- def __init__(self, first, von, last, jr):
- self.first = first
- self.von = von
- self.last = last
- self.jr = jr
- self.collapsable = 1
-
- self.html = htmlize(str(self))
- self.txt = txtize(str(self))
-
- s = self.html
- for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
- if pat.search(s):
- self.collapsable = 0
- break
-
- def __eq__(self, o):
- return ((self.first == o.first) and
- (self.last == o.last) and
- (self.von == o.von) and
- (self.jr == o.jr))
-
- def __hash__(self):
- return hash(repr(self))
-
- def collapsesTo(self, o):
- """Return true iff 'o' could be a more canonical version of this author
- """
- if not self.collapsable or not o.collapsable:
- return self
-
- if self.last != o.last or self.von != o.von or self.jr != o.jr:
- return self
- if not self.first:
- return o
-
- if len(self.first) == len(o.first):
- n = []
- for a,b in zip(self.first, o.first):
- if a == b:
- n.append(a)
- elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
- n.append(b)
- elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
- n.append(a)
- else:
- return self
- if n == self.first:
- return self
- elif n == o.first:
- return o
- else:
- return self
- else:
- realname = max([len(n) for n in self.first+o.first])>2
- if not realname:
- return self
-
- if len(self.first) < len(o.first):
- short = self.first; long = o.first
- else:
- short = o.first; long = self.first
-
- initials_s = "".join([n[0] for n in short])
- initials_l = "".join([n[0] for n in long])
- idx = initials_l.find(initials_s)
- if idx < 0:
- return self
- n = long[:idx]
- for i in range(idx, idx+len(short)):
- a = long[i]; b = short[i-idx]
- if a == b:
- n.append(a)
- elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
- n.append(b)
- elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
- n.append(a)
- else:
- return self
- n += long[idx+len(short):]
-
- if n == self.first:
- return self
- elif n == o.first:
- return o
- else:
- return self
-
- def __repr__(self):
- return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
- self.last,self.jr)
- def __str__(self):
- a = " ".join(self.first+self.von+self.last)
- if self.jr:
- return "%s, %s" % (a,self.jr)
- return a
-
- def getHomepage(self):
- s = self.html
- for pat, url in config.AUTHOR_RE_LIST:
- if pat.search(s):
- return url
- return None
-
- def getSortingName(self):
- """Return a representation of this author's name in von-last-first-jr
- order, unless overridden by ALPH """
- s = self.html
- for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
- if pat.search(s):
- return v
-
- return txtize(" ".join(self.von+self.last+self.first+self.jr))
-
- def getSectionName(self):
- """Return a HTML representation of this author's name in
- last, first von, jr order"""
- secname = " ".join(self.last)
- more = self.first+self.von
- if more:
- secname += ", "+" ".join(more)
- if self.jr:
- secname += ", "+" ".join(self.jr)
- secname = htmlize(secname)
- return secname
-
- def htmlizeWithLink(self):
- a = self.html
- u = self.getHomepage()
- if u:
- return "<a href='%s'>%s</a>"%(u,a)
- else:
- return a
-
-
-def parseAuthor(s):
- try:
- return _parseAuthor(s)
- except:
- print >>sys.stderr, "Internal error while parsing author %r"%s
- raise
-
-def _parseAuthor(s):
- """Take an author string and return a list of ParsedAuthor."""
- items = []
-
- s = s.strip()
- while s:
- s = s.strip()
- bracelevel = 0
- for i in xrange(len(s)):
- if s[i] == '{':
- bracelevel += 1
- elif s[i] == '}':
- bracelevel -= 1
- elif bracelevel <= 0 and s[i] in " \t\n,":
- break
- if i+1 == len(s):
- items.append(s)
- else:
- items.append(s[0:i])
- if (s[i] == ','):
- items.append(',')
- s = s[i+1:]
-
- authors = [[]]
- for item in items:
- if item == 'and':
- authors.append([])
- else:
- authors[-1].append(item)
-
- parsedAuthors = []
- # Split into first, von, last, jr
- for author in authors:
- commas = 0
- fvl = []
- vl = []
- f = []
- v = []
- l = []
- j = []
- cur = fvl
- for item in author:
- if item == ',':
- if commas == 0:
- vl = fvl
- fvl = []
- cur = f
- else:
- j.extend(f)
- cur = f = []
- commas += 1
- else:
- cur.append(item)
-
- if commas == 0:
- split_von(f,v,l,fvl)
- else:
- f_tmp = []
- split_von(f_tmp,v,l,vl)
-
- parsedAuthors.append(ParsedAuthor(f,v,l,j))
-
- return parsedAuthors
diff --git a/sortutils.py b/sortutils.py
index 419fe03..d86a299 100644
--- a/sortutils.py
+++ b/sortutils.py
@@ -1,7 +1,7 @@
import config
import copy
from utils import txtize
-from entry import buildAuthorTable
+from author import buildAuthorTable
import re
# List: must map from month number to month name.
diff --git a/utils.py b/utils.py
index 4d4b583..e62c446 100644
--- a/utils.py
+++ b/utils.py
@@ -1,6 +1,7 @@
import re
import os
+PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127)))
ALLCHARS = "".join(map(chr,range(256)))
RE_LONE_AMP = re.compile(r'&([^a-z0-9])')
RE_LONE_I = re.compile(r'\\i([^a-z0-9])')
diff --git a/writeHTML.py b/writeHTML.py
index d4e11a0..9e7ddd7 100755
--- a/writeHTML.py
+++ b/writeHTML.py
@@ -15,7 +15,6 @@ import BibTeX
from sortutils import sortEntriesBy, splitSortedEntriesBy, sortEntriesByDate,\
splitEntriesByAuthor
from utils import smartJoin, url_untranslate
-from entry import buildAuthorTable
import config
def getTemplate(name):