Split author

author: Thibaut Horel <thibaut.horel@gmail.com> 2016-02-04 20:06:49 -0500
committer: Thibaut Horel <thibaut.horel@gmail.com> 2016-02-04 20:06:49 -0500
commit: 5af5043ac67529aa2ecc05c6a3bbc22a4419b9cb (patch)
tree: 6d73e11141cf2ffbec11561e44d5c60f0dc75131
parent: da7359cd452f2ded9e05e753fb125508343b8587 (diff)
download: anonbib-5af5043ac67529aa2ecc05c6a3bbc22a4419b9cb.tar.gz
6 files changed, 291 insertions, 290 deletions
diff --git a/BibTeX.py b/BibTeX.py
index 85228a1..6831929 100644
--- a/BibTeX.py
+++ b/BibTeX.py
@@ -80,9 +80,6 @@ class BibTeX:
         self.entries = newEntries
 
 
-
-
-
 class FileIter:
     def __init__(self, fname=None, file=None, it=None, string=None):
         if fname:
@@ -392,4 +389,3 @@ if __name__ == '__main__':
     for e in r.entries:
         if e.type in ("proceedings", "journal"): continue
         print e.to_html()
-
diff --git a/author.py b/author.py
new file mode 100644
index 0000000..44319e7
--- /dev/null
+++ b/author.py
@@ -0,0 +1,286 @@
+import sys
+import config
+import re
+from utils import htmlize, txtize, ALLCHARS, PRINTINGCHARS
+
+
+LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
+SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+               "abcdefghijklmnopqrstuvwxyz"
+               "@")
+RE_ESCAPED = re.compile(r'\\.')
+
+def split_von(f,v,l,x):
+    in_von = 0
+    while x:
+        tt = t = x[0]
+        del x[0]
+        if tt[:2] == '{\\':
+            tt = tt.translate(ALLCHARS, SV_DELCHARS)
+            tt = RE_ESCAPED.sub("", tt)
+            tt = tt.translate(ALLCHARS, "{}")
+        if tt.translate(ALLCHARS, LC_CHARS) == "":
+            v.append(t)
+            in_von = 1
+        elif in_von and f is not None:
+            l.append(t)
+            l.extend(x)
+            return
+        else:
+            f.append(t)
+    if not in_von:
+        l.append(f[-1])
+        del f[-1]
+
+def buildAuthorTable(entries):
+    """Given a list of BibTeXEntry, return a map from parsed author name to
+       parsed canonical name.
+    """
+    authorsByLast = {}
+    for e in entries:
+        for a in e.parsedAuthor:
+            authorsByLast.setdefault(tuple(a.last), []).append(a)
+    # map from author to collapsed author.
+    result = {}
+    for k,v in config.COLLAPSE_AUTHORS.items():
+        a = parseAuthor(k)[0]
+        c = parseAuthor(v)[0]
+        result[c] = c
+        result[a] = c
+
+    for e in entries:
+        for author in e.parsedAuthor:
+            if result.has_key(author):
+                continue
+
+            c = author
+            for a in authorsByLast[tuple(author.last)]:
+                if a is author:
+                    continue
+                c = c.collapsesTo(a)
+            result[author] = c
+
+    if 0:
+        for a,c in result.items():
+            if a != c:
+                print "Collapsing authors: %s => %s" % (a,c)
+    if 0:
+        print parseAuthor("Franz Kaashoek")[0].collapsesTo(
+            parseAuthor("M. Franz Kaashoek")[0])
+        print parseAuthor("Paul F. Syverson")[0].collapsesTo(
+            parseAuthor("Paul Syverson")[0])
+        print parseAuthor("Paul Syverson")[0].collapsesTo(
+            parseAuthor("Paul F. Syverson")[0])
+
+    return result
+
+class ParsedAuthor:
+    """The parsed name of an author.
+
+       Eddie deserves credit for this incredibly hairy business.
+    """
+    def __init__(self, first, von, last, jr):
+        self.first = first
+        self.von = von
+        self.last = last
+        self.jr = jr
+        self.collapsable = 1
+
+        self.html = htmlize(str(self))
+        self.txt = txtize(str(self))
+
+        s = self.html
+        for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
+            if pat.search(s):
+                self.collapsable = 0
+                break
+
+    def __eq__(self, o):
+        return ((self.first == o.first) and
+                (self.last  == o.last) and
+                (self.von   == o.von) and
+                (self.jr    == o.jr))
+
+    def __hash__(self):
+        return hash(repr(self))
+
+    def collapsesTo(self, o):
+        """Return true iff 'o' could be a more canonical version of this author
+        """
+        if not self.collapsable or not o.collapsable:
+            return self
+
+        if self.last != o.last or self.von != o.von or self.jr != o.jr:
+            return self
+        if not self.first:
+            return o
+
+        if len(self.first) == len(o.first):
+            n = []
+            for a,b in zip(self.first, o.first):
+                if a == b:
+                    n.append(a)
+                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+                    n.append(b)
+                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+                    n.append(a)
+                else:
+                    return self
+            if n == self.first:
+                return self
+            elif n == o.first:
+                return o
+            else:
+                return self
+        else:
+            realname = max([len(n) for n in self.first+o.first])>2
+            if not realname:
+                return self
+
+            if len(self.first) < len(o.first):
+                short = self.first; long = o.first
+            else:
+                short = o.first; long = self.first
+
+            initials_s = "".join([n[0] for n in short])
+            initials_l = "".join([n[0] for n in long])
+            idx = initials_l.find(initials_s)
+            if idx < 0:
+                return self
+            n = long[:idx]
+            for i in range(idx, idx+len(short)):
+                a = long[i]; b = short[i-idx]
+                if a == b:
+                    n.append(a)
+                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+                    n.append(b)
+                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+                    n.append(a)
+                else:
+                    return self
+            n += long[idx+len(short):]
+
+            if n == self.first:
+                return self
+            elif n == o.first:
+                return o
+            else:
+                return self
+
+    def __repr__(self):
+        return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
+                                            self.last,self.jr)
+    def __str__(self):
+        a = " ".join(self.first+self.von+self.last)
+        if self.jr:
+            return "%s, %s" % (a,self.jr)
+        return a
+
+    def getHomepage(self):
+        s = self.html
+        for pat, url in config.AUTHOR_RE_LIST:
+            if pat.search(s):
+                return url
+        return None
+
+    def getSortingName(self):
+        """Return a representation of this author's name in von-last-first-jr
+           order, unless overridden by ALPH """
+        s = self.html
+        for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
+            if pat.search(s):
+                return v
+
+        return txtize(" ".join(self.von+self.last+self.first+self.jr))
+
+    def getSectionName(self):
+        """Return a HTML representation of this author's name in
+           last, first von, jr order"""
+        secname = " ".join(self.last)
+        more = self.first+self.von
+        if more:
+            secname += ", "+" ".join(more)
+        if self.jr:
+            secname += ", "+" ".join(self.jr)
+        secname = htmlize(secname)
+        return secname
+
+    def htmlizeWithLink(self):
+        a = self.html
+        u = self.getHomepage()
+        if u:
+            return "<a href='%s'>%s</a>"%(u,a)
+        else:
+            return a
+
+
+def parseAuthor(s):
+    try:
+        return _parseAuthor(s)
+    except:
+        print >>sys.stderr, "Internal error while parsing author %r"%s
+        raise
+
+def _parseAuthor(s):
+    """Take an author string and return a list of ParsedAuthor."""
+    items = []
+
+    s = s.strip()
+    while s:
+        s = s.strip()
+        bracelevel = 0
+        for i in xrange(len(s)):
+            if s[i] == '{':
+                bracelevel += 1
+            elif s[i] == '}':
+                bracelevel -= 1
+            elif bracelevel <= 0 and s[i] in " \t\n,":
+                break
+        if i+1 == len(s):
+            items.append(s)
+        else:
+            items.append(s[0:i])
+        if (s[i] == ','):
+            items.append(',')
+        s = s[i+1:]
+
+    authors = [[]]
+    for item in items:
+        if item == 'and':
+            authors.append([])
+        else:
+            authors[-1].append(item)
+
+    parsedAuthors = []
+    # Split into first, von, last, jr
+    for author in authors:
+        commas = 0
+        fvl = []
+        vl = []
+        f = []
+        v = []
+        l = []
+        j = []
+        cur = fvl
+        for item in author:
+            if item == ',':
+                if commas == 0:
+                    vl = fvl
+                    fvl = []
+                    cur = f
+                else:
+                    j.extend(f)
+                    cur = f = []
+                commas += 1
+            else:
+                cur.append(item)
+
+        if commas == 0:
+            split_von(f,v,l,fvl)
+        else:
+            f_tmp = []
+            split_von(f_tmp,v,l,vl)
+
+        parsedAuthors.append(ParsedAuthor(f,v,l,j))
+
+    return parsedAuthors
diff --git a/entry.py b/entry.py
index 9846e32..4be2bc2 100644
--- a/entry.py
+++ b/entry.py
@@ -3,8 +3,9 @@ import sys
 import re
 import config
 import os
-from utils import htmlize, txtize, url_untranslate, unTeXescapeURL, smartJoin,\
-    _split
+from utils import htmlize, url_untranslate, unTeXescapeURL, smartJoin,\
+    _split, ALLCHARS, PRINTINGCHARS
+from author import parseAuthor
 
 # Fields that we only care about for making web pages (BibTeX doesn't
 # recognize them.)
@@ -20,80 +21,10 @@ def author_url(author):
         if pat.search(author):
             return url
     return None
-ALLCHARS = "".join(map(chr,range(256)))
-PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127)))
-LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
-SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-               "abcdefghijklmnopqrstuvwxyz"
-               "@")
-RE_ESCAPED = re.compile(r'\\.')
 PROCEEDINGS_RE = re.compile(
                         r'((?:proceedings|workshop record) of(?: the)? )(.*)',
                         re.I)
 
-def split_von(f,v,l,x):
-    in_von = 0
-    while x:
-        tt = t = x[0]
-        del x[0]
-        if tt[:2] == '{\\':
-            tt = tt.translate(ALLCHARS, SV_DELCHARS)
-            tt = RE_ESCAPED.sub("", tt)
-            tt = tt.translate(ALLCHARS, "{}")
-        if tt.translate(ALLCHARS, LC_CHARS) == "":
-            v.append(t)
-            in_von = 1
-        elif in_von and f is not None:
-            l.append(t)
-            l.extend(x)
-            return
-        else:
-            f.append(t)
-    if not in_von:
-        l.append(f[-1])
-        del f[-1]
-
-def buildAuthorTable(entries):
-    """Given a list of BibTeXEntry, return a map from parsed author name to
-       parsed canonical name.
-    """
-    authorsByLast = {}
-    for e in entries:
-        for a in e.parsedAuthor:
-            authorsByLast.setdefault(tuple(a.last), []).append(a)
-    # map from author to collapsed author.
-    result = {}
-    for k,v in config.COLLAPSE_AUTHORS.items():
-        a = parseAuthor(k)[0]
-        c = parseAuthor(v)[0]
-        result[c] = c
-        result[a] = c
-
-    for e in entries:
-        for author in e.parsedAuthor:
-            if result.has_key(author):
-                continue
-
-            c = author
-            for a in authorsByLast[tuple(author.last)]:
-                if a is author:
-                    continue
-                c = c.collapsesTo(a)
-            result[author] = c
-
-    if 0:
-        for a,c in result.items():
-            if a != c:
-                print "Collapsing authors: %s => %s" % (a,c)
-    if 0:
-        print parseAuthor("Franz Kaashoek")[0].collapsesTo(
-            parseAuthor("M. Franz Kaashoek")[0])
-        print parseAuthor("Paul F. Syverson")[0].collapsesTo(
-            parseAuthor("Paul Syverson")[0])
-        print parseAuthor("Paul Syverson")[0].collapsesTo(
-            parseAuthor("Paul F. Syverson")[0])
-
-    return result
 
 # List of fields that appear when we display the entries as BibTeX.
 DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle',
@@ -439,215 +370,3 @@ class BibTeXEntry:
         res.append("</li>\n\n")
 
         return "".join(res)
-
-
-class ParsedAuthor:
-    """The parsed name of an author.
-
-       Eddie deserves credit for this incredibly hairy business.
-    """
-    def __init__(self, first, von, last, jr):
-        self.first = first
-        self.von = von
-        self.last = last
-        self.jr = jr
-        self.collapsable = 1
-
-        self.html = htmlize(str(self))
-        self.txt = txtize(str(self))
-
-        s = self.html
-        for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
-            if pat.search(s):
-                self.collapsable = 0
-                break
-
-    def __eq__(self, o):
-        return ((self.first == o.first) and
-                (self.last  == o.last) and
-                (self.von   == o.von) and
-                (self.jr    == o.jr))
-
-    def __hash__(self):
-        return hash(repr(self))
-
-    def collapsesTo(self, o):
-        """Return true iff 'o' could be a more canonical version of this author
-        """
-        if not self.collapsable or not o.collapsable:
-            return self
-
-        if self.last != o.last or self.von != o.von or self.jr != o.jr:
-            return self
-        if not self.first:
-            return o
-
-        if len(self.first) == len(o.first):
-            n = []
-            for a,b in zip(self.first, o.first):
-                if a == b:
-                    n.append(a)
-                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
-                    n.append(b)
-                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
-                    n.append(a)
-                else:
-                    return self
-            if n == self.first:
-                return self
-            elif n == o.first:
-                return o
-            else:
-                return self
-        else:
-            realname = max([len(n) for n in self.first+o.first])>2
-            if not realname:
-                return self
-
-            if len(self.first) < len(o.first):
-                short = self.first; long = o.first
-            else:
-                short = o.first; long = self.first
-
-            initials_s = "".join([n[0] for n in short])
-            initials_l = "".join([n[0] for n in long])
-            idx = initials_l.find(initials_s)
-            if idx < 0:
-                return self
-            n = long[:idx]
-            for i in range(idx, idx+len(short)):
-                a = long[i]; b = short[i-idx]
-                if a == b:
-                    n.append(a)
-                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
-                    n.append(b)
-                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
-                    n.append(a)
-                else:
-                    return self
-            n += long[idx+len(short):]
-
-            if n == self.first:
-                return self
-            elif n == o.first:
-                return o
-            else:
-                return self
-
-    def __repr__(self):
-        return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
-                                            self.last,self.jr)
-    def __str__(self):
-        a = " ".join(self.first+self.von+self.last)
-        if self.jr:
-            return "%s, %s" % (a,self.jr)
-        return a
-
-    def getHomepage(self):
-        s = self.html
-        for pat, url in config.AUTHOR_RE_LIST:
-            if pat.search(s):
-                return url
-        return None
-
-    def getSortingName(self):
-        """Return a representation of this author's name in von-last-first-jr
-           order, unless overridden by ALPH """
-        s = self.html
-        for pat,v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
-            if pat.search(s):
-                return v
-
-        return txtize(" ".join(self.von+self.last+self.first+self.jr))
-
-    def getSectionName(self):
-        """Return a HTML representation of this author's name in
-           last, first von, jr order"""
-        secname = " ".join(self.last)
-        more = self.first+self.von
-        if more:
-            secname += ", "+" ".join(more)
-        if self.jr:
-            secname += ", "+" ".join(self.jr)
-        secname = htmlize(secname)
-        return secname
-
-    def htmlizeWithLink(self):
-        a = self.html
-        u = self.getHomepage()
-        if u:
-            return "<a href='%s'>%s</a>"%(u,a)
-        else:
-            return a
-
-
-def parseAuthor(s):
-    try:
-        return _parseAuthor(s)
-    except:
-        print >>sys.stderr, "Internal error while parsing author %r"%s
-        raise
-
-def _parseAuthor(s):
-    """Take an author string and return a list of ParsedAuthor."""
-    items = []
-
-    s = s.strip()
-    while s:
-        s = s.strip()
-        bracelevel = 0
-        for i in xrange(len(s)):
-            if s[i] == '{':
-                bracelevel += 1
-            elif s[i] == '}':
-                bracelevel -= 1
-            elif bracelevel <= 0 and s[i] in " \t\n,":
-                break
-        if i+1 == len(s):
-            items.append(s)
-        else:
-            items.append(s[0:i])
-        if (s[i] == ','):
-            items.append(',')
-        s = s[i+1:]
-
-    authors = [[]]
-    for item in items:
-        if item == 'and':
-            authors.append([])
-        else:
-            authors[-1].append(item)
-
-    parsedAuthors = []
-    # Split into first, von, last, jr
-    for author in authors:
-        commas = 0
-        fvl = []
-        vl = []
-        f = []
-        v = []
-        l = []
-        j = []
-        cur = fvl
-        for item in author:
-            if item == ',':
-                if commas == 0:
-                    vl = fvl
-                    fvl = []
-                    cur = f
-                else:
-                    j.extend(f)
-                    cur = f = []
-                commas += 1
-            else:
-                cur.append(item)
-
-        if commas == 0:
-            split_von(f,v,l,fvl)
-        else:
-            f_tmp = []
-            split_von(f_tmp,v,l,vl)
-
-        parsedAuthors.append(ParsedAuthor(f,v,l,j))
-
-    return parsedAuthors
diff --git a/sortutils.py b/sortutils.py
index 419fe03..d86a299 100644
--- a/sortutils.py
+++ b/sortutils.py
@@ -1,7 +1,7 @@
 import config
 import copy
 from utils import txtize
-from entry import buildAuthorTable
+from author import buildAuthorTable
 import re
 
 # List: must map from month number to month name.
diff --git a/utils.py b/utils.py
index 4d4b583..e62c446 100644
--- a/utils.py
+++ b/utils.py
@@ -1,6 +1,7 @@
 import re
 import os
 
+PRINTINGCHARS = "\t\n\r"+"".join(map(chr,range(32, 127)))
 ALLCHARS = "".join(map(chr,range(256)))
 RE_LONE_AMP = re.compile(r'&([^a-z0-9])')
 RE_LONE_I = re.compile(r'\\i([^a-z0-9])')
diff --git a/writeHTML.py b/writeHTML.py
index d4e11a0..9e7ddd7 100755
--- a/writeHTML.py
+++ b/writeHTML.py
@@ -15,7 +15,6 @@ import BibTeX
 from sortutils import sortEntriesBy, splitSortedEntriesBy, sortEntriesByDate,\
     splitEntriesByAuthor
 from utils import smartJoin, url_untranslate
-from entry import buildAuthorTable
 import config
 
 def getTemplate(name):
author	Thibaut Horel <thibaut.horel@gmail.com>	2016-02-04 20:06:49 -0500
committer	Thibaut Horel <thibaut.horel@gmail.com>	2016-02-04 20:06:49 -0500
commit	5af5043ac67529aa2ecc05c6a3bbc22a4419b9cb (patch)
tree	6d73e11141cf2ffbec11561e44d5c60f0dc75131
parent	da7359cd452f2ded9e05e753fb125508343b8587 (diff)
download	anonbib-5af5043ac67529aa2ecc05c6a3bbc22a4419b9cb.tar.gz