Collapse authors with similar names; make output pass XHTML/CSS validators.

Also some author refactoring. svn:r30
author: Nick Mathewson <nickm@torproject.org> 2003-05-23 02:38:55 +0000
committer: Nick Mathewson <nickm@torproject.org> 2003-05-23 02:38:55 +0000
commit: a5d4c56336ec216006fe830f630b7de5dde5c67a (patch)
tree: 4fe39099c345876d29d8aeb4afca07101c2a6d5d
parent: 4c84657cc6a6cffab2e322c70423e4d3b7977ea9 (diff)
download: anonbib-a5d4c56336ec216006fe830f630b7de5dde5c67a.tar.gz
8 files changed, 239 insertions, 83 deletions
diff --git a/BibTeX.py b/BibTeX.py
index 2e61453..bf849bd 100644
--- a/BibTeX.py
+++ b/BibTeX.py
@@ -20,9 +20,10 @@ WWW_FIELDS = [ 'www_section', 'www_important', 'www_remarks',
                'www_txt_url', 'www_ps_gz_url' ]
 
 def url_untranslate(s):
-    s = s.replace(" ", "+")
-    s = re.sub(r'([%<>])',
-               lambda m: "%%%02x"%ord(m.group(1)),
+    #s = s.replace(" ", "_")
+    #s = s.replace(',', "_")
+    s = re.sub(r'([%<>, _])',
+               lambda m: "_%02x"%ord(m.group(1)),
                s)
     return s
 
@@ -64,6 +65,40 @@ class BibTeX:
                 newEntries.append(ent)
         self.entries = newEntries                
 
+def buildAuthorTable(entries):
+
+    authorsByLast = {}
+    for e in entries:
+        for a in e.parsedAuthor:
+            authorsByLast.setdefault(tuple(a.last), []).append(a)
+    # map from author to collapsed author.            
+    result = {}
+    for e in entries:
+        for author in e.parsedAuthor:
+            if result.has_key(author):
+                continue
+            
+            c = author
+            for a in authorsByLast[tuple(author.last)]:
+                if a is author:
+                    continue
+                c = c.collapsesTo(a)
+            result[author] = c
+
+    if 1:
+        for a,c in result.items():
+            if a != c:
+                print "Collapsing authors: %s => %s" % (a,c)
+    if 0:
+        print parseAuthor("Franz Kaashoek")[0].collapsesTo(
+            parseAuthor("M. Franz Kaashoek")[0])
+        print parseAuthor("Paul F. Syverson")[0].collapsesTo(
+            parseAuthor("Paul Syverson")[0])
+        print parseAuthor("Paul Syverson")[0].collapsesTo(
+            parseAuthor("Paul F. Syverson")[0])
+                
+    return result
+
 def splitEntriesBy(entries, field):
     result = {}
     for ent in entries:
@@ -90,30 +125,28 @@ def splitSortedEntriesBy(entries, field):
 
 def sortEntriesBy(entries, field, default):
     tmp = []
+    i = 0
     for ent in entries:
+        i += 1
         v = ent.get(field, default)
         if v.startswith("<span class='bad'>"):
             v = default
-        tmp.append((txtize(v), ent))
+        tmp.append((txtize(v), i, ent))
     tmp.sort()
-    return [ t[1] for t in tmp ]
+    return [ t[2] for t in tmp ]
 
 def splitEntriesByAuthor(entries):
+    collapsedAuthors = buildAuthorTable(entries)
     entries = sortEntriesByDate(entries)
     result = {} # Name in sorting order -> entries
     htmlResult = {} # name in sorting order -> Full name
     url_map = {} # Full name -> Url
     for ent in entries:
         for a in ent.parsedAuthor:
-            sortkey = txtize(" ".join(a.von+a.last+a.first+a.jr))
-            url = author_url(" ".join(a.first+a.von+a.last+a.jr))
-            secname = " ".join(a.last)
-            more = a.first+a.von
-            if more:
-                secname += ", "+" ".join(more)
-            if a.jr:
-                secname += ", "+" ".join(a.jr)
-            secname = htmlize(secname)
+            canonical = collapsedAuthors[a]
+            url = canonical.getHomepage()
+            sortkey = canonical.getSortingName()
+            secname = canonical.getSectionName()
             if url:
                 url_map[secname] = url
 
@@ -126,16 +159,20 @@ def splitEntriesByAuthor(entries):
 
 def sortEntriesByAuthor(entries):
     tmp = []
+    i = 0
     for ent in entries:
+        i += 1
         authors = [ txtize(" ".join(a.von+a.last+a.first+a.jr))
                     for a in ent.parsedAuthor ]
-        tmp.append((tuple(authors), ent))
+        tmp.append((tuple(authors), i, ent))
     tmp.sort()
-    return [ t[1] for t in tmp ]
+    return [ t[2] for t in tmp ]
 
 def sortEntriesByDate(entries):
     tmp = []
+    i = 0
     for ent in entries:
+        i += 1
         try:
             mon = MONTHS.index(ent.get("month"))
         except ValueError:
@@ -149,9 +186,9 @@ def sortEntriesByDate(entries):
             date = 10000*13
         except ValueError:
             date = 10000*13
-        tmp.append((date, ent))
+        tmp.append((date, i, ent))
     tmp.sort()
-    return [ t[1] for t in tmp ]
+    return [ t[2] for t in tmp ]
     
 
 DISPLAYED_FIELDS = [ 'title', 'author', 'journal', 'booktitle',
@@ -365,11 +402,6 @@ class BibTeXEntry:
         else:
             res = ["<li><p class='entry'><span class='title'>%s</span>"%(
                 htmlize(self['title']))]
-
-        #eclass = ["entry", "impEntry"][imp]
-        #        
-        #res = ["<li><p class='%s'><span class='title'>%s</span>"%(
-        #    eclass, htmlize(self['title']))]
                 
         availability = []
         for key, name in (('www_abstract_url', 'abstract'),
@@ -385,10 +417,10 @@ class BibTeXEntry:
             res.append(" <span class='availability'>(")
             res.append(",&nbsp;".join(availability))
             res.append(")</span>")
-        res.append("<br><span class='author'>by ")
+        res.append("<br /><span class='author'>by ")
 
         #res.append("\n<!-- %r -->\n" % self.parsedAuthor)
-        htmlAuthors = [ htmlize_author(a) for a in self.parsedAuthor ]
+        htmlAuthors = [ a.htmlizeWithLink() for a in self.parsedAuthor ]
 
         if len(htmlAuthors) == 1:
             res.append(htmlAuthors[0])
@@ -401,7 +433,7 @@ class BibTeXEntry:
 
         if res[-1][-1] != '.':
             res.append(".")
-        res.append("</span><br>\n")
+        res.append("</span><br />\n")
         res.append(self.biblio_to_html())
 
         res.append("</p>"),
@@ -440,19 +472,6 @@ def htmlize(s):
     s = s.replace("--", "&ndash;");
     return s
 
-def htmlize_author(author):
-    f,v,l,j = author.first,author.von,author.last,author.jr
-    a = " ".join(f+v+l)
-    if j:
-        a = "%s, %s" %(a,j)
-    a = htmlize(a)
-    u = author_url(a)
-    if u:
-        return "<a href='%s'>%s</a>"%(u,a)
-    else:
-        return a
-    return a
-
 def author_url(author):
     for pat, url in config.AUTHOR_RE_LIST:
         if pat.search(author):
@@ -465,24 +484,137 @@ def txtize(s):
     s = RE_TEX_CMD.sub("", s)
     s = s.translate(ALLCHARS, "{}")
     return s
-    
 
 PROCEEDINGS_RE = re.compile(
                         r'((?:proceedings|workshop record) of(?: the)? )(.*)',
                         re.I)
                      
-
 class ParsedAuthor:
     def __init__(self, first, von, last, jr):
         self.first = first
         self.von = von
         self.last = last
         self.jr = jr
+        self.collapsable = 1
+        s = htmlize(str(self))
+        for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
+            if pat.search(s):
+                self.collapsable = 0
+                break
+        
+    def __eq__(self, o):
+        return ((self.first == o.first) and
+                (self.last  == o.last) and                
+                (self.von   == o.von) and
+                (self.jr    == o.jr))
+
+    def __neq__(self, o):
+        return ((self.first != o.first) or
+                (self.last  != o.last) or
+                (self.von   != o.von) or                
+                (self.jr    != o.jr))
+    
+    def __hash__(self):
+        return hash(repr(self))
+
+    def collapsesTo(self, o):
+        if not self.collapsable or not o.collapsable:
+            return self
+        
+        if self.last != o.last or self.von != o.von or self.jr != o.jr:
+            return self
+        if not self.first:
+            return o
+
+        if len(self.first) == len(o.first):
+            n = []
+            for a,b in zip(self.first, o.first):
+                if a == b:
+                    n.append(a)
+                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+                    n.append(b)
+                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+                    n.append(a)
+                else:
+                    return self
+            if n == self.first:
+                return self
+            elif n == o.first:
+                return o
+            else:
+                return self
+        else:
+            realname = max([len(n) for n in self.first+o.first])>2
+            if not realname:
+                return self
+            
+            if len(self.first) < len(o.first):
+                short = self.first; long = o.first
+            else:
+                short = o.first; long = self.first
+
+            initials_s = "".join([n[0] for n in short])
+            initials_l = "".join([n[0] for n in long])
+            idx = initials_l.find(initials_s)
+            if idx < 0:
+                return self
+            n = long[:idx]
+            for i in range(idx, idx+len(short)):
+                a = long[i]; b = short[i-idx]
+                if a == b:
+                    n.append(a)
+                elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
+                    n.append(b)
+                elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
+                    n.append(a)                    
+                else:
+                    return self
+            n += long[idx+len(short):]
+
+            if n == self.first:
+                return self
+            elif n == o.first:
+                return o
+            else:
+                return self
+        
     def __repr__(self):
         return "ParsedAuthor(%r,%r,%r,%r)"%(self.first,self.von,
                                             self.last,self.jr)
     def __str__(self):
-        return " ".join(self.first+self.von+self.last+self.jr)
+        a = " ".join(self.first+self.von+self.last)
+        if self.jr:
+            return "%s, %s" % (a,self.jr)
+        return a
+
+    def getHomepage(self):
+        s = htmlize(str(self))
+        for pat, url in config.AUTHOR_RE_LIST:
+            if pat.search(str(self)):
+                return url
+        return None
+
+    def getSortingName(self):
+        return txtize(" ".join(self.von+self.last+self.first+self.jr))
+                          
+    def getSectionName(self):
+        secname = " ".join(self.last)
+        more = self.first+self.von
+        if more:
+            secname += ", "+" ".join(more)
+        if self.jr:
+            secname += ", "+" ".join(self.jr)
+        secname = htmlize(secname)
+        return secname
+        
+    def htmlizeWithLink(self):
+        a = str(self)
+        a = htmlize(a)
+        u = self.getHomepage()
+        if u:
+            return "<a href='%s'>%s</a>"%(u,a)
+        else:
+            return a
 
 def _split(s,w=79,indent=8):
     r = []
@@ -522,7 +654,6 @@ class FileIter:
     def next(self):
         self.lineno += 1
         return self._next()
-    
 
 def parseAuthor(s):
     items = []
diff --git a/TODO b/TODO
index 68a9b6a..914e61a 100644
--- a/TODO
+++ b/TODO
@@ -28,9 +28,11 @@ Next:
         - Also clean \_ to _ and back
         - Look for urls in wherepublished.
         - Forgive newlines in wherepublished, note.
-        - "Systems, Zero Knowledge"?
-        - When sorting by author, is Paul Syverson different from Paul
-          F. Syverson?
-        - What the heck is the algorithm for sorting within a year,
+        D "Systems, Zero Knowledge"?
+	- Make CSS and HTML pass the validator
+        o When sorting by author, make "Paul F. Syverson" the same person
+          as "Paul Syverson" unless somebody says different.
+        - When sorting within a year
+What the heck is the algorithm for sorting within a year,
           when sorting by date? What should it be?
 
diff --git a/_template_.html b/_template_.html
index 473fe4c..a3c7d28 100644
--- a/_template_.html
+++ b/_template_.html
@@ -1,22 +1,25 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
-<html><head>
-<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
-<meta http-equiv="Content-Style-Type" content="text/css">
+<?xml version="1.0"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+<meta http-equiv="Content-Style-Type" content="text/css" />
 
 <!-- *** I AM MACHINE GENERATED! DO NOT EDIT ME!
-  -- *** EDIT THE .bib FILE or _template_.html INSTEAD!
-  --
-  -- Generated by `%(command_line)s'
-  -- (c) Eddie Kohler 1999-2000, Nick Mathewson 2003 -->
+     *** EDIT THE .bib FILE or _template_.html INSTEAD!
+    
+     Generated by `%(command_line)s'
+     (c) Eddie Kohler 1999-2000, Nick Mathewson 2003 -->
 
 <title>Anonymity Bibliography</title>
 
-<link rel="stylesheet" type="text/css" href="./css/main.css">
-<link rel="stylesheet" type="text/css" href="./css/pubs.css">
+<link rel="stylesheet" type="text/css" href="./css/main.css" />
+<link rel="stylesheet" type="text/css" href="./css/pubs.css" />
 
 </head>
 <body bgcolor="#ffffff" text="#000000" link="#bb0000" vlink="#990099"
-alink="#ff9900" marginheight="0" marginwidth="0">
+alink="#ff9900" >
 
 <h1 align="center">Anonymity bibliography</h1>
 <p align="center">%(choices)s</p>
@@ -29,11 +32,10 @@ alink="#ff9900" marginheight="0" marginwidth="0">
 <!-- Table 2: The sidebar-->
 <table align="right" cellspacing="0" cellpadding="5" width="100" 
      class="sidebar">
-<tr valign="top"><td><p 
-class="l1"><strong>%(sectiontypes)s:</strong><br>
+<tr valign="top"><td><p class="l1"><strong>%(sectiontypes)s:</strong><br /></p>
 %(sections)s
-</p></td>
-
+</td>
+</tr>
 </table><!-- End of table 2 -->
 </td>
 
@@ -41,13 +43,13 @@ class="l1"><strong>%(sectiontypes)s:</strong><br>
 
 <h2>Publications by %(field)s</h2>
 
-<ul>
+<ul class="sections">
 %(entries)s
 </ul>
 
 </td>
 
-<td width="5%%"><br></td>
+<td width="5%%"><br /></td>
 
 </tr>
 </table><!-- End of table 1 --> 
diff --git a/_template_bibtex.html b/_template_bibtex.html
index 82246bd..d5bad7e 100644
--- a/_template_bibtex.html
+++ b/_template_bibtex.html
@@ -1,24 +1,28 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
-<html><head>
-<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
-<meta http-equiv="Content-Style-Type" content="text/css">
+<?xml version="1.0"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1" />
+<meta http-equiv="Content-Style-Type" content="text/css" />
 
 <!-- *** I AM MACHINE GENERATED! DO NOT EDIT ME!
-  -- *** EDIT THE .bib FILE or _template_.html INSTEAD!
-  --
-  -- Generated by `%(command_line)s'
-  -- (c) Eddie Kohler 1999-2000, Nick Mathewson 2003 -->
+     *** EDIT THE .bib FILE or _template_.html INSTEAD!
+   
+     Generated by `%(command_line)s'
+     (c) Eddie Kohler 1999-2000, Nick Mathewson 2003 -->
 
 <title>Anonymity Bibliography: BibTeX</title>
 
-<link rel="stylesheet" type="text/css" href="./css/main.css">
-<link rel="stylesheet" type="text/css" href="./css/pubs.css">
+<link rel="stylesheet" type="text/css" href="./css/main.css" />
+<link rel="stylesheet" type="text/css" href="./css/pubs.css" />
 
 </head>
 <body bgcolor="#ffffff" text="#000000" link="#bb0000" vlink="#990099"
-      alink="#ff9900" marginheight="0" marginwidth="0">
+      alink="#ff9900" >
 
 <table cellspacing="15" border="0" align="center" width="100%%">
 %(entries)s
 </table>
+</body>
 </html>
diff --git a/config.py b/config.py
index 8de6c96..e6eacda 100644
--- a/config.py
+++ b/config.py
@@ -29,6 +29,13 @@ AUTHOR_URLS = {
     
     }
 
+# List of patterns for author names _not_ to do an initial-tolerant
+# match on when building section list.  E.g., if "J\\. Smith" is in
+# this list, he won't be folded into "John Smith".
+NO_COLLAPSE_AUTHORS = [
+
+]
+
 INITIAL_STRINGS = {
     # MONTHS
      'jan' : 'January',         'feb' : 'February',
@@ -55,3 +62,7 @@ OMIT_ENTRIES = ("proceedings", "journal")
 AUTHOR_RE_LIST = [
     (re.compile(k, re.I), v,) for k, v in AUTHOR_URLS.items()
     ]
+
+NO_COLLAPSE_AUTHORS_RE_LIST = [
+    re.compile(pat, re.I) for pat in NO_COLLAPSE_AUTHORS
+    ]
diff --git a/css/main.css b/css/main.css
index 8b336c9..b14c394 100644
--- a/css/main.css
+++ b/css/main.css
@@ -49,6 +49,10 @@ UL.expand {
   margin-bottom: 1em;
 }
 
+UL.sections {
+  list-style: none;
+}
+
 /* Font-level properties */
 
 PRE {
diff --git a/css/pubs.css b/css/pubs.css
index 57b654f..07d7a37 100644
--- a/css/pubs.css
+++ b/css/pubs.css
@@ -29,6 +29,7 @@ SPAN.biblio A {
 
 SPAN.bad {
   text-decoration: underline;
+  color: #000;
   background-color: #FDF;
 }
 
@@ -38,7 +39,7 @@ P.remarks {
   margin-bottom: 0;
   margin-left: 5em;
   padding-left: 0.5em;
-  border-width: 0 0 0 5;
+  border-width: 0 0 0 5px;
   border-color: black;
   border-style: solid;
 }
@@ -63,7 +64,7 @@ P.entry {
 }
 
 DIV.impEntry {
-  border-width: 0.1;
+  border-width: 1px;
   border-color: black;
   border-style: solid;
   background-color: #FFE;
@@ -80,7 +81,7 @@ P.impEntry {
 }
 
 TABLE.sidebar {
-  border-width: 2;
+  border-width: 2px;
   border-color: black;
   border-style: solid;
   background-color: #CFF;
@@ -88,7 +89,7 @@ TABLE.sidebar {
 
 TD.bibtex {
   font-family: lucidatypewriter, "Lucida Typewriter", Monaco, "Lucida Sans Unicode", monospace;  
-  border-width: 2;
+  border-width: 2px;
   font-weight: normal; 
   border-color: black;
   border-style: solid;
@@ -99,3 +100,4 @@ PRE.bibtex {
   font-family: lucidatypewriter, "Lucida Typewriter", Monaco, "Lucida Sans Unicode", monospace; 
   font-size: smaller;
 }
+
diff --git a/writeHTML.py b/writeHTML.py
index 019795f..e5d0816 100644
--- a/writeHTML.py
+++ b/writeHTML.py
@@ -25,15 +25,15 @@ def writeBody(f, sections, section_urls):
         sDisp = re.sub(r'\s+', ' ', s.strip())
         sDisp = sDisp.replace(" ", "&nbsp;")
         if u:
-            print >>f, ('<h3><a name="%s"><a href="%s">%s</a></a></h3>'%(
+            print >>f, ('<li><h3><a name="%s"></a><a href="%s">%s</a></h3>'%(
                 (BibTeX.url_untranslate(s), u, sDisp)))
         else:
-            print >>f, ('<h3><a name="%s">%s</a></h3>'%(
+            print >>f, ('<li><h3><a name="%s">%s</a></h3>'%(
                 BibTeX.url_untranslate(s),sDisp))
         print >>f, "<ul class='expand'>"
         for e in entries:
             print >>f, e.to_html()
-        print >>f, "</ul>"
+        print >>f, "</ul></li>"
 
 def writeHTML(f, sections, sectionType, fieldName, choices, section_urls={}):
     """sections: list of (sectionname, [list of BibTeXEntry])'''
@@ -57,7 +57,7 @@ def writeHTML(f, sections, sectionType, fieldName, choices, section_urls={}):
         else:
             choiceStr.append(choice)
         
-    choiceStr = "<p align='center'>%s</p>" % (" | ".join(choiceStr))
+    choiceStr = ("&nbsp;|&nbsp;".join(choiceStr))
 
     fields = { 'command_line' :  "",
                'sectiontypes' :  sectionType,
author	Nick Mathewson <nickm@torproject.org>	2003-05-23 02:38:55 +0000
committer	Nick Mathewson <nickm@torproject.org>	2003-05-23 02:38:55 +0000
commit	a5d4c56336ec216006fe830f630b7de5dde5c67a (patch)
tree	4fe39099c345876d29d8aeb4afca07101c2a6d5d
parent	4c84657cc6a6cffab2e322c70423e4d3b7977ea9 (diff)
download	anonbib-a5d4c56336ec216006fe830f630b7de5dde5c67a.tar.gz