From 133164406a50d56633fa39fb553cbb838f70c529 Mon Sep 17 00:00:00 2001
From: Nick Mathewson <nickm@torproject.org>
Date: Sun, 15 Aug 2004 20:03:38 +0000
Subject: Add basic unit tests; fix lastname,firstname name syntax; make
 configuration a separate file; make templates a configuration option; add
 separate cache jails to take advantage of apache http auth (last idea is from
 geoff)

svn:r109
---
 BibTeX.py             |  55 ++++++++++++-------
 Makefile              |   6 +--
 TODO                  |   4 +-
 _template_bibtex.html |   2 +-
 config.py             | 144 ++++++++++++--------------------------------------
 metaphone.py          |   1 -
 updateCache.py        |  31 +++++++----
 writeHTML.py          |  13 +++--
 8 files changed, 107 insertions(+), 149 deletions(-)
diff --git a/BibTeX.py b/BibTeX.py
index 5f71708..1637ff0 100644
--- a/BibTeX.py
+++ b/BibTeX.py
@@ -26,13 +26,11 @@ MONTHS = [ None,
 # recognize them.)
 WWW_FIELDS = [ 'www_section', 'www_important', 'www_remarks',
                'www_abstract_url', 'www_html_url', 'www_pdf_url', 'www_ps_url',
-               'www_txt_url', 'www_ps_gz_url', 'www_amazon_url', 
-	       'www_excerpt_url' ]
+               'www_txt_url', 'www_ps_gz_url', 'www_amazon_url',
+	       'www_excerpt_url', 'www_cache_section' ]
 
 def url_untranslate(s):
     """Change a BibTeX key into a string suitable for use in a URL."""
-    #s = s.replace(" ", "_")
-    #s = s.replace(',', "_")
     s = re.sub(r'([%<>, _])',
                lambda m: "_%02x"%ord(m.group(1)),
                s)
@@ -42,6 +40,19 @@ class ParseError(Exception):
     """Raised on invalid BibTeX"""
     pass
 
+
+def smartJoin(*lst):
+    """Equivalent to os.path.join, but collapse "." and ".." entries.
+
+    "." entries are dropped and each ".." cancels the path component before
+    it (a leading ".." is kept), as done textually by os.path.normpath.
+    If nothing but "." entries are given, the result is ".".
+    """
+    # Filter "." first so that an all-"." call still joins at least one item;
+    # the old loop deleted ".." but kept its parent, yielding the wrong path.
+    lst = [ item for item in lst if item != "." ] or ["."]
+    return os.path.normpath(os.path.join(*lst))
+
 class BibTeX:
     """A parsed BibTeX file"""
     def __init__(self):
@@ -485,6 +496,13 @@ class BibTeXEntry:
                     url = unTeXescapeURL(url)
                     availability.append('<a href="%s">%s</a>' %(url,"excerpt"))
 
+            cache_section = self.get('www_cache_section', ".")
+            if cache_section not in config.CACHE_SECTIONS:
+                if cache_section != ".":
+                    print >>sys.stderr, "Unrecognized cache section %s"%(
+                        cache_section)
+                    cache_section="."
+
             for key, name, ext in (('www_abstract_url', 'abstract','abstract'),
                                    ('www_html_url', 'HTML', 'html'),
                                    ('www_pdf_url', 'PDF', 'pdf'),
@@ -493,10 +511,11 @@ class BibTeXEntry:
                                    ('www_ps_gz_url', 'gzipped&nbsp;PS','ps.gz')
                                    ):
                 if cached:
-                    url = os.path.join(".", config.CACHE_DIR,
-                                       "%s.%s"%(self.key,ext))
-                    fname = os.path.join(config.OUTPUT_DIR, config.CACHE_DIR,
-                                         "%s.%s"%(self.key,ext))
+                    url = smartJoin(".", config.CACHE_DIR,cache_section,
+                                    "%s.%s"%(self.key,ext))
+                    fname = smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
+                                      cache_section,
+                                      "%s.%s"%(self.key,ext))
                     if not os.path.exists(fname): continue
                 else:
                     url = self.get(key)
@@ -607,8 +626,8 @@ def author_url(author):
     return None
 
 def txtize(s):
     """Turn a TeX string into decent plaintext."""
-    s = RE_LONE_I.sub(lambda m: "%s" % m.group(1), s)
+    s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
     s = RE_ACCENT.sub(lambda m: "%s" % m.group(2), s)
     s = RE_LIGATURE.sub(lambda m: "%s%s"%m.groups(), s)
     s = RE_TEX_CMD.sub("", s)
@@ -807,7 +826,6 @@ def parseAuthor(s):
     """Take an author string and return a list of ParsedAuthor."""
     items = []
 
-    #print "A", `s`
     s = s.strip()
     while s:
         s = s.strip()
@@ -827,8 +845,6 @@ def parseAuthor(s):
             items.append(',')
         s = s[i+1:]
 
-    #print "B", items
-
     authors = [[]]
     for item in items:
         if item == 'and':
@@ -836,13 +852,9 @@ def parseAuthor(s):
         else:
             authors[-1].append(item)
 
-    #print "C", authors
-
     parsedAuthors = []
     # Split into first, von, last, jr
     for author in authors:
-        #print author
-
         commas = 0
         fvl = []
         vl = []
@@ -859,16 +871,18 @@ def parseAuthor(s):
                     cur = f
                 else:
                     j.extend(f)
-                    f = []
+                    cur = f = []
+                commas += 1
             else:
                 cur.append(item)
+
         if commas == 0:
             split_von(f,v,l,fvl)
         else:
-            split_von(None,v,l,vl)
+            f_tmp = []
+            split_von(f_tmp,v,l,vl)
 
         parsedAuthors.append(ParsedAuthor(f,v,l,j))
-        #print "   ====> ", parsedAuthors[-1]
 
     return parsedAuthors
 
@@ -901,6 +915,7 @@ def split_von(f,v,l,x):
         l.append(f[-1])
         del f[-1]
 
+
 class Parser:
     """Parser class: reads BibTeX from a file and returns a BibTeX object."""
     def __init__(self, fileiter, initial_strings, result=None):
diff --git a/Makefile b/Makefile
index b72ab63..e89a9e8 100644
--- a/Makefile
+++ b/Makefile
@@ -1,15 +1,15 @@
 
 PYTHON=python2
-VERSION=0.1
+VERSION=0.2cvs
 
 all:
-	$(PYTHON) writeHTML.py
+	$(PYTHON) writeHTML.py anonbib.cfg
 
 clean:
 	rm -f *~ */*~ *.pyc *.pyo
 
 update: 
-	$(PYTHON) updateCache.py
+	$(PYTHON) updateCache.py anonbib.cfg
 
 veryclean: clean
 	rm -f author.html date.html topic.html bibtex.html tmp.bib
diff --git a/TODO b/TODO
index fe39c7a..6ff9c77 100644
--- a/TODO
+++ b/TODO
@@ -1,6 +1,8 @@
 
 - More general tasks
         . Know about @book
+	. Write unit tests for everything
+	. Make name parsing vaguely sane
         - Maybe uncrossref in tmp.bib
         - Maybe pull important papers to the start of their sections?
         . Clean \{}~ when going from note to url; add \{}~ when making
@@ -11,7 +13,7 @@
         - When sorting by date, entries with unknown months go into a magic
           "month zero" before January.  Is this right?
 	- Strip unused features.
-	- Take a configuration file on the command line instead of just
+	o Take a configuration file on the command line instead of just
 	  importing config.py.
 
 - Cache tasks
diff --git a/_template_bibtex.html b/_template_bibtex.html
index d5bad7e..ee4c8f3 100644
--- a/_template_bibtex.html
+++ b/_template_bibtex.html
@@ -8,7 +8,7 @@
 
 <!-- *** I AM MACHINE GENERATED! DO NOT EDIT ME!
      *** EDIT THE .bib FILE or _template_.html INSTEAD!
-   
+
      Generated by `%(command_line)s'
      (c) Eddie Kohler 1999-2000, Nick Mathewson 2003 -->
 
diff --git a/config.py b/config.py
index 0fea53c..68fbbf7 100644
--- a/config.py
+++ b/config.py
@@ -2,92 +2,38 @@
 
 import re
 
-# Our input filename.
-MASTER_BIB = "./anonbib.bib"
-
-# Where do we put generated HTML?
-OUTPUT_DIR = "."
-
-# Where do we put cached papers (relative to OUTPUT_DIR)
-CACHE_DIR = "cache"
-
-# Timeout when downloading from a server while caching, in seconds.
-DOWNLOAD_CONNECT_TIMEOUT = 15
-
-# Map from author name regex to author homepage.
-AUTHOR_URLS = {
-    'Ross.*Anderson' : 'http://www.cl.cam.ac.uk/users/rja14/',
-    'Alessandro.*Acquisti' : 'http://www.sims.berkeley.edu/~acquisti/',
-    'Adam.*Back' : 'http://www.cypherspace.org/~adam/',
-    'Berthold' : 'http://page.inf.fu-berlin.de/~berthold/',
-    'Miguel.*Castro' : 'http://research.microsoft.com/users/mcastro/',
-    'Chaum' : 'http://www.chaum.com/',
-    'J.*Claessens' : 'http://www.esat.kuleuven.ac.be/~joclaess/',
-    'R.*Clayton' : 'http://www.cl.cam.ac.uk/~rnc1/',
-    'Danezis' : 'http://www.cl.cam.ac.uk/~gd216/',
-    'Claudia.*az' : 'http://www.esat.kuleuven.ac.be/~cdiaz/',
-    'Dingledine' : 'http://www.freehaven.net/~arma/cv.html',
-    'Desmedt' : 'http://www.cs.fsu.edu/~desmedt/',
-    'Douceur' : 'http://research.microsoft.com/~johndo/',
-    'Michael.*Freedman' : 'http://www.scs.cs.nyu.edu/~mfreed/',
-    'Ian.*Goldberg' : 'http://www.cs.berkeley.edu/~iang/',
-    'Christian.*Grothoff' : 'http://www.ovmj.org/~grothoff/',
-    'D.*Hopwood' : 'http://www.users.zetnet.co.uk/hopwood/',
-    'Jakobsson' : 'http://www.rsasecurity.com/rsalabs/staff/bios/mjakobsson/',
-    'Juels' : 'http://www.rsasecurity.com/rsalabs/staff/bios/ajuels/',
-    'K.*Kurosawa' : 'http://kuro.cis.ibaraki.ac.jp/~kurosawa/',
-    'H.*Langos' : 'http://www.wh9.tu-dresden.de/~heinrich/',
-    'B.*Liskov' : 'http://www.pmg.lcs.mit.edu/barbara_liskov.html',
-    'Mathewson' : 'http://www.wangafu.net/~nickm/',
-    'Mazi&egrave;res' : 'http://www.scs.cs.nyu.edu/~dm/',
-    'B.*M&ouml;ller' : ('http://www.informatik.tu-darmstadt.de/TI/'
-                        'Mitarbeiter/moeller.html'),
-    'U.*M&ouml;ller' : 'http://www.ulfm.de/',
-    'D.*Molnar' : 'http://hcs.harvard.edu/~dmolnar/papers.html',
-    'R.*Morris' : 'http://www.pdos.lcs.mit.edu/~rtm/',
-    'A.*Pfitzmann' : 'http://dud.inf.tu-dresden.de/~pfitza/',
-    'B.*Pfitzmann' : 'http://www.zurich.ibm.com/~bpf/',
-    'B.*Preneel' : 'http://www.esat.kuleuven.ac.be/~preneel/',
-    'Daniel.*Simon' : 'http://research.microsoft.com/crypto/dansimon/me.htm',
-    'Rackoff' : 'http://www.cs.toronto.edu/DCS/People/Faculty/rackoff.html',
-    'Jean F' : 'http://www.geocities.com/j_f_raymond/',
-    'M.*Rennhard' : 'http://www.tik.ee.ethz.ch/~rennhard/',
-    'M.*Reiter' : 'http://www.ece.cmu.edu/~reiter/',
-    'Rivest' : 'http://theory.lcs.mit.edu/~rivest/',
-    'Avi.*Rubin' : 'http://avirubin.com/',
-    'Serjantov' : 'http://www.cl.cam.ac.uk/users/aas23/',
-    'S.*Seys' : 'http://www.esat.kuleuven.ac.be/~sseys/',
-    'Shoup' : 'http://www.shoup.net/',
-    'Syverson' : 'http://www.syverson.org/',
-    'Tsudik' : 'http://www.ics.uci.edu/~gts/c.html',
-    'M.*Waidner' : 'http://www.zurich.ibm.com/~wmi/',
-    'David.*Wagner' : 'http://www.cs.berkeley.edu/~daw/',
-    'M.*Waldman' : 'http://cs1.cs.nyu.edu/~waldman/',
-    'B.*Waters' : 'http://www.cs.princeton.edu/~bwaters/',
-    'M.*Wright' : 'http://www.cs.umass.edu/~mwright/',
-    }
-
-# List of paterns for author names _not_ to do an initial-tolerant
-# match on when building section list.  E.g., if "J\\. Smith" is in
-# this list, he won't be folded into "John Smith".
-NO_COLLAPSE_AUTHORS = [
-
-]
-
-# Map from LaTeX-style name of author to collapse to canonical name.
-COLLAPSE_AUTHORS = {
-    "Nicholas Mathewson": "Nick Mathewson",
-    }
-
-# Map from author pattern to collation key.
-# This keeps 'Zero Knowledge Systems' from getting alphabetized as "Systems,
-# Zero Knowledge."
-ALPHABETIZE_AUTHOR_AS = {
-    "Zero.*Knowledge.*Systems": "Zero Knowledge Systems",
-    }
-
-# Map of strings to initialzie BibTeX parsing with.
-INITIAL_STRINGS = {
+_KEYS = [ "ALPHABETIZE_AUTHOR_AS","AUTHOR_URLS","CACHE_DIR","CACHE_SECTIONS",
+          "COLLAPSE_AUTHORS",
+          "DOWNLOAD_CONNECT_TIMEOUT","INITIAL_STRINGS",
+          "MASTER_BIB", "NO_COLLAPSE_AUTHORS", "OMIT_ENTRIES",
+          "OUTPUT_DIR", "TEMPLATE_FILE", "BIBTEX_TEMPLATE_FILE" ]
+
+for _k in _KEYS:
+    globals()[_k]=None
+
+def load(cfgFile):  # Execute cfgFile as Python and copy each option named in _KEYS into this module.
+    mod = {}
+    execfile(cfgFile, mod)  # runs the file's code; its top-level names land in mod
+    for _k in _KEYS:
+        try:
+            globals()[_k]=mod[_k]
+        except KeyError:
+            raise KeyError("Configuration option %s is missing"%_k)
+
+    INITIAL_STRINGS.update(_EXTRA_INITIAL_STRINGS)  # merge in the month-name strings
+    AUTHOR_RE_LIST[:] = [  # slice-assign so the existing list object is refilled in place
+        (re.compile(k, re.I), v,) for k, v in AUTHOR_URLS.items()
+        ]
+
+    NO_COLLAPSE_AUTHORS_RE_LIST[:] = [
+        re.compile(pat, re.I) for pat in NO_COLLAPSE_AUTHORS
+        ]
+
+    ALPHABETIZE_AUTHOR_AS_RE_LIST[:] = [
+        (re.compile(k, re.I), v,) for k,v in ALPHABETIZE_AUTHOR_AS.items()
+        ]
+
+_EXTRA_INITIAL_STRINGS = {
     # MONTHS
      'jan' : 'January',         'feb' : 'February',
      'mar' : 'March',           'apr' : 'April',
@@ -95,30 +41,10 @@ INITIAL_STRINGS = {
      'jul' : 'July',            'aug' : 'August',
      'sep' : 'September',       'oct' : 'October',
      'nov' : 'November',        'dec' : 'December',
-
-    # SECTIONS
-     'sec_mix' : "Mix Networks: Design",
-     'sec_mixattacks' : "Mix Networks: Attacks",
-     'sec_stream' : "Stream-based anonymity",
-     'sec_traffic' : "Traffic analysis",
-     'sec_pub' : "Anonymous publication",
-     'sec_nym' : "Pseudonymity"
 }
 
-# Don't put in any entries of this type.
-OMIT_ENTRIES = ("proceedings", "journal")
-
-
-### Don't edit below this line
-
-AUTHOR_RE_LIST = [
-    (re.compile(k, re.I), v,) for k, v in AUTHOR_URLS.items()
-    ]
+AUTHOR_RE_LIST = []
 
-NO_COLLAPSE_AUTHORS_RE_LIST = [
-    re.compile(pat, re.I) for pat in NO_COLLAPSE_AUTHORS
-    ]
+NO_COLLAPSE_AUTHORS_RE_LIST = []
 
-ALPHABETIZE_AUTHOR_AS_RE_LIST = [
-    (re.compile(k, re.I), v,) for k,v in ALPHABETIZE_AUTHOR_AS.items()
-    ]
+ALPHABETIZE_AUTHOR_AS_RE_LIST = []
diff --git a/metaphone.py b/metaphone.py
index 9c261d6..e86d4fc 100644
--- a/metaphone.py
+++ b/metaphone.py
@@ -188,7 +188,6 @@ def metaphone(s):
 def demo(a):
     print a, "=>", metaphone(a)
 
-print __name__
 if __name__ == '__main__':
     demo("Nick. Mathewson")
 
diff --git a/updateCache.py b/updateCache.py
index b746f8c..cb2417a 100644
--- a/updateCache.py
+++ b/updateCache.py
@@ -30,13 +30,14 @@ def tryUnlink(fn):
     except OSError:
         pass
 
-def getCacheFname(key, ftype):
-    return os.path.join(config.OUTPUT_DIR,config.CACHE_DIR,
-                        "%s.%s"%(key,ftype))
-
-
-def downloadFile(key, ftype, url, timeout=config.DOWNLOAD_CONNECT_TIMEOUT):
-    fname = getCacheFname(key, ftype)
+def getCacheFname(key, ftype, section):  # Path of the cached "<key>.<ftype>" file within cache section `section`.
+    return BibTeX.smartJoin(config.OUTPUT_DIR,config.CACHE_DIR,
+                            section,
+                            "%s.%s"%(key,ftype))
+
+def downloadFile(key, ftype, section, url, timeout=None):
+    if timeout is None: timeout = config.DOWNLOAD_CONNECT_TIMEOUT  # looked up per call: config is unset until config.load()
+    fname = getCacheFname(key, ftype, section)
     fnameTmp = fname+".tmp"
     fnameURL = fname+".url"
     tryUnlink(fnameTmp)
@@ -89,7 +90,7 @@ def getURLs(entry):
     return r
 
 def getCachedURL(key, ftype):
-    fname = getCacheFname(key, ftype)
+    fname = getCacheFname(key, ftype, section)
     urlFname = fname+".url"
     if not os.path.exists(fname) or not os.path.exists(urlFname):
         return None
@@ -106,10 +107,11 @@ def downloadAll(bibtex, missingOnly=0):
     for e in bibtex.entries:
         urls = getURLs(e)
         key = e.key
+        section = e.get("www_cache_section", ".")
         for ftype, url in urls.items():
-            fname = getCacheFname(key, ftype)
+            fname = getCacheFname(key, ftype, section)
             if missingOnly:
-                cachedURL = getCachedURL(key, ftype) 
+                cachedURL = getCachedURL(key, ftype, section)
                 if cachedURL == url:
                     print >>sys.stderr,"Skipping",url
                     continue
@@ -118,7 +120,7 @@ def downloadAll(bibtex, missingOnly=0):
                 else:
                     print >>sys.stderr,"No record for %s.%s"%(key,ftype)
             try:
-                downloadFile(key, ftype, url)
+                downloadFile(key, ftype, section, url)
                 print "Downloaded",url
             except UIError, e:
                 print >>sys.stderr, str(e)
@@ -129,5 +131,12 @@ def downloadAll(bibtex, missingOnly=0):
                 errors.append((key,ftype,url,msg))
     return errors
 
+if len(sys.argv) == 2:
+    print "Loading from %s"%sys.argv[1]
+else:
+    print >>sys.stderr, "Expected a single configuration file as an argument"
+    sys.exit(1)
+config.load(sys.argv[1])
+
 bib = BibTeX.parseFile(config.MASTER_BIB)
 downloadAll(bib,missingOnly=1)
diff --git a/writeHTML.py b/writeHTML.py
index 9f6a0fa..d25a0a0 100644
--- a/writeHTML.py
+++ b/writeHTML.py
@@ -14,7 +14,7 @@ import BibTeX
 import config
 
 def getTemplate(name):
-    f = open(name+".html")
+    f = open(name)
     template = f.read()
     f.close()
     template_s, template_e = template.split("%(entries)s")
@@ -70,11 +70,18 @@ def writeHTML(f, sections, sectionType, fieldName, choices, section_urls={}):
                'sections' : secStr,
          }
 
-    header, footer = getTemplate("_template_")
+    header, footer = getTemplate(config.TEMPLATE_FILE)
     print >>f, header%fields
     writeBody(f, sections, section_urls)
     print >>f, footer%fields
 
+if len(sys.argv) == 2:
+    print "Loading from %s"%sys.argv[1]
+else:
+    print >>sys.stderr, "Expected a single configuration file as an argument"
+    sys.exit(1)
+config.load(sys.argv[1])
+
 bib = BibTeX.parseFile(config.MASTER_BIB)
 
 ##### Sorted views:
@@ -145,7 +152,7 @@ entries = bib.entries[:]
 entries = [ (ent.key, ent) for ent in entries ]
 entries.sort()
 entries = [ ent[1] for ent in entries ]
-header,footer = getTemplate("_template_bibtex")
+header,footer = getTemplate(config.BIBTEX_TEMPLATE_FILE)
 f = open(os.path.join(config.OUTPUT_DIR,"bibtex.html"), 'w')
 print >>f, header % { 'command_line' : "" }
 for ent in entries:
-- 
cgit v1.2.3-70-g09d2