From 133164406a50d56633fa39fb553cbb838f70c529 Mon Sep 17 00:00:00 2001 From: Nick Mathewson Date: Sun, 15 Aug 2004 20:03:38 +0000 Subject: Add basic unit tests; fix lastname,firstname name syntax; make configuration a separate file; make templates a configuration option; add separate cache jails to take advantage of apache http auth (last idea is from geoff) svn:r109 --- BibTeX.py | 55 ++++++++++++------- Makefile | 6 +-- TODO | 4 +- _template_bibtex.html | 2 +- config.py | 144 ++++++++++++-------------------------------------- metaphone.py | 1 - updateCache.py | 31 +++++++---- writeHTML.py | 13 +++-- 8 files changed, 107 insertions(+), 149 deletions(-) diff --git a/BibTeX.py b/BibTeX.py index 5f71708..1637ff0 100644 --- a/BibTeX.py +++ b/BibTeX.py @@ -26,13 +26,11 @@ MONTHS = [ None, # recognize them.) WWW_FIELDS = [ 'www_section', 'www_important', 'www_remarks', 'www_abstract_url', 'www_html_url', 'www_pdf_url', 'www_ps_url', - 'www_txt_url', 'www_ps_gz_url', 'www_amazon_url', - 'www_excerpt_url' ] + 'www_txt_url', 'www_ps_gz_url', 'www_amazon_url', + 'www_excerpt_url', 'www_cache_section' ] def url_untranslate(s): """Change a BibTeX key into a string suitable for use in a URL.""" - #s = s.replace(" ", "_") - #s = s.replace(',', "_") s = re.sub(r'([%<>, _])', lambda m: "_%02x"%ord(m.group(1)), s) @@ -42,6 +40,19 @@ class ParseError(Exception): """Raised on invalid BibTeX""" pass + +def smartJoin(*lst): + """Equivalent to os.path.join, but handle"." and ".." entries a bit better. + """ + lst = [ item for item in lst if item != "." ] + idx = 0 + while idx < len(lst): + if idx > 0 and lst[idx] == "..": + del lst[idx] + else: + idx += 1 + return os.path.join(*lst) + class BibTeX: """A parsed BibTeX file""" def __init__(self): @@ -485,6 +496,13 @@ class BibTeXEntry: url = unTeXescapeURL(url) availability.append('%s' %(url,"excerpt")) + cache_section = self.get('www_cache_section', ".") + if cache_section not in config.CACHE_SECTIONS: + if cache_section != ".": + print >>sys.stderr, "Unrecognized cache section %s"%( + cache_section) + cache_section="." + for key, name, ext in (('www_abstract_url', 'abstract','abstract'), ('www_html_url', 'HTML', 'html'), ('www_pdf_url', 'PDF', 'pdf'), @@ -493,10 +511,11 @@ class BibTeXEntry: ('www_ps_gz_url', 'gzipped PS','ps.gz') ): if cached: - url = os.path.join(".", config.CACHE_DIR, - "%s.%s"%(self.key,ext)) - fname = os.path.join(config.OUTPUT_DIR, config.CACHE_DIR, - "%s.%s"%(self.key,ext)) + url = smartJoin(".", config.CACHE_DIR,cache_section, + "%s.%s"%(self.key,ext)) + fname = smartJoin(config.OUTPUT_DIR, config.CACHE_DIR, + cache_section, + "%s.%s"%(self.key,ext)) if not os.path.exists(fname): continue else: url = self.get(key) @@ -607,8 +626,8 @@ def author_url(author): return None def txtize(s): - """Turn a TeX string into decent plaintext.""" - s = RE_LONE_I.sub(lambda m: "%s" % m.group(1), s) + """Turn a TeX string into decnent plaintext.""" + s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s) s = RE_ACCENT.sub(lambda m: "%s" % m.group(2), s) s = RE_LIGATURE.sub(lambda m: "%s%s"%m.groups(), s) s = RE_TEX_CMD.sub("", s) @@ -807,7 +826,6 @@ def parseAuthor(s): """Take an author string and return a list of ParsedAuthor.""" items = [] - #print "A", `s` s = s.strip() while s: s = s.strip() @@ -827,8 +845,6 @@ def parseAuthor(s): items.append(',') s = s[i+1:] - #print "B", items - authors = [[]] for item in items: if item == 'and': @@ -836,13 +852,9 @@ def parseAuthor(s): else: authors[-1].append(item) - #print "C", authors - parsedAuthors = [] # Split into first, von, last, jr for author in authors: - #print author - commas = 0 fvl = [] vl = [] @@ -859,16 +871,18 @@ def parseAuthor(s): cur = f else: j.extend(f) - f = [] + cur = f = [] + commas += 1 else: cur.append(item) + if commas == 0: split_von(f,v,l,fvl) else: - split_von(None,v,l,vl) + f_tmp = [] + split_von(f_tmp,v,l,vl) parsedAuthors.append(ParsedAuthor(f,v,l,j)) - #print " ====> ", parsedAuthors[-1] return parsedAuthors @@ -901,6 +915,7 @@ def split_von(f,v,l,x): l.append(f[-1]) del f[-1] + class Parser: """Parser class: reads BibTeX from a file and returns a BibTeX object.""" def __init__(self, fileiter, initial_strings, result=None): diff --git a/Makefile b/Makefile index b72ab63..e89a9e8 100644 --- a/Makefile +++ b/Makefile @@ -1,15 +1,15 @@ PYTHON=python2 -VERSION=0.1 +VERSION=0.2cvs all: - $(PYTHON) writeHTML.py + $(PYTHON) writeHTML.py anonbib.cfg clean: rm -f *~ */*~ *.pyc *.pyo update: - $(PYTHON) updateCache.py + $(PYTHON) updateCache.py anonbib.cfg veryclean: clean rm -f author.html date.html topic.html bibtex.html tmp.bib diff --git a/TODO b/TODO index fe39c7a..6ff9c77 100644 --- a/TODO +++ b/TODO @@ -1,6 +1,8 @@ - More general tasks . Know about @book + . Write unit tests for everything + . Make name parsing vaguely sane - Maybe uncrossref in tmp.bib - Maybe pull important papers to the start of their sections? . Clean \{}~ when going from note to url; add \{}~ when making @@ -11,7 +13,7 @@ - When sorting by date, entries with unknown months go into a magic "month zero" before January. Is this right? - Strip unused features. - - Take a configuration file on the command line instead of just + o Take a configuration file on the command line instead of just importing config.py. - Cache tasks diff --git a/_template_bibtex.html b/_template_bibtex.html index d5bad7e..ee4c8f3 100644 --- a/_template_bibtex.html +++ b/_template_bibtex.html @@ -8,7 +8,7 @@ diff --git a/config.py b/config.py index 0fea53c..68fbbf7 100644 --- a/config.py +++ b/config.py @@ -2,92 +2,38 @@ import re -# Our input filename. -MASTER_BIB = "./anonbib.bib" - -# Where do we put generated HTML? -OUTPUT_DIR = "." - -# Where do we put cached papers (relative to OUTPUT_DIR) -CACHE_DIR = "cache" - -# Timeout when downloading from a server while caching, in seconds. -DOWNLOAD_CONNECT_TIMEOUT = 15 - -# Map from author name regex to author homepage. -AUTHOR_URLS = { - 'Ross.*Anderson' : 'http://www.cl.cam.ac.uk/users/rja14/', - 'Alessandro.*Acquisti' : 'http://www.sims.berkeley.edu/~acquisti/', - 'Adam.*Back' : 'http://www.cypherspace.org/~adam/', - 'Berthold' : 'http://page.inf.fu-berlin.de/~berthold/', - 'Miguel.*Castro' : 'http://research.microsoft.com/users/mcastro/', - 'Chaum' : 'http://www.chaum.com/', - 'J.*Claessens' : 'http://www.esat.kuleuven.ac.be/~joclaess/', - 'R.*Clayton' : 'http://www.cl.cam.ac.uk/~rnc1/', - 'Danezis' : 'http://www.cl.cam.ac.uk/~gd216/', - 'Claudia.*az' : 'http://www.esat.kuleuven.ac.be/~cdiaz/', - 'Dingledine' : 'http://www.freehaven.net/~arma/cv.html', - 'Desmedt' : 'http://www.cs.fsu.edu/~desmedt/', - 'Douceur' : 'http://research.microsoft.com/~johndo/', - 'Michael.*Freedman' : 'http://www.scs.cs.nyu.edu/~mfreed/', - 'Ian.*Goldberg' : 'http://www.cs.berkeley.edu/~iang/', - 'Christian.*Grothoff' : 'http://www.ovmj.org/~grothoff/', - 'D.*Hopwood' : 'http://www.users.zetnet.co.uk/hopwood/', - 'Jakobsson' : 'http://www.rsasecurity.com/rsalabs/staff/bios/mjakobsson/', - 'Juels' : 'http://www.rsasecurity.com/rsalabs/staff/bios/ajuels/', - 'K.*Kurosawa' : 'http://kuro.cis.ibaraki.ac.jp/~kurosawa/', - 'H.*Langos' : 'http://www.wh9.tu-dresden.de/~heinrich/', - 'B.*Liskov' : 'http://www.pmg.lcs.mit.edu/barbara_liskov.html', - 'Mathewson' : 'http://www.wangafu.net/~nickm/', - 'Mazières' : 'http://www.scs.cs.nyu.edu/~dm/', - 'B.*Möller' : ('http://www.informatik.tu-darmstadt.de/TI/' - 'Mitarbeiter/moeller.html'), - 'U.*Möller' : 'http://www.ulfm.de/', - 'D.*Molnar' : 'http://hcs.harvard.edu/~dmolnar/papers.html', - 'R.*Morris' : 'http://www.pdos.lcs.mit.edu/~rtm/', - 'A.*Pfitzmann' : 'http://dud.inf.tu-dresden.de/~pfitza/', - 'B.*Pfitzmann' : 'http://www.zurich.ibm.com/~bpf/', - 'B.*Preneel' : 'http://www.esat.kuleuven.ac.be/~preneel/', - 'Daniel.*Simon' : 'http://research.microsoft.com/crypto/dansimon/me.htm', - 'Rackoff' : 'http://www.cs.toronto.edu/DCS/People/Faculty/rackoff.html', - 'Jean F' : 'http://www.geocities.com/j_f_raymond/', - 'M.*Rennhard' : 'http://www.tik.ee.ethz.ch/~rennhard/', - 'M.*Reiter' : 'http://www.ece.cmu.edu/~reiter/', - 'Rivest' : 'http://theory.lcs.mit.edu/~rivest/', - 'Avi.*Rubin' : 'http://avirubin.com/', - 'Serjantov' : 'http://www.cl.cam.ac.uk/users/aas23/', - 'S.*Seys' : 'http://www.esat.kuleuven.ac.be/~sseys/', - 'Shoup' : 'http://www.shoup.net/', - 'Syverson' : 'http://www.syverson.org/', - 'Tsudik' : 'http://www.ics.uci.edu/~gts/c.html', - 'M.*Waidner' : 'http://www.zurich.ibm.com/~wmi/', - 'David.*Wagner' : 'http://www.cs.berkeley.edu/~daw/', - 'M.*Waldman' : 'http://cs1.cs.nyu.edu/~waldman/', - 'B.*Waters' : 'http://www.cs.princeton.edu/~bwaters/', - 'M.*Wright' : 'http://www.cs.umass.edu/~mwright/', - } - -# List of paterns for author names _not_ to do an initial-tolerant -# match on when building section list. E.g., if "J\\. Smith" is in -# this list, he won't be folded into "John Smith". -NO_COLLAPSE_AUTHORS = [ - -] - -# Map from LaTeX-style name of author to collapse to canonical name. -COLLAPSE_AUTHORS = { - "Nicholas Mathewson": "Nick Mathewson", - } - -# Map from author pattern to collation key. -# This keeps 'Zero Knowledge Systems' from getting alphabetized as "Systems, -# Zero Knowledge." -ALPHABETIZE_AUTHOR_AS = { - "Zero.*Knowledge.*Systems": "Zero Knowledge Systems", - } - -# Map of strings to initialzie BibTeX parsing with. -INITIAL_STRINGS = { +_KEYS = [ "ALPHABETIZE_AUTHOR_AS","AUTHOR_URLS","CACHE_DIR","CACHE_SECTIONS", + "COLLAPSE_AUTHORS", + "DOWNLOAD_CONNECT_TIMEOUT","INITIAL_STRINGS", + "MASTER_BIB", "NO_COLLAPSE_AUTHORS", "OMIT_ENTRIES", + "OUTPUT_DIR", "TEMPLATE_FILE", "BIBTEX_TEMPLATE_FILE" ] + +for _k in _KEYS: + globals()[_k]=None + +def load(cfgFile): + mod = {} + execfile(cfgFile, mod) + for _k in _KEYS: + try: + globals()[_k]=mod[_k] + except KeyError: + raise KeyError("Configuration option %s is missing"%_k) + + INITIAL_STRINGS.update(_EXTRA_INITIAL_STRINGS) + AUTHOR_RE_LIST[:] = [ + (re.compile(k, re.I), v,) for k, v in AUTHOR_URLS.items() + ] + + NO_COLLAPSE_AUTHORS_RE_LIST[:] = [ + re.compile(pat, re.I) for pat in NO_COLLAPSE_AUTHORS + ] + + ALPHABETIZE_AUTHOR_AS_RE_LIST[:] = [ + (re.compile(k, re.I), v,) for k,v in ALPHABETIZE_AUTHOR_AS.items() + ] + +_EXTRA_INITIAL_STRINGS = { # MONTHS 'jan' : 'January', 'feb' : 'February', 'mar' : 'March', 'apr' : 'April', @@ -95,30 +41,10 @@ INITIAL_STRINGS = { 'jul' : 'July', 'aug' : 'August', 'sep' : 'September', 'oct' : 'October', 'nov' : 'November', 'dec' : 'December', - - # SECTIONS - 'sec_mix' : "Mix Networks: Design", - 'sec_mixattacks' : "Mix Networks: Attacks", - 'sec_stream' : "Stream-based anonymity", - 'sec_traffic' : "Traffic analysis", - 'sec_pub' : "Anonymous publication", - 'sec_nym' : "Pseudonymity" } -# Don't put in any entries of this type. -OMIT_ENTRIES = ("proceedings", "journal") - - -### Don't edit below this line - -AUTHOR_RE_LIST = [ - (re.compile(k, re.I), v,) for k, v in AUTHOR_URLS.items() - ] +AUTHOR_RE_LIST = [] -NO_COLLAPSE_AUTHORS_RE_LIST = [ - re.compile(pat, re.I) for pat in NO_COLLAPSE_AUTHORS - ] +NO_COLLAPSE_AUTHORS_RE_LIST = [] -ALPHABETIZE_AUTHOR_AS_RE_LIST = [ - (re.compile(k, re.I), v,) for k,v in ALPHABETIZE_AUTHOR_AS.items() - ] +ALPHABETIZE_AUTHOR_AS_RE_LIST = [] diff --git a/metaphone.py b/metaphone.py index 9c261d6..e86d4fc 100644 --- a/metaphone.py +++ b/metaphone.py @@ -188,7 +188,6 @@ def metaphone(s): def demo(a): print a, "=>", metaphone(a) -print __name__ if __name__ == '__main__': demo("Nick. Mathewson") diff --git a/updateCache.py b/updateCache.py index b746f8c..cb2417a 100644 --- a/updateCache.py +++ b/updateCache.py @@ -30,13 +30,14 @@ def tryUnlink(fn): except OSError: pass -def getCacheFname(key, ftype): - return os.path.join(config.OUTPUT_DIR,config.CACHE_DIR, - "%s.%s"%(key,ftype)) - - -def downloadFile(key, ftype, url, timeout=config.DOWNLOAD_CONNECT_TIMEOUT): - fname = getCacheFname(key, ftype) +def getCacheFname(key, ftype, section): + return BibTeX.smartJoin(config.OUTPUT_DIR,config.CACHE_DIR, + section, + "%s.%s"%(key,ftype)) + +def downloadFile(key, ftype, section, url, + timeout=config.DOWNLOAD_CONNECT_TIMEOUT): + fname = getCacheFname(key, ftype, section) fnameTmp = fname+".tmp" fnameURL = fname+".url" tryUnlink(fnameTmp) @@ -89,7 +90,7 @@ def getURLs(entry): return r def getCachedURL(key, ftype): - fname = getCacheFname(key, ftype) + fname = getCacheFname(key, ftype, section) urlFname = fname+".url" if not os.path.exists(fname) or not os.path.exists(urlFname): return None @@ -106,10 +107,11 @@ def downloadAll(bibtex, missingOnly=0): for e in bibtex.entries: urls = getURLs(e) key = e.key + section = e.get("www_cache_section", ".") for ftype, url in urls.items(): - fname = getCacheFname(key, ftype) + fname = getCacheFname(key, ftype, section) if missingOnly: - cachedURL = getCachedURL(key, ftype) + cachedURL = getCachedURL(key, ftype, section) if cachedURL == url: print >>sys.stderr,"Skipping",url continue @@ -118,7 +120,7 @@ def downloadAll(bibtex, missingOnly=0): else: print >>sys.stderr,"No record for %s.%s"%(key,ftype) try: - downloadFile(key, ftype, url) + downloadFile(key, ftype, section, url) print "Downloaded",url except UIError, e: print >>sys.stderr, str(e) @@ -129,5 +131,12 @@ def downloadAll(bibtex, missingOnly=0): errors.append((key,ftype,url,msg)) return errors +if len(sys.argv) == 2: + print "Loading from %s"%sys.argv[1] +else: + print >>sys.stderr, "Expected a single configuration file as an argument" + sys.exit(1) +config.load(sys.argv[1]) + bib = BibTeX.parseFile(config.MASTER_BIB) downloadAll(bib,missingOnly=1) diff --git a/writeHTML.py b/writeHTML.py index 9f6a0fa..d25a0a0 100644 --- a/writeHTML.py +++ b/writeHTML.py @@ -14,7 +14,7 @@ import BibTeX import config def getTemplate(name): - f = open(name+".html") + f = open(name) template = f.read() f.close() template_s, template_e = template.split("%(entries)s") @@ -70,11 +70,18 @@ def writeHTML(f, sections, sectionType, fieldName, choices, section_urls={}): 'sections' : secStr, } - header, footer = getTemplate("_template_") + header, footer = getTemplate(config.TEMPLATE_FILE) print >>f, header%fields writeBody(f, sections, section_urls) print >>f, footer%fields +if len(sys.argv) == 2: + print "Loading from %s"%sys.argv[1] +else: + print >>sys.stderr, "Expected a single configuration file as an argument" + sys.exit(1) +config.load(sys.argv[1]) + bib = BibTeX.parseFile(config.MASTER_BIB) ##### Sorted views: @@ -145,7 +152,7 @@ entries = bib.entries[:] entries = [ (ent.key, ent) for ent in entries ] entries.sort() entries = [ ent[1] for ent in entries ] -header,footer = getTemplate("_template_bibtex") +header,footer = getTemplate(config.BIBTEX_TEMPLATE_FILE) f = open(os.path.join(config.OUTPUT_DIR,"bibtex.html"), 'w') print >>f, header % { 'command_line' : "" } for ent in entries: -- cgit v1.2.3-70-g09d2