Add a simple caching implementation

svn:r67
author: Nick Mathewson <nickm@torproject.org> 2003-09-09 00:25:29 +0000
committer: Nick Mathewson <nickm@torproject.org> 2003-09-09 00:25:29 +0000
commit: c666e2e0794ba284b5a0c0f17d54c4a3233ea187 (patch)
tree: 498512e8706921fa27af73e5f28d665859b50379
parent: d3acfd8b215920491745f462d08fcd5e963a52ca (diff)
download: anonbib-c666e2e0794ba284b5a0c0f17d54c4a3233ea187.tar.gz
5 files changed, 168 insertions, 15 deletions
diff --git a/.cvsignore b/.cvsignore
index 432c089..6ca418d 100644
--- a/.cvsignore
+++ b/.cvsignore
@@ -4,3 +4,4 @@ date.html
 topic.html
 author.html
 bibtex.html
+cache
+\ No newline at end of file
diff --git a/BibTeX.py b/BibTeX.py
index ac5ba5d..b9d36ec 100644
--- a/BibTeX.py
+++ b/BibTeX.py
@@ -3,6 +3,7 @@
 import cStringIO
 import re
 import sys
+import os
 
 import config
 
@@ -414,21 +415,33 @@ class BibTeXEntry:
         res.append("<span class='title'><a name='%s'>%s</a></span>"%(
             url_untranslate(self.key),htmlize(self['title'])))
                 
-        availability = []
-        for key, name in (('www_abstract_url', 'abstract'),
-                          ('www_html_url', 'HTML'),
-                          ('www_pdf_url', 'PDF'),
-                          ('www_ps_url', 'PS'),
-                          ('www_txt_url', 'TXT'),
-                          ('www_ps_gz_url', 'gzipped&nbsp;PS')):
-            url = self.get(key)
-            if not url: continue
-            url = unTeXescapeURL(url)
-            availability.append('<a href="%s">%s</a>' %(url,name))
-        if availability:
-            res.append(" <span class='availability'>(")
-            res.append(",&nbsp;".join(availability))
-            res.append(")</span>")
+        for cached in 0,1:
+            availability = []
+            for key, name, ext in (('www_abstract_url', 'abstract','abstract'),
+                                   ('www_html_url', 'HTML', 'html'),
+                                   ('www_pdf_url', 'PDF', 'pdf'),
+                                   ('www_ps_url', 'PS', 'ps'),
+                                   ('www_txt_url', 'TXT', 'txt'),
+                                   ('www_ps_gz_url', 'gzipped&nbsp;PS','ps.gz')
+                                   ):
+                if cached:
+                    url = os.path.join(".", config.CACHE_DIR,
+                                       "%s.%s"%(self.key,ext))
+                    fname = os.path.join(config.OUTPUT_DIR, config.CACHE_DIR,
+                                         "%s.%s"%(self.key,ext))
+                    if not os.path.exists(fname): continue
+                else:
+                    url = self.get(key)
+                    if not url: continue
+                url = unTeXescapeURL(url)
+                availability.append('<a href="%s">%s</a>' %(url,name))
+
+            if availability:
+                res.append(" <span class='availability'>(")
+                if cached: res.append("Cached:&nbsp;")
+                res.append(",&nbsp;".join(availability))
+                res.append(")</span>")
+
         res.append("<br /><span class='author'>by ")
 
         #res.append("\n<!-- %r -->\n" % self.parsedAuthor)
diff --git a/config.py b/config.py
index 76e486e..a777659 100644
--- a/config.py
+++ b/config.py
@@ -5,6 +5,12 @@ MASTER_BIB = "./anonbib.bib"
 
 OUTPUT_DIR = "."
 
+# relative to OUTPUT_DIR.
+CACHE_DIR = "cache"
+
+# Seconds to wait when connecting to a server while caching.
+DOWNLOAD_CONNECT_TIMEOUT = 15
+
 AUTHOR_URLS = {
     'Ross.*Anderson' : 'http://www.cl.cam.ac.uk/users/rja14/',
     'Alessandro.*Acquisti' : 'http://www.sims.berkeley.edu/~acquisti/',
diff --git a/updateCache.py b/updateCache.py
new file mode 100644
index 0000000..5edda3d
--- /dev/null
+++ b/updateCache.py
@@ -0,0 +1,132 @@
+#!/usr/bin/python2
+
+"""Download files in bibliography into a local cache, in order to let the
+   generated HTML pages link to locally cached copies."""
+
+import os
+import sys
+import signal
+import time
+
+import BibTeX
+import config
+import urllib2
+import getopt
+import socket
+import errno
+
+os.umask(022)
+
+FILE_TYPES = [ "txt", "html", "pdf", "ps", "ps.gz", "abstract" ]
+BIN_FILE_TYPES = [ 'pdf', 'ps.gz' ]
+
+class UIError(Exception):
+    pass
+
+def tryUnlink(fn):
+    """Remove the file fn; an OSError (for example, because fn does not
+       exist) is ignored."""
+    try: os.unlink(fn)
+    except OSError: pass
+
+def getCacheFname(key, ftype):
+    """Return the path of the cache file for entry key and file type ftype."""
+    return os.path.join(config.OUTPUT_DIR,config.CACHE_DIR,"%s.%s"%(key,ftype))
+
+# Fetch url into the cache file for (key, ftype); UIError if connecting fails.
+def downloadFile(key, ftype, url, timeout=config.DOWNLOAD_CONNECT_TIMEOUT):
+    fname = getCacheFname(key, ftype)
+    fnameTmp = fname+".tmp"
+    fnameURL = fname+".url"
+    tryUnlink(fnameTmp)
+    # Arm SIGALRM (no-op handler) so a slow connect is interrupted with EINTR.
+    def sigalrmHandler(sig,_):
+        pass
+    signal.signal(signal.SIGALRM, sigalrmHandler)
+    signal.alarm(timeout)
+    try:
+        try:
+            infile = urllib2.urlopen(url)
+        except IOError, e:
+            raise UIError("Cannot connect to url %s: %s"%(url,e))
+        except socket.error, e:
+            if getattr(e,"errno",-1) == errno.EINTR:
+                raise UIError("Connection timed out to url %s"%url)
+            else:
+                raise UIError("Error connecting to %s: %s"%(url, e))
+    finally:
+        signal.alarm(0)
+    # Download into a temporary file; it is renamed into place at the end.
+    mode = 'w'
+    if ftype in BIN_FILE_TYPES:
+        mode = 'wb'
+    outfile = open(fnameTmp, mode)
+    try:
+        while 1:
+            s = infile.read(1<<16)
+            if not s: break
+            outfile.write(s)
+    finally:
+        infile.close()
+        outfile.close()
+    # Record the download time and the URL in FNAME.url (see getCachedURL).
+    urlfile = open(fnameURL, 'w')
+    print >>urlfile, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
+    if "\n" in url: url = url.replace("\n", " ")
+    print >>urlfile, url
+    urlfile.close()
+    # Only now move the finished download to its real cache name.
+    os.rename(fnameTmp, fname)
+
+def getURLs(entry):
+    """Return a dict from each type in FILE_TYPES to entry's URL for it, if
+       any; URLs are stripped, with newlines replaced by spaces."""
+    found = {}
+    for ftype in FILE_TYPES:
+        url = entry.get("www_%s_url"%ftype.replace(".", "_"))
+        if url: found[ftype] = url.strip().replace("\n", " ")
+    return found
+
+def getCachedURL(key, ftype):
+    """Return the URL recorded for a cached file; None if no usable record."""
+    fname = getCacheFname(key, ftype)
+    urlFname = fname+".url"
+    if not (os.path.exists(fname) and os.path.exists(urlFname)): return None
+    f = open(urlFname, 'r')
+    lines = f.readlines()
+    f.close()
+    if len(lines) == 2: return lines[1].strip()
+    print >>sys.stderr, "ERROR: unexpected number of lines in", urlFname
+    return None  # malformed record: report "not cached" so it is re-fetched
+
+def downloadAll(bibtex, missingOnly=0):
+    """Download every URL of every entry of bibtex into the cache; if
+       missingOnly is true, skip files whose recorded URL is unchanged.
+       Returns list of tuples of key, ftype, url, error."""
+    errors = []
+    for entry in bibtex.entries:
+        key = entry.key
+        for ftype, url in getURLs(entry).items():
+            if missingOnly:
+                cachedURL = getCachedURL(key, ftype)
+                if cachedURL == url:
+                    print >>sys.stderr,"Skipping",url
+                    continue
+                elif cachedURL is not None:
+                    print >>sys.stderr,"URL for %s.%s has changed"%(key,ftype)
+                else:
+                    print >>sys.stderr,"No record for %s.%s"%(key,ftype)
+            try:
+                downloadFile(key, ftype, url)
+                print "Downloaded",url
+            except UIError, err:
+                print >>sys.stderr, str(err)
+                errors.append((key,ftype,url,str(err)))
+            except (IOError, socket.error), err:
+                msg = "Error downloading %s: %s"%(url,str(err))
+                print >>sys.stderr, msg
+                errors.append((key,ftype,url,msg))
+    return errors
+
+bib = BibTeX.parseFile(config.MASTER_BIB)
+downloadAll(bib,missingOnly=1)
diff --git a/writeHTML.py b/writeHTML.py
index 85b1aa3..d7b1ffa 100644
--- a/writeHTML.py
+++ b/writeHTML.py
@@ -5,6 +5,7 @@ import re
 import os
 
 assert sys.version_info[:3] >= (2,2,0)
+os.umask(022)
 
 import BibTeX
 import config
author	Nick Mathewson <nickm@torproject.org>	2003-09-09 00:25:29 +0000
committer	Nick Mathewson <nickm@torproject.org>	2003-09-09 00:25:29 +0000
commit	c666e2e0794ba284b5a0c0f17d54c4a3233ea187 (patch)
tree	498512e8706921fa27af73e5f28d665859b50379
parent	d3acfd8b215920491745f462d08fcd5e963a52ca (diff)
download	anonbib-c666e2e0794ba284b5a0c0f17d54c4a3233ea187.tar.gz