diff options
Diffstat (limited to 'updateCache.py')
| -rw-r--r-- | updateCache.py | 132 |
1 file changed, 132 insertions, 0 deletions
diff --git a/updateCache.py b/updateCache.py new file mode 100644 index 0000000..5edda3d --- /dev/null +++ b/updateCache.py @@ -0,0 +1,132 @@ +#!/usr/bin/python2 + +"""Download files in bibliography into a local cache, in order to +""" + +import os +import sys +import signal +import time + +import BibTeX +import config +import urllib2 +import getopt +import socket +import errno + +os.umask(022) + +FILE_TYPES = [ "txt", "html", "pdf", "ps", "ps.gz", "abstract" ] +BIN_FILE_TYPES = [ 'pdf', 'ps.gz' ] + +class UIError(Exception): + pass + +def tryUnlink(fn): + try: + os.unlink(fn) + except OSError: + pass + +def getCacheFname(key, ftype): + return os.path.join(config.OUTPUT_DIR,config.CACHE_DIR, + "%s.%s"%(key,ftype)) + + +def downloadFile(key, ftype, url, timeout=config.DOWNLOAD_CONNECT_TIMEOUT): + fname = getCacheFname(key, ftype) + fnameTmp = fname+".tmp" + fnameURL = fname+".url" + tryUnlink(fnameTmp) + + def sigalrmHandler(sig,_): + pass + signal.signal(signal.SIGALRM, sigalrmHandler) + signal.alarm(timeout) + try: + try: + infile = urllib2.urlopen(url) + except IOError, e: + raise UIError("Cannot connect to url %s: %s"%(url,e)) + except socket.error, e: + if getattr(e,"errno",-1) == errno.EINTR: + raise UIError("Connection timed out to url %s"%url) + else: + raise UIError("Error connecting to %s: %s"%(url, e)) + finally: + signal.alarm(0) + + mode = 'w' + if ftype in BIN_FILE_TYPES: + mode = 'wb' + outfile = open(fnameTmp, mode) + try: + while 1: + s = infile.read(1<<16) + if not s: break + outfile.write(s) + finally: + infile.close() + outfile.close() + + urlfile = open(fnameURL, 'w') + print >>urlfile, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + if "\n" in url: url = url.replace("\n", " ") + print >>urlfile, url + urlfile.close() + + os.rename(fnameTmp, fname) + +def getURLs(entry): + r = {} + for ftype in FILE_TYPES: + ftype2 = ftype.replace(".", "_") + url = entry.get("www_%s_url"%ftype2) + if url: + r[ftype] = url.strip().replace("\n", " ") + 
return r + +def getCachedURL(key, ftype): + fname = getCacheFname(key, ftype) + urlFname = fname+".url" + if not os.path.exists(fname) or not os.path.exists(urlFname): + return None + f = open(urlFname, 'r') + lines = f.readlines() + f.close() + if len(lines) != 2: + print >>sys.stderr, "ERROR: unexpected number of lines in", urlFname + return lines[1].strip() + +def downloadAll(bibtex, missingOnly=0): + """returns list of tuples of key, ftype, url, error""" + errors = [] + for e in bibtex.entries: + urls = getURLs(e) + key = e.key + for ftype, url in urls.items(): + fname = getCacheFname(key, ftype) + if missingOnly: + cachedURL = getCachedURL(key, ftype) + if cachedURL == url: + print >>sys.stderr,"Skipping",url + continue + elif cachedURL is not None: + print >>sys.stderr,"URL for %s.%s has changed"%(key,ftype) + else: + print >>sys.stderr,"No record for %s.%s"%(key,ftype) + try: + downloadFile(key, ftype, url) + print "Downloaded",url + except UIError, e: + print >>sys,stderr, str(e) + errors.append((key,ftype,url,str(e))) + except (IOError, socket.error), e: + msg = "Error downloading %s: %s"%(url,str(e)) + print >>sys.stderr, msg + errors.append((key,ftype,url,msg)) + return errors + +bib = BibTeX.parseFile(config.MASTER_BIB) +downloadAll(bib,missingOnly=1) |
