updateCache.py (5244B)
1 #!/usr/bin/python2 2 # Copyright 2003-2008, Nick Mathewson. See LICENSE for licensing info. 3 4 """Download files in bibliography into a local cache. 5 """ 6 7 import os 8 import sys 9 import ssl 10 import signal 11 import time 12 import gzip 13 import urllib2 14 import getopt 15 import socket 16 import errno 17 import httplib 18 import BibTeX 19 import config 20 21 22 FILE_TYPES = ["txt", "html", "pdf", "ps", "ps.gz", "abstract"] 23 BIN_FILE_TYPES = ['pdf', 'ps.gz'] 24 25 class UIError(Exception): 26 pass 27 28 def tryUnlink(fn): 29 try: 30 os.unlink(fn) 31 except OSError: 32 pass 33 34 def getCacheFname(key, ftype, section): 35 return BibTeX.smartJoin(config.OUTPUT_DIR, config.CACHE_DIR, 36 section, 37 "%s.%s"%(key, ftype)) 38 39 def downloadFile(key, ftype, section, url, timeout=None): 40 if timeout is None: 41 timeout = config.DOWNLOAD_CONNECT_TIMEOUT 42 fname = getCacheFname(key, ftype, section) 43 parent = os.path.split(fname)[0] 44 if not os.path.exists(parent): 45 os.makedirs(parent) 46 47 fnameTmp = fname+".tmp" 48 fnameURL = fname+".url" 49 tryUnlink(fnameTmp) 50 51 def sigalrmHandler(sig, _): 52 pass 53 signal.signal(signal.SIGALRM, sigalrmHandler) 54 signal.alarm(timeout) 55 try: 56 try: 57 infile = urllib2.urlopen(url) 58 except httplib.InvalidURL, e: 59 raise UIError("Invalid URL %s: %s"%(url, e)) 60 except IOError, e: 61 raise UIError("Cannot connect to url %s: %s"%(url, e)) 62 except socket.error, e: 63 if getattr(e, "errno", -1) == errno.EINTR: 64 raise UIError("Connection timed out to url %s"%url) 65 else: 66 raise UIError("Error connecting to %s: %s"%(url, e)) 67 finally: 68 signal.alarm(0) 69 70 mode = 'w' 71 if ftype in BIN_FILE_TYPES: 72 mode = 'wb' 73 outfile = open(fnameTmp, mode) 74 try: 75 while 1: 76 s = infile.read(1<<16) 77 if not s: break 78 outfile.write(s) 79 finally: 80 infile.close() 81 outfile.close() 82 83 urlfile = open(fnameURL, 'w') 84 print >>urlfile, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 85 if "\n" in url: url = url.replace("\n", " ") 86 print >>urlfile, url 87 urlfile.close() 88 89 os.rename(fnameTmp, fname) 90 91 def getURLs(entry): 92 r = {} 93 for ftype in FILE_TYPES: 94 ftype2 = ftype.replace(".", "_") 95 url = entry.get("www_%s_url"%ftype2) 96 if url: 97 r[ftype] = url.strip().replace("\n", " ") 98 return r 99 100 def getCachedURL(key, ftype, section): 101 fname = getCacheFname(key, ftype, section) 102 urlFname = fname+".url" 103 if not os.path.exists(fname) or not os.path.exists(urlFname): 104 return None 105 f = open(urlFname, 'r') 106 lines = f.readlines() 107 f.close() 108 if len(lines) != 2: 109 print >>sys.stderr, "ERROR: unexpected number of lines in", urlFname 110 return lines[1].strip() 111 112 def downloadAll(bibtex, missingOnly=0): 113 """returns list of tuples of key, ftype, url, error""" 114 errors = [] 115 for e in bibtex.entries: 116 urls = getURLs(e) 117 key = e.key 118 section = e.get("www_cache_section", ".") 119 for ftype, url in urls.items(): 120 if missingOnly: 121 cachedURL = getCachedURL(key, ftype, section) 122 if cachedURL == url: 123 print >>sys.stderr, "Skipping", url 124 continue 125 elif cachedURL is not None: 126 print >>sys.stderr, "URL for %s.%s has changed"%(key, ftype) 127 else: 128 print >>sys.stderr, "I have no copy of %s.%s"%(key, ftype) 129 try: 130 downloadFile(key, ftype, section, url) 131 print "Downloaded", url 132 except UIError, e: 133 print >>sys.stderr, str(e) 134 errors.append((key, ftype, url, str(e))) 135 except (IOError, socket.error, ssl.CertificateError), e: 136 msg = "Error downloading %s: %s"%(url, str(e)) 137 print >>sys.stderr, msg 138 errors.append((key, ftype, url, msg)) 139 if urls.has_key("ps") and not urls.has_key("ps.gz"): 140 # Say, this is something we'd like to have gzipped locally. 141 psFname = getCacheFname(key, "ps", section) 142 psGzFname = getCacheFname(key, "ps.gz", section) 143 if os.path.exists(psFname) and not os.path.exists(psGzFname): 144 # This is something we haven't gzipped yet. 145 print "Compressing a copy of", psFname 146 outf = gzip.GzipFile(psGzFname, "wb") 147 inf = open(psFname, "rb") 148 while 1: 149 s = inf.read(4096) 150 if not s: 151 break 152 outf.write(s) 153 outf.close() 154 inf.close() 155 156 return errors 157 158 if __name__ == '__main__': 159 if len(sys.argv) == 2: 160 print "Loading from %s"%sys.argv[1] 161 else: 162 print >>sys.stderr, "Expected a single configuration file as an argument" 163 sys.exit(1) 164 config.load(sys.argv[1]) 165 166 if config.CACHE_UMASK != None: 167 os.umask(config.CACHE_UMASK) 168 169 bib = BibTeX.parseFile(config.MASTER_BIB) 170 downloadAll(bib, missingOnly=1)