gnunetbib

Bibliography (BibTeX, based on AnonBib)

updateCache.py (5244B)


#!/usr/bin/python2
# Copyright 2003-2008, Nick Mathewson.  See LICENSE for licensing info.

"""Download the files listed in the bibliography into a local cache."""

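# Typical invocation (the configuration filename below is only an example;
# pass whichever config file defines MASTER_BIB, OUTPUT_DIR, and friends):
#
#   python2 updateCache.py anonbib.cfg
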
import os
import sys
import ssl
import signal
import time
import gzip
import urllib2
import getopt
import socket
import errno
import httplib

# Local modules shipped alongside this script.
import BibTeX
import config

# File types we know how to cache; the binary types are written in binary mode.
FILE_TYPES = ["txt", "html", "pdf", "ps", "ps.gz", "abstract"]
BIN_FILE_TYPES = ['pdf', 'ps.gz']

class UIError(Exception):
    """An error that is reported to the user instead of as a traceback."""

def tryUnlink(fn):
    """Remove the file fn, ignoring the error if it does not exist."""
    try:
        os.unlink(fn)
    except OSError:
        pass

def getCacheFname(key, ftype, section):
    """Return the cache filename for a given entry key, file type, and section."""
    return BibTeX.smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
                            section,
                            "%s.%s"%(key, ftype))

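# For example (purely illustrative values), with OUTPUT_DIR = "www" and
# CACHE_DIR = "cache", the PDF for an entry keyed "example05" in section
# "papers" would be cached as www/cache/papers/example05.pdf, next to
# www/cache/papers/example05.pdf.url recording when and from where it was
# fetched.
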
def downloadFile(key, ftype, section, url, timeout=None):
    """Download url into the cache as the ftype file for the entry key,
       and record the download time and source URL in a matching .url file."""
    if timeout is None:
        timeout = config.DOWNLOAD_CONNECT_TIMEOUT
    fname = getCacheFname(key, ftype, section)
    parent = os.path.split(fname)[0]
    if not os.path.exists(parent):
        os.makedirs(parent)

    # Download into a temporary file first, so an interrupted transfer
    # never leaves a truncated file in the cache.
    fnameTmp = fname+".tmp"
    fnameURL = fname+".url"
    tryUnlink(fnameTmp)

    # Use SIGALRM to bound how long the connection attempt may take: the
    # no-op handler makes a blocked urlopen() call fail with EINTR instead
    # of killing the process.
    def sigalrmHandler(sig, _):
        pass
    signal.signal(signal.SIGALRM, sigalrmHandler)
    signal.alarm(timeout)
    try:
        try:
            infile = urllib2.urlopen(url)
        except httplib.InvalidURL, e:
            raise UIError("Invalid URL %s: %s"%(url, e))
        except IOError, e:
            raise UIError("Cannot connect to url %s: %s"%(url, e))
        except socket.error, e:
            if getattr(e, "errno", -1) == errno.EINTR:
                raise UIError("Connection timed out to url %s"%url)
            else:
                raise UIError("Error connecting to %s: %s"%(url, e))
    finally:
        signal.alarm(0)

    # Copy the document to disk in 64 KiB chunks, using binary mode for
    # binary file types.
    mode = 'w'
    if ftype in BIN_FILE_TYPES:
        mode = 'wb'
    outfile = open(fnameTmp, mode)
    try:
        while 1:
            s = infile.read(1<<16)
            if not s: break
            outfile.write(s)
    finally:
        infile.close()
        outfile.close()

    # Record when and from where the file was fetched.
    urlfile = open(fnameURL, 'w')
    print >>urlfile, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    if "\n" in url: url = url.replace("\n", " ")
    print >>urlfile, url
    urlfile.close()

    # Atomically move the finished download into place.
    os.rename(fnameTmp, fname)

def getURLs(entry):
    """Return a map from file type to URL for every www_*_url field set
       on a BibTeX entry."""
    r = {}
    for ftype in FILE_TYPES:
        # Field names use "_" where the file type has "." (e.g. www_ps_gz_url).
        ftype2 = ftype.replace(".", "_")
        url = entry.get("www_%s_url"%ftype2)
        if url:
            r[ftype] = url.strip().replace("\n", " ")
    return r

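# For reference, a hypothetical entry carrying these fields (the key, title,
# and URLs are invented for illustration):
#
#   @InProceedings{example05,
#     title = {An Example Paper},
#     www_pdf_url = {https://example.org/example05.pdf},
#     www_ps_gz_url = {https://example.org/example05.ps.gz},
#     www_cache_section = {papers},
#   }
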
def getCachedURL(key, ftype, section):
    """Return the URL from which the cached copy of key.ftype was
       downloaded, or None if there is no usable cached copy."""
    fname = getCacheFname(key, ftype, section)
    urlFname = fname+".url"
    if not os.path.exists(fname) or not os.path.exists(urlFname):
        return None
    f = open(urlFname, 'r')
    lines = f.readlines()
    f.close()
    if len(lines) != 2:
        print >>sys.stderr, "ERROR: unexpected number of lines in", urlFname
        return None
    return lines[1].strip()

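# A well-formed .url file holds exactly two lines, the download timestamp
# and the source URL.  For example (values invented):
#
#   2008-06-01 12:34:56
#   https://example.org/example05.pdf
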
def downloadAll(bibtex, missingOnly=0):
    """Download every URL listed in the bibliography.  If missingOnly is
       true, skip files whose cached copy is already up to date.  Return a
       list of (key, ftype, url, error) tuples for the failed downloads."""
    errors = []
    for e in bibtex.entries:
        urls = getURLs(e)
        key = e.key
        section = e.get("www_cache_section", ".")
        for ftype, url in urls.items():
            if missingOnly:
                cachedURL = getCachedURL(key, ftype, section)
                if cachedURL == url:
                    print >>sys.stderr, "Skipping", url
                    continue
                elif cachedURL is not None:
                    print >>sys.stderr, "URL for %s.%s has changed"%(key, ftype)
                else:
                    print >>sys.stderr, "I have no copy of %s.%s"%(key, ftype)
            try:
                downloadFile(key, ftype, section, url)
                print "Downloaded", url
            except UIError, err:
                print >>sys.stderr, str(err)
                errors.append((key, ftype, url, str(err)))
            except (IOError, socket.error, ssl.CertificateError), err:
                msg = "Error downloading %s: %s"%(url, str(err))
                print >>sys.stderr, msg
                errors.append((key, ftype, url, msg))
        if urls.has_key("ps") and not urls.has_key("ps.gz"):
            # The entry has a .ps but no .ps.gz; we'd like a gzipped copy locally.
            psFname = getCacheFname(key, "ps", section)
            psGzFname = getCacheFname(key, "ps.gz", section)
            if os.path.exists(psFname) and not os.path.exists(psGzFname):
                # We have the .ps cached but haven't gzipped it yet.
                print "Compressing a copy of", psFname
                outf = gzip.GzipFile(psGzFname, "wb")
                inf = open(psFname, "rb")
                while 1:
                    s = inf.read(4096)
                    if not s:
                        break
                    outf.write(s)
                outf.close()
                inf.close()

    return errors

if __name__ == '__main__':
    if len(sys.argv) == 2:
        print "Loading from %s"%sys.argv[1]
    else:
        print >>sys.stderr, "Expected a single configuration file as an argument"
        sys.exit(1)
    config.load(sys.argv[1])

    if config.CACHE_UMASK is not None:
        os.umask(config.CACHE_UMASK)

    bib = BibTeX.parseFile(config.MASTER_BIB)
    downloadAll(bib, missingOnly=1)
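
# The configuration file loaded above must define the names this script
# uses.  A minimal sketch, with every value illustrative rather than
# canonical:
#
#   MASTER_BIB = "anonbib.bib"
#   OUTPUT_DIR = "www"
#   CACHE_DIR = "cache"
#   DOWNLOAD_CONNECT_TIMEOUT = 15
#   CACHE_UMASK = 022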