gnunetbib

Bibliography (BibTeX, based on AnonBib)

rank.py (6134B)


# Make rankings of papers and authors for automatic classification of content hotness

# Google Scholar address
# http://scholar.google.com/scholar?as_epq=

# Caching setup: cached Scholar pages are considered fresh for this many seconds
cache_expire = 60*60*24*30 # 30 days

import hashlib
import os
import re
import sys
from datetime import date
from os import listdir, remove
from os.path import exists, isdir, join, getmtime
from time import time
from urllib.request import build_opener
from urllib.parse import quote

import config

def remove_old():
   # Remove all cached files older than cache_expire
   filenames = listdir(cache_folder())
   now = time()
   for f in filenames:
      pf = join(cache_folder(), f)
      time_mt = getmtime(pf)
      if now - time_mt > cache_expire: # 30 days
         remove(pf)

def cache_folder():
   r = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR)
   if not exists(r):
      os.makedirs(r)
   assert isdir(r)
   return r

# A more handy hash: the md5 hex digest of a string
def md5h(s):
   m = hashlib.md5()
   m.update(s.encode('utf-8'))
   return m.hexdigest()
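
# Note (added for clarity): cached Scholar pages live at
# join(cache_folder(), md5h(url)), so md5h gives each URL a stable
# 32-character hex filename; see getPageForTitle below.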

format_tested = 0

def getPageForTitle(title, cache=True, update=True, save=True):
   # Returns a (scholar url, page) tuple; page is None if the result is
   # uncached and updating is disabled.
   global format_tested
   if not format_tested and update:
      format_tested = 1
      TestScholarFormat()

   # Do not assume that the title is clean
   title = re.sub(r"\s+", " ", title)
   title = re.sub(r"[^'a-zA-Z0-9. \-/:]", "", title)
   title = re.sub(r"'/", " ", title)

   # We rely on Google Scholar to return the article with this exact title
   gurl = "http://scholar.google.com/scholar?as_q=&as_epq=%s&as_occt=title"

   url = gurl % quote(title)
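
   # For illustration (example title chosen here, not from the source):
   # "Tor: The Second-Generation Onion Router" produces
   # http://scholar.google.com/scholar?as_q=&as_epq=Tor%3A%20The%20Second-Generation%20Onion%20Router&as_occt=title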

   # Access cache or network
   cache_file = join(cache_folder(), md5h(url))
   if cache and exists(cache_file):
      with open(cache_file, 'r') as f:
         return url, f.read()
   elif update:
      print("Downloading rank for %r." % title)

      # Make a custom user agent (so that we are not filtered by Google)!
      opener = build_opener()
      opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]

      print("connecting...")
      connection = opener.open(url)
      print("reading")
      # urllib returns bytes; decode so the regexes below can run on str
      page = connection.read().decode('utf-8', 'replace')
      print("done")
      if save:
         with open(cache_file, 'w') as f:
            f.write(page)
      return url, page
   else:
      return url, None

def getCite(title, cache=True, update=True, save=True):
   # Returns a (citation-count, scholar url) tuple, or (None, None)
   url, page = getPageForTitle(title, cache=cache, update=update, save=save)
   if not page:
      return None, None

   # Check if it finds any articles
   if len(re.findall("did not match any articles", page)) > 0:
      return (None, None)

   # Kill all tags!
   cpage = re.sub("<[^>]*>", "", page)

   # Add up all citations
   s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
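   # Example with made-up numbers: a results page containing "Cited by 120"
   # and "Cited by 7" gives s == 127.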
   return (s, url)

def getPaperURLs(title, cache=True, update=True, save=True):
   url, page = getPageForTitle(title, cache=cache, update=update, save=save)
   if not page:
      return []
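   # The regex below looks for an &#x25ba; entity ("►") followed by a
   # class=fl link, and captures that link's href: this is the shape of
   # direct document links in the Scholar result pages this code expects.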
   pages = re.findall(r'\&\#x25ba\;.*class=fl href="([^"]*)"', page)
   return pages

def get_rank_html(title, years=None, base_url=".", update=True,
                  velocity=False):
   s, url = getCite(title, update=update)

   # Paper cannot be found
   if s is None:
      return ''

   html = ''

   url = url.replace("&", "&amp;")

   # Hotness: gold above H citations, silver above h
   H, h = 50, 5
   if s >= H:
      html += '<a href="%s"><img src="%s/gold.gif" alt="More than %s citations on Google Scholar" title="More than %s citations on Google Scholar" /></a>' % (url, base_url, H, H)
   elif s >= h:
      html += '<a href="%s"><img src="%s/silver.gif" alt="More than %s citations on Google Scholar" title="More than %s citations on Google Scholar" /></a>' % (url, base_url, h, h)

   # Only include the velocity if asked (it needs the publication year).
   if velocity and years is not None:
      # Velocity: citations per year since publication
      d = date.today().year - int(years)
      if d >= 0:
         if 2 < s / (d + 1) < 10:
            html += '<img src="%s/ups.gif" />' % base_url
         if 10 <= s / (d + 1):
            html += '<img src="%s/upb.gif" />' % base_url
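      # Worked example with made-up numbers: for a paper from 2015 checked
      # in 2023, d = 8; s = 45 gives s/(d+1) = 5.0 and emits ups.gif, while
      # s = 90 gives 10.0 and emits upb.gif instead.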

   return html

def TestScholarFormat():
   # We need to ensure that Google Scholar does not change its page format
   # under our feet. Use two known cases to check that all is well: a real
   # paper that must have citations, and a fictitious one that must have none.
   print("Checking google scholar formats...")
   stopAndGoCites = getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False)[0]
   dragonCites = getCite("Mixes protected by Dragons and Pixies: an empirical study", False, save=False)[0]

   if stopAndGoCites in (0, None):
      print("""OOPS.\n
It looks like Google Scholar changed their URL format or their output format.
I went to count the cites for the Stop-and-Go MIXes paper, and got nothing.""")
      sys.exit(1)

   if dragonCites is not None:
      print("""OOPS.\n
It looks like Google Scholar changed their URL format or their output format.
I went to count the cites for a fictitious paper, and found some.""")
      sys.exit(1)

def urlIsUseless(u):
   if u.find("freehaven.net/anonbib/") >= 0:
      # Our own cache is not the primary citation for anything.
      return True
   elif u.find("owens.mit.edu") >= 0:
      # These citations only work for 'members of the MIT community'.
      return True
   else:
      return False

URLTYPES = [ "pdf", "ps", "txt", "ps_gz", "html" ]

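# Command-line usage, inferred from the argument handling below:
#    python rank.py [suggest] <config-file>
# With "suggest", also print candidate fulltext URLs for entries lacking any.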
if __name__ == '__main__':
   # Load the master bibliography and download missing citation counts.
   import BibTeX
   suggest = False
   if sys.argv[1] == 'suggest':
      suggest = True
      del sys.argv[1]

   config.load(sys.argv[1])
   if config.CACHE_UMASK is not None:
      os.umask(config.CACHE_UMASK)
   bib = BibTeX.parseFile(config.MASTER_BIB)
   remove_old()

   print("Downloading missing ranks.")
   for ent in bib.entries:
      getCite(ent['title'], cache=True, update=True)

   if suggest:
      for ent in bib.entries:
         haveOne = False
         for utype in URLTYPES:
            if "www_%s_url" % utype in ent:
               haveOne = True
               break
         if haveOne:
            continue
         print(ent.key, "has no URLs given.")
         urls = [ u for u in getPaperURLs(ent['title']) if not urlIsUseless(u) ]
         for u in urls:
            print("\t", u)