rank.py (6134B)
# Make rankings of papers and authors for automatic classification of content hotness

# Google Scholar address
# http://scholar.google.com/scholar?as_epq=

# Take care of the caching setup
cache_expire = 60*60*24*30  # 30 days

# Checks
import config
import os
import sys
from os.path import exists, isdir, join, getmtime
from os import listdir, remove

def remove_old():
    # Remove all cached files older than cache_expire
    filenames = listdir(cache_folder())
    from time import time
    now = time()
    for f in filenames:
        pf = join(cache_folder(), f)
        time_mt = getmtime(pf)
        if now - time_mt > cache_expire:  # 30 days
            remove(pf)

def cache_folder():
    r = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR)
    if not exists(r):
        os.makedirs(r)
    assert isdir(r)
    return r

import re
from urllib.request import build_opener
from urllib.parse import quote
from datetime import date
import hashlib

# A more handy hash
def md5h(s):
    m = hashlib.md5()
    m.update(s.encode('utf-8'))
    return m.hexdigest()

format_tested = 0

def getPageForTitle(title, cache=True, update=True, save=True):
    # Returns a (scholar url, page contents) tuple; page is None if the
    # result is neither cached nor downloadable.
    global format_tested
    if not format_tested and update:
        format_tested = 1
        TestScholarFormat()

    # Do not assume that the title is clean
    title = re.sub(r"\s+", " ", title)
    title = re.sub(r"[^'a-zA-Z0-9. \-/:]", "", title)
    title = re.sub(r"'/", " ", title)

    # We rely on google scholar to return the article with this exact title
    gurl = "http://scholar.google.com/scholar?as_q=&as_epq=%s&as_occt=title"

    url = gurl % quote(title)

    # Access cache or network
    if exists(join(cache_folder(), md5h(url))) and cache:
        return url, open(join(cache_folder(), md5h(url)), 'r').read()
    elif update:
        print("Downloading rank for %r." % title)

        # Make a custom user agent (so that we are not filtered by Google)!
        opener = build_opener()
        opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]

        print("connecting...")
        connection = opener.open(url)
        print("reading")
        # Decode to str so the regexes below can run over the page
        page = connection.read().decode('utf-8', 'replace')
        print("done")
        if save:
            open(join(cache_folder(), md5h(url)), 'w').write(page)
        return url, page
    else:
        return url, None

def getCite(title, cache=True, update=True, save=True):
    # Returns a (citation count, scholar url) tuple, or (None, None)
    url, page = getPageForTitle(title, cache=cache, update=update, save=save)
    if not page:
        return None, None

    # Check if it finds any articles
    if len(re.findall("did not match any articles", page)) > 0:
        return (None, None)

    # Kill all tags!
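    # (Stripping tags with a simple regex is enough here: the exact-title
    # query can still return several versions of the same paper, so we just
    # sum every visible "Cited by N" count on the stripped page below.)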
    cpage = re.sub("<[^>]*>", "", page)

    # Add up all citations
    s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
    return (s, url)

def getPaperURLs(title, cache=True, update=True, save=True):
    url, page = getPageForTitle(title, cache=cache, update=update, save=save)
    if not page:
        return []
    pages = re.findall(r'\&\#x25ba\;.*class=fl href="([^"]*)"', page)
    return pages

def get_rank_html(title, years=None, base_url=".", update=True,
                  velocity=False):
    s, url = getCite(title, update=update)

    # Paper cannot be found
    if s is None:
        return ''

    html = ''

    # Escape ampersands so the URL can be embedded in HTML attributes
    url = url.replace("&", "&amp;")

    # Hotness
    H, h = 50, 5
    if s >= H:
        html += '<a href="%s"><img src="%s/gold.gif" alt="More than %s citations on Google Scholar" title="More than %s citations on Google Scholar" /></a>' % (url, base_url, H, H)
    elif s >= h:
        html += '<a href="%s"><img src="%s/silver.gif" alt="More than %s citations on Google Scholar" title="More than %s citations on Google Scholar" /></a>' % (url, base_url, h, h)

    # Only include the velocity if asked.
    if velocity:
        # Velocity: roughly, citations per year since publication
        d = date.today().year - int(years)
        if d >= 0:
            if 2 < s / (d + 1) < 10:
                html += '<img src="%s/ups.gif" />' % base_url
            if 10 <= s / (d + 1):
                html += '<img src="%s/upb.gif" />' % base_url

    return html

def TestScholarFormat():
    # We need to ensure that Google Scholar does not change its page format
    # under our feet. Use some known cases to check that all is good.
    print("Checking google scholar formats...")
    stopAndGoCites = getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False)[0]
    dragonCites = getCite("Mixes protected by Dragons and Pixies: an empirical study", False, save=False)[0]

    if stopAndGoCites in (0, None):
        print("""OOPS.\n
It looks like Google Scholar changed their URL format or their output format.
I went to count the cites for the Stop-and-Go MIXes paper, and got nothing.""")
        sys.exit(1)

    if dragonCites is not None:
        print("""OOPS.\n
It looks like Google Scholar changed their URL format or their output format.
I went to count the cites for a fictitious paper, and found some.""")
        sys.exit(1)

def urlIsUseless(u):
    if u.find("freehaven.net/anonbib/") >= 0:
        # Our own cache is not the primary citation for anything.
        return True
    elif u.find("owens.mit.edu") >= 0:
        # These citations only work for 'members of the MIT community'.
        return True
    else:
        return False

URLTYPES = [ "pdf", "ps", "txt", "ps_gz", "html" ]

if __name__ == '__main__':
    # First download the bibliography file.
    import BibTeX
    suggest = False
    if sys.argv[1] == 'suggest':
        suggest = True
        del sys.argv[1]

    config.load(sys.argv[1])
    if config.CACHE_UMASK is not None:
        os.umask(config.CACHE_UMASK)
    bib = BibTeX.parseFile(config.MASTER_BIB)
    remove_old()

    print("Downloading missing ranks.")
    for ent in bib.entries:
        getCite(ent['title'], cache=True, update=True)

    if suggest:
        for ent in bib.entries:
            haveOne = False
            for utype in URLTYPES:
                if "www_%s_url" % utype in ent:
                    haveOne = True
                    break
            if haveOne:
                continue
            print(ent.key, "has no URLs given.")
            urls = [ u for u in getPaperURLs(ent['title']) if not urlIsUseless(u) ]
            for u in urls:
                print("\t", u)
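
# --- Usage sketch (illustrative, not part of the original interface) --------
# A minimal example of driving this module from another script, assuming an
# anonbib-style config file; the filename "anonbib.cfg" is hypothetical:
#
#     import config, rank
#     config.load("anonbib.cfg")
#     # Look only in the local cache; pass update=True to hit the network.
#     count, url = rank.getCite("Tor: The Second-Generation Onion Router",
#                               cache=True, update=False)
#     print(count, url)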