author     Nick Mathewson <nickm@torproject.org>   2007-07-10 18:42:56 +0000
committer  Nick Mathewson <nickm@torproject.org>   2007-07-10 18:42:56 +0000
commit     d1d3a099f2da7611439e7f57569ff7493c41e0b4 (patch)
tree       7f9190bd59bc9b6554dbe49b0a65b2f6d470de7e
parent     c933c52d4520527b93f2ae0c8534fa684e2a9fb9 (diff)
download   gnunetbib-d1d3a099f2da7611439e7f57569ff7493c41e0b4.tar.gz
           gnunetbib-d1d3a099f2da7611439e7f57569ff7493c41e0b4.zip
r13683@catbus: nickm | 2007-07-10 14:42:53 -0400
Patch from George to add citation-rank-based icons to anonbib output. Hacked up a bit so that "cache" and "generate page" are separate steps, so that the image URLs are no longer hardwired to ~george, and so that output locations are configurable.

svn:r234
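
For orientation (not part of the patch itself): the commit splits the work into a "cache" step and a "generate page" step. Below is a rough Python sketch of that flow, using only names that appear in the changed files; the way the driver scripts wire these calls together is an assumption.

# Sketch only: how the caching and page-generation steps are expected to fit
# together after this commit. Names come from the changed files; the driver
# invocation shown here is an assumption.
import config, rank, BibTeX

config.load("anonbib.cfg")
bib = BibTeX.parseFile(config.MASTER_BIB)

# Step 1 ("cache"): rank.py fetches citation counts from Google Scholar and
# stores the result pages under CITE_CACHE_DIR; the Makefile's "update"
# target now runs this alongside updateCache.py.
for ent in bib.entries:
    rank.getCite(ent['title'], cache=True, update=True)

# Step 2 ("generate page"): writeHTML.py renders entries offline. to_html()
# now takes base_url, so the icon URLs (gold.gif, silver.gif, ...) are no
# longer hardwired, and get_rank_html() is called with update=False so page
# generation never hits the network.
html = bib.entries[0].to_html(cache_path="./cache", base_url=".")
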
-rw-r--r--  BibTeX.py     |  14
-rw-r--r--  Makefile      |   1
-rw-r--r--  anonbib.cfg   |   3
-rw-r--r--  config.py     |   1
-rw-r--r--  gold.gif      | bin 0 -> 540 bytes
-rw-r--r--  rank.py       | 128
-rw-r--r--  silver.gif    | bin 0 -> 539 bytes
-rw-r--r--  upb.gif       | bin 0 -> 555 bytes
-rw-r--r--  ups.gif       | bin 0 -> 536 bytes
-rwxr-xr-x  writeHTML.py  |   7
10 files changed, 149 insertions, 5 deletions
diff --git a/BibTeX.py b/BibTeX.py
index 1b26d72..2181ba6 100644
--- a/BibTeX.py
+++ b/BibTeX.py
@@ -13,6 +13,8 @@ import os
 
 import config
 
+import rank
+
 __all__ = [ 'ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize',
             'ParsedAuthor', 'FileIter', 'Parser', 'parseFile',
             'splitBibTeXEntriesBy', 'sortBibTexEntriesBy', ]
@@ -400,7 +402,7 @@ class BibTeXEntry:
         return errs
 
     def biblio_to_html(self):
-        """Return the HTML for the citatation portion of entry."""
+        """Return the HTML for the citation portion of entry."""
         if self.type == 'inproceedings':
             booktitle = self['booktitle']
             bookurl = self.get('bookurl')
@@ -496,7 +498,7 @@ class BibTeXEntry:
                        "</span>") %bibtexurl)
         return htmlize("".join(res))
 
-    def to_html(self, cache_path="./cache"):
+    def to_html(self, cache_path="./cache", base_url="."):
        """Return the HTML for this entry."""
         imp = self.isImportant()
         draft = self.get('year') == 'forthcoming'
@@ -507,6 +509,14 @@ class BibTeXEntry:
         else:
             res = ["<li><p class='entry'>"]
 
+        if imp or not draft:
+            # Add a picture of the rank
+            # Only if year is known or paper important!
+            r = rank.get_rank_html(self['title'], self.get('year'),
+                                   update=False, base_url=base_url)
+            if r is not None:
+                res.append(r)
+
         res.append("<span class='title'><a name='%s'>%s</a></span>"%(
             url_untranslate(self.key),htmlize(self['title'])))
 
diff --git a/Makefile b/Makefile
index e1ef106..378bf75 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,7 @@ clean:
 
 update:
 	$(PYTHON) updateCache.py anonbib.cfg
+	$(PYTHON) rank.py anonbib.cfg
 
 test:
 	$(PYTHON) test.py
diff --git a/anonbib.cfg b/anonbib.cfg
index 0ff6638..c1d9aa5 100644
--- a/anonbib.cfg
+++ b/anonbib.cfg
@@ -9,6 +9,9 @@ OUTPUT_DIR = "."
 # Where do we put cached papers (relative to OUTPUT_DIR)
 CACHE_DIR = "cache"
 
+# Where do we cache citation counts for papers (relative to OUTPUT_DIR)
+CITE_CACHE_DIR = "cite_cache"
+
 # Are there subsections for cached papers? This is useful for putting
 # different Apache permission on different directories.
 CACHE_SECTIONS = [ ]
diff --git a/config.py b/config.py
index 6a25731..175a3d5 100644
--- a/config.py
+++ b/config.py
@@ -4,6 +4,7 @@ import re
 
 _KEYS = [ "ALL_TAGS",
           "ALPHABETIZE_AUTHOR_AS","AUTHOR_URLS","CACHE_DIR","CACHE_SECTIONS",
+          "CITE_CACHE_DIR",
           "COLLAPSE_AUTHORS",
           "DOWNLOAD_CONNECT_TIMEOUT","INITIAL_STRINGS",
           "MASTER_BIB", "NO_COLLAPSE_AUTHORS", "OMIT_ENTRIES",
diff --git a/gold.gif b/gold.gif
new file mode 100644
index 0000000..44505db
--- /dev/null
+++ b/gold.gif
Binary files differ
diff --git a/rank.py b/rank.py
new file mode 100644
index 0000000..81592c2
--- /dev/null
+++ b/rank.py
@@ -0,0 +1,128 @@
+# Make rankings of papers and authors for automatic classification of content hotness
+
+# Google Scholar address
+# http://scholar.google.com/scholar?as_epq=
+
+# Take care of the caching setup
+cache_expire = 60*60*24*30 # 30 days
+
+# Checks
+import config
+import os
+import sys
+from os.path import exists, isdir, join, getmtime
+from os import listdir, remove
+
+def remove_old():
+    # Remove all old cached files
+    filenames = listdir(cache_folder())
+    from time import time
+    now = time()
+    for f in filenames:
+        pf = join(cache_folder(), f)
+        time_mt = getmtime(pf)
+        if now - time_mt > cache_expire: # 30 days
+            remove(pf)
+
+def cache_folder():
+    r = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR)
+    if not exists(r):
+        os.makedirs(r)
+    assert isdir(r)
+    return r
+
+import md5
+import re
+from urllib2 import urlopen, build_opener
+from datetime import date
+
+# A more handy hash
+def md5h(s):
+    m = md5.new()
+    m.update(s)
+    return m.digest().encode('hex_codec')
+
+format_tested = 0
+
+def getCite(title, cache=True, update=True):
+    global format_tested
+    if not format_tested and update:
+        format_tested = 1
+        TestScholarFormat()
+
+    # Do not assume that the title is clean
+    title = re.sub("\s+", " ", title)
+    title = re.sub("[^'a-zA-Z0-9\. \-\/:]", "", title)
+    title = re.sub("'\/", " ", title)
+
+    # Make a custom user agent (so that we are not filtered by Google)!
+    opener = build_opener()
+    opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]
+
+    # We rely on google scholar to return the article with this exact title
+    gurl = "http://scholar.google.com/scholar?as_epq=%s&as_occt=title"
+    from urllib import quote
+    url = gurl % quote(title)
+
+    # Access cache or network
+    if exists(join(cache_folder(), md5h(url))) and cache:
+        page = file(join(cache_folder(), md5h(url)),'r').read()
+    elif update:
+        print "Downloading rank for %r."%title
+        page = opener.open(url).read()
+        file(join(cache_folder(), md5h(url)),'w').write(page)
+    else:
+        return None
+
+    # Check if it finds any articles
+    if len(re.findall("did not match any articles", page)) > 0:
+        return None
+
+    # Kill all tags!
+    cpage = re.sub("<[^>]*>", "", page)
+
+    # Add up all citations
+    s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
+    return s
+
+def get_rank_html(title, years=None, base_url=".", update=True):
+    s = getCite(title, update=update)
+
+    # Paper cannot be found
+    if s is None:
+        return ''
+
+    html = ''
+
+    # Hotness
+    if s >= 50:
+        html += '<img src="%s/gold.gif" />' % base_url
+    elif s >= 5:
+        html += '<img src="%s/silver.gif" />' % base_url
+
+    # Velocity
+    d = date.today().year - int(years)
+    if d >= 0:
+        if 2 < s / (d +1) < 10:
+            html += '<img src="%s/ups.gif" />' % base_url
+        if 10 <= s / (d +1):
+            html += '<img src="%s/upb.gif" />' % base_url
+
+    return html
+
+def TestScholarFormat():
+    # We need to ensure that Google Scholar does not change its page format under our feet
+    # Use some cases to check if all is good
+    assert(getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False) > 0)
+    assert(getCite("Mixes protected by Dragons and Pixies: an empirical study", False) == None)
+
+if __name__ == '__main__':
+    # First download the bibliography file.
+    import BibTeX
+    config.load(sys.argv[1])
+    bib = BibTeX.parseFile(config.MASTER_BIB)
+    remove_old()
+    print "Downloading missing ranks."
+    for ent in bib.entries:
+        getCite(ent['title'], cache=True, update=True)
+
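
As a side note (not part of the patch): the thresholds in get_rank_html() above decide which icons an entry gets. The illustrative helper below restates them; the function name pick_icons is hypothetical and the division is plain float division rather than the Python 2 integer division used in the patch.

from datetime import date

def pick_icons(citations, year):
    # Illustrative restatement of get_rank_html()'s thresholds; not in the patch.
    icons = []
    # "Hotness": gold at 50 or more citations, silver at 5 or more.
    if citations >= 50:
        icons.append("gold.gif")
    elif citations >= 5:
        icons.append("silver.gif")
    # "Velocity": citations per year since publication.
    age = date.today().year - year
    if age >= 0:
        per_year = citations / (age + 1)
        if 2 < per_year < 10:
            icons.append("ups.gif")
        if per_year >= 10:
            icons.append("upb.gif")
    return icons

# Example: a paper from 3 years ago with 60 citations gets gold.gif and upb.gif.
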
diff --git a/silver.gif b/silver.gif
new file mode 100644
index 0000000..8a4ff29
--- /dev/null
+++ b/silver.gif
Binary files differ
diff --git a/upb.gif b/upb.gif
new file mode 100644
index 0000000..5852828
--- /dev/null
+++ b/upb.gif
Binary files differ
diff --git a/ups.gif b/ups.gif
new file mode 100644
index 0000000..36f0124
--- /dev/null
+++ b/ups.gif
Binary files differ
diff --git a/writeHTML.py b/writeHTML.py
index 3184ef0..934b46b 100755
--- a/writeHTML.py
+++ b/writeHTML.py
@@ -29,7 +29,7 @@ def pathLength(s):
         s = parent
     return n
 
-def writeBody(f, sections, section_urls, cache_path):
+def writeBody(f, sections, section_urls, cache_path, base_url):
     '''f: an open file
        sections: list of (sectionname, [list of BibTeXEntry])
        section_urls: map from sectionname to external url'''
@@ -45,7 +45,7 @@ def writeBody(f, sections, section_urls, cache_path):
                            BibTeX.url_untranslate(s),sDisp))
         print >>f, "<ul class='expand'>"
         for e in entries:
-            print >>f, e.to_html(cache_path=cache_path)
+            print >>f, e.to_html(cache_path=cache_path, base_url=base_url)
         print >>f, "</ul></li>"
 
 def writeHTML(f, sections, sectionType, fieldName, choices,
@@ -104,7 +104,8 @@ def writeHTML(f, sections, sectionType, fieldName, choices,
 
     header, footer = getTemplate(config.TEMPLATE_FILE)
     print >>f, header%fields
-    writeBody(f, sections, section_urls, cache_path=cache_url_path)
+    writeBody(f, sections, section_urls, cache_path=cache_url_path,
+              base_url=root)
     print >>f, footer%fields
 
 def writePageSet(config, bib, tag):