diff options
author | Nick Mathewson <nickm@torproject.org> | 2007-07-10 18:42:56 +0000 |
---|---|---|
committer | Nick Mathewson <nickm@torproject.org> | 2007-07-10 18:42:56 +0000 |
commit | d1d3a099f2da7611439e7f57569ff7493c41e0b4 (patch) | |
tree | 7f9190bd59bc9b6554dbe49b0a65b2f6d470de7e | |
parent | c933c52d4520527b93f2ae0c8534fa684e2a9fb9 (diff) | |
download | gnunetbib-d1d3a099f2da7611439e7f57569ff7493c41e0b4.tar.gz gnunetbib-d1d3a099f2da7611439e7f57569ff7493c41e0b4.zip |
r13683@catbus: nickm | 2007-07-10 14:42:53 -0400
Patch from George to add citation-rank-based icons to anonbib output. Hacked up a bit so that "cache" and "generate page" are separate, so that the image urls are no longer hardwired to ~george, so output locations are configurable, etc.
svn:r234
-rw-r--r-- | BibTeX.py | 14 | ||||
-rw-r--r-- | Makefile | 1 | ||||
-rw-r--r-- | anonbib.cfg | 3 | ||||
-rw-r--r-- | config.py | 1 | ||||
-rw-r--r-- | gold.gif | bin | 0 -> 540 bytes | |||
-rw-r--r-- | rank.py | 128 | ||||
-rw-r--r-- | silver.gif | bin | 0 -> 539 bytes | |||
-rw-r--r-- | upb.gif | bin | 0 -> 555 bytes | |||
-rw-r--r-- | ups.gif | bin | 0 -> 536 bytes | |||
-rwxr-xr-x | writeHTML.py | 7 |
10 files changed, 149 insertions(+), 5 deletions(-)
@@ -13,6 +13,8 @@ import os | |||
13 | 13 | ||
14 | import config | 14 | import config |
15 | 15 | ||
16 | import rank | ||
17 | |||
16 | __all__ = [ 'ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize', | 18 | __all__ = [ 'ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize', |
17 | 'ParsedAuthor', 'FileIter', 'Parser', 'parseFile', | 19 | 'ParsedAuthor', 'FileIter', 'Parser', 'parseFile', |
18 | 'splitBibTeXEntriesBy', 'sortBibTexEntriesBy', ] | 20 | 'splitBibTeXEntriesBy', 'sortBibTexEntriesBy', ] |
@@ -400,7 +402,7 @@ class BibTeXEntry: | |||
400 | return errs | 402 | return errs |
401 | 403 | ||
402 | def biblio_to_html(self): | 404 | def biblio_to_html(self): |
403 | """Return the HTML for the citatation portion of entry.""" | 405 | """Return the HTML for the citation portion of entry.""" |
404 | if self.type == 'inproceedings': | 406 | if self.type == 'inproceedings': |
405 | booktitle = self['booktitle'] | 407 | booktitle = self['booktitle'] |
406 | bookurl = self.get('bookurl') | 408 | bookurl = self.get('bookurl') |
@@ -496,7 +498,7 @@ class BibTeXEntry: | |||
496 | "</span>") %bibtexurl) | 498 | "</span>") %bibtexurl) |
497 | return htmlize("".join(res)) | 499 | return htmlize("".join(res)) |
498 | 500 | ||
499 | def to_html(self, cache_path="./cache"): | 501 | def to_html(self, cache_path="./cache", base_url="."): |
500 | """Return the HTML for this entry.""" | 502 | """Return the HTML for this entry.""" |
501 | imp = self.isImportant() | 503 | imp = self.isImportant() |
502 | draft = self.get('year') == 'forthcoming' | 504 | draft = self.get('year') == 'forthcoming' |
@@ -507,6 +509,14 @@ class BibTeXEntry: | |||
507 | else: | 509 | else: |
508 | res = ["<li><p class='entry'>"] | 510 | res = ["<li><p class='entry'>"] |
509 | 511 | ||
512 | if imp or not draft: | ||
513 | # Add a picture of the rank | ||
514 | # Only if year is known or paper important! | ||
515 | r = rank.get_rank_html(self['title'], self.get('year'), | ||
516 | update=False, base_url=base_url) | ||
517 | if r is not None: | ||
518 | res.append(r) | ||
519 | |||
510 | res.append("<span class='title'><a name='%s'>%s</a></span>"%( | 520 | res.append("<span class='title'><a name='%s'>%s</a></span>"%( |
511 | url_untranslate(self.key),htmlize(self['title']))) | 521 | url_untranslate(self.key),htmlize(self['title']))) |
512 | 522 | ||
@@ -9,6 +9,7 @@ clean: | |||
9 | 9 | ||
10 | update: | 10 | update: |
11 | $(PYTHON) updateCache.py anonbib.cfg | 11 | $(PYTHON) updateCache.py anonbib.cfg |
12 | $(PYTHON) rank.py anonbib.cfg | ||
12 | 13 | ||
13 | test: | 14 | test: |
14 | $(PYTHON) test.py | 15 | $(PYTHON) test.py |
diff --git a/anonbib.cfg b/anonbib.cfg index 0ff6638..c1d9aa5 100644 --- a/anonbib.cfg +++ b/anonbib.cfg | |||
@@ -9,6 +9,9 @@ OUTPUT_DIR = "." | |||
9 | # Where do we put cached papers (relative to OUTPUT_DIR) | 9 | # Where do we put cached papers (relative to OUTPUT_DIR) |
10 | CACHE_DIR = "cache" | 10 | CACHE_DIR = "cache" |
11 | 11 | ||
12 | # Where do we cache citations papers (relative to OUTPUT_DIR) | ||
13 | CITE_CACHE_DIR = "cite_cache" | ||
14 | |||
12 | # Are there subsections for cached papers? This is useful for putting | 15 | # Are there subsections for cached papers? This is useful for putting |
13 | # different Apache permission on different directories. | 16 | # different Apache permission on different directories. |
14 | CACHE_SECTIONS = [ ] | 17 | CACHE_SECTIONS = [ ] |
@@ -4,6 +4,7 @@ import re | |||
4 | 4 | ||
5 | _KEYS = [ "ALL_TAGS", | 5 | _KEYS = [ "ALL_TAGS", |
6 | "ALPHABETIZE_AUTHOR_AS","AUTHOR_URLS","CACHE_DIR","CACHE_SECTIONS", | 6 | "ALPHABETIZE_AUTHOR_AS","AUTHOR_URLS","CACHE_DIR","CACHE_SECTIONS", |
7 | "CITE_CACHE_DIR", | ||
7 | "COLLAPSE_AUTHORS", | 8 | "COLLAPSE_AUTHORS", |
8 | "DOWNLOAD_CONNECT_TIMEOUT","INITIAL_STRINGS", | 9 | "DOWNLOAD_CONNECT_TIMEOUT","INITIAL_STRINGS", |
9 | "MASTER_BIB", "NO_COLLAPSE_AUTHORS", "OMIT_ENTRIES", | 10 | "MASTER_BIB", "NO_COLLAPSE_AUTHORS", "OMIT_ENTRIES", |
diff --git a/gold.gif b/gold.gif new file mode 100644 index 0000000..44505db --- /dev/null +++ b/gold.gif | |||
Binary files differ | |||
@@ -0,0 +1,128 @@ | |||
1 | # Make rankings of papers and authors for automatic classification of content hotness | ||
2 | |||
3 | # Google Scholar address | ||
4 | # http://scholar.google.com/scholar?as_epq= | ||
5 | |||
6 | # Take care of the caching setup | ||
# Cached Google Scholar pages expire after 30 days (value in seconds).
cache_expire = 30 * 24 * 60 * 60
8 | |||
9 | # Checks | ||
10 | import config | ||
11 | import os | ||
12 | import sys | ||
13 | from os.path import exists, isdir, join, getmtime | ||
14 | from os import listdir, remove | ||
15 | |||
def remove_old():
    """Delete citation-cache files older than cache_expire seconds."""
    from time import time
    now = time()
    for name in listdir(cache_folder()):
        path = join(cache_folder(), name)
        # getmtime gives the last-modified time; anything staler than
        # the expiry window (30 days) is discarded.
        if now - getmtime(path) > cache_expire:
            remove(path)
26 | |||
def cache_folder():
    """Return the citation-cache directory, creating it on first use."""
    path = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR)
    if not exists(path):
        os.makedirs(path)
    assert isdir(path)
    return path
33 | |||
34 | import md5 | ||
35 | import re | ||
36 | from urllib2 import urlopen, build_opener | ||
37 | from datetime import date | ||
38 | |||
def md5h(s):
    """Return the hexadecimal MD5 digest of s.

    Used to build stable cache filenames from URLs.  Uses hashlib
    instead of the deprecated `md5` module; the output is identical to
    md5.new(s).digest().encode('hex_codec').
    """
    import hashlib
    return hashlib.md5(s).hexdigest()
44 | |||
# Module-level flag: set nonzero once TestScholarFormat() has run,
# so the sanity check happens at most once per process.
format_tested = 0
46 | |||
def getCite(title, cache=True, update=True):
    """Return the total citation count for `title` from Google Scholar.

    cache:  if true, reuse a previously downloaded result page.
    update: if true, download the page when it is not cached.
    Returns None when the page is unavailable (cache miss with
    update=False) or when Scholar matched no articles.
    """
    global format_tested
    if not format_tested and update:
        format_tested = 1
        TestScholarFormat()

    # Do not assume that the title is clean
    title = re.sub("\s+", " ", title)
    title = re.sub("[^'a-zA-Z0-9\. \-\/:]", "", title)
    title = re.sub("'\/", " ", title)

    # Make a custom user agent (so that we are not filtered by Google)!
    opener = build_opener()
    opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]

    # We rely on google scholar to return the article with this exact title
    gurl = "http://scholar.google.com/scholar?as_epq=%s&as_occt=title"
    from urllib import quote
    url = gurl % quote(title)

    # Access cache or network.  Cache files are keyed by the URL's MD5;
    # compute the path once, and close the file handles we open (the
    # original file(...).read() pattern leaked them).
    cache_file = join(cache_folder(), md5h(url))
    if cache and exists(cache_file):
        with open(cache_file, 'r') as f:
            page = f.read()
    elif update:
        print("Downloading rank for %r." % title)
        page = opener.open(url).read()
        with open(cache_file, 'w') as f:
            f.write(page)
    else:
        return None

    # Check if it finds any articles
    if len(re.findall("did not match any articles", page)) > 0:
        return None

    # Kill all tags!
    cpage = re.sub("<[^>]*>", "", page)

    # Add up all citations
    return sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
87 | |||
def get_rank_html(title, years=None, base_url=".", update=True):
    """Return an HTML snippet of rank icons for the paper, or '' if unranked.

    title:    paper title, looked up via getCite().
    years:    publication year (string or int); may be None or a
              non-numeric value such as "forthcoming".
    base_url: URL prefix for the icon images.
    update:   passed through to getCite(); if false, never hit the network.
    """
    s = getCite(title, update=update)

    # Paper cannot be found
    if s is None:
        return ''

    html = ''

    # Hotness: total citation count.
    if s >= 50:
        html += '<img src="%s/gold.gif" />' % base_url
    elif s >= 5:
        html += '<img src="%s/silver.gif" />' % base_url

    # Velocity: citations per year since publication.  The year may be
    # missing or non-numeric ("forthcoming") -- the caller in BibTeX.py
    # passes self.get('year') for important drafts too -- so skip the
    # velocity icons instead of crashing on int(years).
    try:
        age = date.today().year - int(years)
    except (TypeError, ValueError):
        return html
    if age >= 0:
        # Floor division matches the original Python 2 int-division
        # semantics of s / (d + 1).
        rate = s // (age + 1)
        if 2 < rate < 10:
            html += '<img src="%s/ups.gif" />' % base_url
        elif rate >= 10:
            html += '<img src="%s/upb.gif" />' % base_url

    return html
112 | |||
def TestScholarFormat():
    """Sanity-check that Google Scholar's page format still parses.

    A well-known paper must yield a positive citation count, and a
    made-up title must yield None; a change in Scholar's HTML would
    break one of these.
    """
    known = "Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System"
    bogus = "Mixes protected by Dragons and Pixies: an empirical study"
    assert(getCite(known, False) > 0)
    assert(getCite(bogus, False) == None)
118 | |||
if __name__ == '__main__':
    # Command-line use: refresh the citation cache for every entry in
    # the master bibliography (config file path given as argv[1]).
    import BibTeX
    config.load(sys.argv[1])
    bib = BibTeX.parseFile(config.MASTER_BIB)
    remove_old()
    print("Downloading missing ranks.")
    for ent in bib.entries:
        getCite(ent['title'], cache=True, update=True)
128 | |||
diff --git a/silver.gif b/silver.gif new file mode 100644 index 0000000..8a4ff29 --- /dev/null +++ b/silver.gif | |||
Binary files differ | |||
Binary files differ | |||
Binary files differ | |||
diff --git a/writeHTML.py b/writeHTML.py index 3184ef0..934b46b 100755 --- a/writeHTML.py +++ b/writeHTML.py | |||
@@ -29,7 +29,7 @@ def pathLength(s): | |||
29 | s = parent | 29 | s = parent |
30 | return n | 30 | return n |
31 | 31 | ||
32 | def writeBody(f, sections, section_urls, cache_path): | 32 | def writeBody(f, sections, section_urls, cache_path, base_url): |
33 | '''f: an open file | 33 | '''f: an open file |
34 | sections: list of (sectionname, [list of BibTeXEntry]) | 34 | sections: list of (sectionname, [list of BibTeXEntry]) |
35 | section_urls: map from sectionname to external url''' | 35 | section_urls: map from sectionname to external url''' |
@@ -45,7 +45,7 @@ def writeBody(f, sections, section_urls, cache_path): | |||
45 | BibTeX.url_untranslate(s),sDisp)) | 45 | BibTeX.url_untranslate(s),sDisp)) |
46 | print >>f, "<ul class='expand'>" | 46 | print >>f, "<ul class='expand'>" |
47 | for e in entries: | 47 | for e in entries: |
48 | print >>f, e.to_html(cache_path=cache_path) | 48 | print >>f, e.to_html(cache_path=cache_path, base_url=base_url) |
49 | print >>f, "</ul></li>" | 49 | print >>f, "</ul></li>" |
50 | 50 | ||
51 | def writeHTML(f, sections, sectionType, fieldName, choices, | 51 | def writeHTML(f, sections, sectionType, fieldName, choices, |
@@ -104,7 +104,8 @@ def writeHTML(f, sections, sectionType, fieldName, choices, | |||
104 | 104 | ||
105 | header, footer = getTemplate(config.TEMPLATE_FILE) | 105 | header, footer = getTemplate(config.TEMPLATE_FILE) |
106 | print >>f, header%fields | 106 | print >>f, header%fields |
107 | writeBody(f, sections, section_urls, cache_path=cache_url_path) | 107 | writeBody(f, sections, section_urls, cache_path=cache_url_path, |
108 | base_url=root) | ||
108 | print >>f, footer%fields | 109 | print >>f, footer%fields |
109 | 110 | ||
110 | def writePageSet(config, bib, tag): | 111 | def writePageSet(config, bib, tag): |