author     Nick Mathewson <nickm@torproject.org>   2007-07-10 18:42:56 +0000
committer  Nick Mathewson <nickm@torproject.org>   2007-07-10 18:42:56 +0000
commit     d1d3a099f2da7611439e7f57569ff7493c41e0b4 (patch)
tree       7f9190bd59bc9b6554dbe49b0a65b2f6d470de7e
parent     c933c52d4520527b93f2ae0c8534fa684e2a9fb9 (diff)
download   gnunetbib-d1d3a099f2da7611439e7f57569ff7493c41e0b4.tar.gz
           gnunetbib-d1d3a099f2da7611439e7f57569ff7493c41e0b4.zip
r13683@catbus: nickm | 2007-07-10 14:42:53 -0400
Patch from George to add citation-rank-based icons to anonbib output. Hacked up a bit so that "cache" and "generate page" are separate steps, so that the image URLs are no longer hardwired to ~george, and so that output locations are configurable.

svn:r234
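
For orientation (not part of the patch itself): the commit splits the work into a "cache" step and a "generate page" step. Below is a rough Python sketch of that flow, using only names that appear in the changed files; the way the driver scripts wire these calls together is an assumption.

# Sketch only: how the caching and page-generation steps are expected to fit
# together after this commit. Names come from the changed files; the driver
# invocation shown here is an assumption.
import config, rank, BibTeX

config.load("anonbib.cfg")
bib = BibTeX.parseFile(config.MASTER_BIB)

# Step 1 ("cache"): rank.py fetches citation counts from Google Scholar and
# stores the result pages under CITE_CACHE_DIR; the Makefile's "update"
# target now runs this alongside updateCache.py.
for ent in bib.entries:
    rank.getCite(ent['title'], cache=True, update=True)

# Step 2 ("generate page"): writeHTML.py renders entries offline. to_html()
# now takes base_url, so the icon URLs (gold.gif, silver.gif, ...) are no
# longer hardwired, and get_rank_html() is called with update=False so page
# generation never hits the network.
html = bib.entries[0].to_html(cache_path="./cache", base_url=".")
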
-rw-r--r--  BibTeX.py     |  14
-rw-r--r--  Makefile      |   1
-rw-r--r--  anonbib.cfg   |   3
-rw-r--r--  config.py     |   1
-rw-r--r--  gold.gif      | bin 0 -> 540 bytes
-rw-r--r--  rank.py       | 128
-rw-r--r--  silver.gif    | bin 0 -> 539 bytes
-rw-r--r--  upb.gif       | bin 0 -> 555 bytes
-rw-r--r--  ups.gif       | bin 0 -> 536 bytes
-rwxr-xr-x  writeHTML.py  |   7
10 files changed, 149 insertions, 5 deletions
diff --git a/BibTeX.py b/BibTeX.py
index 1b26d72..2181ba6 100644
--- a/BibTeX.py
+++ b/BibTeX.py
@@ -13,6 +13,8 @@ import os
 
 import config
 
+import rank
+
 __all__ = [ 'ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize',
             'ParsedAuthor', 'FileIter', 'Parser', 'parseFile',
             'splitBibTeXEntriesBy', 'sortBibTexEntriesBy', ]
@@ -400,7 +402,7 @@ class BibTeXEntry:
         return errs
 
     def biblio_to_html(self):
-        """Return the HTML for the citatation portion of entry."""
+        """Return the HTML for the citation portion of entry."""
         if self.type == 'inproceedings':
             booktitle = self['booktitle']
             bookurl = self.get('bookurl')
@@ -496,7 +498,7 @@ class BibTeXEntry:
                        "</span>") %bibtexurl)
         return htmlize("".join(res))
 
-    def to_html(self, cache_path="./cache"):
+    def to_html(self, cache_path="./cache", base_url="."):
        """Return the HTML for this entry."""
         imp = self.isImportant()
         draft = self.get('year') == 'forthcoming'
@@ -507,6 +509,14 @@ class BibTeXEntry:
         else:
             res = ["<li><p class='entry'>"]
 
+        if imp or not draft:
+            # Add a picture of the rank
+            # Only if year is known or paper important!
+            r = rank.get_rank_html(self['title'], self.get('year'),
+                                   update=False, base_url=base_url)
+            if r is not None:
+                res.append(r)
+
         res.append("<span class='title'><a name='%s'>%s</a></span>"%(
             url_untranslate(self.key),htmlize(self['title'])))
 
diff --git a/Makefile b/Makefile
index e1ef106..378bf75 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,7 @@ clean:
 
 update:
 	$(PYTHON) updateCache.py anonbib.cfg
+	$(PYTHON) rank.py anonbib.cfg
 
 test:
 	$(PYTHON) test.py
diff --git a/anonbib.cfg b/anonbib.cfg
index 0ff6638..c1d9aa5 100644
--- a/anonbib.cfg
+++ b/anonbib.cfg
@@ -9,6 +9,9 @@ OUTPUT_DIR = "."
 # Where do we put cached papers (relative to OUTPUT_DIR)
 CACHE_DIR = "cache"
 
+# Where do we cache citation counts for papers (relative to OUTPUT_DIR)
+CITE_CACHE_DIR = "cite_cache"
+
 # Are there subsections for cached papers? This is useful for putting
 # different Apache permission on different directories.
 CACHE_SECTIONS = [ ]
diff --git a/config.py b/config.py
index 6a25731..175a3d5 100644
--- a/config.py
+++ b/config.py
@@ -4,6 +4,7 @@ import re
 
 _KEYS = [ "ALL_TAGS",
           "ALPHABETIZE_AUTHOR_AS","AUTHOR_URLS","CACHE_DIR","CACHE_SECTIONS",
+          "CITE_CACHE_DIR",
           "COLLAPSE_AUTHORS",
           "DOWNLOAD_CONNECT_TIMEOUT","INITIAL_STRINGS",
           "MASTER_BIB", "NO_COLLAPSE_AUTHORS", "OMIT_ENTRIES",
diff --git a/gold.gif b/gold.gif
new file mode 100644
index 0000000..44505db
--- /dev/null
+++ b/gold.gif
Binary files differ
diff --git a/rank.py b/rank.py
new file mode 100644
index 0000000..81592c2
--- /dev/null
+++ b/rank.py
@@ -0,0 +1,128 @@
+# Make rankings of papers and authors for automatic classification of content hotness
+
+# Google Scholar address
+# http://scholar.google.com/scholar?as_epq=
+
+# Take care of the caching setup
+cache_expire = 60*60*24*30 # 30 days
+
+# Checks
+import config
+import os
+import sys
+from os.path import exists, isdir, join, getmtime
+from os import listdir, remove
+
+def remove_old():
+    # Remove all old cached files
+    filenames = listdir(cache_folder())
+    from time import time
+    now = time()
+    for f in filenames:
+        pf = join(cache_folder(), f)
+        time_mt = getmtime(pf)
+        if now - time_mt > cache_expire: # 30 days
+            remove(pf)
+
+def cache_folder():
+    r = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR)
+    if not exists(r):
+        os.makedirs(r)
+    assert isdir(r)
+    return r
+
+import md5
+import re
+from urllib2 import urlopen, build_opener
+from datetime import date
+
+# A more handy hash
+def md5h(s):
+    m = md5.new()
+    m.update(s)
+    return m.digest().encode('hex_codec')
+
+format_tested = 0
+
+def getCite(title, cache=True, update=True):
+    global format_tested
+    if not format_tested and update:
+        format_tested = 1
+        TestScholarFormat()
+
+    # Do not assume that the title is clean
+    title = re.sub("\s+", " ", title)
+    title = re.sub("[^'a-zA-Z0-9\. \-\/:]", "", title)
+    title = re.sub("'\/", " ", title)
+
+    # Make a custom user agent (so that we are not filtered by Google)!
+    opener = build_opener()
+    opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]
+
+    # We rely on google scholar to return the article with this exact title
+    gurl = "http://scholar.google.com/scholar?as_epq=%s&as_occt=title"
+    from urllib import quote
+    url = gurl % quote(title)
+
+    # Access cache or network
+    if exists(join(cache_folder(), md5h(url))) and cache:
+        page = file(join(cache_folder(), md5h(url)),'r').read()
+    elif update:
+        print "Downloading rank for %r."%title
+        page = opener.open(url).read()
+        file(join(cache_folder(), md5h(url)),'w').write(page)
+    else:
+        return None
+
+    # Check if it finds any articles
+    if len(re.findall("did not match any articles", page)) > 0:
+        return None
+
+    # Kill all tags!
+    cpage = re.sub("<[^>]*>", "", page)
+
+    # Add up all citations
+    s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
+    return s
+
+def get_rank_html(title, years=None, base_url=".", update=True):
+    s = getCite(title, update=update)
+
+    # Paper cannot be found
+    if s is None:
+        return ''
+
+    html = ''
+
+    # Hotness
+    if s >= 50:
+        html += '<img src="%s/gold.gif" />' % base_url
+    elif s >= 5:
+        html += '<img src="%s/silver.gif" />' % base_url
+
+    # Velocity
+    d = date.today().year - int(years)
+    if d >= 0:
+        if 2 < s / (d +1) < 10:
+            html += '<img src="%s/ups.gif" />' % base_url
+        if 10 <= s / (d +1):
+            html += '<img src="%s/upb.gif" />' % base_url
+
+    return html
+
+def TestScholarFormat():
+    # We need to ensure that Google Scholar does not change its page format under our feet
+    # Use some cases to check if all is good
+    assert(getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False) > 0)
+    assert(getCite("Mixes protected by Dragons and Pixies: an empirical study", False) == None)
+
+if __name__ == '__main__':
+    # First download the bibliography file.
+    import BibTeX
+    config.load(sys.argv[1])
+    bib = BibTeX.parseFile(config.MASTER_BIB)
+    remove_old()
+    print "Downloading missing ranks."
+    for ent in bib.entries:
+        getCite(ent['title'], cache=True, update=True)
+
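
As a side note (not part of the patch): the thresholds in get_rank_html() above decide which icons an entry gets. The illustrative helper below restates them; the function name pick_icons is hypothetical and the division is plain float division rather than the Python 2 integer division used in the patch.

from datetime import date

def pick_icons(citations, year):
    # Illustrative restatement of get_rank_html()'s thresholds; not in the patch.
    icons = []
    # "Hotness": gold at 50 or more citations, silver at 5 or more.
    if citations >= 50:
        icons.append("gold.gif")
    elif citations >= 5:
        icons.append("silver.gif")
    # "Velocity": citations per year since publication.
    age = date.today().year - year
    if age >= 0:
        per_year = citations / (age + 1)
        if 2 < per_year < 10:
            icons.append("ups.gif")
        if per_year >= 10:
            icons.append("upb.gif")
    return icons

# Example: a paper from 3 years ago with 60 citations gets gold.gif and upb.gif.
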
diff --git a/silver.gif b/silver.gif
new file mode 100644
index 0000000..8a4ff29
--- /dev/null
+++ b/silver.gif
Binary files differ
diff --git a/upb.gif b/upb.gif
new file mode 100644
index 0000000..5852828
--- /dev/null
+++ b/upb.gif
Binary files differ
diff --git a/ups.gif b/ups.gif
new file mode 100644
index 0000000..36f0124
--- /dev/null
+++ b/ups.gif
Binary files differ
diff --git a/writeHTML.py b/writeHTML.py
index 3184ef0..934b46b 100755
--- a/writeHTML.py
+++ b/writeHTML.py
@@ -29,7 +29,7 @@ def pathLength(s):
         s = parent
     return n
 
-def writeBody(f, sections, section_urls, cache_path):
+def writeBody(f, sections, section_urls, cache_path, base_url):
     '''f: an open file
        sections: list of (sectionname, [list of BibTeXEntry])
        section_urls: map from sectionname to external url'''
@@ -45,7 +45,7 @@ def writeBody(f, sections, section_urls, cache_path):
                            BibTeX.url_untranslate(s),sDisp))
         print >>f, "<ul class='expand'>"
         for e in entries:
-            print >>f, e.to_html(cache_path=cache_path)
+            print >>f, e.to_html(cache_path=cache_path, base_url=base_url)
         print >>f, "</ul></li>"
 
 def writeHTML(f, sections, sectionType, fieldName, choices,
@@ -104,7 +104,8 @@ def writeHTML(f, sections, sectionType, fieldName, choices,
 
     header, footer = getTemplate(config.TEMPLATE_FILE)
     print >>f, header%fields
-    writeBody(f, sections, section_urls, cache_path=cache_url_path)
+    writeBody(f, sections, section_urls, cache_path=cache_url_path,
+              base_url=root)
     print >>f, footer%fields
 
 def writePageSet(config, bib, tag):