r13683@catbus: nickm | 2007-07-10 14:42:53 -0400

Patch from George to add citation-rank-based icons to anonbib output. Hacked up a bit so that "cache" and "generate page" are separate, so that the image urls are no longer hardwired to ~george, so output locations are configurable, etc. svn:r234
author: Nick Mathewson <nickm@torproject.org> 2007-07-10 18:42:56 +0000
committer: Nick Mathewson <nickm@torproject.org> 2007-07-10 18:42:56 +0000
commit: ef19d6e6ffdcc5ea552a2ac5ceb7e01c1e00bef9 (patch)
tree: d0a310bbbe8460db50438a00d6c88c316781616a /rank.py
parent: 7569ee874013990b661875748fe39a85dd70c2a1 (diff)
download: anonbib-ef19d6e6ffdcc5ea552a2ac5ceb7e01c1e00bef9.tar.gz
1 files changed, 128 insertions, 0 deletions
diff --git a/rank.py b/rank.py
new file mode 100644
index 0000000..81592c2
--- /dev/null
+++ b/rank.py
@@ -0,0 +1,128 @@
+# Make rankings of papers and authors for automatic classification of content hotness
+
+# Google Scholar address
+# http://scholar.google.com/scholar?as_epq=
+
+# Take care of the caching setup
+cache_expire = 60*60*24*30 # 30 days
+
+# Checks
+import config
+import os
+import sys
+from os.path import exists, isdir, join, getmtime
+from os import listdir, remove
+
+def remove_old():
+   # Remove all old cached files
+   filenames = listdir(cache_folder())
+   from time import time
+   now = time()
+   for f in filenames:
+      pf = join(cache_folder(), f)
+      time_mt =  getmtime(pf)
+      if now - time_mt > cache_expire: # 30 days
+         remove(pf)
+
+def cache_folder():
+   r = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR)
+   if not exists(r):
+      os.makedirs(r)
+   assert isdir(r)
+   return r
+
+import md5
+import re
+from urllib2 import urlopen, build_opener
+from datetime import date
+
+# A more handy hash
+def md5h(s):
+   m = md5.new()
+   m.update(s)
+   return m.digest().encode('hex_codec')
+
+format_tested = 0
+
+def getCite(title, cache=True, update=True):
+   global format_tested
+   if not format_tested and update:
+      format_tested = 1
+      TestScholarFormat()
+
+   # Do not assume that the title is clean
+   title = re.sub("\s+", " ", title)
+   title = re.sub("[^'a-zA-Z0-9\. \-\/:]", "", title)
+   title = re.sub("'\/", " ", title)
+
+   # Make a custom user agent (so that we are not filtered by Google)!
+   opener = build_opener()
+   opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]
+
+   # We rely on google scholar to return the article with this exact title
+   gurl = "http://scholar.google.com/scholar?as_epq=%s&as_occt=title"
+   from urllib import quote
+   url = gurl % quote(title)
+
+   # Access cache or network
+   if exists(join(cache_folder(), md5h(url))) and cache:
+      page = file(join(cache_folder(), md5h(url)),'r').read()
+   elif update:
+      print "Downloading rank for %r."%title
+      page = opener.open(url).read()
+      file(join(cache_folder(), md5h(url)),'w').write(page)
+   else:
+      return None
+
+   # Check if it finds any articles
+   if len(re.findall("did not match any articles", page)) > 0:
+      return None
+
+   # Kill all tags!
+   cpage = re.sub("<[^>]*>", "", page)
+
+   # Add up all citations
+   s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
+   return s
+
+def get_rank_html(title, years=None, base_url=".", update=True):
+   s = getCite(title, update=update)
+
+   # Paper cannot be found
+   if s is None:
+      return ''
+
+   html = ''
+
+   # Hotness
+   if s >= 50:
+      html += '<img src="%s/gold.gif" />' % base_url
+   elif s >= 5:
+      html += '<img src="%s/silver.gif" />' % base_url
+
+   # Velocity
+   d = date.today().year - int(years)
+   if d >= 0:
+      if 2 < s / (d +1) < 10:
+         html += '<img src="%s/ups.gif" />' % base_url
+      if 10 <= s / (d +1):<
+         html += '<img src="%s/upb.gif" />' % base_url
+
+   return html
+
+def TestScholarFormat():
+   # We need to ensure that Google Scholar does not change its page format under our feet
+   # Use some cases to check if all is good
+   assert(getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False) > 0)
+   assert(getCite("Mixes protected by Dragons and Pixies: an empirical study", False) == None)
+
+if __name__ == '__main__':
+   # First download the bibliography file.
+   import BibTeX
+   config.load(sys.argv[1])
+   bib = BibTeX.parseFile(config.MASTER_BIB)
+   remove_old()
+   print "Downloading missing ranks."
+   for ent in bib.entries:
+      getCite(ent['title'], cache=True, update=True)
+
author	Nick Mathewson <nickm@torproject.org>	2007-07-10 18:42:56 +0000
committer	Nick Mathewson <nickm@torproject.org>	2007-07-10 18:42:56 +0000
commit	ef19d6e6ffdcc5ea552a2ac5ceb7e01c1e00bef9 (patch)
tree	d0a310bbbe8460db50438a00d6c88c316781616a /rank.py
parent	7569ee874013990b661875748fe39a85dd70c2a1 (diff)
download	anonbib-ef19d6e6ffdcc5ea552a2ac5ceb7e01c1e00bef9.tar.gz