# Build rankings of papers and authors for automatic classification of content hotness
# Google Scholar address
# http://scholar.google.com/scholar?as_epq=
# Take care of the caching setup
cache_expire = 60*60*24*30 # 30 days

# Imports and cache housekeeping
import config
import os
import sys
from os.path import exists, isdir, join, getmtime
from os import listdir, remove

def remove_old():
    # Remove all old cached files
    filenames = listdir(cache_folder())
    from time import time
    now = time()
    for f in filenames:
        pf = join(cache_folder(), f)
        time_mt = getmtime(pf)
        if now - time_mt > cache_expire: # 30 days
            remove(pf)

def cache_folder():
    r = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR)
    if not exists(r):
        os.makedirs(r)
    assert isdir(r)
    return r

import md5
import re
from urllib2 import urlopen, build_opener
from datetime import date

# A handier hash: the md5 digest of s as a hex string
def md5h(s):
    m = md5.new()
    m.update(s)
    return m.digest().encode('hex_codec')
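
# (md5h(url) doubles as the cache filename for each Scholar query below:
# the hex digest is a fixed-length, filesystem-safe key for an arbitrary URL.)
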
format_tested = 0

def getCite(title, cache=True, update=True):
    global format_tested
    if not format_tested and update:
        format_tested = 1
        TestScholarFormat()

    # Do not assume that the title is clean
    title = re.sub(r"\s+", " ", title)
    title = re.sub(r"[^'a-zA-Z0-9\. \-\/:]", "", title)
    title = re.sub(r"'\/", " ", title)

    # Make a custom user agent (so that we are not filtered by Google)!
    opener = build_opener()
    opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]

    # We rely on Google Scholar to return the article with this exact title
    gurl = "http://scholar.google.com/scholar?as_epq=%s&as_occt=title"
    from urllib import quote
    url = gurl % quote(title)

    # Access the cache or the network
    cache_file = join(cache_folder(), md5h(url))
    if exists(cache_file) and cache:
        page = file(cache_file, 'r').read()
    elif update:
        print "Downloading rank for %r." % title
        page = opener.open(url).read()
        file(cache_file, 'w').write(page)
    else:
        return None

    # Check if it finds any articles
    if len(re.findall("did not match any articles", page)) > 0:
        return None

    # Kill all tags!
    cpage = re.sub("<[^>]*>", "", page)

    # Add up all citations
    s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
    return s
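
# Minimal usage sketch (assumes config.load() has been called so that
# cache_folder() can resolve; the title below is one of the known-good
# test cases from TestScholarFormat):
#   count = getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System")
#   # -> an integer citation count, or None if Scholar finds no such article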

def get_rank_html(title, years=None, base_url=".", update=True):
    s = getCite(title, update=update)

    # Paper cannot be found
    if s is None:
        return ''

    html = ''

    # Hotness
    if s >= 50:
        html += '<img src="%s/gold.gif" />' % base_url
    elif s >= 5:
        html += '<img src="%s/silver.gif" />' % base_url

    # Velocity (only computed when a publication year is given;
    # int(None) would raise a TypeError otherwise)
    if years is not None:
        d = date.today().year - int(years)
        if d >= 0:
            if 2 < s / (d + 1) < 10:
                html += '<img src="%s/ups.gif" />' % base_url
            if 10 <= s / (d + 1):
                html += '<img src="%s/upb.gif" />' % base_url
    return html
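
# In other words: gold for 50+ citations, silver for 5-49, a small "up"
# arrow for roughly 3-9 citations per year since publication (integer
# division), and a big one for 10+ per year.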

def TestScholarFormat():
    # We need to ensure that Google Scholar does not change its page format under our feet
    # Use some cases to check if all is good
    assert getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False) > 0
    assert getCite("Mixes protected by Dragons and Pixies: an empirical study", False) is None

if __name__ == '__main__':
    # First, load and parse the master bibliography file.
    import BibTeX
    config.load(sys.argv[1])
    bib = BibTeX.parseFile(config.MASTER_BIB)
    remove_old()
    print "Downloading missing ranks."
    for ent in bib.entries:
        getCite(ent['title'], cache=True, update=True)
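
# Invocation sketch (file names are placeholders; the config file must
# define OUTPUT_DIR, CITE_CACHE_DIR, and MASTER_BIB):
#   python rank.py anonbib.cfg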