# Make rankings of papers and authors for automatic classification of content hotness

# Google Scholar address
# http://scholar.google.com/scholar?as_epq=

# Caching setup: cached Google Scholar result pages expire after this many seconds.
cache_expire = 60*60*24*30 # 30 days

# Imports
import config
import os
import sys
from os.path import exists, isdir, join, getmtime
from os import listdir, remove

def remove_old():
   # Remove all old cached files
   filenames = listdir(cache_folder())
   from time import time
   now = time()
   for f in filenames:
      pf = join(cache_folder(), f)
      time_mt = getmtime(pf)
      if now - time_mt > cache_expire: # 30 days
         remove(pf)

def cache_folder():
   r = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR)
   if not exists(r):
      os.makedirs(r)
   assert isdir(r)
   return r

import re
from urllib2 import urlopen, build_opener
from urllib import quote
from datetime import date
import hashlib

# A more handy hash
def md5h(s):
   m = hashlib.md5()
   m.update(s)
   return m.hexdigest()
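
# (Illustrative note) md5h is used below to name cache files: each Scholar
# query URL is hashed, and the downloaded page is stored under that hex
# digest inside cache_folder().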

format_tested = 0

def getPageForTitle(title, cache=True, update=True, save=True):
   # Returns a (scholar url, page contents) tuple; page contents is None if unavailable.
   global format_tested
   if not format_tested and update:
      format_tested = 1
      TestScholarFormat()

   # Do not assume that the title is clean
   title = re.sub(r"\s+", " ", title)
   title = re.sub(r"[^'a-zA-Z0-9. \-/:]", "", title)
   title = re.sub(r"['/]", " ", title)

   # We rely on google scholar to return the article with this exact title
   gurl = "http://scholar.google.com/scholar?as_q=&as_epq=%s&as_occt=title"
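   # (Illustrative note, based on Scholar's old advanced-search parameters:)
   # as_epq carries the exact phrase to match, as_occt=title restricts the
   # match to article titles, and as_q (the any-words field) is left empty.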

   url = gurl % quote(title)

   # Access cache or network
   if exists(join(cache_folder(), md5h(url))) and cache:
      return url, open(join(cache_folder(), md5h(url)), 'r').read()
   elif update:
      print "Downloading rank for %r."%title

      # Make a custom user agent (so that we are not filtered by Google)!
      opener = build_opener()
      opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]

      print "connecting..."
      connection = opener.open(url)
      print "reading"
      page = connection.read()
      print "done"
      if save:
         open(join(cache_folder(), md5h(url)), 'w').write(page)
      return url, page
   else:
      return url, None

def getCite(title, cache=True, update=True, save=True):
   url, page = getPageForTitle(title, cache=cache, update=update, save=save)
   if not page:
      return None,None

   # Check if it finds any articles
   if re.search("did not match any articles", page):
      return (None, None)

   # Kill all tags!
   cpage = re.sub("<[^>]*>", "", page)

   # Add up all citations
   s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
   return (s, url)
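
# A minimal usage sketch (the title here is only an example; a real call goes
# through the cache or the network via getPageForTitle):
#
#   count, url = getCite("Tor: The Second-Generation Onion Router")
#   if count is not None:
#      print "%d citations (%s)" % (count, url)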

def getPaperURLs(title, cache=True, update=True, save=True):
   url, page = getPageForTitle(title, cache=cache, update=update, save=save)
   if not page:
      return []
   pages = re.findall(r'\&\#x25ba\;.*class=fl href="([^"]*)"', page)
   return pages
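
# (Illustrative note) The pattern above depends on the old Scholar result
# markup, where each hit was preceded by a "&#x25ba;" marker and alternate
# links carried class=fl; if that layout changes, this simply returns [].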

def get_rank_html(title, years=None, base_url=".", update=True,
                  velocity=False):
   s,url = getCite(title, update=update)

   # Paper cannot be found
   if s is None:
      return ''

   html = ''

   url = url.replace("&","&amp;")

   # Hotness
   H,h = 50,5
   if s >= H:
      html += '<a href="%s"><img src="%s/gold.gif" alt="%s or more citations on Google Scholar" title="%s or more citations on Google Scholar" /></a>' % (url,base_url,H,H)
   elif s >= h:
      html += '<a href="%s"><img src="%s/silver.gif" alt="%s or more citations on Google Scholar" title="%s or more citations on Google Scholar" /></a>' % (url,base_url,h,h)

   # Only include the velocity if asked.
   if velocity:
      # Velocity
      d = date.today().year - int(years)
      if d >= 0:
         if 2 < s / (d +1) < 10:
            html += '<img src="%s/ups.gif" />' % base_url
         if 10 <= s / (d +1):
            html += '<img src="%s/upb.gif" />' % base_url

   return html
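
# Worked example of the thresholds above (numbers are illustrative only):
# a paper with s = 120 citations and years = 2000, checked in 2010, gives
# d = 10 and s / (d + 1) = 120 / 11, roughly 10.9, so it earns the gold icon
# (s >= 50) and the big velocity arrow (ratio >= 10).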

def TestScholarFormat():
   # We need to ensure that Google Scholar does not change its page format under our feet
   # Run a couple of known queries to check that the scraping still works.
   print "Checking Google Scholar formats..."
   stopAndGoCites = getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False)[0]
   dragonCites = getCite("Mixes protected by Dragons and Pixies: an empirical study", False, save=False)[0]

   if stopAndGoCites in (0, None):
      print """OOPS.\n
It looks like Google Scholar changed their URL format or their output format.
I went to count the cites for the Stop-and-Go MIXes paper, and got nothing."""
      sys.exit(1)

   if dragonCites is not None:
      print """OOPS.\n
It looks like Google Scholar changed their URL format or their output format.
I went to count the cites for a fictitious paper, and found some."""
      sys.exit(1)

def urlIsUseless(u):
   if u.find("freehaven.net/anonbib/") >= 0:
      # Our own cache is not the primary citation for anything.
      return True
   elif u.find("owens.mit.edu") >= 0:
      # These citations only work for 'members of the MIT community'.
      return True
   else:
      return False

URLTYPES=[ "pdf", "ps", "txt", "ps_gz", "html" ]

if __name__ == '__main__':
   # Load the configuration and parse the bibliography file.
   import BibTeX
   suggest = False
   if sys.argv[1] == 'suggest':
      suggest = True
      del sys.argv[1]

   config.load(sys.argv[1])
   if config.CACHE_UMASK is not None:
      os.umask(config.CACHE_UMASK)
   bib = BibTeX.parseFile(config.MASTER_BIB)
   remove_old()

   print "Downloading missing ranks."
   for ent in bib.entries:
      getCite(ent['title'], cache=True, update=True)

   if suggest:
      for ent in bib.entries:
         haveOne = False
         for utype in URLTYPES:
            if ent.has_key("www_%s_url"%utype):
               haveOne = True
               break
         if haveOne:
            continue
         print ent.key, "has no URLs given."
         urls = [ u for u in getPaperURLs(ent['title']) if not urlIsUseless(u) ]
         for u in urls:
            print "\t", u