# Make rankings of papers and authors for automatic classification of content hotness
# Google Scholar address
# http://scholar.google.com/scholar?as_epq=
# Take care of the caching setup
cache_expire = 60*60*24*30 # 30 days
# Checks
import config
import os
import sys
from os.path import exists, isdir, join, getmtime
from os import listdir, remove
def remove_old():
    # Remove all old cached files
    filenames = listdir(cache_folder())
    from time import time
    now = time()
    for f in filenames:
        pf = join(cache_folder(), f)
        time_mt = getmtime(pf)
        if now - time_mt > cache_expire: # 30 days
            remove(pf)
def cache_folder():
    r = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR)
    if not exists(r):
        os.makedirs(r)
    assert isdir(r)
    return r
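
# Note: cached result pages live in this folder under the md5 hex digest of
# their query URL (see md5h below), and remove_old() above expires any file
# older than cache_expire seconds.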
import md5
import re
from urllib2 import urlopen, build_opener
from urllib import quote
from datetime import date
# A more handy hash
def md5h(s):
    m = md5.new()
    m.update(s)
    return m.digest().encode('hex_codec')
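
# For illustration, md5h("abc") == '900150983cd24fb0d6963f7d28e17f72'; on
# Python 2.5+ the same digest is also available as hashlib.md5(s).hexdigest().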
format_tested = 0
def getPageForTitle(title, cache=True, update=True, save=True):
    # Returns a (scholar url, page) tuple; page is None when the result is
    # neither cached nor fetchable.
    global format_tested
    if not format_tested and update:
        format_tested = 1
        TestScholarFormat()

    # Do not assume that the title is clean
    title = re.sub("\s+", " ", title)
    title = re.sub("[^'a-zA-Z0-9\. \-\/:]", "", title)
    title = re.sub("'\/", " ", title)

    # We rely on google scholar to return the article with this exact title
    gurl = "http://scholar.google.com/scholar?as_epq=%s&as_occt=title"
    url = gurl % quote(title)

    # Access cache or network
    if exists(join(cache_folder(), md5h(url))) and cache:
        return url, file(join(cache_folder(), md5h(url)), 'r').read()
    elif update:
        print "Downloading rank for %r."%title

        # Make a custom user agent (so that we are not filtered by Google)!
        opener = build_opener()
        opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]

        print "connecting..."
        connection = opener.open(url)
        print "reading"
        page = connection.read()
        print "done"
        if save:
            file(join(cache_folder(), md5h(url)), 'w').write(page)
        return url, page
    else:
        return url, None
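
# Flag semantics, as used above: cache=True reuses a previously saved page,
# update=True allows a network fetch on a cache miss, and save=True writes a
# freshly fetched page back into the cache. If neither path yields a page,
# the function returns (url, None).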
def getCite(title, cache=True, update=True, save=True):
    # Returns a (citation-count, scholar url) tuple, or (None, None)
    url, page = getPageForTitle(title, cache=cache, update=update, save=save)
    if not page:
        return (None, None)

    # Check if it finds any articles
    if len(re.findall("did not match any articles", page)) > 0:
        return (None, None)

    # Kill all tags!
    cpage = re.sub("<[^>]*>", "", page)

    # Add up all citations
    s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
    return (s, url)
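
# Example use (assumes network access on a cache miss):
#   count, url = getCite("Tor: The Second-Generation Onion Router")
# Note that the sum above adds up every "Cited by N" figure on the first
# results page, so a query matching several articles can overcount.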
def getPaperURLs(title, cache=True, update=True, save=True):
    url, page = getPageForTitle(title, cache=cache, update=update, save=save)
    if not page:
        return []
    pages = re.findall(r'\&\#x25ba\;.*class=fl href="([^"]*)"', page)
    return pages
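
# The pattern above keys on the "&#x25ba;" arrow entity that Scholar placed
# before direct document links at the time this was written; if that markup
# changes, this function will silently return an empty list.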
def get_rank_html(title, years=None, base_url=".", update=True,
                  velocity=False):
    s, url = getCite(title, update=update)

    # Paper cannot be found
    if s is None:
        return ''

    html = ''

    # Hotness
    H, h = 50, 5
    if s >= H:
        html += '<a href="%s"><img src="%s/gold.gif" alt="More than %s citations on Google Scholar" title="More than %s citations on Google Scholar" /></a>' % (url, base_url, H, H)
    elif s >= h:
        html += '<a href="%s"><img src="%s/silver.gif" alt="More than %s citations on Google Scholar" title="More than %s citations on Google Scholar" /></a>' % (url, base_url, h, h)

    # Only include the velocity if asked.
    if velocity:
        # Velocity: citations per year since publication
        d = date.today().year - int(years)
        if d >= 0:
            if 2 < s / (d + 1) < 10:
                html += '<img src="%s/ups.gif" />' % base_url
            if 10 <= s / (d + 1):
                html += '<img src="%s/upb.gif" />' % base_url
    return html
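
# Sketch of the output, assuming the gif assets exist under base_url: 50 or
# more citations yields a gold.gif badge and 5 or more a silver.gif badge,
# each linking to the Scholar query; with velocity=True, a rate between 2 and
# 10 citations a year adds ups.gif, and 10 or more a year adds upb.gif.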
def TestScholarFormat():
    # We need to ensure that Google Scholar does not change its page format
    # under our feet. Use some known cases to check that all is good.
    print "Checking google scholar formats..."
    assert getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False)[0] > 0
    assert getCite("Mixes protected by Dragons and Pixies: an empirical study", False, save=False)[0] is None
URLTYPES=[ "pdf", "ps", "txt", "ps_gz", "html" ]
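
# Command-line usage: python rank.py [suggest] <configfile>
# With "suggest", entries lacking any www_*_url field also get candidate
# URLs from Scholar printed for manual review.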
if __name__ == '__main__':
    # First download the bibliography file.
    import BibTeX
    suggest = False
    if sys.argv[1] == 'suggest':
        suggest = True
        del sys.argv[1]

    config.load(sys.argv[1])
    if config.CACHE_UMASK != None:
        os.umask(config.CACHE_UMASK)
    bib = BibTeX.parseFile(config.MASTER_BIB)
    remove_old()
    print "Downloading missing ranks."
    for ent in bib.entries:
        getCite(ent['title'], cache=True, update=True)

    if suggest:
        for ent in bib.entries:
            haveOne = False
            for utype in URLTYPES:
                if ent.has_key("www_%s_url"%utype):
                    haveOne = True
                    break
            if haveOne:
                continue
            print ent.key, "has no URLs given."
            urls = [ u for u in getPaperURLs(ent['title'])
                     if u.find("freehaven.net/anonbib") < 0 ]
            for u in urls:
                print "\t", u