# Build rankings of papers and authors for automatic classification of content hotness
# Google Scholar address
# http://scholar.google.com/scholar?as_epq=
# Take care of the caching setup
cache_expire = 60*60*24*30 # 30 days

# Imports and cache housekeeping
import config
import os
import sys
from os.path import exists, isdir, join, getmtime
from os import listdir, remove

def remove_old():
    # Remove all old cached files
    filenames = listdir(cache_folder())
    from time import time
    now = time()
    for f in filenames:
        pf = join(cache_folder(), f)
        time_mt = getmtime(pf)
        if now - time_mt > cache_expire: # 30 days
            remove(pf)

def cache_folder():
    r = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR)
    if not exists(r):
        os.makedirs(r)
    assert isdir(r)
    return r

import md5
import re
from urllib2 import urlopen, build_opener
from datetime import date

# A handier hash: the md5 digest of s as a hex string
def md5h(s):
    m = md5.new()
    m.update(s)
    return m.digest().encode('hex_codec')
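
# (md5h(url) doubles as the cache filename for each Scholar query below:
# the hex digest is a fixed-length, filesystem-safe key for an arbitrary URL.)
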
format_tested = 0

def getCite(title, cache=True, update=True):
    global format_tested
    if not format_tested and update:
        format_tested = 1
        TestScholarFormat()

    # Do not assume that the title is clean
    title = re.sub(r"\s+", " ", title)
    title = re.sub(r"[^'a-zA-Z0-9\. \-\/:]", "", title)
    title = re.sub(r"'\/", " ", title)

    # Make a custom user agent (so that we are not filtered by Google)!
    opener = build_opener()
    opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]

    # We rely on Google Scholar to return the article with this exact title
    gurl = "http://scholar.google.com/scholar?as_epq=%s&as_occt=title"
    from urllib import quote
    url = gurl % quote(title)

    # Access the cache or the network
    cache_file = join(cache_folder(), md5h(url))
    if exists(cache_file) and cache:
        page = file(cache_file, 'r').read()
    elif update:
        print "Downloading rank for %r." % title
        page = opener.open(url).read()
        file(cache_file, 'w').write(page)
    else:
        return None

    # Check if it finds any articles
    if len(re.findall("did not match any articles", page)) > 0:
        return None

    # Kill all tags!
    cpage = re.sub("<[^>]*>", "", page)

    # Add up all citations
    s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
    return s
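
# Minimal usage sketch (assumes config.load() has been called so that
# cache_folder() can resolve; the title below is one of the known-good
# test cases from TestScholarFormat):
#   count = getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System")
#   # -> an integer citation count, or None if Scholar finds no such article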

def get_rank_html(title, years=None, base_url=".", update=True):
    s = getCite(title, update=update)

    # Paper cannot be found
    if s is None:
        return ''

    html = ''

    # Hotness
    if s >= 50:
        html += '<img src="%s/gold.gif" />' % base_url
    elif s >= 5:
        html += '<img src="%s/silver.gif" />' % base_url

    # Velocity (only computed when a publication year is given;
    # int(None) would raise a TypeError otherwise)
    if years is not None:
        d = date.today().year - int(years)
        if d >= 0:
            if 2 < s / (d + 1) < 10:
                html += '<img src="%s/ups.gif" />' % base_url
            if 10 <= s / (d + 1):
                html += '<img src="%s/upb.gif" />' % base_url
    return html
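
# In other words: gold for 50+ citations, silver for 5-49, a small "up"
# arrow for roughly 3-9 citations per year since publication (integer
# division), and a big one for 10+ per year.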

def TestScholarFormat():
    # We need to ensure that Google Scholar does not change its page format under our feet
    # Use some cases to check if all is good
    assert getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False) > 0
    assert getCite("Mixes protected by Dragons and Pixies: an empirical study", False) is None

if __name__ == '__main__':
    # First, load and parse the master bibliography file.
    import BibTeX
    config.load(sys.argv[1])
    bib = BibTeX.parseFile(config.MASTER_BIB)
    remove_old()
    print "Downloading missing ranks."
    for ent in bib.entries:
        getCite(ent['title'], cache=True, update=True)
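
# Invocation sketch (file names are placeholders; the config file must
# define OUTPUT_DIR, CITE_CACHE_DIR, and MASTER_BIB):
#   python rank.py anonbib.cfg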