aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNick Mathewson <nickm@torproject.org>2007-07-10 18:42:56 +0000
committerNick Mathewson <nickm@torproject.org>2007-07-10 18:42:56 +0000
commitef19d6e6ffdcc5ea552a2ac5ceb7e01c1e00bef9 (patch)
treed0a310bbbe8460db50438a00d6c88c316781616a
parent7569ee874013990b661875748fe39a85dd70c2a1 (diff)
downloadanonbib-ef19d6e6ffdcc5ea552a2ac5ceb7e01c1e00bef9.tar.gz
r13683@catbus: nickm | 2007-07-10 14:42:53 -0400
Patch from George to add citation-rank-based icons to anonbib output. Hacked up a bit so that "cache" and "generate page" are separate, so that the image urls are no longer hardwired to ~george, so output locations are configurable, etc. svn:r234
-rw-r--r--BibTeX.py14
-rw-r--r--Makefile1
-rw-r--r--config.py1
-rw-r--r--gold.gifbin0 -> 540 bytes
-rw-r--r--rank.py128
-rw-r--r--silver.gifbin0 -> 539 bytes
-rw-r--r--upb.gifbin0 -> 555 bytes
-rw-r--r--ups.gifbin0 -> 536 bytes
-rwxr-xr-xwriteHTML.py7
9 files changed, 146 insertions, 5 deletions
diff --git a/BibTeX.py b/BibTeX.py
index 1b26d72..2181ba6 100644
--- a/BibTeX.py
+++ b/BibTeX.py
@@ -13,6 +13,8 @@ import os
import config
+import rank
+
__all__ = [ 'ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize',
'ParsedAuthor', 'FileIter', 'Parser', 'parseFile',
'splitBibTeXEntriesBy', 'sortBibTexEntriesBy', ]
@@ -400,7 +402,7 @@ class BibTeXEntry:
return errs
def biblio_to_html(self):
- """Return the HTML for the citatation portion of entry."""
+ """Return the HTML for the citation portion of entry."""
if self.type == 'inproceedings':
booktitle = self['booktitle']
bookurl = self.get('bookurl')
@@ -496,7 +498,7 @@ class BibTeXEntry:
"</span>") %bibtexurl)
return htmlize("".join(res))
- def to_html(self, cache_path="./cache"):
+ def to_html(self, cache_path="./cache", base_url="."):
"""Return the HTML for this entry."""
imp = self.isImportant()
draft = self.get('year') == 'forthcoming'
@@ -507,6 +509,14 @@ class BibTeXEntry:
else:
res = ["<li><p class='entry'>"]
+ if imp or not draft:
+ # Add a picture of the rank
+ # Only if year is known or paper important!
+ r = rank.get_rank_html(self['title'], self.get('year'),
+ update=False, base_url=base_url)
+ if r is not None:
+ res.append(r)
+
res.append("<span class='title'><a name='%s'>%s</a></span>"%(
url_untranslate(self.key),htmlize(self['title'])))
diff --git a/Makefile b/Makefile
index e1ef106..378bf75 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,7 @@ clean:
update:
$(PYTHON) updateCache.py anonbib.cfg
+ $(PYTHON) rank.py anonbib.cfg
test:
$(PYTHON) test.py
diff --git a/config.py b/config.py
index 6a25731..175a3d5 100644
--- a/config.py
+++ b/config.py
@@ -4,6 +4,7 @@ import re
_KEYS = [ "ALL_TAGS",
"ALPHABETIZE_AUTHOR_AS","AUTHOR_URLS","CACHE_DIR","CACHE_SECTIONS",
+ "CITE_CACHE_DIR",
"COLLAPSE_AUTHORS",
"DOWNLOAD_CONNECT_TIMEOUT","INITIAL_STRINGS",
"MASTER_BIB", "NO_COLLAPSE_AUTHORS", "OMIT_ENTRIES",
diff --git a/gold.gif b/gold.gif
new file mode 100644
index 0000000..44505db
--- /dev/null
+++ b/gold.gif
Binary files differ
diff --git a/rank.py b/rank.py
new file mode 100644
index 0000000..81592c2
--- /dev/null
+++ b/rank.py
@@ -0,0 +1,128 @@
+# Make rankings of papers and authors for automatic classification of content hotness
+
+# Google Scholar address
+# http://scholar.google.com/scholar?as_epq=
+
+# Take care of the caching setup
+cache_expire = 60*60*24*30 # 30 days
+
+# Checks
+import config
+import os
+import sys
+from os.path import exists, isdir, join, getmtime
+from os import listdir, remove
+
+def remove_old():
+ # Remove all old cached files
+ filenames = listdir(cache_folder())
+ from time import time
+ now = time()
+ for f in filenames:
+ pf = join(cache_folder(), f)
+ time_mt = getmtime(pf)
+ if now - time_mt > cache_expire: # 30 days
+ remove(pf)
+
+def cache_folder():
+ r = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR)
+ if not exists(r):
+ os.makedirs(r)
+ assert isdir(r)
+ return r
+
+import md5
+import re
+from urllib2 import urlopen, build_opener
+from datetime import date
+
+# A more handy hash
+def md5h(s):
+ m = md5.new()
+ m.update(s)
+ return m.digest().encode('hex_codec')
+
+format_tested = 0
+
+def getCite(title, cache=True, update=True):
+ global format_tested
+ if not format_tested and update:
+ format_tested = 1
+ TestScholarFormat()
+
+ # Do not assume that the title is clean
+ title = re.sub("\s+", " ", title)
+ title = re.sub("[^'a-zA-Z0-9\. \-\/:]", "", title)
+ title = re.sub("'\/", " ", title)
+
+ # Make a custom user agent (so that we are not filtered by Google)!
+ opener = build_opener()
+ opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]
+
+ # We rely on google scholar to return the article with this exact title
+ gurl = "http://scholar.google.com/scholar?as_epq=%s&as_occt=title"
+ from urllib import quote
+ url = gurl % quote(title)
+
+ # Access cache or network
+ if exists(join(cache_folder(), md5h(url))) and cache:
+ page = file(join(cache_folder(), md5h(url)),'r').read()
+ elif update:
+ print "Downloading rank for %r."%title
+ page = opener.open(url).read()
+ file(join(cache_folder(), md5h(url)),'w').write(page)
+ else:
+ return None
+
+ # Check if it finds any articles
+ if len(re.findall("did not match any articles", page)) > 0:
+ return None
+
+ # Kill all tags!
+ cpage = re.sub("<[^>]*>", "", page)
+
+ # Add up all citations
+ s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
+ return s
+
+def get_rank_html(title, years=None, base_url=".", update=True):
+ s = getCite(title, update=update)
+
+ # Paper cannot be found
+ if s is None:
+ return ''
+
+ html = ''
+
+ # Hotness
+ if s >= 50:
+ html += '<img src="%s/gold.gif" />' % base_url
+ elif s >= 5:
+ html += '<img src="%s/silver.gif" />' % base_url
+
+ # Velocity
+ d = date.today().year - int(years)
+ if d >= 0:
+ if 2 < s / (d +1) < 10:
+ html += '<img src="%s/ups.gif" />' % base_url
+    if 10 <= s / (d +1):
+ html += '<img src="%s/upb.gif" />' % base_url
+
+ return html
+
+def TestScholarFormat():
+ # We need to ensure that Google Scholar does not change its page format under our feet
+ # Use some cases to check if all is good
+ assert(getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False) > 0)
+ assert(getCite("Mixes protected by Dragons and Pixies: an empirical study", False) == None)
+
+if __name__ == '__main__':
+ # First download the bibliography file.
+ import BibTeX
+ config.load(sys.argv[1])
+ bib = BibTeX.parseFile(config.MASTER_BIB)
+ remove_old()
+ print "Downloading missing ranks."
+ for ent in bib.entries:
+ getCite(ent['title'], cache=True, update=True)
+
diff --git a/silver.gif b/silver.gif
new file mode 100644
index 0000000..8a4ff29
--- /dev/null
+++ b/silver.gif
Binary files differ
diff --git a/upb.gif b/upb.gif
new file mode 100644
index 0000000..5852828
--- /dev/null
+++ b/upb.gif
Binary files differ
diff --git a/ups.gif b/ups.gif
new file mode 100644
index 0000000..36f0124
--- /dev/null
+++ b/ups.gif
Binary files differ
diff --git a/writeHTML.py b/writeHTML.py
index 3184ef0..934b46b 100755
--- a/writeHTML.py
+++ b/writeHTML.py
@@ -29,7 +29,7 @@ def pathLength(s):
s = parent
return n
-def writeBody(f, sections, section_urls, cache_path):
+def writeBody(f, sections, section_urls, cache_path, base_url):
'''f: an open file
sections: list of (sectionname, [list of BibTeXEntry])
section_urls: map from sectionname to external url'''
@@ -45,7 +45,7 @@ def writeBody(f, sections, section_urls, cache_path):
BibTeX.url_untranslate(s),sDisp))
print >>f, "<ul class='expand'>"
for e in entries:
- print >>f, e.to_html(cache_path=cache_path)
+ print >>f, e.to_html(cache_path=cache_path, base_url=base_url)
print >>f, "</ul></li>"
def writeHTML(f, sections, sectionType, fieldName, choices,
@@ -104,7 +104,8 @@ def writeHTML(f, sections, sectionType, fieldName, choices,
header, footer = getTemplate(config.TEMPLATE_FILE)
print >>f, header%fields
- writeBody(f, sections, section_urls, cache_path=cache_url_path)
+ writeBody(f, sections, section_urls, cache_path=cache_url_path,
+ base_url=root)
print >>f, footer%fields
def writePageSet(config, bib, tag):