diff options
| -rw-r--r-- | BibTeX.py | 14 | ||||
| -rw-r--r-- | Makefile | 1 | ||||
| -rw-r--r-- | config.py | 1 | ||||
| -rw-r--r-- | gold.gif | bin | 0 -> 540 bytes | |||
| -rw-r--r-- | rank.py | 128 | ||||
| -rw-r--r-- | silver.gif | bin | 0 -> 539 bytes | |||
| -rw-r--r-- | upb.gif | bin | 0 -> 555 bytes | |||
| -rw-r--r-- | ups.gif | bin | 0 -> 536 bytes | |||
| -rwxr-xr-x | writeHTML.py | 7 |
9 files changed, 146 insertions, 5 deletions
@@ -13,6 +13,8 @@ import os import config +import rank + __all__ = [ 'ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize', 'ParsedAuthor', 'FileIter', 'Parser', 'parseFile', 'splitBibTeXEntriesBy', 'sortBibTexEntriesBy', ] @@ -400,7 +402,7 @@ class BibTeXEntry: return errs def biblio_to_html(self): - """Return the HTML for the citatation portion of entry.""" + """Return the HTML for the citation portion of entry.""" if self.type == 'inproceedings': booktitle = self['booktitle'] bookurl = self.get('bookurl') @@ -496,7 +498,7 @@ class BibTeXEntry: "</span>") %bibtexurl) return htmlize("".join(res)) - def to_html(self, cache_path="./cache"): + def to_html(self, cache_path="./cache", base_url="."): """Return the HTML for this entry.""" imp = self.isImportant() draft = self.get('year') == 'forthcoming' @@ -507,6 +509,14 @@ class BibTeXEntry: else: res = ["<li><p class='entry'>"] + if imp or not draft: + # Add a picture of the rank + # Only if year is known or paper important! + r = rank.get_rank_html(self['title'], self.get('year'), + update=False, base_url=base_url) + if r is not None: + res.append(r) + res.append("<span class='title'><a name='%s'>%s</a></span>"%( url_untranslate(self.key),htmlize(self['title']))) @@ -9,6 +9,7 @@ clean: update: $(PYTHON) updateCache.py anonbib.cfg + $(PYTHON) rank.py anonbib.cfg test: $(PYTHON) test.py @@ -4,6 +4,7 @@ import re _KEYS = [ "ALL_TAGS", "ALPHABETIZE_AUTHOR_AS","AUTHOR_URLS","CACHE_DIR","CACHE_SECTIONS", + "CITE_CACHE_DIR", "COLLAPSE_AUTHORS", "DOWNLOAD_CONNECT_TIMEOUT","INITIAL_STRINGS", "MASTER_BIB", "NO_COLLAPSE_AUTHORS", "OMIT_ENTRIES", diff --git a/gold.gif b/gold.gif Binary files differnew file mode 100644 index 0000000..44505db --- /dev/null +++ b/gold.gif @@ -0,0 +1,128 @@ +# Make rankings of papers and authors for automatic classification of content hotness + +# Google Scholar address +# http://scholar.google.com/scholar?as_epq= + +# Take care of the caching setup +cache_expire = 60*60*24*30 # 30 days + +# Checks 
+import config +import os +import sys +from os.path import exists, isdir, join, getmtime +from os import listdir, remove + +def remove_old(): + # Remove all old cached files + filenames = listdir(cache_folder()) + from time import time + now = time() + for f in filenames: + pf = join(cache_folder(), f) + time_mt = getmtime(pf) + if now - time_mt > cache_expire: # 30 days + remove(pf) + +def cache_folder(): + r = join(config.OUTPUT_DIR, config.CITE_CACHE_DIR) + if not exists(r): + os.makedirs(r) + assert isdir(r) + return r + +import md5 +import re +from urllib2 import urlopen, build_opener +from datetime import date + +# A more handy hash +def md5h(s): + m = md5.new() + m.update(s) + return m.digest().encode('hex_codec') + +format_tested = 0 + +def getCite(title, cache=True, update=True): + global format_tested + if not format_tested and update: + format_tested = 1 + TestScholarFormat() + + # Do not assume that the title is clean + title = re.sub("\s+", " ", title) + title = re.sub("[^'a-zA-Z0-9\. \-\/:]", "", title) + title = re.sub("'\/", " ", title) + + # Make a custom user agent (so that we are not filtered by Google)! + opener = build_opener() + opener.addheaders = [('User-agent', 'Anon.Bib.0.1')] + + # We rely on google scholar to return the article with this exact title + gurl = "http://scholar.google.com/scholar?as_epq=%s&as_occt=title" + from urllib import quote + url = gurl % quote(title) + + # Access cache or network + if exists(join(cache_folder(), md5h(url))) and cache: + page = file(join(cache_folder(), md5h(url)),'r').read() + elif update: + print "Downloading rank for %r."%title + page = opener.open(url).read() + file(join(cache_folder(), md5h(url)),'w').write(page) + else: + return None + + # Check if it finds any articles + if len(re.findall("did not match any articles", page)) > 0: + return None + + # Kill all tags! 
+ cpage = re.sub("<[^>]*>", "", page) + + # Add up all citations + s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)]) + return s + +def get_rank_html(title, years=None, base_url=".", update=True): + s = getCite(title, update=update) + + # Paper cannot be found + if s is None: + return '' + + html = '' + + # Hotness + if s >= 50: + html += '<img src="%s/gold.gif" />' % base_url + elif s >= 5: + html += '<img src="%s/silver.gif" />' % base_url + + # Velocity + d = date.today().year - int(years) + if d >= 0: + if 2 < s / (d +1) < 10: + html += '<img src="%s/ups.gif" />' % base_url + if 10 <= s / (d +1): + html += '<img src="%s/upb.gif" />' % base_url + + return html + +def TestScholarFormat(): + # We need to ensure that Google Scholar does not change its page format under our feet + # Use some cases to check if all is good + assert(getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False) > 0) + assert(getCite("Mixes protected by Dragons and Pixies: an empirical study", False) == None) + +if __name__ == '__main__': + # First download the bibliography file. + import BibTeX + config.load(sys.argv[1]) + bib = BibTeX.parseFile(config.MASTER_BIB) + remove_old() + print "Downloading missing ranks." 
+ for ent in bib.entries: + getCite(ent['title'], cache=True, update=True) + diff --git a/silver.gif b/silver.gif Binary files differnew file mode 100644 index 0000000..8a4ff29 --- /dev/null +++ b/silver.gif diff --git a/writeHTML.py b/writeHTML.py index 3184ef0..934b46b 100755 --- a/writeHTML.py +++ b/writeHTML.py @@ -29,7 +29,7 @@ def pathLength(s): s = parent return n -def writeBody(f, sections, section_urls, cache_path): +def writeBody(f, sections, section_urls, cache_path, base_url): '''f: an open file sections: list of (sectionname, [list of BibTeXEntry]) section_urls: map from sectionname to external url''' @@ -45,7 +45,7 @@ def writeBody(f, sections, section_urls, cache_path): BibTeX.url_untranslate(s),sDisp)) print >>f, "<ul class='expand'>" for e in entries: - print >>f, e.to_html(cache_path=cache_path) + print >>f, e.to_html(cache_path=cache_path, base_url=base_url) print >>f, "</ul></li>" def writeHTML(f, sections, sectionType, fieldName, choices, @@ -104,7 +104,8 @@ def writeHTML(f, sections, sectionType, fieldName, choices, header, footer = getTemplate(config.TEMPLATE_FILE) print >>f, header%fields - writeBody(f, sections, section_urls, cache_path=cache_url_path) + writeBody(f, sections, section_urls, cache_path=cache_url_path, + base_url=root) print >>f, footer%fields def writePageSet(config, bib, tag): |
