Add a 'make suggest' target to see if google scholar knows any URLs we do not.

svn:r304
author: Nick Mathewson <nickm@torproject.org> 2008-11-24 20:00:06 +0000
committer: Nick Mathewson <nickm@torproject.org> 2008-11-24 20:00:06 +0000
commit: 5bfc263f67e8efa305a330a8928ce98c940b657f (patch)
tree: 59407bb03502d7df644e276d2709aebf1166a14f /rank.py
parent: 714263e8f1a59030f0c32adcbc37bde408f1af46 (diff)
download: anonbib-5bfc263f67e8efa305a330a8928ce98c940b657f.tar.gz
1 files changed, 50 insertions, 8 deletions
diff --git a/rank.py b/rank.py
index 3f7ed71..578cc3f 100644
--- a/rank.py
+++ b/rank.py
@@ -33,7 +33,7 @@ def cache_folder():
 
 import md5
 import re
-from urllib2 import urlopen, build_opener
+from urllib2 import urlopen, build_opener, quote
 from datetime import date
 
 # A more handy hash
@@ -44,7 +44,7 @@ def md5h(s):
 
 format_tested = 0
 
-def getCite(title, cache=True, update=True, save=True):
+def getPageForTitle(title, cache=True, update=True, save=True):
    #Returns (citation-count, scholar url) tuple, or (None,None)
    global format_tested
    if not format_tested and update:
@@ -56,15 +56,13 @@ def getCite(title, cache=True, update=True, save=True):
    title = re.sub("[^'a-zA-Z0-9\. \-\/:]", "", title)
    title = re.sub("'\/", " ", title)
 
-
    # We rely on google scholar to return the article with this exact title
    gurl = "http://scholar.google.com/scholar?as_epq=%s&as_occt=title"
-   from urllib import quote
    url = gurl % quote(title)
 
    # Access cache or network
    if exists(join(cache_folder(), md5h(url))) and cache:
-      page = file(join(cache_folder(), md5h(url)),'r').read()
+      return url, file(join(cache_folder(), md5h(url)),'r').read()
    elif update:
       print "Downloading rank for %r."%title
 
@@ -72,10 +70,21 @@ def getCite(title, cache=True, update=True, save=True):
       opener = build_opener()
       opener.addheaders = [('User-agent', 'Anon.Bib.0.1')]
 
-      page = opener.open(url).read()
-      if save: file(join(cache_folder(), md5h(url)),'w').write(page)
+      print "connecting..."
+      connection = opener.open(url)
+      print "reading"
+      page = connection.read()
+      print "done"
+      if save:
+         file(join(cache_folder(), md5h(url)),'w').write(page)
+      return url, page
    else:
-      return (None, None)
+      return url, None
+
+def getCite(title, cache=True, update=True, save=True):
+   url, page = getPageForTitle(title, cache=cache, update=update, save=save)
+   if not page:
+      return None
 
    # Check if it finds any articles
    if len(re.findall("did not match any articles", page)) > 0:
@@ -88,6 +97,13 @@ def getCite(title, cache=True, update=True, save=True):
    s = sum([int(x) for x in re.findall("Cited by ([0-9]*)", cpage)])
    return (s, url)
 
+def getPaperURLs(title, cache=True, update=True, save=True):
+   url, page = getPageForTitle(title, cache=cache, update=update, save=save)
+   if not page:
+      return []
+   pages = re.findall(r'\&\#x25ba\;.*class=fl href="([^"]*)"', page)
+   return pages
+
 def get_rank_html(title, years=None, base_url=".", update=True,
                   velocity=False):
    s,url = getCite(title, update=update)
@@ -120,18 +136,44 @@ def get_rank_html(title, years=None, base_url=".", update=True,
 def TestScholarFormat():
    # We need to ensure that Google Scholar does not change its page format under our feet
    # Use some cases to check if all is good
+   print "Checking google scholar formats..."
    assert(getCite("Stop-and-Go MIXes: Providing Probabilistic Anonymity in an Open System", False)[0] > 0)
    assert(getCite("Mixes protected by Dragons and Pixies: an empirical study", False, save=False)[0] == None)
 
+URLTYPES=[ "pdf", "ps", "txt", "ps_gz", "html" ]
+
 if __name__ == '__main__':
    # First download the bibliography file.
    import BibTeX
+   suggest = False
+   if sys.argv[1] == 'suggest':
+      suggest = True
+      del sys.argv[1]
+
    config.load(sys.argv[1])
    if config.CACHE_UMASK != None:
       os.umask(config.CACHE_UMASK)
    bib = BibTeX.parseFile(config.MASTER_BIB)
    remove_old()
+
    print "Downloading missing ranks."
    for ent in bib.entries:
       getCite(ent['title'], cache=True, update=True)
 
+   if suggest:
+      for ent in bib.entries:
+         haveOne = False
+         for utype in URLTYPES:
+            if ent.has_key("www_%s_url"%utype):
+               haveOne = True
+               break
+         if haveOne:
+            continue
+         print ent.key, "has no URLs given."
+         urls = [ u for u in getPaperURLs(ent['title'])
+                  if u.find("freehaven.net/anonbib") < 0 ]
+         for u in urls:
+            print "\t", u
+
+
+
author	Nick Mathewson <nickm@torproject.org>	2008-11-24 20:00:06 +0000
committer	Nick Mathewson <nickm@torproject.org>	2008-11-24 20:00:06 +0000
commit	5bfc263f67e8efa305a330a8928ce98c940b657f (patch)
tree	59407bb03502d7df644e276d2709aebf1166a14f /rank.py
parent	714263e8f1a59030f0c32adcbc37bde408f1af46 (diff)
download	anonbib-5bfc263f67e8efa305a330a8928ce98c940b657f.tar.gz