diff options
| author | Nick Mathewson <nickm@torproject.org> | 2003-05-20 18:42:35 +0000 |
|---|---|---|
| committer | Nick Mathewson <nickm@torproject.org> | 2003-05-20 18:42:35 +0000 |
| commit | 13eb0356e6a0d2246c22a54a66ccd8aee2eb6b4b (patch) | |
| tree | 4f4c741b45507cfd00c26f038fd797c695852381 | |
| parent | 9f0f80a518c4084cab4d5c5ffd2aac0e707ddcb6 (diff) | |
| download | anonbib-13eb0356e6a0d2246c22a54a66ccd8aee2eb6b4b.tar.gz | |
It feels very close to done. Committing for now.
svn:r13
| -rw-r--r-- | .cvsignore | 2 | ||||
| -rw-r--r-- | BibTeX.py | 109 | ||||
| -rw-r--r-- | css/pubs.css | 37 | ||||
| -rw-r--r-- | metaphone.py | 190 | ||||
| -rw-r--r-- | reconcile.py | 255 | ||||
| -rw-r--r-- | writeHTML.py | 25 |
6 files changed, 587 insertions, 31 deletions
@@ -2,3 +2,5 @@ *.pyo date.html topic.html +author.html +bibtex.html @@ -15,6 +15,10 @@ MONTHS = [ None, "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"] +WWW_FIELDS = [ 'www_section', 'www_important', 'www_remarks', + 'www_abstract_url', 'www_html_url', 'www_pdf_url', 'www_ps_url', + 'www_txt_url', 'www_ps_gz_url' ] + def url_untranslate(s): s = s.replace(" ", "+") s = re.sub(r'([%<>])', @@ -87,7 +91,10 @@ def splitSortedEntriesBy(entries, field): def sortEntriesBy(entries, field, default): tmp = [] for ent in entries: - tmp = [ (txtize(ent.get(field, default)), ent) for ent in entries ] + v = ent.get(field, default) + if v.startswith("<span class='bad'>"): + v = default + tmp.append((txtize(v), ent)) tmp.sort() return [ t[1] for t in tmp ] @@ -137,6 +144,8 @@ def sortEntriesByDate(entries): except KeyError: print "ERROR: No year field in %s"%ent.key date = 10000*13 + except ValueError: + date = 10000*13 tmp.append((date, ent)) tmp.sort() return [ t[1] for t in tmp ] @@ -161,6 +170,19 @@ class BibTeXEntry: self.entries[k] = v def __str__(self): return self.format(70,1) + def getURL(self): + best = None + for field in ['www_pdf_url', 'www_ps_gz_url', 'www_ps_url', + 'www_html_url', 'www_txt_url']: + u = self.get(field) + if u: + if not best: + best = u + elif (best.startswith("http://citeseer.nj.nec.com/") + and not u.startswith("http://citeseer.nj.nec.com/")): + best = u + return best + def format(self, width=70, indent=8, v=0): d = ["@%s{%s,\n" % (self.type, self.key)] if v: @@ -174,6 +196,10 @@ class BibTeXEntry: if not self.entries.has_key(f): continue v = self.entries[f] + if v.startswith("<span class='bad'>"): + d.append("%%%%% ERROR: Missing field\n") + d.append("%% %s = {?????},\n"%f) + continue d.append(" ") s = "%s = {%s},\n" % (f, v) d.append(_split(s,width,indent)) @@ -187,8 +213,22 @@ class BibTeXEntry: #print " => ",repr(self.parsedAuthor) else: self.parsedAuthor = None + + def isImportant(self): + imp = self.get("www_important") + if imp and imp.strip().lower() not in ("no", "false", "0"): + return 1 + return 0 + def check(self): - ok = 1 + errs = self._check() + for e in errs: + print e + return not errs + + + def _check(self): + errs = [] if self.type == 'inproceedings': fields = 'booktitle', 'year' elif self.type == 'article': @@ -197,17 +237,22 @@ class BibTeXEntry: fields = 'institution', 'number' elif self.type == 'misc': fields = 'howpublished', + elif self.type in ('mastersthesis', 'phdthesis'): + fields = () else: fields = () - fields += 'title', 'author' + errs.append("ERROR: odd type %s"%self.type) + fields += 'title', 'author', 'www_section', 'year' for field in fields: - if not self.get(field): - print "ERROR: %s has no %s field" % (self.key, field) + if self.get(field) is None or \ + self.get(field).startswith("<span class='bad'>"): + errs.append("ERROR: %s has no %s field" % (self.key, field)) self.entries[field] = "<span class='bad'>%s:??</span>"%field - ok = 0 - - return ok + for field in self.entries.keys(): + if field.startswith("www_") and field not in WWW_FIELDS: + errs.append("ERROR: unknown www field %s"% field) + return errs def biblio_to_html(self): if self.type == 'inproceedings': @@ -297,8 +342,19 @@ class BibTeXEntry: return htmlize("".join(res)) def to_html(self): - res = ["<li><p class='entry'><span class='title'>%s</span>"%( - htmlize(self['title']))] + imp = self.isImportant() + if imp: + res = ["<li><div class='impEntry'><p class='impEntry'>", + "<span class='title'>%s</span>"%(htmlize(self['title']))] + else: + res = ["<li><p class='entry'><span class='title'>%s</span>"%( + htmlize(self['title']))] + + #eclass = ["entry", "impEntry"][imp] + # + #res = ["<li><p class='%s'><span class='title'>%s</span>"%( + # eclass, htmlize(self['title']))] + availability = [] for key, name in (('www_abstract_url', 'abstract'), ('www_html_url', 'HTML'), @@ -331,7 +387,17 @@ class BibTeXEntry: res.append(".") res.append("</span><br>\n") res.append(self.biblio_to_html()) - res.append("</p></li>\n\n") + + res.append("</p>"), + + if self.get('www_remarks'): + res.append("<p class='remarks'>%s</span>"%htmlize( + self['www_remarks'])) + + if imp: + res.append("</div>") + res.append("</li>\n\n") + return "".join(res) RE_LONE_AMP = re.compile(r'&([^a-z0-9])') @@ -406,19 +472,20 @@ def _split(s,w=79,indent=8): first = 1 indentation = "" while len(s) > w: - for i in xrange(w-1, 0, -1): + for i in xrange(w-1, 20, -1): if s[i] == ' ': r.append(indentation+s[:i]) s = s[i+1:] break else: - r.append(indentation+s[:w]) - s = s[w:] + r.append(indentation+s.strip()) + s = "" if first: first = 0 w -= indent indentation = " "*indent - r.append(indentation+s) + if (s): + r.append(indentation+s) r.append("") return "\n".join(r) @@ -536,12 +603,14 @@ def split_von(f,v,l,x): del f[-1] class Parser: - def __init__(self, fileiter, initial_strings): + def __init__(self, fileiter, initial_strings, result=None): self.strings = config.INITIAL_STRINGS.copy() self.strings.update(initial_strings) self.fileiter = fileiter self.entries = {} - self.result = BibTeX() + if result is None: + result = BibTeX() + self.result = result self.litStringLine = 0 self.entryLine = 0 @@ -754,14 +823,14 @@ BRACE_CLOSE_RE = re.compile(r'^([^\{\}]*)\}(.*)') BRACE_OPEN_RE = re.compile(r'^([^\{\}]*\{)(.*)') RAW_DATA_RE = re.compile(r'^([^\s\},]+)(.*)') -def parseFile(filename): +def parseFile(filename, result=None): f = FileIter(fname=filename) - p = Parser(f, {}) + p = Parser(f, {}, result) r = p.parse() r.resolve() for e in r.entries: e.check() - return r + return r if __name__ == '__main__': import sys diff --git a/css/pubs.css b/css/pubs.css index 971616e..57b654f 100644 --- a/css/pubs.css +++ b/css/pubs.css @@ -32,6 +32,21 @@ SPAN.bad { background-color: #FDF; } +P.remarks { + font-family: serif; + margin-top: 0.3em; + margin-bottom: 0; + margin-left: 5em; + padding-left: 0.5em; + border-width: 0 0 0 5; + border-color: black; + border-style: solid; +} + +P.remarks A { + text-decoration: underline; +} + P.l1 { margin-left: 0.5em; } @@ -42,6 +57,28 @@ P.l2 { margin-bottom: 0.3em; } +P.entry { + margin-top: 0.7em; + margin-bottom: 0; +} + +DIV.impEntry { + border-width: 0.1; + border-color: black; + border-style: solid; + background-color: #FFE; + padding: 0.3em; + margin-top: 0.7em; + margin-bottom: 0; +} + +P.impEntry { + background-color: #FFE; + padding: 0; + margin-top: 0; + margin-bottom: 0; +} + TABLE.sidebar { border-width: 2; border-color: black; diff --git a/metaphone.py b/metaphone.py new file mode 100644 index 0000000..99ae068 --- /dev/null +++ b/metaphone.py @@ -0,0 +1,190 @@ +#!/usr/bin/python2 + +import string + +TRIPLES = { + 'dge': 'j', + 'dgi': 'j', + 'dgy': 'j', + 'sia': '+x', + 'sio': '+x', + 'tia': '+x', + 'tio': '+x', + 'tch': '', + 'tha': '0', + 'the': '0', + 'thi': '0', + 'tho': '0', + 'thu': '0', + } + +DOUBLES = { + 'ph' : 'f', + 'sh' : 'x' + } + +SINGLETONS = { + 'd': 't', + 'f': 'f', + 'j': 'j', + 'l': 'l', + 'm': 'm', + 'n': 'n', + 'r': 'r', + 'p': 'p', + 'q': 'k', + 'v': 'f', + 'x': 'ks', + 'z': 's', + + +} + +ALLCHARS = "".join(map(chr, range(256))) +NONLCCHARS = "".join([c for c in ALLCHARS if not c.islower()]) +def metaphone(s): + s = s.lower() + s = s.translate(ALLCHARS, NONLCCHARS) + + if not s: return "" + + # If ae, gn, kn, pn, wr then drop the first letter. + if s[:2] in ("ae", "gn", "kn", "pn", "wr"): + s = s[1:] + + # Change "x" to "s" + if s[0] == 'x': + x = "s%s" % s[1:] + + # Get rid of "h" in "wh". + if s[:2] == 'wh': + s = "w%s" % s[1:] + + # Get rid of s from end. + if s[-1] == 's': + s = s[:-1] + + result = [] + prevLtr = ' ' + vowelBefore = 0 + lastChar = len(s)-1 + for idx in range(len(s)): + curLtr = s[idx] + # If first char is a vowel, keep it. + if curLtr in "aeiou": + if idx == 0: + result.append(curLtr) + continue + + # Skip double letters. + if idx < lastChar: + if curLtr == s[idx+1]: + continue + + try: + r = TRIPLES[s[idx:idx+3]] + if r == "+x": + if idx > 1: + result.append("x") + continue + else: + result.append(r) + continue + except KeyError: + pass + try: + r = DOUBLES[s[idx:idx+2]] + result.append(r) + continue + except KeyError: + pass + try: + r = SINGLETONS[s[idx]] + result.append(r) + continue + except KeyError: + pass + + if idx > 0: + prevLtr = s[idx-1] + vowelBefore = prevLtr in "aeiou" + curLtr = s[idx] + + nextLtr2 = nextLtr3 = ' ' + if idx < lastChar: + nextLtr = s[idx+1] + vowelAfter = nextLtr in "aeiou" + frontvAfter = nextLtr in "eiy" + if idx+1 < lastChar: + nextLtr2 = s[idx+2] + if idx+2 < lastChar: + nextLtr3 = s[idx+3] + else: + nextLtr = ' ' + vowelAfter = frontvAfter = 0 + + + if curLtr == 'b': + if idx == lastChar and prevLtr == 'm': + pass + else: + result.append(curLtr) + elif curLtr == 'c': + # silent 'sci', 'sce, 'scy', 'sci', etc OK. + if not (prevLtr == 's' and frontvAfter): + if nextLtr in 'ia': + result.append("x") + elif frontvAfter: + result.append("s") + elif prevLtr == 's' and nextLtr == 'h': + result.append('k') + elif nextLtr == 'h': + if idx == 0 and nextLtr2 in "aeiou": + result.append('k') + else: + result.append('x') + elif prevLtr == 'c': + result.append('c') + else: + result.append('k') + elif curLtr == 'g': + if (idx < lastChar-1) and nextLtr == 'h': + pass + elif s[idx:] == 'gned': + pass + elif s[idx:] == 'gn': + pass + elif prevLtr == 'd' and frontvAfter: + pass + else: + hard = (prevLtr == 'g') + if frontvAfter and not hard: + result.append('j') + else: + result.append('k') + elif curLtr == 'h': + if prevLtr in 'csptg': + pass + elif vowelBefore and not vowelAfter: + pass + else: + result.append('h') + elif curLtr == 'k': + if prevLtr != 'c': result.append('k') + elif curLtr in 'wy': + if vowelAfter: + result.append(curLtr) + + return "".join(result) + +def demo(a): + print a, "=>", metaphone(a) + +print __name__ +if __name__ == '__main__': + demo("Nick. Mathewson") + + demo("joe schmidt") + demo("Beethoven") + + demo("Andrea Plaid") diff --git a/reconcile.py b/reconcile.py index 8d5063a..e580b48 100644 --- a/reconcile.py +++ b/reconcile.py @@ -1,2 +1,255 @@ -#!/usr/bin/python +#!/usr/bin/python2 +import sys +import re + +import BibTeX +import config +import metaphone + +_MPCACHE = {} +def soundsLike(s1, s2): + c = _MPCACHE + s1 = clean(s1) + s2 = clean(s2) + try: + m1 = c[s1] + except KeyError: + m1 = c[s1] = metaphone.metaphone(s1) + try: + m2 = c[s2] + except KeyError: + m2 = c[s2] = metaphone.metaphone(s2) + + return m1 == m2 + +def mphone(s): + c = _MPCACHE + s = clean(s) + try: + return c[s] + except: + m = c[s] = metaphone.metaphone(s) + return m + +def clean(s): + s = re.sub(r'\s+', ' ', s) + s = s.strip() + return s + +class MasterBibTeX(BibTeX.BibTeX): + def __init__(self): + BibTeX.BibTeX.__init__(self) + + def buildIndex(self): + self.byTitle = {} + for ent in self.entries: + for t in self._titleForms(ent['title']): + self.byTitle.setdefault(t, []).append(ent) + + def _titleForms(self, title): + title = title.lower() + title = re.sub(r'\b(an|a|the|of)\b', "", title) + title = clean(title) + res = [ mphone(title) ] + if ':' in title: + for t in title.split(":"): + res.append(mphone(t.strip())) + #print "%r\n => %s" % (title,res) + return res + + def _titlesAlike(self, t1, t2): + t1 = clean(t1) + t2 = clean(t2) + if t1 == t2: + return 2 + tf1 = self._titleForms(t1) + tf2 = self._titleForms(t2) + for t in tf1: + if t in tf2: return 1 + return 0 + + def _authorsAlike(self, a1, a2): + if not soundsLike(" ".join(a1.last)," ".join(a2.last)): + return 0 + + if (a1.first == a2.first and a1.von == a2.von + and a1.jr == a2.jr): + return 2 + + + if soundsLike(" ".join(a1.first), " ".join(a2.first)): + return 1 + + if not a1.first or not a2.first: + return 1 + + if self._initialize(a1.first) == self._initialize(a2.first): + return 1 + + return 0 + + def _initialize(self, name): + n1 = name + name = " ".join(name).lower() + name = re.sub(r'([a-z])[a-z\.]*', r'\1', name) + name = clean(name) + return name + + def _authorListsAlike(self, a1, a2): + if len(a1) != len(a2): + return 0 + a1 = [ (a.last, a) for a in a1 ] + a2 = [ (a.last, a) for a in a2 ] + a1.sort() + a2.sort() + if len(a1) != len(a2): + return 0 + r = 2 + for (_, a1), (_, a2) in zip(a1,a2): + x = self._authorsAlike(a1,a2) + if not x: + return 0 + elif x == 1: + r = 1 + return r + + def _entryDatesAlike(self, e1, e2): + try: + if clean(e1['year']) == clean(e2['year']): + return 2 + else: + return 0 + except KeyError: + return 1 + + def includes(self, ent, all=0): + title = ent['title'] + candidates = [] + for form in self._titleForms(title): + try: + candidates.extend(self.byTitle[form]) + except KeyError: + pass + goodness = [] + for knownEnt in candidates: + match = (self._entryDatesAlike(ent, knownEnt) * + self._titlesAlike(ent['title'], knownEnt['title']) * + self._authorListsAlike(ent.parsedAuthor, + knownEnt.parsedAuthor) ) + if match: + goodness.append((match, knownEnt)) + goodness.sort() + if all: + return goodness + if goodness: + return goodness[-1] + else: + return None, None + + def demo(self): + for e in self.entries: + matches = self.includes(e, 1) + m2 = [] + mids = [] + for g,m in matches: + if id(m) not in mids: + mids.append(id(m)) + m2.append((g,m)) + matches = m2 + + if not matches: + print "No match for %s"%e.key + if matches[-1][1] is e: + print "%s matches for %s: OK."%(len(matches), e.key) + else: + print "%s matches for %s: %s is best!" %(len(matches), e.key, + matches[-1][1].key) + if len(matches) > 1: + for g, m in matches: + print "%%%% goodness", g + print m + + +all_ok = 1 +def emit(f,ent): + global all_ok + + errs = ent._check() + if master.byKey.has_key(ent.key.strip().lower()): + errs.append("ERROR: Key collision with master file") + + if errs: + all_ok = 0 + + note = ent.get("note") + if ent.getURL() and not note: + ent['note'] = "\url{%s}"%ent.getURL() + elif note: + m = re.match(r'\\url{(.*)}', note) + if m: + url = m.group(0) + tp = None + if url.endswith(".txt"): + tp = "txt" + elif url.endswith(".ps.gz"): + tp = "ps_gz" + elif url.endswith(".ps"): + tp = "ps_gz" + elif url.endswith(".pdf"): + tp = "pdf" + elif url.endswith(".html"): + tp = "html" + if tp: + ent['www_%s_url'%tp] = url + + if errs: + all_ok = 0 + for e in errs: + print >>f, "%%%%", e + + print >>f, ent.format(77, 4, v=1) + +def emitKnown(f, ent, matches): + print >>f, "%% Candidates are:", ", ".join([e.key for g,e in matches]) + print >>f, "%%" + print >>f, "%"+(ent.format(77).replace("\n", "\n%")) + +if len(sys.argv) != 2: + print "reconcile.py expects 1 argument" + sys.exit(1) + +print "========= Scanning master ==========" +master = MasterBibTeX() +master = BibTeX.parseFile(config.MASTER_BIB, master) +master.buildIndex() + +print "========= Scanning new file ========" +try: + fn = sys.argv[1] + input = BibTeX.parseFile(fn) +except BibTex.ParseError, e: + print "Error parsing %s: %s"%(fn,e) + sys.exit(1) + +f = open('tmp.bib', 'w') +for e in input.entries: + if not (e.get('title') and e.get('author')): + print >>f, "%%\n%%%% Not enough information to search for a match: need title and author.\n%%" + emit(f, e) + continue + + matches = master.includes(e, all=1) + if not matches: + print >>f, "%%\n%%%% This entry is probably new: No match found.\n%%" + emit(f, e) + else: + print >>f, "%%" + print >>f, "%%%% Possible match found for this entry; max goodness",\ + matches[-1][0], "\n%%" + emitKnown(f, e, matches) + +if not all_ok: + print >>f, "\n\n\nErrors remain; not finished.\n" + +f.close() diff --git a/writeHTML.py b/writeHTML.py index 2ffd7c7..53babe9 100644 --- a/writeHTML.py +++ b/writeHTML.py @@ -1,6 +1,7 @@ -#!/usr/bin/python +#!/usr/bin/python2 import re +import os import BibTeX import config @@ -14,12 +15,13 @@ def getTemplate(name): def writeBody(f, sections, section_urls): '''f: an open file - sections: list of (sectionname, [list of BibTeXEntry])''' + sections: list of (sectionname, [list of BibTeXEntry]) + section_urls: map from sectionname to external url''' for s, entries in sections: u = section_urls.get(s) if u: print >>f, ('<h3><a name="%s"><a href="%s">%s</a></a></h3>'%( - u, BibTeX.url_untranslate(s),s)) + (BibTeX.url_untranslate(s), u, s))) else: print >>f, ('<h3><a name="%s">%s</a></h3>'%( BibTeX.url_untranslate(s),s)) @@ -70,14 +72,14 @@ bib = BibTeX.parseFile(config.MASTER_BIB) entries = BibTeX.sortEntriesBy(bib.entries, "www_section", "ZZZZZZZZZZZZZZZZZ") entries = BibTeX.splitSortedEntriesBy(entries, "www_section") -if entries[-1][0] is None: +if entries[-1][0].startswith("<span class='bad'>"): entries[-1] = ("Miscellaneous", entries[-1][1]) entries = [ (s, BibTeX.sortEntriesByAuthor(ents)) for s, ents in entries ] -f = open("topic.html", 'w') +f = open(os.path.join(config.OUTPUT_DIR,"topic.html"), 'w') writeHTML(f, entries, "Topics", "topic", (("By topic", None), ("By date", "./date.html"), @@ -94,13 +96,16 @@ if entries[-1][0] == None: sections = [ ent[0] for ent in entries ] first_year = int(entries[0][1][0]['year']) -last_year = int(entries[-1][1][0].get('year', - entries[-2][1][0]['year'])) +try: + last_year = int(entries[-1][1][0].get('year')) +except ValueError: + last_year = int(entries[-2][1][0].get('year')) + years = map(str, range(first_year, last_year+1)) if entries[-1][0] == 'Unknown': years.append("Unknown") -f = open("date.html", 'w') +f = open(os.path.join(config.OUTPUT_DIR,"date.html"), 'w') writeHTML(f, entries, "Years", "date", (("By topic", "./topic.html"), ("By date", None), @@ -111,7 +116,7 @@ f.close() ## By author entries, url_map = BibTeX.splitEntriesByAuthor(bib.entries) -f = open("author.html", 'w') +f = open(os.path.join(config.OUTPUT_DIR,"author.html"), 'w') writeHTML(f, entries, "Authors", "author", (("By topic", "./topic.html"), ("By date", "./date.html"), @@ -127,7 +132,7 @@ entries = [ (ent.key, ent) for ent in entries ] entries.sort() entries = [ ent[1] for ent in entries ] header,footer = getTemplate("_template_bibtex") -f = open("bibtex.html", 'w') +f = open(os.path.join(config.OUTPUT_DIR,"bibtex.html"), 'w') print >>f, header % { 'command_line' : "" } for ent in entries: print >>f, ( |
