aboutsummaryrefslogtreecommitdiffstats
path: root/reconcile.py
diff options
context:
space:
mode:
Diffstat (limited to 'reconcile.py')
-rw-r--r--reconcile.py255
1 files changed, 254 insertions, 1 deletions
diff --git a/reconcile.py b/reconcile.py
index 8d5063a..e580b48 100644
--- a/reconcile.py
+++ b/reconcile.py
@@ -1,2 +1,255 @@
-#!/usr/bin/python
+#!/usr/bin/python2
+import sys
+import re
+
+import BibTeX
+import config
+import metaphone
+
# Memo of cleaned string -> metaphone code, shared by soundsLike() and mphone().
_MPCACHE = {}
def soundsLike(s1, s2):
    """Return true if s1 and s2 have the same metaphone code.

    Each string is whitespace-normalized and encoded by mphone(), which
    also memoizes the code in _MPCACHE, so the caching logic lives in one
    place instead of being repeated here.
    """
    return mphone(s1) == mphone(s2)
+
def mphone(s):
    """Return the metaphone code of s, after whitespace normalization.

    The code is memoized in _MPCACHE, keyed by the cleaned string.
    """
    s = clean(s)
    try:
        return _MPCACHE[s]
    except KeyError:
        # Catch only the cache miss, so that unrelated failures (for
        # example KeyboardInterrupt) are not silently swallowed.
        m = _MPCACHE[s] = metaphone.metaphone(s)
        return m
+
def clean(s):
    """Collapse every run of whitespace in s to one space and trim the ends."""
    return re.sub(r'\s+', ' ', s).strip()
+
+class MasterBibTeX(BibTeX.BibTeX):
+ def __init__(self):
+ BibTeX.BibTeX.__init__(self)
+
+ def buildIndex(self):
+ self.byTitle = {}
+ for ent in self.entries:
+ for t in self._titleForms(ent['title']):
+ self.byTitle.setdefault(t, []).append(ent)
+
+ def _titleForms(self, title):
+ title = title.lower()
+ title = re.sub(r'\b(an|a|the|of)\b', "", title)
+ title = clean(title)
+ res = [ mphone(title) ]
+ if ':' in title:
+ for t in title.split(":"):
+ res.append(mphone(t.strip()))
+ #print "%r\n => %s" % (title,res)
+ return res
+
+ def _titlesAlike(self, t1, t2):
+ t1 = clean(t1)
+ t2 = clean(t2)
+ if t1 == t2:
+ return 2
+ tf1 = self._titleForms(t1)
+ tf2 = self._titleForms(t2)
+ for t in tf1:
+ if t in tf2: return 1
+ return 0
+
+ def _authorsAlike(self, a1, a2):
+ if not soundsLike(" ".join(a1.last)," ".join(a2.last)):
+ return 0
+
+ if (a1.first == a2.first and a1.von == a2.von
+ and a1.jr == a2.jr):
+ return 2
+
+
+ if soundsLike(" ".join(a1.first), " ".join(a2.first)):
+ return 1
+
+ if not a1.first or not a2.first:
+ return 1
+
+ if self._initialize(a1.first) == self._initialize(a2.first):
+ return 1
+
+ return 0
+
+ def _initialize(self, name):
+ n1 = name
+ name = " ".join(name).lower()
+ name = re.sub(r'([a-z])[a-z\.]*', r'\1', name)
+ name = clean(name)
+ return name
+
+ def _authorListsAlike(self, a1, a2):
+ if len(a1) != len(a2):
+ return 0
+ a1 = [ (a.last, a) for a in a1 ]
+ a2 = [ (a.last, a) for a in a2 ]
+ a1.sort()
+ a2.sort()
+ if len(a1) != len(a2):
+ return 0
+ r = 2
+ for (_, a1), (_, a2) in zip(a1,a2):
+ x = self._authorsAlike(a1,a2)
+ if not x:
+ return 0
+ elif x == 1:
+ r = 1
+ return r
+
+ def _entryDatesAlike(self, e1, e2):
+ try:
+ if clean(e1['year']) == clean(e2['year']):
+ return 2
+ else:
+ return 0
+ except KeyError:
+ return 1
+
+ def includes(self, ent, all=0):
+ title = ent['title']
+ candidates = []
+ for form in self._titleForms(title):
+ try:
+ candidates.extend(self.byTitle[form])
+ except KeyError:
+ pass
+ goodness = []
+ for knownEnt in candidates:
+ match = (self._entryDatesAlike(ent, knownEnt) *
+ self._titlesAlike(ent['title'], knownEnt['title']) *
+ self._authorListsAlike(ent.parsedAuthor,
+ knownEnt.parsedAuthor) )
+ if match:
+ goodness.append((match, knownEnt))
+ goodness.sort()
+ if all:
+ return goodness
+ if goodness:
+ return goodness[-1]
+ else:
+ return None, None
+
+ def demo(self):
+ for e in self.entries:
+ matches = self.includes(e, 1)
+ m2 = []
+ mids = []
+ for g,m in matches:
+ if id(m) not in mids:
+ mids.append(id(m))
+ m2.append((g,m))
+ matches = m2
+
+ if not matches:
+ print "No match for %s"%e.key
+ if matches[-1][1] is e:
+ print "%s matches for %s: OK."%(len(matches), e.key)
+ else:
+ print "%s matches for %s: %s is best!" %(len(matches), e.key,
+ matches[-1][1].key)
+ if len(matches) > 1:
+ for g, m in matches:
+ print "%%%% goodness", g
+ print m
+
+
# Set to 0 by emit() as soon as any emitted entry has errors.
all_ok = 1
def emit(f,ent):
    """Write ent to the file f, preceded by a comment line for each error.

    Errors are whatever ent._check() reports, plus a key collision with the
    global master database.  Any error clears the global all_ok flag.

    The 'note' field and the URL are also reconciled: if the entry has a URL
    but no note, the note becomes \\url{<URL>}; if the note starts with
    \\url{...} and the URL ends in a recognized suffix, the URL is copied
    into the corresponding 'www_<type>_url' field.
    """
    global all_ok

    errs = ent._check()
    if master.byKey.has_key(ent.key.strip().lower()):
        errs.append("ERROR: Key collision with master file")

    note = ent.get("note")
    if ent.getURL() and not note:
        ent['note'] = "\\url{%s}"%ent.getURL()
    elif note:
        m = re.match(r'\\url{(.*)}', note)
        if m:
            # group(1) is the bare URL.  group(0) would include the
            # surrounding \url{...}, which ends in "}" and so can never
            # match any of the suffixes below.
            url = m.group(1)
            tp = None
            for suffix, fieldType in ((".txt", "txt"),
                                      (".ps.gz", "ps_gz"),
                                      (".ps", "ps"),
                                      (".pdf", "pdf"),
                                      (".html", "html")):
                if url.endswith(suffix):
                    tp = fieldType
                    break
            if tp:
                ent['www_%s_url'%tp] = url

    if errs:
        all_ok = 0
    for e in errs:
        print >>f, "%%%%", e

    print >>f, ent.format(77, 4, v=1)
+
def emitKnown(f, ent, matches):
    """Write ent to f as a commented-out entry, preceded by a comment that
    lists the keys of the candidate matches.

    matches is a sequence of (score, entry) pairs; only the entries' keys
    are used.
    """
    candidateKeys = [known.key for _, known in matches]
    print >>f, "%% Candidates are:", ", ".join(candidateKeys)
    print >>f, "%%"
    # Prefix every line of the formatted entry with "%".
    formatted = ent.format(77)
    print >>f, "%" + formatted.replace("\n", "\n%")
+
+if len(sys.argv) != 2:
+ print "reconcile.py expects 1 argument"
+ sys.exit(1)
+
+print "========= Scanning master =========="
+master = MasterBibTeX()
+master = BibTeX.parseFile(config.MASTER_BIB, master)
+master.buildIndex()
+
+print "========= Scanning new file ========"
+try:
+ fn = sys.argv[1]
+ input = BibTeX.parseFile(fn)
+except BibTex.ParseError, e:
+ print "Error parsing %s: %s"%(fn,e)
+ sys.exit(1)
+
+f = open('tmp.bib', 'w')
+for e in input.entries:
+ if not (e.get('title') and e.get('author')):
+ print >>f, "%%\n%%%% Not enough information to search for a match: need title and author.\n%%"
+ emit(f, e)
+ continue
+
+ matches = master.includes(e, all=1)
+ if not matches:
+ print >>f, "%%\n%%%% This entry is probably new: No match found.\n%%"
+ emit(f, e)
+ else:
+ print >>f, "%%"
+ print >>f, "%%%% Possible match found for this entry; max goodness",\
+ matches[-1][0], "\n%%"
+ emitKnown(f, e, matches)
+
+if not all_ok:
+ print >>f, "\n\n\nErrors remain; not finished.\n"
+
+f.close()