reconcile.py (8091B)
#!/usr/bin/python2
# Copyright 2003-2008, Nick Mathewson.  See LICENSE for licensing info.

"""Code to determine which entries are new and which are old.

   To scan a new file, run "python reconcile.py anonbib.cfg new-file.bib".  This
   will generate a new bibtex file called 'tmp.bib', with all the new entries
   cleaned up a little, and all the duplicate entries commented out.
"""

import sys
import re

assert sys.version_info[:3] >= (2,2,0)

import BibTeX
import config
import metaphone

# Maps a whitespace-normalized string to its metaphone encoding.
_MPCACHE = {}

def soundsLike(s1, s2):
    """Return true iff s1 and s2 have the same metaphone encoding (after
       whitespace normalization)."""
    return mphone(s1) == mphone(s2)

def mphone(s):
    """Return the metaphone encoding of the whitespace-normalized form of s,
       remembering the answer in _MPCACHE."""
    s = clean(s)
    try:
        return _MPCACHE[s]
    except KeyError:
        m = _MPCACHE[s] = metaphone.metaphone(s)
        return m

def clean(s):
    """Return s with every run of whitespace collapsed to a single space and
       with leading and trailing whitespace removed."""
    return re.sub(r'\s+', ' ', s).strip()

class MasterBibTeX(BibTeX.BibTeX):
    """A BibTeX.BibTeX database with a title index for finding entries that
       look like duplicates of a given entry.

       The _*Alike helpers all return a score: 0 for "different", 1 for
       "similar", 2 for "same".  includes() multiplies three such scores, so
       a single 0 rules a candidate out.
    """
    def __init__(self):
        BibTeX.BibTeX.__init__(self)

    def buildIndex(self):
        """Build self.byTitle: a map from each title form (see _titleForms)
           to the list of entries in self.entries having that form."""
        self.byTitle = {}
        for ent in self.entries:
            for t in self._titleForms(ent['title']):
                self.byTitle.setdefault(t, []).append(ent)

    def _titleForms(self, title):
        """Return a list of metaphone keys for title: one for the whole
           title (lowercased, with the words an/a/the/of removed), and, if
           the title contains a colon, one more for each colon-separated
           part."""
        title = title.lower()
        title = re.sub(r'\b(an|a|the|of)\b', "", title)
        title = clean(title)
        res = [ mphone(title) ]
        if ':' in title:
            for part in title.split(":"):
                res.append(mphone(part.strip()))
        return res

    def _titlesAlike(self, t1, t2):
        """Return 2 if the titles are equal after whitespace normalization,
           1 if they share any title form, and 0 otherwise."""
        t1 = clean(t1)
        t2 = clean(t2)
        if t1 == t2:
            return 2
        tf2 = self._titleForms(t2)
        for t in self._titleForms(t1):
            if t in tf2:
                return 1
        return 0

    def _authorsAlike(self, a1, a2):
        """Compare two parsed authors.  Each must have 'first', 'von', 'last'
           and 'jr' attributes; 'first' and 'last' are joined with spaces, so
           they must be sequences of strings.

           Return 0 if the last names sound different; 2 if first, von and jr
           are all equal; 1 if the first names sound alike, or either first
           name is empty, or the first names have the same initials; and 0
           otherwise.
        """
        if not soundsLike(" ".join(a1.last), " ".join(a2.last)):
            return 0

        if (a1.first == a2.first and a1.von == a2.von
            and a1.jr == a2.jr):
            return 2

        if soundsLike(" ".join(a1.first), " ".join(a2.first)):
            return 1

        # A missing first name is compatible with any first name.
        if not a1.first or not a2.first:
            return 1

        if self._initialize(a1.first) == self._initialize(a2.first):
            return 1

        return 0

    def _initialize(self, name):
        """Given a sequence of name parts, return their lowercased initials:
           every run made of a letter followed by letters and periods is
           replaced by that first letter."""
        name = " ".join(name).lower()
        name = re.sub(r'([a-z])[a-z\.]*', r'\1', name)
        return clean(name)

    def _authorListsAlike(self, a1, a2):
        """Compare two lists of parsed authors.  Return 0 if the lists differ
           in length or any pair of authors scores 0; 1 if any pair scores 1;
           and 2 if every pair scores 2."""
        if len(a1) != len(a2):
            return 0
        # Sort both lists by last name before pairing, so that authors given
        # in a different order can still line up.
        byLast1 = [ (a.last, a) for a in a1 ]
        byLast2 = [ (a.last, a) for a in a2 ]
        byLast1.sort()
        byLast2.sort()
        r = 2
        for (_, auth1), (_, auth2) in zip(byLast1, byLast2):
            x = self._authorsAlike(auth1, auth2)
            if not x:
                return 0
            elif x == 1:
                r = 1
        return r

    def _entryDatesAlike(self, e1, e2):
        """Return 2 if both entries have the same 'year' (ignoring
           whitespace differences), 0 if the years differ, and 1 if looking
           up 'year' raises KeyError for either entry."""
        try:
            if clean(e1['year']) == clean(e2['year']):
                return 2
            else:
                return 0
        except KeyError:
            return 1

    def includes(self, ent, all=0):
        """Look for entries in this database that resemble ent.  Requires
           that buildIndex() has been called.

           Candidates are the indexed entries sharing a title form with ent;
           each one's goodness is the product of its date, title and
           author-list scores, and candidates with goodness 0 are dropped.

           If 'all' is true, return a list of (goodness, entry) tuples sorted
           in ascending order.  Otherwise return the last (best) such tuple,
           or (None, None) if nothing matched.
        """
        title = ent['title']
        candidates = []
        for form in self._titleForms(title):
            candidates.extend(self.byTitle.get(form, []))
        goodness = []
        for knownEnt in candidates:
            match = (self._entryDatesAlike(ent, knownEnt) *
                     self._titlesAlike(title, knownEnt['title']) *
                     self._authorListsAlike(ent.parsedAuthor,
                                            knownEnt.parsedAuthor) )
            if match:
                goodness.append((match, knownEnt))
        goodness.sort()
        if all:
            return goodness
        if goodness:
            return goodness[-1]
        else:
            return None, None

    def demo(self):
        """Match every entry of this database against the database itself,
           and print what was found for each one."""
        for e in self.entries:
            # An entry can be found once per title form; report it only once.
            seen = {}
            matches = []
            for g, m in self.includes(e, 1):
                if id(m) not in seen:
                    seen[id(m)] = 1
                    matches.append((g, m))

            if not matches:
                print("No match for %s" % e.key)
                # Nothing more to report; matches[-1] below would fail.
                continue
            if matches[-1][1] is e:
                print("%s matches for %s: OK." % (len(matches), e.key))
            else:
                print("%s matches for %s: %s is best!"
                      % (len(matches), e.key, matches[-1][1].key))
            if len(matches) > 1:
                for g, m in matches:
                    print("%%%% goodness " + str(g))
                    print(m)


# Recognized URL suffixes, and the type name used for each.
_URL_SUFFIX_TYPES = ((".html", "html"),
                     (".ps", "ps"),
                     (".ps.gz", "ps_gz"),
                     (".pdf", "pdf"),
                     (".txt", "txt"))

def _urlType(url):
    """Return the type name for url's suffix, or None if it is not one of
       the suffixes in _URL_SUFFIX_TYPES."""
    for suffix, tp in _URL_SUFFIX_TYPES:
        if url.endswith(suffix):
            return tp
    return None

def noteToURL(note):
    """If note consists of a \\url{...} command, optionally surrounded by
       \\newline commands and whitespace, return a tuple (tp, url), where tp
       is the type name for the URL's suffix or "???" if the suffix is not
       recognized.  Return None if note does not have that form."""
    note = note.replace("\n", " ")
    m = re.match(r'\s*(?:\\newline\s*)*\s*\\url{(.*)}\s*(?:\\newline\s*)*',
                 note)
    if not m:
        return None
    url = m.group(1)
    tp = _urlType(url)
    if tp is None:
        tp = "???"
    return tp, url

# Cleared by emit() as soon as any emitted entry has errors.
all_ok = 1

def emit(f, ent):
    """Write the entry ent to the file f, preceded by one "%%%%" comment line
       for each problem found with it.  Clears the global all_ok if there
       are any problems.

       An entry with a URL but no note gets a \\url{} note; an entry whose
       note starts with \\url{...} naming a recognized file type gets the
       matching www_<type>_url field.

       Uses the module-level names 'master' and 'invStrings', which are set
       only when this file is run as a script.
    """
    global all_ok

    errs = ent._check()
    if master.byKey.has_key(ent.key.strip().lower()):
        errs.append("ERROR: Key collision with master file")

    note = ent.get("note")
    if ent.getURL() and not note:
        ent['note'] = "\\url{%s}"%ent.getURL()
    elif note:
        m = re.match(r'\\url{(.*)}', note)
        if m:
            # group(1) is the URL itself; group(0) would also include the
            # surrounding "\url{" and "}", so no suffix would ever match.
            url = m.group(1)
            tp = _urlType(url)
            if tp:
                ent['www_%s_url'%tp] = url

    if errs:
        all_ok = 0
    for e in errs:
        f.write("%%%% " + str(e) + "\n")

    f.write("%s\n" % (ent.format(77, 4, v=1, invStrings=invStrings),))

def emitKnown(f, ent, matches):
    """Write the entry ent to the file f with every line commented out,
       preceded by a comment listing the keys of the entries in matches (a
       list of (goodness, entry) tuples).

       Uses the module-level name 'invStrings', which is set only when this
       file is run as a script.
    """
    f.write("%% Candidates are: " +
            ", ".join([e.key for g, e in matches]) + "\n")
    f.write("%%\n")
    f.write("%" + ent.format(77,4,1,invStrings).replace("\n", "\n%") + "\n")

if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("reconcile.py expects 2 arguments")
        sys.exit(1)

    config.load(sys.argv[1])

    print("========= Scanning master ==========")
    master = MasterBibTeX()
    master = BibTeX.parseFile(config.MASTER_BIB, result=master)
    master.buildIndex()

    print("========= Scanning new file ========")
    fn = sys.argv[2]
    try:
        newBib = BibTeX.parseFile(fn)
    except BibTeX.ParseError:
        print("Error parsing %s: %s" % (fn, sys.exc_info()[1]))
        sys.exit(1)

    # emit() and emitKnown() read this as a module-level name.
    invStrings = newBib.invStrings

    f = open('tmp.bib', 'w')
    try:
        keys = newBib.newStrings.keys()
        keys.sort()
        for k in keys:
            f.write("@string{%s = {%s}}\n" % (k, newBib.newStrings[k]))

        for ent in newBib.entries:
            if not (ent.get('title') and ent.get('author')):
                f.write("%%\n%%%% Not enough information to search for a match: need title and author.\n%%\n")
                emit(f, ent)
                continue

            matches = master.includes(ent, all=1)
            if not matches:
                f.write("%%\n%%%% This entry is probably new: No match found.\n%%\n")
                emit(f, ent)
            else:
                f.write("%%\n")
                # The space before "\n" keeps tmp.bib byte-for-byte what the
                # comma-separated print statement used to produce.
                f.write("%%%% Possible match found for this entry; max goodness "
                        + str(matches[-1][0]) + " \n%%\n")
                emitKnown(f, ent, matches)

        if not all_ok:
            f.write("\n\n\nErrors remain; not finished.\n\n")
    finally:
        # Close tmp.bib even if an entry fails to format.
        f.close()