gnunetbib

Bibliography (BibTeX, based on AnonBib)
Log | Files | Refs | README | LICENSE

reconcile.py (8091B)


      1 #!/usr/bin/python2
      2 # Copyright 2003-2008, Nick Mathewson.  See LICENSE for licensing info.
      3 
      4 """Code to determine which entries are new and which are old.
      5 
      6    To scan a new file, run "python reconcile.py anonbib.cfg new-file.bib".  This
      7    will generate a new bibtex file called 'tmp.bib', with all the new entries
      8    cleaned up a little, and all the duplicate entries commented out.
      9 """
     10 
     11 import sys
     12 import re
     13 
     14 assert sys.version_info[:3] >= (2,2,0)
     15 
     16 import BibTeX
     17 import config
     18 import metaphone
     19 
     20 _MPCACHE = {}
     21 def soundsLike(s1, s2):
     22     c = _MPCACHE
     23     s1 = clean(s1)
     24     s2 = clean(s2)
     25     try:
     26         m1 = c[s1]
     27     except KeyError:
     28         m1 = c[s1] = metaphone.metaphone(s1)
     29     try:
     30         m2 = c[s2]
     31     except KeyError:
     32         m2 = c[s2] = metaphone.metaphone(s2)
     33 
     34     return m1 == m2
     35 
     36 def mphone(s):
     37     c = _MPCACHE
     38     s = clean(s)
     39     try:
     40         return c[s]
     41     except:
     42         m = c[s] = metaphone.metaphone(s)
     43         return m
     44 
     45 def clean(s):
     46     s = re.sub(r'\s+', ' ', s)
     47     s = s.strip()
     48     return s
     49 
     50 class MasterBibTeX(BibTeX.BibTeX):
     51     def __init__(self):
     52         BibTeX.BibTeX.__init__(self)
     53 
     54     def buildIndex(self):
     55         self.byTitle = {}
     56         for ent in self.entries:
     57             for t in self._titleForms(ent['title']):
     58                 self.byTitle.setdefault(t, []).append(ent)
     59 
     60     def _titleForms(self, title):
     61         title = title.lower()
     62         title = re.sub(r'\b(an|a|the|of)\b', "", title)
     63         title = clean(title)
     64         res = [ mphone(title) ]
     65         if ':' in title:
     66             for t in title.split(":"):
     67                 res.append(mphone(t.strip()))
     68         #print "%r\n   => %s" % (title,res)
     69         return res
     70 
     71     def _titlesAlike(self, t1, t2):
     72         t1 = clean(t1)
     73         t2 = clean(t2)
     74         if t1 == t2:
     75             return 2
     76         tf1 = self._titleForms(t1)
     77         tf2 = self._titleForms(t2)
     78         for t in tf1:
     79             if t in tf2: return 1
     80         return 0
     81 
     82     def _authorsAlike(self, a1, a2):
     83         if not soundsLike(" ".join(a1.last)," ".join(a2.last)):
     84             return 0
     85 
     86         if (a1.first == a2.first and a1.von == a2.von
     87             and a1.jr == a2.jr):
     88             return 2
     89 
     90 
     91         if soundsLike(" ".join(a1.first), " ".join(a2.first)):
     92             return 1
     93 
     94         if not a1.first or not a2.first:
     95             return 1
     96 
     97         if self._initialize(a1.first) == self._initialize(a2.first):
     98             return 1
     99 
    100         return 0
    101 
    102     def _initialize(self, name):
    103         name = " ".join(name).lower()
    104         name = re.sub(r'([a-z])[a-z\.]*', r'\1', name)
    105         name = clean(name)
    106         return name
    107 
    108     def _authorListsAlike(self, a1, a2):
    109         if len(a1) != len(a2):
    110             return 0
    111         a1 = [ (a.last, a) for a in a1 ]
    112         a2 = [ (a.last, a) for a in a2 ]
    113         a1.sort()
    114         a2.sort()
    115         if len(a1) != len(a2):
    116             return 0
    117         r = 2
    118         for (_, a1), (_, a2) in zip(a1,a2):
    119             x = self._authorsAlike(a1,a2)
    120             if not x:
    121                 return 0
    122             elif x == 1:
    123                 r = 1
    124         return r
    125 
    126     def _entryDatesAlike(self, e1, e2):
    127         try:
    128             if clean(e1['year']) == clean(e2['year']):
    129                 return 2
    130             else:
    131                 return 0
    132         except KeyError:
    133             return 1
    134 
    135     def includes(self, ent, all=0):
    136         title = ent['title']
    137         candidates = []
    138         for form in self._titleForms(title):
    139             try:
    140                 candidates.extend(self.byTitle[form])
    141             except KeyError:
    142                 pass
    143         goodness = []
    144         for knownEnt in candidates:
    145             match = (self._entryDatesAlike(ent, knownEnt) *
    146                      self._titlesAlike(ent['title'], knownEnt['title']) *
    147                      self._authorListsAlike(ent.parsedAuthor,
    148                                             knownEnt.parsedAuthor) )
    149             if match:
    150                 goodness.append((match, knownEnt))
    151         goodness.sort()
    152         if all:
    153             return goodness
    154         if goodness:
    155             return goodness[-1]
    156         else:
    157             return None, None
    158 
    159     def demo(self):
    160         for e in self.entries:
    161             matches = self.includes(e, 1)
    162             m2 = []
    163             mids = []
    164             for g,m in matches:
    165                 if id(m) not in mids:
    166                     mids.append(id(m))
    167                     m2.append((g,m))
    168             matches = m2
    169 
    170             if not matches:
    171                 print "No match for %s"%e.key
    172             if matches[-1][1] is e:
    173                 print "%s matches for %s: OK."%(len(matches), e.key)
    174             else:
    175                 print "%s matches for %s: %s is best!" %(len(matches), e.key,
    176                                                          matches[-1][1].key)
    177             if len(matches) > 1:
    178                 for g, m in matches:
    179                     print "%%%% goodness", g
    180                     print m
    181 
    182 
    183 def noteToURL(note):
    184     " returns tp, url "
    185     note = note.replace("\n", " ")
    186     m = re.match(r'\s*(?:\\newline\s*)*\s*\\url{(.*)}\s*(?:\\newline\s*)*',
    187                  note)
    188     if not m:
    189         return None
    190     url = m.group(1)
    191     for suffix, tp in ((".html", "html"),
    192                        (".ps", "ps"),
    193                        (".ps.gz", "ps_gz"),
    194                        (".pdf", "pdf"),
    195                        (".txt", "txt")):
    196         if url.endswith(suffix):
    197             return tp,url
    198     return "???", url
    199 
    200 all_ok = 1
    201 def emit(f,ent):
    202     global all_ok
    203 
    204     errs = ent._check()
    205     if master.byKey.has_key(ent.key.strip().lower()):
    206         errs.append("ERROR: Key collision with master file")
    207 
    208     if errs:
    209         all_ok = 0
    210 
    211     note = ent.get("note")
    212     if ent.getURL() and not note:
    213         ent['note'] = "\url{%s}"%ent.getURL()
    214     elif note:
    215         m = re.match(r'\\url{(.*)}', note)
    216         if m:
    217             url = m.group(0)
    218             tp = None
    219             if url.endswith(".txt"):
    220                 tp = "txt"
    221             elif url.endswith(".ps.gz"):
    222                 tp = "ps_gz"
    223             elif url.endswith(".ps"):
    224                 tp = "ps_gz"
    225             elif url.endswith(".pdf"):
    226                 tp = "pdf"
    227             elif url.endswith(".html"):
    228                 tp = "html"
    229             if tp:
    230                 ent['www_%s_url'%tp] = url
    231 
    232     if errs:
    233         all_ok = 0
    234     for e in errs:
    235         print >>f, "%%%%", e
    236 
    237     print >>f, ent.format(77, 4, v=1, invStrings=invStrings)
    238 
    239 def emitKnown(f, ent, matches):
    240     print >>f, "%% Candidates are:", ", ".join([e.key for g,e in matches])
    241     print >>f, "%%"
    242     print >>f, "%"+(ent.format(77,4,1,invStrings).replace("\n", "\n%"))
    243 
    244 if __name__ == '__main__':
    245     if len(sys.argv) != 3:
    246         print "reconcile.py expects 2 arguments"
    247         sys.exit(1)
    248 
    249     config.load(sys.argv[1])
    250 
    251     print "========= Scanning master =========="
    252     master = MasterBibTeX()
    253     master = BibTeX.parseFile(config.MASTER_BIB, result=master)
    254     master.buildIndex()
    255 
    256     print "========= Scanning new file ========"
    257     try:
    258         fn = sys.argv[2]
    259         input = BibTeX.parseFile(fn)
    260     except BibTeX.ParseError, e:
    261         print "Error parsing %s: %s"%(fn,e)
    262         sys.exit(1)
    263 
    264     f = open('tmp.bib', 'w')
    265     keys = input.newStrings.keys()
    266     keys.sort()
    267     for k in keys:
    268         v = input.newStrings[k]
    269         print >>f, "@string{%s = {%s}}"%(k,v)
    270 
    271     invStrings = input.invStrings
    272 
    273     for e in input.entries:
    274         if not (e.get('title') and e.get('author')):
    275             print >>f, "%%\n%%%% Not enough information to search for a match: need title and author.\n%%"
    276             emit(f, e)
    277             continue
    278 
    279         matches = master.includes(e, all=1)
    280         if not matches:
    281             print >>f, "%%\n%%%% This entry is probably new: No match found.\n%%"
    282             emit(f, e)
    283         else:
    284             print >>f, "%%"
    285             print >>f, "%%%% Possible match found for this entry; max goodness",\
    286                   matches[-1][0], "\n%%"
    287             emitKnown(f, e, matches)
    288 
    289     if not all_ok:
    290         print >>f, "\n\n\nErrors remain; not finished.\n"
    291 
    292     f.close()