gnunetbib

Bibliography (BibTeX, based on AnonBib)
Log | Files | Refs | README | LICENSE

BibTeX.py (47966B)


      1 #!/usr/bin/python3
      2 # Copyright 2003-2008, Nick Mathewson.  See LICENSE for licensing info.
      3 # Copyright 2018, 2019 ng0 <ng0@n0.is>.
      4 
      5 """
      6 BibTeX.py -- parse and manipulate BibTeX files and entries.
      7 Based on perl code by Eddie Kohler; heavily modified.
      8 """
      9 
     10 
     11 from future.utils import raise_with_traceback
     12 from io import StringIO ## for Python 3
     13 import re
     14 import sys
     15 import os
     16 import copy
     17 import future
     18 
     19 import config
     20 import rank
     21 
     22 __all__ = ['ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize',
     23            'ParsedAuthor', 'FileIter', 'Parser', 'parseFile',
     24            'splitBibTeXEntriesBy', 'sortBibTexEntriesBy',]
     25 
     26 # List: must map from month number to month name.
     27 MONTHS = [None,
     28           "January", "February", "March", "April", "May", "June",
     29           "July", "August", "September", "October", "November", "December"]
     30 
     31 # Fields that we only care about for making web pages (BibTeX doesn't
     32 # recognize them.)
     33 WWW_FIELDS = ['www_section', 'www_important', 'www_remarks',
     34               'www_abstract_url', 'www_html_url', 'www_pdf_url',
     35               'www_ps_url', 'www_txt_url', 'www_ps_gz_url',
     36               'www_amazon_url', 'www_excerpt_url', 'www_publisher_url',
     37               'www_cache_section', 'www_tags']
     38 
     39 def url_untranslate(s):
     40     """
     41     Change a BibTeX key into a string suitable for use in a URL.
     42     """
     43     s = re.sub(r'([%<>`#, &_\';])',
     44                lambda m: "_%02x"%ord(m.group(1)),
     45                s)
     46     s = s.replace("/", ":")
     47     return s
     48 
     49 class ParseError(Exception):
     50     """
     51     Raised on invalid BibTeX
     52     """
     53     pass
     54 
     55 
     56 def smartJoin(*lst):
     57     """
     58     Equivalent to os.path.join, but handle"." and ".."
     59     entries a bit better.
     60     """
     61     lst = [item for item in lst if item != "."]
     62     idx = 0
     63     while idx < len(lst):
     64         if idx > 0 and lst[idx] == "..":
     65             del lst[idx]
     66         else:
     67             idx += 1
     68     return os.path.join(*lst)
     69 
     70 class BibTeX:
     71     """
     72     A parsed BibTeX file
     73     """
     74     def __init__(self):
     75         self.entries = [] # List of BibTeXEntry
     76         self.byKey = {} # Map from BibTeX key to BibTeX entry.
     77     def addEntry(self, ent):
     78         """Add a BibTeX entry to this file."""
     79         k = ent.key
     80         if self.byKey.get(ent.key.lower()):
     81             print("Already have an entry named %s"%k, file=sys.stderr)
     82             return
     83         self.entries.append(ent)
     84         self.byKey[ent.key.lower()] = ent
     85     def resolve(self):
     86         """
     87         Validate all entries in this file, and resolve cross-references
     88         """
     89         seen = {}
     90         for ent in self.entries:
     91             seen.clear()
     92             while ent.get('crossref'):
     93                 try:
     94                     cr = self.byKey[ent['crossref'].lower()]
     95                 except KeyError:
     96                     print("No such crossref: %s"% ent['crossref'])
     97                     break
     98                 if seen.get(cr.key):
     99                     #raise ParseError("Circular crossref at %s" % ent.key)
    100                     raise_with_traceback(ParseError("Circular crossref at %s" % ent.key))
    101                 seen[cr.key] = 1
    102                 del ent.entries['crossref']
    103 
    104                 if cr.entryLine < ent.entryLine:
    105                     print("Warning: crossref %s used after declaration"%cr.key)
    106 
    107                 for k in list(cr.entries.keys()):
    108                     if k in ent.entries:
    109                         print("ERROR: %s defined both in %s and in %s"
    110                               %(k, ent.key, cr.key))
    111                     else:
    112                         ent.entries[k] = cr.entries[k]
    113 
    114             ent.resolve()
    115         newEntries = []
    116         rk = config.REQUIRE_KEY
    117         if rk is None:
    118             # hack: if no key is required, require "title", since every
    119             # entry will have a title.
    120             rk = "title"
    121         print("rk is " + rk)
    122         for ent in self.entries:
    123             if ent.type in config.OMIT_ENTRIES or rk not in ent.entries.keys():
    124                 ent.check()
    125                 del self.byKey[ent.key.lower()]
    126             else:
    127                 newEntries.append(ent)
    128         self.entries = newEntries
    129 
    130 def buildAuthorTable(entries):
    131     """
    132     Given a list of BibTeXEntry, return a map from parsed author name to
    133     parsed canonical name.
    134     """
    135     authorsByLast = {}
    136     for e in entries:
    137         for a in e.parsedAuthor:
    138             authorsByLast.setdefault(tuple(a.last), []).append(a)
    139     # map from author to collapsed author.
    140     result = {}
    141     for k, v in list(config.COLLAPSE_AUTHORS.items()):
    142         a = parseAuthor(k)[0]
    143         c = parseAuthor(v)[0]
    144         result[c] = c
    145         result[a] = c
    146 
    147     for e in entries:
    148         for author in e.parsedAuthor:
    149             if author in result:
    150                 continue
    151 
    152             c = author
    153             for a in authorsByLast[tuple(author.last)]:
    154                 if a is author:
    155                     continue
    156                 c = c.collapsesTo(a)
    157             result[author] = c
    158 
    159     if 0:
    160         for a, c in list(result.items()):
    161             if a != c:
    162                 print("Collapsing authors: %s => %s" % (a, c))
    163     if 0:
    164         print(parseAuthor("Franz Kaashoek")[0].collapsesTo(
    165             parseAuthor("M. Franz Kaashoek")[0]))
    166         print(parseAuthor("Paul F. Syverson")[0].collapsesTo(
    167             parseAuthor("Paul Syverson")[0]))
    168         print(parseAuthor("Paul Syverson")[0].collapsesTo(
    169             parseAuthor("Paul F. Syverson")[0]))
    170 
    171     return result
    172 
    173 def splitEntriesBy(entries, field):
    174     """
    175     Take a list of BibTeX entries and the name of a bibtex field; return
    176     a map from vield value to list of entry.
    177     """
    178     result = {}
    179     for ent in entries:
    180         key = ent.get(field)
    181         if field in config.MULTI_VAL_FIELDS:
    182             key = [k.strip() for k in key.split(',')]
    183         else:
    184             key = [key]
    185         for k in key:
    186             try:
    187                 result[k].append(ent)
    188             except:
    189                 result[k] = [ent]
    190     return result
    191 
    192 def splitSortedEntriesBy(entries, field):
    193     """
    194     Take inputs as in splitEntriesBy, where 'entries' is sorted by 'field'.
    195     Return a list of (field-value, entry-list) tuples, in the order
    196     given in 'entries'.
    197     """
    198     result = []
    199     curVal = "alskjdsakldj"
    200     curList = []
    201     for ent in entries:
    202         key = ent.get(field)
    203         if key == curVal:
    204             curList.append(ent)
    205         else:
    206             curVal = key
    207             curList = [ent]
    208             result.append((curVal, curList))
    209     return result
    210 
    211 def sortEntriesBy(entries, field, default):
    212     """
    213     Take inputs as in splitEntriesBy, and return a list of entries sorted
    214     by the value of 'field'. Entries without 'field' are sorted as if their
    215     value were 'default'.
    216     """
    217     tmp = []
    218     i = 0
    219     for ent in entries:
    220         i += 1
    221         v = ent.get(field, default)
    222         if v.startswith("<span class='bad'>"):
    223             v = default
    224         if field in config.MULTI_VAL_FIELDS:
    225             for v_j in v.split(','):
    226                 ent_j = copy.deepcopy(ent)
    227                 ent_j.__setitem__(field, v_j.strip())
    228                 tmp.append((txtize(v_j.strip()), i, ent_j))
    229         else: tmp.append((txtize(v), i, ent))
    230     tmp.sort()
    231     return [t[2] for t in tmp]
    232 
    233 def splitEntriesByAuthor(entries):
    234     """
    235     Take a list of entries, sort them by author names, and return:
    236     a sorted list of (authorname-in-html, bibtex-entry-list) tuples,
    237     a map from authorname-in-html to name-for-url.
    238     Entries with multiple authors appear once per author.
    239     """
    240     collapsedAuthors = buildAuthorTable(entries)
    241     entries = sortEntriesByDate(entries)
    242     result = {} # Name in sorting order -> entries
    243     htmlResult = {} # name in sorting order -> Full name
    244     url_map = {} # Full name -> Url
    245     for ent in entries:
    246         for a in ent.parsedAuthor:
    247             canonical = collapsedAuthors[a]
    248             url = canonical.getHomepage()
    249             sortkey = canonical.getSortingName()
    250             secname = canonical.getSectionName()
    251             if url:
    252                 url_map[secname] = url
    253 
    254             htmlResult[sortkey] = secname
    255             result.setdefault(sortkey, []).append(ent)
    256     sortnames = list(result.keys())
    257     sortnames.sort()
    258     sections = [(htmlResult[n], result[n]) for n in sortnames]
    259     return sections, url_map
    260 
    261 ## def sortEntriesByAuthor(entries):
    262 ##     tmp = []
    263 ##     i = 0
    264 ##     for ent in entries:
    265 ##         i += 1
    266 ##         authors = [ txtize(" ".join(a.von+a.last+a.first+a.jr))
    267 ##                     for a in ent.parsedAuthor ]
    268 ##         tmp.append((tuple(authors), i, ent))
    269 ##     tmp.sort()
    270 ##     return [ t[2] for t in tmp ]
    271 
    272 def sortEntriesByDate(entries):
    273     """
    274     Sort a list of entries by their publication date.
    275     """
    276     tmp = []
    277     i = 0
    278     for ent in entries:
    279         i += 1
    280         if (ent.get('month') == "forthcoming" or ent.get('year') == "forthcoming"):
    281             tmp.append((20000*13, i, ent))
    282             continue
    283         try:
    284             monthname = ent.get("month")
    285             if monthname is not None:
    286                 match = re.match(r"(\w+)--\w+", monthname)
    287                 if match:
    288                     monthname = match.group(1)
    289             mon = MONTHS.index(monthname)
    290         except ValueError:
    291             print("Unknown month %r in %s"%(ent.get("month"), ent.key))
    292             mon = 0
    293 
    294         try:
    295             date = int(ent['year'])*13 + mon
    296         except KeyError:
    297             print("ERROR: No year field in %s"%ent.key)
    298             date = 10000*13
    299         except ValueError:
    300             date = 10000*13
    301         tmp.append((date, i, ent))
    302     tmp.sort(reverse=True)
    303     return [t[2] for t in tmp]
    304 
    305 
    306 # List of fields that appear when we display the entries as BibTeX.
    307 DISPLAYED_FIELDS = ['title', 'author', 'journal', 'booktitle',
    308                     'school', 'institution', 'organization', 'volume',
    309                     'number', 'year', 'month', 'address', 'location',
    310                     'chapter', 'edition', 'pages', 'editor',
    311                     'howpublished', 'key', 'publisher', 'type',
    312                     'note', 'series']
    313 
    314 class BibTeXEntry:
    315     """
    316     A single BibTeX entry.
    317     """
    318 
    319     def __init__(self, type, key, entries):
    320         self.type = type  # Kind of entry: @book, @injournal,etc
    321         self.key = key # What key does it have?
    322         self.entries = entries # Map from key to value.
    323         self.entryLine = 0 # Defined on this line number
    324 
    325     def get(self, k, v=None):
    326         return self.entries.get(k, v)
    327 
    328     def has_key(self, k):
    329         return k in self.entries
    330 
    331     def __getitem__(self, k):
    332         return self.entries[k]
    333 
    334     def __setitem__(self, k, v):
    335         self.entries[k] = v
    336 
    337     def __str__(self):
    338         return self.format(70, 1)
    339 
    340     def getURL(self):
    341         """Return the best URL to use for this paper, or None."""
    342         best = None
    343         for field in ['www_pdf_url', 'www_ps_gz_url', 'www_ps_url',
    344                       'www_html_url', 'www_txt_url',]:
    345             u = self.get(field)
    346             if u:
    347                 if not best:
    348                     best = u
    349                 elif (best.startswith("http://citeseer.nj.nec.com/")
    350                       and not u.startswith("http://citeseer.nj.nec.com/")):
    351                     best = u
    352         return best
    353 
    354     def format(self, width=70, indent=8, v=0, invStrings={}):
    355         """
    356         Format this entry as BibTeX.
    357         """
    358         d = ["@%s{%s,\n" % (self.type, self.key)]
    359         if v:
    360             df = DISPLAYED_FIELDS[:]
    361             for k in list(self.entries.keys()):
    362                 if k not in df:
    363                     df.append(k)
    364         else:
    365             df = DISPLAYED_FIELDS
    366         for f in df:
    367             if f not in self.entries:
    368                 continue
    369             v = self.entries[f]
    370             if v.startswith("<span class='bad'>"):
    371                 d.append("%%%%% ERROR: Missing field\n")
    372                 d.append("%% %s = {?????},\n"%f)
    373                 continue
    374             np = v.translate(str.maketrans(ALLCHARS, ALLCHARS, PRINTINGCHARS))
    375             if np:
    376                 d.append("%%%%% "+("ERROR: Non-ASCII characters: '%r'\n"%np))
    377             d.append("  ")
    378             v = v.replace("&", "&amp;")
    379             if v in invStrings:
    380                 s = "%s = %s,\n" %(f, invStrings[v])
    381             else:
    382                 s = "%s = {%s},\n" % (f, v)
    383             d.append(_split(s, width, indent))
    384         d.append("}\n")
    385         return "".join(d)
    386     def resolve(self):
    387         """
    388         Handle post-processing for this entry
    389         """
    390         a = self.get('author')
    391         if a:
    392             self.parsedAuthor = parseAuthor(a)
    393             #print(a)
    394             #print("   => ",repr(self.parsedAuthor))
    395         else:
    396             self.parsedAuthor = None
    397 
    398     def isImportant(self):
    399         """
    400         Return 1 iff this entry is marked as important
    401         """
    402         imp = self.get("www_important")
    403         if imp and imp.strip().lower() not in ("no", "false", "0"):
    404             return 1
    405         return 0
    406 
    407     def check(self):
    408         """
    409         Print any errors for this entry, and return true if there were
    410         none.
    411         """
    412         errs = self._check()
    413         for e in errs:
    414             print(e)
    415         return not errs
    416 
    417     # FIXME: Here's some fields repeated after you enter the
    418     # if self.type != 'proceedings'
    419     # conditional body.
    420     # Besides the official BibTeX resources, this is a good
    421     # reference point: https://verbosus.com/bibtex-style-examples.html
    422     def _check(self):
    423         """
    424         The message 'in record %s' relates to the entire bibtex record,
    425         giving a means to locate by searching.
    426         FIXME: Really print the line in the '.bib' file.
    427         """
    428         errs = []
    429         if self.type == 'inproceedings':
    430             fields = 'booktitle', 'year'
    431         elif self.type == 'incollection':
    432             fields = 'booktitle', 'year'
    433         elif self.type == 'proceedings':
    434             fields = 'booktitle', 'editor'
    435         elif self.type == 'article':
    436             fields = 'journal', 'year'
    437         elif self.type == 'book':
    438             fields = 'title', 'year', 'publisher'
    439         elif self.type == 'booklet':
    440             fields = 'title', 'year'
    441         elif self.type == 'techreport':
    442             fields = 'institution',
    443         elif self.type == 'misc':
    444             fields = 'howpublished',
    445         elif self.type == 'conference':
    446             fields = 'booktitle', 'year'
    447         elif self.type in ('mastersthesis', 'phdthesis'):
    448             fields = ()
    449         else:
    450             fields = ()
    451             errs.append("ERROR (record %s):\t odd type %s"
    452                         % (self.entryLine, self.type))
    453         if self.type != ('proceedings' or 'conference'):
    454             fields += 'title', 'author', 'www_section', 'year'
    455 
    456         for field in fields:
    457             if self.get(field) is None or \
    458                    self.get(field).startswith("<span class='bad'>"):
    459                 errs.append("ERROR (record %s):\t %s field"
    460                             "\tnot found in\t %s"
    461                             % (self.entryLine, field, self.key))
    462                 self.entries[field] = "<span class='bad'>%s:??</span>"%field
    463 
    464         if self.type == 'inproceedings':
    465             if self.get("booktitle"):
    466                 if not self['booktitle'].startswith("Proceedings of") and \
    467                    not self['booktitle'].startswith("{Proceedings of"):
    468                     errs.append("ERROR (record %s):\t %s's booktitle "
    469                                 "(%r) doesn't start with 'Proceedings of'"
    470                                 % (self.entryLine, selfself.key, self['booktitle']))
    471 
    472         if "pages" in self.entries.keys() and not re.search(r'\d+--\d+', self.entries['pages']):
    473             errs.append("ERROR (record %s):\t Misformed pages in %s"
    474                         % (self.entryLine, self.key))
    475 
    476         if self.type == 'proceedings':
    477             if self.get('title'):
    478                 errs.append("ERROR (record %s):\t %s is a proceedings: "
    479                             "it should have a booktitle, not a title."
    480                             % (self.entryLine, self.key))
    481 
    482         for field, value in list(self.entries.items()):
    483             if value.translate(str.maketrans(ALLCHARS, ALLCHARS, PRINTINGCHARS)):
    484                 errs.append("ERROR (record %s):\t %s.%s "
    485                             "has non-ASCII characters"
    486                             % (self.entryLine, self.key, field))
    487             if field.startswith("www_") and field not in WWW_FIELDS:
    488                 errs.append("ERROR (record %s):\t unknown "
    489                             "www field %s"
    490                             % (self.entryLine, field))
    491             if value.strip()[-1:] == '.' and \
    492                 field not in ("notes", "www_remarks", "author"):
    493                 errs.append("ERROR (record %s):\t %s.%s "
    494                             "has an extraneous period"
    495                             % (self.entryLine, self.key, field))
    496         return errs
    497 
    498     def biblio_to_html(self):
    499         """
    500         Return the HTML for the citation portion of entry.
    501         """
    502         if self.type in ('inproceedings', 'incollection'):
    503             booktitle = self['booktitle']
    504             bookurl = self.get('bookurl')
    505             if bookurl:
    506                 m = PROCEEDINGS_RE.match(booktitle)
    507                 if m:
    508                     res = ["In the ", m.group(1),
    509                            '<a href="%s">'%bookurl, m.group(2), "</a>"]
    510                 else:
    511                     res = ['In the <a href="%s">%s</a>'% (bookurl, booktitle)]
    512             else:
    513                 res = ["In the ", booktitle]
    514 
    515             if self.get("edition"):
    516                 res.append(",")
    517                 res.append(self['edition'])
    518             if self.get("location"):
    519                 res.append(", ")
    520                 res.append(self['location'])
    521             elif self.get("address"):
    522                 res.append(", ")
    523                 res.append(self['address'])
    524             res.append(", %s %s" % (self.get('month', ""), self['year']))
    525             if not self.get('pages'):
    526                 pass
    527             elif "-" in self['pages']:
    528                 res.append(", pages&nbsp;%s"%self['pages'])
    529             else:
    530                 res.append(", page&nbsp;%s"%self['pages'])
    531         elif self.type == 'article':
    532             res = ["In "]
    533             if self.get('journalurl'):
    534                 res.append('<a href="%s">%s</a>'%
    535                            (self['journalurl'], self['journal']))
    536             else:
    537                 res.append(self['journal'])
    538             if self.get('volume'):
    539                 res.append(" <b>%s</b>"%self['volume'])
    540             if self.get('number'):
    541                 res.append("(%s)"%self['number'])
    542             res.append(", %s %s" % (self.get('month', ""), self['year']))
    543             if not self.get('pages'):
    544                 pass
    545             elif "-" in self['pages']:
    546                 res.append(", pages&nbsp;%s"%self['pages'])
    547             else:
    548                 res.append(", page&nbsp;%s"%self['pages'])
    549         elif self.type == 'techreport':
    550             res = ["%s %s %s" % (self['institution'],
    551                                  self.get('type', 'technical report'),
    552                                  self.get('number', ""))]
    553             if self.get('month') or self.get('year'):
    554                 res.append(", %s %s" % (self.get('month', ''),
    555                                         self.get('year', '')))
    556         # FIXME: less clauses.
    557         elif self.type == 'mastersthesis' or self.type == 'phdthesis':
    558             if self.get('type'):
    559                 if self.type == 'mastersthesis' or\
    560                    self.type == 'Master' or\
    561                    self.type == 'Masters' or\
    562                    self.type == 'Master\'s':
    563                     res = ["Master thesis"]
    564                 if self.type == 'Bachelor' or\
    565                    self.type == 'Bachelors':
    566                     res = ["Bachelor thesis"]
    567                 res = [self['type']]
    568             else:
    569                 res = ["Ph.D. thesis"]
    570             if self.get('school'):
    571                 res.append(", %s"%(self['school']))
    572             if self.get('month') or self.get('year'):
    573                 res.append(", %s %s" % (self.get('month', ''),
    574                                         self.get('year', '')))
    575         # elif self.type == 'book':
    576         #     res = [self['publisher']]
    577         #     if self.get('year'):
    578         #         res.append(" ")
    579         #         res.append(self.get('year'))
    580         #         # res.append(", %s"%(self.get('year')))
    581         #     if self.get('series'):
    582         #         res.append(",")
    583         #         res.append(self['series'])
    584         # elif self.type == 'booklet':
    585         #     # res = self.get('publisher')
    586         #     res = [self['publisher']]
    587         #     if self.get('year'):
    588         #         res.append(" ")
    589         #         res.append(self.get('year'))
    590         elif self.type == 'misc':
    591             res = [self['howpublished']]
    592             if self.get('month') or self.get('year'):
    593                 res.append(", %s %s" % (self.get('month', ''),
    594                                         self.get('year', '')))
    595             if not self.get('pages'):
    596                 pass
    597             elif "-" in self['pages']:
    598                 res.append(", pages&nbsp;%s"%self['pages'])
    599             else:
    600                 res.append(", page&nbsp;%s"%self['pages'])
    601         else:
    602             res = ["&lt;Odd type %s&gt;"%self.type]
    603 
    604         res[0:0] = ["<span class='biblio'>"]
    605         res.append(".</span>")
    606 
    607         bibtexurl = "./bibtex.html#%s"%url_untranslate(self.key)
    608         res.append((" <span class='availability'>"
    609                     "(<a href='%s'>BibTeX&nbsp;entry</a>)"
    610                     "</span>") %bibtexurl)
    611 
    612         # Produce the link to the dot bib file 'record.bib' for the
    613         # current record, assuming it is relative from FILE in
    614         # directory "../bib/".
    615         # FIXME: move everything into 'out', allowing to build
    616         # a website with relative links and more structure.
    617         bibtexurl_file = "./bib/%s" %url_untranslate(self.key)
    618         res.append((" <span class='availability'>"
    619                     "(<a href='%s/record.bib'>Download&nbsp;bibtex&nbsp;record</a>)"
    620                     "</span>")
    621                    %bibtexurl_file)
    622         return htmlize("".join(res))
    623 
    624     def to_html(self, cache_path="./cache", base_url="html"):
    625         """
    626         Return the HTML for this entry.
    627         """
    628         imp = self.isImportant()
    629         draft = self.get('year') == 'forthcoming'
    630         if imp:
    631             res = ["<li><div class='impEntry'><p class='impEntry'>"]
    632         elif draft:
    633             res = ["<li><div class='draftEntry'><p class='draftEntry'>"]
    634         else:
    635             res = ["<p class='item-preview'><p class='item-date'>"]
    636 
    637         if imp or not draft:
    638             """
    639             Add a picture of the rank
    640             Only if year is known or paper important!
    641             """
    642             r = rank.get_rank_html(self['title'], self.get('year'),
    643                                    update=False, base_url=base_url)
    644             if r is not None:
    645                 res.append(r)
    646 
    647         res.append("<span class='title'><a name='%s'>%s</a></span>"
    648                    %(url_untranslate(self.key), htmlize(self['title'])))
    649 
    650         for cached in 0, 1:
    651             availability = []
    652             if not cached:
    653                 for which in ["amazon", "excerpt", "publisher"]:
    654                     key = "www_%s_url"%which
    655                     if self.get(key):
    656                         url = self[key]
    657                         url = unTeXescapeURL(url)
    658                         availability.append('<a href="%s">%s</a>'
    659                                             % (url, which))
    660 
    661             cache_section = self.get('www_cache_section', ".")
    662             if cache_section not in config.CACHE_SECTIONS:
    663                 if cache_section != ".":
    664                     print("Unrecognized cache section %s"%(cache_section),
    665                           file=sys.stderr)
    666                     cache_section = "."
    667 
    668             for key, name, ext in (('www_abstract_url', 'abstract', 'abstract'),
    669                                    ('www_html_url', 'HTML', 'html'),
    670                                    ('www_pdf_url', 'PDF', 'pdf'),
    671                                    ('www_ps_url', 'PS', 'ps'),
    672                                    ('www_txt_url', 'TXT', 'txt'),
    673                                    ('www_ps_gz_url', 'gzipped&nbsp;PS', 'ps.gz')):
    674                 if cached:
    675                     #XXXX the URL needs to be relative to the absolute
    676                     #XXXX cache path.
    677                     url = smartJoin(cache_path, cache_section,
    678                                     "%s.%s"%(self.key, ext))
    679                     fname = smartJoin(config.OUTPUT_DIR, config.CACHE_DIR,
    680                                       cache_section,
    681                                       "%s.%s" % (self.key, ext))
    682                     if not os.path.exists(fname): continue
    683                 else:
    684                     url = self.get(key)
    685                     if not url: continue
    686                 url = unTeXescapeURL(url)
    687                 url = url.replace('&', '&amp;')
    688                 availability.append('<a href="%s">%s</a>' %(url, name))
    689 
    690             if availability:
    691                 res.append([" ", "&nbsp;"][cached])
    692                 res.append("<span class='availability'>(")
    693                 if cached: res.append("Cached:&nbsp;")
    694                 res.append(",&nbsp;".join(availability))
    695                 res.append(")</span>")
    696 
    697         res.append("<br /><span class='author'>by ")
    698 
    699         #res.append("\n<!-- %r -->\n" % self.parsedAuthor)
    700         htmlAuthors = [a.htmlizeWithLink() for a in self.parsedAuthor]
    701 
    702         if len(htmlAuthors) == 1:
    703             res.append(htmlAuthors[0])
    704         elif len(htmlAuthors) == 2:
    705             res.append(" and ".join(htmlAuthors))
    706         else:
    707             res.append(", ".join(htmlAuthors[:-1]))
    708             res.append(", and ")
    709             res.append(htmlAuthors[-1])
    710 
    711         if res[-1][-1] != '.':
    712             res.append(".")
    713         res.append("</span><br />\n")
    714         res.append(self.biblio_to_html())
    715         res.append("\n<br>\n(<a href='#%s'>direct link</a>)"
    716                    %url_untranslate(self.key))
    717         if self.get('url'):
    718             res.append(" (<a href='%s'>website</a>)"
    719                        %htmlize(self['url']))
    720         res.append("</p>")
    721 
    722         if self.get('www_remarks'):
    723             res.append("<p class='remarks'>%s</p>"
    724                        %htmlize(self['www_remarks']))
    725 
    726         if self.get('abstract'):
    727             res.append("<p class='abstract item-summary'>%s</p>"
    728                        %htmlize(self['abstract']))
    729 
    730         #res.append("\n<br>[<a href='#'>Go to top</a>]")
    731         res.append("\n<p class='item-date'>[<a href='#'>Go to top</a>]</p>")
    732 
    733         if imp or draft:
    734             res.append("</div>")
    735         res.append("</p>\n\n")
    736 
    737         return "".join(res)
    738 
    739 def unTeXescapeURL(s):
    740     """
    741     Turn a URL as formatted in TeX into a real URL.
    742     """
    743     s = s.replace("\\_", "_")
    744     s = s.replace("\\-", "")
    745     s = s.replace("\{}", "")
    746     s = s.replace("{}", "")
    747     return s
    748 
    749 def TeXescapeURL(s):
    750     """
    751     Escape a URL for use in TeX
    752     """
    753     s = s.replace("_", "\\_")
    754     s = s.replace("~", "\{}~")
    755     return s
    756 
    757 RE_LONE_AMP = re.compile(r'&([^a-z0-9])')
    758 
    759 RE_LONE_I = re.compile(r'\\i([^a-z0-9])')
    760 
    761 RE_ACCENT = re.compile(r'\\([\'`~^"c])([^{]|{.})')
    762 
    763 RE_LIGATURE = re.compile(r'\\(AE|ae|OE|oe|AA|aa|O|o|ss)([^a-z0-9])')
    764 
    765 ACCENT_MAP = {"'" : 'acute',
    766               "`" : 'grave',
    767               "~" : 'tilde',
    768               "^" : 'circ',
    769               '"' : 'uml',
    770               "c" : 'cedil',}
    771 
    772 UNICODE_MAP = {'&nacute;' : '&#x0144;',}
    773 
    774 HTML_LIGATURE_MAP = {
    775     'AE' : '&AElig;',
    776     'ae' : '&aelig;',
    777     'OE' : '&OElig;',
    778     'oe' : '&oelig;',
    779     'AA' : '&Aring;',
    780     'aa' : '&aring;',
    781     'O'  : '&Oslash;',
    782     'o'  : '&oslash;',
    783     'ss' : '&szlig;',}
    784 
    785 RE_TEX_CMD = re.compile(r"(?:\\[a-zA-Z@]+|\\.)")
    786 
    787 RE_PAGE_SPAN = re.compile(r"(\d)--(\d)")
    788 
    789 def _unaccent(m):
    790     accent, char = m.groups()
    791     if char[0] == '{':
    792         char = char[1]
    793     accented = "&%s%s;" % (char, ACCENT_MAP[accent])
    794     return UNICODE_MAP.get(accented, accented)
    795 
    796 def _unlig_html(m):
    797     return "%s%s"%(HTML_LIGATURE_MAP[m.group(1)], m.group(2))
    798 
    799 def htmlize(s):
    800     """
    801     Turn a TeX string into good-looking HTML.
    802     """
    803     s = RE_LONE_AMP.sub(lambda m: "&amp;%s" % m.group(1), s)
    804     s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
    805     s = RE_ACCENT.sub(_unaccent, s)
    806     s = unTeXescapeURL(s)
    807     s = RE_LIGATURE.sub(_unlig_html, s)
    808     s = RE_TEX_CMD.sub("", s)
    809     s = s.translate(str.maketrans(ALLCHARS, ALLCHARS, "{}"))
    810     s = RE_PAGE_SPAN.sub(lambda m: "%s-%s"%(m.groups()), s)
    811     s = s.replace("---", "&mdash;")
    812     s = s.replace("--", "&ndash;")
    813     return s
    814 
    815 def author_url(author):
    816     """
    817     Given an author's name, return a URL for his/her homepage.
    818     """
    819     for pat, url in config.AUTHOR_RE_LIST:
    820         if pat.search(author):
    821             return url
    822     return None
    823 
    824 def txtize(s):
    825     """
    826     Turn a TeX string into decent plaintext.
    827     """
    828     s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s)
    829     s = RE_ACCENT.sub(lambda m: "%s" % m.group(2), s)
    830     s = RE_LIGATURE.sub(lambda m: "%s%s"%m.groups(), s)
    831     s = RE_TEX_CMD.sub("", s)
    832     s = s.translate(str.maketrans(ALLCHARS, ALLCHARS, "{}"))
    833     return s
    834 
# Case-insensitive match for text beginning with "proceedings of" or
# "workshop record of", optionally followed by "the".  Group 1 is that
# prefix (including its trailing space); group 2 is the rest of the text.
PROCEEDINGS_RE = re.compile(r'((?:proceedings|workshop record) of(?: the)? )(.*)', re.I)
    836 
    837 class ParsedAuthor:
    838     """
    839     The parsed name of an author.
    840     Eddie deserves credit for this incredibly hairy business.
    841     """
    842     def __init__(self, first, von, last, jr):
    843         self.first = first
    844         self.von = von
    845         self.last = last
    846         self.jr = jr
    847         self.collapsable = 1
    848 
    849         self.html = htmlize(str(self))
    850         self.txt = txtize(str(self))
    851 
    852         s = self.html
    853         for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST:
    854             if pat.search(s):
    855                 self.collapsable = 0
    856                 break
    857 
    858     def __eq__(self, o):
    859         return ((self.first == o.first) and
    860                 (self.last == o.last) and
    861                 (self.von == o.von) and
    862                 (self.jr == o.jr))
    863 
    864     def __hash__(self):
    865         return hash(repr(self))
    866 
    867     def collapsesTo(self, o):
    868         """
    869         Return true iff 'o' could be a more canonical version of
    870         this author
    871         """
    872         if not self.collapsable or not o.collapsable:
    873             return self
    874 
    875         if self.last != o.last or self.von != o.von or self.jr != o.jr:
    876             return self
    877         if not self.first:
    878             return o
    879 
    880         if len(self.first) == len(o.first):
    881             n = []
    882             for a, b in zip(self.first, o.first):
    883                 if a == b:
    884                     n.append(a)
    885                 elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
    886                     n.append(b)
    887                 elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
    888                     n.append(a)
    889                 else:
    890                     return self
    891             if n == self.first:
    892                 return self
    893             elif n == o.first:
    894                 return o
    895             else:
    896                 return self
    897         else:
    898             realname = max([len(n) for n in self.first+o.first]) > 2
    899             if not realname:
    900                 return self
    901 
    902             if len(self.first) < len(o.first):
    903                 short = self.first; long = o.first
    904             else:
    905                 short = o.first; long = self.first
    906 
    907             initials_s = "".join([n[0] for n in short])
    908             initials_l = "".join([n[0] for n in long])
    909             idx = initials_l.find(initials_s)
    910             if idx < 0:
    911                 return self
    912             n = long[:idx]
    913             for i in range(idx, idx+len(short)):
    914                 a = long[i]; b = short[i-idx]
    915                 if a == b:
    916                     n.append(a)
    917                 elif len(a) == 2 and a[1] == '.' and a[0] == b[0]:
    918                     n.append(b)
    919                 elif len(b) == 2 and b[1] == '.' and a[0] == b[0]:
    920                     n.append(a)
    921                 else:
    922                     return self
    923             n += long[idx+len(short):]
    924 
    925             if n == self.first:
    926                 return self
    927             elif n == o.first:
    928                 return o
    929             else:
    930                 return self
    931 
    932     def __repr__(self):
    933         return "ParsedAuthor(%r,%r,%r,%r)"%(self.first, self.von,
    934                                             self.last, self.jr)
    935     def __str__(self):
    936         a = " ".join(self.first+self.von+self.last)
    937         if self.jr:
    938             return "%s, %s" % (a, self.jr)
    939         return a
    940 
    941     def getHomepage(self):
    942         s = self.html
    943         for pat, url in config.AUTHOR_RE_LIST:
    944             if pat.search(s):
    945                 return url
    946         return None
    947 
    948     def getSortingName(self):
    949         """
    950         Return a representation of this author's name in von-last-first-jr
    951         order, unless overridden by ALPH
    952         """
    953         s = self.html
    954         for pat, v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST:
    955             if pat.search(s):
    956                 return v
    957 
    958         return txtize(" ".join(self.von+self.last+self.first+self.jr))
    959 
    960     def getSectionName(self):
    961         """Return a HTML representation of this author's name in
    962            last, first von, jr order"""
    963         secname = " ".join(self.last)
    964         more = self.first+self.von
    965         if more:
    966             secname += ", "+" ".join(more)
    967         if self.jr:
    968             secname += ", "+" ".join(self.jr)
    969         secname = htmlize(secname)
    970         return secname
    971 
    972     def htmlizeWithLink(self):
    973         a = self.html
    974         u = self.getHomepage()
    975         if u:
    976             return "<a href='%s'>%s</a>"%(u, a)
    977         else:
    978             return a
    979 
    980 def _split(s, w=79, indent=8):
    981     r = []
    982     s = re.sub(r"\s+", " ", s)
    983     first = 1
    984     indentation = ""
    985     while len(s) > w:
    986         for i in range(w-1, 20, -1):
    987             if s[i] == ' ':
    988                 r.append(indentation+s[:i])
    989                 s = s[i+1:]
    990                 break
    991         else:
    992             r.append(indentation+s.strip())
    993             s = ""
    994         if first:
    995             first = 0
    996             w -= indent
    997             indentation = " "*indent
    998     if (s):
    999         r.append(indentation+s)
   1000     r.append("")
   1001     return "\n".join(r)
   1002 
   1003 class FileIter:
   1004     def __init__(self, fname=None, file=None, it=None, string=None):
   1005         if fname:
   1006             file = open(fname, 'r')
   1007         if string:
   1008             file = StringIO(string)
   1009         if file:
   1010             it = iter(file)
   1011         self.iter = it
   1012         assert self.iter
   1013         self.lineno = 0
   1014         self._next = it.__next__
   1015     def __next__(self):
   1016         self.lineno += 1
   1017         return self._next()
   1018 
   1019 
   1020 def parseAuthor(s):
   1021     try:
   1022         return _parseAuthor(s)
   1023     except:
   1024         print("Internal error while parsing author %r"%s, file=sys.stderr)
   1025         raise
   1026 
def _parseAuthor(s):
    """
    Take an author string and return a list of ParsedAuthor.

    The string is tokenized at whitespace and commas that are not inside
    braces, the tokens are grouped into authors at each 'and' token, and
    each group is split into first, von, last and jr parts.
    """
    items = []

    # Tokenize: each pass cuts one token off the front of s.  Commas are
    # kept as separate ',' items; whitespace is dropped.
    s = s.strip()
    while s:
        s = s.strip()
        bracelevel = 0
        # Find the first separator that is not inside braces.
        for i in range(len(s)):
            if s[i] == '{':
                bracelevel += 1
            elif s[i] == '}':
                bracelevel -= 1
            elif bracelevel <= 0 and s[i] in " \t\n,":
                break
        # i is now either the separator's index or the last index of s.
        if i+1 == len(s):
            items.append(s)
        else:
            items.append(s[0:i])
        if (s[i] == ','):
            items.append(',')
        s = s[i+1:]

    # Group the tokens into one list per author; 'and' separates authors.
    authors = [[]]
    for item in items:
        if item == 'and':
            authors.append([])
        else:
            authors[-1].append(item)

    parsedAuthors = []
    # Split into first, von, last, jr
    for author in authors:
        commas = 0
        fvl = []
        vl = []
        f = []
        v = []
        l = []
        j = []
        # cur is the list currently receiving tokens.
        cur = fvl
        for item in author:
            if item == ',':
                if commas == 0:
                    # Everything before the first comma is "von last";
                    # tokens after it are collected into f.
                    vl = fvl
                    fvl = []
                    cur = f
                else:
                    # A further comma: what was collected since the
                    # previous comma is moved to jr, and collection
                    # restarts with a fresh f.
                    j.extend(f)
                    cur = f = []
                commas += 1
            else:
                cur.append(item)

        if commas == 0:
            split_von(f, v, l, fvl)
        else:
            # NOTE(review): f_tmp is never read afterwards, so any tokens
            # split_von places in it are discarded -- confirm that this
            # is intended.
            f_tmp = []
            split_von(f_tmp, v, l, vl)

        parsedAuthors.append(ParsedAuthor(f, v, l, j))

    return parsedAuthors
   1092 
   1093 ALLCHARS = "".join(map(chr, list(range(256))))
   1094 
   1095 PRINTINGCHARS = "\t\n\r"+"".join(map(chr, list(range(32, 127))))
   1096 
   1097 LC_CHARS = "abcdefghijklmnopqrstuvwxyz"
   1098 
   1099 SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
   1100                "abcdefghijklmnopqrstuvwxyz"
   1101                "@")
   1102 
   1103 RE_ESCAPED = re.compile(r'\\.')
   1104 
   1105 def split_von(f, v, l, x):
   1106     in_von = 0
   1107     while x:
   1108         tt = t = x[0]
   1109         del x[0]
   1110         if tt[:2] == '{\\':
   1111             tt = tt.translate(str.maketrans(ALLCHARS, ALLCHARS, SV_DELCHARS))
   1112             tt = RE_ESCAPED.sub("", tt)
   1113             tt = tt.translate(str.maketrans(ALLCHARS, ALLCHARS, "{}"))
   1114         if tt.translate(str.maketrans(ALLCHARS, ALLCHARS, LC_CHARS)) == "":
   1115             v.append(t)
   1116             in_von = 1
   1117         elif in_von and f is not None:
   1118             l.append(t)
   1119             l.extend(x)
   1120             return
   1121         else:
   1122             f.append(t)
   1123     if not in_von:
   1124         l.append(f[-1])
   1125         del f[-1]
   1126 
   1127 
   1128 class Parser:
   1129     """
   1130     Parser class: reads BibTeX from a file and returns a BibTeX object.
   1131     """
   1132     ## Fields
   1133     # strings: maps entry string keys to their values.
   1134     # newStrings: all string definitions not in config.INITIAL_STRINGS
   1135     # invStrings: map from string values to their keys.
   1136     # fileiter: the line iterator we're parsing from.
   1137     # result: the BibTeX object that we're parsing into
   1138     # litStringLine: the line on which we started parsing a literal string;
   1139     #     0 for none.
   1140     # entryLine: the line on which the current entry started; 0 for none.
   1141     #
   1142     # curEntType: the type of the entry we're parsing now. (paper,article,etc)
   1143     def __init__(self, fileiter, initial_strings, result=None):
   1144         self.strings = config.INITIAL_STRINGS.copy()
   1145         self.strings.update(initial_strings)
   1146         self.newStrings = {}
   1147         self.invStrings = {}
   1148         for k, v in list(config.INITIAL_STRINGS.items()):
   1149             self.invStrings[v] = k
   1150         self.fileiter = fileiter
   1151         if result is None:
   1152             result = BibTeX()
   1153         self.result = result
   1154         self.litStringLine = 0
   1155         self.entryLine = 0
   1156 
   1157     def _parseKey(self, line):
   1158         it = self.fileiter
   1159         line = _advance(it, line)
   1160         m = KEY_RE.match(line)
   1161         if not m:
   1162             raise ParseError("Expected key at line %s"
   1163                              % self.fileiter.lineno)
   1164         key, line = m.groups()
   1165         return key, line
   1166 
   1167     def _parseValue(self, line):
   1168         it = self.fileiter
   1169         bracelevel = 0
   1170         data = []
   1171         while 1:
   1172             line = _advance(it, line)
   1173             line = line.strip()
   1174             assert line
   1175 
   1176             # Literal string?
   1177             if line[0] == '"':
   1178                 line = line[1:]
   1179                 self.litStringLine = it.lineno
   1180                 while 1:
   1181                     if bracelevel:
   1182                         m = BRACE_CLOSE_RE.match(line)
   1183                         if m:
   1184                             data.append(m.group(1))
   1185                             data.append('}')
   1186                             line = m.group(2)
   1187                             bracelevel -= 1
   1188                             continue
   1189                     else:
   1190                         m = STRING_CLOSE_RE.match(line)
   1191                         if m:
   1192                             data.append(m.group(1))
   1193                             line = m.group(2)
   1194                             break
   1195                     m = BRACE_OPEN_RE.match(line)
   1196                     if m:
   1197                         data.append(m.group(1))
   1198                         line = m.group(2)
   1199                         bracelevel += 1
   1200                         continue
   1201                     data.append(line)
   1202                     data.append(" ")
   1203                     line = next(it)
   1204                 self.litStringLine = 0
   1205             elif line[0] == '{':
   1206                 bracelevel += 1
   1207                 line = line[1:]
   1208                 while bracelevel:
   1209                     m = BRACE_CLOSE_RE.match(line)
   1210                     if m:
   1211                         #print bracelevel, "A", repr(m.group(1))
   1212                         data.append(m.group(1))
   1213                         bracelevel -= 1
   1214                         if bracelevel > 0:
   1215                             #print bracelevel, "- '}'"
   1216                             data.append('}')
   1217                         line = m.group(2)
   1218                         continue
   1219                     m = BRACE_OPEN_RE.match(line)
   1220                     if m:
   1221                         bracelevel += 1
   1222                         #print bracelevel, "B", repr(m.group(1))
   1223                         data.append(m.group(1))
   1224                         line = m.group(2)
   1225                         continue
   1226                     else:
   1227                         #print bracelevel, "C", repr(line)
   1228                         data.append(line)
   1229                         data.append(" ")
   1230                         line = next(it)
   1231             elif line[0] == '#':
   1232                 print("Weird concat on line %s"%it.lineno,
   1233                       file=sys.stderr)
   1234             elif line[0] in "},":
   1235                 if not data:
   1236                     print("No data after field on line %s"%(it.lineno),
   1237                           file=sys.stderr)
   1238             else:
   1239                 m = RAW_DATA_RE.match(line)
   1240                 if m:
   1241                     s = self.strings.get(m.group(1).lower())
   1242                     if s is not None:
   1243                         data.append(s)
   1244                     else:
   1245                         data.append(m.group(1))
   1246                     line = m.group(2)
   1247                 else:
   1248                     raise ParseError("Questionable line at line %s"%it.lineno)
   1249 
   1250             # Got a string, check for concatenation.
   1251             if line.isspace() or not line:
   1252                 data.append(" ")
   1253             line = _advance(it, line)
   1254             line = line.strip()
   1255             assert line
   1256             if line[0] == '#':
   1257                 line = line[1:]
   1258             else:
   1259                 data = "".join(data)
   1260                 data = re.sub(r'\s+', ' ', data)
   1261                 data = re.sub(r'^\s+', '', data)
   1262                 data = re.sub(r'\s+$', '', data)
   1263                 return data, line
   1264 
   1265     def _parseEntry(self, line): #name, strings, entries
   1266         it = self.fileiter
   1267         self.entryLine = it.lineno
   1268         line = _advance(it, line)
   1269 
   1270         m = BRACE_BEGIN_RE.match(line)
   1271         if not m:
   1272             raise ParseError("Expected an opening brace at line %s"%it.lineno)
   1273         line = m.group(1)
   1274 
   1275         proto = {'string' : 'p',
   1276                  'preamble' : 'v',}.get(self.curEntType, 'kp*')
   1277 
   1278         v = []
   1279         while 1:
   1280             line = _advance(it, line)
   1281 
   1282             m = BRACE_END_RE.match(line)
   1283             if m:
   1284                 line = m.group(1)
   1285                 break
   1286             if not proto:
   1287                 raise ParseError("Overlong entry starting on line %s"
   1288                                  % self.entryLine)
   1289             elif proto[0] == 'k':
   1290                 key, line = self._parseKey(line)
   1291                 v.append(key)
   1292             elif proto[0] == 'v':
   1293                 value, line = self._parseValue(line)
   1294                 v.append(value)
   1295             elif proto[0] == 'p':
   1296                 key, line = self._parseKey(line)
   1297                 v.append(key)
   1298                 line = _advance(it, line)
   1299                 line = line.lstrip()
   1300                 if line[0] == '=':
   1301                     line = line[1:]
   1302                 value, line = self._parseValue(line)
   1303                 v.append(value)
   1304             else:
   1305                 assert 0
   1306             line = line.strip()
   1307             if line and line[0] == ',':
   1308                 line = line[1:]
   1309             if proto and proto[1:] != '*':
   1310                 proto = proto[1:]
   1311         if proto and proto[1:] != '*':
   1312             raise ParseError("Missing arguments to %s on line %s"
   1313                              % (self.curEntType, self.entryLine))
   1314 
   1315         if self.curEntType == 'string':
   1316             self.strings[v[0]] = v[1]
   1317             self.newStrings[v[0]] = v[1]
   1318             self.invStrings[v[1]] = v[0]
   1319         elif self.curEntType == 'preamble':
   1320             pass
   1321         else:
   1322             key = v[0]
   1323             d = {}
   1324             for i in range(1, len(v), 2):
   1325                 d[v[i].lower()] = v[i+1]
   1326             ent = BibTeXEntry(self.curEntType, key, d)
   1327             ent.entryLine = self.entryLine
   1328             self.result.addEntry(ent)
   1329 
   1330         return line
   1331 
   1332     def parse(self):
   1333         try:
   1334             self._parse()
   1335         except StopIteration:
   1336             if self.litStringLine:
   1337                 raise ParseError("Unexpected EOF in string (started on %s)"
   1338                                  % self.litStringLine)
   1339             elif self.entryLine:
   1340                 raise ParseError("Unexpected EOF at line %s (entry started "
   1341                                  "on %s)" % (self.fileiter.lineno,
   1342                                              self.entryLine))
   1343 
   1344         self.result.invStrings = self.invStrings
   1345         self.result.newStrings = self.newStrings
   1346 
   1347         return self.result
   1348 
   1349     def _parse(self):
   1350         it = self.fileiter
   1351         line = next(it)
   1352         while 1:
   1353             # Skip blank lines.
   1354             while not line \
   1355                   or line.isspace() \
   1356                   or OUTER_COMMENT_RE.match(line):
   1357                 line = next(it)
   1358             # Get the first line of an entry.
   1359             m = ENTRY_BEGIN_RE.match(line)
   1360             if m:
   1361                 self.curEntType = m.group(1).lower()
   1362                 line = m.group(2)
   1363                 line = self._parseEntry(line)
   1364                 self.entryLine = 0
   1365             else:
   1366                 raise ParseError("Bad input at line %s "
   1367                                  "(expected a new entry.)"
   1368                                  % it.lineno)
   1369 
   1370 def _advance(it, line):
   1371     while not line \
   1372           or line.isspace() \
   1373           or COMMENT_RE.match(line):
   1374         line = next(it)
   1375     return line
   1376 
# Matches a comment line outside of an entry: optional leading whitespace
# followed by '#' or '%'.
OUTER_COMMENT_RE = re.compile(r'^\s*[\#\%]')

# Matches a comment line inside of an entry: optional leading whitespace
# followed by '%'.
COMMENT_RE = re.compile(r'^\s*\%')

# Matches the start of an entry. group 1 is the type of the entry.
# group 2 is the rest of the line.
ENTRY_BEGIN_RE = re.compile(r'''^\s*\@([^\s\"\%\'\(\)\,\=\{\}]+)(.*)''')

# Opening and closing brace of an entry body; group 1 is the rest of the
# line after the brace.
BRACE_BEGIN_RE = re.compile(r'\s*\{(.*)')
BRACE_END_RE = re.compile(r'\s*\}(.*)')
# A key (entry key or field name).  group 1 is the key, group 2 is the
# rest of the line.
KEY_RE = re.compile(r'''\s*([^\"\#\%\'\(\)\,\=\{\}\s]+)(.*)''')

# Pieces of a field value.  In each pattern group 2 is the rest of the line.
# STRING_CLOSE_RE: brace-free text (group 1) up to a closing quote.
# BRACE_CLOSE_RE: brace-free text (group 1) up to a closing brace.
# BRACE_OPEN_RE: brace-free text plus the opening brace itself (group 1).
# RAW_DATA_RE: a bare token (group 1) ending at whitespace, '}' or ','.
STRING_CLOSE_RE = re.compile(r'^([^\{\}\"]*)\"(.*)')
BRACE_CLOSE_RE = re.compile(r'^([^\{\}]*)\}(.*)')
BRACE_OPEN_RE = re.compile(r'^([^\{\}]*\{)(.*)')
RAW_DATA_RE = re.compile(r'^([^\s\},]+)(.*)')
   1396 
   1397 def parseFile(filename, result=None):
   1398     """
   1399     Helper function: parse a single BibTeX file
   1400     """
   1401     f = FileIter(fname=filename)
   1402     p = Parser(f, {}, result)
   1403     r = p.parse()
   1404     r.resolve()
   1405     for e in r.entries:
   1406         e.check()
   1407     return r
   1408 
   1409 def parseString(string, result=None):
   1410     """
   1411     Helper function: parse BibTeX from a string
   1412     """
   1413     f = FileIter(string=string)
   1414     p = Parser(f, {}, result)
   1415     r = p.parse()
   1416     r.resolve()
   1417     for e in r.entries:
   1418         e.check()
   1419     return r
   1420 
   1421 if __name__ == '__main__':
   1422     if len(sys.argv) > 1:
   1423         fname = sys.argv[1]
   1424     else:
   1425         fname = "testbib/pdos.bib"
   1426 
   1427     r = parseFile(fname)
   1428 
   1429     for e in r.entries:
   1430         if e.type in ("proceedings", "journal"): continue
   1431         print(e.to_html())