BibTeX.py (47966B)
1 #!/usr/bin/python3 2 # Copyright 2003-2008, Nick Mathewson. See LICENSE for licensing info. 3 # Copyright 2018, 2019 ng0 <ng0@n0.is>. 4 5 """ 6 BibTeX.py -- parse and manipulate BibTeX files and entries. 7 Based on perl code by Eddie Kohler; heavily modified. 8 """ 9 10 11 from future.utils import raise_with_traceback 12 from io import StringIO ## for Python 3 13 import re 14 import sys 15 import os 16 import copy 17 import future 18 19 import config 20 import rank 21 22 __all__ = ['ParseError', 'BibTeX', 'BibTeXEntry', 'htmlize', 23 'ParsedAuthor', 'FileIter', 'Parser', 'parseFile', 24 'splitBibTeXEntriesBy', 'sortBibTexEntriesBy',] 25 26 # List: must map from month number to month name. 27 MONTHS = [None, 28 "January", "February", "March", "April", "May", "June", 29 "July", "August", "September", "October", "November", "December"] 30 31 # Fields that we only care about for making web pages (BibTeX doesn't 32 # recognize them.) 33 WWW_FIELDS = ['www_section', 'www_important', 'www_remarks', 34 'www_abstract_url', 'www_html_url', 'www_pdf_url', 35 'www_ps_url', 'www_txt_url', 'www_ps_gz_url', 36 'www_amazon_url', 'www_excerpt_url', 'www_publisher_url', 37 'www_cache_section', 'www_tags'] 38 39 def url_untranslate(s): 40 """ 41 Change a BibTeX key into a string suitable for use in a URL. 42 """ 43 s = re.sub(r'([%<>`#, &_\';])', 44 lambda m: "_%02x"%ord(m.group(1)), 45 s) 46 s = s.replace("/", ":") 47 return s 48 49 class ParseError(Exception): 50 """ 51 Raised on invalid BibTeX 52 """ 53 pass 54 55 56 def smartJoin(*lst): 57 """ 58 Equivalent to os.path.join, but handle"." and ".." 59 entries a bit better. 60 """ 61 lst = [item for item in lst if item != "."] 62 idx = 0 63 while idx < len(lst): 64 if idx > 0 and lst[idx] == "..": 65 del lst[idx] 66 else: 67 idx += 1 68 return os.path.join(*lst) 69 70 class BibTeX: 71 """ 72 A parsed BibTeX file 73 """ 74 def __init__(self): 75 self.entries = [] # List of BibTeXEntry 76 self.byKey = {} # Map from BibTeX key to BibTeX entry. 77 def addEntry(self, ent): 78 """Add a BibTeX entry to this file.""" 79 k = ent.key 80 if self.byKey.get(ent.key.lower()): 81 print("Already have an entry named %s"%k, file=sys.stderr) 82 return 83 self.entries.append(ent) 84 self.byKey[ent.key.lower()] = ent 85 def resolve(self): 86 """ 87 Validate all entries in this file, and resolve cross-references 88 """ 89 seen = {} 90 for ent in self.entries: 91 seen.clear() 92 while ent.get('crossref'): 93 try: 94 cr = self.byKey[ent['crossref'].lower()] 95 except KeyError: 96 print("No such crossref: %s"% ent['crossref']) 97 break 98 if seen.get(cr.key): 99 #raise ParseError("Circular crossref at %s" % ent.key) 100 raise_with_traceback(ParseError("Circular crossref at %s" % ent.key)) 101 seen[cr.key] = 1 102 del ent.entries['crossref'] 103 104 if cr.entryLine < ent.entryLine: 105 print("Warning: crossref %s used after declaration"%cr.key) 106 107 for k in list(cr.entries.keys()): 108 if k in ent.entries: 109 print("ERROR: %s defined both in %s and in %s" 110 %(k, ent.key, cr.key)) 111 else: 112 ent.entries[k] = cr.entries[k] 113 114 ent.resolve() 115 newEntries = [] 116 rk = config.REQUIRE_KEY 117 if rk is None: 118 # hack: if no key is required, require "title", since every 119 # entry will have a title. 120 rk = "title" 121 print("rk is " + rk) 122 for ent in self.entries: 123 if ent.type in config.OMIT_ENTRIES or rk not in ent.entries.keys(): 124 ent.check() 125 del self.byKey[ent.key.lower()] 126 else: 127 newEntries.append(ent) 128 self.entries = newEntries 129 130 def buildAuthorTable(entries): 131 """ 132 Given a list of BibTeXEntry, return a map from parsed author name to 133 parsed canonical name. 134 """ 135 authorsByLast = {} 136 for e in entries: 137 for a in e.parsedAuthor: 138 authorsByLast.setdefault(tuple(a.last), []).append(a) 139 # map from author to collapsed author. 140 result = {} 141 for k, v in list(config.COLLAPSE_AUTHORS.items()): 142 a = parseAuthor(k)[0] 143 c = parseAuthor(v)[0] 144 result[c] = c 145 result[a] = c 146 147 for e in entries: 148 for author in e.parsedAuthor: 149 if author in result: 150 continue 151 152 c = author 153 for a in authorsByLast[tuple(author.last)]: 154 if a is author: 155 continue 156 c = c.collapsesTo(a) 157 result[author] = c 158 159 if 0: 160 for a, c in list(result.items()): 161 if a != c: 162 print("Collapsing authors: %s => %s" % (a, c)) 163 if 0: 164 print(parseAuthor("Franz Kaashoek")[0].collapsesTo( 165 parseAuthor("M. Franz Kaashoek")[0])) 166 print(parseAuthor("Paul F. Syverson")[0].collapsesTo( 167 parseAuthor("Paul Syverson")[0])) 168 print(parseAuthor("Paul Syverson")[0].collapsesTo( 169 parseAuthor("Paul F. Syverson")[0])) 170 171 return result 172 173 def splitEntriesBy(entries, field): 174 """ 175 Take a list of BibTeX entries and the name of a bibtex field; return 176 a map from vield value to list of entry. 177 """ 178 result = {} 179 for ent in entries: 180 key = ent.get(field) 181 if field in config.MULTI_VAL_FIELDS: 182 key = [k.strip() for k in key.split(',')] 183 else: 184 key = [key] 185 for k in key: 186 try: 187 result[k].append(ent) 188 except: 189 result[k] = [ent] 190 return result 191 192 def splitSortedEntriesBy(entries, field): 193 """ 194 Take inputs as in splitEntriesBy, where 'entries' is sorted by 'field'. 195 Return a list of (field-value, entry-list) tuples, in the order 196 given in 'entries'. 197 """ 198 result = [] 199 curVal = "alskjdsakldj" 200 curList = [] 201 for ent in entries: 202 key = ent.get(field) 203 if key == curVal: 204 curList.append(ent) 205 else: 206 curVal = key 207 curList = [ent] 208 result.append((curVal, curList)) 209 return result 210 211 def sortEntriesBy(entries, field, default): 212 """ 213 Take inputs as in splitEntriesBy, and return a list of entries sorted 214 by the value of 'field'. Entries without 'field' are sorted as if their 215 value were 'default'. 216 """ 217 tmp = [] 218 i = 0 219 for ent in entries: 220 i += 1 221 v = ent.get(field, default) 222 if v.startswith("<span class='bad'>"): 223 v = default 224 if field in config.MULTI_VAL_FIELDS: 225 for v_j in v.split(','): 226 ent_j = copy.deepcopy(ent) 227 ent_j.__setitem__(field, v_j.strip()) 228 tmp.append((txtize(v_j.strip()), i, ent_j)) 229 else: tmp.append((txtize(v), i, ent)) 230 tmp.sort() 231 return [t[2] for t in tmp] 232 233 def splitEntriesByAuthor(entries): 234 """ 235 Take a list of entries, sort them by author names, and return: 236 a sorted list of (authorname-in-html, bibtex-entry-list) tuples, 237 a map from authorname-in-html to name-for-url. 238 Entries with multiple authors appear once per author. 239 """ 240 collapsedAuthors = buildAuthorTable(entries) 241 entries = sortEntriesByDate(entries) 242 result = {} # Name in sorting order -> entries 243 htmlResult = {} # name in sorting order -> Full name 244 url_map = {} # Full name -> Url 245 for ent in entries: 246 for a in ent.parsedAuthor: 247 canonical = collapsedAuthors[a] 248 url = canonical.getHomepage() 249 sortkey = canonical.getSortingName() 250 secname = canonical.getSectionName() 251 if url: 252 url_map[secname] = url 253 254 htmlResult[sortkey] = secname 255 result.setdefault(sortkey, []).append(ent) 256 sortnames = list(result.keys()) 257 sortnames.sort() 258 sections = [(htmlResult[n], result[n]) for n in sortnames] 259 return sections, url_map 260 261 ## def sortEntriesByAuthor(entries): 262 ## tmp = [] 263 ## i = 0 264 ## for ent in entries: 265 ## i += 1 266 ## authors = [ txtize(" ".join(a.von+a.last+a.first+a.jr)) 267 ## for a in ent.parsedAuthor ] 268 ## tmp.append((tuple(authors), i, ent)) 269 ## tmp.sort() 270 ## return [ t[2] for t in tmp ] 271 272 def sortEntriesByDate(entries): 273 """ 274 Sort a list of entries by their publication date. 275 """ 276 tmp = [] 277 i = 0 278 for ent in entries: 279 i += 1 280 if (ent.get('month') == "forthcoming" or ent.get('year') == "forthcoming"): 281 tmp.append((20000*13, i, ent)) 282 continue 283 try: 284 monthname = ent.get("month") 285 if monthname is not None: 286 match = re.match(r"(\w+)--\w+", monthname) 287 if match: 288 monthname = match.group(1) 289 mon = MONTHS.index(monthname) 290 except ValueError: 291 print("Unknown month %r in %s"%(ent.get("month"), ent.key)) 292 mon = 0 293 294 try: 295 date = int(ent['year'])*13 + mon 296 except KeyError: 297 print("ERROR: No year field in %s"%ent.key) 298 date = 10000*13 299 except ValueError: 300 date = 10000*13 301 tmp.append((date, i, ent)) 302 tmp.sort(reverse=True) 303 return [t[2] for t in tmp] 304 305 306 # List of fields that appear when we display the entries as BibTeX. 307 DISPLAYED_FIELDS = ['title', 'author', 'journal', 'booktitle', 308 'school', 'institution', 'organization', 'volume', 309 'number', 'year', 'month', 'address', 'location', 310 'chapter', 'edition', 'pages', 'editor', 311 'howpublished', 'key', 'publisher', 'type', 312 'note', 'series'] 313 314 class BibTeXEntry: 315 """ 316 A single BibTeX entry. 317 """ 318 319 def __init__(self, type, key, entries): 320 self.type = type # Kind of entry: @book, @injournal,etc 321 self.key = key # What key does it have? 322 self.entries = entries # Map from key to value. 323 self.entryLine = 0 # Defined on this line number 324 325 def get(self, k, v=None): 326 return self.entries.get(k, v) 327 328 def has_key(self, k): 329 return k in self.entries 330 331 def __getitem__(self, k): 332 return self.entries[k] 333 334 def __setitem__(self, k, v): 335 self.entries[k] = v 336 337 def __str__(self): 338 return self.format(70, 1) 339 340 def getURL(self): 341 """Return the best URL to use for this paper, or None.""" 342 best = None 343 for field in ['www_pdf_url', 'www_ps_gz_url', 'www_ps_url', 344 'www_html_url', 'www_txt_url',]: 345 u = self.get(field) 346 if u: 347 if not best: 348 best = u 349 elif (best.startswith("http://citeseer.nj.nec.com/") 350 and not u.startswith("http://citeseer.nj.nec.com/")): 351 best = u 352 return best 353 354 def format(self, width=70, indent=8, v=0, invStrings={}): 355 """ 356 Format this entry as BibTeX. 357 """ 358 d = ["@%s{%s,\n" % (self.type, self.key)] 359 if v: 360 df = DISPLAYED_FIELDS[:] 361 for k in list(self.entries.keys()): 362 if k not in df: 363 df.append(k) 364 else: 365 df = DISPLAYED_FIELDS 366 for f in df: 367 if f not in self.entries: 368 continue 369 v = self.entries[f] 370 if v.startswith("<span class='bad'>"): 371 d.append("%%%%% ERROR: Missing field\n") 372 d.append("%% %s = {?????},\n"%f) 373 continue 374 np = v.translate(str.maketrans(ALLCHARS, ALLCHARS, PRINTINGCHARS)) 375 if np: 376 d.append("%%%%% "+("ERROR: Non-ASCII characters: '%r'\n"%np)) 377 d.append(" ") 378 v = v.replace("&", "&") 379 if v in invStrings: 380 s = "%s = %s,\n" %(f, invStrings[v]) 381 else: 382 s = "%s = {%s},\n" % (f, v) 383 d.append(_split(s, width, indent)) 384 d.append("}\n") 385 return "".join(d) 386 def resolve(self): 387 """ 388 Handle post-processing for this entry 389 """ 390 a = self.get('author') 391 if a: 392 self.parsedAuthor = parseAuthor(a) 393 #print(a) 394 #print(" => ",repr(self.parsedAuthor)) 395 else: 396 self.parsedAuthor = None 397 398 def isImportant(self): 399 """ 400 Return 1 iff this entry is marked as important 401 """ 402 imp = self.get("www_important") 403 if imp and imp.strip().lower() not in ("no", "false", "0"): 404 return 1 405 return 0 406 407 def check(self): 408 """ 409 Print any errors for this entry, and return true if there were 410 none. 411 """ 412 errs = self._check() 413 for e in errs: 414 print(e) 415 return not errs 416 417 # FIXME: Here's some fields repeated after you enter the 418 # if self.type != 'proceedings' 419 # conditional body. 420 # Besides the official BibTeX resources, this is a good 421 # reference point: https://verbosus.com/bibtex-style-examples.html 422 def _check(self): 423 """ 424 The message 'in record %s' relates to the entire bibtex record, 425 giving a means to locate by searching. 426 FIXME: Really print the line in the '.bib' file. 427 """ 428 errs = [] 429 if self.type == 'inproceedings': 430 fields = 'booktitle', 'year' 431 elif self.type == 'incollection': 432 fields = 'booktitle', 'year' 433 elif self.type == 'proceedings': 434 fields = 'booktitle', 'editor' 435 elif self.type == 'article': 436 fields = 'journal', 'year' 437 elif self.type == 'book': 438 fields = 'title', 'year', 'publisher' 439 elif self.type == 'booklet': 440 fields = 'title', 'year' 441 elif self.type == 'techreport': 442 fields = 'institution', 443 elif self.type == 'misc': 444 fields = 'howpublished', 445 elif self.type == 'conference': 446 fields = 'booktitle', 'year' 447 elif self.type in ('mastersthesis', 'phdthesis'): 448 fields = () 449 else: 450 fields = () 451 errs.append("ERROR (record %s):\t odd type %s" 452 % (self.entryLine, self.type)) 453 if self.type != ('proceedings' or 'conference'): 454 fields += 'title', 'author', 'www_section', 'year' 455 456 for field in fields: 457 if self.get(field) is None or \ 458 self.get(field).startswith("<span class='bad'>"): 459 errs.append("ERROR (record %s):\t %s field" 460 "\tnot found in\t %s" 461 % (self.entryLine, field, self.key)) 462 self.entries[field] = "<span class='bad'>%s:??</span>"%field 463 464 if self.type == 'inproceedings': 465 if self.get("booktitle"): 466 if not self['booktitle'].startswith("Proceedings of") and \ 467 not self['booktitle'].startswith("{Proceedings of"): 468 errs.append("ERROR (record %s):\t %s's booktitle " 469 "(%r) doesn't start with 'Proceedings of'" 470 % (self.entryLine, selfself.key, self['booktitle'])) 471 472 if "pages" in self.entries.keys() and not re.search(r'\d+--\d+', self.entries['pages']): 473 errs.append("ERROR (record %s):\t Misformed pages in %s" 474 % (self.entryLine, self.key)) 475 476 if self.type == 'proceedings': 477 if self.get('title'): 478 errs.append("ERROR (record %s):\t %s is a proceedings: " 479 "it should have a booktitle, not a title." 480 % (self.entryLine, self.key)) 481 482 for field, value in list(self.entries.items()): 483 if value.translate(str.maketrans(ALLCHARS, ALLCHARS, PRINTINGCHARS)): 484 errs.append("ERROR (record %s):\t %s.%s " 485 "has non-ASCII characters" 486 % (self.entryLine, self.key, field)) 487 if field.startswith("www_") and field not in WWW_FIELDS: 488 errs.append("ERROR (record %s):\t unknown " 489 "www field %s" 490 % (self.entryLine, field)) 491 if value.strip()[-1:] == '.' and \ 492 field not in ("notes", "www_remarks", "author"): 493 errs.append("ERROR (record %s):\t %s.%s " 494 "has an extraneous period" 495 % (self.entryLine, self.key, field)) 496 return errs 497 498 def biblio_to_html(self): 499 """ 500 Return the HTML for the citation portion of entry. 501 """ 502 if self.type in ('inproceedings', 'incollection'): 503 booktitle = self['booktitle'] 504 bookurl = self.get('bookurl') 505 if bookurl: 506 m = PROCEEDINGS_RE.match(booktitle) 507 if m: 508 res = ["In the ", m.group(1), 509 '<a href="%s">'%bookurl, m.group(2), "</a>"] 510 else: 511 res = ['In the <a href="%s">%s</a>'% (bookurl, booktitle)] 512 else: 513 res = ["In the ", booktitle] 514 515 if self.get("edition"): 516 res.append(",") 517 res.append(self['edition']) 518 if self.get("location"): 519 res.append(", ") 520 res.append(self['location']) 521 elif self.get("address"): 522 res.append(", ") 523 res.append(self['address']) 524 res.append(", %s %s" % (self.get('month', ""), self['year'])) 525 if not self.get('pages'): 526 pass 527 elif "-" in self['pages']: 528 res.append(", pages %s"%self['pages']) 529 else: 530 res.append(", page %s"%self['pages']) 531 elif self.type == 'article': 532 res = ["In "] 533 if self.get('journalurl'): 534 res.append('<a href="%s">%s</a>'% 535 (self['journalurl'], self['journal'])) 536 else: 537 res.append(self['journal']) 538 if self.get('volume'): 539 res.append(" <b>%s</b>"%self['volume']) 540 if self.get('number'): 541 res.append("(%s)"%self['number']) 542 res.append(", %s %s" % (self.get('month', ""), self['year'])) 543 if not self.get('pages'): 544 pass 545 elif "-" in self['pages']: 546 res.append(", pages %s"%self['pages']) 547 else: 548 res.append(", page %s"%self['pages']) 549 elif self.type == 'techreport': 550 res = ["%s %s %s" % (self['institution'], 551 self.get('type', 'technical report'), 552 self.get('number', ""))] 553 if self.get('month') or self.get('year'): 554 res.append(", %s %s" % (self.get('month', ''), 555 self.get('year', ''))) 556 # FIXME: less clauses. 557 elif self.type == 'mastersthesis' or self.type == 'phdthesis': 558 if self.get('type'): 559 if self.type == 'mastersthesis' or\ 560 self.type == 'Master' or\ 561 self.type == 'Masters' or\ 562 self.type == 'Master\'s': 563 res = ["Master thesis"] 564 if self.type == 'Bachelor' or\ 565 self.type == 'Bachelors': 566 res = ["Bachelor thesis"] 567 res = [self['type']] 568 else: 569 res = ["Ph.D. thesis"] 570 if self.get('school'): 571 res.append(", %s"%(self['school'])) 572 if self.get('month') or self.get('year'): 573 res.append(", %s %s" % (self.get('month', ''), 574 self.get('year', ''))) 575 # elif self.type == 'book': 576 # res = [self['publisher']] 577 # if self.get('year'): 578 # res.append(" ") 579 # res.append(self.get('year')) 580 # # res.append(", %s"%(self.get('year'))) 581 # if self.get('series'): 582 # res.append(",") 583 # res.append(self['series']) 584 # elif self.type == 'booklet': 585 # # res = self.get('publisher') 586 # res = [self['publisher']] 587 # if self.get('year'): 588 # res.append(" ") 589 # res.append(self.get('year')) 590 elif self.type == 'misc': 591 res = [self['howpublished']] 592 if self.get('month') or self.get('year'): 593 res.append(", %s %s" % (self.get('month', ''), 594 self.get('year', ''))) 595 if not self.get('pages'): 596 pass 597 elif "-" in self['pages']: 598 res.append(", pages %s"%self['pages']) 599 else: 600 res.append(", page %s"%self['pages']) 601 else: 602 res = ["<Odd type %s>"%self.type] 603 604 res[0:0] = ["<span class='biblio'>"] 605 res.append(".</span>") 606 607 bibtexurl = "./bibtex.html#%s"%url_untranslate(self.key) 608 res.append((" <span class='availability'>" 609 "(<a href='%s'>BibTeX entry</a>)" 610 "</span>") %bibtexurl) 611 612 # Produce the link to the dot bib file 'record.bib' for the 613 # current record, assuming it is relative from FILE in 614 # directory "../bib/". 615 # FIXME: move everything into 'out', allowing to build 616 # a website with relative links and more structure. 617 bibtexurl_file = "./bib/%s" %url_untranslate(self.key) 618 res.append((" <span class='availability'>" 619 "(<a href='%s/record.bib'>Download bibtex record</a>)" 620 "</span>") 621 %bibtexurl_file) 622 return htmlize("".join(res)) 623 624 def to_html(self, cache_path="./cache", base_url="html"): 625 """ 626 Return the HTML for this entry. 627 """ 628 imp = self.isImportant() 629 draft = self.get('year') == 'forthcoming' 630 if imp: 631 res = ["<li><div class='impEntry'><p class='impEntry'>"] 632 elif draft: 633 res = ["<li><div class='draftEntry'><p class='draftEntry'>"] 634 else: 635 res = ["<p class='item-preview'><p class='item-date'>"] 636 637 if imp or not draft: 638 """ 639 Add a picture of the rank 640 Only if year is known or paper important! 641 """ 642 r = rank.get_rank_html(self['title'], self.get('year'), 643 update=False, base_url=base_url) 644 if r is not None: 645 res.append(r) 646 647 res.append("<span class='title'><a name='%s'>%s</a></span>" 648 %(url_untranslate(self.key), htmlize(self['title']))) 649 650 for cached in 0, 1: 651 availability = [] 652 if not cached: 653 for which in ["amazon", "excerpt", "publisher"]: 654 key = "www_%s_url"%which 655 if self.get(key): 656 url = self[key] 657 url = unTeXescapeURL(url) 658 availability.append('<a href="%s">%s</a>' 659 % (url, which)) 660 661 cache_section = self.get('www_cache_section', ".") 662 if cache_section not in config.CACHE_SECTIONS: 663 if cache_section != ".": 664 print("Unrecognized cache section %s"%(cache_section), 665 file=sys.stderr) 666 cache_section = "." 667 668 for key, name, ext in (('www_abstract_url', 'abstract', 'abstract'), 669 ('www_html_url', 'HTML', 'html'), 670 ('www_pdf_url', 'PDF', 'pdf'), 671 ('www_ps_url', 'PS', 'ps'), 672 ('www_txt_url', 'TXT', 'txt'), 673 ('www_ps_gz_url', 'gzipped PS', 'ps.gz')): 674 if cached: 675 #XXXX the URL needs to be relative to the absolute 676 #XXXX cache path. 677 url = smartJoin(cache_path, cache_section, 678 "%s.%s"%(self.key, ext)) 679 fname = smartJoin(config.OUTPUT_DIR, config.CACHE_DIR, 680 cache_section, 681 "%s.%s" % (self.key, ext)) 682 if not os.path.exists(fname): continue 683 else: 684 url = self.get(key) 685 if not url: continue 686 url = unTeXescapeURL(url) 687 url = url.replace('&', '&') 688 availability.append('<a href="%s">%s</a>' %(url, name)) 689 690 if availability: 691 res.append([" ", " "][cached]) 692 res.append("<span class='availability'>(") 693 if cached: res.append("Cached: ") 694 res.append(", ".join(availability)) 695 res.append(")</span>") 696 697 res.append("<br /><span class='author'>by ") 698 699 #res.append("\n<!-- %r -->\n" % self.parsedAuthor) 700 htmlAuthors = [a.htmlizeWithLink() for a in self.parsedAuthor] 701 702 if len(htmlAuthors) == 1: 703 res.append(htmlAuthors[0]) 704 elif len(htmlAuthors) == 2: 705 res.append(" and ".join(htmlAuthors)) 706 else: 707 res.append(", ".join(htmlAuthors[:-1])) 708 res.append(", and ") 709 res.append(htmlAuthors[-1]) 710 711 if res[-1][-1] != '.': 712 res.append(".") 713 res.append("</span><br />\n") 714 res.append(self.biblio_to_html()) 715 res.append("\n<br>\n(<a href='#%s'>direct link</a>)" 716 %url_untranslate(self.key)) 717 if self.get('url'): 718 res.append(" (<a href='%s'>website</a>)" 719 %htmlize(self['url'])) 720 res.append("</p>") 721 722 if self.get('www_remarks'): 723 res.append("<p class='remarks'>%s</p>" 724 %htmlize(self['www_remarks'])) 725 726 if self.get('abstract'): 727 res.append("<p class='abstract item-summary'>%s</p>" 728 %htmlize(self['abstract'])) 729 730 #res.append("\n<br>[<a href='#'>Go to top</a>]") 731 res.append("\n<p class='item-date'>[<a href='#'>Go to top</a>]</p>") 732 733 if imp or draft: 734 res.append("</div>") 735 res.append("</p>\n\n") 736 737 return "".join(res) 738 739 def unTeXescapeURL(s): 740 """ 741 Turn a URL as formatted in TeX into a real URL. 742 """ 743 s = s.replace("\\_", "_") 744 s = s.replace("\\-", "") 745 s = s.replace("\{}", "") 746 s = s.replace("{}", "") 747 return s 748 749 def TeXescapeURL(s): 750 """ 751 Escape a URL for use in TeX 752 """ 753 s = s.replace("_", "\\_") 754 s = s.replace("~", "\{}~") 755 return s 756 757 RE_LONE_AMP = re.compile(r'&([^a-z0-9])') 758 759 RE_LONE_I = re.compile(r'\\i([^a-z0-9])') 760 761 RE_ACCENT = re.compile(r'\\([\'`~^"c])([^{]|{.})') 762 763 RE_LIGATURE = re.compile(r'\\(AE|ae|OE|oe|AA|aa|O|o|ss)([^a-z0-9])') 764 765 ACCENT_MAP = {"'" : 'acute', 766 "`" : 'grave', 767 "~" : 'tilde', 768 "^" : 'circ', 769 '"' : 'uml', 770 "c" : 'cedil',} 771 772 UNICODE_MAP = {'ń' : 'ń',} 773 774 HTML_LIGATURE_MAP = { 775 'AE' : 'Æ', 776 'ae' : 'æ', 777 'OE' : 'Œ', 778 'oe' : 'œ', 779 'AA' : 'Å', 780 'aa' : 'å', 781 'O' : 'Ø', 782 'o' : 'ø', 783 'ss' : 'ß',} 784 785 RE_TEX_CMD = re.compile(r"(?:\\[a-zA-Z@]+|\\.)") 786 787 RE_PAGE_SPAN = re.compile(r"(\d)--(\d)") 788 789 def _unaccent(m): 790 accent, char = m.groups() 791 if char[0] == '{': 792 char = char[1] 793 accented = "&%s%s;" % (char, ACCENT_MAP[accent]) 794 return UNICODE_MAP.get(accented, accented) 795 796 def _unlig_html(m): 797 return "%s%s"%(HTML_LIGATURE_MAP[m.group(1)], m.group(2)) 798 799 def htmlize(s): 800 """ 801 Turn a TeX string into good-looking HTML. 802 """ 803 s = RE_LONE_AMP.sub(lambda m: "&%s" % m.group(1), s) 804 s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s) 805 s = RE_ACCENT.sub(_unaccent, s) 806 s = unTeXescapeURL(s) 807 s = RE_LIGATURE.sub(_unlig_html, s) 808 s = RE_TEX_CMD.sub("", s) 809 s = s.translate(str.maketrans(ALLCHARS, ALLCHARS, "{}")) 810 s = RE_PAGE_SPAN.sub(lambda m: "%s-%s"%(m.groups()), s) 811 s = s.replace("---", "—") 812 s = s.replace("--", "–") 813 return s 814 815 def author_url(author): 816 """ 817 Given an author's name, return a URL for his/her homepage. 818 """ 819 for pat, url in config.AUTHOR_RE_LIST: 820 if pat.search(author): 821 return url 822 return None 823 824 def txtize(s): 825 """ 826 Turn a TeX string into decent plaintext. 827 """ 828 s = RE_LONE_I.sub(lambda m: "i%s" % m.group(1), s) 829 s = RE_ACCENT.sub(lambda m: "%s" % m.group(2), s) 830 s = RE_LIGATURE.sub(lambda m: "%s%s"%m.groups(), s) 831 s = RE_TEX_CMD.sub("", s) 832 s = s.translate(str.maketrans(ALLCHARS, ALLCHARS, "{}")) 833 return s 834 835 PROCEEDINGS_RE = re.compile(r'((?:proceedings|workshop record) of(?: the)? )(.*)', re.I) 836 837 class ParsedAuthor: 838 """ 839 The parsed name of an author. 840 Eddie deserves credit for this incredibly hairy business. 841 """ 842 def __init__(self, first, von, last, jr): 843 self.first = first 844 self.von = von 845 self.last = last 846 self.jr = jr 847 self.collapsable = 1 848 849 self.html = htmlize(str(self)) 850 self.txt = txtize(str(self)) 851 852 s = self.html 853 for pat in config.NO_COLLAPSE_AUTHORS_RE_LIST: 854 if pat.search(s): 855 self.collapsable = 0 856 break 857 858 def __eq__(self, o): 859 return ((self.first == o.first) and 860 (self.last == o.last) and 861 (self.von == o.von) and 862 (self.jr == o.jr)) 863 864 def __hash__(self): 865 return hash(repr(self)) 866 867 def collapsesTo(self, o): 868 """ 869 Return true iff 'o' could be a more canonical version of 870 this author 871 """ 872 if not self.collapsable or not o.collapsable: 873 return self 874 875 if self.last != o.last or self.von != o.von or self.jr != o.jr: 876 return self 877 if not self.first: 878 return o 879 880 if len(self.first) == len(o.first): 881 n = [] 882 for a, b in zip(self.first, o.first): 883 if a == b: 884 n.append(a) 885 elif len(a) == 2 and a[1] == '.' and a[0] == b[0]: 886 n.append(b) 887 elif len(b) == 2 and b[1] == '.' and a[0] == b[0]: 888 n.append(a) 889 else: 890 return self 891 if n == self.first: 892 return self 893 elif n == o.first: 894 return o 895 else: 896 return self 897 else: 898 realname = max([len(n) for n in self.first+o.first]) > 2 899 if not realname: 900 return self 901 902 if len(self.first) < len(o.first): 903 short = self.first; long = o.first 904 else: 905 short = o.first; long = self.first 906 907 initials_s = "".join([n[0] for n in short]) 908 initials_l = "".join([n[0] for n in long]) 909 idx = initials_l.find(initials_s) 910 if idx < 0: 911 return self 912 n = long[:idx] 913 for i in range(idx, idx+len(short)): 914 a = long[i]; b = short[i-idx] 915 if a == b: 916 n.append(a) 917 elif len(a) == 2 and a[1] == '.' and a[0] == b[0]: 918 n.append(b) 919 elif len(b) == 2 and b[1] == '.' and a[0] == b[0]: 920 n.append(a) 921 else: 922 return self 923 n += long[idx+len(short):] 924 925 if n == self.first: 926 return self 927 elif n == o.first: 928 return o 929 else: 930 return self 931 932 def __repr__(self): 933 return "ParsedAuthor(%r,%r,%r,%r)"%(self.first, self.von, 934 self.last, self.jr) 935 def __str__(self): 936 a = " ".join(self.first+self.von+self.last) 937 if self.jr: 938 return "%s, %s" % (a, self.jr) 939 return a 940 941 def getHomepage(self): 942 s = self.html 943 for pat, url in config.AUTHOR_RE_LIST: 944 if pat.search(s): 945 return url 946 return None 947 948 def getSortingName(self): 949 """ 950 Return a representation of this author's name in von-last-first-jr 951 order, unless overridden by ALPH 952 """ 953 s = self.html 954 for pat, v in config.ALPHABETIZE_AUTHOR_AS_RE_LIST: 955 if pat.search(s): 956 return v 957 958 return txtize(" ".join(self.von+self.last+self.first+self.jr)) 959 960 def getSectionName(self): 961 """Return a HTML representation of this author's name in 962 last, first von, jr order""" 963 secname = " ".join(self.last) 964 more = self.first+self.von 965 if more: 966 secname += ", "+" ".join(more) 967 if self.jr: 968 secname += ", "+" ".join(self.jr) 969 secname = htmlize(secname) 970 return secname 971 972 def htmlizeWithLink(self): 973 a = self.html 974 u = self.getHomepage() 975 if u: 976 return "<a href='%s'>%s</a>"%(u, a) 977 else: 978 return a 979 980 def _split(s, w=79, indent=8): 981 r = [] 982 s = re.sub(r"\s+", " ", s) 983 first = 1 984 indentation = "" 985 while len(s) > w: 986 for i in range(w-1, 20, -1): 987 if s[i] == ' ': 988 r.append(indentation+s[:i]) 989 s = s[i+1:] 990 break 991 else: 992 r.append(indentation+s.strip()) 993 s = "" 994 if first: 995 first = 0 996 w -= indent 997 indentation = " "*indent 998 if (s): 999 r.append(indentation+s) 1000 r.append("") 1001 return "\n".join(r) 1002 1003 class FileIter: 1004 def __init__(self, fname=None, file=None, it=None, string=None): 1005 if fname: 1006 file = open(fname, 'r') 1007 if string: 1008 file = StringIO(string) 1009 if file: 1010 it = iter(file) 1011 self.iter = it 1012 assert self.iter 1013 self.lineno = 0 1014 self._next = it.__next__ 1015 def __next__(self): 1016 self.lineno += 1 1017 return self._next() 1018 1019 1020 def parseAuthor(s): 1021 try: 1022 return _parseAuthor(s) 1023 except: 1024 print("Internal error while parsing author %r"%s, file=sys.stderr) 1025 raise 1026 1027 def _parseAuthor(s): 1028 """ 1029 Take an author string and return a list of ParsedAuthor. 1030 """ 1031 items = [] 1032 1033 s = s.strip() 1034 while s: 1035 s = s.strip() 1036 bracelevel = 0 1037 for i in range(len(s)): 1038 if s[i] == '{': 1039 bracelevel += 1 1040 elif s[i] == '}': 1041 bracelevel -= 1 1042 elif bracelevel <= 0 and s[i] in " \t\n,": 1043 break 1044 if i+1 == len(s): 1045 items.append(s) 1046 else: 1047 items.append(s[0:i]) 1048 if (s[i] == ','): 1049 items.append(',') 1050 s = s[i+1:] 1051 1052 authors = [[]] 1053 for item in items: 1054 if item == 'and': 1055 authors.append([]) 1056 else: 1057 authors[-1].append(item) 1058 1059 parsedAuthors = [] 1060 # Split into first, von, last, jr 1061 for author in authors: 1062 commas = 0 1063 fvl = [] 1064 vl = [] 1065 f = [] 1066 v = [] 1067 l = [] 1068 j = [] 1069 cur = fvl 1070 for item in author: 1071 if item == ',': 1072 if commas == 0: 1073 vl = fvl 1074 fvl = [] 1075 cur = f 1076 else: 1077 j.extend(f) 1078 cur = f = [] 1079 commas += 1 1080 else: 1081 cur.append(item) 1082 1083 if commas == 0: 1084 split_von(f, v, l, fvl) 1085 else: 1086 f_tmp = [] 1087 split_von(f_tmp, v, l, vl) 1088 1089 parsedAuthors.append(ParsedAuthor(f, v, l, j)) 1090 1091 return parsedAuthors 1092 1093 ALLCHARS = "".join(map(chr, list(range(256)))) 1094 1095 PRINTINGCHARS = "\t\n\r"+"".join(map(chr, list(range(32, 127)))) 1096 1097 LC_CHARS = "abcdefghijklmnopqrstuvwxyz" 1098 1099 SV_DELCHARS = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" 1100 "abcdefghijklmnopqrstuvwxyz" 1101 "@") 1102 1103 RE_ESCAPED = re.compile(r'\\.') 1104 1105 def split_von(f, v, l, x): 1106 in_von = 0 1107 while x: 1108 tt = t = x[0] 1109 del x[0] 1110 if tt[:2] == '{\\': 1111 tt = tt.translate(str.maketrans(ALLCHARS, ALLCHARS, SV_DELCHARS)) 1112 tt = RE_ESCAPED.sub("", tt) 1113 tt = tt.translate(str.maketrans(ALLCHARS, ALLCHARS, "{}")) 1114 if tt.translate(str.maketrans(ALLCHARS, ALLCHARS, LC_CHARS)) == "": 1115 v.append(t) 1116 in_von = 1 1117 elif in_von and f is not None: 1118 l.append(t) 1119 l.extend(x) 1120 return 1121 else: 1122 f.append(t) 1123 if not in_von: 1124 l.append(f[-1]) 1125 del f[-1] 1126 1127 1128 class Parser: 1129 """ 1130 Parser class: reads BibTeX from a file and returns a BibTeX object. 1131 """ 1132 ## Fields 1133 # strings: maps entry string keys to their values. 1134 # newStrings: all string definitions not in config.INITIAL_STRINGS 1135 # invStrings: map from string values to their keys. 1136 # fileiter: the line iterator we're parsing from. 1137 # result: the BibTeX object that we're parsing into 1138 # litStringLine: the line on which we started parsing a literal string; 1139 # 0 for none. 1140 # entryLine: the line on which the current entry started; 0 for none. 1141 # 1142 # curEntType: the type of the entry we're parsing now. (paper,article,etc) 1143 def __init__(self, fileiter, initial_strings, result=None): 1144 self.strings = config.INITIAL_STRINGS.copy() 1145 self.strings.update(initial_strings) 1146 self.newStrings = {} 1147 self.invStrings = {} 1148 for k, v in list(config.INITIAL_STRINGS.items()): 1149 self.invStrings[v] = k 1150 self.fileiter = fileiter 1151 if result is None: 1152 result = BibTeX() 1153 self.result = result 1154 self.litStringLine = 0 1155 self.entryLine = 0 1156 1157 def _parseKey(self, line): 1158 it = self.fileiter 1159 line = _advance(it, line) 1160 m = KEY_RE.match(line) 1161 if not m: 1162 raise ParseError("Expected key at line %s" 1163 % self.fileiter.lineno) 1164 key, line = m.groups() 1165 return key, line 1166 1167 def _parseValue(self, line): 1168 it = self.fileiter 1169 bracelevel = 0 1170 data = [] 1171 while 1: 1172 line = _advance(it, line) 1173 line = line.strip() 1174 assert line 1175 1176 # Literal string? 1177 if line[0] == '"': 1178 line = line[1:] 1179 self.litStringLine = it.lineno 1180 while 1: 1181 if bracelevel: 1182 m = BRACE_CLOSE_RE.match(line) 1183 if m: 1184 data.append(m.group(1)) 1185 data.append('}') 1186 line = m.group(2) 1187 bracelevel -= 1 1188 continue 1189 else: 1190 m = STRING_CLOSE_RE.match(line) 1191 if m: 1192 data.append(m.group(1)) 1193 line = m.group(2) 1194 break 1195 m = BRACE_OPEN_RE.match(line) 1196 if m: 1197 data.append(m.group(1)) 1198 line = m.group(2) 1199 bracelevel += 1 1200 continue 1201 data.append(line) 1202 data.append(" ") 1203 line = next(it) 1204 self.litStringLine = 0 1205 elif line[0] == '{': 1206 bracelevel += 1 1207 line = line[1:] 1208 while bracelevel: 1209 m = BRACE_CLOSE_RE.match(line) 1210 if m: 1211 #print bracelevel, "A", repr(m.group(1)) 1212 data.append(m.group(1)) 1213 bracelevel -= 1 1214 if bracelevel > 0: 1215 #print bracelevel, "- '}'" 1216 data.append('}') 1217 line = m.group(2) 1218 continue 1219 m = BRACE_OPEN_RE.match(line) 1220 if m: 1221 bracelevel += 1 1222 #print bracelevel, "B", repr(m.group(1)) 1223 data.append(m.group(1)) 1224 line = m.group(2) 1225 continue 1226 else: 1227 #print bracelevel, "C", repr(line) 1228 data.append(line) 1229 data.append(" ") 1230 line = next(it) 1231 elif line[0] == '#': 1232 print("Weird concat on line %s"%it.lineno, 1233 file=sys.stderr) 1234 elif line[0] in "},": 1235 if not data: 1236 print("No data after field on line %s"%(it.lineno), 1237 file=sys.stderr) 1238 else: 1239 m = RAW_DATA_RE.match(line) 1240 if m: 1241 s = self.strings.get(m.group(1).lower()) 1242 if s is not None: 1243 data.append(s) 1244 else: 1245 data.append(m.group(1)) 1246 line = m.group(2) 1247 else: 1248 raise ParseError("Questionable line at line %s"%it.lineno) 1249 1250 # Got a string, check for concatenation. 1251 if line.isspace() or not line: 1252 data.append(" ") 1253 line = _advance(it, line) 1254 line = line.strip() 1255 assert line 1256 if line[0] == '#': 1257 line = line[1:] 1258 else: 1259 data = "".join(data) 1260 data = re.sub(r'\s+', ' ', data) 1261 data = re.sub(r'^\s+', '', data) 1262 data = re.sub(r'\s+$', '', data) 1263 return data, line 1264 1265 def _parseEntry(self, line): #name, strings, entries 1266 it = self.fileiter 1267 self.entryLine = it.lineno 1268 line = _advance(it, line) 1269 1270 m = BRACE_BEGIN_RE.match(line) 1271 if not m: 1272 raise ParseError("Expected an opening brace at line %s"%it.lineno) 1273 line = m.group(1) 1274 1275 proto = {'string' : 'p', 1276 'preamble' : 'v',}.get(self.curEntType, 'kp*') 1277 1278 v = [] 1279 while 1: 1280 line = _advance(it, line) 1281 1282 m = BRACE_END_RE.match(line) 1283 if m: 1284 line = m.group(1) 1285 break 1286 if not proto: 1287 raise ParseError("Overlong entry starting on line %s" 1288 % self.entryLine) 1289 elif proto[0] == 'k': 1290 key, line = self._parseKey(line) 1291 v.append(key) 1292 elif proto[0] == 'v': 1293 value, line = self._parseValue(line) 1294 v.append(value) 1295 elif proto[0] == 'p': 1296 key, line = self._parseKey(line) 1297 v.append(key) 1298 line = _advance(it, line) 1299 line = line.lstrip() 1300 if line[0] == '=': 1301 line = line[1:] 1302 value, line = self._parseValue(line) 1303 v.append(value) 1304 else: 1305 assert 0 1306 line = line.strip() 1307 if line and line[0] == ',': 1308 line = line[1:] 1309 if proto and proto[1:] != '*': 1310 proto = proto[1:] 1311 if proto and proto[1:] != '*': 1312 raise ParseError("Missing arguments to %s on line %s" 1313 % (self.curEntType, self.entryLine)) 1314 1315 if self.curEntType == 'string': 1316 self.strings[v[0]] = v[1] 1317 self.newStrings[v[0]] = v[1] 1318 self.invStrings[v[1]] = v[0] 1319 elif self.curEntType == 'preamble': 1320 pass 1321 else: 1322 key = v[0] 1323 d = {} 1324 for i in range(1, len(v), 2): 1325 d[v[i].lower()] = v[i+1] 1326 ent = BibTeXEntry(self.curEntType, key, d) 1327 ent.entryLine = self.entryLine 1328 self.result.addEntry(ent) 1329 1330 return line 1331 1332 def parse(self): 1333 try: 1334 self._parse() 1335 except StopIteration: 1336 if self.litStringLine: 1337 raise ParseError("Unexpected EOF in string (started on %s)" 1338 % self.litStringLine) 1339 elif self.entryLine: 1340 raise ParseError("Unexpected EOF at line %s (entry started " 1341 "on %s)" % (self.fileiter.lineno, 1342 self.entryLine)) 1343 1344 self.result.invStrings = self.invStrings 1345 self.result.newStrings = self.newStrings 1346 1347 return self.result 1348 1349 def _parse(self): 1350 it = self.fileiter 1351 line = next(it) 1352 while 1: 1353 # Skip blank lines. 1354 while not line \ 1355 or line.isspace() \ 1356 or OUTER_COMMENT_RE.match(line): 1357 line = next(it) 1358 # Get the first line of an entry. 1359 m = ENTRY_BEGIN_RE.match(line) 1360 if m: 1361 self.curEntType = m.group(1).lower() 1362 line = m.group(2) 1363 line = self._parseEntry(line) 1364 self.entryLine = 0 1365 else: 1366 raise ParseError("Bad input at line %s " 1367 "(expected a new entry.)" 1368 % it.lineno) 1369 1370 def _advance(it, line): 1371 while not line \ 1372 or line.isspace() \ 1373 or COMMENT_RE.match(line): 1374 line = next(it) 1375 return line 1376 1377 # Matches a comment line outside of an entry. 1378 OUTER_COMMENT_RE = re.compile(r'^\s*[\#\%]') 1379 1380 # Matches a comment line inside of an entry. 1381 COMMENT_RE = re.compile(r'^\s*\%') 1382 1383 # Matches the start of an entry. group 1 is the type of the entry. 1384 # group 2 is the rest of the line. 1385 ENTRY_BEGIN_RE = re.compile(r'''^\s*\@([^\s\"\%\'\(\)\,\=\{\}]+)(.*)''') 1386 1387 # Start of an entry. group 1 is the keyword naming the entry. 1388 BRACE_BEGIN_RE = re.compile(r'\s*\{(.*)') 1389 BRACE_END_RE = re.compile(r'\s*\}(.*)') 1390 KEY_RE = re.compile(r'''\s*([^\"\#\%\'\(\)\,\=\{\}\s]+)(.*)''') 1391 1392 STRING_CLOSE_RE = re.compile(r'^([^\{\}\"]*)\"(.*)') 1393 BRACE_CLOSE_RE = re.compile(r'^([^\{\}]*)\}(.*)') 1394 BRACE_OPEN_RE = re.compile(r'^([^\{\}]*\{)(.*)') 1395 RAW_DATA_RE = re.compile(r'^([^\s\},]+)(.*)') 1396 1397 def parseFile(filename, result=None): 1398 """ 1399 Helper function: parse a single BibTeX file 1400 """ 1401 f = FileIter(fname=filename) 1402 p = Parser(f, {}, result) 1403 r = p.parse() 1404 r.resolve() 1405 for e in r.entries: 1406 e.check() 1407 return r 1408 1409 def parseString(string, result=None): 1410 """ 1411 Helper function: parse BibTeX from a string 1412 """ 1413 f = FileIter(string=string) 1414 p = Parser(f, {}, result) 1415 r = p.parse() 1416 r.resolve() 1417 for e in r.entries: 1418 e.check() 1419 return r 1420 1421 if __name__ == '__main__': 1422 if len(sys.argv) > 1: 1423 fname = sys.argv[1] 1424 else: 1425 fname = "testbib/pdos.bib" 1426 1427 r = parseFile(fname) 1428 1429 for e in r.entries: 1430 if e.type in ("proceedings", "journal"): continue 1431 print(e.to_html())