lex.go (30022B)
1 package toml 2 3 import ( 4 "fmt" 5 "reflect" 6 "runtime" 7 "strings" 8 "unicode" 9 "unicode/utf8" 10 ) 11 12 type itemType int 13 14 const ( 15 itemError itemType = iota 16 itemEOF 17 itemText 18 itemString 19 itemStringEsc 20 itemRawString 21 itemMultilineString 22 itemRawMultilineString 23 itemBool 24 itemInteger 25 itemFloat 26 itemDatetime 27 itemArray // the start of an array 28 itemArrayEnd 29 itemTableStart 30 itemTableEnd 31 itemArrayTableStart 32 itemArrayTableEnd 33 itemKeyStart 34 itemKeyEnd 35 itemCommentStart 36 itemInlineTableStart 37 itemInlineTableEnd 38 ) 39 40 const eof = 0 41 42 type stateFn func(lx *lexer) stateFn 43 44 func (p Position) String() string { 45 return fmt.Sprintf("at line %d; start %d; length %d", p.Line, p.Start, p.Len) 46 } 47 48 type lexer struct { 49 input string 50 start int 51 pos int 52 line int 53 state stateFn 54 items chan item 55 esc bool 56 57 // Allow for backing up up to 4 runes. This is necessary because TOML 58 // contains 3-rune tokens (""" and '''). 59 prevWidths [4]int 60 nprev int // how many of prevWidths are in use 61 atEOF bool // If we emit an eof, we can still back up, but it is not OK to call next again. 62 63 // A stack of state functions used to maintain context. 64 // 65 // The idea is to reuse parts of the state machine in various places. For 66 // example, values can appear at the top level or within arbitrarily nested 67 // arrays. The last state on the stack is used after a value has been lexed. 68 // Similarly for comments. 
69 stack []stateFn 70 } 71 72 type item struct { 73 typ itemType 74 val string 75 err error 76 pos Position 77 } 78 79 func (lx *lexer) nextItem() item { 80 for { 81 select { 82 case item := <-lx.items: 83 return item 84 default: 85 lx.state = lx.state(lx) 86 //fmt.Printf(" STATE %-24s current: %-10s stack: %s\n", lx.state, lx.current(), lx.stack) 87 } 88 } 89 } 90 91 func lex(input string) *lexer { 92 lx := &lexer{ 93 input: input, 94 state: lexTop, 95 items: make(chan item, 10), 96 stack: make([]stateFn, 0, 10), 97 line: 1, 98 } 99 return lx 100 } 101 102 func (lx *lexer) push(state stateFn) { 103 lx.stack = append(lx.stack, state) 104 } 105 106 func (lx *lexer) pop() stateFn { 107 if len(lx.stack) == 0 { 108 panic("BUG in lexer: no states to pop") 109 } 110 last := lx.stack[len(lx.stack)-1] 111 lx.stack = lx.stack[0 : len(lx.stack)-1] 112 return last 113 } 114 115 func (lx *lexer) current() string { 116 return lx.input[lx.start:lx.pos] 117 } 118 119 func (lx lexer) getPos() Position { 120 p := Position{ 121 Line: lx.line, 122 Start: lx.start, 123 Len: lx.pos - lx.start, 124 } 125 if p.Len <= 0 { 126 p.Len = 1 127 } 128 return p 129 } 130 131 func (lx *lexer) emit(typ itemType) { 132 // Needed for multiline strings ending with an incomplete UTF-8 sequence. 
133 if lx.start > lx.pos { 134 lx.error(errLexUTF8{lx.input[lx.pos]}) 135 return 136 } 137 lx.items <- item{typ: typ, pos: lx.getPos(), val: lx.current()} 138 lx.start = lx.pos 139 } 140 141 func (lx *lexer) emitTrim(typ itemType) { 142 lx.items <- item{typ: typ, pos: lx.getPos(), val: strings.TrimSpace(lx.current())} 143 lx.start = lx.pos 144 } 145 146 func (lx *lexer) next() (r rune) { 147 if lx.atEOF { 148 panic("BUG in lexer: next called after EOF") 149 } 150 if lx.pos >= len(lx.input) { 151 lx.atEOF = true 152 return eof 153 } 154 155 if lx.input[lx.pos] == '\n' { 156 lx.line++ 157 } 158 lx.prevWidths[3] = lx.prevWidths[2] 159 lx.prevWidths[2] = lx.prevWidths[1] 160 lx.prevWidths[1] = lx.prevWidths[0] 161 if lx.nprev < 4 { 162 lx.nprev++ 163 } 164 165 r, w := utf8.DecodeRuneInString(lx.input[lx.pos:]) 166 if r == utf8.RuneError && w == 1 { 167 lx.error(errLexUTF8{lx.input[lx.pos]}) 168 return utf8.RuneError 169 } 170 171 // Note: don't use peek() here, as this calls next(). 172 if isControl(r) || (r == '\r' && (len(lx.input)-1 == lx.pos || lx.input[lx.pos+1] != '\n')) { 173 lx.errorControlChar(r) 174 return utf8.RuneError 175 } 176 177 lx.prevWidths[0] = w 178 lx.pos += w 179 return r 180 } 181 182 // ignore skips over the pending input before this point. 183 func (lx *lexer) ignore() { 184 lx.start = lx.pos 185 } 186 187 // backup steps back one rune. Can be called 4 times between calls to next. 188 func (lx *lexer) backup() { 189 if lx.atEOF { 190 lx.atEOF = false 191 return 192 } 193 if lx.nprev < 1 { 194 panic("BUG in lexer: backed up too far") 195 } 196 w := lx.prevWidths[0] 197 lx.prevWidths[0] = lx.prevWidths[1] 198 lx.prevWidths[1] = lx.prevWidths[2] 199 lx.prevWidths[2] = lx.prevWidths[3] 200 lx.nprev-- 201 202 lx.pos -= w 203 if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' { 204 lx.line-- 205 } 206 } 207 208 // accept consumes the next rune if it's equal to `valid`. 
209 func (lx *lexer) accept(valid rune) bool { 210 if lx.next() == valid { 211 return true 212 } 213 lx.backup() 214 return false 215 } 216 217 // peek returns but does not consume the next rune in the input. 218 func (lx *lexer) peek() rune { 219 r := lx.next() 220 lx.backup() 221 return r 222 } 223 224 // skip ignores all input that matches the given predicate. 225 func (lx *lexer) skip(pred func(rune) bool) { 226 for { 227 r := lx.next() 228 if pred(r) { 229 continue 230 } 231 lx.backup() 232 lx.ignore() 233 return 234 } 235 } 236 237 // error stops all lexing by emitting an error and returning `nil`. 238 // 239 // Note that any value that is a character is escaped if it's a special 240 // character (newlines, tabs, etc.). 241 func (lx *lexer) error(err error) stateFn { 242 if lx.atEOF { 243 return lx.errorPrevLine(err) 244 } 245 lx.items <- item{typ: itemError, pos: lx.getPos(), err: err} 246 return nil 247 } 248 249 // errorfPrevline is like error(), but sets the position to the last column of 250 // the previous line. 251 // 252 // This is so that unexpected EOF or NL errors don't show on a new blank line. 253 func (lx *lexer) errorPrevLine(err error) stateFn { 254 pos := lx.getPos() 255 pos.Line-- 256 pos.Len = 1 257 pos.Start = lx.pos - 1 258 lx.items <- item{typ: itemError, pos: pos, err: err} 259 return nil 260 } 261 262 // errorPos is like error(), but allows explicitly setting the position. 263 func (lx *lexer) errorPos(start, length int, err error) stateFn { 264 pos := lx.getPos() 265 pos.Start = start 266 pos.Len = length 267 lx.items <- item{typ: itemError, pos: pos, err: err} 268 return nil 269 } 270 271 // errorf is like error, and creates a new error. 
272 func (lx *lexer) errorf(format string, values ...any) stateFn { 273 if lx.atEOF { 274 pos := lx.getPos() 275 if lx.pos >= 1 && lx.input[lx.pos-1] == '\n' { 276 pos.Line-- 277 } 278 pos.Len = 1 279 pos.Start = lx.pos - 1 280 lx.items <- item{typ: itemError, pos: pos, err: fmt.Errorf(format, values...)} 281 return nil 282 } 283 lx.items <- item{typ: itemError, pos: lx.getPos(), err: fmt.Errorf(format, values...)} 284 return nil 285 } 286 287 func (lx *lexer) errorControlChar(cc rune) stateFn { 288 return lx.errorPos(lx.pos-1, 1, errLexControl{cc}) 289 } 290 291 // lexTop consumes elements at the top level of TOML data. 292 func lexTop(lx *lexer) stateFn { 293 r := lx.next() 294 if isWhitespace(r) || isNL(r) { 295 return lexSkip(lx, lexTop) 296 } 297 switch r { 298 case '#': 299 lx.push(lexTop) 300 return lexCommentStart 301 case '[': 302 return lexTableStart 303 case eof: 304 if lx.pos > lx.start { 305 // TODO: never reached? I think this can only occur on a bug in the 306 // lexer(?) 307 return lx.errorf("unexpected EOF") 308 } 309 lx.emit(itemEOF) 310 return nil 311 } 312 313 // At this point, the only valid item can be a key, so we back up 314 // and let the key lexer do the rest. 315 lx.backup() 316 lx.push(lexTopEnd) 317 return lexKeyStart 318 } 319 320 // lexTopEnd is entered whenever a top-level item has been consumed. (A value 321 // or a table.) It must see only whitespace, and will turn back to lexTop 322 // upon a newline. If it sees EOF, it will quit the lexer successfully. 323 func lexTopEnd(lx *lexer) stateFn { 324 r := lx.next() 325 switch { 326 case r == '#': 327 // a comment will read to a newline for us. 
328 lx.push(lexTop) 329 return lexCommentStart 330 case isWhitespace(r): 331 return lexTopEnd 332 case isNL(r): 333 lx.ignore() 334 return lexTop 335 case r == eof: 336 lx.emit(itemEOF) 337 return nil 338 } 339 return lx.errorf("expected a top-level item to end with a newline, comment, or EOF, but got %q instead", r) 340 } 341 342 // lexTable lexes the beginning of a table. Namely, it makes sure that 343 // it starts with a character other than '.' and ']'. 344 // It assumes that '[' has already been consumed. 345 // It also handles the case that this is an item in an array of tables. 346 // e.g., '[[name]]'. 347 func lexTableStart(lx *lexer) stateFn { 348 if lx.peek() == '[' { 349 lx.next() 350 lx.emit(itemArrayTableStart) 351 lx.push(lexArrayTableEnd) 352 } else { 353 lx.emit(itemTableStart) 354 lx.push(lexTableEnd) 355 } 356 return lexTableNameStart 357 } 358 359 func lexTableEnd(lx *lexer) stateFn { 360 lx.emit(itemTableEnd) 361 return lexTopEnd 362 } 363 364 func lexArrayTableEnd(lx *lexer) stateFn { 365 if r := lx.next(); r != ']' { 366 return lx.errorf("expected end of table array name delimiter ']', but got %q instead", r) 367 } 368 lx.emit(itemArrayTableEnd) 369 return lexTopEnd 370 } 371 372 func lexTableNameStart(lx *lexer) stateFn { 373 lx.skip(isWhitespace) 374 switch r := lx.peek(); { 375 case r == ']' || r == eof: 376 return lx.errorf("unexpected end of table name (table names cannot be empty)") 377 case r == '.': 378 return lx.errorf("unexpected table separator (table names cannot be empty)") 379 case r == '"' || r == '\'': 380 lx.ignore() 381 lx.push(lexTableNameEnd) 382 return lexQuotedName 383 default: 384 lx.push(lexTableNameEnd) 385 return lexBareName 386 } 387 } 388 389 // lexTableNameEnd reads the end of a piece of a table name, optionally 390 // consuming whitespace. 
391 func lexTableNameEnd(lx *lexer) stateFn { 392 lx.skip(isWhitespace) 393 switch r := lx.next(); { 394 case r == '.': 395 lx.ignore() 396 return lexTableNameStart 397 case r == ']': 398 return lx.pop() 399 default: 400 return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r) 401 } 402 } 403 404 // lexBareName lexes one part of a key or table. 405 // 406 // It assumes that at least one valid character for the table has already been 407 // read. 408 // 409 // Lexes only one part, e.g. only 'a' inside 'a.b'. 410 func lexBareName(lx *lexer) stateFn { 411 r := lx.next() 412 if isBareKeyChar(r) { 413 return lexBareName 414 } 415 lx.backup() 416 lx.emit(itemText) 417 return lx.pop() 418 } 419 420 // lexQuotedName lexes one part of a quoted key or table name. It assumes that 421 // it starts lexing at the quote itself (" or '). 422 // 423 // Lexes only one part, e.g. only '"a"' inside '"a".b'. 424 func lexQuotedName(lx *lexer) stateFn { 425 r := lx.next() 426 switch { 427 case r == '"': 428 lx.ignore() // ignore the '"' 429 return lexString 430 case r == '\'': 431 lx.ignore() // ignore the "'" 432 return lexRawString 433 434 // TODO: I don't think any of the below conditions can ever be reached? 435 case isWhitespace(r): 436 return lexSkip(lx, lexValue) 437 case r == eof: 438 return lx.errorf("unexpected EOF; expected value") 439 default: 440 return lx.errorf("expected value but found %q instead", r) 441 } 442 } 443 444 // lexKeyStart consumes all key parts until a '='. 
func lexKeyStart(lx *lexer) stateFn {
	lx.skip(isWhitespace)
	switch r := lx.peek(); {
	case r == '=' || r == eof:
		return lx.errorf("unexpected '=': key name appears blank")
	case r == '.':
		return lx.errorf("unexpected '.': keys cannot start with a '.'")
	case r == '"' || r == '\'':
		lx.ignore()
		// Quoted keys emit itemKeyStart exactly like bare keys.
		fallthrough
	default: // Bare key
		lx.emit(itemKeyStart)
		return lexKeyNameStart
	}
}

// lexKeyNameStart begins one part of a (possibly dotted) key. It pushes
// lexKeyEnd and hands over to the bare or quoted name lexer, depending on the
// first rune.
func lexKeyNameStart(lx *lexer) stateFn {
	lx.skip(isWhitespace)
	switch r := lx.peek(); {
	default:
		lx.push(lexKeyEnd)
		return lexBareName
	case r == '"' || r == '\'':
		lx.ignore()
		lx.push(lexKeyEnd)
		return lexQuotedName

	// TODO: I think these can never be reached?
	case r == '=' || r == eof:
		return lx.errorf("unexpected '='")
	case r == '.':
		return lx.errorf("unexpected '.'")
	}
}

// lexKeyEnd consumes the end of a key and trims whitespace (up to the key
// separator). A '.' starts the next key part; a '=' emits itemKeyEnd and
// moves on to the value.
func lexKeyEnd(lx *lexer) stateFn {
	lx.skip(isWhitespace)
	switch r := lx.next(); {
	case isWhitespace(r):
		return lexSkip(lx, lexKeyEnd)
	case r == eof: // TODO: never reached
		return lx.errorf("unexpected EOF; expected key separator '='")
	case r == '.':
		lx.ignore()
		return lexKeyNameStart
	case r == '=':
		lx.emit(itemKeyEnd)
		return lexSkip(lx, lexValue)
	default:
		// A newline has already been consumed here, so report the error on
		// the line the key is on rather than on the following one.
		if r == '\n' {
			return lx.errorPrevLine(fmt.Errorf("expected '.' or '=', but got %q instead", r))
		}
		return lx.errorf("expected '.' or '=', but got %q instead", r)
	}
}

// lexValue starts the consumption of a value anywhere a value is expected.
// lexValue will ignore whitespace.
// After a value is lexed, the last state on the stack is popped and returned.
func lexValue(lx *lexer) stateFn {
	// We allow whitespace to precede a value, but NOT newlines.
	// In array syntax, the array states are responsible for ignoring newlines.
	r := lx.next()
	switch {
	case isWhitespace(r):
		return lexSkip(lx, lexValue)
	case isDigit(r):
		lx.backup() // avoid an extra state and use the same as above
		return lexNumberOrDateStart
	}
	switch r {
	case '[':
		lx.ignore()
		lx.emit(itemArray)
		return lexArrayValue
	case '{':
		lx.ignore()
		lx.emit(itemInlineTableStart)
		return lexInlineTableValue
	case '"':
		if lx.accept('"') {
			if lx.accept('"') {
				lx.ignore() // Ignore """
				return lexMultilineString
			}
			// Only two quotes: un-read the second one so lexString sees it as
			// the closing quote of an empty string.
			lx.backup()
		}
		lx.ignore() // ignore the '"'
		return lexString
	case '\'':
		if lx.accept('\'') {
			if lx.accept('\'') {
				lx.ignore() // Ignore '''
				return lexMultilineRawString
			}
			// Only two quotes: un-read the second one so lexRawString sees it
			// as the closing quote of an empty string.
			lx.backup()
		}
		lx.ignore() // ignore the "'"
		return lexRawString
	case '.': // special error case, be kind to users
		return lx.errorf("floats must start with a digit, not '.'")
	case 'i', 'n':
		// "inf" or "nan". When neither matches, control leaves the switch and
		// reaches the letter handling below.
		if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) {
			lx.emit(itemFloat)
			return lx.pop()
		}
	case '-', '+':
		return lexDecimalNumberStart
	}
	if unicode.IsLetter(r) {
		// Be permissive here; lexBool will give a nice error if the
		// user wrote something like
		//   x = foo
		// (i.e. not 'true' or 'false' but is something else word-like.)
		lx.backup()
		return lexBool
	}
	if r == eof {
		return lx.errorf("unexpected EOF; expected value")
	}
	// The newline has already been consumed; report on the line of the key.
	if r == '\n' {
		return lx.errorPrevLine(fmt.Errorf("expected value but found %q instead", r))
	}
	return lx.errorf("expected value but found %q instead", r)
}

// lexArrayValue consumes one value in an array. It assumes that '[' or ','
// have already been consumed. All whitespace and newlines are ignored.
575 func lexArrayValue(lx *lexer) stateFn { 576 r := lx.next() 577 switch { 578 case isWhitespace(r) || isNL(r): 579 return lexSkip(lx, lexArrayValue) 580 case r == '#': 581 lx.push(lexArrayValue) 582 return lexCommentStart 583 case r == ',': 584 return lx.errorf("unexpected comma") 585 case r == ']': 586 return lexArrayEnd 587 } 588 589 lx.backup() 590 lx.push(lexArrayValueEnd) 591 return lexValue 592 } 593 594 // lexArrayValueEnd consumes everything between the end of an array value and 595 // the next value (or the end of the array): it ignores whitespace and newlines 596 // and expects either a ',' or a ']'. 597 func lexArrayValueEnd(lx *lexer) stateFn { 598 switch r := lx.next(); { 599 case isWhitespace(r) || isNL(r): 600 return lexSkip(lx, lexArrayValueEnd) 601 case r == '#': 602 lx.push(lexArrayValueEnd) 603 return lexCommentStart 604 case r == ',': 605 lx.ignore() 606 return lexArrayValue // move on to the next value 607 case r == ']': 608 return lexArrayEnd 609 default: 610 return lx.errorf("expected a comma (',') or array terminator (']'), but got %s", runeOrEOF(r)) 611 } 612 } 613 614 // lexArrayEnd finishes the lexing of an array. 615 // It assumes that a ']' has just been consumed. 616 func lexArrayEnd(lx *lexer) stateFn { 617 lx.ignore() 618 lx.emit(itemArrayEnd) 619 return lx.pop() 620 } 621 622 // lexInlineTableValue consumes one key/value pair in an inline table. 623 // It assumes that '{' or ',' have already been consumed. Whitespace is ignored. 
624 func lexInlineTableValue(lx *lexer) stateFn { 625 r := lx.next() 626 switch { 627 case isWhitespace(r): 628 return lexSkip(lx, lexInlineTableValue) 629 case isNL(r): 630 return lexSkip(lx, lexInlineTableValue) 631 case r == '#': 632 lx.push(lexInlineTableValue) 633 return lexCommentStart 634 case r == ',': 635 return lx.errorf("unexpected comma") 636 case r == '}': 637 return lexInlineTableEnd 638 } 639 lx.backup() 640 lx.push(lexInlineTableValueEnd) 641 return lexKeyStart 642 } 643 644 // lexInlineTableValueEnd consumes everything between the end of an inline table 645 // key/value pair and the next pair (or the end of the table): 646 // it ignores whitespace and expects either a ',' or a '}'. 647 func lexInlineTableValueEnd(lx *lexer) stateFn { 648 switch r := lx.next(); { 649 case isWhitespace(r): 650 return lexSkip(lx, lexInlineTableValueEnd) 651 case isNL(r): 652 return lexSkip(lx, lexInlineTableValueEnd) 653 case r == '#': 654 lx.push(lexInlineTableValueEnd) 655 return lexCommentStart 656 case r == ',': 657 lx.ignore() 658 lx.skip(isWhitespace) 659 if lx.peek() == '}' { 660 return lexInlineTableValueEnd 661 } 662 return lexInlineTableValue 663 case r == '}': 664 return lexInlineTableEnd 665 default: 666 return lx.errorf("expected a comma or an inline table terminator '}', but got %s instead", runeOrEOF(r)) 667 } 668 } 669 670 func runeOrEOF(r rune) string { 671 if r == eof { 672 return "end of file" 673 } 674 return "'" + string(r) + "'" 675 } 676 677 // lexInlineTableEnd finishes the lexing of an inline table. 678 // It assumes that a '}' has just been consumed. 679 func lexInlineTableEnd(lx *lexer) stateFn { 680 lx.ignore() 681 lx.emit(itemInlineTableEnd) 682 return lx.pop() 683 } 684 685 // lexString consumes the inner contents of a string. It assumes that the 686 // beginning '"' has already been consumed and ignored. 
func lexString(lx *lexer) stateFn {
	r := lx.next()
	switch {
	case r == eof:
		return lx.errorf(`unexpected EOF; expected '"'`)
	case isNL(r):
		return lx.errorPrevLine(errLexStringNL{})
	case r == '\\':
		lx.push(lexString)
		return lexStringEscape
	case r == '"':
		// Un-read the closing quote so it is not part of the emitted value.
		lx.backup()
		// Strings in which an escape was seen are emitted as itemStringEsc.
		if lx.esc {
			lx.esc = false
			lx.emit(itemStringEsc)
		} else {
			lx.emit(itemString)
		}
		// Read over the closing quote again and discard it.
		lx.next()
		lx.ignore()
		return lx.pop()
	}
	return lexString
}

// lexMultilineString consumes the inner contents of a string. It assumes that
// the beginning '"""' has already been consumed and ignored.
func lexMultilineString(lx *lexer) stateFn {
	r := lx.next()
	switch r {
	default:
		return lexMultilineString
	case eof:
		return lx.errorf(`unexpected EOF; expected '"""'`)
	case '\\':
		return lexMultilineStringEscape
	case '"':
		/// Found " → try to read two more "".
		if lx.accept('"') {
			if lx.accept('"') {
				/// Peek ahead: the string can contain " and "", including at the
				/// end: """str"""""
				/// 6 or more at the end, however, is an error.
				if lx.peek() == '"' {
					/// Check if we already lexed 5 's; if so we have 6 now, and
					/// that's just too many man!
					///
					/// Second check is for the edge case:
					///
					///            two quotes allowed.
					///            vv
					///   """lol \""""""
					///          ^^  ^^^---- closing three
					///     escaped
					///
					/// Bit ugly, but it works
					if strings.HasSuffix(lx.current(), `"""""`) && !strings.HasSuffix(lx.current(), `\"""""`) {
						return lx.errorf(`unexpected '""""""'`)
					}
					/// Un-read two of the three quotes, so that only the first
					/// one counts as string content on this pass.
					lx.backup()
					lx.backup()
					return lexMultilineString
				}

				lx.backup() /// backup: don't include the """ in the item.
				lx.backup()
				lx.backup()
				lx.esc = false
				lx.emit(itemMultilineString)
				lx.next() /// Read over """ again and discard it.
				lx.next()
				lx.next()
				lx.ignore()
				return lx.pop()
			}
			/// Only two quotes in a row: un-read the second one.
			lx.backup()
		}
		return lexMultilineString
	}
}

// lexRawString consumes a raw string. Nothing can be escaped in such a string.
// It assumes that the beginning "'" has already been consumed and ignored.
func lexRawString(lx *lexer) stateFn {
	r := lx.next()
	switch {
	default:
		return lexRawString
	case r == eof:
		return lx.errorf(`unexpected EOF; expected "'"`)
	case isNL(r):
		return lx.errorPrevLine(errLexStringNL{})
	case r == '\'':
		// Emit without the closing quote, then read over it and discard it.
		lx.backup()
		lx.emit(itemRawString)
		lx.next()
		lx.ignore()
		return lx.pop()
	}
}

// lexMultilineRawString consumes a raw string. Nothing can be escaped in such a
// string. It assumes that the beginning triple-' has already been consumed and
// ignored.
func lexMultilineRawString(lx *lexer) stateFn {
	r := lx.next()
	switch r {
	default:
		return lexMultilineRawString
	case eof:
		return lx.errorf(`unexpected EOF; expected "'''"`)
	case '\'':
		/// Found ' → try to read two more ''.
		if lx.accept('\'') {
			if lx.accept('\'') {
				/// Peek ahead: the string can contain ' and '', including at the
				/// end: '''str'''''
				/// 6 or more at the end, however, is an error.
				if lx.peek() == '\'' {
					/// Check if we already lexed 5 's; if so we have 6 now, and
					/// that's just too many man!
					if strings.HasSuffix(lx.current(), "'''''") {
						return lx.errorf(`unexpected "''''''"`)
					}
					/// Un-read two of the three quotes, so that only the first
					/// one counts as string content on this pass.
					lx.backup()
					lx.backup()
					return lexMultilineRawString
				}

				lx.backup() /// backup: don't include the ''' in the item.
				lx.backup()
				lx.backup()
				lx.emit(itemRawMultilineString)
				lx.next() /// Read over ''' again and discard it.
				lx.next()
				lx.next()
				lx.ignore()
				return lx.pop()
			}
			/// Only two quotes in a row: un-read the second one.
			lx.backup()
		}
		return lexMultilineRawString
	}
}

// lexMultilineStringEscape consumes an escaped character. It assumes that the
// preceding '\\' has already been consumed.
func lexMultilineStringEscape(lx *lexer) stateFn {
	if isNL(lx.next()) { /// \ escaping newline.
		return lexMultilineString
	}
	lx.backup()
	lx.push(lexMultilineString)
	// Called directly rather than returned as the next state.
	return lexStringEscape(lx)
}

// lexStringEscape consumes the character(s) following a backslash and marks
// the lexer as having seen an escape. Single-character escapes return to the
// state on top of the stack; \x, \u and \U continue in the state that reads
// their hexadecimal digits. Any other character is an error.
func lexStringEscape(lx *lexer) stateFn {
	lx.esc = true
	r := lx.next()
	switch r {
	case 'e':
		fallthrough
	case 'b':
		fallthrough
	case 't':
		fallthrough
	case 'n':
		fallthrough
	case 'f':
		fallthrough
	case 'r':
		fallthrough
	case '"':
		fallthrough
	case ' ', '\t':
		// Inside """ .. """ strings you can use \ to escape newlines, and any
		// amount of whitespace can be between the \ and \n.
		fallthrough
	case '\\':
		return lx.pop()
	case 'x':
		return lexHexEscape
	case 'u':
		return lexShortUnicodeEscape
	case 'U':
		return lexLongUnicodeEscape
	}
	return lx.error(errLexEscape{r})
}

// lexHexEscape consumes the two hexadecimal digits of a '\x' escape.
func lexHexEscape(lx *lexer) stateFn {
	var r rune
	for i := 0; i < 2; i++ {
		r = lx.next()
		if !isHex(r) {
			return lx.errorf(`expected two hexadecimal digits after '\x', but got %q instead`, lx.current())
		}
	}
	return lx.pop()
}

// lexShortUnicodeEscape consumes the four hexadecimal digits of a '\u' escape.
func lexShortUnicodeEscape(lx *lexer) stateFn {
	var r rune
	for i := 0; i < 4; i++ {
		r = lx.next()
		if !isHex(r) {
			return lx.errorf(`expected four hexadecimal digits after '\u', but got %q instead`, lx.current())
		}
	}
	return lx.pop()
}

// lexLongUnicodeEscape consumes the eight hexadecimal digits of a '\U' escape.
func lexLongUnicodeEscape(lx *lexer) stateFn {
	var r rune
	for i := 0; i < 8; i++ {
		r = lx.next()
		if !isHex(r) {
			return lx.errorf(`expected eight hexadecimal digits after '\U', but got %q instead`, lx.current())
		}
	}
	return lx.pop()
}

// lexNumberOrDateStart processes the first character of a value which begins
// with a digit. It exists to catch values starting with '0', so that
// lexBaseNumberOrDate can differentiate base prefixed integers from other
// types.
func lexNumberOrDateStart(lx *lexer) stateFn {
	if lx.next() == '0' {
		return lexBaseNumberOrDate
	}
	return lexNumberOrDate
}

// lexNumberOrDate consumes either an integer, float or datetime.
func lexNumberOrDate(lx *lexer) stateFn {
	r := lx.next()
	if isDigit(r) {
		return lexNumberOrDate
	}
	// The first non-digit decides which kind of value this is.
	switch r {
	case '-', ':':
		return lexDatetime
	case '_':
		return lexDecimalNumber
	case '.', 'e', 'E':
		return lexFloat
	}

	lx.backup()
	lx.emit(itemInteger)
	return lx.pop()
}

// lexDatetime consumes a Datetime, to a first approximation.
// The parser validates that it matches one of the accepted formats.
func lexDatetime(lx *lexer) stateFn {
	r := lx.next()
	if isDigit(r) {
		return lexDatetime
	}
	switch r {
	case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+':
		return lexDatetime
	}

	lx.backup()
	// ' ' is accepted above, so the pending text may end in spaces; emitTrim
	// strips them.
	lx.emitTrim(itemDatetime)
	return lx.pop()
}

// lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix.
func lexHexInteger(lx *lexer) stateFn {
	r := lx.next()
	if isHex(r) {
		return lexHexInteger
	}
	switch r {
	case '_':
		return lexHexInteger
	}

	lx.backup()
	lx.emit(itemInteger)
	return lx.pop()
}

// lexOctalInteger consumes an octal integer after seeing the '0o' prefix.
975 func lexOctalInteger(lx *lexer) stateFn { 976 r := lx.next() 977 if isOctal(r) { 978 return lexOctalInteger 979 } 980 switch r { 981 case '_': 982 return lexOctalInteger 983 } 984 985 lx.backup() 986 lx.emit(itemInteger) 987 return lx.pop() 988 } 989 990 // lexBinaryInteger consumes a binary integer after seeing the '0b' prefix. 991 func lexBinaryInteger(lx *lexer) stateFn { 992 r := lx.next() 993 if isBinary(r) { 994 return lexBinaryInteger 995 } 996 switch r { 997 case '_': 998 return lexBinaryInteger 999 } 1000 1001 lx.backup() 1002 lx.emit(itemInteger) 1003 return lx.pop() 1004 } 1005 1006 // lexDecimalNumber consumes a decimal float or integer. 1007 func lexDecimalNumber(lx *lexer) stateFn { 1008 r := lx.next() 1009 if isDigit(r) { 1010 return lexDecimalNumber 1011 } 1012 switch r { 1013 case '.', 'e', 'E': 1014 return lexFloat 1015 case '_': 1016 return lexDecimalNumber 1017 } 1018 1019 lx.backup() 1020 lx.emit(itemInteger) 1021 return lx.pop() 1022 } 1023 1024 // lexDecimalNumber consumes the first digit of a number beginning with a sign. 1025 // It assumes the sign has already been consumed. Values which start with a sign 1026 // are only allowed to be decimal integers or floats. 1027 // 1028 // The special "nan" and "inf" values are also recognized. 
1029 func lexDecimalNumberStart(lx *lexer) stateFn { 1030 r := lx.next() 1031 1032 // Special error cases to give users better error messages 1033 switch r { 1034 case 'i': 1035 if !lx.accept('n') || !lx.accept('f') { 1036 return lx.errorf("invalid float: '%s'", lx.current()) 1037 } 1038 lx.emit(itemFloat) 1039 return lx.pop() 1040 case 'n': 1041 if !lx.accept('a') || !lx.accept('n') { 1042 return lx.errorf("invalid float: '%s'", lx.current()) 1043 } 1044 lx.emit(itemFloat) 1045 return lx.pop() 1046 case '0': 1047 p := lx.peek() 1048 switch p { 1049 case 'b', 'o', 'x': 1050 return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p) 1051 } 1052 case '.': 1053 return lx.errorf("floats must start with a digit, not '.'") 1054 } 1055 1056 if isDigit(r) { 1057 return lexDecimalNumber 1058 } 1059 1060 return lx.errorf("expected a digit but got %q", r) 1061 } 1062 1063 // lexBaseNumberOrDate differentiates between the possible values which 1064 // start with '0'. It assumes that before reaching this state, the initial '0' 1065 // has been consumed. 1066 func lexBaseNumberOrDate(lx *lexer) stateFn { 1067 r := lx.next() 1068 // Note: All datetimes start with at least two digits, so we don't 1069 // handle date characters (':', '-', etc.) here. 1070 if isDigit(r) { 1071 return lexNumberOrDate 1072 } 1073 switch r { 1074 case '_': 1075 // Can only be decimal, because there can't be an underscore 1076 // between the '0' and the base designator, and dates can't 1077 // contain underscores. 
1078 return lexDecimalNumber 1079 case '.', 'e', 'E': 1080 return lexFloat 1081 case 'b': 1082 r = lx.peek() 1083 if !isBinary(r) { 1084 lx.errorf("not a binary number: '%s%c'", lx.current(), r) 1085 } 1086 return lexBinaryInteger 1087 case 'o': 1088 r = lx.peek() 1089 if !isOctal(r) { 1090 lx.errorf("not an octal number: '%s%c'", lx.current(), r) 1091 } 1092 return lexOctalInteger 1093 case 'x': 1094 r = lx.peek() 1095 if !isHex(r) { 1096 lx.errorf("not a hexadecimal number: '%s%c'", lx.current(), r) 1097 } 1098 return lexHexInteger 1099 } 1100 1101 lx.backup() 1102 lx.emit(itemInteger) 1103 return lx.pop() 1104 } 1105 1106 // lexFloat consumes the elements of a float. It allows any sequence of 1107 // float-like characters, so floats emitted by the lexer are only a first 1108 // approximation and must be validated by the parser. 1109 func lexFloat(lx *lexer) stateFn { 1110 r := lx.next() 1111 if isDigit(r) { 1112 return lexFloat 1113 } 1114 switch r { 1115 case '_', '.', '-', '+', 'e', 'E': 1116 return lexFloat 1117 } 1118 1119 lx.backup() 1120 lx.emit(itemFloat) 1121 return lx.pop() 1122 } 1123 1124 // lexBool consumes a bool string: 'true' or 'false. 1125 func lexBool(lx *lexer) stateFn { 1126 var rs []rune 1127 for { 1128 r := lx.next() 1129 if !unicode.IsLetter(r) { 1130 lx.backup() 1131 break 1132 } 1133 rs = append(rs, r) 1134 } 1135 s := string(rs) 1136 switch s { 1137 case "true", "false": 1138 lx.emit(itemBool) 1139 return lx.pop() 1140 } 1141 return lx.errorf("expected value but found %q instead", s) 1142 } 1143 1144 // lexCommentStart begins the lexing of a comment. It will emit 1145 // itemCommentStart and consume no characters, passing control to lexComment. 1146 func lexCommentStart(lx *lexer) stateFn { 1147 lx.ignore() 1148 lx.emit(itemCommentStart) 1149 return lexComment 1150 } 1151 1152 // lexComment lexes an entire comment. It assumes that '#' has been consumed. 
1153 // It will consume *up to* the first newline character, and pass control 1154 // back to the last state on the stack. 1155 func lexComment(lx *lexer) stateFn { 1156 switch r := lx.next(); { 1157 case isNL(r) || r == eof: 1158 lx.backup() 1159 lx.emit(itemText) 1160 return lx.pop() 1161 default: 1162 return lexComment 1163 } 1164 } 1165 1166 // lexSkip ignores all slurped input and moves on to the next state. 1167 func lexSkip(lx *lexer, nextState stateFn) stateFn { 1168 lx.ignore() 1169 return nextState 1170 } 1171 1172 func (s stateFn) String() string { 1173 if s == nil { 1174 return "<nil>" 1175 } 1176 name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name() 1177 if i := strings.LastIndexByte(name, '.'); i > -1 { 1178 name = name[i+1:] 1179 } 1180 return name + "()" 1181 } 1182 1183 func (itype itemType) String() string { 1184 switch itype { 1185 case itemError: 1186 return "Error" 1187 case itemEOF: 1188 return "EOF" 1189 case itemText: 1190 return "Text" 1191 case itemString, itemStringEsc, itemRawString, itemMultilineString, itemRawMultilineString: 1192 return "String" 1193 case itemBool: 1194 return "Bool" 1195 case itemInteger: 1196 return "Integer" 1197 case itemFloat: 1198 return "Float" 1199 case itemDatetime: 1200 return "DateTime" 1201 case itemArray: 1202 return "Array" 1203 case itemArrayEnd: 1204 return "ArrayEnd" 1205 case itemTableStart: 1206 return "TableStart" 1207 case itemTableEnd: 1208 return "TableEnd" 1209 case itemArrayTableStart: 1210 return "ArrayTableStart" 1211 case itemArrayTableEnd: 1212 return "ArrayTableEnd" 1213 case itemKeyStart: 1214 return "KeyStart" 1215 case itemKeyEnd: 1216 return "KeyEnd" 1217 case itemCommentStart: 1218 return "CommentStart" 1219 case itemInlineTableStart: 1220 return "InlineTableStart" 1221 case itemInlineTableEnd: 1222 return "InlineTableEnd" 1223 } 1224 panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype))) 1225 } 1226 1227 func (item item) String() string { 1228 return fmt.Sprintf("(%s, 
%s)", item.typ, item.val) 1229 } 1230 1231 func isWhitespace(r rune) bool { return r == '\t' || r == ' ' } 1232 func isNL(r rune) bool { return r == '\n' || r == '\r' } 1233 func isControl(r rune) bool { // Control characters except \t, \r, \n 1234 switch r { 1235 case '\t', '\r', '\n': 1236 return false 1237 default: 1238 return (r >= 0x00 && r <= 0x1f) || r == 0x7f 1239 } 1240 } 1241 func isDigit(r rune) bool { return r >= '0' && r <= '9' } 1242 func isBinary(r rune) bool { return r == '0' || r == '1' } 1243 func isOctal(r rune) bool { return r >= '0' && r <= '7' } 1244 func isHex(r rune) bool { return (r >= '0' && r <= '9') || (r|0x20 >= 'a' && r|0x20 <= 'f') } 1245 func isBareKeyChar(r rune) bool { 1246 return (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') || 1247 (r >= '0' && r <= '9') || r == '_' || r == '-' 1248 }