github.com/bingoohuang/gg@v0.0.0-20240325092523-45da7dee9335/pkg/yaml/scanner/scanner.go (about) 1 package scanner 2 3 import ( 4 "io" 5 "strings" 6 7 "github.com/bingoohuang/gg/pkg/yaml/token" 8 "golang.org/x/xerrors" 9 ) 10 11 // IndentState state for indent 12 type IndentState int 13 14 const ( 15 // IndentStateEqual equals previous indent 16 IndentStateEqual IndentState = iota 17 // IndentStateUp more indent than previous 18 IndentStateUp 19 // IndentStateDown less indent than previous 20 IndentStateDown 21 // IndentStateKeep uses not indent token 22 IndentStateKeep 23 ) 24 25 // Scanner holds the scanner's internal state while processing a given text. 26 // It can be allocated as part of another data structure but must be initialized via Init before use. 27 type Scanner struct { 28 source []rune 29 sourcePos int 30 sourceSize int 31 line int 32 column int 33 offset int 34 prevIndentLevel int 35 prevIndentNum int 36 prevIndentColumn int 37 docStartColumn int 38 indentLevel int 39 indentNum int 40 isFirstCharAtLine bool 41 isAnchor bool 42 startedFlowSequenceNum int 43 startedFlowMapNum int 44 indentState IndentState 45 savedPos *token.Position 46 } 47 48 func (s *Scanner) pos() *token.Position { 49 return &token.Position{ 50 Line: s.line, 51 Column: s.column, 52 Offset: s.offset, 53 IndentNum: s.indentNum, 54 IndentLevel: s.indentLevel, 55 } 56 } 57 58 func (s *Scanner) bufferedToken(ctx *Context) *token.Token { 59 if s.savedPos != nil { 60 tk := ctx.bufferedToken(s.savedPos) 61 s.savedPos = nil 62 return tk 63 } 64 size := len(ctx.buf) 65 return ctx.bufferedToken(&token.Position{ 66 Line: s.line, 67 Column: s.column - size, 68 Offset: s.offset - size, 69 IndentNum: s.indentNum, 70 IndentLevel: s.indentLevel, 71 }) 72 } 73 74 func (s *Scanner) progressColumn(ctx *Context, num int) { 75 s.column += num 76 s.offset += num 77 ctx.progress(num) 78 } 79 80 func (s *Scanner) progressLine(ctx *Context) { 81 s.column = 1 82 s.line++ 83 s.offset++ 84 s.indentNum = 0 85 s.isFirstCharAtLine = true 86 s.isAnchor = false 87 ctx.progress(1) 88 } 89 90 func (s *Scanner) isNeededKeepPreviousIndentNum(ctx *Context, c rune) bool { 91 if !s.isChangedToIndentStateUp() { 92 return false 93 } 94 if ctx.isDocument() { 95 return true 96 } 97 if c == '-' && ctx.existsBuffer() { 98 return true 99 } 100 return false 101 } 102 103 func (s *Scanner) isNewLineChar(c rune) bool { 104 if c == '\n' { 105 return true 106 } 107 if c == '\r' { 108 return true 109 } 110 return false 111 } 112 113 func (s *Scanner) newLineCount(src []rune) int { 114 size := len(src) 115 cnt := 0 116 for i := 0; i < size; i++ { 117 c := src[i] 118 switch c { 119 case '\r': 120 if i+1 < size && src[i+1] == '\n' { 121 i++ 122 } 123 cnt++ 124 case '\n': 125 cnt++ 126 } 127 } 128 return cnt 129 } 130 131 func (s *Scanner) updateIndent(ctx *Context, c rune) { 132 if s.isFirstCharAtLine && s.isNewLineChar(c) && ctx.isDocument() { 133 return 134 } 135 if s.isFirstCharAtLine && c == ' ' { 136 s.indentNum++ 137 return 138 } 139 if !s.isFirstCharAtLine { 140 s.indentState = IndentStateKeep 141 return 142 } 143 144 if s.prevIndentNum < s.indentNum { 145 s.indentLevel = s.prevIndentLevel + 1 146 s.indentState = IndentStateUp 147 } else if s.prevIndentNum == s.indentNum { 148 s.indentLevel = s.prevIndentLevel 149 s.indentState = IndentStateEqual 150 } else { 151 s.indentState = IndentStateDown 152 if s.prevIndentLevel > 0 { 153 s.indentLevel = s.prevIndentLevel - 1 154 } 155 } 156 157 if s.prevIndentColumn > 0 { 158 if s.prevIndentColumn < s.column { 159 s.indentState = IndentStateUp 160 } else if s.prevIndentColumn == s.column { 161 s.indentState = IndentStateEqual 162 } else { 163 s.indentState = IndentStateDown 164 } 165 } 166 s.isFirstCharAtLine = false 167 if s.isNeededKeepPreviousIndentNum(ctx, c) { 168 return 169 } 170 s.prevIndentNum = s.indentNum 171 s.prevIndentColumn = 0 172 s.prevIndentLevel = s.indentLevel 173 } 174 175 func (s *Scanner) isChangedToIndentStateDown() bool { 176 return s.indentState == IndentStateDown 177 } 178 179 func (s *Scanner) isChangedToIndentStateUp() bool { 180 return s.indentState == IndentStateUp 181 } 182 183 func (s *Scanner) isChangedToIndentStateEqual() bool { 184 return s.indentState == IndentStateEqual 185 } 186 187 func (s *Scanner) addBufferedTokenIfExists(ctx *Context) { 188 ctx.addToken(s.bufferedToken(ctx)) 189 } 190 191 func (s *Scanner) breakLiteral(ctx *Context) { 192 s.docStartColumn = 0 193 ctx.breakLiteral() 194 } 195 196 func (s *Scanner) scanSingleQuote(ctx *Context) (tk *token.Token, pos int) { 197 ctx.addOriginBuf('\'') 198 srcpos := s.pos() 199 startIndex := ctx.idx + 1 200 src := ctx.src 201 size := len(src) 202 value := []rune{} 203 isFirstLineChar := false 204 isNewLine := false 205 for idx := startIndex; idx < size; idx++ { 206 if !isNewLine { 207 s.progressColumn(ctx, 1) 208 } else { 209 isNewLine = false 210 } 211 c := src[idx] 212 pos = idx + 1 213 ctx.addOriginBuf(c) 214 if s.isNewLineChar(c) { 215 value = append(value, ' ') 216 isFirstLineChar = true 217 isNewLine = true 218 s.progressLine(ctx) 219 continue 220 } else if c == ' ' && isFirstLineChar { 221 continue 222 } else if c != '\'' { 223 value = append(value, c) 224 isFirstLineChar = false 225 continue 226 } 227 if idx+1 < len(ctx.src) && ctx.src[idx+1] == '\'' { 228 // '' handle as ' character 229 value = append(value, c) 230 ctx.addOriginBuf(c) 231 idx++ 232 continue 233 } 234 s.progressColumn(ctx, 1) 235 tk = token.SingleQuote(string(value), string(ctx.obuf), srcpos) 236 pos = idx - startIndex + 1 237 return 238 } 239 return 240 } 241 242 func hexToInt(b rune) int { 243 if b >= 'A' && b <= 'F' { 244 return int(b) - 'A' + 10 245 } 246 if b >= 'a' && b <= 'f' { 247 return int(b) - 'a' + 10 248 } 249 return int(b) - '0' 250 } 251 252 func hexRunesToInt(b []rune) int { 253 sum := 0 254 for i := 0; i < len(b); i++ { 255 sum += hexToInt(b[i]) << (uint(len(b)-i-1) * 4) 256 } 257 return sum 258 } 259 260 func (s *Scanner) scanDoubleQuote(ctx *Context) (tk *token.Token, pos int) { 261 ctx.addOriginBuf('"') 262 srcpos := s.pos() 263 startIndex := ctx.idx + 1 264 src := ctx.src 265 size := len(src) 266 value := []rune{} 267 isFirstLineChar := false 268 isNewLine := false 269 for idx := startIndex; idx < size; idx++ { 270 if !isNewLine { 271 s.progressColumn(ctx, 1) 272 } else { 273 isNewLine = false 274 } 275 c := src[idx] 276 pos = idx + 1 277 ctx.addOriginBuf(c) 278 if s.isNewLineChar(c) { 279 value = append(value, ' ') 280 isFirstLineChar = true 281 isNewLine = true 282 s.progressLine(ctx) 283 continue 284 } else if c == ' ' && isFirstLineChar { 285 continue 286 } else if c == '\\' { 287 isFirstLineChar = false 288 if idx+1 < size { 289 nextChar := src[idx+1] 290 switch nextChar { 291 case 'b': 292 ctx.addOriginBuf(nextChar) 293 value = append(value, '\b') 294 idx++ 295 continue 296 case 'e': 297 ctx.addOriginBuf(nextChar) 298 value = append(value, '\x1B') 299 idx++ 300 continue 301 case 'f': 302 ctx.addOriginBuf(nextChar) 303 value = append(value, '\f') 304 idx++ 305 continue 306 case 'n': 307 ctx.addOriginBuf(nextChar) 308 value = append(value, '\n') 309 idx++ 310 continue 311 case 'v': 312 ctx.addOriginBuf(nextChar) 313 value = append(value, '\v') 314 idx++ 315 continue 316 case 'L': // LS (#x2028) 317 ctx.addOriginBuf(nextChar) 318 value = append(value, []rune{'\xE2', '\x80', '\xA8'}...) 319 idx++ 320 continue 321 case 'N': // NEL (#x85) 322 ctx.addOriginBuf(nextChar) 323 value = append(value, []rune{'\xC2', '\x85'}...) 324 idx++ 325 continue 326 case 'P': // PS (#x2029) 327 ctx.addOriginBuf(nextChar) 328 value = append(value, []rune{'\xE2', '\x80', '\xA9'}...) 329 idx++ 330 continue 331 case '_': // #xA0 332 ctx.addOriginBuf(nextChar) 333 value = append(value, []rune{'\xC2', '\xA0'}...) 334 idx++ 335 continue 336 case '"': 337 ctx.addOriginBuf(nextChar) 338 value = append(value, nextChar) 339 idx++ 340 continue 341 case 'x': 342 if idx+3 >= size { 343 // TODO: need to return error 344 // err = xerrors.New("invalid escape character \\x") 345 return 346 } 347 codeNum := hexRunesToInt(src[idx+2 : idx+4]) 348 value = append(value, rune(codeNum)) 349 idx += 3 350 continue 351 case 'u': 352 if idx+5 >= size { 353 // TODO: need to return error 354 // err = xerrors.New("invalid escape character \\u") 355 return 356 } 357 codeNum := hexRunesToInt(src[idx+2 : idx+6]) 358 value = append(value, rune(codeNum)) 359 idx += 5 360 continue 361 case 'U': 362 if idx+9 >= size { 363 // TODO: need to return error 364 // err = xerrors.New("invalid escape character \\U") 365 return 366 } 367 codeNum := hexRunesToInt(src[idx+2 : idx+10]) 368 value = append(value, rune(codeNum)) 369 idx += 9 370 continue 371 case '\\': 372 ctx.addOriginBuf(nextChar) 373 idx++ 374 } 375 } 376 value = append(value, c) 377 continue 378 } else if c != '"' { 379 value = append(value, c) 380 isFirstLineChar = false 381 continue 382 } 383 s.progressColumn(ctx, 1) 384 tk = token.DoubleQuote(string(value), string(ctx.obuf), srcpos) 385 pos = idx - startIndex + 1 386 return 387 } 388 return 389 } 390 391 func (s *Scanner) scanQuote(ctx *Context, ch rune) (tk *token.Token, pos int) { 392 if ch == '\'' { 393 return s.scanSingleQuote(ctx) 394 } 395 return s.scanDoubleQuote(ctx) 396 } 397 398 func (s *Scanner) isMergeKey(ctx *Context) bool { 399 if ctx.repeatNum('<') != 2 { 400 return false 401 } 402 src := ctx.src 403 size := len(src) 404 for idx := ctx.idx + 2; idx < size; idx++ { 405 c := src[idx] 406 if c == ' ' { 407 continue 408 } 409 if c != ':' { 410 return false 411 } 412 if idx+1 < size { 413 nc := src[idx+1] 414 if nc == ' ' || s.isNewLineChar(nc) { 415 return true 416 } 417 } 418 } 419 return false 420 } 421 422 func (s *Scanner) scanTag(ctx *Context) (tk *token.Token, pos int) { 423 ctx.addOriginBuf('!') 424 ctx.progress(1) // skip '!' character 425 for idx, c := range ctx.src[ctx.idx:] { 426 pos = idx + 1 427 ctx.addOriginBuf(c) 428 switch c { 429 case ' ', '\n', '\r': 430 value := ctx.source(ctx.idx-1, ctx.idx+idx) 431 tk = token.Tag(value, string(ctx.obuf), s.pos()) 432 pos = len([]rune(value)) 433 return 434 } 435 } 436 return 437 } 438 439 func (s *Scanner) scanComment(ctx *Context) (tk *token.Token, pos int) { 440 ctx.addOriginBuf('#') 441 ctx.progress(1) // skip '#' character 442 for idx, c := range ctx.src[ctx.idx:] { 443 pos = idx + 1 444 ctx.addOriginBuf(c) 445 switch c { 446 case '\n', '\r': 447 if ctx.previousChar() == '\\' { 448 continue 449 } 450 value := ctx.source(ctx.idx, ctx.idx+idx) 451 tk = token.Comment(value, string(ctx.obuf), s.pos()) 452 pos = len([]rune(value)) + 1 453 return 454 } 455 } 456 return 457 } 458 459 func trimCommentFromLiteralOpt(text string) (string, error) { 460 idx := strings.Index(text, "#") 461 if idx < 0 { 462 return text, nil 463 } 464 if idx == 0 { 465 return "", xerrors.New("invalid literal header") 466 } 467 return text[:idx-1], nil 468 } 469 470 func (s *Scanner) scanLiteral(ctx *Context, c rune) { 471 ctx.addOriginBuf(c) 472 if ctx.isEOS() { 473 if ctx.isLiteral { 474 ctx.addBuf(c) 475 } 476 value := ctx.bufferedSrc() 477 ctx.addToken(token.String(string(value), string(ctx.obuf), s.pos())) 478 ctx.resetBuffer() 479 s.progressColumn(ctx, 1) 480 } else if s.isNewLineChar(c) { 481 if ctx.isLiteral { 482 ctx.addBuf(c) 483 } else { 484 ctx.addBuf(' ') 485 } 486 s.progressLine(ctx) 487 } else if s.isFirstCharAtLine && c == ' ' { 488 if 0 < s.docStartColumn && s.docStartColumn <= s.column { 489 ctx.addBuf(c) 490 } 491 s.progressColumn(ctx, 1) 492 } else { 493 if s.docStartColumn == 0 { 494 s.docStartColumn = s.column 495 } 496 ctx.addBuf(c) 497 s.progressColumn(ctx, 1) 498 } 499 } 500 501 func (s *Scanner) scanLiteralHeader(ctx *Context) (pos int, err error) { 502 header := ctx.currentChar() 503 ctx.addOriginBuf(header) 504 ctx.progress(1) // skip '|' or '>' character 505 for idx, c := range ctx.src[ctx.idx:] { 506 pos = idx 507 ctx.addOriginBuf(c) 508 switch c { 509 case '\n', '\r': 510 value := ctx.source(ctx.idx, ctx.idx+idx) 511 opt := strings.TrimRight(value, " ") 512 orgOptLen := len(opt) 513 opt, err = trimCommentFromLiteralOpt(opt) 514 if err != nil { 515 return 516 } 517 switch opt { 518 case "", "+", "-", 519 "0", "1", "2", "3", "4", "5", "6", "7", "8", "9": 520 hasComment := len(opt) < orgOptLen 521 if header == '|' { 522 if hasComment { 523 commentLen := orgOptLen - len(opt) 524 headerPos := strings.Index(string(ctx.obuf), "|") 525 litBuf := ctx.obuf[:len(ctx.obuf)-commentLen-headerPos] 526 commentBuf := ctx.obuf[len(litBuf):] 527 ctx.addToken(token.Literal("|"+opt, string(litBuf), s.pos())) 528 s.column += len(litBuf) 529 s.offset += len(litBuf) 530 commentHeader := strings.Index(value, "#") 531 ctx.addToken(token.Comment(string(value[commentHeader+1:]), string(commentBuf), s.pos())) 532 } else { 533 ctx.addToken(token.Literal("|"+opt, string(ctx.obuf), s.pos())) 534 } 535 ctx.isLiteral = true 536 } else if header == '>' { 537 if hasComment { 538 commentLen := orgOptLen - len(opt) 539 headerPos := strings.Index(string(ctx.obuf), ">") 540 foldedBuf := ctx.obuf[:len(ctx.obuf)-commentLen-headerPos] 541 commentBuf := ctx.obuf[len(foldedBuf):] 542 ctx.addToken(token.Folded(">"+opt, string(foldedBuf), s.pos())) 543 s.column += len(foldedBuf) 544 s.offset += len(foldedBuf) 545 commentHeader := strings.Index(value, "#") 546 ctx.addToken(token.Comment(string(value[commentHeader+1:]), string(commentBuf), s.pos())) 547 } else { 548 ctx.addToken(token.Folded(">"+opt, string(ctx.obuf), s.pos())) 549 } 550 ctx.isFolded = true 551 } 552 s.indentState = IndentStateKeep 553 ctx.resetBuffer() 554 ctx.literalOpt = opt 555 return 556 } 557 break 558 } 559 } 560 err = xerrors.New("invalid literal header") 561 return 562 } 563 564 func (s *Scanner) scanNewLine(ctx *Context, c rune) { 565 if len(ctx.buf) > 0 && s.savedPos == nil { 566 s.savedPos = s.pos() 567 s.savedPos.Column -= len(ctx.bufferedSrc()) 568 } 569 570 // if the following case, origin buffer has unnecessary two spaces. 571 // So, `removeRightSpaceFromOriginBuf` remove them, also fix column number too. 572 // --- 573 // a:[space][space] 574 // b: c 575 removedNum := ctx.removeRightSpaceFromBuf() 576 if removedNum > 0 { 577 s.column -= removedNum 578 s.offset -= removedNum 579 if s.savedPos != nil { 580 s.savedPos.Column -= removedNum 581 } 582 } 583 584 if ctx.isEOS() { 585 s.addBufferedTokenIfExists(ctx) 586 } else if s.isAnchor { 587 s.addBufferedTokenIfExists(ctx) 588 } 589 ctx.addBuf(' ') 590 ctx.addOriginBuf(c) 591 ctx.isSingleLine = false 592 s.progressLine(ctx) 593 } 594 595 func (s *Scanner) scan(ctx *Context) (pos int) { 596 for ctx.next() { 597 pos = ctx.nextPos() 598 c := ctx.currentChar() 599 s.updateIndent(ctx, c) 600 if ctx.isDocument() { 601 if s.isChangedToIndentStateEqual() || 602 s.isChangedToIndentStateDown() { 603 s.addBufferedTokenIfExists(ctx) 604 s.breakLiteral(ctx) 605 } else { 606 s.scanLiteral(ctx, c) 607 continue 608 } 609 } else if s.isChangedToIndentStateDown() { 610 s.addBufferedTokenIfExists(ctx) 611 } else if s.isChangedToIndentStateEqual() { 612 // if first character is new line character, buffer expect to raw folded literal 613 if len(ctx.obuf) > 0 && s.newLineCount(ctx.obuf) <= 1 { 614 // doesn't raw folded literal 615 s.addBufferedTokenIfExists(ctx) 616 } 617 } 618 switch c { 619 case '{': 620 if !ctx.existsBuffer() { 621 ctx.addOriginBuf(c) 622 ctx.addToken(token.MappingStart(string(ctx.obuf), s.pos())) 623 s.startedFlowMapNum++ 624 s.progressColumn(ctx, 1) 625 return 626 } 627 case '}': 628 if !ctx.existsBuffer() || s.startedFlowMapNum > 0 { 629 ctx.addToken(s.bufferedToken(ctx)) 630 ctx.addOriginBuf(c) 631 ctx.addToken(token.MappingEnd(string(ctx.obuf), s.pos())) 632 s.startedFlowMapNum-- 633 s.progressColumn(ctx, 1) 634 return 635 } 636 case '.': 637 if s.indentNum == 0 && s.column == 1 && ctx.repeatNum('.') == 3 { 638 ctx.addToken(token.DocumentEnd(string(ctx.obuf)+"...", s.pos())) 639 s.progressColumn(ctx, 3) 640 pos += 2 641 return 642 } 643 case '<': 644 if s.isMergeKey(ctx) { 645 s.prevIndentColumn = s.column 646 ctx.addToken(token.MergeKey(string(ctx.obuf)+"<<", s.pos())) 647 s.progressColumn(ctx, 1) 648 pos++ 649 return 650 } 651 case '-': 652 if s.indentNum == 0 && s.column == 1 && ctx.repeatNum('-') == 3 { 653 s.addBufferedTokenIfExists(ctx) 654 ctx.addToken(token.DocumentHeader(string(ctx.obuf)+"---", s.pos())) 655 s.progressColumn(ctx, 3) 656 pos += 2 657 return 658 } 659 if ctx.existsBuffer() && s.isChangedToIndentStateUp() { 660 // raw folded 661 ctx.isRawFolded = true 662 ctx.addBuf(c) 663 ctx.addOriginBuf(c) 664 s.progressColumn(ctx, 1) 665 continue 666 } 667 if ctx.existsBuffer() { 668 // '-' is literal 669 ctx.addBuf(c) 670 ctx.addOriginBuf(c) 671 s.progressColumn(ctx, 1) 672 continue 673 } 674 nc := ctx.nextChar() 675 if nc == ' ' || s.isNewLineChar(nc) { 676 s.addBufferedTokenIfExists(ctx) 677 ctx.addOriginBuf(c) 678 tk := token.SequenceEntry(string(ctx.obuf), s.pos()) 679 s.prevIndentColumn = tk.Position.Column 680 ctx.addToken(tk) 681 s.progressColumn(ctx, 1) 682 return 683 } 684 case '[': 685 if !ctx.existsBuffer() { 686 ctx.addOriginBuf(c) 687 ctx.addToken(token.SequenceStart(string(ctx.obuf), s.pos())) 688 s.startedFlowSequenceNum++ 689 s.progressColumn(ctx, 1) 690 return 691 } 692 case ']': 693 if !ctx.existsBuffer() || s.startedFlowSequenceNum > 0 { 694 s.addBufferedTokenIfExists(ctx) 695 ctx.addOriginBuf(c) 696 ctx.addToken(token.SequenceEnd(string(ctx.obuf), s.pos())) 697 s.startedFlowSequenceNum-- 698 s.progressColumn(ctx, 1) 699 return 700 } 701 case ',': 702 if s.startedFlowSequenceNum > 0 || s.startedFlowMapNum > 0 { 703 s.addBufferedTokenIfExists(ctx) 704 ctx.addOriginBuf(c) 705 ctx.addToken(token.CollectEntry(string(ctx.obuf), s.pos())) 706 s.progressColumn(ctx, 1) 707 return 708 } 709 case ':': 710 nc := ctx.nextChar() 711 if s.startedFlowMapNum > 0 || nc == ' ' || s.isNewLineChar(nc) || ctx.isNextEOS() { 712 // mapping value 713 tk := s.bufferedToken(ctx) 714 if tk != nil { 715 s.prevIndentColumn = tk.Position.Column 716 ctx.addToken(tk) 717 } 718 ctx.addToken(token.MappingValue(s.pos())) 719 s.progressColumn(ctx, 1) 720 return 721 } 722 case '|', '>': 723 if !ctx.existsBuffer() { 724 progress, err := s.scanLiteralHeader(ctx) 725 if err != nil { 726 // TODO: returns syntax error object 727 return 728 } 729 s.progressColumn(ctx, progress) 730 s.progressLine(ctx) 731 continue 732 } 733 case '!': 734 if !ctx.existsBuffer() { 735 token, progress := s.scanTag(ctx) 736 ctx.addToken(token) 737 s.progressColumn(ctx, progress) 738 if c := ctx.previousChar(); s.isNewLineChar(c) { 739 s.progressLine(ctx) 740 } 741 pos += progress 742 return 743 } 744 case '%': 745 if !ctx.existsBuffer() && s.indentNum == 0 { 746 ctx.addToken(token.Directive(string(ctx.obuf)+"%", s.pos())) 747 s.progressColumn(ctx, 1) 748 return 749 } 750 case '?': 751 nc := ctx.nextChar() 752 if !ctx.existsBuffer() && nc == ' ' { 753 ctx.addToken(token.MappingKey(s.pos())) 754 s.progressColumn(ctx, 1) 755 return 756 } 757 case '&': 758 if !ctx.existsBuffer() { 759 s.addBufferedTokenIfExists(ctx) 760 ctx.addOriginBuf(c) 761 ctx.addToken(token.Anchor(string(ctx.obuf), s.pos())) 762 s.progressColumn(ctx, 1) 763 s.isAnchor = true 764 return 765 } 766 case '*': 767 if !ctx.existsBuffer() { 768 s.addBufferedTokenIfExists(ctx) 769 ctx.addOriginBuf(c) 770 ctx.addToken(token.Alias(string(ctx.obuf), s.pos())) 771 s.progressColumn(ctx, 1) 772 return 773 } 774 case '#': 775 if !ctx.existsBuffer() || ctx.previousChar() == ' ' { 776 s.addBufferedTokenIfExists(ctx) 777 token, progress := s.scanComment(ctx) 778 ctx.addToken(token) 779 s.progressColumn(ctx, progress) 780 s.progressLine(ctx) 781 pos += progress 782 return 783 } 784 case '\'', '"': 785 if !ctx.existsBuffer() { 786 token, progress := s.scanQuote(ctx, c) 787 ctx.addToken(token) 788 pos += progress 789 return 790 } 791 case '\r', '\n': 792 // There is no problem that we ignore CR which followed by LF and normalize it to LF, because of following YAML1.2 spec. 793 // > Line breaks inside scalar content must be normalized by the YAML processor. Each such line break must be parsed into a single line feed character. 794 // > Outside scalar content, YAML allows any line break to be used to terminate lines. 795 // > -- https://yaml.org/spec/1.2/spec.html 796 if c == '\r' && ctx.nextChar() == '\n' { 797 ctx.addOriginBuf('\r') 798 ctx.progress(1) 799 c = '\n' 800 } 801 s.scanNewLine(ctx, c) 802 continue 803 case ' ': 804 if ctx.isSaveIndentMode() || (!s.isAnchor && !s.isFirstCharAtLine) { 805 ctx.addBuf(c) 806 ctx.addOriginBuf(c) 807 s.progressColumn(ctx, 1) 808 continue 809 } 810 if s.isFirstCharAtLine { 811 s.progressColumn(ctx, 1) 812 ctx.addOriginBuf(c) 813 continue 814 } 815 s.addBufferedTokenIfExists(ctx) 816 pos-- // to rescan white space at next scanning for adding white space to next buffer. 817 s.isAnchor = false 818 return 819 } 820 ctx.addBuf(c) 821 ctx.addOriginBuf(c) 822 s.progressColumn(ctx, 1) 823 } 824 s.addBufferedTokenIfExists(ctx) 825 return 826 } 827 828 // Init prepares the scanner s to tokenize the text src by setting the scanner at the beginning of src. 829 func (s *Scanner) Init(text string) { 830 src := []rune(text) 831 s.source = src 832 s.sourcePos = 0 833 s.sourceSize = len(src) 834 s.line = 1 835 s.column = 1 836 s.offset = 1 837 s.prevIndentLevel = 0 838 s.prevIndentNum = 0 839 s.prevIndentColumn = 0 840 s.indentLevel = 0 841 s.indentNum = 0 842 s.isFirstCharAtLine = true 843 } 844 845 // Scan scans the next token and returns the token collection. The source end is indicated by io.EOF. 846 func (s *Scanner) Scan() (token.Tokens, error) { 847 if s.sourcePos >= s.sourceSize { 848 return nil, io.EOF 849 } 850 ctx := newContext(s.source[s.sourcePos:]) 851 defer ctx.release() 852 progress := s.scan(ctx) 853 s.sourcePos += progress 854 var tokens token.Tokens 855 tokens = append(tokens, ctx.tokens...) 856 return tokens, nil 857 }