golang.org/x/net@v0.25.1-0.20240516223405-c87a5b62e243/html/token_test.go (about) 1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package html 6 7 import ( 8 "bytes" 9 "io" 10 "io/ioutil" 11 "reflect" 12 "runtime" 13 "strings" 14 "testing" 15 ) 16 17 // https://github.com/golang/go/issues/58246 18 const issue58246 = `<!--[if gte mso 12]> 19 <xml> 20 <o:OfficeDocumentSettings> 21 <o:AllowPNG/> 22 <o:PixelsPerInch>96</o:PixelsPerInch> 23 </o:OfficeDocumentSettings> 24 </xml> 25 <![endif]-->` 26 27 type tokenTest struct { 28 // A short description of the test case. 29 desc string 30 // The HTML to parse. 31 html string 32 // The string representations of the expected tokens, joined by '$'. 33 golden string 34 } 35 36 var tokenTests = []tokenTest{ 37 { 38 "empty", 39 "", 40 "", 41 }, 42 // A single text node. The tokenizer should not break text nodes on whitespace, 43 // nor should it normalize whitespace within a text node. 44 { 45 "text", 46 "foo bar", 47 "foo bar", 48 }, 49 // An entity. 50 { 51 "entity", 52 "one < two", 53 "one < two", 54 }, 55 // A start, self-closing and end tag. The tokenizer does not care if the start 56 // and end tokens don't match; that is the job of the parser. 57 { 58 "tags", 59 "<a>b<c/>d</e>", 60 "<a>$b$<c/>$d$</e>", 61 }, 62 // Angle brackets that aren't a tag. 63 { 64 "not a tag #0", 65 "<", 66 "<", 67 }, 68 { 69 "not a tag #1", 70 "</", 71 "</", 72 }, 73 { 74 "not a tag #2", 75 "</>", 76 "<!---->", 77 }, 78 { 79 "not a tag #3", 80 "a</>b", 81 "a$<!---->$b", 82 }, 83 { 84 "not a tag #4", 85 "</ >", 86 "<!-- -->", 87 }, 88 { 89 "not a tag #5", 90 "</.", 91 "<!--.-->", 92 }, 93 { 94 "not a tag #6", 95 "</.>", 96 "<!--.-->", 97 }, 98 { 99 "not a tag #7", 100 "a < b", 101 "a < b", 102 }, 103 { 104 "not a tag #8", 105 "<.>", 106 "<.>", 107 }, 108 { 109 "not a tag #9", 110 "a<<<b>>>c", 111 "a<<$<b>$>>c", 112 }, 113 { 114 "not a tag #10", 115 "if x<0 and y < 0 then x*y>0", 116 "if x<0 and y < 0 then x*y>0", 117 }, 118 { 119 "not a tag #11", 120 "<<p>", 121 "<$<p>", 122 }, 123 // EOF in a tag name. 124 { 125 "tag name eof #0", 126 "<a", 127 "", 128 }, 129 { 130 "tag name eof #1", 131 "<a ", 132 "", 133 }, 134 { 135 "tag name eof #2", 136 "a<b", 137 "a", 138 }, 139 { 140 "tag name eof #3", 141 "<a><b", 142 "<a>", 143 }, 144 { 145 "tag name eof #4", 146 `<a x`, 147 ``, 148 }, 149 // Some malformed tags that are missing a '>'. 150 { 151 "malformed tag #0", 152 `<p</p>`, 153 `<p< p="">`, 154 }, 155 { 156 "malformed tag #1", 157 `<p </p>`, 158 `<p <="" p="">`, 159 }, 160 { 161 "malformed tag #2", 162 `<p id`, 163 ``, 164 }, 165 { 166 "malformed tag #3", 167 `<p id=`, 168 ``, 169 }, 170 { 171 "malformed tag #4", 172 `<p id=>`, 173 `<p id="">`, 174 }, 175 { 176 "malformed tag #5", 177 `<p id=0`, 178 ``, 179 }, 180 { 181 "malformed tag #6", 182 `<p id=0</p>`, 183 `<p id="0</p">`, 184 }, 185 { 186 "malformed tag #7", 187 `<p id="0</p>`, 188 ``, 189 }, 190 { 191 "malformed tag #8", 192 `<p id="0"</p>`, 193 `<p id="0" <="" p="">`, 194 }, 195 { 196 "malformed tag #9", 197 `<p></p id`, 198 `<p>`, 199 }, 200 // Raw text and RCDATA. 201 { 202 "basic raw text", 203 "<script><a></b></script>", 204 "<script>$<a></b>$</script>", 205 }, 206 { 207 "unfinished script end tag", 208 "<SCRIPT>a</SCR", 209 "<script>$a</SCR", 210 }, 211 { 212 "broken script end tag", 213 "<SCRIPT>a</SCR ipt>", 214 "<script>$a</SCR ipt>", 215 }, 216 { 217 "EOF in script end tag", 218 "<SCRIPT>a</SCRipt", 219 "<script>$a</SCRipt", 220 }, 221 { 222 "scriptx end tag", 223 "<SCRIPT>a</SCRiptx", 224 "<script>$a</SCRiptx", 225 }, 226 { 227 "' ' completes script end tag", 228 "<SCRIPT>a</SCRipt ", 229 "<script>$a", 230 }, 231 { 232 "'>' completes script end tag", 233 "<SCRIPT>a</SCRipt>", 234 "<script>$a$</script>", 235 }, 236 { 237 "self-closing script end tag", 238 "<SCRIPT>a</SCRipt/>", 239 "<script>$a$</script>", 240 }, 241 { 242 "nested script tag", 243 "<SCRIPT>a</SCRipt<script>", 244 "<script>$a</SCRipt<script>", 245 }, 246 { 247 "script end tag after unfinished", 248 "<SCRIPT>a</SCRipt</script>", 249 "<script>$a</SCRipt$</script>", 250 }, 251 { 252 "script/style mismatched tags", 253 "<script>a</style>", 254 "<script>$a</style>", 255 }, 256 { 257 "style element with entity", 258 "<style>'", 259 "<style>$&apos;", 260 }, 261 { 262 "textarea with tag", 263 "<textarea><div></textarea>", 264 "<textarea>$<div>$</textarea>", 265 }, 266 { 267 "title with tag and entity", 268 "<title><b>K&R C</b></title>", 269 "<title>$<b>K&R C</b>$</title>", 270 }, 271 { 272 "title with trailing '<' entity", 273 "<title>foobar<</title>", 274 "<title>$foobar<$</title>", 275 }, 276 // DOCTYPE tests. 277 { 278 "Proper DOCTYPE", 279 "<!DOCTYPE html>", 280 "<!DOCTYPE html>", 281 }, 282 { 283 "DOCTYPE with no space", 284 "<!doctypehtml>", 285 "<!DOCTYPE html>", 286 }, 287 { 288 "DOCTYPE with two spaces", 289 "<!doctype html>", 290 "<!DOCTYPE html>", 291 }, 292 { 293 "looks like DOCTYPE but isn't", 294 "<!DOCUMENT html>", 295 "<!--DOCUMENT html-->", 296 }, 297 { 298 "DOCTYPE at EOF", 299 "<!DOCtype", 300 "<!DOCTYPE >", 301 }, 302 // XML processing instructions. 303 { 304 "XML processing instruction", 305 "<?xml?>", 306 "<!--?xml?-->", 307 }, 308 // Comments. See also func TestComments. 309 { 310 "comment0", 311 "abc<b><!-- skipme --></b>def", 312 "abc$<b>$<!-- skipme -->$</b>$def", 313 }, 314 { 315 "comment1", 316 "a<!-->z", 317 "a$<!---->$z", 318 }, 319 { 320 "comment2", 321 "a<!--->z", 322 "a$<!---->$z", 323 }, 324 { 325 "comment3", 326 "a<!--x>-->z", 327 "a$<!--x>-->$z", 328 }, 329 { 330 "comment4", 331 "a<!--x->-->z", 332 "a$<!--x->-->$z", 333 }, 334 { 335 "comment5", 336 "a<!>z", 337 "a$<!---->$z", 338 }, 339 { 340 "comment6", 341 "a<!->z", 342 "a$<!----->$z", 343 }, 344 { 345 "comment7", 346 "a<!---<>z", 347 "a$<!---<>z-->", 348 }, 349 { 350 "comment8", 351 "a<!--z", 352 "a$<!--z-->", 353 }, 354 { 355 "comment9", 356 "a<!--z-", 357 "a$<!--z-->", 358 }, 359 { 360 "comment10", 361 "a<!--z--", 362 "a$<!--z-->", 363 }, 364 { 365 "comment11", 366 "a<!--z---", 367 "a$<!--z--->", 368 }, 369 { 370 "comment12", 371 "a<!--z----", 372 "a$<!--z---->", 373 }, 374 { 375 "comment13", 376 "a<!--x--!>z", 377 "a$<!--x-->$z", 378 }, 379 { 380 "comment14", 381 "a<!--!-->z", 382 "a$<!--!-->$z", 383 }, 384 { 385 "comment15", 386 "a<!-- !-->z", 387 "a$<!-- !-->$z", 388 }, 389 { 390 "comment16", 391 "a<!--i\x00j-->z", 392 "a$<!--i\uFFFDj-->$z", 393 }, 394 { 395 "comment17", 396 "a<!--\x00", 397 "a$<!--\uFFFD-->", 398 }, 399 { 400 "comment18", 401 "a<!--<!-->z", 402 "a$<!--<!-->$z", 403 }, 404 { 405 "comment19", 406 "a<!--<!--", 407 "a$<!--<!-->", 408 }, 409 { 410 "comment20", 411 "a<!--ij--kl-->z", 412 "a$<!--ij--kl-->$z", 413 }, 414 { 415 "comment21", 416 "a<!--ij--kl--!>z", 417 "a$<!--ij--kl-->$z", 418 }, 419 { 420 "comment22", 421 "a<!--!--!<--!-->z", 422 "a$<!--!--!<--!-->$z", 423 }, 424 { 425 "comment23", 426 "a<!-->-->z", 427 "a$<!-->-->$z", 428 }, 429 { 430 "comment24", 431 "a<!-->>x", 432 "a$<!-->>x-->", 433 }, 434 { 435 "comment25", 436 "a<!-->>", 437 "a$<!-->>-->", 438 }, 439 { 440 "comment26", 441 "a<!-->>-", 442 "a$<!-->>-->", 443 }, 444 { 445 "comment27", 446 "a<!-->>-->z", 447 "a$<!-->>-->$z", 448 }, 449 { 450 "comment28", 451 "a<!--&>-->z", 452 "a$<!--&>-->$z", 453 }, 454 { 455 "comment29", 456 "a<!--&gt;-->z", 457 "a$<!--&gt;-->$z", 458 }, 459 { 460 "comment30", 461 "a<!--&nosuchentity;-->z", 462 "a$<!--&nosuchentity;-->$z", 463 }, 464 { 465 "comment31", 466 "a<!--i>>j-->z", 467 "a$<!--i>>j-->$z", 468 }, 469 { 470 "comment32", 471 "a<!--i!>>j-->z", 472 "a$<!--i!>>j-->$z", 473 }, 474 // https://stackoverflow.design/email/base/mso/#targeting-specific-outlook-versions 475 // says "[For] Windows Outlook 2003 and above... conditional comments allow 476 // us to add bits of HTML that are only read by the Word-based versions of 477 // Outlook". These comments (with angle brackets) should pass through 478 // unchanged (by this Go package) when rendering. 479 // 480 // We should also still escape ">" as ">" when necessary. 481 // https://github.com/golang/go/issues/48237 482 // 483 // The "your code" example below comes from that stackoverflow.design link 484 // above but note that it can contain angle-bracket-rich XML. 485 // https://github.com/golang/go/issues/58246 486 { 487 "issue48237CommentWithAmpgtsemi1", 488 "a<!--<p></p><!--[video]-->-->z", 489 "a$<!--<p></p><!--[video]-->-->$z", 490 }, 491 { 492 "issue48237CommentWithAmpgtsemi2", 493 "a<!--<p></p><!--[video]--!>-->z", 494 "a$<!--<p></p><!--[video]--!>-->$z", 495 }, 496 { 497 "issue58246MicrosoftOutlookComment1", 498 "a<!--[if mso]> your code <![endif]-->z", 499 "a$<!--[if mso]> your code <![endif]-->$z", 500 }, 501 { 502 "issue58246MicrosoftOutlookComment2", 503 "a" + issue58246 + "z", 504 "a$" + issue58246 + "$z", 505 }, 506 // An attribute with a backslash. 507 { 508 "backslash", 509 `<p id="a\"b">`, 510 `<p id="a\" b"="">`, 511 }, 512 // Entities, tag name and attribute key lower-casing, and whitespace 513 // normalization within a tag. 514 { 515 "tricky", 516 "<p \t\n iD=\"a"B\" foo=\"bar\"><EM>te<&;xt</em></p>", 517 `<p id="a"B" foo="bar">$<em>$te<&;xt$</em>$</p>`, 518 }, 519 // A nonexistent entity. Tokenizing and converting back to a string should 520 // escape the "&" to become "&". 521 { 522 "noSuchEntity", 523 `<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`, 524 `<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`, 525 }, 526 { 527 "entity without semicolon", 528 `¬it;∉<a b="q=z&=5¬ice=hello¬=world">`, 529 `¬it;∉$<a b="q=z&amp=5&notice=hello¬=world">`, 530 }, 531 { 532 "entity with digits", 533 "½", 534 "½", 535 }, 536 // Attribute tests: 537 // http://dev.w3.org/html5/pf-summary/Overview.html#attributes 538 { 539 "Empty attribute", 540 `<input disabled FOO>`, 541 `<input disabled="" foo="">`, 542 }, 543 { 544 "Empty attribute, whitespace", 545 `<input disabled FOO >`, 546 `<input disabled="" foo="">`, 547 }, 548 { 549 "Unquoted attribute value", 550 `<input value=yes FOO=BAR>`, 551 `<input value="yes" foo="BAR">`, 552 }, 553 { 554 "Unquoted attribute value, spaces", 555 `<input value = yes FOO = BAR>`, 556 `<input value="yes" foo="BAR">`, 557 }, 558 { 559 "Unquoted attribute value, trailing space", 560 `<input value=yes FOO=BAR >`, 561 `<input value="yes" foo="BAR">`, 562 }, 563 { 564 "Single-quoted attribute value", 565 `<input value='yes' FOO='BAR'>`, 566 `<input value="yes" foo="BAR">`, 567 }, 568 { 569 "Single-quoted attribute value, trailing space", 570 `<input value='yes' FOO='BAR' >`, 571 `<input value="yes" foo="BAR">`, 572 }, 573 { 574 "Double-quoted attribute value", 575 `<input value="I'm an attribute" FOO="BAR">`, 576 `<input value="I'm an attribute" foo="BAR">`, 577 }, 578 { 579 "Attribute name characters", 580 `<meta http-equiv="content-type">`, 581 `<meta http-equiv="content-type">`, 582 }, 583 { 584 "Mixed attributes", 585 `a<P V="0 1" w='2' X=3 y>z`, 586 `a$<p v="0 1" w="2" x="3" y="">$z`, 587 }, 588 { 589 "Attributes with a solitary single quote", 590 `<p id=can't><p id=won't>`, 591 `<p id="can't">$<p id="won't">`, 592 }, 593 // WHATWG 13.2.5.32 equals sign before attribute name state 594 { 595 "equals sign before attribute name", 596 `<p =>`, 597 `<p =="">`, 598 }, 599 { 600 "equals sign before attribute name, extra cruft", 601 `<p =asd>`, 602 `<p =asd="">`, 603 }, 604 { 605 "forward slash before attribute name", 606 `<p/=">`, 607 `<p ="="">`, 608 }, 609 { 610 "forward slash before attribute name with spaces around", 611 `<p / =">`, 612 `<p ="="">`, 613 }, 614 { 615 "forward slash after attribute name followed by a character", 616 `<p a/ ="">`, 617 `<p a="" =""="">`, 618 }, 619 } 620 621 func TestTokenizer(t *testing.T) { 622 for _, tt := range tokenTests { 623 t.Run(tt.desc, func(t *testing.T) { 624 z := NewTokenizer(strings.NewReader(tt.html)) 625 if tt.golden != "" { 626 for i, s := range strings.Split(tt.golden, "$") { 627 if z.Next() == ErrorToken { 628 t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err()) 629 return 630 } 631 actual := z.Token().String() 632 if s != actual { 633 t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual) 634 return 635 } 636 } 637 } 638 z.Next() 639 if z.Err() != io.EOF { 640 t.Errorf("%s: want EOF got %q", tt.desc, z.Err()) 641 } 642 }) 643 } 644 } 645 646 func TestMaxBuffer(t *testing.T) { 647 // Exceeding the maximum buffer size generates ErrBufferExceeded. 648 z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10))) 649 z.SetMaxBuf(5) 650 tt := z.Next() 651 if got, want := tt, ErrorToken; got != want { 652 t.Fatalf("token type: got: %v want: %v", got, want) 653 } 654 if got, want := z.Err(), ErrBufferExceeded; got != want { 655 t.Errorf("error type: got: %v want: %v", got, want) 656 } 657 if got, want := string(z.Raw()), "<tttt"; got != want { 658 t.Fatalf("buffered before overflow: got: %q want: %q", got, want) 659 } 660 } 661 662 func TestMaxBufferReconstruction(t *testing.T) { 663 // Exceeding the maximum buffer size at any point while tokenizing permits 664 // reconstructing the original input. 665 tests: 666 for _, test := range tokenTests { 667 for maxBuf := 1; ; maxBuf++ { 668 r := strings.NewReader(test.html) 669 z := NewTokenizer(r) 670 z.SetMaxBuf(maxBuf) 671 var tokenized bytes.Buffer 672 for { 673 tt := z.Next() 674 tokenized.Write(z.Raw()) 675 if tt == ErrorToken { 676 if err := z.Err(); err != io.EOF && err != ErrBufferExceeded { 677 t.Errorf("%s: unexpected error: %v", test.desc, err) 678 } 679 break 680 } 681 } 682 // Anything tokenized along with untokenized input or data left in the reader. 683 assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r)) 684 if err != nil { 685 t.Errorf("%s: ReadAll: %v", test.desc, err) 686 continue tests 687 } 688 if got, want := string(assembled), test.html; got != want { 689 t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want) 690 continue tests 691 } 692 // EOF indicates that we completed tokenization and hence found the max 693 // maxBuf that generates ErrBufferExceeded, so continue to the next test. 694 if z.Err() == io.EOF { 695 break 696 } 697 } // buffer sizes 698 } // tests 699 } 700 701 func TestPassthrough(t *testing.T) { 702 // Accumulating the raw output for each parse event should reconstruct the 703 // original input. 704 for _, test := range tokenTests { 705 z := NewTokenizer(strings.NewReader(test.html)) 706 var parsed bytes.Buffer 707 for { 708 tt := z.Next() 709 parsed.Write(z.Raw()) 710 if tt == ErrorToken { 711 break 712 } 713 } 714 if got, want := parsed.String(), test.html; got != want { 715 t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want) 716 } 717 } 718 } 719 720 func TestBufAPI(t *testing.T) { 721 s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9" 722 z := NewTokenizer(bytes.NewBufferString(s)) 723 var result bytes.Buffer 724 depth := 0 725 loop: 726 for { 727 tt := z.Next() 728 switch tt { 729 case ErrorToken: 730 if z.Err() != io.EOF { 731 t.Error(z.Err()) 732 } 733 break loop 734 case TextToken: 735 if depth > 0 { 736 result.Write(z.Text()) 737 } 738 case StartTagToken, EndTagToken: 739 tn, _ := z.TagName() 740 if len(tn) == 1 && tn[0] == 'a' { 741 if tt == StartTagToken { 742 depth++ 743 } else { 744 depth-- 745 } 746 } 747 } 748 } 749 u := "14567" 750 v := string(result.Bytes()) 751 if u != v { 752 t.Errorf("TestBufAPI: want %q got %q", u, v) 753 } 754 } 755 756 func TestConvertNewlines(t *testing.T) { 757 testCases := map[string]string{ 758 "Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n", 759 "Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n", 760 "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n", 761 "": "", 762 "\n": "\n", 763 "\n\r": "\n\n", 764 "\r": "\n", 765 "\r\n": "\n", 766 "\r\n\n": "\n\n", 767 "\r\n\r": "\n\n", 768 "\r\n\r\n": "\n\n", 769 "\r\r": "\n\n", 770 "\r\r\n": "\n\n", 771 "\r\r\n\n": "\n\n\n", 772 "\r\r\r\n": "\n\n\n", 773 "\r \n": "\n \n", 774 "xyz": "xyz", 775 } 776 for in, want := range testCases { 777 if got := string(convertNewlines([]byte(in))); got != want { 778 t.Errorf("input %q: got %q, want %q", in, got, want) 779 } 780 } 781 } 782 783 func TestReaderEdgeCases(t *testing.T) { 784 const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>" 785 testCases := []io.Reader{ 786 &zeroOneByteReader{s: s}, 787 &eofStringsReader{s: s}, 788 &stuckReader{}, 789 } 790 for i, tc := range testCases { 791 got := []TokenType{} 792 z := NewTokenizer(tc) 793 for { 794 tt := z.Next() 795 if tt == ErrorToken { 796 break 797 } 798 got = append(got, tt) 799 } 800 if err := z.Err(); err != nil && err != io.EOF { 801 if err != io.ErrNoProgress { 802 t.Errorf("i=%d: %v", i, err) 803 } 804 continue 805 } 806 want := []TokenType{ 807 StartTagToken, 808 TextToken, 809 EndTagToken, 810 } 811 if !reflect.DeepEqual(got, want) { 812 t.Errorf("i=%d: got %v, want %v", i, got, want) 813 continue 814 } 815 } 816 } 817 818 // zeroOneByteReader is like a strings.Reader that alternates between 819 // returning 0 bytes and 1 byte at a time. 820 type zeroOneByteReader struct { 821 s string 822 n int 823 } 824 825 func (r *zeroOneByteReader) Read(p []byte) (int, error) { 826 if len(p) == 0 { 827 return 0, nil 828 } 829 if len(r.s) == 0 { 830 return 0, io.EOF 831 } 832 r.n++ 833 if r.n%2 != 0 { 834 return 0, nil 835 } 836 p[0], r.s = r.s[0], r.s[1:] 837 return 1, nil 838 } 839 840 // eofStringsReader is like a strings.Reader but can return an (n, err) where 841 // n > 0 && err != nil. 842 type eofStringsReader struct { 843 s string 844 } 845 846 func (r *eofStringsReader) Read(p []byte) (int, error) { 847 n := copy(p, r.s) 848 r.s = r.s[n:] 849 if r.s != "" { 850 return n, nil 851 } 852 return n, io.EOF 853 } 854 855 // stuckReader is an io.Reader that always returns no data and no error. 856 type stuckReader struct{} 857 858 func (*stuckReader) Read(p []byte) (int, error) { 859 return 0, nil 860 } 861 862 const ( 863 rawLevel = iota 864 lowLevel 865 highLevel 866 ) 867 868 func benchmarkTokenizer(b *testing.B, level int) { 869 buf, err := ioutil.ReadFile("testdata/go1.html") 870 if err != nil { 871 b.Fatalf("could not read testdata/go1.html: %v", err) 872 } 873 b.SetBytes(int64(len(buf))) 874 runtime.GC() 875 b.ReportAllocs() 876 b.ResetTimer() 877 for i := 0; i < b.N; i++ { 878 z := NewTokenizer(bytes.NewBuffer(buf)) 879 for { 880 tt := z.Next() 881 if tt == ErrorToken { 882 if err := z.Err(); err != nil && err != io.EOF { 883 b.Fatalf("tokenizer error: %v", err) 884 } 885 break 886 } 887 switch level { 888 case rawLevel: 889 // Calling z.Raw just returns the raw bytes of the token. It does 890 // not unescape < to <, or lower-case tag names and attribute keys. 891 z.Raw() 892 case lowLevel: 893 // Caling z.Text, z.TagName and z.TagAttr returns []byte values 894 // whose contents may change on the next call to z.Next. 895 switch tt { 896 case TextToken, CommentToken, DoctypeToken: 897 z.Text() 898 case StartTagToken, SelfClosingTagToken: 899 _, more := z.TagName() 900 for more { 901 _, _, more = z.TagAttr() 902 } 903 case EndTagToken: 904 z.TagName() 905 } 906 case highLevel: 907 // Calling z.Token converts []byte values to strings whose validity 908 // extend beyond the next call to z.Next. 909 z.Token() 910 } 911 } 912 } 913 } 914 915 func BenchmarkRawLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, rawLevel) } 916 func BenchmarkLowLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, lowLevel) } 917 func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }