github.com/hxx258456/ccgo@v0.0.5-0.20230213014102-48b35f46f66f/net/html/token_test.go

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"io"
	"io/ioutil"
	"reflect"
	"runtime"
	"strings"
	"testing"
)

type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}

var tokenTests = []tokenTest{
	{
		"empty",
		"",
		"",
	},
	// A single text node. The tokenizer should not break text nodes on whitespace,
	// nor should it normalize whitespace within a text node.
	{
		"text",
		"foo  bar",
		"foo  bar",
	},
	// An entity.
	{
		"entity",
		"one &lt; two",
		"one &lt; two",
	},
	// A start, self-closing and end tag. The tokenizer does not care if the start
	// and end tokens don't match; that is the job of the parser.
	{
		"tags",
		"<a>b<c/>d</e>",
		"<a>$b$<c/>$d$</e>",
	},
	// Angle brackets that aren't a tag.
	{
		"not a tag #0",
		"<",
		"&lt;",
	},
	{
		"not a tag #1",
		"</",
		"&lt;/",
	},
	{
		"not a tag #2",
		"</>",
		"<!---->",
	},
	{
		"not a tag #3",
		"a</>b",
		"a$<!---->$b",
	},
	{
		"not a tag #4",
		"</ >",
		"<!-- -->",
	},
	{
		"not a tag #5",
		"</.",
		"<!--.-->",
	},
	{
		"not a tag #6",
		"</.>",
		"<!--.-->",
	},
	{
		"not a tag #7",
		"a < b",
		"a &lt; b",
	},
	{
		"not a tag #8",
		"<.>",
		"&lt;.&gt;",
	},
	{
		"not a tag #9",
		"a<<<b>>>c",
		"a&lt;&lt;$<b>$&gt;&gt;c",
	},
	{
		"not a tag #10",
		"if x<0 and y < 0 then x*y>0",
		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
	},
	{
		"not a tag #11",
		"<<p>",
		"&lt;$<p>",
	},
	// EOF in a tag name.
	{
		"tag name eof #0",
		"<a",
		"",
	},
	{
		"tag name eof #1",
		"<a ",
		"",
	},
	{
		"tag name eof #2",
		"a<b",
		"a",
	},
	{
		"tag name eof #3",
		"<a><b",
		"<a>",
	},
	{
		"tag name eof #4",
		`<a x`,
		``,
	},
	// Some malformed tags that are missing a '>'.
	{
		"malformed tag #0",
		`<p</p>`,
		`<p< p="">`,
	},
	{
		"malformed tag #1",
		`<p </p>`,
		`<p <="" p="">`,
	},
	{
		"malformed tag #2",
		`<p id`,
		``,
	},
	{
		"malformed tag #3",
		`<p id=`,
		``,
	},
	{
		"malformed tag #4",
		`<p id=>`,
		`<p id="">`,
	},
	{
		"malformed tag #5",
		`<p id=0`,
		``,
	},
	{
		"malformed tag #6",
		`<p id=0</p>`,
		`<p id="0&lt;/p">`,
	},
	{
		"malformed tag #7",
		`<p id="0</p>`,
		``,
	},
	{
		"malformed tag #8",
		`<p id="0"</p>`,
		`<p id="0" <="" p="">`,
	},
	{
		"malformed tag #9",
		`<p></p id`,
		`<p>`,
	},
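	// Raw text elements such as <script> and <style> treat their contents
	// as plain text: no entities are decoded and no tags are recognized.
	// RCDATA elements such as <title> and <textarea> decode entities but
	// likewise recognize no tags. In both cases Token.String re-escapes
	// the text on output, which is why several golden strings below
	// contain &lt; and &amp; even though the tokens themselves hold the
	// decoded characters.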
	// Raw text and RCDATA.
	{
		"basic raw text",
		"<script><a></b></script>",
		"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
	},
	{
		"unfinished script end tag",
		"<SCRIPT>a</SCR",
		"<script>$a&lt;/SCR",
	},
	{
		"broken script end tag",
		"<SCRIPT>a</SCR ipt>",
		"<script>$a&lt;/SCR ipt&gt;",
	},
	{
		"EOF in script end tag",
		"<SCRIPT>a</SCRipt",
		"<script>$a&lt;/SCRipt",
	},
	{
		"scriptx end tag",
		"<SCRIPT>a</SCRiptx",
		"<script>$a&lt;/SCRiptx",
	},
	{
		"' ' completes script end tag",
		"<SCRIPT>a</SCRipt ",
		"<script>$a",
	},
	{
		"'>' completes script end tag",
		"<SCRIPT>a</SCRipt>",
		"<script>$a$</script>",
	},
	{
		"self-closing script end tag",
		"<SCRIPT>a</SCRipt/>",
		"<script>$a$</script>",
	},
	{
		"nested script tag",
		"<SCRIPT>a</SCRipt<script>",
		"<script>$a&lt;/SCRipt&lt;script&gt;",
	},
	{
		"script end tag after unfinished",
		"<SCRIPT>a</SCRipt</script>",
		"<script>$a&lt;/SCRipt$</script>",
	},
	{
		"script/style mismatched tags",
		"<script>a</style>",
		"<script>$a&lt;/style&gt;",
	},
	{
		"style element with entity",
		"<style>&apos;",
		"<style>$&amp;apos;",
	},
	{
		"textarea with tag",
		"<textarea><div></textarea>",
		"<textarea>$&lt;div&gt;$</textarea>",
	},
	{
		"title with tag and entity",
		"<title><b>K&amp;R C</b></title>",
		"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
	},
	{
		"title with trailing '&lt;' entity",
		"<title>foobar&lt;</title>",
		"<title>$foobar&lt;$</title>",
	},
	// DOCTYPE tests.
	{
		"Proper DOCTYPE",
		"<!DOCTYPE html>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with no space",
		"<!doctypehtml>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with two spaces",
		"<!doctype  html>",
		"<!DOCTYPE html>",
	},
	{
		"looks like DOCTYPE but isn't",
		"<!DOCUMENT html>",
		"<!--DOCUMENT html-->",
	},
	{
		"DOCTYPE at EOF",
		"<!DOCtype",
		"<!DOCTYPE >",
	},
	// XML processing instructions.
	{
		"XML processing instruction",
		"<?xml?>",
		"<!--?xml?-->",
	},
	// Comments.
	{
		"comment0",
		"abc<b><!-- skipme --></b>def",
		"abc$<b>$<!-- skipme -->$</b>$def",
	},
	{
		"comment1",
		"a<!-->z",
		"a$<!---->$z",
	},
	{
		"comment2",
		"a<!--->z",
		"a$<!---->$z",
	},
	{
		"comment3",
		"a<!--x>-->z",
		"a$<!--x>-->$z",
	},
	{
		"comment4",
		"a<!--x->-->z",
		"a$<!--x->-->$z",
	},
	{
		"comment5",
		"a<!>z",
		"a$<!---->$z",
	},
	{
		"comment6",
		"a<!->z",
		"a$<!----->$z",
	},
	{
		"comment7",
		"a<!---<>z",
		"a$<!---<>z-->",
	},
	{
		"comment8",
		"a<!--z",
		"a$<!--z-->",
	},
	{
		"comment9",
		"a<!--z-",
		"a$<!--z-->",
	},
	{
		"comment10",
		"a<!--z--",
		"a$<!--z-->",
	},
	{
		"comment11",
		"a<!--z---",
		"a$<!--z--->",
	},
	{
		"comment12",
		"a<!--z----",
		"a$<!--z---->",
	},
	{
		"comment13",
		"a<!--x--!>z",
		"a$<!--x-->$z",
	},
	// An attribute with a backslash.
	{
		"backslash",
		`<p id="a\"b">`,
		`<p id="a\" b"="">`,
	},
	// Entities, tag name and attribute key lower-casing, and whitespace
	// normalization within a tag.
	{
		"tricky",
		"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
		`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
	},
	// A nonexistent entity. Tokenizing and converting back to a string should
	// escape the "&" to become "&amp;".
	{
		"noSuchEntity",
		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
	},
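	// Named references may also appear without their terminating semicolon.
	// In ordinary text they are matched greedily against the longest known
	// entity name, so "&notit;" decodes as "&not" followed by "it;" and
	// "&notin;" as the single entity "&notin;". Attribute values are more
	// conservative: as the golden string below shows, "&amp=5" and
	// "&notice=hello" pass through undecoded (and are re-escaped on
	// output), while the trailing "&not=world" is decoded.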
	{
		"entity without semicolon",
		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not=world">`,
		`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
	},
	{
		"entity with digits",
		"&frac12;",
		"½",
	},
	// Attribute tests:
	// http://dev.w3.org/html5/pf-summary/Overview.html#attributes
	{
		"Empty attribute",
		`<input disabled FOO>`,
		`<input disabled="" foo="">`,
	},
	{
		"Empty attribute, whitespace",
		`<input disabled FOO >`,
		`<input disabled="" foo="">`,
	},
	{
		"Unquoted attribute value",
		`<input value=yes FOO=BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, spaces",
		`<input value = yes FOO = BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, trailing space",
		`<input value=yes FOO=BAR >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value",
		`<input value='yes' FOO='BAR'>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value, trailing space",
		`<input value='yes' FOO='BAR' >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Double-quoted attribute value",
		`<input value="I'm an attribute" FOO="BAR">`,
		`<input value="I&#39;m an attribute" foo="BAR">`,
	},
	{
		"Attribute name characters",
		`<meta http-equiv="content-type">`,
		`<meta http-equiv="content-type">`,
	},
	{
		"Mixed attributes",
		`a<P V="0 1" w='2' X=3 y>z`,
		`a$<p v="0 1" w="2" x="3" y="">$z`,
	},
	{
		"Attributes with a solitary single quote",
		`<p id=can't><p id=won't>`,
		`<p id="can&#39;t">$<p id="won&#39;t">`,
	},
}

func TestTokenizer(t *testing.T) {
loop:
	for _, tt := range tokenTests {
		z := NewTokenizer(strings.NewReader(tt.html))
		if tt.golden != "" {
			for i, s := range strings.Split(tt.golden, "$") {
				if z.Next() == ErrorToken {
					t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
					continue loop
				}
				actual := z.Token().String()
				if s != actual {
					t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
					continue loop
				}
			}
		}
		z.Next()
		if z.Err() != io.EOF {
			t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
		}
	}
}

func TestMaxBuffer(t *testing.T) {
	// Exceeding the maximum buffer size generates ErrBufferExceeded.
	z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
	z.SetMaxBuf(5)
	tt := z.Next()
	if got, want := tt, ErrorToken; got != want {
		t.Fatalf("token type: got: %v want: %v", got, want)
	}
	if got, want := z.Err(), ErrBufferExceeded; got != want {
		t.Errorf("error type: got: %v want: %v", got, want)
	}
	if got, want := string(z.Raw()), "<tttt"; got != want {
		t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
	}
}

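// Whatever Next consumes is always observable: the concatenation of every
// z.Raw() (including the final, errored call), z.Buffered(), and whatever
// the reader has not yet produced reconstructs the original input byte for
// byte. TestMaxBufferReconstruction verifies this for every test case at
// every buffer size until tokenization first succeeds.
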
func TestMaxBufferReconstruction(t *testing.T) {
	// Exceeding the maximum buffer size at any point while tokenizing permits
	// reconstructing the original input.
tests:
	for _, test := range tokenTests {
		for maxBuf := 1; ; maxBuf++ {
			r := strings.NewReader(test.html)
			z := NewTokenizer(r)
			z.SetMaxBuf(maxBuf)
			var tokenized bytes.Buffer
			for {
				tt := z.Next()
				tokenized.Write(z.Raw())
				if tt == ErrorToken {
					if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
						t.Errorf("%s: unexpected error: %v", test.desc, err)
					}
					break
				}
			}
			// Anything tokenized along with untokenized input or data left in the reader.
			assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
			if err != nil {
				t.Errorf("%s: ReadAll: %v", test.desc, err)
				continue tests
			}
			if got, want := string(assembled), test.html; got != want {
				t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
				continue tests
			}
			// EOF indicates that we completed tokenization and hence found the max
			// maxBuf that generates ErrBufferExceeded, so continue to the next test.
			if z.Err() == io.EOF {
				break
			}
		} // buffer sizes
	} // tests
}

func TestPassthrough(t *testing.T) {
	// Accumulating the raw output for each parse event should reconstruct the
	// original input.
	for _, test := range tokenTests {
		z := NewTokenizer(strings.NewReader(test.html))
		var parsed bytes.Buffer
		for {
			tt := z.Next()
			parsed.Write(z.Raw())
			if tt == ErrorToken {
				break
			}
		}
		if got, want := parsed.String(), test.html; got != want {
			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
		}
	}
}

func TestBufAPI(t *testing.T) {
	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
	z := NewTokenizer(bytes.NewBufferString(s))
	var result bytes.Buffer
	depth := 0
loop:
	for {
		tt := z.Next()
		switch tt {
		case ErrorToken:
			if z.Err() != io.EOF {
				t.Error(z.Err())
			}
			break loop
		case TextToken:
			if depth > 0 {
				result.Write(z.Text())
			}
		case StartTagToken, EndTagToken:
			tn, _ := z.TagName()
			if len(tn) == 1 && tn[0] == 'a' {
				if tt == StartTagToken {
					depth++
				} else {
					depth--
				}
			}
		}
	}
	u := "14567"
	v := string(result.Bytes())
	if u != v {
		t.Errorf("TestBufAPI: want %q got %q", u, v)
	}
}

func TestConvertNewlines(t *testing.T) {
	testCases := map[string]string{
		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
		"":                      "",
		"\n":                    "\n",
		"\n\r":                  "\n\n",
		"\r":                    "\n",
		"\r\n":                  "\n",
		"\r\n\n":                "\n\n",
		"\r\n\r":                "\n\n",
		"\r\n\r\n":              "\n\n",
		"\r\r":                  "\n\n",
		"\r\r\n":                "\n\n",
		"\r\r\n\n":              "\n\n\n",
		"\r\r\r\n":              "\n\n\n",
		"\r \n":                 "\n \n",
		"xyz":                   "xyz",
	}
	for in, want := range testCases {
		if got := string(convertNewlines([]byte(in))); got != want {
			t.Errorf("input %q: got %q, want %q", in, got, want)
		}
	}
}

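// The io.Reader contract allows an implementation to return (0, nil) and to
// return n > 0 together with io.EOF. The readers below exercise both quirks,
// plus a reader that never makes progress at all; the tokenizer is expected
// to tolerate the first two and to give up on the third with an error such
// as io.ErrNoProgress rather than spinning forever.
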
func TestReaderEdgeCases(t *testing.T) {
	const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
	testCases := []io.Reader{
		&zeroOneByteReader{s: s},
		&eofStringsReader{s: s},
		&stuckReader{},
	}
	for i, tc := range testCases {
		got := []TokenType{}
		z := NewTokenizer(tc)
		for {
			tt := z.Next()
			if tt == ErrorToken {
				break
			}
			got = append(got, tt)
		}
		if err := z.Err(); err != nil && err != io.EOF {
			if err != io.ErrNoProgress {
				t.Errorf("i=%d: %v", i, err)
			}
			continue
		}
		want := []TokenType{
			StartTagToken,
			TextToken,
			EndTagToken,
		}
		if !reflect.DeepEqual(got, want) {
			t.Errorf("i=%d: got %v, want %v", i, got, want)
			continue
		}
	}
}

// zeroOneByteReader is like a strings.Reader that alternates between
// returning 0 bytes and 1 byte at a time.
type zeroOneByteReader struct {
	s string
	n int
}

func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if len(r.s) == 0 {
		return 0, io.EOF
	}
	r.n++
	if r.n%2 != 0 {
		return 0, nil
	}
	p[0], r.s = r.s[0], r.s[1:]
	return 1, nil
}

// eofStringsReader is like a strings.Reader but can return an (n, err) where
// n > 0 && err != nil.
type eofStringsReader struct {
	s string
}

func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if r.s != "" {
		return n, nil
	}
	return n, io.EOF
}

// stuckReader is an io.Reader that always returns no data and no error.
type stuckReader struct{}

func (*stuckReader) Read(p []byte) (int, error) {
	return 0, nil
}

const (
	rawLevel = iota
	lowLevel
	highLevel
)

func benchmarkTokenizer(b *testing.B, level int) {
	buf, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(buf)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		z := NewTokenizer(bytes.NewBuffer(buf))
		for {
			tt := z.Next()
			if tt == ErrorToken {
				if err := z.Err(); err != nil && err != io.EOF {
					b.Fatalf("tokenizer error: %v", err)
				}
				break
			}
			switch level {
			case rawLevel:
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute
				// keys.
				z.Raw()
			case lowLevel:
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				case StartTagToken, SelfClosingTagToken:
					_, more := z.TagName()
					for more {
						_, _, more = z.TagAttr()
					}
				case EndTagToken:
					z.TagName()
				}
			case highLevel:
				// Calling z.Token converts []byte values to strings whose validity
				// extends beyond the next call to z.Next.
				z.Token()
			}
		}
	}
}

func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }
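
// collectHrefs is an illustrative sketch, not part of the upstream test
// suite: it applies the lowLevel pattern from benchmarkTokenizer to pull
// href values out of anchor start tags without allocating Tokens. The
// []byte slices returned by TagName and TagAttr may be overwritten by the
// next call to Next, so values are copied before being retained.
func collectHrefs(r io.Reader) []string {
	var hrefs []string
	z := NewTokenizer(r)
	for {
		switch z.Next() {
		case ErrorToken:
			// io.EOF on clean input; callers can inspect z.Err() otherwise.
			return hrefs
		case StartTagToken, SelfClosingTagToken:
			name, hasAttr := z.TagName()
			if len(name) != 1 || name[0] != 'a' {
				continue
			}
			for hasAttr {
				var key, val []byte
				key, val, hasAttr = z.TagAttr()
				if string(key) == "href" {
					hrefs = append(hrefs, string(val)) // copy out of the tokenizer's buffer
				}
			}
		}
	}
}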