github.com/Andyfoo/golang/x/net@v0.0.0-20190901054642-57c1bf301704/html/token_test.go

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"bytes"
	"io"
	"io/ioutil"
	"reflect"
	"runtime"
	"strings"
	"testing"
)

type tokenTest struct {
	// A short description of the test case.
	desc string
	// The HTML to parse.
	html string
	// The string representations of the expected tokens, joined by '$'.
	golden string
}

var tokenTests = []tokenTest{
	{
		"empty",
		"",
		"",
	},
	// A single text node. The tokenizer should not break text nodes on whitespace,
	// nor should it normalize whitespace within a text node.
	{
		"text",
		"foo  bar",
		"foo  bar",
	},
	// An entity.
	{
		"entity",
		"one &lt; two",
		"one &lt; two",
	},
	// A start, self-closing and end tag. The tokenizer does not care if the start
	// and end tokens don't match; that is the job of the parser.
	{
		"tags",
		"<a>b<c/>d</e>",
		"<a>$b$<c/>$d$</e>",
	},
	// Angle brackets that aren't a tag.
	{
		"not a tag #0",
		"<",
		"&lt;",
	},
	{
		"not a tag #1",
		"</",
		"&lt;/",
	},
	{
		"not a tag #2",
		"</>",
		"<!---->",
	},
	{
		"not a tag #3",
		"a</>b",
		"a$<!---->$b",
	},
	{
		"not a tag #4",
		"</ >",
		"<!-- -->",
	},
	{
		"not a tag #5",
		"</.",
		"<!--.-->",
	},
	{
		"not a tag #6",
		"</.>",
		"<!--.-->",
	},
	{
		"not a tag #7",
		"a < b",
		"a &lt; b",
	},
	{
		"not a tag #8",
		"<.>",
		"&lt;.&gt;",
	},
	{
		"not a tag #9",
		"a<<<b>>>c",
		"a&lt;&lt;$<b>$&gt;&gt;c",
	},
	{
		"not a tag #10",
		"if x<0 and y < 0 then x*y>0",
		"if x&lt;0 and y &lt; 0 then x*y&gt;0",
	},
	{
		"not a tag #11",
		"<<p>",
		"&lt;$<p>",
	},
	// EOF in a tag name.
	{
		"tag name eof #0",
		"<a",
		"",
	},
	{
		"tag name eof #1",
		"<a ",
		"",
	},
	{
		"tag name eof #2",
		"a<b",
		"a",
	},
	{
		"tag name eof #3",
		"<a><b",
		"<a>",
	},
	{
		"tag name eof #4",
		`<a x`,
		``,
	},
	// Some malformed tags that are missing a '>'.
	{
		"malformed tag #0",
		`<p</p>`,
		`<p< p="">`,
	},
	{
		"malformed tag #1",
		`<p </p>`,
		`<p <="" p="">`,
	},
	{
		"malformed tag #2",
		`<p id`,
		``,
	},
	{
		"malformed tag #3",
		`<p id=`,
		``,
	},
	{
		"malformed tag #4",
		`<p id=>`,
		`<p id="">`,
	},
	{
		"malformed tag #5",
		`<p id=0`,
		``,
	},
	{
		"malformed tag #6",
		`<p id=0</p>`,
		`<p id="0&lt;/p">`,
	},
	{
		"malformed tag #7",
		`<p id="0</p>`,
		``,
	},
	{
		"malformed tag #8",
		`<p id="0"</p>`,
		`<p id="0" <="" p="">`,
	},
	{
		"malformed tag #9",
		`<p></p id`,
		`<p>`,
	},
	// Raw text and RCDATA.
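	// Within a raw text element such as <script> or <style>, "</" closes the
	// element only when it is followed by the tag name and then whitespace,
	// "/" or ">"; anything else is ordinary text. RCDATA elements such as
	// <textarea> and <title> use the same end-tag rule but, unlike raw text,
	// still decode character references (compare "style element with entity"
	// and "title with tag and entity" below).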
	{
		"basic raw text",
		"<script><a></b></script>",
		"<script>$&lt;a&gt;&lt;/b&gt;$</script>",
	},
	{
		"unfinished script end tag",
		"<SCRIPT>a</SCR",
		"<script>$a&lt;/SCR",
	},
	{
		"broken script end tag",
		"<SCRIPT>a</SCR ipt>",
		"<script>$a&lt;/SCR ipt&gt;",
	},
	{
		"EOF in script end tag",
		"<SCRIPT>a</SCRipt",
		"<script>$a&lt;/SCRipt",
	},
	{
		"scriptx end tag",
		"<SCRIPT>a</SCRiptx",
		"<script>$a&lt;/SCRiptx",
	},
	{
		"' ' completes script end tag",
		"<SCRIPT>a</SCRipt ",
		"<script>$a",
	},
	{
		"'>' completes script end tag",
		"<SCRIPT>a</SCRipt>",
		"<script>$a$</script>",
	},
	{
		"self-closing script end tag",
		"<SCRIPT>a</SCRipt/>",
		"<script>$a$</script>",
	},
	{
		"nested script tag",
		"<SCRIPT>a</SCRipt<script>",
		"<script>$a&lt;/SCRipt&lt;script&gt;",
	},
	{
		"script end tag after unfinished",
		"<SCRIPT>a</SCRipt</script>",
		"<script>$a&lt;/SCRipt$</script>",
	},
	{
		"script/style mismatched tags",
		"<script>a</style>",
		"<script>$a&lt;/style&gt;",
	},
	{
		"style element with entity",
		"<style>&apos;",
		"<style>$&amp;apos;",
	},
	{
		"textarea with tag",
		"<textarea><div></textarea>",
		"<textarea>$&lt;div&gt;$</textarea>",
	},
	{
		"title with tag and entity",
		"<title><b>K&amp;R C</b></title>",
		"<title>$&lt;b&gt;K&amp;R C&lt;/b&gt;$</title>",
	},
	// DOCTYPE tests.
	{
		"Proper DOCTYPE",
		"<!DOCTYPE html>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with no space",
		"<!doctypehtml>",
		"<!DOCTYPE html>",
	},
	{
		"DOCTYPE with two spaces",
		"<!doctype  html>",
		"<!DOCTYPE html>",
	},
	{
		"looks like DOCTYPE but isn't",
		"<!DOCUMENT html>",
		"<!--DOCUMENT html-->",
	},
	{
		"DOCTYPE at EOF",
		"<!DOCtype",
		"<!DOCTYPE >",
	},
	// XML processing instructions.
	{
		"XML processing instruction",
		"<?xml?>",
		"<!--?xml?-->",
	},
	// Comments.
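	// A comment normally ends at the first "-->". The malformed terminator
	// "--!>" also closes a comment (comment13), and a comment left open at
	// EOF is closed implicitly (comment8 through comment12).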
	{
		"comment0",
		"abc<b><!-- skipme --></b>def",
		"abc$<b>$<!-- skipme -->$</b>$def",
	},
	{
		"comment1",
		"a<!-->z",
		"a$<!---->$z",
	},
	{
		"comment2",
		"a<!--->z",
		"a$<!---->$z",
	},
	{
		"comment3",
		"a<!--x>-->z",
		"a$<!--x>-->$z",
	},
	{
		"comment4",
		"a<!--x->-->z",
		"a$<!--x->-->$z",
	},
	{
		"comment5",
		"a<!>z",
		"a$<!---->$z",
	},
	{
		"comment6",
		"a<!->z",
		"a$<!----->$z",
	},
	{
		"comment7",
		"a<!---<>z",
		"a$<!---<>z-->",
	},
	{
		"comment8",
		"a<!--z",
		"a$<!--z-->",
	},
	{
		"comment9",
		"a<!--z-",
		"a$<!--z-->",
	},
	{
		"comment10",
		"a<!--z--",
		"a$<!--z-->",
	},
	{
		"comment11",
		"a<!--z---",
		"a$<!--z--->",
	},
	{
		"comment12",
		"a<!--z----",
		"a$<!--z---->",
	},
	{
		"comment13",
		"a<!--x--!>z",
		"a$<!--x-->$z",
	},
	// An attribute with a backslash.
	{
		"backslash",
		`<p id="a\"b">`,
		`<p id="a\" b"="">`,
	},
	// Entities, tag name and attribute key lower-casing, and whitespace
	// normalization within a tag.
	{
		"tricky",
		"<p \t\n iD=\"a&quot;B\" foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
		`<p id="a&#34;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
	},
	// A nonexistent entity. Tokenizing and converting back to a string should
	// escape the "&" to become "&amp;".
	{
		"noSuchEntity",
		`<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
		`<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
	},
	{
		"entity without semicolon",
		`&notit;&notin;<a b="q=z&amp=5&notice=hello&not=world">`,
		`¬it;∉$<a b="q=z&amp;amp=5&amp;notice=hello¬=world">`,
	},
	{
		"entity with digits",
		"&frac12;",
		"½",
	},
	// Attribute tests:
	// http://dev.w3.org/html5/pf-summary/Overview.html#attributes
	{
		"Empty attribute",
		`<input disabled FOO>`,
		`<input disabled="" foo="">`,
	},
	{
		"Empty attribute, whitespace",
		`<input disabled FOO >`,
		`<input disabled="" foo="">`,
	},
	{
		"Unquoted attribute value",
		`<input value=yes FOO=BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, spaces",
		`<input value = yes FOO = BAR>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Unquoted attribute value, trailing space",
		`<input value=yes FOO=BAR >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value",
		`<input value='yes' FOO='BAR'>`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Single-quoted attribute value, trailing space",
		`<input value='yes' FOO='BAR' >`,
		`<input value="yes" foo="BAR">`,
	},
	{
		"Double-quoted attribute value",
		`<input value="I'm an attribute" FOO="BAR">`,
		`<input value="I'm an attribute" foo="BAR">`,
	},
	{
		"Attribute name characters",
		`<meta http-equiv="content-type">`,
		`<meta http-equiv="content-type">`,
	},
	{
		"Mixed attributes",
		`a<P V="0 1" w='2' X=3 y>z`,
		`a$<p v="0 1" w="2" x="3" y="">$z`,
	},
	{
		"Attributes with a solitary single quote",
		`<p id=can't><p id=won't>`,
		`<p id="can't">$<p id="won't">`,
	},
}

func TestTokenizer(t *testing.T) {
loop:
	for _, tt := range tokenTests {
		z := NewTokenizer(strings.NewReader(tt.html))
		if tt.golden != "" {
			for i, s := range strings.Split(tt.golden, "$") {
				if z.Next() == ErrorToken {
					t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Err())
					continue loop
				}
				actual := z.Token().String()
				if s != actual {
					t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
					continue loop
				}
			}
		}
		z.Next()
		if z.Err() != io.EOF {
			t.Errorf("%s: want EOF got %q", tt.desc, z.Err())
		}
	}
}

func TestMaxBuffer(t *testing.T) {
	// Exceeding the maximum buffer size generates ErrBufferExceeded.
	z := NewTokenizer(strings.NewReader("<" + strings.Repeat("t", 10)))
	z.SetMaxBuf(5)
	tt := z.Next()
	if got, want := tt, ErrorToken; got != want {
		t.Fatalf("token type: got: %v want: %v", got, want)
	}
	if got, want := z.Err(), ErrBufferExceeded; got != want {
		t.Errorf("error type: got: %v want: %v", got, want)
	}
	if got, want := string(z.Raw()), "<tttt"; got != want {
		t.Fatalf("buffered before overflow: got: %q want: %q", got, want)
	}
}
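
// tokenizeAll is an illustrative sketch rather than part of the test suite:
// it shows the canonical consumption loop used throughout this file,
// accumulating each token's raw bytes until Next returns ErrorToken and then
// distinguishing a normal io.EOF from a real error such as ErrBufferExceeded.
func tokenizeAll(r io.Reader) ([]byte, error) {
	z := NewTokenizer(r)
	var raw bytes.Buffer
	for {
		tt := z.Next()
		// Raw returns the unmodified input bytes of the current token; for an
		// ErrorToken it is whatever input, if any, was consumed before the error.
		raw.Write(z.Raw())
		if tt == ErrorToken {
			if err := z.Err(); err != io.EOF {
				return raw.Bytes(), err
			}
			return raw.Bytes(), nil
		}
	}
}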

func TestMaxBufferReconstruction(t *testing.T) {
	// Exceeding the maximum buffer size at any point while tokenizing permits
	// reconstructing the original input.
tests:
	for _, test := range tokenTests {
		for maxBuf := 1; ; maxBuf++ {
			r := strings.NewReader(test.html)
			z := NewTokenizer(r)
			z.SetMaxBuf(maxBuf)
			var tokenized bytes.Buffer
			for {
				tt := z.Next()
				tokenized.Write(z.Raw())
				if tt == ErrorToken {
					if err := z.Err(); err != io.EOF && err != ErrBufferExceeded {
						t.Errorf("%s: unexpected error: %v", test.desc, err)
					}
					break
				}
			}
			// Anything tokenized along with untokenized input or data left in the reader.
			assembled, err := ioutil.ReadAll(io.MultiReader(&tokenized, bytes.NewReader(z.Buffered()), r))
			if err != nil {
				t.Errorf("%s: ReadAll: %v", test.desc, err)
				continue tests
			}
			if got, want := string(assembled), test.html; got != want {
				t.Errorf("%s: reassembled html:\n got: %q\nwant: %q", test.desc, got, want)
				continue tests
			}
			// EOF indicates that we completed tokenization and hence found the max
			// maxBuf that generates ErrBufferExceeded, so continue to the next test.
			if z.Err() == io.EOF {
				break
			}
		} // buffer sizes
	} // tests
}

func TestPassthrough(t *testing.T) {
	// Accumulating the raw output for each parse event should reconstruct the
	// original input.
	for _, test := range tokenTests {
		z := NewTokenizer(strings.NewReader(test.html))
		var parsed bytes.Buffer
		for {
			tt := z.Next()
			parsed.Write(z.Raw())
			if tt == ErrorToken {
				break
			}
		}
		if got, want := parsed.String(), test.html; got != want {
			t.Errorf("%s: parsed output:\n got: %q\nwant: %q", test.desc, got, want)
		}
	}
}

func TestBufAPI(t *testing.T) {
	s := "0<a>1</a>2<b>3<a>4<a>5</a>6</b>7</a>8<a/>9"
	z := NewTokenizer(bytes.NewBufferString(s))
	var result bytes.Buffer
	depth := 0
loop:
	for {
		tt := z.Next()
		switch tt {
		case ErrorToken:
			if z.Err() != io.EOF {
				t.Error(z.Err())
			}
			break loop
		case TextToken:
			if depth > 0 {
				result.Write(z.Text())
			}
		case StartTagToken, EndTagToken:
			tn, _ := z.TagName()
			if len(tn) == 1 && tn[0] == 'a' {
				if tt == StartTagToken {
					depth++
				} else {
					depth--
				}
			}
		}
	}
	u := "14567"
	v := string(result.Bytes())
	if u != v {
		t.Errorf("TestBufAPI: want %q got %q", u, v)
	}
}

func TestConvertNewlines(t *testing.T) {
	testCases := map[string]string{
		"Mac\rDOS\r\nUnix\n":    "Mac\nDOS\nUnix\n",
		"Unix\nMac\rDOS\r\n":    "Unix\nMac\nDOS\n",
		"DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
		"":         "",
		"\n":       "\n",
		"\n\r":     "\n\n",
		"\r":       "\n",
		"\r\n":     "\n",
		"\r\n\n":   "\n\n",
		"\r\n\r":   "\n\n",
		"\r\n\r\n": "\n\n",
		"\r\r":     "\n\n",
		"\r\r\n":   "\n\n",
		"\r\r\n\n": "\n\n\n",
		"\r\r\r\n": "\n\n\n",
		"\r \n":    "\n \n",
		"xyz":      "xyz",
	}
	for in, want := range testCases {
		if got := string(convertNewlines([]byte(in))); got != want {
			t.Errorf("input %q: got %q, want %q", in, got, want)
		}
	}
}
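
// convertNewlines performs the "\r" / "\r\n" to "\n" normalization that
// HTML5 input preprocessing calls for. The tokenizer is expected to apply it
// when producing decoded values (e.g. in Text and in attribute values from
// TagAttr) rather than to the raw bytes, which is why TestPassthrough above
// can still reconstruct the original input byte for byte.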

func TestReaderEdgeCases(t *testing.T) {
	const s = "<p>An io.Reader can return (0, nil) or (n, io.EOF).</p>"
	testCases := []io.Reader{
		&zeroOneByteReader{s: s},
		&eofStringsReader{s: s},
		&stuckReader{},
	}
	for i, tc := range testCases {
		got := []TokenType{}
		z := NewTokenizer(tc)
		for {
			tt := z.Next()
			if tt == ErrorToken {
				break
			}
			got = append(got, tt)
		}
		if err := z.Err(); err != nil && err != io.EOF {
			if err != io.ErrNoProgress {
				t.Errorf("i=%d: %v", i, err)
			}
			continue
		}
		want := []TokenType{
			StartTagToken,
			TextToken,
			EndTagToken,
		}
		if !reflect.DeepEqual(got, want) {
			t.Errorf("i=%d: got %v, want %v", i, got, want)
			continue
		}
	}
}

// zeroOneByteReader is like a strings.Reader that alternates between
// returning 0 bytes and 1 byte at a time.
type zeroOneByteReader struct {
	s string
	n int
}

func (r *zeroOneByteReader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if len(r.s) == 0 {
		return 0, io.EOF
	}
	r.n++
	if r.n%2 != 0 {
		return 0, nil
	}
	p[0], r.s = r.s[0], r.s[1:]
	return 1, nil
}

// eofStringsReader is like a strings.Reader but can return an (n, err) where
// n > 0 && err != nil.
type eofStringsReader struct {
	s string
}

func (r *eofStringsReader) Read(p []byte) (int, error) {
	n := copy(p, r.s)
	r.s = r.s[n:]
	if r.s != "" {
		return n, nil
	}
	return n, io.EOF
}

// stuckReader is an io.Reader that always returns no data and no error.
type stuckReader struct{}

func (*stuckReader) Read(p []byte) (int, error) {
	return 0, nil
}

const (
	rawLevel = iota
	lowLevel
	highLevel
)

func benchmarkTokenizer(b *testing.B, level int) {
	buf, err := ioutil.ReadFile("testdata/go1.html")
	if err != nil {
		b.Fatalf("could not read testdata/go1.html: %v", err)
	}
	b.SetBytes(int64(len(buf)))
	runtime.GC()
	b.ReportAllocs()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		z := NewTokenizer(bytes.NewBuffer(buf))
		for {
			tt := z.Next()
			if tt == ErrorToken {
				if err := z.Err(); err != nil && err != io.EOF {
					b.Fatalf("tokenizer error: %v", err)
				}
				break
			}
			switch level {
			case rawLevel:
				// Calling z.Raw just returns the raw bytes of the token. It does
				// not unescape &lt; to <, or lower-case tag names and attribute keys.
				z.Raw()
			case lowLevel:
				// Calling z.Text, z.TagName and z.TagAttr returns []byte values
				// whose contents may change on the next call to z.Next.
				switch tt {
				case TextToken, CommentToken, DoctypeToken:
					z.Text()
				case StartTagToken, SelfClosingTagToken:
					_, more := z.TagName()
					for more {
						_, _, more = z.TagAttr()
					}
				case EndTagToken:
					z.TagName()
				}
			case highLevel:
				// Calling z.Token converts []byte values to strings whose validity
				// extends beyond the next call to z.Next.
				z.Token()
			}
		}
	}
}

func BenchmarkRawLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, rawLevel) }
func BenchmarkLowLevelTokenizer(b *testing.B)  { benchmarkTokenizer(b, lowLevel) }
func BenchmarkHighLevelTokenizer(b *testing.B) { benchmarkTokenizer(b, highLevel) }
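
// collectHrefs is a minimal sketch of the lowLevel style benchmarked above,
// not part of the test suite: it walks start tags with TagName and TagAttr
// and copies out every "href" attribute value. The copies matter because the
// []byte values returned by the low-level calls may be invalidated by the
// next call to z.Next.
func collectHrefs(r io.Reader) []string {
	var hrefs []string
	z := NewTokenizer(r)
	for {
		tt := z.Next()
		if tt == ErrorToken {
			// io.EOF means normal end of input; callers can inspect z.Err().
			return hrefs
		}
		if tt != StartTagToken && tt != SelfClosingTagToken {
			continue
		}
		_, hasAttr := z.TagName()
		for hasAttr {
			var key, val []byte
			key, val, hasAttr = z.TagAttr()
			if string(key) == "href" {
				hrefs = append(hrefs, string(val))
			}
		}
	}
}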