github.com/vugu/vugu@v0.3.6-0.20240430171613-3f6f402e014b/internal/htmlx/token.go (about) 1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package htmlx 6 7 import ( 8 "bytes" 9 "errors" 10 "io" 11 "strconv" 12 "strings" 13 14 "github.com/vugu/vugu/internal/htmlx/atom" 15 ) 16 17 // A TokenType is the type of a Token. 18 type TokenType uint32 19 20 const ( 21 // ErrorToken means that an error occurred during tokenization. 22 ErrorToken TokenType = iota 23 // TextToken means a text node. 24 TextToken 25 // A StartTagToken looks like <a>. 26 StartTagToken 27 // An EndTagToken looks like </a>. 28 EndTagToken 29 // A SelfClosingTagToken tag looks like <br/>. 30 SelfClosingTagToken 31 // A CommentToken looks like <!--x-->. 32 CommentToken 33 // A DoctypeToken looks like <!DOCTYPE x> 34 DoctypeToken 35 ) 36 37 // ErrBufferExceeded means that the buffering limit was exceeded. 38 var ErrBufferExceeded = errors.New("max buffer exceeded") 39 40 // String returns a string representation of the TokenType. 41 func (t TokenType) String() string { 42 switch t { 43 case ErrorToken: 44 return "Error" 45 case TextToken: 46 return "Text" 47 case StartTagToken: 48 return "StartTag" 49 case EndTagToken: 50 return "EndTag" 51 case SelfClosingTagToken: 52 return "SelfClosingTag" 53 case CommentToken: 54 return "Comment" 55 case DoctypeToken: 56 return "Doctype" 57 } 58 return "Invalid(" + strconv.Itoa(int(t)) + ")" 59 } 60 61 // An Attribute is an attribute namespace-key-value triple. Namespace is 62 // non-empty for foreign attributes like xlink, Key is alphabetic (and hence 63 // does not contain escapable characters like '&', '<' or '>'), and Val is 64 // unescaped (it looks like "a<b" rather than "a<b"). 65 // 66 // Namespace is only used by the parser, not the tokenizer. 
67 type Attribute struct { 68 Namespace, Key, Val string 69 } 70 71 // A Token consists of a TokenType and some Data (tag name for start and end 72 // tags, content for text, comments and doctypes). A tag Token may also contain 73 // a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b" 74 // rather than "a<b"). For tag Tokens, DataAtom is the atom for Data, or 75 // zero if Data is not a known tag name. 76 type Token struct { 77 Type TokenType 78 DataAtom atom.Atom 79 Data string 80 Attr []Attribute 81 Column int 82 Line int 83 } 84 85 // tagString returns a string representation of a tag Token's Data and Attr. 86 func (t Token) tagString() string { 87 if len(t.Attr) == 0 { 88 return t.Data 89 } 90 buf := bytes.NewBufferString(t.Data) 91 for _, a := range t.Attr { 92 buf.WriteByte(' ') 93 buf.WriteString(a.Key) 94 buf.WriteString(`="`) 95 err := escape(buf, a.Val) 96 if err != nil { 97 panic(err) 98 } 99 buf.WriteByte('"') 100 } 101 return buf.String() 102 } 103 104 // String returns a string representation of the Token. 105 func (t Token) String() string { 106 switch t.Type { 107 case ErrorToken: 108 return "" 109 case TextToken: 110 return EscapeString(t.Data) 111 case StartTagToken: 112 return "<" + t.tagString() + ">" 113 case EndTagToken: 114 return "</" + t.tagString() + ">" 115 case SelfClosingTagToken: 116 return "<" + t.tagString() + "/>" 117 case CommentToken: 118 return "<!--" + t.Data + "-->" 119 case DoctypeToken: 120 return "<!DOCTYPE " + t.Data + ">" 121 } 122 return "Invalid(" + strconv.Itoa(int(t.Type)) + ")" 123 } 124 125 // span is a range of bytes in a Tokenizer's buffer. The start is inclusive, 126 // the end is exclusive. 127 type span struct { 128 start, end int 129 } 130 131 // A Tokenizer returns a stream of HTML Tokens. 132 type Tokenizer struct { 133 // r is the source of the HTML text. 134 r io.Reader 135 // tt is the TokenType of the current token. 
136 tt TokenType 137 // err is the first error encountered during tokenization. It is possible 138 // for tt != Error && err != nil to hold: this means that Next returned a 139 // valid token but the subsequent Next call will return an error token. 140 // For example, if the HTML text input was just "plain", then the first 141 // Next call would set z.err to io.EOF but return a TextToken, and all 142 // subsequent Next calls would return an ErrorToken. 143 // err is never reset. Once it becomes non-nil, it stays non-nil. 144 err error 145 // readErr is the error returned by the io.Reader r. It is separate from 146 // err because it is valid for an io.Reader to return (n int, err1 error) 147 // such that n > 0 && err1 != nil, and callers should always process the 148 // n > 0 bytes before considering the error err1. 149 readErr error 150 // buf[raw.start:raw.end] holds the raw bytes of the current token. 151 // buf[raw.end:] is buffered input that will yield future tokens. 152 raw span 153 buf []byte 154 // maxBuf limits the data buffered in buf. A value of 0 means unlimited. 155 maxBuf int 156 // buf[data.start:data.end] holds the raw bytes of the current token's data: 157 // a text token's text, a tag token's tag name, etc. 158 data span 159 // pendingAttr is the attribute key and value currently being tokenized. 160 // When complete, pendingAttr is pushed onto attr. nAttrReturned is 161 // incremented on each call to TagAttr. 162 pendingAttr [2]span 163 attr [][2]span 164 nAttrReturned int 165 // rawTag is the "script" in "</script>" that closes the next token. If 166 // non-empty, the subsequent call to Next will return a raw or RCDATA text 167 // token: one that treats "<p>" as text instead of an element. 168 // rawTag's contents are lower-cased. 169 rawTag string 170 // textIsRaw is whether the current text token's data is not escaped. 
171 textIsRaw bool 172 // convertNUL is whether NUL bytes in the current token's data should 173 // be converted into \ufffd replacement characters. 174 convertNUL bool 175 // allowCDATA is whether CDATA sections are allowed in the current context. 176 allowCDATA bool 177 // tokenLine is the line that tt is found on. 178 tokenLine int 179 // tokenColumn is the column that tt starts on. 180 tokenColumn int 181 // currentLine is the ongoing temporary variable for tracking lines. 182 currentLine int 183 // currentColumn is the ongoing temporary variable for tracking columns. 184 currentColumn int 185 } 186 187 // AllowCDATA sets whether or not the tokenizer recognizes <![CDATA[foo]]> as 188 // the text "foo". The default value is false, which means to recognize it as 189 // a bogus comment "<!-- [CDATA[foo]] -->" instead. 190 // 191 // Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and 192 // only if tokenizing foreign content, such as MathML and SVG. However, 193 // tracking foreign-contentness is difficult to do purely in the tokenizer, 194 // as opposed to the parser, due to HTML integration points: an <svg> element 195 // can contain a <foreignObject> that is foreign-to-SVG but not foreign-to- 196 // HTML. For strict compliance with the HTML5 tokenization algorithm, it is the 197 // responsibility of the user of a tokenizer to call AllowCDATA as appropriate. 198 // In practice, if using the tokenizer without caring whether MathML or SVG 199 // CDATA is text or comments, such as tokenizing HTML to find all the anchor 200 // text, it is acceptable to ignore this responsibility. 201 func (z *Tokenizer) AllowCDATA(allowCDATA bool) { 202 z.allowCDATA = allowCDATA 203 } 204 205 // NextIsNotRawText instructs the tokenizer that the next token should not be 206 // considered as 'raw text'. Some elements, such as script and title elements, 207 // normally require the next token after the opening tag to be 'raw text' that 208 // has no child elements. 
For example, tokenizing "<title>a<b>c</b>d</title>" 209 // yields a start tag token for "<title>", a text token for "a<b>c</b>d", and 210 // an end tag token for "</title>". There are no distinct start tag or end tag 211 // tokens for the "<b>" and "</b>". 212 // 213 // This tokenizer implementation will generally look for raw text at the right 214 // times. Strictly speaking, an HTML5 compliant tokenizer should not look for 215 // raw text if in foreign content: <title> generally needs raw text, but a 216 // <title> inside an <svg> does not. Another example is that a <textarea> 217 // generally needs raw text, but a <textarea> is not allowed as an immediate 218 // child of a <select>; in normal parsing, a <textarea> implies </select>, but 219 // one cannot close the implicit element when parsing a <select>'s InnerHTML. 220 // Similarly to AllowCDATA, tracking the correct moment to override raw-text- 221 // ness is difficult to do purely in the tokenizer, as opposed to the parser. 222 // For strict compliance with the HTML5 tokenization algorithm, it is the 223 // responsibility of the user of a tokenizer to call NextIsNotRawText as 224 // appropriate. In practice, like AllowCDATA, it is acceptable to ignore this 225 // responsibility for basic usage. 226 // 227 // Note that this 'raw text' concept is different from the one offered by the 228 // Tokenizer.Raw method. 229 func (z *Tokenizer) NextIsNotRawText() { 230 z.rawTag = "" 231 } 232 233 // Err returns the error associated with the most recent ErrorToken token. 234 // This is typically io.EOF, meaning the end of tokenization. 235 func (z *Tokenizer) Err() error { 236 if z.tt != ErrorToken { 237 return nil 238 } 239 return z.err 240 } 241 242 // readByte returns the next byte from the input stream, doing a buffered read 243 // from z.r into z.buf if necessary. z.buf[z.raw.start:z.raw.end] remains a contiguous byte 244 // slice that holds all the bytes read so far for the current token. 
// It sets z.err if the underlying reader returns an error.
// Pre-condition: z.err == nil.
func (z *Tokenizer) readByte() byte {
	if z.raw.end >= len(z.buf) {
		// Our buffer is exhausted and we have to read from z.r. Check if the
		// previous read resulted in an error.
		if z.readErr != nil {
			z.err = z.readErr
			return 0
		}
		// We copy z.buf[z.raw.start:z.raw.end] to the beginning of z.buf. If the length
		// z.raw.end - z.raw.start is more than half the capacity of z.buf, then we
		// allocate a new buffer before the copy.
		c := cap(z.buf)
		d := z.raw.end - z.raw.start
		var buf1 []byte
		if 2*d > c {
			buf1 = make([]byte, d, 2*c)
		} else {
			buf1 = z.buf[:d]
		}
		copy(buf1, z.buf[z.raw.start:z.raw.end])
		if x := z.raw.start; x != 0 {
			// Adjust the data/attr spans to refer to the same contents after the copy.
			z.data.start -= x
			z.data.end -= x
			z.pendingAttr[0].start -= x
			z.pendingAttr[0].end -= x
			z.pendingAttr[1].start -= x
			z.pendingAttr[1].end -= x
			for i := range z.attr {
				z.attr[i][0].start -= x
				z.attr[i][0].end -= x
				z.attr[i][1].start -= x
				z.attr[i][1].end -= x
			}
		}
		z.raw.start, z.raw.end, z.buf = 0, d, buf1[:d]
		// Now that we have copied the live bytes to the start of the buffer,
		// we read from z.r into the remainder.
		var n int
		n, z.readErr = readAtLeastOneByte(z.r, buf1[d:cap(buf1)])
		if n == 0 {
			z.err = z.readErr
			return 0
		}
		z.buf = buf1[:d+n]
	}
	x := z.buf[z.raw.end]
	z.raw.end++
	if z.maxBuf > 0 && z.raw.end-z.raw.start >= z.maxBuf {
		z.err = ErrBufferExceeded
		return 0
	}

	// Increment the line and column tracker. A newline starts a new line and
	// resets the column to 0; any other byte advances the column.
	// NOTE(review): columns are counted in bytes, not runes — multi-byte UTF-8
	// characters advance the column by more than one.
	if x == '\n' {
		z.currentLine++
		z.currentColumn = 0
	} else {
		z.currentColumn++
	}

	return x
}

// Buffered returns a slice containing data buffered but not yet tokenized.
312 func (z *Tokenizer) Buffered() []byte { 313 return z.buf[z.raw.end:] 314 } 315 316 // readAtLeastOneByte wraps an io.Reader so that reading cannot return (0, nil). 317 // It returns io.ErrNoProgress if the underlying r.Read method returns (0, nil) 318 // too many times in succession. 319 func readAtLeastOneByte(r io.Reader, b []byte) (int, error) { 320 for i := 0; i < 100; i++ { 321 n, err := r.Read(b) 322 if n != 0 || err != nil { 323 return n, err 324 } 325 } 326 return 0, io.ErrNoProgress 327 } 328 329 // skipWhiteSpace skips past any white space. 330 func (z *Tokenizer) skipWhiteSpace() { 331 if z.err != nil { 332 return 333 } 334 for { 335 c := z.readByte() 336 if z.err != nil { 337 return 338 } 339 switch c { 340 case ' ', '\n', '\r', '\t', '\f': 341 // No-op. 342 default: 343 z.raw.end-- 344 return 345 } 346 } 347 } 348 349 // readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and 350 // is typically something like "script" or "textarea". 351 func (z *Tokenizer) readRawOrRCDATA() { 352 if z.rawTag == "script" { 353 z.readScript() 354 z.textIsRaw = true 355 z.rawTag = "" 356 return 357 } 358 loop: 359 for { 360 c := z.readByte() 361 if z.err != nil { 362 break loop 363 } 364 if c != '<' { 365 continue loop 366 } 367 c = z.readByte() 368 if z.err != nil { 369 break loop 370 } 371 if c != '/' { 372 continue loop 373 } 374 if z.readRawEndTag() || z.err != nil { 375 break loop 376 } 377 } 378 z.data.end = z.raw.end 379 // A textarea's or title's RCDATA can contain escaped entities. 380 z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title" 381 z.rawTag = "" 382 } 383 384 // readRawEndTag attempts to read a tag like "</foo>", where "foo" is z.rawTag. 385 // If it succeeds, it backs up the input position to reconsume the tag and 386 // returns true. Otherwise it returns false. The opening "</" has already been 387 // consumed. 
func (z *Tokenizer) readRawEndTag() bool {
	// Match the tag name case-insensitively (rawTag is already lower-cased;
	// c - ('a'-'A') is the upper-case form).
	for i := 0; i < len(z.rawTag); i++ {
		c := z.readByte()
		if z.err != nil {
			return false
		}
		if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
			z.raw.end--
			return false
		}
	}
	c := z.readByte()
	if z.err != nil {
		return false
	}
	switch c {
	case ' ', '\n', '\r', '\t', '\f', '/', '>':
		// The 3 is 2 for the leading "</" plus 1 for the trailing character c.
		z.raw.end -= 3 + len(z.rawTag)
		return true
	}
	z.raw.end--
	return false
}

// readScript reads until the next </script> tag, following the byzantine
// rules for escaping/hiding the closing tag. Each label below corresponds to
// a state in the HTML5 "script data" tokenization state machine.
func (z *Tokenizer) readScript() {
	defer func() {
		z.data.end = z.raw.end
	}()
	var c byte

scriptData:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '<' {
		goto scriptDataLessThanSign
	}
	goto scriptData

scriptDataLessThanSign:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '/':
		goto scriptDataEndTagOpen
	case '!':
		goto scriptDataEscapeStart
	}
	z.raw.end--
	goto scriptData

scriptDataEndTagOpen:
	if z.readRawEndTag() || z.err != nil {
		return
	}
	goto scriptData

scriptDataEscapeStart:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '-' {
		goto scriptDataEscapeStartDash
	}
	z.raw.end--
	goto scriptData

scriptDataEscapeStartDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '-' {
		goto scriptDataEscapedDashDash
	}
	z.raw.end--
	goto scriptData

scriptDataEscaped:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataEscapedDash
	case '<':
		goto scriptDataEscapedLessThanSign
	}
	goto scriptDataEscaped

scriptDataEscapedDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataEscapedDashDash
	case '<':
		goto scriptDataEscapedLessThanSign
	}
	goto scriptDataEscaped

scriptDataEscapedDashDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataEscapedDashDash
	case '<':
		goto scriptDataEscapedLessThanSign
	case '>':
		goto scriptData
	}
	goto scriptDataEscaped

scriptDataEscapedLessThanSign:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '/' {
		goto scriptDataEscapedEndTagOpen
	}
	if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
		goto scriptDataDoubleEscapeStart
	}
	z.raw.end--
	goto scriptData

scriptDataEscapedEndTagOpen:
	if z.readRawEndTag() || z.err != nil {
		return
	}
	goto scriptDataEscaped

scriptDataDoubleEscapeStart:
	z.raw.end--
	// Check for a nested "<script" (case-insensitive) inside an escaped block.
	for i := 0; i < len("script"); i++ {
		c = z.readByte()
		if z.err != nil {
			return
		}
		if c != "script"[i] && c != "SCRIPT"[i] {
			z.raw.end--
			goto scriptDataEscaped
		}
	}
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case ' ', '\n', '\r', '\t', '\f', '/', '>':
		goto scriptDataDoubleEscaped
	}
	z.raw.end--
	goto scriptDataEscaped

scriptDataDoubleEscaped:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataDoubleEscapedDash
	case '<':
		goto scriptDataDoubleEscapedLessThanSign
	}
	goto scriptDataDoubleEscaped

scriptDataDoubleEscapedDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataDoubleEscapedDashDash
	case '<':
		goto scriptDataDoubleEscapedLessThanSign
	}
	goto scriptDataDoubleEscaped

scriptDataDoubleEscapedDashDash:
	c = z.readByte()
	if z.err != nil {
		return
	}
	switch c {
	case '-':
		goto scriptDataDoubleEscapedDashDash
	case '<':
		goto scriptDataDoubleEscapedLessThanSign
	case '>':
		goto scriptData
	}
	goto scriptDataDoubleEscaped

scriptDataDoubleEscapedLessThanSign:
	c = z.readByte()
	if z.err != nil {
		return
	}
	if c == '/' {
		goto scriptDataDoubleEscapeEnd
	}
	z.raw.end--
	goto scriptDataDoubleEscaped

scriptDataDoubleEscapeEnd:
	if z.readRawEndTag() {
		// The "</script>" here closes the double-escape, not the script
		// element itself, so un-back-up past it and keep scanning.
		z.raw.end += len("</script>")
		goto scriptDataEscaped
	}
	if z.err != nil {
		return
	}
	goto scriptDataDoubleEscaped
}

// readComment reads the next comment token starting with "<!--". The opening
// "<!--" has already been consumed.
func (z *Tokenizer) readComment() {
	z.data.start = z.raw.end
	defer func() {
		if z.data.end < z.data.start {
			// It's a comment with no data, like <!-->.
			z.data.end = z.data.start
		}
	}()
	// dashCount starts at 2 to account for the "--" already consumed, so an
	// immediate '>' (i.e. "<!-->") closes the comment.
	for dashCount := 2; ; {
		c := z.readByte()
		if z.err != nil {
			// Ignore up to two dashes at EOF.
			if dashCount > 2 {
				dashCount = 2
			}
			z.data.end = z.raw.end - dashCount
			return
		}
		switch c {
		case '-':
			dashCount++
			continue
		case '>':
			if dashCount >= 2 {
				z.data.end = z.raw.end - len("-->")
				return
			}
		case '!':
			if dashCount >= 2 {
				// "--!>" also closes a comment per the HTML5 spec.
				c = z.readByte()
				if z.err != nil {
					z.data.end = z.raw.end
					return
				}
				if c == '>' {
					z.data.end = z.raw.end - len("--!>")
					return
				}
			}
		}
		dashCount = 0
	}
}

// readUntilCloseAngle reads until the next ">".
func (z *Tokenizer) readUntilCloseAngle() {
	z.data.start = z.raw.end
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return
		}
		if c == '>' {
			z.data.end = z.raw.end - len(">")
			return
		}
	}
}

// readMarkupDeclaration reads the next token starting with "<!". It might be
// a "<!--comment-->", a "<!DOCTYPE foo>", a "<![CDATA[section]]>" or
// "<!a bogus comment". The opening "<!" has already been consumed.
func (z *Tokenizer) readMarkupDeclaration() TokenType {
	z.data.start = z.raw.end
	var c [2]byte
	for i := 0; i < 2; i++ {
		c[i] = z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return CommentToken
		}
	}
	if c[0] == '-' && c[1] == '-' {
		z.readComment()
		return CommentToken
	}
	// Not a comment: back up the two bytes and try DOCTYPE / CDATA.
	z.raw.end -= 2
	if z.readDoctype() {
		return DoctypeToken
	}
	if z.allowCDATA && z.readCDATA() {
		z.convertNUL = true
		return TextToken
	}
	// It's a bogus comment.
	z.readUntilCloseAngle()
	return CommentToken
}

// readDoctype attempts to read a doctype declaration and returns true if
// successful. The opening "<!" has already been consumed.
func (z *Tokenizer) readDoctype() bool {
	const s = "DOCTYPE"
	for i := 0; i < len(s); i++ {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return false
		}
		// Match case-insensitively (s[i]+('a'-'A') is the lower-case form).
		if c != s[i] && c != s[i]+('a'-'A') {
			// Back up to read the fragment of "DOCTYPE" again.
			z.raw.end = z.data.start
			return false
		}
	}
	if z.skipWhiteSpace(); z.err != nil {
		z.data.start = z.raw.end
		z.data.end = z.raw.end
		return true
	}
	z.readUntilCloseAngle()
	return true
}

// readCDATA attempts to read a CDATA section and returns true if
// successful. The opening "<!" has already been consumed.
func (z *Tokenizer) readCDATA() bool {
	const s = "[CDATA["
	for i := 0; i < len(s); i++ {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return false
		}
		if c != s[i] {
			// Back up to read the fragment of "[CDATA[" again.
			z.raw.end = z.data.start
			return false
		}
	}
	z.data.start = z.raw.end
	// brackets counts consecutive ']' bytes; "]]>" ends the section.
	brackets := 0
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return true
		}
		switch c {
		case ']':
			brackets++
		case '>':
			if brackets >= 2 {
				z.data.end = z.raw.end - len("]]>")
				return true
			}
			brackets = 0
		default:
			brackets = 0
		}
	}
}

// startTagIn returns whether the start tag in z.buf[z.data.start:z.data.end]
// case-insensitively matches any element of ss.
func (z *Tokenizer) startTagIn(ss ...string) bool {
loop:
	for _, s := range ss {
		if z.data.end-z.data.start != len(s) {
			continue loop
		}
		for i := 0; i < len(s); i++ {
			c := z.buf[z.data.start+i]
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
			}
			if c != s[i] {
				continue loop
			}
		}
		return true
	}
	return false
}

// readStartTag reads the next start tag token. The opening "<a" has already
// been consumed, where 'a' means anything in [A-Za-z].
func (z *Tokenizer) readStartTag() TokenType {
	z.readTag(true)
	if z.err != nil {
		return ErrorToken
	}
	// Several tags flag the tokenizer's next token as raw.
	c, raw := z.buf[z.data.start], false
	if 'A' <= c && c <= 'Z' {
		c += 'a' - 'A'
	}
	switch c {
	case 'i':
		raw = z.startTagIn("iframe")
	case 'n':
		raw = z.startTagIn("noembed", "noframes", "noscript")
	case 'p':
		raw = z.startTagIn("plaintext")
	case 's':
		raw = z.startTagIn("script", "style")
	case 't':
		raw = z.startTagIn("textarea", "title")
	case 'x':
		raw = z.startTagIn("xmp")
	}
	if raw {
		z.rawTag = strings.ToLower(string(z.buf[z.data.start:z.data.end]))
	}
	// Look for a self-closing token like "<br/>".
	if z.err == nil && z.buf[z.raw.end-2] == '/' {
		return SelfClosingTagToken
	}
	return StartTagToken
}

// readTag reads the next tag token and its attributes. If saveAttr, those
// attributes are saved in z.attr, otherwise z.attr is set to an empty slice.
// The opening "<a" or "</a" has already been consumed, where 'a' means anything
// in [A-Za-z].
func (z *Tokenizer) readTag(saveAttr bool) {
	z.attr = z.attr[:0]
	z.nAttrReturned = 0
	// Read the tag name and attribute key/value pairs.
	z.readTagName()
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	for {
		c := z.readByte()
		if z.err != nil || c == '>' {
			break
		}
		z.raw.end--
		z.readTagAttrKey()
		z.readTagAttrVal()
		// Save pendingAttr if saveAttr and that attribute has a non-empty key.
		if saveAttr && z.pendingAttr[0].start != z.pendingAttr[0].end {
			z.attr = append(z.attr, z.pendingAttr)
		}
		if z.skipWhiteSpace(); z.err != nil {
			break
		}
	}
}

// readTagName sets z.data to the "div" in "<div k=v>". The reader (z.raw.end)
// is positioned such that the first byte of the tag name (the "d" in "<div")
// has already been consumed.
func (z *Tokenizer) readTagName() {
	z.data.start = z.raw.end - 1
	for {
		c := z.readByte()
		if z.err != nil {
			z.data.end = z.raw.end
			return
		}
		switch c {
		case ' ', '\n', '\r', '\t', '\f':
			z.data.end = z.raw.end - 1
			return
		case '/', '>':
			z.raw.end--
			z.data.end = z.raw.end
			return
		}
	}
}

// readTagAttrKey sets z.pendingAttr[0] to the "k" in "<div k=v>".
// Precondition: z.err == nil.
func (z *Tokenizer) readTagAttrKey() {
	z.pendingAttr[0].start = z.raw.end
	for {
		c := z.readByte()
		if z.err != nil {
			z.pendingAttr[0].end = z.raw.end
			return
		}
		switch c {
		case ' ', '\n', '\r', '\t', '\f', '/':
			z.pendingAttr[0].end = z.raw.end - 1
			return
		case '=', '>':
			// Leave '=' / '>' to be reconsumed by readTagAttrVal / readTag.
			z.raw.end--
			z.pendingAttr[0].end = z.raw.end
			return
		}
	}
}

// readTagAttrVal sets z.pendingAttr[1] to the "v" in "<div k=v>".
func (z *Tokenizer) readTagAttrVal() {
	z.pendingAttr[1].start = z.raw.end
	z.pendingAttr[1].end = z.raw.end
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	c := z.readByte()
	if z.err != nil {
		return
	}
	if c != '=' {
		// No value: back up and leave the empty span set above.
		z.raw.end--
		return
	}
	if z.skipWhiteSpace(); z.err != nil {
		return
	}
	quote := z.readByte()
	if z.err != nil {
		return
	}
	switch quote {
	case '>':
		z.raw.end--
		return

	case '\'', '"':
		// Quoted value: read until the matching quote.
		z.pendingAttr[1].start = z.raw.end
		for {
			c := z.readByte()
			if z.err != nil {
				z.pendingAttr[1].end = z.raw.end
				return
			}
			if c == quote {
				z.pendingAttr[1].end = z.raw.end - 1
				return
			}
		}

	default:
		// Unquoted value: read until whitespace or '>'.
		z.pendingAttr[1].start = z.raw.end - 1
		for {
			c := z.readByte()
			if z.err != nil {
				z.pendingAttr[1].end = z.raw.end
				return
			}
			switch c {
			case ' ', '\n', '\r', '\t', '\f':
				z.pendingAttr[1].end = z.raw.end - 1
				return
			case '>':
				z.raw.end--
				z.pendingAttr[1].end = z.raw.end
				return
			}
		}
	}
}

// Next scans the next token and returns its type.
func (z *Tokenizer) Next() TokenType {
	// Record where this token starts for Token's Line/Column fields.
	z.tokenLine = z.currentLine
	z.tokenColumn = z.currentColumn

	z.raw.start = z.raw.end
	z.data.start = z.raw.end
	z.data.end = z.raw.end
	if z.err != nil {
		z.tt = ErrorToken

		return z.tt
	}
	if z.rawTag != "" {
		if z.rawTag == "plaintext" {
			// Read everything up to EOF.
			for z.err == nil {
				z.readByte()
			}
			z.data.end = z.raw.end
			z.textIsRaw = true
		} else {
			z.readRawOrRCDATA()
		}
		if z.data.end > z.data.start {
			z.tt = TextToken
			z.convertNUL = true
			return z.tt
		}
	}
	z.textIsRaw = false
	z.convertNUL = false

loop:
	for {
		c := z.readByte()
		if z.err != nil {
			break loop
		}
		if c != '<' {
			continue loop
		}

		// Check if the '<' we have just read is part of a tag, comment
		// or doctype. If not, it's part of the accumulated text token.
		c = z.readByte()
		if z.err != nil {
			break loop
		}
		var tokenType TokenType
		switch {
		case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
			tokenType = StartTagToken
		case c == '/':
			tokenType = EndTagToken
		case c == '!' || c == '?':
			// We use CommentToken to mean any of "<!--actual comments-->",
			// "<!DOCTYPE declarations>" and "<?xml processing instructions?>".
			tokenType = CommentToken
		default:
			// Reconsume the current character.
			z.raw.end--
			continue
		}

		// We have a non-text token, but we might have accumulated some text
		// before that. If so, we return the text first, and return the non-
		// text token on the subsequent call to Next.
		if x := z.raw.end - len("<a"); z.raw.start < x {
			z.raw.end = x
			z.data.end = x
			z.tt = TextToken
			return z.tt
		}
		switch tokenType {
		case StartTagToken:
			z.tt = z.readStartTag()
			return z.tt
		case EndTagToken:
			c = z.readByte()
			if z.err != nil {
				break loop
			}
			if c == '>' {
				// "</>" does not generate a token at all. Generate an empty comment
				// to allow passthrough clients to pick up the data using Raw.
				// Reset the tokenizer state and start again.
				z.tt = CommentToken
				return z.tt
			}
			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
				z.readTag(false)
				if z.err != nil {
					z.tt = ErrorToken
				} else {
					z.tt = EndTagToken
				}
				return z.tt
			}
			z.raw.end--
			z.readUntilCloseAngle()
			z.tt = CommentToken
			return z.tt
		case CommentToken:
			if c == '!' {
				z.tt = z.readMarkupDeclaration()
				return z.tt
			}
			z.raw.end--
			z.readUntilCloseAngle()
			z.tt = CommentToken
			return z.tt
		}
	}
	if z.raw.start < z.raw.end {
		z.data.end = z.raw.end
		z.tt = TextToken
		return z.tt
	}
	z.tt = ErrorToken
	return z.tt
}

// Raw returns the unmodified text of the current token. Calling Next, Token,
// Text, TagName or TagAttr may change the contents of the returned slice.
func (z *Tokenizer) Raw() []byte {
	return z.buf[z.raw.start:z.raw.end]
}

// convertNewlines converts "\r" and "\r\n" in s to "\n".
// The conversion happens in place, but the resulting slice may be shorter.
func convertNewlines(s []byte) []byte {
	for i, c := range s {
		if c != '\r' {
			continue
		}

		// A lone '\r' (not followed by '\n') is rewritten in place; only when
		// a "\r\n" pair is found do we need to start compacting the slice.
		src := i + 1
		if src >= len(s) || s[src] != '\n' {
			s[i] = '\n'
			continue
		}

		dst := i
		for src < len(s) {
			if s[src] == '\r' {
				if src+1 < len(s) && s[src+1] == '\n' {
					src++
				}
				s[dst] = '\n'
			} else {
				s[dst] = s[src]
			}
			src++
			dst++
		}
		return s[:dst]
	}
	return s
}

var (
	nul         = []byte("\x00")
	replacement = []byte("\ufffd")
)

// Text returns the unescaped text of a text, comment or doctype token. The
// contents of the returned slice may change on the next call to Next.
1139 func (z *Tokenizer) Text() []byte { 1140 switch z.tt { 1141 case TextToken, CommentToken, DoctypeToken: 1142 s := z.buf[z.data.start:z.data.end] 1143 z.data.start = z.raw.end 1144 z.data.end = z.raw.end 1145 s = convertNewlines(s) 1146 if (z.convertNUL || z.tt == CommentToken) && bytes.Contains(s, nul) { 1147 s = bytes.Replace(s, nul, replacement, -1) 1148 } 1149 if !z.textIsRaw { 1150 s = unescape(s, false) 1151 } 1152 return s 1153 } 1154 return nil 1155 } 1156 1157 // TagName returns the lower-cased name of a tag token (the `img` out of 1158 // `<IMG SRC="foo">`) and whether the tag has attributes. 1159 // The contents of the returned slice may change on the next call to Next. 1160 func (z *Tokenizer) TagName() (name []byte, hasAttr bool) { 1161 if z.data.start < z.data.end { 1162 switch z.tt { 1163 case StartTagToken, EndTagToken, SelfClosingTagToken: 1164 s := z.buf[z.data.start:z.data.end] 1165 z.data.start = z.raw.end 1166 z.data.end = z.raw.end 1167 if !strings.Contains(string(s), ":") { 1168 s = lower(s) 1169 } 1170 return s, z.nAttrReturned < len(z.attr) 1171 } 1172 } 1173 return nil, false 1174 } 1175 1176 // TagAttr returns the lower-cased key and unescaped value of the next unparsed 1177 // attribute for the current tag token and whether there are more attributes. 1178 // The contents of the returned slices may change on the next call to Next. 1179 func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) { 1180 if z.nAttrReturned < len(z.attr) { 1181 switch z.tt { 1182 case StartTagToken, SelfClosingTagToken: 1183 x := z.attr[z.nAttrReturned] 1184 z.nAttrReturned++ 1185 key = z.buf[x[0].start:x[0].end] 1186 val = z.buf[x[1].start:x[1].end] 1187 if !strings.HasPrefix(string(key), ":") { 1188 key = lower(key) 1189 } 1190 return key, unescape(convertNewlines(val), true), z.nAttrReturned < len(z.attr) 1191 } 1192 } 1193 return nil, nil, false 1194 } 1195 1196 // Token returns the current Token. 
The result's Data and Attr values remain 1197 // valid after subsequent Next calls. 1198 func (z *Tokenizer) Token() Token { 1199 t := Token{Type: z.tt, Line: z.tokenLine, Column: z.tokenColumn} 1200 switch z.tt { 1201 case TextToken, CommentToken, DoctypeToken: 1202 t.Data = string(z.Text()) 1203 case StartTagToken, SelfClosingTagToken, EndTagToken: 1204 name, moreAttr := z.TagName() 1205 for moreAttr { 1206 var key, val []byte 1207 key, val, moreAttr = z.TagAttr() 1208 t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)}) 1209 } 1210 t.Data = string(name) 1211 if a := atom.Lookup(name); a != 0 { 1212 t.DataAtom, t.Data = a, a.String() 1213 } else { 1214 t.DataAtom, t.Data = 0, string(name) 1215 } 1216 } 1217 return t 1218 } 1219 1220 // SetMaxBuf sets a limit on the amount of data buffered during tokenization. 1221 // A value of 0 means unlimited. 1222 func (z *Tokenizer) SetMaxBuf(n int) { 1223 z.maxBuf = n 1224 } 1225 1226 // NewTokenizer returns a new HTML Tokenizer for the given Reader. 1227 // The input is assumed to be UTF-8 encoded. 1228 func NewTokenizer(r io.Reader) *Tokenizer { 1229 return NewTokenizerFragment(r, "") 1230 } 1231 1232 // NewTokenizerFragment returns a new HTML Tokenizer for the given Reader, for 1233 // tokenizing an existing element's InnerHTML fragment. contextTag is that 1234 // element's tag, such as "div" or "iframe". 1235 // 1236 // For example, how the InnerHTML "a<b" is tokenized depends on whether it is 1237 // for a <p> tag or a <script> tag. 1238 // 1239 // The input is assumed to be UTF-8 encoded. 1240 func NewTokenizerFragment(r io.Reader, contextTag string) *Tokenizer { 1241 z := &Tokenizer{ 1242 r: r, 1243 buf: make([]byte, 0, 4096), 1244 } 1245 if contextTag != "" { 1246 switch s := strings.ToLower(contextTag); s { 1247 case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp": 1248 z.rawTag = s 1249 } 1250 } 1251 return z 1252 }