github.com/hxx258456/ccgo@v0.0.5-0.20230213014102-48b35f46f66f/net/html/parse.go (about) 1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package html 6 7 import ( 8 "errors" 9 "fmt" 10 "io" 11 "strings" 12 13 a "github.com/hxx258456/ccgo/net/html/atom" 14 ) 15 16 // A parser implements the HTML5 parsing algorithm: 17 // https://html.spec.whatwg.org/multipage/syntax.html#tree-construction 18 type parser struct { 19 // tokenizer provides the tokens for the parser. 20 tokenizer *Tokenizer 21 // tok is the most recently read token. 22 tok Token 23 // Self-closing tags like <hr/> are treated as start tags, except that 24 // hasSelfClosingToken is set while they are being processed. 25 hasSelfClosingToken bool 26 // doc is the document root element. 27 doc *Node 28 // The stack of open elements (section 12.2.4.2) and active formatting 29 // elements (section 12.2.4.3). 30 oe, afe nodeStack 31 // Element pointers (section 12.2.4.4). 32 head, form *Node 33 // Other parsing state flags (section 12.2.4.5). 34 scripting, framesetOK bool 35 // The stack of template insertion modes 36 templateStack insertionModeStack 37 // im is the current insertion mode. 38 im insertionMode 39 // originalIM is the insertion mode to go back to after completing a text 40 // or inTableText insertion mode. 41 originalIM insertionMode 42 // fosterParenting is whether new elements should be inserted according to 43 // the foster parenting rules (section 12.2.6.1). 44 fosterParenting bool 45 // quirks is whether the parser is operating in "quirks mode." 46 quirks bool 47 // fragment is whether the parser is parsing an HTML fragment. 48 fragment bool 49 // context is the context element when parsing an HTML fragment 50 // (section 12.4). 51 context *Node 52 } 53 54 func (p *parser) top() *Node { 55 if n := p.oe.top(); n != nil { 56 return n 57 } 58 return p.doc 59 } 60 61 // Stop tags for use in popUntil. These come from section 12.2.4.2. 62 var ( 63 defaultScopeStopTags = map[string][]a.Atom{ 64 "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template}, 65 "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext}, 66 "svg": {a.Desc, a.ForeignObject, a.Title}, 67 } 68 ) 69 70 type scope int 71 72 const ( 73 defaultScope scope = iota 74 listItemScope 75 buttonScope 76 tableScope 77 tableRowScope 78 tableBodyScope 79 selectScope 80 ) 81 82 // popUntil pops the stack of open elements at the highest element whose tag 83 // is in matchTags, provided there is no higher element in the scope's stop 84 // tags (as defined in section 12.2.4.2). It returns whether or not there was 85 // such an element. If there was not, popUntil leaves the stack unchanged. 86 // 87 // For example, the set of stop tags for table scope is: "html", "table". If 88 // the stack was: 89 // ["html", "body", "font", "table", "b", "i", "u"] 90 // then popUntil(tableScope, "font") would return false, but 91 // popUntil(tableScope, "i") would return true and the stack would become: 92 // ["html", "body", "font", "table", "b"] 93 // 94 // If an element's tag is in both the stop tags and matchTags, then the stack 95 // will be popped and the function returns true (provided, of course, there was 96 // no higher element in the stack that was also in the stop tags). For example, 97 // popUntil(tableScope, "table") returns true and leaves: 98 // ["html", "body", "font"] 99 func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool { 100 if i := p.indexOfElementInScope(s, matchTags...); i != -1 { 101 p.oe = p.oe[:i] 102 return true 103 } 104 return false 105 } 106 107 // indexOfElementInScope returns the index in p.oe of the highest element whose 108 // tag is in matchTags that is in scope. If no matching element is in scope, it 109 // returns -1. 110 func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int { 111 for i := len(p.oe) - 1; i >= 0; i-- { 112 tagAtom := p.oe[i].DataAtom 113 if p.oe[i].Namespace == "" { 114 for _, t := range matchTags { 115 if t == tagAtom { 116 return i 117 } 118 } 119 switch s { 120 case defaultScope: 121 // No-op. 122 case listItemScope: 123 if tagAtom == a.Ol || tagAtom == a.Ul { 124 return -1 125 } 126 case buttonScope: 127 if tagAtom == a.Button { 128 return -1 129 } 130 case tableScope: 131 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { 132 return -1 133 } 134 case selectScope: 135 if tagAtom != a.Optgroup && tagAtom != a.Option { 136 return -1 137 } 138 default: 139 panic("unreachable") 140 } 141 } 142 switch s { 143 case defaultScope, listItemScope, buttonScope: 144 for _, t := range defaultScopeStopTags[p.oe[i].Namespace] { 145 if t == tagAtom { 146 return -1 147 } 148 } 149 } 150 } 151 return -1 152 } 153 154 // elementInScope is like popUntil, except that it doesn't modify the stack of 155 // open elements. 156 func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool { 157 return p.indexOfElementInScope(s, matchTags...) != -1 158 } 159 160 // clearStackToContext pops elements off the stack of open elements until a 161 // scope-defined element is found. 162 func (p *parser) clearStackToContext(s scope) { 163 for i := len(p.oe) - 1; i >= 0; i-- { 164 tagAtom := p.oe[i].DataAtom 165 switch s { 166 case tableScope: 167 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { 168 p.oe = p.oe[:i+1] 169 return 170 } 171 case tableRowScope: 172 if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template { 173 p.oe = p.oe[:i+1] 174 return 175 } 176 case tableBodyScope: 177 if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template { 178 p.oe = p.oe[:i+1] 179 return 180 } 181 default: 182 panic("unreachable") 183 } 184 } 185 } 186 187 // parseGenericRawTextElements implements the generic raw text element parsing 188 // algorithm defined in 12.2.6.2. 189 // https://html.spec.whatwg.org/multipage/parsing.html#parsing-elements-that-contain-only-text 190 // TODO: Since both RAWTEXT and RCDATA states are treated as tokenizer's part 191 // officially, need to make tokenizer consider both states. 192 func (p *parser) parseGenericRawTextElement() { 193 p.addElement() 194 p.originalIM = p.im 195 p.im = textIM 196 } 197 198 // generateImpliedEndTags pops nodes off the stack of open elements as long as 199 // the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc. 200 // If exceptions are specified, nodes with that name will not be popped off. 201 func (p *parser) generateImpliedEndTags(exceptions ...string) { 202 var i int 203 loop: 204 for i = len(p.oe) - 1; i >= 0; i-- { 205 n := p.oe[i] 206 if n.Type != ElementNode { 207 break 208 } 209 switch n.DataAtom { 210 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc: 211 for _, except := range exceptions { 212 if n.Data == except { 213 break loop 214 } 215 } 216 continue 217 } 218 break 219 } 220 221 p.oe = p.oe[:i+1] 222 } 223 224 // addChild adds a child node n to the top element, and pushes n onto the stack 225 // of open elements if it is an element node. 226 func (p *parser) addChild(n *Node) { 227 if p.shouldFosterParent() { 228 p.fosterParent(n) 229 } else { 230 p.top().AppendChild(n) 231 } 232 233 if n.Type == ElementNode { 234 p.oe = append(p.oe, n) 235 } 236 } 237 238 // shouldFosterParent returns whether the next node to be added should be 239 // foster parented. 240 func (p *parser) shouldFosterParent() bool { 241 if p.fosterParenting { 242 switch p.top().DataAtom { 243 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 244 return true 245 } 246 } 247 return false 248 } 249 250 // fosterParent adds a child node according to the foster parenting rules. 251 // Section 12.2.6.1, "foster parenting". 252 func (p *parser) fosterParent(n *Node) { 253 var table, parent, prev, template *Node 254 var i int 255 for i = len(p.oe) - 1; i >= 0; i-- { 256 if p.oe[i].DataAtom == a.Table { 257 table = p.oe[i] 258 break 259 } 260 } 261 262 var j int 263 for j = len(p.oe) - 1; j >= 0; j-- { 264 if p.oe[j].DataAtom == a.Template { 265 template = p.oe[j] 266 break 267 } 268 } 269 270 if template != nil && (table == nil || j > i) { 271 template.AppendChild(n) 272 return 273 } 274 275 if table == nil { 276 // The foster parent is the html element. 277 parent = p.oe[0] 278 } else { 279 parent = table.Parent 280 } 281 if parent == nil { 282 parent = p.oe[i-1] 283 } 284 285 if table != nil { 286 prev = table.PrevSibling 287 } else { 288 prev = parent.LastChild 289 } 290 if prev != nil && prev.Type == TextNode && n.Type == TextNode { 291 prev.Data += n.Data 292 return 293 } 294 295 parent.InsertBefore(n, table) 296 } 297 298 // addText adds text to the preceding node if it is a text node, or else it 299 // calls addChild with a new text node. 300 func (p *parser) addText(text string) { 301 if text == "" { 302 return 303 } 304 305 if p.shouldFosterParent() { 306 p.fosterParent(&Node{ 307 Type: TextNode, 308 Data: text, 309 }) 310 return 311 } 312 313 t := p.top() 314 if n := t.LastChild; n != nil && n.Type == TextNode { 315 n.Data += text 316 return 317 } 318 p.addChild(&Node{ 319 Type: TextNode, 320 Data: text, 321 }) 322 } 323 324 // addElement adds a child element based on the current token. 325 func (p *parser) addElement() { 326 p.addChild(&Node{ 327 Type: ElementNode, 328 DataAtom: p.tok.DataAtom, 329 Data: p.tok.Data, 330 Attr: p.tok.Attr, 331 }) 332 } 333 334 // Section 12.2.4.3. 335 func (p *parser) addFormattingElement() { 336 tagAtom, attr := p.tok.DataAtom, p.tok.Attr 337 p.addElement() 338 339 // Implement the Noah's Ark clause, but with three per family instead of two. 340 identicalElements := 0 341 findIdenticalElements: 342 for i := len(p.afe) - 1; i >= 0; i-- { 343 n := p.afe[i] 344 if n.Type == scopeMarkerNode { 345 break 346 } 347 if n.Type != ElementNode { 348 continue 349 } 350 if n.Namespace != "" { 351 continue 352 } 353 if n.DataAtom != tagAtom { 354 continue 355 } 356 if len(n.Attr) != len(attr) { 357 continue 358 } 359 compareAttributes: 360 for _, t0 := range n.Attr { 361 for _, t1 := range attr { 362 if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val { 363 // Found a match for this attribute, continue with the next attribute. 364 continue compareAttributes 365 } 366 } 367 // If we get here, there is no attribute that matches a. 368 // Therefore the element is not identical to the new one. 369 continue findIdenticalElements 370 } 371 372 identicalElements++ 373 if identicalElements >= 3 { 374 p.afe.remove(n) 375 } 376 } 377 378 p.afe = append(p.afe, p.top()) 379 } 380 381 // Section 12.2.4.3. 382 func (p *parser) clearActiveFormattingElements() { 383 for { 384 if n := p.afe.pop(); len(p.afe) == 0 || n.Type == scopeMarkerNode { 385 return 386 } 387 } 388 } 389 390 // Section 12.2.4.3. 391 func (p *parser) reconstructActiveFormattingElements() { 392 n := p.afe.top() 393 if n == nil { 394 return 395 } 396 if n.Type == scopeMarkerNode || p.oe.index(n) != -1 { 397 return 398 } 399 i := len(p.afe) - 1 400 for n.Type != scopeMarkerNode && p.oe.index(n) == -1 { 401 if i == 0 { 402 i = -1 403 break 404 } 405 i-- 406 n = p.afe[i] 407 } 408 for { 409 i++ 410 clone := p.afe[i].clone() 411 p.addChild(clone) 412 p.afe[i] = clone 413 if i == len(p.afe)-1 { 414 break 415 } 416 } 417 } 418 419 // Section 12.2.5. 420 func (p *parser) acknowledgeSelfClosingTag() { 421 p.hasSelfClosingToken = false 422 } 423 424 // An insertion mode (section 12.2.4.1) is the state transition function from 425 // a particular state in the HTML5 parser's state machine. It updates the 426 // parser's fields depending on parser.tok (where ErrorToken means EOF). 427 // It returns whether the token was consumed. 428 type insertionMode func(*parser) bool 429 430 // setOriginalIM sets the insertion mode to return to after completing a text or 431 // inTableText insertion mode. 432 // Section 12.2.4.1, "using the rules for". 433 func (p *parser) setOriginalIM() { 434 if p.originalIM != nil { 435 panic("html: bad parser state: originalIM was set twice") 436 } 437 p.originalIM = p.im 438 } 439 440 // Section 12.2.4.1, "reset the insertion mode". 441 func (p *parser) resetInsertionMode() { 442 for i := len(p.oe) - 1; i >= 0; i-- { 443 n := p.oe[i] 444 last := i == 0 445 if last && p.context != nil { 446 n = p.context 447 } 448 449 switch n.DataAtom { 450 case a.Select: 451 if !last { 452 for ancestor, first := n, p.oe[0]; ancestor != first; { 453 ancestor = p.oe[p.oe.index(ancestor)-1] 454 switch ancestor.DataAtom { 455 case a.Template: 456 p.im = inSelectIM 457 return 458 case a.Table: 459 p.im = inSelectInTableIM 460 return 461 } 462 } 463 } 464 p.im = inSelectIM 465 case a.Td, a.Th: 466 // TODO: remove this divergence from the HTML5 spec. 467 // 468 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 469 p.im = inCellIM 470 case a.Tr: 471 p.im = inRowIM 472 case a.Tbody, a.Thead, a.Tfoot: 473 p.im = inTableBodyIM 474 case a.Caption: 475 p.im = inCaptionIM 476 case a.Colgroup: 477 p.im = inColumnGroupIM 478 case a.Table: 479 p.im = inTableIM 480 case a.Template: 481 // TODO: remove this divergence from the HTML5 spec. 482 if n.Namespace != "" { 483 continue 484 } 485 p.im = p.templateStack.top() 486 case a.Head: 487 // TODO: remove this divergence from the HTML5 spec. 488 // 489 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 490 p.im = inHeadIM 491 case a.Body: 492 p.im = inBodyIM 493 case a.Frameset: 494 p.im = inFramesetIM 495 case a.Html: 496 if p.head == nil { 497 p.im = beforeHeadIM 498 } else { 499 p.im = afterHeadIM 500 } 501 default: 502 if last { 503 p.im = inBodyIM 504 return 505 } 506 continue 507 } 508 return 509 } 510 } 511 512 const whitespace = " \t\r\n\f" 513 514 // Section 12.2.6.4.1. 515 func initialIM(p *parser) bool { 516 switch p.tok.Type { 517 case TextToken: 518 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 519 if len(p.tok.Data) == 0 { 520 // It was all whitespace, so ignore it. 521 return true 522 } 523 case CommentToken: 524 p.doc.AppendChild(&Node{ 525 Type: CommentNode, 526 Data: p.tok.Data, 527 }) 528 return true 529 case DoctypeToken: 530 n, quirks := parseDoctype(p.tok.Data) 531 p.doc.AppendChild(n) 532 p.quirks = quirks 533 p.im = beforeHTMLIM 534 return true 535 } 536 p.quirks = true 537 p.im = beforeHTMLIM 538 return false 539 } 540 541 // Section 12.2.6.4.2. 542 func beforeHTMLIM(p *parser) bool { 543 switch p.tok.Type { 544 case DoctypeToken: 545 // Ignore the token. 546 return true 547 case TextToken: 548 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 549 if len(p.tok.Data) == 0 { 550 // It was all whitespace, so ignore it. 551 return true 552 } 553 case StartTagToken: 554 if p.tok.DataAtom == a.Html { 555 p.addElement() 556 p.im = beforeHeadIM 557 return true 558 } 559 case EndTagToken: 560 switch p.tok.DataAtom { 561 case a.Head, a.Body, a.Html, a.Br: 562 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) 563 return false 564 default: 565 // Ignore the token. 566 return true 567 } 568 case CommentToken: 569 p.doc.AppendChild(&Node{ 570 Type: CommentNode, 571 Data: p.tok.Data, 572 }) 573 return true 574 } 575 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) 576 return false 577 } 578 579 // Section 12.2.6.4.3. 580 func beforeHeadIM(p *parser) bool { 581 switch p.tok.Type { 582 case TextToken: 583 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 584 if len(p.tok.Data) == 0 { 585 // It was all whitespace, so ignore it. 586 return true 587 } 588 case StartTagToken: 589 switch p.tok.DataAtom { 590 case a.Head: 591 p.addElement() 592 p.head = p.top() 593 p.im = inHeadIM 594 return true 595 case a.Html: 596 return inBodyIM(p) 597 } 598 case EndTagToken: 599 switch p.tok.DataAtom { 600 case a.Head, a.Body, a.Html, a.Br: 601 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) 602 return false 603 default: 604 // Ignore the token. 605 return true 606 } 607 case CommentToken: 608 p.addChild(&Node{ 609 Type: CommentNode, 610 Data: p.tok.Data, 611 }) 612 return true 613 case DoctypeToken: 614 // Ignore the token. 615 return true 616 } 617 618 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) 619 return false 620 } 621 622 // Section 12.2.6.4.4. 623 func inHeadIM(p *parser) bool { 624 switch p.tok.Type { 625 case TextToken: 626 s := strings.TrimLeft(p.tok.Data, whitespace) 627 if len(s) < len(p.tok.Data) { 628 // Add the initial whitespace to the current node. 629 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 630 if s == "" { 631 return true 632 } 633 p.tok.Data = s 634 } 635 case StartTagToken: 636 switch p.tok.DataAtom { 637 case a.Html: 638 return inBodyIM(p) 639 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta: 640 p.addElement() 641 p.oe.pop() 642 p.acknowledgeSelfClosingTag() 643 return true 644 case a.Noscript: 645 if p.scripting { 646 p.parseGenericRawTextElement() 647 return true 648 } 649 p.addElement() 650 p.im = inHeadNoscriptIM 651 // Don't let the tokenizer go into raw text mode when scripting is disabled. 652 p.tokenizer.NextIsNotRawText() 653 return true 654 case a.Script, a.Title: 655 p.addElement() 656 p.setOriginalIM() 657 p.im = textIM 658 return true 659 case a.Noframes, a.Style: 660 p.parseGenericRawTextElement() 661 return true 662 case a.Head: 663 // Ignore the token. 664 return true 665 case a.Template: 666 // TODO: remove this divergence from the HTML5 spec. 667 // 668 // We don't handle all of the corner cases when mixing foreign 669 // content (i.e. <math> or <svg>) with <template>. Without this 670 // early return, we can get into an infinite loop, possibly because 671 // of the "TODO... further divergence" a little below. 672 // 673 // As a workaround, if we are mixing foreign content and templates, 674 // just ignore the rest of the HTML. Foreign content is rare and a 675 // relatively old HTML feature. Templates are also rare and a 676 // relatively new HTML feature. Their combination is very rare. 677 for _, e := range p.oe { 678 if e.Namespace != "" { 679 p.im = ignoreTheRemainingTokens 680 return true 681 } 682 } 683 684 p.addElement() 685 p.afe = append(p.afe, &scopeMarker) 686 p.framesetOK = false 687 p.im = inTemplateIM 688 p.templateStack = append(p.templateStack, inTemplateIM) 689 return true 690 } 691 case EndTagToken: 692 switch p.tok.DataAtom { 693 case a.Head: 694 p.oe.pop() 695 p.im = afterHeadIM 696 return true 697 case a.Body, a.Html, a.Br: 698 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) 699 return false 700 case a.Template: 701 if !p.oe.contains(a.Template) { 702 return true 703 } 704 // TODO: remove this further divergence from the HTML5 spec. 705 // 706 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 707 p.generateImpliedEndTags() 708 for i := len(p.oe) - 1; i >= 0; i-- { 709 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { 710 p.oe = p.oe[:i] 711 break 712 } 713 } 714 p.clearActiveFormattingElements() 715 p.templateStack.pop() 716 p.resetInsertionMode() 717 return true 718 default: 719 // Ignore the token. 720 return true 721 } 722 case CommentToken: 723 p.addChild(&Node{ 724 Type: CommentNode, 725 Data: p.tok.Data, 726 }) 727 return true 728 case DoctypeToken: 729 // Ignore the token. 730 return true 731 } 732 733 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) 734 return false 735 } 736 737 // 12.2.6.4.5. 738 func inHeadNoscriptIM(p *parser) bool { 739 switch p.tok.Type { 740 case DoctypeToken: 741 // Ignore the token. 742 return true 743 case StartTagToken: 744 switch p.tok.DataAtom { 745 case a.Html: 746 return inBodyIM(p) 747 case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style: 748 return inHeadIM(p) 749 case a.Head: 750 // Ignore the token. 751 return true 752 case a.Noscript: 753 // Don't let the tokenizer go into raw text mode even when a <noscript> 754 // tag is in "in head noscript" insertion mode. 755 p.tokenizer.NextIsNotRawText() 756 // Ignore the token. 757 return true 758 } 759 case EndTagToken: 760 switch p.tok.DataAtom { 761 case a.Noscript, a.Br: 762 default: 763 // Ignore the token. 764 return true 765 } 766 case TextToken: 767 s := strings.TrimLeft(p.tok.Data, whitespace) 768 if len(s) == 0 { 769 // It was all whitespace. 770 return inHeadIM(p) 771 } 772 case CommentToken: 773 return inHeadIM(p) 774 } 775 p.oe.pop() 776 if p.top().DataAtom != a.Head { 777 panic("html: the new current node will be a head element.") 778 } 779 p.im = inHeadIM 780 if p.tok.DataAtom == a.Noscript { 781 return true 782 } 783 return false 784 } 785 786 // Section 12.2.6.4.6. 787 func afterHeadIM(p *parser) bool { 788 switch p.tok.Type { 789 case TextToken: 790 s := strings.TrimLeft(p.tok.Data, whitespace) 791 if len(s) < len(p.tok.Data) { 792 // Add the initial whitespace to the current node. 793 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 794 if s == "" { 795 return true 796 } 797 p.tok.Data = s 798 } 799 case StartTagToken: 800 switch p.tok.DataAtom { 801 case a.Html: 802 return inBodyIM(p) 803 case a.Body: 804 p.addElement() 805 p.framesetOK = false 806 p.im = inBodyIM 807 return true 808 case a.Frameset: 809 p.addElement() 810 p.im = inFramesetIM 811 return true 812 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 813 p.oe = append(p.oe, p.head) 814 defer p.oe.remove(p.head) 815 return inHeadIM(p) 816 case a.Head: 817 // Ignore the token. 818 return true 819 } 820 case EndTagToken: 821 switch p.tok.DataAtom { 822 case a.Body, a.Html, a.Br: 823 // Drop down to creating an implied <body> tag. 824 case a.Template: 825 return inHeadIM(p) 826 default: 827 // Ignore the token. 828 return true 829 } 830 case CommentToken: 831 p.addChild(&Node{ 832 Type: CommentNode, 833 Data: p.tok.Data, 834 }) 835 return true 836 case DoctypeToken: 837 // Ignore the token. 838 return true 839 } 840 841 p.parseImpliedToken(StartTagToken, a.Body, a.Body.String()) 842 p.framesetOK = true 843 return false 844 } 845 846 // copyAttributes copies attributes of src not found on dst to dst. 847 func copyAttributes(dst *Node, src Token) { 848 if len(src.Attr) == 0 { 849 return 850 } 851 attr := map[string]string{} 852 for _, t := range dst.Attr { 853 attr[t.Key] = t.Val 854 } 855 for _, t := range src.Attr { 856 if _, ok := attr[t.Key]; !ok { 857 dst.Attr = append(dst.Attr, t) 858 attr[t.Key] = t.Val 859 } 860 } 861 } 862 863 // Section 12.2.6.4.7. 864 func inBodyIM(p *parser) bool { 865 switch p.tok.Type { 866 case TextToken: 867 d := p.tok.Data 868 switch n := p.oe.top(); n.DataAtom { 869 case a.Pre, a.Listing: 870 if n.FirstChild == nil { 871 // Ignore a newline at the start of a <pre> block. 872 if d != "" && d[0] == '\r' { 873 d = d[1:] 874 } 875 if d != "" && d[0] == '\n' { 876 d = d[1:] 877 } 878 } 879 } 880 d = strings.Replace(d, "\x00", "", -1) 881 if d == "" { 882 return true 883 } 884 p.reconstructActiveFormattingElements() 885 p.addText(d) 886 if p.framesetOK && strings.TrimLeft(d, whitespace) != "" { 887 // There were non-whitespace characters inserted. 888 p.framesetOK = false 889 } 890 case StartTagToken: 891 switch p.tok.DataAtom { 892 case a.Html: 893 if p.oe.contains(a.Template) { 894 return true 895 } 896 copyAttributes(p.oe[0], p.tok) 897 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 898 return inHeadIM(p) 899 case a.Body: 900 if p.oe.contains(a.Template) { 901 return true 902 } 903 if len(p.oe) >= 2 { 904 body := p.oe[1] 905 if body.Type == ElementNode && body.DataAtom == a.Body { 906 p.framesetOK = false 907 copyAttributes(body, p.tok) 908 } 909 } 910 case a.Frameset: 911 if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body { 912 // Ignore the token. 913 return true 914 } 915 body := p.oe[1] 916 if body.Parent != nil { 917 body.Parent.RemoveChild(body) 918 } 919 p.oe = p.oe[:1] 920 p.addElement() 921 p.im = inFramesetIM 922 return true 923 case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Main, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul: 924 p.popUntil(buttonScope, a.P) 925 p.addElement() 926 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: 927 p.popUntil(buttonScope, a.P) 928 switch n := p.top(); n.DataAtom { 929 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: 930 p.oe.pop() 931 } 932 p.addElement() 933 case a.Pre, a.Listing: 934 p.popUntil(buttonScope, a.P) 935 p.addElement() 936 // The newline, if any, will be dealt with by the TextToken case. 937 p.framesetOK = false 938 case a.Form: 939 if p.form != nil && !p.oe.contains(a.Template) { 940 // Ignore the token 941 return true 942 } 943 p.popUntil(buttonScope, a.P) 944 p.addElement() 945 if !p.oe.contains(a.Template) { 946 p.form = p.top() 947 } 948 case a.Li: 949 p.framesetOK = false 950 for i := len(p.oe) - 1; i >= 0; i-- { 951 node := p.oe[i] 952 switch node.DataAtom { 953 case a.Li: 954 p.oe = p.oe[:i] 955 case a.Address, a.Div, a.P: 956 continue 957 default: 958 if !isSpecialElement(node) { 959 continue 960 } 961 } 962 break 963 } 964 p.popUntil(buttonScope, a.P) 965 p.addElement() 966 case a.Dd, a.Dt: 967 p.framesetOK = false 968 for i := len(p.oe) - 1; i >= 0; i-- { 969 node := p.oe[i] 970 switch node.DataAtom { 971 case a.Dd, a.Dt: 972 p.oe = p.oe[:i] 973 case a.Address, a.Div, a.P: 974 continue 975 default: 976 if !isSpecialElement(node) { 977 continue 978 } 979 } 980 break 981 } 982 p.popUntil(buttonScope, a.P) 983 p.addElement() 984 case a.Plaintext: 985 p.popUntil(buttonScope, a.P) 986 p.addElement() 987 case a.Button: 988 p.popUntil(defaultScope, a.Button) 989 p.reconstructActiveFormattingElements() 990 p.addElement() 991 p.framesetOK = false 992 case a.A: 993 for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- { 994 if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A { 995 p.inBodyEndTagFormatting(a.A, "a") 996 p.oe.remove(n) 997 p.afe.remove(n) 998 break 999 } 1000 } 1001 p.reconstructActiveFormattingElements() 1002 p.addFormattingElement() 1003 case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: 1004 p.reconstructActiveFormattingElements() 1005 p.addFormattingElement() 1006 case a.Nobr: 1007 p.reconstructActiveFormattingElements() 1008 if p.elementInScope(defaultScope, a.Nobr) { 1009 p.inBodyEndTagFormatting(a.Nobr, "nobr") 1010 p.reconstructActiveFormattingElements() 1011 } 1012 p.addFormattingElement() 1013 case a.Applet, a.Marquee, a.Object: 1014 p.reconstructActiveFormattingElements() 1015 p.addElement() 1016 p.afe = append(p.afe, &scopeMarker) 1017 p.framesetOK = false 1018 case a.Table: 1019 if !p.quirks { 1020 p.popUntil(buttonScope, a.P) 1021 } 1022 p.addElement() 1023 p.framesetOK = false 1024 p.im = inTableIM 1025 return true 1026 case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr: 1027 p.reconstructActiveFormattingElements() 1028 p.addElement() 1029 p.oe.pop() 1030 p.acknowledgeSelfClosingTag() 1031 if p.tok.DataAtom == a.Input { 1032 for _, t := range p.tok.Attr { 1033 if t.Key == "type" { 1034 if strings.ToLower(t.Val) == "hidden" { 1035 // Skip setting framesetOK = false 1036 return true 1037 } 1038 } 1039 } 1040 } 1041 p.framesetOK = false 1042 case a.Param, a.Source, a.Track: 1043 p.addElement() 1044 p.oe.pop() 1045 p.acknowledgeSelfClosingTag() 1046 case a.Hr: 1047 p.popUntil(buttonScope, a.P) 1048 p.addElement() 1049 p.oe.pop() 1050 p.acknowledgeSelfClosingTag() 1051 p.framesetOK = false 1052 case a.Image: 1053 p.tok.DataAtom = a.Img 1054 p.tok.Data = a.Img.String() 1055 return false 1056 case a.Textarea: 1057 p.addElement() 1058 p.setOriginalIM() 1059 p.framesetOK = false 1060 p.im = textIM 1061 case a.Xmp: 1062 p.popUntil(buttonScope, a.P) 1063 p.reconstructActiveFormattingElements() 1064 p.framesetOK = false 1065 p.parseGenericRawTextElement() 1066 case a.Iframe: 1067 p.framesetOK = false 1068 p.parseGenericRawTextElement() 1069 case a.Noembed: 1070 p.parseGenericRawTextElement() 1071 case a.Noscript: 1072 if p.scripting { 1073 p.parseGenericRawTextElement() 1074 return true 1075 } 1076 p.reconstructActiveFormattingElements() 1077 p.addElement() 1078 // Don't let the tokenizer go into raw text mode when scripting is disabled. 1079 p.tokenizer.NextIsNotRawText() 1080 case a.Select: 1081 p.reconstructActiveFormattingElements() 1082 p.addElement() 1083 p.framesetOK = false 1084 p.im = inSelectIM 1085 return true 1086 case a.Optgroup, a.Option: 1087 if p.top().DataAtom == a.Option { 1088 p.oe.pop() 1089 } 1090 p.reconstructActiveFormattingElements() 1091 p.addElement() 1092 case a.Rb, a.Rtc: 1093 if p.elementInScope(defaultScope, a.Ruby) { 1094 p.generateImpliedEndTags() 1095 } 1096 p.addElement() 1097 case a.Rp, a.Rt: 1098 if p.elementInScope(defaultScope, a.Ruby) { 1099 p.generateImpliedEndTags("rtc") 1100 } 1101 p.addElement() 1102 case a.Math, a.Svg: 1103 p.reconstructActiveFormattingElements() 1104 if p.tok.DataAtom == a.Math { 1105 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) 1106 } else { 1107 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) 1108 } 1109 adjustForeignAttributes(p.tok.Attr) 1110 p.addElement() 1111 p.top().Namespace = p.tok.Data 1112 if p.hasSelfClosingToken { 1113 p.oe.pop() 1114 p.acknowledgeSelfClosingTag() 1115 } 1116 return true 1117 case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1118 // Ignore the token. 1119 default: 1120 p.reconstructActiveFormattingElements() 1121 p.addElement() 1122 } 1123 case EndTagToken: 1124 switch p.tok.DataAtom { 1125 case a.Body: 1126 if p.elementInScope(defaultScope, a.Body) { 1127 p.im = afterBodyIM 1128 } 1129 case a.Html: 1130 if p.elementInScope(defaultScope, a.Body) { 1131 p.parseImpliedToken(EndTagToken, a.Body, a.Body.String()) 1132 return false 1133 } 1134 return true 1135 case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dialog, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Main, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul: 1136 p.popUntil(defaultScope, p.tok.DataAtom) 1137 case a.Form: 1138 if p.oe.contains(a.Template) { 1139 i := p.indexOfElementInScope(defaultScope, a.Form) 1140 if i == -1 { 1141 // Ignore the token. 1142 return true 1143 } 1144 p.generateImpliedEndTags() 1145 if p.oe[i].DataAtom != a.Form { 1146 // Ignore the token. 1147 return true 1148 } 1149 p.popUntil(defaultScope, a.Form) 1150 } else { 1151 node := p.form 1152 p.form = nil 1153 i := p.indexOfElementInScope(defaultScope, a.Form) 1154 if node == nil || i == -1 || p.oe[i] != node { 1155 // Ignore the token. 1156 return true 1157 } 1158 p.generateImpliedEndTags() 1159 p.oe.remove(node) 1160 } 1161 case a.P: 1162 if !p.elementInScope(buttonScope, a.P) { 1163 p.parseImpliedToken(StartTagToken, a.P, a.P.String()) 1164 } 1165 p.popUntil(buttonScope, a.P) 1166 case a.Li: 1167 p.popUntil(listItemScope, a.Li) 1168 case a.Dd, a.Dt: 1169 p.popUntil(defaultScope, p.tok.DataAtom) 1170 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: 1171 p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6) 1172 case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: 1173 p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data) 1174 case a.Applet, a.Marquee, a.Object: 1175 if p.popUntil(defaultScope, p.tok.DataAtom) { 1176 p.clearActiveFormattingElements() 1177 } 1178 case a.Br: 1179 p.tok.Type = StartTagToken 1180 return false 1181 case a.Template: 1182 return inHeadIM(p) 1183 default: 1184 p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data) 1185 } 1186 case CommentToken: 1187 p.addChild(&Node{ 1188 Type: CommentNode, 1189 Data: p.tok.Data, 1190 }) 1191 case ErrorToken: 1192 // TODO: remove this divergence from the HTML5 spec. 1193 if len(p.templateStack) > 0 { 1194 p.im = inTemplateIM 1195 return false 1196 } 1197 for _, e := range p.oe { 1198 switch e.DataAtom { 1199 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th, 1200 a.Thead, a.Tr, a.Body, a.Html: 1201 default: 1202 return true 1203 } 1204 } 1205 } 1206 1207 return true 1208 } 1209 1210 func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) { 1211 // This is the "adoption agency" algorithm, described at 1212 // https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency 1213 1214 // TODO: this is a fairly literal line-by-line translation of that algorithm. 1215 // Once the code successfully parses the comprehensive test suite, we should 1216 // refactor this code to be more idiomatic. 1217 1218 // Steps 1-2 1219 if current := p.oe.top(); current.Data == tagName && p.afe.index(current) == -1 { 1220 p.oe.pop() 1221 return 1222 } 1223 1224 // Steps 3-5. The outer loop. 1225 for i := 0; i < 8; i++ { 1226 // Step 6. Find the formatting element. 1227 var formattingElement *Node 1228 for j := len(p.afe) - 1; j >= 0; j-- { 1229 if p.afe[j].Type == scopeMarkerNode { 1230 break 1231 } 1232 if p.afe[j].DataAtom == tagAtom { 1233 formattingElement = p.afe[j] 1234 break 1235 } 1236 } 1237 if formattingElement == nil { 1238 p.inBodyEndTagOther(tagAtom, tagName) 1239 return 1240 } 1241 1242 // Step 7. Ignore the tag if formatting element is not in the stack of open elements. 1243 feIndex := p.oe.index(formattingElement) 1244 if feIndex == -1 { 1245 p.afe.remove(formattingElement) 1246 return 1247 } 1248 // Step 8. Ignore the tag if formatting element is not in the scope. 1249 if !p.elementInScope(defaultScope, tagAtom) { 1250 // Ignore the tag. 1251 return 1252 } 1253 1254 // Step 9. This step is omitted because it's just a parse error but no need to return. 1255 1256 // Steps 10-11. Find the furthest block. 1257 var furthestBlock *Node 1258 for _, e := range p.oe[feIndex:] { 1259 if isSpecialElement(e) { 1260 furthestBlock = e 1261 break 1262 } 1263 } 1264 if furthestBlock == nil { 1265 e := p.oe.pop() 1266 for e != formattingElement { 1267 e = p.oe.pop() 1268 } 1269 p.afe.remove(e) 1270 return 1271 } 1272 1273 // Steps 12-13. Find the common ancestor and bookmark node. 1274 commonAncestor := p.oe[feIndex-1] 1275 bookmark := p.afe.index(formattingElement) 1276 1277 // Step 14. The inner loop. Find the lastNode to reparent. 1278 lastNode := furthestBlock 1279 node := furthestBlock 1280 x := p.oe.index(node) 1281 // Step 14.1. 1282 j := 0 1283 for { 1284 // Step 14.2. 1285 j++ 1286 // Step. 14.3. 1287 x-- 1288 node = p.oe[x] 1289 // Step 14.4. Go to the next step if node is formatting element. 1290 if node == formattingElement { 1291 break 1292 } 1293 // Step 14.5. Remove node from the list of active formatting elements if 1294 // inner loop counter is greater than three and node is in the list of 1295 // active formatting elements. 1296 if ni := p.afe.index(node); j > 3 && ni > -1 { 1297 p.afe.remove(node) 1298 // If any element of the list of active formatting elements is removed, 1299 // we need to take care whether bookmark should be decremented or not. 1300 // This is because the value of bookmark may exceed the size of the 1301 // list by removing elements from the list. 1302 if ni <= bookmark { 1303 bookmark-- 1304 } 1305 continue 1306 } 1307 // Step 14.6. Continue the next inner loop if node is not in the list of 1308 // active formatting elements. 1309 if p.afe.index(node) == -1 { 1310 p.oe.remove(node) 1311 continue 1312 } 1313 // Step 14.7. 1314 clone := node.clone() 1315 p.afe[p.afe.index(node)] = clone 1316 p.oe[p.oe.index(node)] = clone 1317 node = clone 1318 // Step 14.8. 1319 if lastNode == furthestBlock { 1320 bookmark = p.afe.index(node) + 1 1321 } 1322 // Step 14.9. 1323 if lastNode.Parent != nil { 1324 lastNode.Parent.RemoveChild(lastNode) 1325 } 1326 node.AppendChild(lastNode) 1327 // Step 14.10. 1328 lastNode = node 1329 } 1330 1331 // Step 15. Reparent lastNode to the common ancestor, 1332 // or for misnested table nodes, to the foster parent. 1333 if lastNode.Parent != nil { 1334 lastNode.Parent.RemoveChild(lastNode) 1335 } 1336 switch commonAncestor.DataAtom { 1337 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1338 p.fosterParent(lastNode) 1339 default: 1340 commonAncestor.AppendChild(lastNode) 1341 } 1342 1343 // Steps 16-18. Reparent nodes from the furthest block's children 1344 // to a clone of the formatting element. 1345 clone := formattingElement.clone() 1346 reparentChildren(clone, furthestBlock) 1347 furthestBlock.AppendChild(clone) 1348 1349 // Step 19. Fix up the list of active formatting elements. 1350 if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark { 1351 // Move the bookmark with the rest of the list. 1352 bookmark-- 1353 } 1354 p.afe.remove(formattingElement) 1355 p.afe.insert(bookmark, clone) 1356 1357 // Step 20. Fix up the stack of open elements. 1358 p.oe.remove(formattingElement) 1359 p.oe.insert(p.oe.index(furthestBlock)+1, clone) 1360 } 1361 } 1362 1363 // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM. 1364 // "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content 1365 // https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign 1366 func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) { 1367 for i := len(p.oe) - 1; i >= 0; i-- { 1368 // Two element nodes have the same tag if they have the same Data (a 1369 // string-typed field). As an optimization, for common HTML tags, each 1370 // Data string is assigned a unique, non-zero DataAtom (a uint32-typed 1371 // field), since integer comparison is faster than string comparison. 1372 // Uncommon (custom) tags get a zero DataAtom. 1373 // 1374 // The if condition here is equivalent to (p.oe[i].Data == tagName). 1375 if (p.oe[i].DataAtom == tagAtom) && 1376 ((tagAtom != 0) || (p.oe[i].Data == tagName)) { 1377 p.oe = p.oe[:i] 1378 break 1379 } 1380 if isSpecialElement(p.oe[i]) { 1381 break 1382 } 1383 } 1384 } 1385 1386 // Section 12.2.6.4.8. 1387 func textIM(p *parser) bool { 1388 switch p.tok.Type { 1389 case ErrorToken: 1390 p.oe.pop() 1391 case TextToken: 1392 d := p.tok.Data 1393 if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil { 1394 // Ignore a newline at the start of a <textarea> block. 1395 if d != "" && d[0] == '\r' { 1396 d = d[1:] 1397 } 1398 if d != "" && d[0] == '\n' { 1399 d = d[1:] 1400 } 1401 } 1402 if d == "" { 1403 return true 1404 } 1405 p.addText(d) 1406 return true 1407 case EndTagToken: 1408 p.oe.pop() 1409 } 1410 p.im = p.originalIM 1411 p.originalIM = nil 1412 return p.tok.Type == EndTagToken 1413 } 1414 1415 // Section 12.2.6.4.9. 1416 func inTableIM(p *parser) bool { 1417 switch p.tok.Type { 1418 case TextToken: 1419 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1) 1420 switch p.oe.top().DataAtom { 1421 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1422 if strings.Trim(p.tok.Data, whitespace) == "" { 1423 p.addText(p.tok.Data) 1424 return true 1425 } 1426 } 1427 case StartTagToken: 1428 switch p.tok.DataAtom { 1429 case a.Caption: 1430 p.clearStackToContext(tableScope) 1431 p.afe = append(p.afe, &scopeMarker) 1432 p.addElement() 1433 p.im = inCaptionIM 1434 return true 1435 case a.Colgroup: 1436 p.clearStackToContext(tableScope) 1437 p.addElement() 1438 p.im = inColumnGroupIM 1439 return true 1440 case a.Col: 1441 p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String()) 1442 return false 1443 case a.Tbody, a.Tfoot, a.Thead: 1444 p.clearStackToContext(tableScope) 1445 p.addElement() 1446 p.im = inTableBodyIM 1447 return true 1448 case a.Td, a.Th, a.Tr: 1449 p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String()) 1450 return false 1451 case a.Table: 1452 if p.popUntil(tableScope, a.Table) { 1453 p.resetInsertionMode() 1454 return false 1455 } 1456 // Ignore the token. 1457 return true 1458 case a.Style, a.Script, a.Template: 1459 return inHeadIM(p) 1460 case a.Input: 1461 for _, t := range p.tok.Attr { 1462 if t.Key == "type" && strings.ToLower(t.Val) == "hidden" { 1463 p.addElement() 1464 p.oe.pop() 1465 return true 1466 } 1467 } 1468 // Otherwise drop down to the default action. 1469 case a.Form: 1470 if p.oe.contains(a.Template) || p.form != nil { 1471 // Ignore the token. 1472 return true 1473 } 1474 p.addElement() 1475 p.form = p.oe.pop() 1476 case a.Select: 1477 p.reconstructActiveFormattingElements() 1478 switch p.top().DataAtom { 1479 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1480 p.fosterParenting = true 1481 } 1482 p.addElement() 1483 p.fosterParenting = false 1484 p.framesetOK = false 1485 p.im = inSelectInTableIM 1486 return true 1487 } 1488 case EndTagToken: 1489 switch p.tok.DataAtom { 1490 case a.Table: 1491 if p.popUntil(tableScope, a.Table) { 1492 p.resetInsertionMode() 1493 return true 1494 } 1495 // Ignore the token. 1496 return true 1497 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1498 // Ignore the token. 1499 return true 1500 case a.Template: 1501 return inHeadIM(p) 1502 } 1503 case CommentToken: 1504 p.addChild(&Node{ 1505 Type: CommentNode, 1506 Data: p.tok.Data, 1507 }) 1508 return true 1509 case DoctypeToken: 1510 // Ignore the token. 1511 return true 1512 case ErrorToken: 1513 return inBodyIM(p) 1514 } 1515 1516 p.fosterParenting = true 1517 defer func() { p.fosterParenting = false }() 1518 1519 return inBodyIM(p) 1520 } 1521 1522 // Section 12.2.6.4.11. 1523 func inCaptionIM(p *parser) bool { 1524 switch p.tok.Type { 1525 case StartTagToken: 1526 switch p.tok.DataAtom { 1527 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr: 1528 if !p.popUntil(tableScope, a.Caption) { 1529 // Ignore the token. 1530 return true 1531 } 1532 p.clearActiveFormattingElements() 1533 p.im = inTableIM 1534 return false 1535 case a.Select: 1536 p.reconstructActiveFormattingElements() 1537 p.addElement() 1538 p.framesetOK = false 1539 p.im = inSelectInTableIM 1540 return true 1541 } 1542 case EndTagToken: 1543 switch p.tok.DataAtom { 1544 case a.Caption: 1545 if p.popUntil(tableScope, a.Caption) { 1546 p.clearActiveFormattingElements() 1547 p.im = inTableIM 1548 } 1549 return true 1550 case a.Table: 1551 if !p.popUntil(tableScope, a.Caption) { 1552 // Ignore the token. 1553 return true 1554 } 1555 p.clearActiveFormattingElements() 1556 p.im = inTableIM 1557 return false 1558 case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1559 // Ignore the token. 1560 return true 1561 } 1562 } 1563 return inBodyIM(p) 1564 } 1565 1566 // Section 12.2.6.4.12. 1567 func inColumnGroupIM(p *parser) bool { 1568 switch p.tok.Type { 1569 case TextToken: 1570 s := strings.TrimLeft(p.tok.Data, whitespace) 1571 if len(s) < len(p.tok.Data) { 1572 // Add the initial whitespace to the current node. 1573 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 1574 if s == "" { 1575 return true 1576 } 1577 p.tok.Data = s 1578 } 1579 case CommentToken: 1580 p.addChild(&Node{ 1581 Type: CommentNode, 1582 Data: p.tok.Data, 1583 }) 1584 return true 1585 case DoctypeToken: 1586 // Ignore the token. 1587 return true 1588 case StartTagToken: 1589 switch p.tok.DataAtom { 1590 case a.Html: 1591 return inBodyIM(p) 1592 case a.Col: 1593 p.addElement() 1594 p.oe.pop() 1595 p.acknowledgeSelfClosingTag() 1596 return true 1597 case a.Template: 1598 return inHeadIM(p) 1599 } 1600 case EndTagToken: 1601 switch p.tok.DataAtom { 1602 case a.Colgroup: 1603 if p.oe.top().DataAtom == a.Colgroup { 1604 p.oe.pop() 1605 p.im = inTableIM 1606 } 1607 return true 1608 case a.Col: 1609 // Ignore the token. 1610 return true 1611 case a.Template: 1612 return inHeadIM(p) 1613 } 1614 case ErrorToken: 1615 return inBodyIM(p) 1616 } 1617 if p.oe.top().DataAtom != a.Colgroup { 1618 return true 1619 } 1620 p.oe.pop() 1621 p.im = inTableIM 1622 return false 1623 } 1624 1625 // Section 12.2.6.4.13. 1626 func inTableBodyIM(p *parser) bool { 1627 switch p.tok.Type { 1628 case StartTagToken: 1629 switch p.tok.DataAtom { 1630 case a.Tr: 1631 p.clearStackToContext(tableBodyScope) 1632 p.addElement() 1633 p.im = inRowIM 1634 return true 1635 case a.Td, a.Th: 1636 p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String()) 1637 return false 1638 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1639 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1640 p.im = inTableIM 1641 return false 1642 } 1643 // Ignore the token. 1644 return true 1645 } 1646 case EndTagToken: 1647 switch p.tok.DataAtom { 1648 case a.Tbody, a.Tfoot, a.Thead: 1649 if p.elementInScope(tableScope, p.tok.DataAtom) { 1650 p.clearStackToContext(tableBodyScope) 1651 p.oe.pop() 1652 p.im = inTableIM 1653 } 1654 return true 1655 case a.Table: 1656 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1657 p.im = inTableIM 1658 return false 1659 } 1660 // Ignore the token. 1661 return true 1662 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr: 1663 // Ignore the token. 1664 return true 1665 } 1666 case CommentToken: 1667 p.addChild(&Node{ 1668 Type: CommentNode, 1669 Data: p.tok.Data, 1670 }) 1671 return true 1672 } 1673 1674 return inTableIM(p) 1675 } 1676 1677 // Section 12.2.6.4.14. 1678 func inRowIM(p *parser) bool { 1679 switch p.tok.Type { 1680 case StartTagToken: 1681 switch p.tok.DataAtom { 1682 case a.Td, a.Th: 1683 p.clearStackToContext(tableRowScope) 1684 p.addElement() 1685 p.afe = append(p.afe, &scopeMarker) 1686 p.im = inCellIM 1687 return true 1688 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1689 if p.popUntil(tableScope, a.Tr) { 1690 p.im = inTableBodyIM 1691 return false 1692 } 1693 // Ignore the token. 1694 return true 1695 } 1696 case EndTagToken: 1697 switch p.tok.DataAtom { 1698 case a.Tr: 1699 if p.popUntil(tableScope, a.Tr) { 1700 p.im = inTableBodyIM 1701 return true 1702 } 1703 // Ignore the token. 1704 return true 1705 case a.Table: 1706 if p.popUntil(tableScope, a.Tr) { 1707 p.im = inTableBodyIM 1708 return false 1709 } 1710 // Ignore the token. 1711 return true 1712 case a.Tbody, a.Tfoot, a.Thead: 1713 if p.elementInScope(tableScope, p.tok.DataAtom) { 1714 p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String()) 1715 return false 1716 } 1717 // Ignore the token. 1718 return true 1719 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th: 1720 // Ignore the token. 1721 return true 1722 } 1723 } 1724 1725 return inTableIM(p) 1726 } 1727 1728 // Section 12.2.6.4.15. 1729 func inCellIM(p *parser) bool { 1730 switch p.tok.Type { 1731 case StartTagToken: 1732 switch p.tok.DataAtom { 1733 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1734 if p.popUntil(tableScope, a.Td, a.Th) { 1735 // Close the cell and reprocess. 1736 p.clearActiveFormattingElements() 1737 p.im = inRowIM 1738 return false 1739 } 1740 // Ignore the token. 1741 return true 1742 case a.Select: 1743 p.reconstructActiveFormattingElements() 1744 p.addElement() 1745 p.framesetOK = false 1746 p.im = inSelectInTableIM 1747 return true 1748 } 1749 case EndTagToken: 1750 switch p.tok.DataAtom { 1751 case a.Td, a.Th: 1752 if !p.popUntil(tableScope, p.tok.DataAtom) { 1753 // Ignore the token. 1754 return true 1755 } 1756 p.clearActiveFormattingElements() 1757 p.im = inRowIM 1758 return true 1759 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html: 1760 // Ignore the token. 1761 return true 1762 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1763 if !p.elementInScope(tableScope, p.tok.DataAtom) { 1764 // Ignore the token. 1765 return true 1766 } 1767 // Close the cell and reprocess. 1768 if p.popUntil(tableScope, a.Td, a.Th) { 1769 p.clearActiveFormattingElements() 1770 } 1771 p.im = inRowIM 1772 return false 1773 } 1774 } 1775 return inBodyIM(p) 1776 } 1777 1778 // Section 12.2.6.4.16. 1779 func inSelectIM(p *parser) bool { 1780 switch p.tok.Type { 1781 case TextToken: 1782 p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) 1783 case StartTagToken: 1784 switch p.tok.DataAtom { 1785 case a.Html: 1786 return inBodyIM(p) 1787 case a.Option: 1788 if p.top().DataAtom == a.Option { 1789 p.oe.pop() 1790 } 1791 p.addElement() 1792 case a.Optgroup: 1793 if p.top().DataAtom == a.Option { 1794 p.oe.pop() 1795 } 1796 if p.top().DataAtom == a.Optgroup { 1797 p.oe.pop() 1798 } 1799 p.addElement() 1800 case a.Select: 1801 if !p.popUntil(selectScope, a.Select) { 1802 // Ignore the token. 1803 return true 1804 } 1805 p.resetInsertionMode() 1806 case a.Input, a.Keygen, a.Textarea: 1807 if p.elementInScope(selectScope, a.Select) { 1808 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) 1809 return false 1810 } 1811 // In order to properly ignore <textarea>, we need to change the tokenizer mode. 1812 p.tokenizer.NextIsNotRawText() 1813 // Ignore the token. 1814 return true 1815 case a.Script, a.Template: 1816 return inHeadIM(p) 1817 case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Style, a.Title, a.Xmp: 1818 // Don't let the tokenizer go into raw text mode when there are raw tags 1819 // to be ignored. These tags should be ignored from the tokenizer 1820 // properly. 1821 p.tokenizer.NextIsNotRawText() 1822 // Ignore the token. 1823 return true 1824 } 1825 case EndTagToken: 1826 switch p.tok.DataAtom { 1827 case a.Option: 1828 if p.top().DataAtom == a.Option { 1829 p.oe.pop() 1830 } 1831 case a.Optgroup: 1832 i := len(p.oe) - 1 1833 if p.oe[i].DataAtom == a.Option { 1834 i-- 1835 } 1836 if p.oe[i].DataAtom == a.Optgroup { 1837 p.oe = p.oe[:i] 1838 } 1839 case a.Select: 1840 if !p.popUntil(selectScope, a.Select) { 1841 // Ignore the token. 1842 return true 1843 } 1844 p.resetInsertionMode() 1845 case a.Template: 1846 return inHeadIM(p) 1847 } 1848 case CommentToken: 1849 p.addChild(&Node{ 1850 Type: CommentNode, 1851 Data: p.tok.Data, 1852 }) 1853 case DoctypeToken: 1854 // Ignore the token. 1855 return true 1856 case ErrorToken: 1857 return inBodyIM(p) 1858 } 1859 1860 return true 1861 } 1862 1863 // Section 12.2.6.4.17. 1864 func inSelectInTableIM(p *parser) bool { 1865 switch p.tok.Type { 1866 case StartTagToken, EndTagToken: 1867 switch p.tok.DataAtom { 1868 case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th: 1869 if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) { 1870 // Ignore the token. 1871 return true 1872 } 1873 // This is like p.popUntil(selectScope, a.Select), but it also 1874 // matches <math select>, not just <select>. Matching the MathML 1875 // tag is arguably incorrect (conceptually), but it mimics what 1876 // Chromium does. 1877 for i := len(p.oe) - 1; i >= 0; i-- { 1878 if n := p.oe[i]; n.DataAtom == a.Select { 1879 p.oe = p.oe[:i] 1880 break 1881 } 1882 } 1883 p.resetInsertionMode() 1884 return false 1885 } 1886 } 1887 return inSelectIM(p) 1888 } 1889 1890 // Section 12.2.6.4.18. 1891 func inTemplateIM(p *parser) bool { 1892 switch p.tok.Type { 1893 case TextToken, CommentToken, DoctypeToken: 1894 return inBodyIM(p) 1895 case StartTagToken: 1896 switch p.tok.DataAtom { 1897 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 1898 return inHeadIM(p) 1899 case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1900 p.templateStack.pop() 1901 p.templateStack = append(p.templateStack, inTableIM) 1902 p.im = inTableIM 1903 return false 1904 case a.Col: 1905 p.templateStack.pop() 1906 p.templateStack = append(p.templateStack, inColumnGroupIM) 1907 p.im = inColumnGroupIM 1908 return false 1909 case a.Tr: 1910 p.templateStack.pop() 1911 p.templateStack = append(p.templateStack, inTableBodyIM) 1912 p.im = inTableBodyIM 1913 return false 1914 case a.Td, a.Th: 1915 p.templateStack.pop() 1916 p.templateStack = append(p.templateStack, inRowIM) 1917 p.im = inRowIM 1918 return false 1919 default: 1920 p.templateStack.pop() 1921 p.templateStack = append(p.templateStack, inBodyIM) 1922 p.im = inBodyIM 1923 return false 1924 } 1925 case EndTagToken: 1926 switch p.tok.DataAtom { 1927 case a.Template: 1928 return inHeadIM(p) 1929 default: 1930 // Ignore the token. 1931 return true 1932 } 1933 case ErrorToken: 1934 if !p.oe.contains(a.Template) { 1935 // Ignore the token. 1936 return true 1937 } 1938 // TODO: remove this divergence from the HTML5 spec. 1939 // 1940 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 1941 p.generateImpliedEndTags() 1942 for i := len(p.oe) - 1; i >= 0; i-- { 1943 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { 1944 p.oe = p.oe[:i] 1945 break 1946 } 1947 } 1948 p.clearActiveFormattingElements() 1949 p.templateStack.pop() 1950 p.resetInsertionMode() 1951 return false 1952 } 1953 return false 1954 } 1955 1956 // Section 12.2.6.4.19. 1957 func afterBodyIM(p *parser) bool { 1958 switch p.tok.Type { 1959 case ErrorToken: 1960 // Stop parsing. 1961 return true 1962 case TextToken: 1963 s := strings.TrimLeft(p.tok.Data, whitespace) 1964 if len(s) == 0 { 1965 // It was all whitespace. 1966 return inBodyIM(p) 1967 } 1968 case StartTagToken: 1969 if p.tok.DataAtom == a.Html { 1970 return inBodyIM(p) 1971 } 1972 case EndTagToken: 1973 if p.tok.DataAtom == a.Html { 1974 if !p.fragment { 1975 p.im = afterAfterBodyIM 1976 } 1977 return true 1978 } 1979 case CommentToken: 1980 // The comment is attached to the <html> element. 1981 if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html { 1982 panic("html: bad parser state: <html> element not found, in the after-body insertion mode") 1983 } 1984 p.oe[0].AppendChild(&Node{ 1985 Type: CommentNode, 1986 Data: p.tok.Data, 1987 }) 1988 return true 1989 } 1990 p.im = inBodyIM 1991 return false 1992 } 1993 1994 // Section 12.2.6.4.20. 1995 func inFramesetIM(p *parser) bool { 1996 switch p.tok.Type { 1997 case CommentToken: 1998 p.addChild(&Node{ 1999 Type: CommentNode, 2000 Data: p.tok.Data, 2001 }) 2002 case TextToken: 2003 // Ignore all text but whitespace. 2004 s := strings.Map(func(c rune) rune { 2005 switch c { 2006 case ' ', '\t', '\n', '\f', '\r': 2007 return c 2008 } 2009 return -1 2010 }, p.tok.Data) 2011 if s != "" { 2012 p.addText(s) 2013 } 2014 case StartTagToken: 2015 switch p.tok.DataAtom { 2016 case a.Html: 2017 return inBodyIM(p) 2018 case a.Frameset: 2019 p.addElement() 2020 case a.Frame: 2021 p.addElement() 2022 p.oe.pop() 2023 p.acknowledgeSelfClosingTag() 2024 case a.Noframes: 2025 return inHeadIM(p) 2026 } 2027 case EndTagToken: 2028 switch p.tok.DataAtom { 2029 case a.Frameset: 2030 if p.oe.top().DataAtom != a.Html { 2031 p.oe.pop() 2032 if p.oe.top().DataAtom != a.Frameset { 2033 p.im = afterFramesetIM 2034 return true 2035 } 2036 } 2037 } 2038 default: 2039 // Ignore the token. 2040 } 2041 return true 2042 } 2043 2044 // Section 12.2.6.4.21. 2045 func afterFramesetIM(p *parser) bool { 2046 switch p.tok.Type { 2047 case CommentToken: 2048 p.addChild(&Node{ 2049 Type: CommentNode, 2050 Data: p.tok.Data, 2051 }) 2052 case TextToken: 2053 // Ignore all text but whitespace. 2054 s := strings.Map(func(c rune) rune { 2055 switch c { 2056 case ' ', '\t', '\n', '\f', '\r': 2057 return c 2058 } 2059 return -1 2060 }, p.tok.Data) 2061 if s != "" { 2062 p.addText(s) 2063 } 2064 case StartTagToken: 2065 switch p.tok.DataAtom { 2066 case a.Html: 2067 return inBodyIM(p) 2068 case a.Noframes: 2069 return inHeadIM(p) 2070 } 2071 case EndTagToken: 2072 switch p.tok.DataAtom { 2073 case a.Html: 2074 p.im = afterAfterFramesetIM 2075 return true 2076 } 2077 default: 2078 // Ignore the token. 2079 } 2080 return true 2081 } 2082 2083 // Section 12.2.6.4.22. 2084 func afterAfterBodyIM(p *parser) bool { 2085 switch p.tok.Type { 2086 case ErrorToken: 2087 // Stop parsing. 2088 return true 2089 case TextToken: 2090 s := strings.TrimLeft(p.tok.Data, whitespace) 2091 if len(s) == 0 { 2092 // It was all whitespace. 2093 return inBodyIM(p) 2094 } 2095 case StartTagToken: 2096 if p.tok.DataAtom == a.Html { 2097 return inBodyIM(p) 2098 } 2099 case CommentToken: 2100 p.doc.AppendChild(&Node{ 2101 Type: CommentNode, 2102 Data: p.tok.Data, 2103 }) 2104 return true 2105 case DoctypeToken: 2106 return inBodyIM(p) 2107 } 2108 p.im = inBodyIM 2109 return false 2110 } 2111 2112 // Section 12.2.6.4.23. 2113 func afterAfterFramesetIM(p *parser) bool { 2114 switch p.tok.Type { 2115 case CommentToken: 2116 p.doc.AppendChild(&Node{ 2117 Type: CommentNode, 2118 Data: p.tok.Data, 2119 }) 2120 case TextToken: 2121 // Ignore all text but whitespace. 2122 s := strings.Map(func(c rune) rune { 2123 switch c { 2124 case ' ', '\t', '\n', '\f', '\r': 2125 return c 2126 } 2127 return -1 2128 }, p.tok.Data) 2129 if s != "" { 2130 p.tok.Data = s 2131 return inBodyIM(p) 2132 } 2133 case StartTagToken: 2134 switch p.tok.DataAtom { 2135 case a.Html: 2136 return inBodyIM(p) 2137 case a.Noframes: 2138 return inHeadIM(p) 2139 } 2140 case DoctypeToken: 2141 return inBodyIM(p) 2142 default: 2143 // Ignore the token. 2144 } 2145 return true 2146 } 2147 2148 func ignoreTheRemainingTokens(p *parser) bool { 2149 return true 2150 } 2151 2152 const whitespaceOrNUL = whitespace + "\x00" 2153 2154 // Section 12.2.6.5 2155 func parseForeignContent(p *parser) bool { 2156 switch p.tok.Type { 2157 case TextToken: 2158 if p.framesetOK { 2159 p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" 2160 } 2161 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) 2162 p.addText(p.tok.Data) 2163 case CommentToken: 2164 p.addChild(&Node{ 2165 Type: CommentNode, 2166 Data: p.tok.Data, 2167 }) 2168 case StartTagToken: 2169 if !p.fragment { 2170 b := breakout[p.tok.Data] 2171 if p.tok.DataAtom == a.Font { 2172 loop: 2173 for _, attr := range p.tok.Attr { 2174 switch attr.Key { 2175 case "color", "face", "size": 2176 b = true 2177 break loop 2178 } 2179 } 2180 } 2181 if b { 2182 for i := len(p.oe) - 1; i >= 0; i-- { 2183 n := p.oe[i] 2184 if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { 2185 p.oe = p.oe[:i+1] 2186 break 2187 } 2188 } 2189 return false 2190 } 2191 } 2192 current := p.adjustedCurrentNode() 2193 switch current.Namespace { 2194 case "math": 2195 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) 2196 case "svg": 2197 // Adjust SVG tag names. The tokenizer lower-cases tag names, but 2198 // SVG wants e.g. "foreignObject" with a capital second "O". 2199 if x := svgTagNameAdjustments[p.tok.Data]; x != "" { 2200 p.tok.DataAtom = a.Lookup([]byte(x)) 2201 p.tok.Data = x 2202 } 2203 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) 2204 default: 2205 panic("html: bad parser state: unexpected namespace") 2206 } 2207 adjustForeignAttributes(p.tok.Attr) 2208 namespace := current.Namespace 2209 p.addElement() 2210 p.top().Namespace = namespace 2211 if namespace != "" { 2212 // Don't let the tokenizer go into raw text mode in foreign content 2213 // (e.g. in an SVG <title> tag). 2214 p.tokenizer.NextIsNotRawText() 2215 } 2216 if p.hasSelfClosingToken { 2217 p.oe.pop() 2218 p.acknowledgeSelfClosingTag() 2219 } 2220 case EndTagToken: 2221 for i := len(p.oe) - 1; i >= 0; i-- { 2222 if p.oe[i].Namespace == "" { 2223 return p.im(p) 2224 } 2225 if strings.EqualFold(p.oe[i].Data, p.tok.Data) { 2226 p.oe = p.oe[:i] 2227 break 2228 } 2229 } 2230 return true 2231 default: 2232 // Ignore the token. 2233 } 2234 return true 2235 } 2236 2237 // Section 12.2.4.2. 2238 func (p *parser) adjustedCurrentNode() *Node { 2239 if len(p.oe) == 1 && p.fragment && p.context != nil { 2240 return p.context 2241 } 2242 return p.oe.top() 2243 } 2244 2245 // Section 12.2.6. 2246 func (p *parser) inForeignContent() bool { 2247 if len(p.oe) == 0 { 2248 return false 2249 } 2250 n := p.adjustedCurrentNode() 2251 if n.Namespace == "" { 2252 return false 2253 } 2254 if mathMLTextIntegrationPoint(n) { 2255 if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark { 2256 return false 2257 } 2258 if p.tok.Type == TextToken { 2259 return false 2260 } 2261 } 2262 if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg { 2263 return false 2264 } 2265 if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) { 2266 return false 2267 } 2268 if p.tok.Type == ErrorToken { 2269 return false 2270 } 2271 return true 2272 } 2273 2274 // parseImpliedToken parses a token as though it had appeared in the parser's 2275 // input. 2276 func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) { 2277 realToken, selfClosing := p.tok, p.hasSelfClosingToken 2278 p.tok = Token{ 2279 Type: t, 2280 DataAtom: dataAtom, 2281 Data: data, 2282 } 2283 p.hasSelfClosingToken = false 2284 p.parseCurrentToken() 2285 p.tok, p.hasSelfClosingToken = realToken, selfClosing 2286 } 2287 2288 // parseCurrentToken runs the current token through the parsing routines 2289 // until it is consumed. 2290 func (p *parser) parseCurrentToken() { 2291 if p.tok.Type == SelfClosingTagToken { 2292 p.hasSelfClosingToken = true 2293 p.tok.Type = StartTagToken 2294 } 2295 2296 consumed := false 2297 for !consumed { 2298 if p.inForeignContent() { 2299 consumed = parseForeignContent(p) 2300 } else { 2301 consumed = p.im(p) 2302 } 2303 } 2304 2305 if p.hasSelfClosingToken { 2306 // This is a parse error, but ignore it. 2307 p.hasSelfClosingToken = false 2308 } 2309 } 2310 2311 func (p *parser) parse() error { 2312 // Iterate until EOF. Any other error will cause an early return. 2313 var err error 2314 for err != io.EOF { 2315 // CDATA sections are allowed only in foreign content. 2316 n := p.oe.top() 2317 p.tokenizer.AllowCDATA(n != nil && n.Namespace != "") 2318 // Read and parse the next token. 2319 p.tokenizer.Next() 2320 p.tok = p.tokenizer.Token() 2321 if p.tok.Type == ErrorToken { 2322 err = p.tokenizer.Err() 2323 if err != nil && err != io.EOF { 2324 return err 2325 } 2326 } 2327 p.parseCurrentToken() 2328 } 2329 return nil 2330 } 2331 2332 // Parse returns the parse tree for the HTML from the given Reader. 2333 // 2334 // It implements the HTML5 parsing algorithm 2335 // (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction), 2336 // which is very complicated. The resultant tree can contain implicitly created 2337 // nodes that have no explicit <tag> listed in r's data, and nodes' parents can 2338 // differ from the nesting implied by a naive processing of start and end 2339 // <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped, 2340 // with no corresponding node in the resulting tree. 2341 // 2342 // The input is assumed to be UTF-8 encoded. 2343 func Parse(r io.Reader) (*Node, error) { 2344 return ParseWithOptions(r) 2345 } 2346 2347 // ParseFragment parses a fragment of HTML and returns the nodes that were 2348 // found. If the fragment is the InnerHTML for an existing element, pass that 2349 // element in context. 2350 // 2351 // It has the same intricacies as Parse. 2352 func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { 2353 return ParseFragmentWithOptions(r, context) 2354 } 2355 2356 // ParseOption configures a parser. 2357 type ParseOption func(p *parser) 2358 2359 // ParseOptionEnableScripting configures the scripting flag. 2360 // https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting 2361 // 2362 // By default, scripting is enabled. 2363 func ParseOptionEnableScripting(enable bool) ParseOption { 2364 return func(p *parser) { 2365 p.scripting = enable 2366 } 2367 } 2368 2369 // ParseWithOptions is like Parse, with options. 2370 func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) { 2371 p := &parser{ 2372 tokenizer: NewTokenizer(r), 2373 doc: &Node{ 2374 Type: DocumentNode, 2375 }, 2376 scripting: true, 2377 framesetOK: true, 2378 im: initialIM, 2379 } 2380 2381 for _, f := range opts { 2382 f(p) 2383 } 2384 2385 if err := p.parse(); err != nil { 2386 return nil, err 2387 } 2388 return p.doc, nil 2389 } 2390 2391 // ParseFragmentWithOptions is like ParseFragment, with options. 2392 func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) { 2393 contextTag := "" 2394 if context != nil { 2395 if context.Type != ElementNode { 2396 return nil, errors.New("html: ParseFragment of non-element Node") 2397 } 2398 // The next check isn't just context.DataAtom.String() == context.Data because 2399 // it is valid to pass an element whose tag isn't a known atom. For example, 2400 // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. 2401 if context.DataAtom != a.Lookup([]byte(context.Data)) { 2402 return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) 2403 } 2404 contextTag = context.DataAtom.String() 2405 } 2406 p := &parser{ 2407 doc: &Node{ 2408 Type: DocumentNode, 2409 }, 2410 scripting: true, 2411 fragment: true, 2412 context: context, 2413 } 2414 if context != nil && context.Namespace != "" { 2415 p.tokenizer = NewTokenizer(r) 2416 } else { 2417 p.tokenizer = NewTokenizerFragment(r, contextTag) 2418 } 2419 2420 for _, f := range opts { 2421 f(p) 2422 } 2423 2424 root := &Node{ 2425 Type: ElementNode, 2426 DataAtom: a.Html, 2427 Data: a.Html.String(), 2428 } 2429 p.doc.AppendChild(root) 2430 p.oe = nodeStack{root} 2431 if context != nil && context.DataAtom == a.Template { 2432 p.templateStack = append(p.templateStack, inTemplateIM) 2433 } 2434 p.resetInsertionMode() 2435 2436 for n := context; n != nil; n = n.Parent { 2437 if n.Type == ElementNode && n.DataAtom == a.Form { 2438 p.form = n 2439 break 2440 } 2441 } 2442 2443 if err := p.parse(); err != nil { 2444 return nil, err 2445 } 2446 2447 parent := p.doc 2448 if context != nil { 2449 parent = root 2450 } 2451 2452 var result []*Node 2453 for c := parent.FirstChild; c != nil; { 2454 next := c.NextSibling 2455 parent.RemoveChild(c) 2456 result = append(result, c) 2457 c = next 2458 } 2459 return result, nil 2460 }