github.com/vugu/vugu@v0.3.5/internal/htmlx/parse.go (about) 1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package htmlx 6 7 import ( 8 "errors" 9 "fmt" 10 "io" 11 "strings" 12 13 a "github.com/vugu/vugu/internal/htmlx/atom" 14 ) 15 16 // A parser implements the HTML5 parsing algorithm: 17 // https://html.spec.whatwg.org/multipage/syntax.html#tree-construction 18 type parser struct { 19 // tokenizer provides the tokens for the parser. 20 tokenizer *Tokenizer 21 // tok is the most recently read token. 22 tok Token 23 // Self-closing tags like <hr/> are treated as start tags, except that 24 // hasSelfClosingToken is set while they are being processed. 25 hasSelfClosingToken bool 26 // doc is the document root element. 27 doc *Node 28 // The stack of open elements (section 12.2.4.2) and active formatting 29 // elements (section 12.2.4.3). 30 oe, afe nodeStack 31 // Element pointers (section 12.2.4.4). 32 head, form *Node 33 // Other parsing state flags (section 12.2.4.5). 34 scripting, framesetOK bool 35 // The stack of template insertion modes 36 templateStack insertionModeStack 37 // im is the current insertion mode. 38 im insertionMode 39 // originalIM is the insertion mode to go back to after completing a text 40 // or inTableText insertion mode. 41 originalIM insertionMode 42 // fosterParenting is whether new elements should be inserted according to 43 // the foster parenting rules (section 12.2.6.1). 44 fosterParenting bool 45 // quirks is whether the parser is operating in "quirks mode." 46 quirks bool 47 // fragment is whether the parser is parsing an HTML fragment. 48 fragment bool 49 // context is the context element when parsing an HTML fragment 50 // (section 12.4). 51 context *Node 52 } 53 54 func (p *parser) top() *Node { 55 if n := p.oe.top(); n != nil { 56 return n 57 } 58 return p.doc 59 } 60 61 // Stop tags for use in popUntil. These come from section 12.2.4.2. 62 var ( 63 defaultScopeStopTags = map[string][]a.Atom{ 64 "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template}, 65 "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext}, 66 "svg": {a.Desc, a.ForeignObject, a.Title}, 67 } 68 ) 69 70 type scope int 71 72 const ( 73 defaultScope scope = iota 74 listItemScope 75 buttonScope 76 tableScope 77 tableRowScope 78 tableBodyScope 79 selectScope 80 ) 81 82 // popUntil pops the stack of open elements at the highest element whose tag 83 // is in matchTags, provided there is no higher element in the scope's stop 84 // tags (as defined in section 12.2.4.2). It returns whether or not there was 85 // such an element. If there was not, popUntil leaves the stack unchanged. 86 // 87 // For example, the set of stop tags for table scope is: "html", "table". If 88 // the stack was: 89 // ["html", "body", "font", "table", "b", "i", "u"] 90 // then popUntil(tableScope, "font") would return false, but 91 // popUntil(tableScope, "i") would return true and the stack would become: 92 // ["html", "body", "font", "table", "b"] 93 // 94 // If an element's tag is in both the stop tags and matchTags, then the stack 95 // will be popped and the function returns true (provided, of course, there was 96 // no higher element in the stack that was also in the stop tags). For example, 97 // popUntil(tableScope, "table") returns true and leaves: 98 // ["html", "body", "font"] 99 func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool { 100 if i := p.indexOfElementInScope(s, matchTags...); i != -1 { 101 p.oe = p.oe[:i] 102 return true 103 } 104 return false 105 } 106 107 // indexOfElementInScope returns the index in p.oe of the highest element whose 108 // tag is in matchTags that is in scope. If no matching element is in scope, it 109 // returns -1. 110 func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int { 111 for i := len(p.oe) - 1; i >= 0; i-- { 112 tagAtom := p.oe[i].DataAtom 113 if p.oe[i].Namespace == "" { 114 for _, t := range matchTags { 115 if t == tagAtom { 116 return i 117 } 118 } 119 switch s { 120 case defaultScope: 121 // No-op. 122 case listItemScope: 123 if tagAtom == a.Ol || tagAtom == a.Ul { 124 return -1 125 } 126 case buttonScope: 127 if tagAtom == a.Button { 128 return -1 129 } 130 case tableScope: 131 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { 132 return -1 133 } 134 case selectScope: 135 if tagAtom != a.Optgroup && tagAtom != a.Option { 136 return -1 137 } 138 default: 139 panic("unreachable") 140 } 141 } 142 switch s { 143 case defaultScope, listItemScope, buttonScope: 144 for _, t := range defaultScopeStopTags[p.oe[i].Namespace] { 145 if t == tagAtom { 146 return -1 147 } 148 } 149 } 150 } 151 return -1 152 } 153 154 // elementInScope is like popUntil, except that it doesn't modify the stack of 155 // open elements. 156 func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool { 157 return p.indexOfElementInScope(s, matchTags...) != -1 158 } 159 160 // clearStackToContext pops elements off the stack of open elements until a 161 // scope-defined element is found. 162 func (p *parser) clearStackToContext(s scope) { 163 for i := len(p.oe) - 1; i >= 0; i-- { 164 tagAtom := p.oe[i].DataAtom 165 switch s { 166 case tableScope: 167 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { 168 p.oe = p.oe[:i+1] 169 return 170 } 171 case tableRowScope: 172 if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template { 173 p.oe = p.oe[:i+1] 174 return 175 } 176 case tableBodyScope: 177 if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template { 178 p.oe = p.oe[:i+1] 179 return 180 } 181 default: 182 panic("unreachable") 183 } 184 } 185 } 186 187 // generateImpliedEndTags pops nodes off the stack of open elements as long as 188 // the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc. 189 // If exceptions are specified, nodes with that name will not be popped off. 190 func (p *parser) generateImpliedEndTags(exceptions ...string) { 191 var i int 192 loop: 193 for i = len(p.oe) - 1; i >= 0; i-- { 194 n := p.oe[i] 195 if n.Type == ElementNode { 196 switch n.DataAtom { 197 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc: 198 for _, except := range exceptions { 199 if n.Data == except { 200 break loop 201 } 202 } 203 continue 204 } 205 } 206 break 207 } 208 209 p.oe = p.oe[:i+1] 210 } 211 212 // addChild adds a child node n to the top element, and pushes n onto the stack 213 // of open elements if it is an element node. 214 func (p *parser) addChild(n *Node) { 215 if p.shouldFosterParent() { 216 p.fosterParent(n) 217 } else { 218 p.top().AppendChild(n) 219 } 220 221 if n.Type == ElementNode { 222 p.oe = append(p.oe, n) 223 } 224 } 225 226 // shouldFosterParent returns whether the next node to be added should be 227 // foster parented. 228 func (p *parser) shouldFosterParent() bool { 229 if p.fosterParenting { 230 switch p.top().DataAtom { 231 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 232 return true 233 } 234 } 235 return false 236 } 237 238 // fosterParent adds a child node according to the foster parenting rules. 239 // Section 12.2.6.1, "foster parenting". 240 func (p *parser) fosterParent(n *Node) { 241 var table, parent, prev, template *Node 242 var i int 243 for i = len(p.oe) - 1; i >= 0; i-- { 244 if p.oe[i].DataAtom == a.Table { 245 table = p.oe[i] 246 break 247 } 248 } 249 250 var j int 251 for j = len(p.oe) - 1; j >= 0; j-- { 252 if p.oe[j].DataAtom == a.Template { 253 template = p.oe[j] 254 break 255 } 256 } 257 258 if template != nil && (table == nil || j > i) { 259 template.AppendChild(n) 260 return 261 } 262 263 if table == nil { 264 // The foster parent is the html element. 265 parent = p.oe[0] 266 } else { 267 parent = table.Parent 268 } 269 if parent == nil { 270 parent = p.oe[i-1] 271 } 272 273 if table != nil { 274 prev = table.PrevSibling 275 } else { 276 prev = parent.LastChild 277 } 278 if prev != nil && prev.Type == TextNode && n.Type == TextNode { 279 prev.Data += n.Data 280 return 281 } 282 283 parent.InsertBefore(n, table) 284 } 285 286 // addText adds text to the preceding node if it is a text node, or else it 287 // calls addChild with a new text node. 288 func (p *parser) addText(text string) { 289 if text == "" { 290 return 291 } 292 293 if p.shouldFosterParent() { 294 p.fosterParent(&Node{ 295 Type: TextNode, 296 Data: text, 297 Line: p.tok.Line, // ? 298 Column: p.tok.Column, // ? 299 }) 300 return 301 } 302 303 t := p.top() 304 if n := t.LastChild; n != nil && n.Type == TextNode { 305 n.Data += text 306 return 307 } 308 p.addChild(&Node{ 309 Type: TextNode, 310 Data: text, 311 Line: p.tok.Line, // ? 312 Column: p.tok.Column, // ? 313 }) 314 } 315 316 // addElement adds a child element based on the current token. 317 func (p *parser) addElement() { 318 p.addChild(&Node{ 319 Type: ElementNode, 320 DataAtom: p.tok.DataAtom, 321 Data: p.tok.Data, 322 Attr: p.tok.Attr, 323 Line: p.tok.Line, 324 Column: p.tok.Column, 325 }) 326 } 327 328 // Section 12.2.4.3. 329 func (p *parser) addFormattingElement() { 330 tagAtom, attr := p.tok.DataAtom, p.tok.Attr 331 p.addElement() 332 333 // Implement the Noah's Ark clause, but with three per family instead of two. 334 identicalElements := 0 335 findIdenticalElements: 336 for i := len(p.afe) - 1; i >= 0; i-- { 337 n := p.afe[i] 338 if n.Type == scopeMarkerNode { 339 break 340 } 341 if n.Type != ElementNode { 342 continue 343 } 344 if n.Namespace != "" { 345 continue 346 } 347 if n.DataAtom != tagAtom { 348 continue 349 } 350 if len(n.Attr) != len(attr) { 351 continue 352 } 353 compareAttributes: 354 for _, t0 := range n.Attr { 355 for _, t1 := range attr { 356 if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val { 357 // Found a match for this attribute, continue with the next attribute. 358 continue compareAttributes 359 } 360 } 361 // If we get here, there is no attribute that matches a. 362 // Therefore the element is not identical to the new one. 363 continue findIdenticalElements 364 } 365 366 identicalElements++ 367 if identicalElements >= 3 { 368 p.afe.remove(n) 369 } 370 } 371 372 p.afe = append(p.afe, p.top()) 373 } 374 375 // Section 12.2.4.3. 376 func (p *parser) clearActiveFormattingElements() { 377 for { 378 n := p.afe.pop() 379 if len(p.afe) == 0 || n.Type == scopeMarkerNode { 380 return 381 } 382 } 383 } 384 385 // Section 12.2.4.3. 386 func (p *parser) reconstructActiveFormattingElements() { 387 n := p.afe.top() 388 if n == nil { 389 return 390 } 391 if n.Type == scopeMarkerNode || p.oe.index(n) != -1 { 392 return 393 } 394 i := len(p.afe) - 1 395 for n.Type != scopeMarkerNode && p.oe.index(n) == -1 { 396 if i == 0 { 397 i = -1 398 break 399 } 400 i-- 401 n = p.afe[i] 402 } 403 for { 404 i++ 405 clone := p.afe[i].clone() 406 p.addChild(clone) 407 p.afe[i] = clone 408 if i == len(p.afe)-1 { 409 break 410 } 411 } 412 } 413 414 // Section 12.2.5. 415 func (p *parser) acknowledgeSelfClosingTag() { 416 p.hasSelfClosingToken = false 417 } 418 419 // An insertion mode (section 12.2.4.1) is the state transition function from 420 // a particular state in the HTML5 parser's state machine. It updates the 421 // parser's fields depending on parser.tok (where ErrorToken means EOF). 422 // It returns whether the token was consumed. 423 type insertionMode func(*parser) bool 424 425 // setOriginalIM sets the insertion mode to return to after completing a text or 426 // inTableText insertion mode. 427 // Section 12.2.4.1, "using the rules for". 428 func (p *parser) setOriginalIM() { 429 if p.originalIM != nil { 430 panic("html: bad parser state: originalIM was set twice") 431 } 432 p.originalIM = p.im 433 } 434 435 // Section 12.2.4.1, "reset the insertion mode". 436 func (p *parser) resetInsertionMode() { 437 for i := len(p.oe) - 1; i >= 0; i-- { 438 n := p.oe[i] 439 last := i == 0 440 if last && p.context != nil { 441 n = p.context 442 } 443 444 switch n.DataAtom { 445 case a.Select: 446 if !last { 447 for ancestor, first := n, p.oe[0]; ancestor != first; { 448 ancestor = p.oe[p.oe.index(ancestor)-1] 449 switch ancestor.DataAtom { 450 case a.Template: 451 p.im = inSelectIM 452 return 453 case a.Table: 454 p.im = inSelectInTableIM 455 return 456 } 457 } 458 } 459 p.im = inSelectIM 460 case a.Td, a.Th: 461 // TODO: remove this divergence from the HTML5 spec. 462 // 463 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 464 p.im = inCellIM 465 case a.Tr: 466 p.im = inRowIM 467 case a.Tbody, a.Thead, a.Tfoot: 468 p.im = inTableBodyIM 469 case a.Caption: 470 p.im = inCaptionIM 471 case a.Colgroup: 472 p.im = inColumnGroupIM 473 case a.Table: 474 p.im = inTableIM 475 case a.Template: 476 // TODO: remove this divergence from the HTML5 spec. 477 if n.Namespace != "" { 478 continue 479 } 480 p.im = p.templateStack.top() 481 case a.Head: 482 // TODO: remove this divergence from the HTML5 spec. 483 // 484 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 485 p.im = inHeadIM 486 case a.Body: 487 p.im = inBodyIM 488 case a.Frameset: 489 p.im = inFramesetIM 490 case a.Html: 491 if p.head == nil { 492 p.im = beforeHeadIM 493 } else { 494 p.im = afterHeadIM 495 } 496 default: 497 if last { 498 p.im = inBodyIM 499 return 500 } 501 continue 502 } 503 return 504 } 505 } 506 507 const whitespace = " \t\r\n\f" 508 509 // Section 12.2.6.4.1. 510 func initialIM(p *parser) bool { 511 switch p.tok.Type { 512 case TextToken: 513 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 514 if len(p.tok.Data) == 0 { 515 // It was all whitespace, so ignore it. 516 return true 517 } 518 case CommentToken: 519 p.doc.AppendChild(&Node{ 520 Type: CommentNode, 521 Data: p.tok.Data, 522 Line: p.tok.Line, 523 Column: p.tok.Column, 524 }) 525 return true 526 case DoctypeToken: 527 n, quirks := parseDoctype(p.tok.Data) 528 n.Line = p.tok.Line 529 n.Column = p.tok.Column 530 p.doc.AppendChild(n) 531 p.quirks = quirks 532 p.im = beforeHTMLIM 533 return true 534 } 535 p.quirks = true 536 p.im = beforeHTMLIM 537 return false 538 } 539 540 // Section 12.2.6.4.2. 541 func beforeHTMLIM(p *parser) bool { 542 switch p.tok.Type { 543 case DoctypeToken: 544 // Ignore the token. 545 return true 546 case TextToken: 547 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 548 if len(p.tok.Data) == 0 { 549 // It was all whitespace, so ignore it. 550 return true 551 } 552 case StartTagToken: 553 if p.tok.DataAtom == a.Html { 554 p.addElement() 555 p.im = beforeHeadIM 556 return true 557 } 558 case EndTagToken: 559 switch p.tok.DataAtom { 560 case a.Head, a.Body, a.Html, a.Br: 561 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) 562 return false 563 default: 564 // Ignore the token. 565 return true 566 } 567 case CommentToken: 568 p.doc.AppendChild(&Node{ 569 Type: CommentNode, 570 Data: p.tok.Data, 571 Line: p.tok.Line, 572 Column: p.tok.Column, 573 }) 574 return true 575 } 576 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) 577 return false 578 } 579 580 // Section 12.2.6.4.3. 581 func beforeHeadIM(p *parser) bool { 582 switch p.tok.Type { 583 case TextToken: 584 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 585 if len(p.tok.Data) == 0 { 586 // It was all whitespace, so ignore it. 587 return true 588 } 589 case StartTagToken: 590 switch p.tok.DataAtom { 591 case a.Head: 592 p.addElement() 593 p.head = p.top() 594 p.im = inHeadIM 595 return true 596 case a.Html: 597 return inBodyIM(p) 598 } 599 case EndTagToken: 600 switch p.tok.DataAtom { 601 case a.Head, a.Body, a.Html, a.Br: 602 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) 603 return false 604 default: 605 // Ignore the token. 606 return true 607 } 608 case CommentToken: 609 p.addChild(&Node{ 610 Type: CommentNode, 611 Data: p.tok.Data, 612 Line: p.tok.Line, 613 Column: p.tok.Column, 614 }) 615 return true 616 case DoctypeToken: 617 // Ignore the token. 618 return true 619 } 620 621 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) 622 return false 623 } 624 625 // Section 12.2.6.4.4. 626 func inHeadIM(p *parser) bool { 627 switch p.tok.Type { 628 case TextToken: 629 s := strings.TrimLeft(p.tok.Data, whitespace) 630 if len(s) < len(p.tok.Data) { 631 // Add the initial whitespace to the current node. 632 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 633 if s == "" { 634 return true 635 } 636 p.tok.Data = s 637 } 638 case StartTagToken: 639 switch p.tok.DataAtom { 640 case a.Html: 641 return inBodyIM(p) 642 case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta: 643 p.addElement() 644 p.oe.pop() 645 p.acknowledgeSelfClosingTag() 646 return true 647 case a.Script, a.Title, a.Noscript, a.Noframes, a.Style: 648 p.addElement() 649 p.setOriginalIM() 650 p.im = textIM 651 return true 652 case a.Head: 653 // Ignore the token. 654 return true 655 case a.Template: 656 p.addElement() 657 p.afe = append(p.afe, &scopeMarker) 658 p.framesetOK = false 659 p.im = inTemplateIM 660 p.templateStack = append(p.templateStack, inTemplateIM) 661 return true 662 } 663 case EndTagToken: 664 switch p.tok.DataAtom { 665 case a.Head: 666 p.oe.pop() 667 p.im = afterHeadIM 668 return true 669 case a.Body, a.Html, a.Br: 670 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) 671 return false 672 case a.Template: 673 if !p.oe.contains(a.Template) { 674 return true 675 } 676 // TODO: remove this divergence from the HTML5 spec. 677 // 678 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 679 p.generateImpliedEndTags() 680 for i := len(p.oe) - 1; i >= 0; i-- { 681 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { 682 p.oe = p.oe[:i] 683 break 684 } 685 } 686 p.clearActiveFormattingElements() 687 p.templateStack.pop() 688 p.resetInsertionMode() 689 return true 690 default: 691 // Ignore the token. 692 return true 693 } 694 case CommentToken: 695 p.addChild(&Node{ 696 Type: CommentNode, 697 Data: p.tok.Data, 698 Line: p.tok.Line, 699 Column: p.tok.Column, 700 }) 701 return true 702 case DoctypeToken: 703 // Ignore the token. 704 return true 705 } 706 707 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) 708 return false 709 } 710 711 // Section 12.2.6.4.6. 712 func afterHeadIM(p *parser) bool { 713 switch p.tok.Type { 714 case TextToken: 715 s := strings.TrimLeft(p.tok.Data, whitespace) 716 if len(s) < len(p.tok.Data) { 717 // Add the initial whitespace to the current node. 718 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 719 if s == "" { 720 return true 721 } 722 p.tok.Data = s 723 } 724 case StartTagToken: 725 switch p.tok.DataAtom { 726 case a.Html: 727 return inBodyIM(p) 728 case a.Body: 729 p.addElement() 730 p.framesetOK = false 731 p.im = inBodyIM 732 return true 733 case a.Frameset: 734 p.addElement() 735 p.im = inFramesetIM 736 return true 737 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 738 p.oe = append(p.oe, p.head) 739 defer p.oe.remove(p.head) 740 return inHeadIM(p) 741 case a.Head: 742 // Ignore the token. 743 return true 744 } 745 case EndTagToken: 746 switch p.tok.DataAtom { 747 case a.Body, a.Html, a.Br: 748 // Drop down to creating an implied <body> tag. 749 case a.Template: 750 return inHeadIM(p) 751 default: 752 // Ignore the token. 753 return true 754 } 755 case CommentToken: 756 p.addChild(&Node{ 757 Type: CommentNode, 758 Data: p.tok.Data, 759 Line: p.tok.Line, 760 Column: p.tok.Column, 761 }) 762 return true 763 case DoctypeToken: 764 // Ignore the token. 765 return true 766 } 767 768 p.parseImpliedToken(StartTagToken, a.Body, a.Body.String()) 769 p.framesetOK = true 770 return false 771 } 772 773 // copyAttributes copies attributes of src not found on dst to dst. 774 func copyAttributes(dst *Node, src Token) { 775 if len(src.Attr) == 0 { 776 return 777 } 778 attr := map[string]string{} 779 for _, t := range dst.Attr { 780 attr[t.Key] = t.Val 781 } 782 for _, t := range src.Attr { 783 if _, ok := attr[t.Key]; !ok { 784 dst.Attr = append(dst.Attr, t) 785 attr[t.Key] = t.Val 786 } 787 } 788 } 789 790 // Section 12.2.6.4.7. 791 func inBodyIM(p *parser) bool { 792 switch p.tok.Type { 793 case TextToken: 794 d := p.tok.Data 795 switch n := p.oe.top(); n.DataAtom { 796 case a.Pre, a.Listing: 797 if n.FirstChild == nil { 798 // Ignore a newline at the start of a <pre> block. 799 if d != "" && d[0] == '\r' { 800 d = d[1:] 801 } 802 if d != "" && d[0] == '\n' { 803 d = d[1:] 804 } 805 } 806 } 807 d = strings.Replace(d, "\x00", "", -1) 808 if d == "" { 809 return true 810 } 811 p.reconstructActiveFormattingElements() 812 p.addText(d) 813 if p.framesetOK && strings.TrimLeft(d, whitespace) != "" { 814 // There were non-whitespace characters inserted. 815 p.framesetOK = false 816 } 817 case StartTagToken: 818 switch p.tok.DataAtom { 819 case a.Html: 820 if p.oe.contains(a.Template) { 821 return true 822 } 823 copyAttributes(p.oe[0], p.tok) 824 case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 825 return inHeadIM(p) 826 case a.Body: 827 if p.oe.contains(a.Template) { 828 return true 829 } 830 if len(p.oe) >= 2 { 831 body := p.oe[1] 832 if body.Type == ElementNode && body.DataAtom == a.Body { 833 p.framesetOK = false 834 copyAttributes(body, p.tok) 835 } 836 } 837 case a.Frameset: 838 if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body { 839 // Ignore the token. 840 return true 841 } 842 body := p.oe[1] 843 if body.Parent != nil { 844 body.Parent.RemoveChild(body) 845 } 846 p.oe = p.oe[:1] 847 p.addElement() 848 p.im = inFramesetIM 849 return true 850 case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul: 851 p.popUntil(buttonScope, a.P) 852 p.addElement() 853 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: 854 p.popUntil(buttonScope, a.P) 855 switch n := p.top(); n.DataAtom { 856 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: 857 p.oe.pop() 858 } 859 p.addElement() 860 case a.Pre, a.Listing: 861 p.popUntil(buttonScope, a.P) 862 p.addElement() 863 // The newline, if any, will be dealt with by the TextToken case. 864 p.framesetOK = false 865 case a.Form: 866 if p.form != nil && !p.oe.contains(a.Template) { 867 // Ignore the token 868 return true 869 } 870 p.popUntil(buttonScope, a.P) 871 p.addElement() 872 if !p.oe.contains(a.Template) { 873 p.form = p.top() 874 } 875 case a.Li: 876 p.framesetOK = false 877 for i := len(p.oe) - 1; i >= 0; i-- { 878 node := p.oe[i] 879 switch node.DataAtom { 880 case a.Li: 881 p.oe = p.oe[:i] 882 case a.Address, a.Div, a.P: 883 continue 884 default: 885 if !isSpecialElement(node) { 886 continue 887 } 888 } 889 break 890 } 891 p.popUntil(buttonScope, a.P) 892 p.addElement() 893 case a.Dd, a.Dt: 894 p.framesetOK = false 895 for i := len(p.oe) - 1; i >= 0; i-- { 896 node := p.oe[i] 897 switch node.DataAtom { 898 case a.Dd, a.Dt: 899 p.oe = p.oe[:i] 900 case a.Address, a.Div, a.P: 901 continue 902 default: 903 if !isSpecialElement(node) { 904 continue 905 } 906 } 907 break 908 } 909 p.popUntil(buttonScope, a.P) 910 p.addElement() 911 case a.Plaintext: 912 p.popUntil(buttonScope, a.P) 913 p.addElement() 914 case a.Button: 915 p.popUntil(defaultScope, a.Button) 916 p.reconstructActiveFormattingElements() 917 p.addElement() 918 p.framesetOK = false 919 case a.A: 920 for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- { 921 if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A { 922 p.inBodyEndTagFormatting(a.A, "a") 923 p.oe.remove(n) 924 p.afe.remove(n) 925 break 926 } 927 } 928 p.reconstructActiveFormattingElements() 929 p.addFormattingElement() 930 case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: 931 p.reconstructActiveFormattingElements() 932 p.addFormattingElement() 933 case a.Nobr: 934 p.reconstructActiveFormattingElements() 935 if p.elementInScope(defaultScope, a.Nobr) { 936 p.inBodyEndTagFormatting(a.Nobr, "nobr") 937 p.reconstructActiveFormattingElements() 938 } 939 p.addFormattingElement() 940 case a.Applet, a.Marquee, a.Object: 941 p.reconstructActiveFormattingElements() 942 p.addElement() 943 p.afe = append(p.afe, &scopeMarker) 944 p.framesetOK = false 945 case a.Table: 946 if !p.quirks { 947 p.popUntil(buttonScope, a.P) 948 } 949 p.addElement() 950 p.framesetOK = false 951 p.im = inTableIM 952 return true 953 case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr: 954 p.reconstructActiveFormattingElements() 955 p.addElement() 956 p.oe.pop() 957 p.acknowledgeSelfClosingTag() 958 if p.tok.DataAtom == a.Input { 959 for _, t := range p.tok.Attr { 960 if t.Key == "type" { 961 if strings.ToLower(t.Val) == "hidden" { 962 // Skip setting framesetOK = false 963 return true 964 } 965 } 966 } 967 } 968 p.framesetOK = false 969 case a.Param, a.Source, a.Track: 970 p.addElement() 971 p.oe.pop() 972 p.acknowledgeSelfClosingTag() 973 case a.Hr: 974 p.popUntil(buttonScope, a.P) 975 p.addElement() 976 p.oe.pop() 977 p.acknowledgeSelfClosingTag() 978 p.framesetOK = false 979 case a.Image: 980 p.tok.DataAtom = a.Img 981 p.tok.Data = a.Img.String() 982 return false 983 case a.Isindex: 984 if p.form != nil { 985 // Ignore the token. 986 return true 987 } 988 action := "" 989 prompt := "This is a searchable index. Enter search keywords: " 990 attr := []Attribute{{Key: "name", Val: "isindex"}} 991 for _, t := range p.tok.Attr { 992 switch t.Key { 993 case "action": 994 action = t.Val 995 case "name": 996 // Ignore the attribute. 997 case "prompt": 998 prompt = t.Val 999 default: 1000 attr = append(attr, t) 1001 } 1002 } 1003 p.acknowledgeSelfClosingTag() 1004 p.popUntil(buttonScope, a.P) 1005 p.parseImpliedToken(StartTagToken, a.Form, a.Form.String()) 1006 if p.form == nil { 1007 // NOTE: The 'isindex' element has been removed, 1008 // and the 'template' element has not been designed to be 1009 // collaborative with the index element. 1010 // 1011 // Ignore the token. 1012 return true 1013 } 1014 if action != "" { 1015 p.form.Attr = []Attribute{{Key: "action", Val: action}} 1016 } 1017 p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String()) 1018 p.parseImpliedToken(StartTagToken, a.Label, a.Label.String()) 1019 p.addText(prompt) 1020 p.addChild(&Node{ 1021 Type: ElementNode, 1022 DataAtom: a.Input, 1023 Data: a.Input.String(), 1024 Attr: attr, 1025 Line: p.tok.Line, 1026 Column: p.tok.Column, 1027 }) 1028 p.oe.pop() 1029 p.parseImpliedToken(EndTagToken, a.Label, a.Label.String()) 1030 p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String()) 1031 p.parseImpliedToken(EndTagToken, a.Form, a.Form.String()) 1032 case a.Textarea: 1033 p.addElement() 1034 p.setOriginalIM() 1035 p.framesetOK = false 1036 p.im = textIM 1037 case a.Xmp: 1038 p.popUntil(buttonScope, a.P) 1039 p.reconstructActiveFormattingElements() 1040 p.framesetOK = false 1041 p.addElement() 1042 p.setOriginalIM() 1043 p.im = textIM 1044 case a.Iframe: 1045 p.framesetOK = false 1046 p.addElement() 1047 p.setOriginalIM() 1048 p.im = textIM 1049 case a.Noembed, a.Noscript: 1050 p.addElement() 1051 p.setOriginalIM() 1052 p.im = textIM 1053 case a.Select: 1054 p.reconstructActiveFormattingElements() 1055 p.addElement() 1056 p.framesetOK = false 1057 p.im = inSelectIM 1058 return true 1059 case a.Optgroup, a.Option: 1060 if p.top().DataAtom == a.Option { 1061 p.oe.pop() 1062 } 1063 p.reconstructActiveFormattingElements() 1064 p.addElement() 1065 case a.Rb, a.Rtc: 1066 if p.elementInScope(defaultScope, a.Ruby) { 1067 p.generateImpliedEndTags() 1068 } 1069 p.addElement() 1070 case a.Rp, a.Rt: 1071 if p.elementInScope(defaultScope, a.Ruby) { 1072 p.generateImpliedEndTags("rtc") 1073 } 1074 p.addElement() 1075 case a.Math, a.Svg: 1076 p.reconstructActiveFormattingElements() 1077 if p.tok.DataAtom == a.Math { 1078 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) 1079 } else { 1080 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) 1081 } 1082 adjustForeignAttributes(p.tok.Attr) 1083 p.addElement() 1084 p.top().Namespace = p.tok.Data 1085 if p.hasSelfClosingToken { 1086 p.oe.pop() 1087 p.acknowledgeSelfClosingTag() 1088 } 1089 return true 1090 case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1091 // Ignore the token. 1092 default: 1093 p.reconstructActiveFormattingElements() 1094 p.addElement() 1095 } 1096 case EndTagToken: 1097 switch p.tok.DataAtom { 1098 case a.Body: 1099 if p.elementInScope(defaultScope, a.Body) { 1100 p.im = afterBodyIM 1101 } 1102 case a.Html: 1103 if p.elementInScope(defaultScope, a.Body) { 1104 p.parseImpliedToken(EndTagToken, a.Body, a.Body.String()) 1105 return false 1106 } 1107 return true 1108 case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul: 1109 p.popUntil(defaultScope, p.tok.DataAtom) 1110 case a.Form: 1111 if p.oe.contains(a.Template) { 1112 i := p.indexOfElementInScope(defaultScope, a.Form) 1113 if i == -1 { 1114 // Ignore the token. 1115 return true 1116 } 1117 p.generateImpliedEndTags() 1118 if p.oe[i].DataAtom != a.Form { 1119 // Ignore the token. 1120 return true 1121 } 1122 p.popUntil(defaultScope, a.Form) 1123 } else { 1124 node := p.form 1125 p.form = nil 1126 i := p.indexOfElementInScope(defaultScope, a.Form) 1127 if node == nil || i == -1 || p.oe[i] != node { 1128 // Ignore the token. 1129 return true 1130 } 1131 p.generateImpliedEndTags() 1132 p.oe.remove(node) 1133 } 1134 case a.P: 1135 if !p.elementInScope(buttonScope, a.P) { 1136 p.parseImpliedToken(StartTagToken, a.P, a.P.String()) 1137 } 1138 p.popUntil(buttonScope, a.P) 1139 case a.Li: 1140 p.popUntil(listItemScope, a.Li) 1141 case a.Dd, a.Dt: 1142 p.popUntil(defaultScope, p.tok.DataAtom) 1143 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: 1144 p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6) 1145 case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: 1146 p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data) 1147 case a.Applet, a.Marquee, a.Object: 1148 if p.popUntil(defaultScope, p.tok.DataAtom) { 1149 p.clearActiveFormattingElements() 1150 } 1151 case a.Br: 1152 p.tok.Type = StartTagToken 1153 return false 1154 case a.Template: 1155 return inHeadIM(p) 1156 default: 1157 p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data) 1158 } 1159 case CommentToken: 1160 p.addChild(&Node{ 1161 Type: CommentNode, 1162 Data: p.tok.Data, 1163 Line: p.tok.Line, 1164 Column: p.tok.Column, 1165 }) 1166 case ErrorToken: 1167 // TODO: remove this divergence from the HTML5 spec. 1168 if len(p.templateStack) > 0 { 1169 p.im = inTemplateIM 1170 return false 1171 } else { 1172 for _, e := range p.oe { 1173 switch e.DataAtom { 1174 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th, 1175 a.Thead, a.Tr, a.Body, a.Html: 1176 default: 1177 return true 1178 } 1179 } 1180 } 1181 } 1182 1183 return true 1184 } 1185 1186 func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) { 1187 // This is the "adoption agency" algorithm, described at 1188 // https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency 1189 1190 // TODO: this is a fairly literal line-by-line translation of that algorithm. 1191 // Once the code successfully parses the comprehensive test suite, we should 1192 // refactor this code to be more idiomatic. 1193 1194 // Steps 1-4. The outer loop. 1195 for i := 0; i < 8; i++ { 1196 // Step 5. Find the formatting element. 1197 var formattingElement *Node 1198 for j := len(p.afe) - 1; j >= 0; j-- { 1199 if p.afe[j].Type == scopeMarkerNode { 1200 break 1201 } 1202 if p.afe[j].DataAtom == tagAtom { 1203 formattingElement = p.afe[j] 1204 break 1205 } 1206 } 1207 if formattingElement == nil { 1208 p.inBodyEndTagOther(tagAtom, tagName) 1209 return 1210 } 1211 feIndex := p.oe.index(formattingElement) 1212 if feIndex == -1 { 1213 p.afe.remove(formattingElement) 1214 return 1215 } 1216 if !p.elementInScope(defaultScope, tagAtom) { 1217 // Ignore the tag. 1218 return 1219 } 1220 1221 // Steps 9-10. Find the furthest block. 1222 var furthestBlock *Node 1223 for _, e := range p.oe[feIndex:] { 1224 if isSpecialElement(e) { 1225 furthestBlock = e 1226 break 1227 } 1228 } 1229 if furthestBlock == nil { 1230 e := p.oe.pop() 1231 for e != formattingElement { 1232 e = p.oe.pop() 1233 } 1234 p.afe.remove(e) 1235 return 1236 } 1237 1238 // Steps 11-12. Find the common ancestor and bookmark node. 1239 commonAncestor := p.oe[feIndex-1] 1240 bookmark := p.afe.index(formattingElement) 1241 1242 // Step 13. The inner loop. Find the lastNode to reparent. 1243 lastNode := furthestBlock 1244 node := furthestBlock 1245 x := p.oe.index(node) 1246 // Steps 13.1-13.2 1247 for j := 0; j < 3; j++ { 1248 // Step 13.3. 1249 x-- 1250 node = p.oe[x] 1251 // Step 13.4 - 13.5. 1252 if p.afe.index(node) == -1 { 1253 p.oe.remove(node) 1254 continue 1255 } 1256 // Step 13.6. 1257 if node == formattingElement { 1258 break 1259 } 1260 // Step 13.7. 1261 clone := node.clone() 1262 p.afe[p.afe.index(node)] = clone 1263 p.oe[p.oe.index(node)] = clone 1264 node = clone 1265 // Step 13.8. 1266 if lastNode == furthestBlock { 1267 bookmark = p.afe.index(node) + 1 1268 } 1269 // Step 13.9. 1270 if lastNode.Parent != nil { 1271 lastNode.Parent.RemoveChild(lastNode) 1272 } 1273 node.AppendChild(lastNode) 1274 // Step 13.10. 1275 lastNode = node 1276 } 1277 1278 // Step 14. Reparent lastNode to the common ancestor, 1279 // or for misnested table nodes, to the foster parent. 1280 if lastNode.Parent != nil { 1281 lastNode.Parent.RemoveChild(lastNode) 1282 } 1283 switch commonAncestor.DataAtom { 1284 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1285 p.fosterParent(lastNode) 1286 default: 1287 commonAncestor.AppendChild(lastNode) 1288 } 1289 1290 // Steps 15-17. Reparent nodes from the furthest block's children 1291 // to a clone of the formatting element. 1292 clone := formattingElement.clone() 1293 reparentChildren(clone, furthestBlock) 1294 furthestBlock.AppendChild(clone) 1295 1296 // Step 18. Fix up the list of active formatting elements. 1297 if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark { 1298 // Move the bookmark with the rest of the list. 1299 bookmark-- 1300 } 1301 p.afe.remove(formattingElement) 1302 p.afe.insert(bookmark, clone) 1303 1304 // Step 19. Fix up the stack of open elements. 1305 p.oe.remove(formattingElement) 1306 p.oe.insert(p.oe.index(furthestBlock)+1, clone) 1307 } 1308 } 1309 1310 // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM. 1311 // "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content 1312 // https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign 1313 func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) { 1314 for i := len(p.oe) - 1; i >= 0; i-- { 1315 // Two element nodes have the same tag if they have the same Data (a 1316 // string-typed field). As an optimization, for common HTML tags, each 1317 // Data string is assigned a unique, non-zero DataAtom (a uint32-typed 1318 // field), since integer comparison is faster than string comparison. 1319 // Uncommon (custom) tags get a zero DataAtom. 1320 // 1321 // The if condition here is equivalent to (p.oe[i].Data == tagName). 1322 if (p.oe[i].DataAtom == tagAtom) && 1323 ((tagAtom != 0) || (p.oe[i].Data == tagName)) { 1324 p.oe = p.oe[:i] 1325 break 1326 } 1327 if isSpecialElement(p.oe[i]) { 1328 break 1329 } 1330 } 1331 } 1332 1333 // Section 12.2.6.4.8. 1334 func textIM(p *parser) bool { 1335 switch p.tok.Type { 1336 case ErrorToken: 1337 p.oe.pop() 1338 case TextToken: 1339 d := p.tok.Data 1340 if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil { 1341 // Ignore a newline at the start of a <textarea> block. 1342 if d != "" && d[0] == '\r' { 1343 d = d[1:] 1344 } 1345 if d != "" && d[0] == '\n' { 1346 d = d[1:] 1347 } 1348 } 1349 if d == "" { 1350 return true 1351 } 1352 p.addText(d) 1353 return true 1354 case EndTagToken: 1355 p.oe.pop() 1356 } 1357 p.im = p.originalIM 1358 p.originalIM = nil 1359 return p.tok.Type == EndTagToken 1360 } 1361 1362 // Section 12.2.6.4.9. 1363 func inTableIM(p *parser) bool { 1364 switch p.tok.Type { 1365 case TextToken: 1366 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1) 1367 switch p.oe.top().DataAtom { 1368 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1369 if strings.Trim(p.tok.Data, whitespace) == "" { 1370 p.addText(p.tok.Data) 1371 return true 1372 } 1373 } 1374 case StartTagToken: 1375 switch p.tok.DataAtom { 1376 case a.Caption: 1377 p.clearStackToContext(tableScope) 1378 p.afe = append(p.afe, &scopeMarker) 1379 p.addElement() 1380 p.im = inCaptionIM 1381 return true 1382 case a.Colgroup: 1383 p.clearStackToContext(tableScope) 1384 p.addElement() 1385 p.im = inColumnGroupIM 1386 return true 1387 case a.Col: 1388 p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String()) 1389 return false 1390 case a.Tbody, a.Tfoot, a.Thead: 1391 p.clearStackToContext(tableScope) 1392 p.addElement() 1393 p.im = inTableBodyIM 1394 return true 1395 case a.Td, a.Th, a.Tr: 1396 p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String()) 1397 return false 1398 case a.Table: 1399 if p.popUntil(tableScope, a.Table) { 1400 p.resetInsertionMode() 1401 return false 1402 } 1403 // Ignore the token. 1404 return true 1405 case a.Style, a.Script, a.Template: 1406 return inHeadIM(p) 1407 case a.Input: 1408 for _, t := range p.tok.Attr { 1409 if t.Key == "type" && strings.ToLower(t.Val) == "hidden" { 1410 p.addElement() 1411 p.oe.pop() 1412 return true 1413 } 1414 } 1415 // Otherwise drop down to the default action. 1416 case a.Form: 1417 if p.oe.contains(a.Template) || p.form != nil { 1418 // Ignore the token. 1419 return true 1420 } 1421 p.addElement() 1422 p.form = p.oe.pop() 1423 case a.Select: 1424 p.reconstructActiveFormattingElements() 1425 switch p.top().DataAtom { 1426 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1427 p.fosterParenting = true 1428 } 1429 p.addElement() 1430 p.fosterParenting = false 1431 p.framesetOK = false 1432 p.im = inSelectInTableIM 1433 return true 1434 } 1435 case EndTagToken: 1436 switch p.tok.DataAtom { 1437 case a.Table: 1438 if p.popUntil(tableScope, a.Table) { 1439 p.resetInsertionMode() 1440 return true 1441 } 1442 // Ignore the token. 1443 return true 1444 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1445 // Ignore the token. 1446 return true 1447 case a.Template: 1448 return inHeadIM(p) 1449 } 1450 case CommentToken: 1451 p.addChild(&Node{ 1452 Type: CommentNode, 1453 Data: p.tok.Data, 1454 Line: p.tok.Line, 1455 Column: p.tok.Column, 1456 }) 1457 return true 1458 case DoctypeToken: 1459 // Ignore the token. 1460 return true 1461 case ErrorToken: 1462 return inBodyIM(p) 1463 } 1464 1465 p.fosterParenting = true 1466 defer func() { p.fosterParenting = false }() 1467 1468 return inBodyIM(p) 1469 } 1470 1471 // Section 12.2.6.4.11. 1472 func inCaptionIM(p *parser) bool { 1473 switch p.tok.Type { 1474 case StartTagToken: 1475 switch p.tok.DataAtom { 1476 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr: 1477 if p.popUntil(tableScope, a.Caption) { 1478 p.clearActiveFormattingElements() 1479 p.im = inTableIM 1480 return false 1481 } else { 1482 // Ignore the token. 1483 return true 1484 } 1485 case a.Select: 1486 p.reconstructActiveFormattingElements() 1487 p.addElement() 1488 p.framesetOK = false 1489 p.im = inSelectInTableIM 1490 return true 1491 } 1492 case EndTagToken: 1493 switch p.tok.DataAtom { 1494 case a.Caption: 1495 if p.popUntil(tableScope, a.Caption) { 1496 p.clearActiveFormattingElements() 1497 p.im = inTableIM 1498 } 1499 return true 1500 case a.Table: 1501 if p.popUntil(tableScope, a.Caption) { 1502 p.clearActiveFormattingElements() 1503 p.im = inTableIM 1504 return false 1505 } else { 1506 // Ignore the token. 1507 return true 1508 } 1509 case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1510 // Ignore the token. 1511 return true 1512 } 1513 } 1514 return inBodyIM(p) 1515 } 1516 1517 // Section 12.2.6.4.12. 1518 func inColumnGroupIM(p *parser) bool { 1519 switch p.tok.Type { 1520 case TextToken: 1521 s := strings.TrimLeft(p.tok.Data, whitespace) 1522 if len(s) < len(p.tok.Data) { 1523 // Add the initial whitespace to the current node. 1524 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 1525 if s == "" { 1526 return true 1527 } 1528 p.tok.Data = s 1529 } 1530 case CommentToken: 1531 p.addChild(&Node{ 1532 Type: CommentNode, 1533 Data: p.tok.Data, 1534 Line: p.tok.Line, 1535 Column: p.tok.Column, 1536 }) 1537 return true 1538 case DoctypeToken: 1539 // Ignore the token. 1540 return true 1541 case StartTagToken: 1542 switch p.tok.DataAtom { 1543 case a.Html: 1544 return inBodyIM(p) 1545 case a.Col: 1546 p.addElement() 1547 p.oe.pop() 1548 p.acknowledgeSelfClosingTag() 1549 return true 1550 case a.Template: 1551 return inHeadIM(p) 1552 } 1553 case EndTagToken: 1554 switch p.tok.DataAtom { 1555 case a.Colgroup: 1556 if p.oe.top().DataAtom == a.Colgroup { 1557 p.oe.pop() 1558 p.im = inTableIM 1559 } 1560 return true 1561 case a.Col: 1562 // Ignore the token. 1563 return true 1564 case a.Template: 1565 return inHeadIM(p) 1566 } 1567 case ErrorToken: 1568 return inBodyIM(p) 1569 } 1570 if p.oe.top().DataAtom != a.Colgroup { 1571 return true 1572 } 1573 p.oe.pop() 1574 p.im = inTableIM 1575 return false 1576 } 1577 1578 // Section 12.2.6.4.13. 1579 func inTableBodyIM(p *parser) bool { 1580 switch p.tok.Type { 1581 case StartTagToken: 1582 switch p.tok.DataAtom { 1583 case a.Tr: 1584 p.clearStackToContext(tableBodyScope) 1585 p.addElement() 1586 p.im = inRowIM 1587 return true 1588 case a.Td, a.Th: 1589 p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String()) 1590 return false 1591 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1592 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1593 p.im = inTableIM 1594 return false 1595 } 1596 // Ignore the token. 1597 return true 1598 } 1599 case EndTagToken: 1600 switch p.tok.DataAtom { 1601 case a.Tbody, a.Tfoot, a.Thead: 1602 if p.elementInScope(tableScope, p.tok.DataAtom) { 1603 p.clearStackToContext(tableBodyScope) 1604 p.oe.pop() 1605 p.im = inTableIM 1606 } 1607 return true 1608 case a.Table: 1609 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1610 p.im = inTableIM 1611 return false 1612 } 1613 // Ignore the token. 1614 return true 1615 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr: 1616 // Ignore the token. 1617 return true 1618 } 1619 case CommentToken: 1620 p.addChild(&Node{ 1621 Type: CommentNode, 1622 Data: p.tok.Data, 1623 Line: p.tok.Line, 1624 Column: p.tok.Column, 1625 }) 1626 return true 1627 } 1628 1629 return inTableIM(p) 1630 } 1631 1632 // Section 12.2.6.4.14. 1633 func inRowIM(p *parser) bool { 1634 switch p.tok.Type { 1635 case StartTagToken: 1636 switch p.tok.DataAtom { 1637 case a.Td, a.Th: 1638 p.clearStackToContext(tableRowScope) 1639 p.addElement() 1640 p.afe = append(p.afe, &scopeMarker) 1641 p.im = inCellIM 1642 return true 1643 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1644 if p.popUntil(tableScope, a.Tr) { 1645 p.im = inTableBodyIM 1646 return false 1647 } 1648 // Ignore the token. 1649 return true 1650 } 1651 case EndTagToken: 1652 switch p.tok.DataAtom { 1653 case a.Tr: 1654 if p.popUntil(tableScope, a.Tr) { 1655 p.im = inTableBodyIM 1656 return true 1657 } 1658 // Ignore the token. 1659 return true 1660 case a.Table: 1661 if p.popUntil(tableScope, a.Tr) { 1662 p.im = inTableBodyIM 1663 return false 1664 } 1665 // Ignore the token. 1666 return true 1667 case a.Tbody, a.Tfoot, a.Thead: 1668 if p.elementInScope(tableScope, p.tok.DataAtom) { 1669 p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String()) 1670 return false 1671 } 1672 // Ignore the token. 1673 return true 1674 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th: 1675 // Ignore the token. 1676 return true 1677 } 1678 } 1679 1680 return inTableIM(p) 1681 } 1682 1683 // Section 12.2.6.4.15. 1684 func inCellIM(p *parser) bool { 1685 switch p.tok.Type { 1686 case StartTagToken: 1687 switch p.tok.DataAtom { 1688 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1689 if p.popUntil(tableScope, a.Td, a.Th) { 1690 // Close the cell and reprocess. 1691 p.clearActiveFormattingElements() 1692 p.im = inRowIM 1693 return false 1694 } 1695 // Ignore the token. 1696 return true 1697 case a.Select: 1698 p.reconstructActiveFormattingElements() 1699 p.addElement() 1700 p.framesetOK = false 1701 p.im = inSelectInTableIM 1702 return true 1703 } 1704 case EndTagToken: 1705 switch p.tok.DataAtom { 1706 case a.Td, a.Th: 1707 if !p.popUntil(tableScope, p.tok.DataAtom) { 1708 // Ignore the token. 1709 return true 1710 } 1711 p.clearActiveFormattingElements() 1712 p.im = inRowIM 1713 return true 1714 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html: 1715 // Ignore the token. 1716 return true 1717 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1718 if !p.elementInScope(tableScope, p.tok.DataAtom) { 1719 // Ignore the token. 1720 return true 1721 } 1722 // Close the cell and reprocess. 1723 p.popUntil(tableScope, a.Td, a.Th) 1724 p.clearActiveFormattingElements() 1725 p.im = inRowIM 1726 return false 1727 } 1728 } 1729 return inBodyIM(p) 1730 } 1731 1732 // Section 12.2.6.4.16. 1733 func inSelectIM(p *parser) bool { 1734 switch p.tok.Type { 1735 case TextToken: 1736 p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) 1737 case StartTagToken: 1738 switch p.tok.DataAtom { 1739 case a.Html: 1740 return inBodyIM(p) 1741 case a.Option: 1742 if p.top().DataAtom == a.Option { 1743 p.oe.pop() 1744 } 1745 p.addElement() 1746 case a.Optgroup: 1747 if p.top().DataAtom == a.Option { 1748 p.oe.pop() 1749 } 1750 if p.top().DataAtom == a.Optgroup { 1751 p.oe.pop() 1752 } 1753 p.addElement() 1754 case a.Select: 1755 if p.popUntil(selectScope, a.Select) { 1756 p.resetInsertionMode() 1757 } else { 1758 // Ignore the token. 1759 return true 1760 } 1761 case a.Input, a.Keygen, a.Textarea: 1762 if p.elementInScope(selectScope, a.Select) { 1763 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) 1764 return false 1765 } 1766 // In order to properly ignore <textarea>, we need to change the tokenizer mode. 1767 p.tokenizer.NextIsNotRawText() 1768 // Ignore the token. 1769 return true 1770 case a.Script, a.Template: 1771 return inHeadIM(p) 1772 } 1773 case EndTagToken: 1774 switch p.tok.DataAtom { 1775 case a.Option: 1776 if p.top().DataAtom == a.Option { 1777 p.oe.pop() 1778 } 1779 case a.Optgroup: 1780 i := len(p.oe) - 1 1781 if p.oe[i].DataAtom == a.Option { 1782 i-- 1783 } 1784 if p.oe[i].DataAtom == a.Optgroup { 1785 p.oe = p.oe[:i] 1786 } 1787 case a.Select: 1788 if p.popUntil(selectScope, a.Select) { 1789 p.resetInsertionMode() 1790 } else { 1791 // Ignore the token. 1792 return true 1793 } 1794 case a.Template: 1795 return inHeadIM(p) 1796 } 1797 case CommentToken: 1798 p.addChild(&Node{ 1799 Type: CommentNode, 1800 Data: p.tok.Data, 1801 Line: p.tok.Line, 1802 Column: p.tok.Column, 1803 }) 1804 case DoctypeToken: 1805 // Ignore the token. 1806 return true 1807 case ErrorToken: 1808 return inBodyIM(p) 1809 } 1810 1811 return true 1812 } 1813 1814 // Section 12.2.6.4.17. 1815 func inSelectInTableIM(p *parser) bool { 1816 switch p.tok.Type { 1817 case StartTagToken, EndTagToken: 1818 switch p.tok.DataAtom { 1819 case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th: 1820 if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) { 1821 // Ignore the token. 1822 return true 1823 } 1824 // This is like p.popUntil(selectScope, a.Select), but it also 1825 // matches <math select>, not just <select>. Matching the MathML 1826 // tag is arguably incorrect (conceptually), but it mimics what 1827 // Chromium does. 1828 for i := len(p.oe) - 1; i >= 0; i-- { 1829 if n := p.oe[i]; n.DataAtom == a.Select { 1830 p.oe = p.oe[:i] 1831 break 1832 } 1833 } 1834 p.resetInsertionMode() 1835 return false 1836 } 1837 } 1838 return inSelectIM(p) 1839 } 1840 1841 // Section 12.2.6.4.18. 1842 func inTemplateIM(p *parser) bool { 1843 switch p.tok.Type { 1844 case TextToken, CommentToken, DoctypeToken: 1845 return inBodyIM(p) 1846 case StartTagToken: 1847 switch p.tok.DataAtom { 1848 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 1849 return inHeadIM(p) 1850 case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1851 p.templateStack.pop() 1852 p.templateStack = append(p.templateStack, inTableIM) 1853 p.im = inTableIM 1854 return false 1855 case a.Col: 1856 p.templateStack.pop() 1857 p.templateStack = append(p.templateStack, inColumnGroupIM) 1858 p.im = inColumnGroupIM 1859 return false 1860 case a.Tr: 1861 p.templateStack.pop() 1862 p.templateStack = append(p.templateStack, inTableBodyIM) 1863 p.im = inTableBodyIM 1864 return false 1865 case a.Td, a.Th: 1866 p.templateStack.pop() 1867 p.templateStack = append(p.templateStack, inRowIM) 1868 p.im = inRowIM 1869 return false 1870 default: 1871 p.templateStack.pop() 1872 p.templateStack = append(p.templateStack, inBodyIM) 1873 p.im = inBodyIM 1874 return false 1875 } 1876 case EndTagToken: 1877 switch p.tok.DataAtom { 1878 case a.Template: 1879 return inHeadIM(p) 1880 default: 1881 // Ignore the token. 1882 return true 1883 } 1884 case ErrorToken: 1885 if !p.oe.contains(a.Template) { 1886 // Ignore the token. 1887 return true 1888 } 1889 // TODO: remove this divergence from the HTML5 spec. 1890 // 1891 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 1892 p.generateImpliedEndTags() 1893 for i := len(p.oe) - 1; i >= 0; i-- { 1894 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { 1895 p.oe = p.oe[:i] 1896 break 1897 } 1898 } 1899 p.clearActiveFormattingElements() 1900 p.templateStack.pop() 1901 p.resetInsertionMode() 1902 return false 1903 } 1904 return false 1905 } 1906 1907 // Section 12.2.6.4.19. 1908 func afterBodyIM(p *parser) bool { 1909 switch p.tok.Type { 1910 case ErrorToken: 1911 // Stop parsing. 1912 return true 1913 case TextToken: 1914 s := strings.TrimLeft(p.tok.Data, whitespace) 1915 if len(s) == 0 { 1916 // It was all whitespace. 1917 return inBodyIM(p) 1918 } 1919 case StartTagToken: 1920 if p.tok.DataAtom == a.Html { 1921 return inBodyIM(p) 1922 } 1923 case EndTagToken: 1924 if p.tok.DataAtom == a.Html { 1925 if !p.fragment { 1926 p.im = afterAfterBodyIM 1927 } 1928 return true 1929 } 1930 case CommentToken: 1931 // The comment is attached to the <html> element. 1932 if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html { 1933 panic("html: bad parser state: <html> element not found, in the after-body insertion mode") 1934 } 1935 p.oe[0].AppendChild(&Node{ 1936 Type: CommentNode, 1937 Data: p.tok.Data, 1938 Line: p.tok.Line, 1939 Column: p.tok.Column, 1940 }) 1941 return true 1942 } 1943 p.im = inBodyIM 1944 return false 1945 } 1946 1947 // Section 12.2.6.4.20. 1948 func inFramesetIM(p *parser) bool { 1949 switch p.tok.Type { 1950 case CommentToken: 1951 p.addChild(&Node{ 1952 Type: CommentNode, 1953 Data: p.tok.Data, 1954 Line: p.tok.Line, 1955 Column: p.tok.Column, 1956 }) 1957 case TextToken: 1958 // Ignore all text but whitespace. 1959 s := strings.Map(func(c rune) rune { 1960 switch c { 1961 case ' ', '\t', '\n', '\f', '\r': 1962 return c 1963 } 1964 return -1 1965 }, p.tok.Data) 1966 if s != "" { 1967 p.addText(s) 1968 } 1969 case StartTagToken: 1970 switch p.tok.DataAtom { 1971 case a.Html: 1972 return inBodyIM(p) 1973 case a.Frameset: 1974 p.addElement() 1975 case a.Frame: 1976 p.addElement() 1977 p.oe.pop() 1978 p.acknowledgeSelfClosingTag() 1979 case a.Noframes: 1980 return inHeadIM(p) 1981 } 1982 case EndTagToken: 1983 switch p.tok.DataAtom { 1984 case a.Frameset: 1985 if p.oe.top().DataAtom != a.Html { 1986 p.oe.pop() 1987 if p.oe.top().DataAtom != a.Frameset { 1988 p.im = afterFramesetIM 1989 return true 1990 } 1991 } 1992 } 1993 default: 1994 // Ignore the token. 1995 } 1996 return true 1997 } 1998 1999 // Section 12.2.6.4.21. 2000 func afterFramesetIM(p *parser) bool { 2001 switch p.tok.Type { 2002 case CommentToken: 2003 p.addChild(&Node{ 2004 Type: CommentNode, 2005 Data: p.tok.Data, 2006 Line: p.tok.Line, 2007 Column: p.tok.Column, 2008 }) 2009 case TextToken: 2010 // Ignore all text but whitespace. 2011 s := strings.Map(func(c rune) rune { 2012 switch c { 2013 case ' ', '\t', '\n', '\f', '\r': 2014 return c 2015 } 2016 return -1 2017 }, p.tok.Data) 2018 if s != "" { 2019 p.addText(s) 2020 } 2021 case StartTagToken: 2022 switch p.tok.DataAtom { 2023 case a.Html: 2024 return inBodyIM(p) 2025 case a.Noframes: 2026 return inHeadIM(p) 2027 } 2028 case EndTagToken: 2029 switch p.tok.DataAtom { 2030 case a.Html: 2031 p.im = afterAfterFramesetIM 2032 return true 2033 } 2034 default: 2035 // Ignore the token. 2036 } 2037 return true 2038 } 2039 2040 // Section 12.2.6.4.22. 2041 func afterAfterBodyIM(p *parser) bool { 2042 switch p.tok.Type { 2043 case ErrorToken: 2044 // Stop parsing. 2045 return true 2046 case TextToken: 2047 s := strings.TrimLeft(p.tok.Data, whitespace) 2048 if len(s) == 0 { 2049 // It was all whitespace. 2050 return inBodyIM(p) 2051 } 2052 case StartTagToken: 2053 if p.tok.DataAtom == a.Html { 2054 return inBodyIM(p) 2055 } 2056 case CommentToken: 2057 p.doc.AppendChild(&Node{ 2058 Type: CommentNode, 2059 Data: p.tok.Data, 2060 Line: p.tok.Line, 2061 Column: p.tok.Column, 2062 }) 2063 return true 2064 case DoctypeToken: 2065 return inBodyIM(p) 2066 } 2067 p.im = inBodyIM 2068 return false 2069 } 2070 2071 // Section 12.2.6.4.23. 2072 func afterAfterFramesetIM(p *parser) bool { 2073 switch p.tok.Type { 2074 case CommentToken: 2075 p.doc.AppendChild(&Node{ 2076 Type: CommentNode, 2077 Data: p.tok.Data, 2078 Line: p.tok.Line, 2079 Column: p.tok.Column, 2080 }) 2081 case TextToken: 2082 // Ignore all text but whitespace. 2083 s := strings.Map(func(c rune) rune { 2084 switch c { 2085 case ' ', '\t', '\n', '\f', '\r': 2086 return c 2087 } 2088 return -1 2089 }, p.tok.Data) 2090 if s != "" { 2091 p.tok.Data = s 2092 return inBodyIM(p) 2093 } 2094 case StartTagToken: 2095 switch p.tok.DataAtom { 2096 case a.Html: 2097 return inBodyIM(p) 2098 case a.Noframes: 2099 return inHeadIM(p) 2100 } 2101 case DoctypeToken: 2102 return inBodyIM(p) 2103 default: 2104 // Ignore the token. 2105 } 2106 return true 2107 } 2108 2109 const whitespaceOrNUL = whitespace + "\x00" 2110 2111 // Section 12.2.6.5 2112 func parseForeignContent(p *parser) bool { 2113 switch p.tok.Type { 2114 case TextToken: 2115 if p.framesetOK { 2116 p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" 2117 } 2118 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) 2119 p.addText(p.tok.Data) 2120 case CommentToken: 2121 p.addChild(&Node{ 2122 Type: CommentNode, 2123 Data: p.tok.Data, 2124 Line: p.tok.Line, 2125 Column: p.tok.Column, 2126 }) 2127 case StartTagToken: 2128 b := breakout[p.tok.Data] 2129 if p.tok.DataAtom == a.Font { 2130 loop: 2131 for _, attr := range p.tok.Attr { 2132 switch attr.Key { 2133 case "color", "face", "size": 2134 b = true 2135 break loop 2136 } 2137 } 2138 } 2139 if b { 2140 for i := len(p.oe) - 1; i >= 0; i-- { 2141 n := p.oe[i] 2142 if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { 2143 p.oe = p.oe[:i+1] 2144 break 2145 } 2146 } 2147 return false 2148 } 2149 switch p.top().Namespace { 2150 case "math": 2151 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) 2152 case "svg": 2153 // Adjust SVG tag names. The tokenizer lower-cases tag names, but 2154 // SVG wants e.g. "foreignObject" with a capital second "O". 2155 if x := svgTagNameAdjustments[p.tok.Data]; x != "" { 2156 p.tok.DataAtom = a.Lookup([]byte(x)) 2157 p.tok.Data = x 2158 } 2159 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) 2160 default: 2161 panic("html: bad parser state: unexpected namespace") 2162 } 2163 adjustForeignAttributes(p.tok.Attr) 2164 namespace := p.top().Namespace 2165 p.addElement() 2166 p.top().Namespace = namespace 2167 if namespace != "" { 2168 // Don't let the tokenizer go into raw text mode in foreign content 2169 // (e.g. in an SVG <title> tag). 2170 p.tokenizer.NextIsNotRawText() 2171 } 2172 if p.hasSelfClosingToken { 2173 p.oe.pop() 2174 p.acknowledgeSelfClosingTag() 2175 } 2176 case EndTagToken: 2177 for i := len(p.oe) - 1; i >= 0; i-- { 2178 if p.oe[i].Namespace == "" { 2179 return p.im(p) 2180 } 2181 if strings.EqualFold(p.oe[i].Data, p.tok.Data) { 2182 p.oe = p.oe[:i] 2183 break 2184 } 2185 } 2186 return true 2187 default: 2188 // Ignore the token. 2189 } 2190 return true 2191 } 2192 2193 // Section 12.2.6. 2194 func (p *parser) inForeignContent() bool { 2195 if len(p.oe) == 0 { 2196 return false 2197 } 2198 n := p.oe[len(p.oe)-1] 2199 if n.Namespace == "" { 2200 return false 2201 } 2202 if mathMLTextIntegrationPoint(n) { 2203 if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark { 2204 return false 2205 } 2206 if p.tok.Type == TextToken { 2207 return false 2208 } 2209 } 2210 if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg { 2211 return false 2212 } 2213 if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) { 2214 return false 2215 } 2216 if p.tok.Type == ErrorToken { 2217 return false 2218 } 2219 return true 2220 } 2221 2222 // parseImpliedToken parses a token as though it had appeared in the parser's 2223 // input. 2224 func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) { 2225 realToken, selfClosing := p.tok, p.hasSelfClosingToken 2226 p.tok = Token{ 2227 Type: t, 2228 DataAtom: dataAtom, 2229 Data: data, 2230 } 2231 p.hasSelfClosingToken = false 2232 p.parseCurrentToken() 2233 p.tok, p.hasSelfClosingToken = realToken, selfClosing 2234 } 2235 2236 // parseCurrentToken runs the current token through the parsing routines 2237 // until it is consumed. 2238 func (p *parser) parseCurrentToken() { 2239 if p.tok.Type == SelfClosingTagToken { 2240 p.hasSelfClosingToken = true 2241 p.tok.Type = StartTagToken 2242 } 2243 2244 consumed := false 2245 for !consumed { 2246 if p.inForeignContent() { 2247 consumed = parseForeignContent(p) 2248 } else { 2249 consumed = p.im(p) 2250 } 2251 } 2252 2253 if p.hasSelfClosingToken { 2254 // This is a parse error, but ignore it. 2255 p.hasSelfClosingToken = false 2256 } 2257 } 2258 2259 func (p *parser) parse() error { 2260 // Iterate until EOF. Any other error will cause an early return. 2261 var err error 2262 for err != io.EOF { 2263 // CDATA sections are allowed only in foreign content. 2264 n := p.oe.top() 2265 p.tokenizer.AllowCDATA(n != nil && n.Namespace != "") 2266 // Read and parse the next token. 2267 p.tokenizer.Next() 2268 p.tok = p.tokenizer.Token() 2269 if p.tok.Type == ErrorToken { 2270 err = p.tokenizer.Err() 2271 if err != nil && err != io.EOF { 2272 return err 2273 } 2274 } 2275 p.parseCurrentToken() 2276 } 2277 return nil 2278 } 2279 2280 // Parse returns the parse tree for the HTML from the given Reader. 2281 // 2282 // It implements the HTML5 parsing algorithm 2283 // (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction), 2284 // which is very complicated. The resultant tree can contain implicitly created 2285 // nodes that have no explicit <tag> listed in r's data, and nodes' parents can 2286 // differ from the nesting implied by a naive processing of start and end 2287 // <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped, 2288 // with no corresponding node in the resulting tree. 2289 // 2290 // The input is assumed to be UTF-8 encoded. 2291 func Parse(r io.Reader) (*Node, error) { 2292 p := &parser{ 2293 tokenizer: NewTokenizer(r), 2294 doc: &Node{ 2295 Type: DocumentNode, 2296 }, 2297 scripting: true, 2298 framesetOK: true, 2299 im: initialIM, 2300 } 2301 err := p.parse() 2302 if err != nil { 2303 return nil, err 2304 } 2305 return p.doc, nil 2306 } 2307 2308 // ParseFragment parses a fragment of HTML and returns the nodes that were 2309 // found. If the fragment is the InnerHTML for an existing element, pass that 2310 // element in context. 2311 // 2312 // It has the same intricacies as Parse. 2313 func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { 2314 contextTag := "" 2315 if context != nil { 2316 if context.Type != ElementNode { 2317 return nil, errors.New("html: ParseFragment of non-element Node") 2318 } 2319 // The next check isn't just context.DataAtom.String() == context.Data because 2320 // it is valid to pass an element whose tag isn't a known atom. For example, 2321 // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. 2322 if context.DataAtom != a.Lookup([]byte(context.Data)) { 2323 return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) 2324 } 2325 contextTag = context.DataAtom.String() 2326 } 2327 p := &parser{ 2328 tokenizer: NewTokenizerFragment(r, contextTag), 2329 doc: &Node{ 2330 Type: DocumentNode, 2331 }, 2332 scripting: true, 2333 fragment: true, 2334 context: context, 2335 } 2336 2337 root := &Node{ 2338 Type: ElementNode, 2339 DataAtom: a.Html, 2340 Data: a.Html.String(), 2341 } 2342 p.doc.AppendChild(root) 2343 p.oe = nodeStack{root} 2344 if context != nil && context.DataAtom == a.Template { 2345 p.templateStack = append(p.templateStack, inTemplateIM) 2346 } 2347 p.resetInsertionMode() 2348 2349 for n := context; n != nil; n = n.Parent { 2350 if n.Type == ElementNode && n.DataAtom == a.Form { 2351 p.form = n 2352 break 2353 } 2354 } 2355 2356 err := p.parse() 2357 if err != nil { 2358 return nil, err 2359 } 2360 2361 parent := p.doc 2362 if context != nil { 2363 parent = root 2364 } 2365 2366 var result []*Node 2367 for c := parent.FirstChild; c != nil; { 2368 next := c.NextSibling 2369 parent.RemoveChild(c) 2370 result = append(result, c) 2371 c = next 2372 } 2373 return result, nil 2374 }