github.com/Andyfoo/golang/x/net@v0.0.0-20190901054642-57c1bf301704/html/parse.go (about) 1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package html 6 7 import ( 8 "errors" 9 "fmt" 10 "io" 11 "strings" 12 13 a "github.com/Andyfoo/golang/x/net/html/atom" 14 ) 15 16 // A parser implements the HTML5 parsing algorithm: 17 // https://html.spec.whatwg.org/multipage/syntax.html#tree-construction 18 type parser struct { 19 // tokenizer provides the tokens for the parser. 20 tokenizer *Tokenizer 21 // tok is the most recently read token. 22 tok Token 23 // Self-closing tags like <hr/> are treated as start tags, except that 24 // hasSelfClosingToken is set while they are being processed. 25 hasSelfClosingToken bool 26 // doc is the document root element. 27 doc *Node 28 // The stack of open elements (section 12.2.4.2) and active formatting 29 // elements (section 12.2.4.3). 30 oe, afe nodeStack 31 // Element pointers (section 12.2.4.4). 32 head, form *Node 33 // Other parsing state flags (section 12.2.4.5). 34 scripting, framesetOK bool 35 // The stack of template insertion modes 36 templateStack insertionModeStack 37 // im is the current insertion mode. 38 im insertionMode 39 // originalIM is the insertion mode to go back to after completing a text 40 // or inTableText insertion mode. 41 originalIM insertionMode 42 // fosterParenting is whether new elements should be inserted according to 43 // the foster parenting rules (section 12.2.6.1). 44 fosterParenting bool 45 // quirks is whether the parser is operating in "quirks mode." 46 quirks bool 47 // fragment is whether the parser is parsing an HTML fragment. 48 fragment bool 49 // context is the context element when parsing an HTML fragment 50 // (section 12.4). 51 context *Node 52 } 53 54 func (p *parser) top() *Node { 55 if n := p.oe.top(); n != nil { 56 return n 57 } 58 return p.doc 59 } 60 61 // Stop tags for use in popUntil. These come from section 12.2.4.2. 62 var ( 63 defaultScopeStopTags = map[string][]a.Atom{ 64 "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template}, 65 "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext}, 66 "svg": {a.Desc, a.ForeignObject, a.Title}, 67 } 68 ) 69 70 type scope int 71 72 const ( 73 defaultScope scope = iota 74 listItemScope 75 buttonScope 76 tableScope 77 tableRowScope 78 tableBodyScope 79 selectScope 80 ) 81 82 // popUntil pops the stack of open elements at the highest element whose tag 83 // is in matchTags, provided there is no higher element in the scope's stop 84 // tags (as defined in section 12.2.4.2). It returns whether or not there was 85 // such an element. If there was not, popUntil leaves the stack unchanged. 86 // 87 // For example, the set of stop tags for table scope is: "html", "table". If 88 // the stack was: 89 // ["html", "body", "font", "table", "b", "i", "u"] 90 // then popUntil(tableScope, "font") would return false, but 91 // popUntil(tableScope, "i") would return true and the stack would become: 92 // ["html", "body", "font", "table", "b"] 93 // 94 // If an element's tag is in both the stop tags and matchTags, then the stack 95 // will be popped and the function returns true (provided, of course, there was 96 // no higher element in the stack that was also in the stop tags). For example, 97 // popUntil(tableScope, "table") returns true and leaves: 98 // ["html", "body", "font"] 99 func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool { 100 if i := p.indexOfElementInScope(s, matchTags...); i != -1 { 101 p.oe = p.oe[:i] 102 return true 103 } 104 return false 105 } 106 107 // indexOfElementInScope returns the index in p.oe of the highest element whose 108 // tag is in matchTags that is in scope. If no matching element is in scope, it 109 // returns -1. 110 func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int { 111 for i := len(p.oe) - 1; i >= 0; i-- { 112 tagAtom := p.oe[i].DataAtom 113 if p.oe[i].Namespace == "" { 114 for _, t := range matchTags { 115 if t == tagAtom { 116 return i 117 } 118 } 119 switch s { 120 case defaultScope: 121 // No-op. 122 case listItemScope: 123 if tagAtom == a.Ol || tagAtom == a.Ul { 124 return -1 125 } 126 case buttonScope: 127 if tagAtom == a.Button { 128 return -1 129 } 130 case tableScope: 131 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { 132 return -1 133 } 134 case selectScope: 135 if tagAtom != a.Optgroup && tagAtom != a.Option { 136 return -1 137 } 138 default: 139 panic("unreachable") 140 } 141 } 142 switch s { 143 case defaultScope, listItemScope, buttonScope: 144 for _, t := range defaultScopeStopTags[p.oe[i].Namespace] { 145 if t == tagAtom { 146 return -1 147 } 148 } 149 } 150 } 151 return -1 152 } 153 154 // elementInScope is like popUntil, except that it doesn't modify the stack of 155 // open elements. 156 func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool { 157 return p.indexOfElementInScope(s, matchTags...) != -1 158 } 159 160 // clearStackToContext pops elements off the stack of open elements until a 161 // scope-defined element is found. 162 func (p *parser) clearStackToContext(s scope) { 163 for i := len(p.oe) - 1; i >= 0; i-- { 164 tagAtom := p.oe[i].DataAtom 165 switch s { 166 case tableScope: 167 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { 168 p.oe = p.oe[:i+1] 169 return 170 } 171 case tableRowScope: 172 if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template { 173 p.oe = p.oe[:i+1] 174 return 175 } 176 case tableBodyScope: 177 if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template { 178 p.oe = p.oe[:i+1] 179 return 180 } 181 default: 182 panic("unreachable") 183 } 184 } 185 } 186 187 // generateImpliedEndTags pops nodes off the stack of open elements as long as 188 // the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc. 189 // If exceptions are specified, nodes with that name will not be popped off. 190 func (p *parser) generateImpliedEndTags(exceptions ...string) { 191 var i int 192 loop: 193 for i = len(p.oe) - 1; i >= 0; i-- { 194 n := p.oe[i] 195 if n.Type == ElementNode { 196 switch n.DataAtom { 197 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc: 198 for _, except := range exceptions { 199 if n.Data == except { 200 break loop 201 } 202 } 203 continue 204 } 205 } 206 break 207 } 208 209 p.oe = p.oe[:i+1] 210 } 211 212 // addChild adds a child node n to the top element, and pushes n onto the stack 213 // of open elements if it is an element node. 214 func (p *parser) addChild(n *Node) { 215 if p.shouldFosterParent() { 216 p.fosterParent(n) 217 } else { 218 p.top().AppendChild(n) 219 } 220 221 if n.Type == ElementNode { 222 p.oe = append(p.oe, n) 223 } 224 } 225 226 // shouldFosterParent returns whether the next node to be added should be 227 // foster parented. 228 func (p *parser) shouldFosterParent() bool { 229 if p.fosterParenting { 230 switch p.top().DataAtom { 231 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 232 return true 233 } 234 } 235 return false 236 } 237 238 // fosterParent adds a child node according to the foster parenting rules. 239 // Section 12.2.6.1, "foster parenting". 240 func (p *parser) fosterParent(n *Node) { 241 var table, parent, prev, template *Node 242 var i int 243 for i = len(p.oe) - 1; i >= 0; i-- { 244 if p.oe[i].DataAtom == a.Table { 245 table = p.oe[i] 246 break 247 } 248 } 249 250 var j int 251 for j = len(p.oe) - 1; j >= 0; j-- { 252 if p.oe[j].DataAtom == a.Template { 253 template = p.oe[j] 254 break 255 } 256 } 257 258 if template != nil && (table == nil || j > i) { 259 template.AppendChild(n) 260 return 261 } 262 263 if table == nil { 264 // The foster parent is the html element. 265 parent = p.oe[0] 266 } else { 267 parent = table.Parent 268 } 269 if parent == nil { 270 parent = p.oe[i-1] 271 } 272 273 if table != nil { 274 prev = table.PrevSibling 275 } else { 276 prev = parent.LastChild 277 } 278 if prev != nil && prev.Type == TextNode && n.Type == TextNode { 279 prev.Data += n.Data 280 return 281 } 282 283 parent.InsertBefore(n, table) 284 } 285 286 // addText adds text to the preceding node if it is a text node, or else it 287 // calls addChild with a new text node. 288 func (p *parser) addText(text string) { 289 if text == "" { 290 return 291 } 292 293 if p.shouldFosterParent() { 294 p.fosterParent(&Node{ 295 Type: TextNode, 296 Data: text, 297 }) 298 return 299 } 300 301 t := p.top() 302 if n := t.LastChild; n != nil && n.Type == TextNode { 303 n.Data += text 304 return 305 } 306 p.addChild(&Node{ 307 Type: TextNode, 308 Data: text, 309 }) 310 } 311 312 // addElement adds a child element based on the current token. 313 func (p *parser) addElement() { 314 p.addChild(&Node{ 315 Type: ElementNode, 316 DataAtom: p.tok.DataAtom, 317 Data: p.tok.Data, 318 Attr: p.tok.Attr, 319 }) 320 } 321 322 // Section 12.2.4.3. 323 func (p *parser) addFormattingElement() { 324 tagAtom, attr := p.tok.DataAtom, p.tok.Attr 325 p.addElement() 326 327 // Implement the Noah's Ark clause, but with three per family instead of two. 328 identicalElements := 0 329 findIdenticalElements: 330 for i := len(p.afe) - 1; i >= 0; i-- { 331 n := p.afe[i] 332 if n.Type == scopeMarkerNode { 333 break 334 } 335 if n.Type != ElementNode { 336 continue 337 } 338 if n.Namespace != "" { 339 continue 340 } 341 if n.DataAtom != tagAtom { 342 continue 343 } 344 if len(n.Attr) != len(attr) { 345 continue 346 } 347 compareAttributes: 348 for _, t0 := range n.Attr { 349 for _, t1 := range attr { 350 if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val { 351 // Found a match for this attribute, continue with the next attribute. 352 continue compareAttributes 353 } 354 } 355 // If we get here, there is no attribute that matches a. 356 // Therefore the element is not identical to the new one. 357 continue findIdenticalElements 358 } 359 360 identicalElements++ 361 if identicalElements >= 3 { 362 p.afe.remove(n) 363 } 364 } 365 366 p.afe = append(p.afe, p.top()) 367 } 368 369 // Section 12.2.4.3. 370 func (p *parser) clearActiveFormattingElements() { 371 for { 372 n := p.afe.pop() 373 if len(p.afe) == 0 || n.Type == scopeMarkerNode { 374 return 375 } 376 } 377 } 378 379 // Section 12.2.4.3. 380 func (p *parser) reconstructActiveFormattingElements() { 381 n := p.afe.top() 382 if n == nil { 383 return 384 } 385 if n.Type == scopeMarkerNode || p.oe.index(n) != -1 { 386 return 387 } 388 i := len(p.afe) - 1 389 for n.Type != scopeMarkerNode && p.oe.index(n) == -1 { 390 if i == 0 { 391 i = -1 392 break 393 } 394 i-- 395 n = p.afe[i] 396 } 397 for { 398 i++ 399 clone := p.afe[i].clone() 400 p.addChild(clone) 401 p.afe[i] = clone 402 if i == len(p.afe)-1 { 403 break 404 } 405 } 406 } 407 408 // Section 12.2.5. 409 func (p *parser) acknowledgeSelfClosingTag() { 410 p.hasSelfClosingToken = false 411 } 412 413 // An insertion mode (section 12.2.4.1) is the state transition function from 414 // a particular state in the HTML5 parser's state machine. It updates the 415 // parser's fields depending on parser.tok (where ErrorToken means EOF). 416 // It returns whether the token was consumed. 417 type insertionMode func(*parser) bool 418 419 // setOriginalIM sets the insertion mode to return to after completing a text or 420 // inTableText insertion mode. 421 // Section 12.2.4.1, "using the rules for". 422 func (p *parser) setOriginalIM() { 423 if p.originalIM != nil { 424 panic("html: bad parser state: originalIM was set twice") 425 } 426 p.originalIM = p.im 427 } 428 429 // Section 12.2.4.1, "reset the insertion mode". 430 func (p *parser) resetInsertionMode() { 431 for i := len(p.oe) - 1; i >= 0; i-- { 432 n := p.oe[i] 433 last := i == 0 434 if last && p.context != nil { 435 n = p.context 436 } 437 438 switch n.DataAtom { 439 case a.Select: 440 if !last { 441 for ancestor, first := n, p.oe[0]; ancestor != first; { 442 ancestor = p.oe[p.oe.index(ancestor)-1] 443 switch ancestor.DataAtom { 444 case a.Template: 445 p.im = inSelectIM 446 return 447 case a.Table: 448 p.im = inSelectInTableIM 449 return 450 } 451 } 452 } 453 p.im = inSelectIM 454 case a.Td, a.Th: 455 // TODO: remove this divergence from the HTML5 spec. 456 // 457 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 458 p.im = inCellIM 459 case a.Tr: 460 p.im = inRowIM 461 case a.Tbody, a.Thead, a.Tfoot: 462 p.im = inTableBodyIM 463 case a.Caption: 464 p.im = inCaptionIM 465 case a.Colgroup: 466 p.im = inColumnGroupIM 467 case a.Table: 468 p.im = inTableIM 469 case a.Template: 470 // TODO: remove this divergence from the HTML5 spec. 471 if n.Namespace != "" { 472 continue 473 } 474 p.im = p.templateStack.top() 475 case a.Head: 476 // TODO: remove this divergence from the HTML5 spec. 477 // 478 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 479 p.im = inHeadIM 480 case a.Body: 481 p.im = inBodyIM 482 case a.Frameset: 483 p.im = inFramesetIM 484 case a.Html: 485 if p.head == nil { 486 p.im = beforeHeadIM 487 } else { 488 p.im = afterHeadIM 489 } 490 default: 491 if last { 492 p.im = inBodyIM 493 return 494 } 495 continue 496 } 497 return 498 } 499 } 500 501 const whitespace = " \t\r\n\f" 502 503 // Section 12.2.6.4.1. 504 func initialIM(p *parser) bool { 505 switch p.tok.Type { 506 case TextToken: 507 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 508 if len(p.tok.Data) == 0 { 509 // It was all whitespace, so ignore it. 510 return true 511 } 512 case CommentToken: 513 p.doc.AppendChild(&Node{ 514 Type: CommentNode, 515 Data: p.tok.Data, 516 }) 517 return true 518 case DoctypeToken: 519 n, quirks := parseDoctype(p.tok.Data) 520 p.doc.AppendChild(n) 521 p.quirks = quirks 522 p.im = beforeHTMLIM 523 return true 524 } 525 p.quirks = true 526 p.im = beforeHTMLIM 527 return false 528 } 529 530 // Section 12.2.6.4.2. 531 func beforeHTMLIM(p *parser) bool { 532 switch p.tok.Type { 533 case DoctypeToken: 534 // Ignore the token. 535 return true 536 case TextToken: 537 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 538 if len(p.tok.Data) == 0 { 539 // It was all whitespace, so ignore it. 540 return true 541 } 542 case StartTagToken: 543 if p.tok.DataAtom == a.Html { 544 p.addElement() 545 p.im = beforeHeadIM 546 return true 547 } 548 case EndTagToken: 549 switch p.tok.DataAtom { 550 case a.Head, a.Body, a.Html, a.Br: 551 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) 552 return false 553 default: 554 // Ignore the token. 555 return true 556 } 557 case CommentToken: 558 p.doc.AppendChild(&Node{ 559 Type: CommentNode, 560 Data: p.tok.Data, 561 }) 562 return true 563 } 564 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) 565 return false 566 } 567 568 // Section 12.2.6.4.3. 569 func beforeHeadIM(p *parser) bool { 570 switch p.tok.Type { 571 case TextToken: 572 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 573 if len(p.tok.Data) == 0 { 574 // It was all whitespace, so ignore it. 575 return true 576 } 577 case StartTagToken: 578 switch p.tok.DataAtom { 579 case a.Head: 580 p.addElement() 581 p.head = p.top() 582 p.im = inHeadIM 583 return true 584 case a.Html: 585 return inBodyIM(p) 586 } 587 case EndTagToken: 588 switch p.tok.DataAtom { 589 case a.Head, a.Body, a.Html, a.Br: 590 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) 591 return false 592 default: 593 // Ignore the token. 594 return true 595 } 596 case CommentToken: 597 p.addChild(&Node{ 598 Type: CommentNode, 599 Data: p.tok.Data, 600 }) 601 return true 602 case DoctypeToken: 603 // Ignore the token. 604 return true 605 } 606 607 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) 608 return false 609 } 610 611 // Section 12.2.6.4.4. 612 func inHeadIM(p *parser) bool { 613 switch p.tok.Type { 614 case TextToken: 615 s := strings.TrimLeft(p.tok.Data, whitespace) 616 if len(s) < len(p.tok.Data) { 617 // Add the initial whitespace to the current node. 618 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 619 if s == "" { 620 return true 621 } 622 p.tok.Data = s 623 } 624 case StartTagToken: 625 switch p.tok.DataAtom { 626 case a.Html: 627 return inBodyIM(p) 628 case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta: 629 p.addElement() 630 p.oe.pop() 631 p.acknowledgeSelfClosingTag() 632 return true 633 case a.Noscript: 634 p.addElement() 635 if p.scripting { 636 p.setOriginalIM() 637 p.im = textIM 638 } else { 639 p.im = inHeadNoscriptIM 640 } 641 return true 642 case a.Script, a.Title, a.Noframes, a.Style: 643 p.addElement() 644 p.setOriginalIM() 645 p.im = textIM 646 return true 647 case a.Head: 648 // Ignore the token. 649 return true 650 case a.Template: 651 p.addElement() 652 p.afe = append(p.afe, &scopeMarker) 653 p.framesetOK = false 654 p.im = inTemplateIM 655 p.templateStack = append(p.templateStack, inTemplateIM) 656 return true 657 } 658 case EndTagToken: 659 switch p.tok.DataAtom { 660 case a.Head: 661 p.oe.pop() 662 p.im = afterHeadIM 663 return true 664 case a.Body, a.Html, a.Br: 665 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) 666 return false 667 case a.Template: 668 if !p.oe.contains(a.Template) { 669 return true 670 } 671 // TODO: remove this divergence from the HTML5 spec. 672 // 673 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 674 p.generateImpliedEndTags() 675 for i := len(p.oe) - 1; i >= 0; i-- { 676 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { 677 p.oe = p.oe[:i] 678 break 679 } 680 } 681 p.clearActiveFormattingElements() 682 p.templateStack.pop() 683 p.resetInsertionMode() 684 return true 685 default: 686 // Ignore the token. 687 return true 688 } 689 case CommentToken: 690 p.addChild(&Node{ 691 Type: CommentNode, 692 Data: p.tok.Data, 693 }) 694 return true 695 case DoctypeToken: 696 // Ignore the token. 697 return true 698 } 699 700 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) 701 return false 702 } 703 704 // 12.2.6.4.5. 705 func inHeadNoscriptIM(p *parser) bool { 706 switch p.tok.Type { 707 case DoctypeToken: 708 // Ignore the token. 709 return true 710 case StartTagToken: 711 switch p.tok.DataAtom { 712 case a.Html: 713 return inBodyIM(p) 714 case a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Style: 715 return inHeadIM(p) 716 case a.Head, a.Noscript: 717 // Ignore the token. 718 return true 719 } 720 case EndTagToken: 721 switch p.tok.DataAtom { 722 case a.Noscript, a.Br: 723 default: 724 // Ignore the token. 725 return true 726 } 727 case TextToken: 728 s := strings.TrimLeft(p.tok.Data, whitespace) 729 if len(s) == 0 { 730 // It was all whitespace. 731 return inHeadIM(p) 732 } 733 case CommentToken: 734 return inHeadIM(p) 735 } 736 p.oe.pop() 737 if p.top().DataAtom != a.Head { 738 panic("html: the new current node will be a head element.") 739 } 740 p.im = inHeadIM 741 if p.tok.DataAtom == a.Noscript { 742 return true 743 } 744 return false 745 } 746 747 // Section 12.2.6.4.6. 748 func afterHeadIM(p *parser) bool { 749 switch p.tok.Type { 750 case TextToken: 751 s := strings.TrimLeft(p.tok.Data, whitespace) 752 if len(s) < len(p.tok.Data) { 753 // Add the initial whitespace to the current node. 754 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 755 if s == "" { 756 return true 757 } 758 p.tok.Data = s 759 } 760 case StartTagToken: 761 switch p.tok.DataAtom { 762 case a.Html: 763 return inBodyIM(p) 764 case a.Body: 765 p.addElement() 766 p.framesetOK = false 767 p.im = inBodyIM 768 return true 769 case a.Frameset: 770 p.addElement() 771 p.im = inFramesetIM 772 return true 773 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 774 p.oe = append(p.oe, p.head) 775 defer p.oe.remove(p.head) 776 return inHeadIM(p) 777 case a.Head: 778 // Ignore the token. 779 return true 780 } 781 case EndTagToken: 782 switch p.tok.DataAtom { 783 case a.Body, a.Html, a.Br: 784 // Drop down to creating an implied <body> tag. 785 case a.Template: 786 return inHeadIM(p) 787 default: 788 // Ignore the token. 789 return true 790 } 791 case CommentToken: 792 p.addChild(&Node{ 793 Type: CommentNode, 794 Data: p.tok.Data, 795 }) 796 return true 797 case DoctypeToken: 798 // Ignore the token. 799 return true 800 } 801 802 p.parseImpliedToken(StartTagToken, a.Body, a.Body.String()) 803 p.framesetOK = true 804 return false 805 } 806 807 // copyAttributes copies attributes of src not found on dst to dst. 808 func copyAttributes(dst *Node, src Token) { 809 if len(src.Attr) == 0 { 810 return 811 } 812 attr := map[string]string{} 813 for _, t := range dst.Attr { 814 attr[t.Key] = t.Val 815 } 816 for _, t := range src.Attr { 817 if _, ok := attr[t.Key]; !ok { 818 dst.Attr = append(dst.Attr, t) 819 attr[t.Key] = t.Val 820 } 821 } 822 } 823 824 // Section 12.2.6.4.7. 825 func inBodyIM(p *parser) bool { 826 switch p.tok.Type { 827 case TextToken: 828 d := p.tok.Data 829 switch n := p.oe.top(); n.DataAtom { 830 case a.Pre, a.Listing: 831 if n.FirstChild == nil { 832 // Ignore a newline at the start of a <pre> block. 833 if d != "" && d[0] == '\r' { 834 d = d[1:] 835 } 836 if d != "" && d[0] == '\n' { 837 d = d[1:] 838 } 839 } 840 } 841 d = strings.Replace(d, "\x00", "", -1) 842 if d == "" { 843 return true 844 } 845 p.reconstructActiveFormattingElements() 846 p.addText(d) 847 if p.framesetOK && strings.TrimLeft(d, whitespace) != "" { 848 // There were non-whitespace characters inserted. 849 p.framesetOK = false 850 } 851 case StartTagToken: 852 switch p.tok.DataAtom { 853 case a.Html: 854 if p.oe.contains(a.Template) { 855 return true 856 } 857 copyAttributes(p.oe[0], p.tok) 858 case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 859 return inHeadIM(p) 860 case a.Body: 861 if p.oe.contains(a.Template) { 862 return true 863 } 864 if len(p.oe) >= 2 { 865 body := p.oe[1] 866 if body.Type == ElementNode && body.DataAtom == a.Body { 867 p.framesetOK = false 868 copyAttributes(body, p.tok) 869 } 870 } 871 case a.Frameset: 872 if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body { 873 // Ignore the token. 874 return true 875 } 876 body := p.oe[1] 877 if body.Parent != nil { 878 body.Parent.RemoveChild(body) 879 } 880 p.oe = p.oe[:1] 881 p.addElement() 882 p.im = inFramesetIM 883 return true 884 case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul: 885 p.popUntil(buttonScope, a.P) 886 p.addElement() 887 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: 888 p.popUntil(buttonScope, a.P) 889 switch n := p.top(); n.DataAtom { 890 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: 891 p.oe.pop() 892 } 893 p.addElement() 894 case a.Pre, a.Listing: 895 p.popUntil(buttonScope, a.P) 896 p.addElement() 897 // The newline, if any, will be dealt with by the TextToken case. 898 p.framesetOK = false 899 case a.Form: 900 if p.form != nil && !p.oe.contains(a.Template) { 901 // Ignore the token 902 return true 903 } 904 p.popUntil(buttonScope, a.P) 905 p.addElement() 906 if !p.oe.contains(a.Template) { 907 p.form = p.top() 908 } 909 case a.Li: 910 p.framesetOK = false 911 for i := len(p.oe) - 1; i >= 0; i-- { 912 node := p.oe[i] 913 switch node.DataAtom { 914 case a.Li: 915 p.oe = p.oe[:i] 916 case a.Address, a.Div, a.P: 917 continue 918 default: 919 if !isSpecialElement(node) { 920 continue 921 } 922 } 923 break 924 } 925 p.popUntil(buttonScope, a.P) 926 p.addElement() 927 case a.Dd, a.Dt: 928 p.framesetOK = false 929 for i := len(p.oe) - 1; i >= 0; i-- { 930 node := p.oe[i] 931 switch node.DataAtom { 932 case a.Dd, a.Dt: 933 p.oe = p.oe[:i] 934 case a.Address, a.Div, a.P: 935 continue 936 default: 937 if !isSpecialElement(node) { 938 continue 939 } 940 } 941 break 942 } 943 p.popUntil(buttonScope, a.P) 944 p.addElement() 945 case a.Plaintext: 946 p.popUntil(buttonScope, a.P) 947 p.addElement() 948 case a.Button: 949 p.popUntil(defaultScope, a.Button) 950 p.reconstructActiveFormattingElements() 951 p.addElement() 952 p.framesetOK = false 953 case a.A: 954 for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- { 955 if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A { 956 p.inBodyEndTagFormatting(a.A, "a") 957 p.oe.remove(n) 958 p.afe.remove(n) 959 break 960 } 961 } 962 p.reconstructActiveFormattingElements() 963 p.addFormattingElement() 964 case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: 965 p.reconstructActiveFormattingElements() 966 p.addFormattingElement() 967 case a.Nobr: 968 p.reconstructActiveFormattingElements() 969 if p.elementInScope(defaultScope, a.Nobr) { 970 p.inBodyEndTagFormatting(a.Nobr, "nobr") 971 p.reconstructActiveFormattingElements() 972 } 973 p.addFormattingElement() 974 case a.Applet, a.Marquee, a.Object: 975 p.reconstructActiveFormattingElements() 976 p.addElement() 977 p.afe = append(p.afe, &scopeMarker) 978 p.framesetOK = false 979 case a.Table: 980 if !p.quirks { 981 p.popUntil(buttonScope, a.P) 982 } 983 p.addElement() 984 p.framesetOK = false 985 p.im = inTableIM 986 return true 987 case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr: 988 p.reconstructActiveFormattingElements() 989 p.addElement() 990 p.oe.pop() 991 p.acknowledgeSelfClosingTag() 992 if p.tok.DataAtom == a.Input { 993 for _, t := range p.tok.Attr { 994 if t.Key == "type" { 995 if strings.ToLower(t.Val) == "hidden" { 996 // Skip setting framesetOK = false 997 return true 998 } 999 } 1000 } 1001 } 1002 p.framesetOK = false 1003 case a.Param, a.Source, a.Track: 1004 p.addElement() 1005 p.oe.pop() 1006 p.acknowledgeSelfClosingTag() 1007 case a.Hr: 1008 p.popUntil(buttonScope, a.P) 1009 p.addElement() 1010 p.oe.pop() 1011 p.acknowledgeSelfClosingTag() 1012 p.framesetOK = false 1013 case a.Image: 1014 p.tok.DataAtom = a.Img 1015 p.tok.Data = a.Img.String() 1016 return false 1017 case a.Isindex: 1018 if p.form != nil { 1019 // Ignore the token. 1020 return true 1021 } 1022 action := "" 1023 prompt := "This is a searchable index. Enter search keywords: " 1024 attr := []Attribute{{Key: "name", Val: "isindex"}} 1025 for _, t := range p.tok.Attr { 1026 switch t.Key { 1027 case "action": 1028 action = t.Val 1029 case "name": 1030 // Ignore the attribute. 1031 case "prompt": 1032 prompt = t.Val 1033 default: 1034 attr = append(attr, t) 1035 } 1036 } 1037 p.acknowledgeSelfClosingTag() 1038 p.popUntil(buttonScope, a.P) 1039 p.parseImpliedToken(StartTagToken, a.Form, a.Form.String()) 1040 if p.form == nil { 1041 // NOTE: The 'isindex' element has been removed, 1042 // and the 'template' element has not been designed to be 1043 // collaborative with the index element. 1044 // 1045 // Ignore the token. 1046 return true 1047 } 1048 if action != "" { 1049 p.form.Attr = []Attribute{{Key: "action", Val: action}} 1050 } 1051 p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String()) 1052 p.parseImpliedToken(StartTagToken, a.Label, a.Label.String()) 1053 p.addText(prompt) 1054 p.addChild(&Node{ 1055 Type: ElementNode, 1056 DataAtom: a.Input, 1057 Data: a.Input.String(), 1058 Attr: attr, 1059 }) 1060 p.oe.pop() 1061 p.parseImpliedToken(EndTagToken, a.Label, a.Label.String()) 1062 p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String()) 1063 p.parseImpliedToken(EndTagToken, a.Form, a.Form.String()) 1064 case a.Textarea: 1065 p.addElement() 1066 p.setOriginalIM() 1067 p.framesetOK = false 1068 p.im = textIM 1069 case a.Xmp: 1070 p.popUntil(buttonScope, a.P) 1071 p.reconstructActiveFormattingElements() 1072 p.framesetOK = false 1073 p.addElement() 1074 p.setOriginalIM() 1075 p.im = textIM 1076 case a.Iframe: 1077 p.framesetOK = false 1078 p.addElement() 1079 p.setOriginalIM() 1080 p.im = textIM 1081 case a.Noembed, a.Noscript: 1082 p.addElement() 1083 p.setOriginalIM() 1084 p.im = textIM 1085 case a.Select: 1086 p.reconstructActiveFormattingElements() 1087 p.addElement() 1088 p.framesetOK = false 1089 p.im = inSelectIM 1090 return true 1091 case a.Optgroup, a.Option: 1092 if p.top().DataAtom == a.Option { 1093 p.oe.pop() 1094 } 1095 p.reconstructActiveFormattingElements() 1096 p.addElement() 1097 case a.Rb, a.Rtc: 1098 if p.elementInScope(defaultScope, a.Ruby) { 1099 p.generateImpliedEndTags() 1100 } 1101 p.addElement() 1102 case a.Rp, a.Rt: 1103 if p.elementInScope(defaultScope, a.Ruby) { 1104 p.generateImpliedEndTags("rtc") 1105 } 1106 p.addElement() 1107 case a.Math, a.Svg: 1108 p.reconstructActiveFormattingElements() 1109 if p.tok.DataAtom == a.Math { 1110 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) 1111 } else { 1112 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) 1113 } 1114 adjustForeignAttributes(p.tok.Attr) 1115 p.addElement() 1116 p.top().Namespace = p.tok.Data 1117 if p.hasSelfClosingToken { 1118 p.oe.pop() 1119 p.acknowledgeSelfClosingTag() 1120 } 1121 return true 1122 case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1123 // Ignore the token. 1124 default: 1125 p.reconstructActiveFormattingElements() 1126 p.addElement() 1127 } 1128 case EndTagToken: 1129 switch p.tok.DataAtom { 1130 case a.Body: 1131 if p.elementInScope(defaultScope, a.Body) { 1132 p.im = afterBodyIM 1133 } 1134 case a.Html: 1135 if p.elementInScope(defaultScope, a.Body) { 1136 p.parseImpliedToken(EndTagToken, a.Body, a.Body.String()) 1137 return false 1138 } 1139 return true 1140 case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul: 1141 p.popUntil(defaultScope, p.tok.DataAtom) 1142 case a.Form: 1143 if p.oe.contains(a.Template) { 1144 i := p.indexOfElementInScope(defaultScope, a.Form) 1145 if i == -1 { 1146 // Ignore the token. 1147 return true 1148 } 1149 p.generateImpliedEndTags() 1150 if p.oe[i].DataAtom != a.Form { 1151 // Ignore the token. 1152 return true 1153 } 1154 p.popUntil(defaultScope, a.Form) 1155 } else { 1156 node := p.form 1157 p.form = nil 1158 i := p.indexOfElementInScope(defaultScope, a.Form) 1159 if node == nil || i == -1 || p.oe[i] != node { 1160 // Ignore the token. 1161 return true 1162 } 1163 p.generateImpliedEndTags() 1164 p.oe.remove(node) 1165 } 1166 case a.P: 1167 if !p.elementInScope(buttonScope, a.P) { 1168 p.parseImpliedToken(StartTagToken, a.P, a.P.String()) 1169 } 1170 p.popUntil(buttonScope, a.P) 1171 case a.Li: 1172 p.popUntil(listItemScope, a.Li) 1173 case a.Dd, a.Dt: 1174 p.popUntil(defaultScope, p.tok.DataAtom) 1175 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: 1176 p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6) 1177 case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: 1178 p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data) 1179 case a.Applet, a.Marquee, a.Object: 1180 if p.popUntil(defaultScope, p.tok.DataAtom) { 1181 p.clearActiveFormattingElements() 1182 } 1183 case a.Br: 1184 p.tok.Type = StartTagToken 1185 return false 1186 case a.Template: 1187 return inHeadIM(p) 1188 default: 1189 p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data) 1190 } 1191 case CommentToken: 1192 p.addChild(&Node{ 1193 Type: CommentNode, 1194 Data: p.tok.Data, 1195 }) 1196 case ErrorToken: 1197 // TODO: remove this divergence from the HTML5 spec. 1198 if len(p.templateStack) > 0 { 1199 p.im = inTemplateIM 1200 return false 1201 } else { 1202 for _, e := range p.oe { 1203 switch e.DataAtom { 1204 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th, 1205 a.Thead, a.Tr, a.Body, a.Html: 1206 default: 1207 return true 1208 } 1209 } 1210 } 1211 } 1212 1213 return true 1214 } 1215 1216 func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) { 1217 // This is the "adoption agency" algorithm, described at 1218 // https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency 1219 1220 // TODO: this is a fairly literal line-by-line translation of that algorithm. 1221 // Once the code successfully parses the comprehensive test suite, we should 1222 // refactor this code to be more idiomatic. 1223 1224 // Steps 1-4. The outer loop. 1225 for i := 0; i < 8; i++ { 1226 // Step 5. Find the formatting element. 1227 var formattingElement *Node 1228 for j := len(p.afe) - 1; j >= 0; j-- { 1229 if p.afe[j].Type == scopeMarkerNode { 1230 break 1231 } 1232 if p.afe[j].DataAtom == tagAtom { 1233 formattingElement = p.afe[j] 1234 break 1235 } 1236 } 1237 if formattingElement == nil { 1238 p.inBodyEndTagOther(tagAtom, tagName) 1239 return 1240 } 1241 feIndex := p.oe.index(formattingElement) 1242 if feIndex == -1 { 1243 p.afe.remove(formattingElement) 1244 return 1245 } 1246 if !p.elementInScope(defaultScope, tagAtom) { 1247 // Ignore the tag. 1248 return 1249 } 1250 1251 // Steps 9-10. Find the furthest block. 1252 var furthestBlock *Node 1253 for _, e := range p.oe[feIndex:] { 1254 if isSpecialElement(e) { 1255 furthestBlock = e 1256 break 1257 } 1258 } 1259 if furthestBlock == nil { 1260 e := p.oe.pop() 1261 for e != formattingElement { 1262 e = p.oe.pop() 1263 } 1264 p.afe.remove(e) 1265 return 1266 } 1267 1268 // Steps 11-12. Find the common ancestor and bookmark node. 1269 commonAncestor := p.oe[feIndex-1] 1270 bookmark := p.afe.index(formattingElement) 1271 1272 // Step 13. The inner loop. Find the lastNode to reparent. 1273 lastNode := furthestBlock 1274 node := furthestBlock 1275 x := p.oe.index(node) 1276 // Steps 13.1-13.2 1277 for j := 0; j < 3; j++ { 1278 // Step 13.3. 1279 x-- 1280 node = p.oe[x] 1281 // Step 13.4 - 13.5. 1282 if p.afe.index(node) == -1 { 1283 p.oe.remove(node) 1284 continue 1285 } 1286 // Step 13.6. 1287 if node == formattingElement { 1288 break 1289 } 1290 // Step 13.7. 1291 clone := node.clone() 1292 p.afe[p.afe.index(node)] = clone 1293 p.oe[p.oe.index(node)] = clone 1294 node = clone 1295 // Step 13.8. 1296 if lastNode == furthestBlock { 1297 bookmark = p.afe.index(node) + 1 1298 } 1299 // Step 13.9. 1300 if lastNode.Parent != nil { 1301 lastNode.Parent.RemoveChild(lastNode) 1302 } 1303 node.AppendChild(lastNode) 1304 // Step 13.10. 1305 lastNode = node 1306 } 1307 1308 // Step 14. Reparent lastNode to the common ancestor, 1309 // or for misnested table nodes, to the foster parent. 1310 if lastNode.Parent != nil { 1311 lastNode.Parent.RemoveChild(lastNode) 1312 } 1313 switch commonAncestor.DataAtom { 1314 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1315 p.fosterParent(lastNode) 1316 default: 1317 commonAncestor.AppendChild(lastNode) 1318 } 1319 1320 // Steps 15-17. Reparent nodes from the furthest block's children 1321 // to a clone of the formatting element. 1322 clone := formattingElement.clone() 1323 reparentChildren(clone, furthestBlock) 1324 furthestBlock.AppendChild(clone) 1325 1326 // Step 18. Fix up the list of active formatting elements. 1327 if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark { 1328 // Move the bookmark with the rest of the list. 1329 bookmark-- 1330 } 1331 p.afe.remove(formattingElement) 1332 p.afe.insert(bookmark, clone) 1333 1334 // Step 19. Fix up the stack of open elements. 1335 p.oe.remove(formattingElement) 1336 p.oe.insert(p.oe.index(furthestBlock)+1, clone) 1337 } 1338 } 1339 1340 // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM. 1341 // "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content 1342 // https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign 1343 func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) { 1344 for i := len(p.oe) - 1; i >= 0; i-- { 1345 // Two element nodes have the same tag if they have the same Data (a 1346 // string-typed field). As an optimization, for common HTML tags, each 1347 // Data string is assigned a unique, non-zero DataAtom (a uint32-typed 1348 // field), since integer comparison is faster than string comparison. 1349 // Uncommon (custom) tags get a zero DataAtom. 1350 // 1351 // The if condition here is equivalent to (p.oe[i].Data == tagName). 1352 if (p.oe[i].DataAtom == tagAtom) && 1353 ((tagAtom != 0) || (p.oe[i].Data == tagName)) { 1354 p.oe = p.oe[:i] 1355 break 1356 } 1357 if isSpecialElement(p.oe[i]) { 1358 break 1359 } 1360 } 1361 } 1362 1363 // Section 12.2.6.4.8. 1364 func textIM(p *parser) bool { 1365 switch p.tok.Type { 1366 case ErrorToken: 1367 p.oe.pop() 1368 case TextToken: 1369 d := p.tok.Data 1370 if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil { 1371 // Ignore a newline at the start of a <textarea> block. 1372 if d != "" && d[0] == '\r' { 1373 d = d[1:] 1374 } 1375 if d != "" && d[0] == '\n' { 1376 d = d[1:] 1377 } 1378 } 1379 if d == "" { 1380 return true 1381 } 1382 p.addText(d) 1383 return true 1384 case EndTagToken: 1385 p.oe.pop() 1386 } 1387 p.im = p.originalIM 1388 p.originalIM = nil 1389 return p.tok.Type == EndTagToken 1390 } 1391 1392 // Section 12.2.6.4.9. 1393 func inTableIM(p *parser) bool { 1394 switch p.tok.Type { 1395 case TextToken: 1396 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1) 1397 switch p.oe.top().DataAtom { 1398 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1399 if strings.Trim(p.tok.Data, whitespace) == "" { 1400 p.addText(p.tok.Data) 1401 return true 1402 } 1403 } 1404 case StartTagToken: 1405 switch p.tok.DataAtom { 1406 case a.Caption: 1407 p.clearStackToContext(tableScope) 1408 p.afe = append(p.afe, &scopeMarker) 1409 p.addElement() 1410 p.im = inCaptionIM 1411 return true 1412 case a.Colgroup: 1413 p.clearStackToContext(tableScope) 1414 p.addElement() 1415 p.im = inColumnGroupIM 1416 return true 1417 case a.Col: 1418 p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String()) 1419 return false 1420 case a.Tbody, a.Tfoot, a.Thead: 1421 p.clearStackToContext(tableScope) 1422 p.addElement() 1423 p.im = inTableBodyIM 1424 return true 1425 case a.Td, a.Th, a.Tr: 1426 p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String()) 1427 return false 1428 case a.Table: 1429 if p.popUntil(tableScope, a.Table) { 1430 p.resetInsertionMode() 1431 return false 1432 } 1433 // Ignore the token. 1434 return true 1435 case a.Style, a.Script, a.Template: 1436 return inHeadIM(p) 1437 case a.Input: 1438 for _, t := range p.tok.Attr { 1439 if t.Key == "type" && strings.ToLower(t.Val) == "hidden" { 1440 p.addElement() 1441 p.oe.pop() 1442 return true 1443 } 1444 } 1445 // Otherwise drop down to the default action. 1446 case a.Form: 1447 if p.oe.contains(a.Template) || p.form != nil { 1448 // Ignore the token. 1449 return true 1450 } 1451 p.addElement() 1452 p.form = p.oe.pop() 1453 case a.Select: 1454 p.reconstructActiveFormattingElements() 1455 switch p.top().DataAtom { 1456 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1457 p.fosterParenting = true 1458 } 1459 p.addElement() 1460 p.fosterParenting = false 1461 p.framesetOK = false 1462 p.im = inSelectInTableIM 1463 return true 1464 } 1465 case EndTagToken: 1466 switch p.tok.DataAtom { 1467 case a.Table: 1468 if p.popUntil(tableScope, a.Table) { 1469 p.resetInsertionMode() 1470 return true 1471 } 1472 // Ignore the token. 1473 return true 1474 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1475 // Ignore the token. 1476 return true 1477 case a.Template: 1478 return inHeadIM(p) 1479 } 1480 case CommentToken: 1481 p.addChild(&Node{ 1482 Type: CommentNode, 1483 Data: p.tok.Data, 1484 }) 1485 return true 1486 case DoctypeToken: 1487 // Ignore the token. 1488 return true 1489 case ErrorToken: 1490 return inBodyIM(p) 1491 } 1492 1493 p.fosterParenting = true 1494 defer func() { p.fosterParenting = false }() 1495 1496 return inBodyIM(p) 1497 } 1498 1499 // Section 12.2.6.4.11. 1500 func inCaptionIM(p *parser) bool { 1501 switch p.tok.Type { 1502 case StartTagToken: 1503 switch p.tok.DataAtom { 1504 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr: 1505 if p.popUntil(tableScope, a.Caption) { 1506 p.clearActiveFormattingElements() 1507 p.im = inTableIM 1508 return false 1509 } else { 1510 // Ignore the token. 1511 return true 1512 } 1513 case a.Select: 1514 p.reconstructActiveFormattingElements() 1515 p.addElement() 1516 p.framesetOK = false 1517 p.im = inSelectInTableIM 1518 return true 1519 } 1520 case EndTagToken: 1521 switch p.tok.DataAtom { 1522 case a.Caption: 1523 if p.popUntil(tableScope, a.Caption) { 1524 p.clearActiveFormattingElements() 1525 p.im = inTableIM 1526 } 1527 return true 1528 case a.Table: 1529 if p.popUntil(tableScope, a.Caption) { 1530 p.clearActiveFormattingElements() 1531 p.im = inTableIM 1532 return false 1533 } else { 1534 // Ignore the token. 1535 return true 1536 } 1537 case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1538 // Ignore the token. 1539 return true 1540 } 1541 } 1542 return inBodyIM(p) 1543 } 1544 1545 // Section 12.2.6.4.12. 1546 func inColumnGroupIM(p *parser) bool { 1547 switch p.tok.Type { 1548 case TextToken: 1549 s := strings.TrimLeft(p.tok.Data, whitespace) 1550 if len(s) < len(p.tok.Data) { 1551 // Add the initial whitespace to the current node. 1552 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 1553 if s == "" { 1554 return true 1555 } 1556 p.tok.Data = s 1557 } 1558 case CommentToken: 1559 p.addChild(&Node{ 1560 Type: CommentNode, 1561 Data: p.tok.Data, 1562 }) 1563 return true 1564 case DoctypeToken: 1565 // Ignore the token. 1566 return true 1567 case StartTagToken: 1568 switch p.tok.DataAtom { 1569 case a.Html: 1570 return inBodyIM(p) 1571 case a.Col: 1572 p.addElement() 1573 p.oe.pop() 1574 p.acknowledgeSelfClosingTag() 1575 return true 1576 case a.Template: 1577 return inHeadIM(p) 1578 } 1579 case EndTagToken: 1580 switch p.tok.DataAtom { 1581 case a.Colgroup: 1582 if p.oe.top().DataAtom == a.Colgroup { 1583 p.oe.pop() 1584 p.im = inTableIM 1585 } 1586 return true 1587 case a.Col: 1588 // Ignore the token. 1589 return true 1590 case a.Template: 1591 return inHeadIM(p) 1592 } 1593 case ErrorToken: 1594 return inBodyIM(p) 1595 } 1596 if p.oe.top().DataAtom != a.Colgroup { 1597 return true 1598 } 1599 p.oe.pop() 1600 p.im = inTableIM 1601 return false 1602 } 1603 1604 // Section 12.2.6.4.13. 1605 func inTableBodyIM(p *parser) bool { 1606 switch p.tok.Type { 1607 case StartTagToken: 1608 switch p.tok.DataAtom { 1609 case a.Tr: 1610 p.clearStackToContext(tableBodyScope) 1611 p.addElement() 1612 p.im = inRowIM 1613 return true 1614 case a.Td, a.Th: 1615 p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String()) 1616 return false 1617 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1618 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1619 p.im = inTableIM 1620 return false 1621 } 1622 // Ignore the token. 1623 return true 1624 } 1625 case EndTagToken: 1626 switch p.tok.DataAtom { 1627 case a.Tbody, a.Tfoot, a.Thead: 1628 if p.elementInScope(tableScope, p.tok.DataAtom) { 1629 p.clearStackToContext(tableBodyScope) 1630 p.oe.pop() 1631 p.im = inTableIM 1632 } 1633 return true 1634 case a.Table: 1635 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1636 p.im = inTableIM 1637 return false 1638 } 1639 // Ignore the token. 1640 return true 1641 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr: 1642 // Ignore the token. 1643 return true 1644 } 1645 case CommentToken: 1646 p.addChild(&Node{ 1647 Type: CommentNode, 1648 Data: p.tok.Data, 1649 }) 1650 return true 1651 } 1652 1653 return inTableIM(p) 1654 } 1655 1656 // Section 12.2.6.4.14. 1657 func inRowIM(p *parser) bool { 1658 switch p.tok.Type { 1659 case StartTagToken: 1660 switch p.tok.DataAtom { 1661 case a.Td, a.Th: 1662 p.clearStackToContext(tableRowScope) 1663 p.addElement() 1664 p.afe = append(p.afe, &scopeMarker) 1665 p.im = inCellIM 1666 return true 1667 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1668 if p.popUntil(tableScope, a.Tr) { 1669 p.im = inTableBodyIM 1670 return false 1671 } 1672 // Ignore the token. 1673 return true 1674 } 1675 case EndTagToken: 1676 switch p.tok.DataAtom { 1677 case a.Tr: 1678 if p.popUntil(tableScope, a.Tr) { 1679 p.im = inTableBodyIM 1680 return true 1681 } 1682 // Ignore the token. 1683 return true 1684 case a.Table: 1685 if p.popUntil(tableScope, a.Tr) { 1686 p.im = inTableBodyIM 1687 return false 1688 } 1689 // Ignore the token. 1690 return true 1691 case a.Tbody, a.Tfoot, a.Thead: 1692 if p.elementInScope(tableScope, p.tok.DataAtom) { 1693 p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String()) 1694 return false 1695 } 1696 // Ignore the token. 1697 return true 1698 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th: 1699 // Ignore the token. 1700 return true 1701 } 1702 } 1703 1704 return inTableIM(p) 1705 } 1706 1707 // Section 12.2.6.4.15. 1708 func inCellIM(p *parser) bool { 1709 switch p.tok.Type { 1710 case StartTagToken: 1711 switch p.tok.DataAtom { 1712 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1713 if p.popUntil(tableScope, a.Td, a.Th) { 1714 // Close the cell and reprocess. 1715 p.clearActiveFormattingElements() 1716 p.im = inRowIM 1717 return false 1718 } 1719 // Ignore the token. 1720 return true 1721 case a.Select: 1722 p.reconstructActiveFormattingElements() 1723 p.addElement() 1724 p.framesetOK = false 1725 p.im = inSelectInTableIM 1726 return true 1727 } 1728 case EndTagToken: 1729 switch p.tok.DataAtom { 1730 case a.Td, a.Th: 1731 if !p.popUntil(tableScope, p.tok.DataAtom) { 1732 // Ignore the token. 1733 return true 1734 } 1735 p.clearActiveFormattingElements() 1736 p.im = inRowIM 1737 return true 1738 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html: 1739 // Ignore the token. 1740 return true 1741 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1742 if !p.elementInScope(tableScope, p.tok.DataAtom) { 1743 // Ignore the token. 1744 return true 1745 } 1746 // Close the cell and reprocess. 1747 if p.popUntil(tableScope, a.Td, a.Th) { 1748 p.clearActiveFormattingElements() 1749 } 1750 p.im = inRowIM 1751 return false 1752 } 1753 } 1754 return inBodyIM(p) 1755 } 1756 1757 // Section 12.2.6.4.16. 1758 func inSelectIM(p *parser) bool { 1759 switch p.tok.Type { 1760 case TextToken: 1761 p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) 1762 case StartTagToken: 1763 switch p.tok.DataAtom { 1764 case a.Html: 1765 return inBodyIM(p) 1766 case a.Option: 1767 if p.top().DataAtom == a.Option { 1768 p.oe.pop() 1769 } 1770 p.addElement() 1771 case a.Optgroup: 1772 if p.top().DataAtom == a.Option { 1773 p.oe.pop() 1774 } 1775 if p.top().DataAtom == a.Optgroup { 1776 p.oe.pop() 1777 } 1778 p.addElement() 1779 case a.Select: 1780 if p.popUntil(selectScope, a.Select) { 1781 p.resetInsertionMode() 1782 } else { 1783 // Ignore the token. 1784 return true 1785 } 1786 case a.Input, a.Keygen, a.Textarea: 1787 if p.elementInScope(selectScope, a.Select) { 1788 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) 1789 return false 1790 } 1791 // In order to properly ignore <textarea>, we need to change the tokenizer mode. 1792 p.tokenizer.NextIsNotRawText() 1793 // Ignore the token. 1794 return true 1795 case a.Script, a.Template: 1796 return inHeadIM(p) 1797 } 1798 case EndTagToken: 1799 switch p.tok.DataAtom { 1800 case a.Option: 1801 if p.top().DataAtom == a.Option { 1802 p.oe.pop() 1803 } 1804 case a.Optgroup: 1805 i := len(p.oe) - 1 1806 if p.oe[i].DataAtom == a.Option { 1807 i-- 1808 } 1809 if p.oe[i].DataAtom == a.Optgroup { 1810 p.oe = p.oe[:i] 1811 } 1812 case a.Select: 1813 if p.popUntil(selectScope, a.Select) { 1814 p.resetInsertionMode() 1815 } else { 1816 // Ignore the token. 1817 return true 1818 } 1819 case a.Template: 1820 return inHeadIM(p) 1821 } 1822 case CommentToken: 1823 p.addChild(&Node{ 1824 Type: CommentNode, 1825 Data: p.tok.Data, 1826 }) 1827 case DoctypeToken: 1828 // Ignore the token. 1829 return true 1830 case ErrorToken: 1831 return inBodyIM(p) 1832 } 1833 1834 return true 1835 } 1836 1837 // Section 12.2.6.4.17. 1838 func inSelectInTableIM(p *parser) bool { 1839 switch p.tok.Type { 1840 case StartTagToken, EndTagToken: 1841 switch p.tok.DataAtom { 1842 case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th: 1843 if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) { 1844 // Ignore the token. 1845 return true 1846 } 1847 // This is like p.popUntil(selectScope, a.Select), but it also 1848 // matches <math select>, not just <select>. Matching the MathML 1849 // tag is arguably incorrect (conceptually), but it mimics what 1850 // Chromium does. 1851 for i := len(p.oe) - 1; i >= 0; i-- { 1852 if n := p.oe[i]; n.DataAtom == a.Select { 1853 p.oe = p.oe[:i] 1854 break 1855 } 1856 } 1857 p.resetInsertionMode() 1858 return false 1859 } 1860 } 1861 return inSelectIM(p) 1862 } 1863 1864 // Section 12.2.6.4.18. 1865 func inTemplateIM(p *parser) bool { 1866 switch p.tok.Type { 1867 case TextToken, CommentToken, DoctypeToken: 1868 return inBodyIM(p) 1869 case StartTagToken: 1870 switch p.tok.DataAtom { 1871 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 1872 return inHeadIM(p) 1873 case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1874 p.templateStack.pop() 1875 p.templateStack = append(p.templateStack, inTableIM) 1876 p.im = inTableIM 1877 return false 1878 case a.Col: 1879 p.templateStack.pop() 1880 p.templateStack = append(p.templateStack, inColumnGroupIM) 1881 p.im = inColumnGroupIM 1882 return false 1883 case a.Tr: 1884 p.templateStack.pop() 1885 p.templateStack = append(p.templateStack, inTableBodyIM) 1886 p.im = inTableBodyIM 1887 return false 1888 case a.Td, a.Th: 1889 p.templateStack.pop() 1890 p.templateStack = append(p.templateStack, inRowIM) 1891 p.im = inRowIM 1892 return false 1893 default: 1894 p.templateStack.pop() 1895 p.templateStack = append(p.templateStack, inBodyIM) 1896 p.im = inBodyIM 1897 return false 1898 } 1899 case EndTagToken: 1900 switch p.tok.DataAtom { 1901 case a.Template: 1902 return inHeadIM(p) 1903 default: 1904 // Ignore the token. 1905 return true 1906 } 1907 case ErrorToken: 1908 if !p.oe.contains(a.Template) { 1909 // Ignore the token. 1910 return true 1911 } 1912 // TODO: remove this divergence from the HTML5 spec. 1913 // 1914 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 1915 p.generateImpliedEndTags() 1916 for i := len(p.oe) - 1; i >= 0; i-- { 1917 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { 1918 p.oe = p.oe[:i] 1919 break 1920 } 1921 } 1922 p.clearActiveFormattingElements() 1923 p.templateStack.pop() 1924 p.resetInsertionMode() 1925 return false 1926 } 1927 return false 1928 } 1929 1930 // Section 12.2.6.4.19. 1931 func afterBodyIM(p *parser) bool { 1932 switch p.tok.Type { 1933 case ErrorToken: 1934 // Stop parsing. 1935 return true 1936 case TextToken: 1937 s := strings.TrimLeft(p.tok.Data, whitespace) 1938 if len(s) == 0 { 1939 // It was all whitespace. 1940 return inBodyIM(p) 1941 } 1942 case StartTagToken: 1943 if p.tok.DataAtom == a.Html { 1944 return inBodyIM(p) 1945 } 1946 case EndTagToken: 1947 if p.tok.DataAtom == a.Html { 1948 if !p.fragment { 1949 p.im = afterAfterBodyIM 1950 } 1951 return true 1952 } 1953 case CommentToken: 1954 // The comment is attached to the <html> element. 1955 if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html { 1956 panic("html: bad parser state: <html> element not found, in the after-body insertion mode") 1957 } 1958 p.oe[0].AppendChild(&Node{ 1959 Type: CommentNode, 1960 Data: p.tok.Data, 1961 }) 1962 return true 1963 } 1964 p.im = inBodyIM 1965 return false 1966 } 1967 1968 // Section 12.2.6.4.20. 1969 func inFramesetIM(p *parser) bool { 1970 switch p.tok.Type { 1971 case CommentToken: 1972 p.addChild(&Node{ 1973 Type: CommentNode, 1974 Data: p.tok.Data, 1975 }) 1976 case TextToken: 1977 // Ignore all text but whitespace. 1978 s := strings.Map(func(c rune) rune { 1979 switch c { 1980 case ' ', '\t', '\n', '\f', '\r': 1981 return c 1982 } 1983 return -1 1984 }, p.tok.Data) 1985 if s != "" { 1986 p.addText(s) 1987 } 1988 case StartTagToken: 1989 switch p.tok.DataAtom { 1990 case a.Html: 1991 return inBodyIM(p) 1992 case a.Frameset: 1993 p.addElement() 1994 case a.Frame: 1995 p.addElement() 1996 p.oe.pop() 1997 p.acknowledgeSelfClosingTag() 1998 case a.Noframes: 1999 return inHeadIM(p) 2000 } 2001 case EndTagToken: 2002 switch p.tok.DataAtom { 2003 case a.Frameset: 2004 if p.oe.top().DataAtom != a.Html { 2005 p.oe.pop() 2006 if p.oe.top().DataAtom != a.Frameset { 2007 p.im = afterFramesetIM 2008 return true 2009 } 2010 } 2011 } 2012 default: 2013 // Ignore the token. 2014 } 2015 return true 2016 } 2017 2018 // Section 12.2.6.4.21. 2019 func afterFramesetIM(p *parser) bool { 2020 switch p.tok.Type { 2021 case CommentToken: 2022 p.addChild(&Node{ 2023 Type: CommentNode, 2024 Data: p.tok.Data, 2025 }) 2026 case TextToken: 2027 // Ignore all text but whitespace. 2028 s := strings.Map(func(c rune) rune { 2029 switch c { 2030 case ' ', '\t', '\n', '\f', '\r': 2031 return c 2032 } 2033 return -1 2034 }, p.tok.Data) 2035 if s != "" { 2036 p.addText(s) 2037 } 2038 case StartTagToken: 2039 switch p.tok.DataAtom { 2040 case a.Html: 2041 return inBodyIM(p) 2042 case a.Noframes: 2043 return inHeadIM(p) 2044 } 2045 case EndTagToken: 2046 switch p.tok.DataAtom { 2047 case a.Html: 2048 p.im = afterAfterFramesetIM 2049 return true 2050 } 2051 default: 2052 // Ignore the token. 2053 } 2054 return true 2055 } 2056 2057 // Section 12.2.6.4.22. 2058 func afterAfterBodyIM(p *parser) bool { 2059 switch p.tok.Type { 2060 case ErrorToken: 2061 // Stop parsing. 2062 return true 2063 case TextToken: 2064 s := strings.TrimLeft(p.tok.Data, whitespace) 2065 if len(s) == 0 { 2066 // It was all whitespace. 2067 return inBodyIM(p) 2068 } 2069 case StartTagToken: 2070 if p.tok.DataAtom == a.Html { 2071 return inBodyIM(p) 2072 } 2073 case CommentToken: 2074 p.doc.AppendChild(&Node{ 2075 Type: CommentNode, 2076 Data: p.tok.Data, 2077 }) 2078 return true 2079 case DoctypeToken: 2080 return inBodyIM(p) 2081 } 2082 p.im = inBodyIM 2083 return false 2084 } 2085 2086 // Section 12.2.6.4.23. 2087 func afterAfterFramesetIM(p *parser) bool { 2088 switch p.tok.Type { 2089 case CommentToken: 2090 p.doc.AppendChild(&Node{ 2091 Type: CommentNode, 2092 Data: p.tok.Data, 2093 }) 2094 case TextToken: 2095 // Ignore all text but whitespace. 2096 s := strings.Map(func(c rune) rune { 2097 switch c { 2098 case ' ', '\t', '\n', '\f', '\r': 2099 return c 2100 } 2101 return -1 2102 }, p.tok.Data) 2103 if s != "" { 2104 p.tok.Data = s 2105 return inBodyIM(p) 2106 } 2107 case StartTagToken: 2108 switch p.tok.DataAtom { 2109 case a.Html: 2110 return inBodyIM(p) 2111 case a.Noframes: 2112 return inHeadIM(p) 2113 } 2114 case DoctypeToken: 2115 return inBodyIM(p) 2116 default: 2117 // Ignore the token. 2118 } 2119 return true 2120 } 2121 2122 const whitespaceOrNUL = whitespace + "\x00" 2123 2124 // Section 12.2.6.5 2125 func parseForeignContent(p *parser) bool { 2126 switch p.tok.Type { 2127 case TextToken: 2128 if p.framesetOK { 2129 p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" 2130 } 2131 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) 2132 p.addText(p.tok.Data) 2133 case CommentToken: 2134 p.addChild(&Node{ 2135 Type: CommentNode, 2136 Data: p.tok.Data, 2137 }) 2138 case StartTagToken: 2139 b := breakout[p.tok.Data] 2140 if p.tok.DataAtom == a.Font { 2141 loop: 2142 for _, attr := range p.tok.Attr { 2143 switch attr.Key { 2144 case "color", "face", "size": 2145 b = true 2146 break loop 2147 } 2148 } 2149 } 2150 if b { 2151 for i := len(p.oe) - 1; i >= 0; i-- { 2152 n := p.oe[i] 2153 if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { 2154 p.oe = p.oe[:i+1] 2155 break 2156 } 2157 } 2158 return false 2159 } 2160 switch p.top().Namespace { 2161 case "math": 2162 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) 2163 case "svg": 2164 // Adjust SVG tag names. The tokenizer lower-cases tag names, but 2165 // SVG wants e.g. "foreignObject" with a capital second "O". 2166 if x := svgTagNameAdjustments[p.tok.Data]; x != "" { 2167 p.tok.DataAtom = a.Lookup([]byte(x)) 2168 p.tok.Data = x 2169 } 2170 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) 2171 default: 2172 panic("html: bad parser state: unexpected namespace") 2173 } 2174 adjustForeignAttributes(p.tok.Attr) 2175 namespace := p.top().Namespace 2176 p.addElement() 2177 p.top().Namespace = namespace 2178 if namespace != "" { 2179 // Don't let the tokenizer go into raw text mode in foreign content 2180 // (e.g. in an SVG <title> tag). 2181 p.tokenizer.NextIsNotRawText() 2182 } 2183 if p.hasSelfClosingToken { 2184 p.oe.pop() 2185 p.acknowledgeSelfClosingTag() 2186 } 2187 case EndTagToken: 2188 for i := len(p.oe) - 1; i >= 0; i-- { 2189 if p.oe[i].Namespace == "" { 2190 return p.im(p) 2191 } 2192 if strings.EqualFold(p.oe[i].Data, p.tok.Data) { 2193 p.oe = p.oe[:i] 2194 break 2195 } 2196 } 2197 return true 2198 default: 2199 // Ignore the token. 2200 } 2201 return true 2202 } 2203 2204 // Section 12.2.6. 2205 func (p *parser) inForeignContent() bool { 2206 if len(p.oe) == 0 { 2207 return false 2208 } 2209 n := p.oe[len(p.oe)-1] 2210 if n.Namespace == "" { 2211 return false 2212 } 2213 if mathMLTextIntegrationPoint(n) { 2214 if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark { 2215 return false 2216 } 2217 if p.tok.Type == TextToken { 2218 return false 2219 } 2220 } 2221 if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg { 2222 return false 2223 } 2224 if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) { 2225 return false 2226 } 2227 if p.tok.Type == ErrorToken { 2228 return false 2229 } 2230 return true 2231 } 2232 2233 // parseImpliedToken parses a token as though it had appeared in the parser's 2234 // input. 2235 func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) { 2236 realToken, selfClosing := p.tok, p.hasSelfClosingToken 2237 p.tok = Token{ 2238 Type: t, 2239 DataAtom: dataAtom, 2240 Data: data, 2241 } 2242 p.hasSelfClosingToken = false 2243 p.parseCurrentToken() 2244 p.tok, p.hasSelfClosingToken = realToken, selfClosing 2245 } 2246 2247 // parseCurrentToken runs the current token through the parsing routines 2248 // until it is consumed. 2249 func (p *parser) parseCurrentToken() { 2250 if p.tok.Type == SelfClosingTagToken { 2251 p.hasSelfClosingToken = true 2252 p.tok.Type = StartTagToken 2253 } 2254 2255 consumed := false 2256 for !consumed { 2257 if p.inForeignContent() { 2258 consumed = parseForeignContent(p) 2259 } else { 2260 consumed = p.im(p) 2261 } 2262 } 2263 2264 if p.hasSelfClosingToken { 2265 // This is a parse error, but ignore it. 2266 p.hasSelfClosingToken = false 2267 } 2268 } 2269 2270 func (p *parser) parse() error { 2271 // Iterate until EOF. Any other error will cause an early return. 2272 var err error 2273 for err != io.EOF { 2274 // CDATA sections are allowed only in foreign content. 2275 n := p.oe.top() 2276 p.tokenizer.AllowCDATA(n != nil && n.Namespace != "") 2277 // Read and parse the next token. 2278 p.tokenizer.Next() 2279 p.tok = p.tokenizer.Token() 2280 if p.tok.Type == ErrorToken { 2281 err = p.tokenizer.Err() 2282 if err != nil && err != io.EOF { 2283 return err 2284 } 2285 } 2286 p.parseCurrentToken() 2287 } 2288 return nil 2289 } 2290 2291 // Parse returns the parse tree for the HTML from the given Reader. 2292 // 2293 // It implements the HTML5 parsing algorithm 2294 // (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction), 2295 // which is very complicated. The resultant tree can contain implicitly created 2296 // nodes that have no explicit <tag> listed in r's data, and nodes' parents can 2297 // differ from the nesting implied by a naive processing of start and end 2298 // <tag>s. Conversely, explicit <tag>s in r's data can be silently dropped, 2299 // with no corresponding node in the resulting tree. 2300 // 2301 // The input is assumed to be UTF-8 encoded. 2302 func Parse(r io.Reader) (*Node, error) { 2303 return ParseWithOptions(r) 2304 } 2305 2306 // ParseFragment parses a fragment of HTML and returns the nodes that were 2307 // found. If the fragment is the InnerHTML for an existing element, pass that 2308 // element in context. 2309 // 2310 // It has the same intricacies as Parse. 2311 func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { 2312 return ParseFragmentWithOptions(r, context) 2313 } 2314 2315 // ParseOption configures a parser. 2316 type ParseOption func(p *parser) 2317 2318 // ParseOptionEnableScripting configures the scripting flag. 2319 // https://html.spec.whatwg.org/multipage/webappapis.html#enabling-and-disabling-scripting 2320 // 2321 // By default, scripting is enabled. 2322 func ParseOptionEnableScripting(enable bool) ParseOption { 2323 return func(p *parser) { 2324 p.scripting = enable 2325 } 2326 } 2327 2328 // ParseWithOptions is like Parse, with options. 2329 func ParseWithOptions(r io.Reader, opts ...ParseOption) (*Node, error) { 2330 p := &parser{ 2331 tokenizer: NewTokenizer(r), 2332 doc: &Node{ 2333 Type: DocumentNode, 2334 }, 2335 scripting: true, 2336 framesetOK: true, 2337 im: initialIM, 2338 } 2339 2340 for _, f := range opts { 2341 f(p) 2342 } 2343 2344 err := p.parse() 2345 if err != nil { 2346 return nil, err 2347 } 2348 return p.doc, nil 2349 } 2350 2351 // ParseFragmentWithOptions is like ParseFragment, with options. 2352 func ParseFragmentWithOptions(r io.Reader, context *Node, opts ...ParseOption) ([]*Node, error) { 2353 contextTag := "" 2354 if context != nil { 2355 if context.Type != ElementNode { 2356 return nil, errors.New("html: ParseFragment of non-element Node") 2357 } 2358 // The next check isn't just context.DataAtom.String() == context.Data because 2359 // it is valid to pass an element whose tag isn't a known atom. For example, 2360 // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. 2361 if context.DataAtom != a.Lookup([]byte(context.Data)) { 2362 return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) 2363 } 2364 contextTag = context.DataAtom.String() 2365 } 2366 p := &parser{ 2367 tokenizer: NewTokenizerFragment(r, contextTag), 2368 doc: &Node{ 2369 Type: DocumentNode, 2370 }, 2371 scripting: true, 2372 fragment: true, 2373 context: context, 2374 } 2375 2376 for _, f := range opts { 2377 f(p) 2378 } 2379 2380 root := &Node{ 2381 Type: ElementNode, 2382 DataAtom: a.Html, 2383 Data: a.Html.String(), 2384 } 2385 p.doc.AppendChild(root) 2386 p.oe = nodeStack{root} 2387 if context != nil && context.DataAtom == a.Template { 2388 p.templateStack = append(p.templateStack, inTemplateIM) 2389 } 2390 p.resetInsertionMode() 2391 2392 for n := context; n != nil; n = n.Parent { 2393 if n.Type == ElementNode && n.DataAtom == a.Form { 2394 p.form = n 2395 break 2396 } 2397 } 2398 2399 err := p.parse() 2400 if err != nil { 2401 return nil, err 2402 } 2403 2404 parent := p.doc 2405 if context != nil { 2406 parent = root 2407 } 2408 2409 var result []*Node 2410 for c := parent.FirstChild; c != nil; { 2411 next := c.NextSibling 2412 parent.RemoveChild(c) 2413 result = append(result, c) 2414 c = next 2415 } 2416 return result, nil 2417 }