// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"errors"
	"fmt"
	"io"
	"strings"

	a "golang.org/x/net/html/atom"
)

// A parser implements the HTML5 parsing algorithm:
// https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
type parser struct {
	// tokenizer provides the tokens for the parser.
	tokenizer *Tokenizer
	// tok is the most recently read token.
	tok Token
	// Self-closing tags like <hr/> are treated as start tags, except that
	// hasSelfClosingToken is set while they are being processed.
	hasSelfClosingToken bool
	// doc is the document root element.
	doc *Node
	// The stack of open elements (section 12.2.4.2) and active formatting
	// elements (section 12.2.4.3).
	oe, afe nodeStack
	// Element pointers (section 12.2.4.4).
	head, form *Node
	// Other parsing state flags (section 12.2.4.5).
	scripting, framesetOK bool
	// templateStack is the stack of template insertion modes.
	templateStack insertionModeStack
	// im is the current insertion mode.
	im insertionMode
	// originalIM is the insertion mode to go back to after completing a text
	// or inTableText insertion mode.
	originalIM insertionMode
	// fosterParenting is whether new elements should be inserted according to
	// the foster parenting rules (section 12.2.6.1).
	fosterParenting bool
	// quirks is whether the parser is operating in "quirks mode."
	quirks bool
	// fragment is whether the parser is parsing an HTML fragment.
	fragment bool
	// context is the context element when parsing an HTML fragment
	// (section 12.4).
	context *Node
}

// top returns the node reported by p.oe.top(), the top of the stack of open
// elements, falling back to the document root p.doc when that is nil.
func (p *parser) top() *Node {
	if n := p.oe.top(); n != nil {
		return n
	}
	return p.doc
}

// Stop tags for use in popUntil. These come from section 12.2.4.2.
//
// The map is keyed by element namespace: "" holds the stop tags for HTML
// elements, "math" and "svg" those for the corresponding foreign namespaces.
var (
	defaultScopeStopTags = map[string][]a.Atom{
		"":     {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},
		"math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},
		"svg":  {a.Desc, a.ForeignObject, a.Title},
	}
)

// A scope selects which elements bound a search of the stack of open
// elements; it is the first argument of popUntil, elementInScope,
// indexOfElementInScope and clearStackToContext.
type scope int

const (
	defaultScope scope = iota
	listItemScope
	buttonScope
	tableScope
	tableRowScope
	tableBodyScope
	selectScope
)

// popUntil pops the stack of open elements at the highest element whose tag
// is in matchTags, provided there is no higher element in the scope's stop
// tags (as defined in section 12.2.4.2). It returns whether or not there was
// such an element. If there was not, popUntil leaves the stack unchanged.
//
// For example, the set of stop tags for table scope is: "html", "table". If
// the stack was:
//	["html", "body", "font", "table", "b", "i", "u"]
// then popUntil(tableScope, "font") would return false, but
// popUntil(tableScope, "i") would return true and the stack would become:
//	["html", "body", "font", "table", "b"]
//
// If an element's tag is in both the stop tags and matchTags, then the stack
// will be popped and the function returns true (provided, of course, there was
// no higher element in the stack that was also in the stop tags). For example,
// popUntil(tableScope, "table") returns true and leaves:
//	["html", "body", "font"]
func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {
	if i := p.indexOfElementInScope(s, matchTags...); i != -1 {
		// Truncating at i removes the matched element and everything above it.
		p.oe = p.oe[:i]
		return true
	}
	return false
}

// indexOfElementInScope returns the index in p.oe of the highest element whose
// tag is in matchTags that is in scope.
If no matching element is in scope, it 109 // returns -1. 110 func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int { 111 for i := len(p.oe) - 1; i >= 0; i-- { 112 tagAtom := p.oe[i].DataAtom 113 if p.oe[i].Namespace == "" { 114 for _, t := range matchTags { 115 if t == tagAtom { 116 return i 117 } 118 } 119 switch s { 120 case defaultScope: 121 // No-op. 122 case listItemScope: 123 if tagAtom == a.Ol || tagAtom == a.Ul { 124 return -1 125 } 126 case buttonScope: 127 if tagAtom == a.Button { 128 return -1 129 } 130 case tableScope: 131 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { 132 return -1 133 } 134 case selectScope: 135 if tagAtom != a.Optgroup && tagAtom != a.Option { 136 return -1 137 } 138 default: 139 panic("unreachable") 140 } 141 } 142 switch s { 143 case defaultScope, listItemScope, buttonScope: 144 for _, t := range defaultScopeStopTags[p.oe[i].Namespace] { 145 if t == tagAtom { 146 return -1 147 } 148 } 149 } 150 } 151 return -1 152 } 153 154 // elementInScope is like popUntil, except that it doesn't modify the stack of 155 // open elements. 156 func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool { 157 return p.indexOfElementInScope(s, matchTags...) != -1 158 } 159 160 // clearStackToContext pops elements off the stack of open elements until a 161 // scope-defined element is found. 
162 func (p *parser) clearStackToContext(s scope) { 163 for i := len(p.oe) - 1; i >= 0; i-- { 164 tagAtom := p.oe[i].DataAtom 165 switch s { 166 case tableScope: 167 if tagAtom == a.Html || tagAtom == a.Table || tagAtom == a.Template { 168 p.oe = p.oe[:i+1] 169 return 170 } 171 case tableRowScope: 172 if tagAtom == a.Html || tagAtom == a.Tr || tagAtom == a.Template { 173 p.oe = p.oe[:i+1] 174 return 175 } 176 case tableBodyScope: 177 if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead || tagAtom == a.Template { 178 p.oe = p.oe[:i+1] 179 return 180 } 181 default: 182 panic("unreachable") 183 } 184 } 185 } 186 187 // generateImpliedEndTags pops nodes off the stack of open elements as long as 188 // the top node has a tag name of dd, dt, li, optgroup, option, p, rb, rp, rt or rtc. 189 // If exceptions are specified, nodes with that name will not be popped off. 190 func (p *parser) generateImpliedEndTags(exceptions ...string) { 191 var i int 192 loop: 193 for i = len(p.oe) - 1; i >= 0; i-- { 194 n := p.oe[i] 195 if n.Type == ElementNode { 196 switch n.DataAtom { 197 case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc: 198 for _, except := range exceptions { 199 if n.Data == except { 200 break loop 201 } 202 } 203 continue 204 } 205 } 206 break 207 } 208 209 p.oe = p.oe[:i+1] 210 } 211 212 // addChild adds a child node n to the top element, and pushes n onto the stack 213 // of open elements if it is an element node. 214 func (p *parser) addChild(n *Node) { 215 if p.shouldFosterParent() { 216 p.fosterParent(n) 217 } else { 218 p.top().AppendChild(n) 219 } 220 221 if n.Type == ElementNode { 222 p.oe = append(p.oe, n) 223 } 224 } 225 226 // shouldFosterParent returns whether the next node to be added should be 227 // foster parented. 
228 func (p *parser) shouldFosterParent() bool { 229 if p.fosterParenting { 230 switch p.top().DataAtom { 231 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 232 return true 233 } 234 } 235 return false 236 } 237 238 // fosterParent adds a child node according to the foster parenting rules. 239 // Section 12.2.6.1, "foster parenting". 240 func (p *parser) fosterParent(n *Node) { 241 var table, parent, prev, template *Node 242 var i int 243 for i = len(p.oe) - 1; i >= 0; i-- { 244 if p.oe[i].DataAtom == a.Table { 245 table = p.oe[i] 246 break 247 } 248 } 249 250 var j int 251 for j = len(p.oe) - 1; j >= 0; j-- { 252 if p.oe[j].DataAtom == a.Template { 253 template = p.oe[j] 254 break 255 } 256 } 257 258 if template != nil && (table == nil || j > i) { 259 template.AppendChild(n) 260 return 261 } 262 263 if table == nil { 264 // The foster parent is the html element. 265 parent = p.oe[0] 266 } else { 267 parent = table.Parent 268 } 269 if parent == nil { 270 parent = p.oe[i-1] 271 } 272 273 if table != nil { 274 prev = table.PrevSibling 275 } else { 276 prev = parent.LastChild 277 } 278 if prev != nil && prev.Type == TextNode && n.Type == TextNode { 279 prev.Data += n.Data 280 return 281 } 282 283 parent.InsertBefore(n, table) 284 } 285 286 // addText adds text to the preceding node if it is a text node, or else it 287 // calls addChild with a new text node. 288 func (p *parser) addText(text string) { 289 if text == "" { 290 return 291 } 292 293 if p.shouldFosterParent() { 294 p.fosterParent(&Node{ 295 Type: TextNode, 296 Data: text, 297 }) 298 return 299 } 300 301 t := p.top() 302 if n := t.LastChild; n != nil && n.Type == TextNode { 303 n.Data += text 304 return 305 } 306 p.addChild(&Node{ 307 Type: TextNode, 308 Data: text, 309 }) 310 } 311 312 // addElement adds a child element based on the current token. 
313 func (p *parser) addElement() { 314 p.addChild(&Node{ 315 Type: ElementNode, 316 DataAtom: p.tok.DataAtom, 317 Data: p.tok.Data, 318 Attr: p.tok.Attr, 319 }) 320 } 321 322 // Section 12.2.4.3. 323 func (p *parser) addFormattingElement() { 324 tagAtom, attr := p.tok.DataAtom, p.tok.Attr 325 p.addElement() 326 327 // Implement the Noah's Ark clause, but with three per family instead of two. 328 identicalElements := 0 329 findIdenticalElements: 330 for i := len(p.afe) - 1; i >= 0; i-- { 331 n := p.afe[i] 332 if n.Type == scopeMarkerNode { 333 break 334 } 335 if n.Type != ElementNode { 336 continue 337 } 338 if n.Namespace != "" { 339 continue 340 } 341 if n.DataAtom != tagAtom { 342 continue 343 } 344 if len(n.Attr) != len(attr) { 345 continue 346 } 347 compareAttributes: 348 for _, t0 := range n.Attr { 349 for _, t1 := range attr { 350 if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val { 351 // Found a match for this attribute, continue with the next attribute. 352 continue compareAttributes 353 } 354 } 355 // If we get here, there is no attribute that matches a. 356 // Therefore the element is not identical to the new one. 357 continue findIdenticalElements 358 } 359 360 identicalElements++ 361 if identicalElements >= 3 { 362 p.afe.remove(n) 363 } 364 } 365 366 p.afe = append(p.afe, p.top()) 367 } 368 369 // Section 12.2.4.3. 370 func (p *parser) clearActiveFormattingElements() { 371 for { 372 n := p.afe.pop() 373 if len(p.afe) == 0 || n.Type == scopeMarkerNode { 374 return 375 } 376 } 377 } 378 379 // Section 12.2.4.3. 
380 func (p *parser) reconstructActiveFormattingElements() { 381 n := p.afe.top() 382 if n == nil { 383 return 384 } 385 if n.Type == scopeMarkerNode || p.oe.index(n) != -1 { 386 return 387 } 388 i := len(p.afe) - 1 389 for n.Type != scopeMarkerNode && p.oe.index(n) == -1 { 390 if i == 0 { 391 i = -1 392 break 393 } 394 i-- 395 n = p.afe[i] 396 } 397 for { 398 i++ 399 clone := p.afe[i].clone() 400 p.addChild(clone) 401 p.afe[i] = clone 402 if i == len(p.afe)-1 { 403 break 404 } 405 } 406 } 407 408 // Section 12.2.5. 409 func (p *parser) acknowledgeSelfClosingTag() { 410 p.hasSelfClosingToken = false 411 } 412 413 // An insertion mode (section 12.2.4.1) is the state transition function from 414 // a particular state in the HTML5 parser's state machine. It updates the 415 // parser's fields depending on parser.tok (where ErrorToken means EOF). 416 // It returns whether the token was consumed. 417 type insertionMode func(*parser) bool 418 419 // setOriginalIM sets the insertion mode to return to after completing a text or 420 // inTableText insertion mode. 421 // Section 12.2.4.1, "using the rules for". 422 func (p *parser) setOriginalIM() { 423 if p.originalIM != nil { 424 panic("html: bad parser state: originalIM was set twice") 425 } 426 p.originalIM = p.im 427 } 428 429 // Section 12.2.4.1, "reset the insertion mode". 430 func (p *parser) resetInsertionMode() { 431 for i := len(p.oe) - 1; i >= 0; i-- { 432 n := p.oe[i] 433 last := i == 0 434 if last && p.context != nil { 435 n = p.context 436 } 437 438 switch n.DataAtom { 439 case a.Select: 440 if !last { 441 for ancestor, first := n, p.oe[0]; ancestor != first; { 442 ancestor = p.oe[p.oe.index(ancestor)-1] 443 switch ancestor.DataAtom { 444 case a.Template: 445 p.im = inSelectIM 446 return 447 case a.Table: 448 p.im = inSelectInTableIM 449 return 450 } 451 } 452 } 453 p.im = inSelectIM 454 case a.Td, a.Th: 455 // TODO: remove this divergence from the HTML5 spec. 
456 // 457 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 458 p.im = inCellIM 459 case a.Tr: 460 p.im = inRowIM 461 case a.Tbody, a.Thead, a.Tfoot: 462 p.im = inTableBodyIM 463 case a.Caption: 464 p.im = inCaptionIM 465 case a.Colgroup: 466 p.im = inColumnGroupIM 467 case a.Table: 468 p.im = inTableIM 469 case a.Template: 470 // TODO: remove this divergence from the HTML5 spec. 471 if n.Namespace != "" { 472 continue 473 } 474 p.im = p.templateStack.top() 475 case a.Head: 476 // TODO: remove this divergence from the HTML5 spec. 477 // 478 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 479 p.im = inHeadIM 480 case a.Body: 481 p.im = inBodyIM 482 case a.Frameset: 483 p.im = inFramesetIM 484 case a.Html: 485 if p.head == nil { 486 p.im = beforeHeadIM 487 } else { 488 p.im = afterHeadIM 489 } 490 default: 491 if last { 492 p.im = inBodyIM 493 return 494 } 495 continue 496 } 497 return 498 } 499 } 500 501 const whitespace = " \t\r\n\f" 502 503 // Section 12.2.6.4.1. 504 func initialIM(p *parser) bool { 505 switch p.tok.Type { 506 case TextToken: 507 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 508 if len(p.tok.Data) == 0 { 509 // It was all whitespace, so ignore it. 510 return true 511 } 512 case CommentToken: 513 p.doc.AppendChild(&Node{ 514 Type: CommentNode, 515 Data: p.tok.Data, 516 }) 517 return true 518 case DoctypeToken: 519 n, quirks := parseDoctype(p.tok.Data) 520 p.doc.AppendChild(n) 521 p.quirks = quirks 522 p.im = beforeHTMLIM 523 return true 524 } 525 p.quirks = true 526 p.im = beforeHTMLIM 527 return false 528 } 529 530 // Section 12.2.6.4.2. 531 func beforeHTMLIM(p *parser) bool { 532 switch p.tok.Type { 533 case DoctypeToken: 534 // Ignore the token. 535 return true 536 case TextToken: 537 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 538 if len(p.tok.Data) == 0 { 539 // It was all whitespace, so ignore it. 
540 return true 541 } 542 case StartTagToken: 543 if p.tok.DataAtom == a.Html { 544 p.addElement() 545 p.im = beforeHeadIM 546 return true 547 } 548 case EndTagToken: 549 switch p.tok.DataAtom { 550 case a.Head, a.Body, a.Html, a.Br: 551 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) 552 return false 553 default: 554 // Ignore the token. 555 return true 556 } 557 case CommentToken: 558 p.doc.AppendChild(&Node{ 559 Type: CommentNode, 560 Data: p.tok.Data, 561 }) 562 return true 563 } 564 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) 565 return false 566 } 567 568 // Section 12.2.6.4.3. 569 func beforeHeadIM(p *parser) bool { 570 switch p.tok.Type { 571 case TextToken: 572 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 573 if len(p.tok.Data) == 0 { 574 // It was all whitespace, so ignore it. 575 return true 576 } 577 case StartTagToken: 578 switch p.tok.DataAtom { 579 case a.Head: 580 p.addElement() 581 p.head = p.top() 582 p.im = inHeadIM 583 return true 584 case a.Html: 585 return inBodyIM(p) 586 } 587 case EndTagToken: 588 switch p.tok.DataAtom { 589 case a.Head, a.Body, a.Html, a.Br: 590 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) 591 return false 592 default: 593 // Ignore the token. 594 return true 595 } 596 case CommentToken: 597 p.addChild(&Node{ 598 Type: CommentNode, 599 Data: p.tok.Data, 600 }) 601 return true 602 case DoctypeToken: 603 // Ignore the token. 604 return true 605 } 606 607 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) 608 return false 609 } 610 611 // Section 12.2.6.4.4. 612 func inHeadIM(p *parser) bool { 613 switch p.tok.Type { 614 case TextToken: 615 s := strings.TrimLeft(p.tok.Data, whitespace) 616 if len(s) < len(p.tok.Data) { 617 // Add the initial whitespace to the current node. 
618 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 619 if s == "" { 620 return true 621 } 622 p.tok.Data = s 623 } 624 case StartTagToken: 625 switch p.tok.DataAtom { 626 case a.Html: 627 return inBodyIM(p) 628 case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta: 629 p.addElement() 630 p.oe.pop() 631 p.acknowledgeSelfClosingTag() 632 return true 633 case a.Script, a.Title, a.Noscript, a.Noframes, a.Style: 634 p.addElement() 635 p.setOriginalIM() 636 p.im = textIM 637 return true 638 case a.Head: 639 // Ignore the token. 640 return true 641 case a.Template: 642 p.addElement() 643 p.afe = append(p.afe, &scopeMarker) 644 p.framesetOK = false 645 p.im = inTemplateIM 646 p.templateStack = append(p.templateStack, inTemplateIM) 647 return true 648 } 649 case EndTagToken: 650 switch p.tok.DataAtom { 651 case a.Head: 652 p.oe.pop() 653 p.im = afterHeadIM 654 return true 655 case a.Body, a.Html, a.Br: 656 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) 657 return false 658 case a.Template: 659 if !p.oe.contains(a.Template) { 660 return true 661 } 662 // TODO: remove this divergence from the HTML5 spec. 663 // 664 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 665 p.generateImpliedEndTags() 666 for i := len(p.oe) - 1; i >= 0; i-- { 667 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { 668 p.oe = p.oe[:i] 669 break 670 } 671 } 672 p.clearActiveFormattingElements() 673 p.templateStack.pop() 674 p.resetInsertionMode() 675 return true 676 default: 677 // Ignore the token. 678 return true 679 } 680 case CommentToken: 681 p.addChild(&Node{ 682 Type: CommentNode, 683 Data: p.tok.Data, 684 }) 685 return true 686 case DoctypeToken: 687 // Ignore the token. 688 return true 689 } 690 691 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) 692 return false 693 } 694 695 // Section 12.2.6.4.6. 
696 func afterHeadIM(p *parser) bool { 697 switch p.tok.Type { 698 case TextToken: 699 s := strings.TrimLeft(p.tok.Data, whitespace) 700 if len(s) < len(p.tok.Data) { 701 // Add the initial whitespace to the current node. 702 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 703 if s == "" { 704 return true 705 } 706 p.tok.Data = s 707 } 708 case StartTagToken: 709 switch p.tok.DataAtom { 710 case a.Html: 711 return inBodyIM(p) 712 case a.Body: 713 p.addElement() 714 p.framesetOK = false 715 p.im = inBodyIM 716 return true 717 case a.Frameset: 718 p.addElement() 719 p.im = inFramesetIM 720 return true 721 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 722 p.oe = append(p.oe, p.head) 723 defer p.oe.remove(p.head) 724 return inHeadIM(p) 725 case a.Head: 726 // Ignore the token. 727 return true 728 } 729 case EndTagToken: 730 switch p.tok.DataAtom { 731 case a.Body, a.Html, a.Br: 732 // Drop down to creating an implied <body> tag. 733 case a.Template: 734 return inHeadIM(p) 735 default: 736 // Ignore the token. 737 return true 738 } 739 case CommentToken: 740 p.addChild(&Node{ 741 Type: CommentNode, 742 Data: p.tok.Data, 743 }) 744 return true 745 case DoctypeToken: 746 // Ignore the token. 747 return true 748 } 749 750 p.parseImpliedToken(StartTagToken, a.Body, a.Body.String()) 751 p.framesetOK = true 752 return false 753 } 754 755 // copyAttributes copies attributes of src not found on dst to dst. 756 func copyAttributes(dst *Node, src Token) { 757 if len(src.Attr) == 0 { 758 return 759 } 760 attr := map[string]string{} 761 for _, t := range dst.Attr { 762 attr[t.Key] = t.Val 763 } 764 for _, t := range src.Attr { 765 if _, ok := attr[t.Key]; !ok { 766 dst.Attr = append(dst.Attr, t) 767 attr[t.Key] = t.Val 768 } 769 } 770 } 771 772 // Section 12.2.6.4.7. 
// inBodyIM handles the "in body" insertion mode. Like every insertion mode
// it returns whether the current token was consumed.
func inBodyIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		d := p.tok.Data
		switch n := p.oe.top(); n.DataAtom {
		case a.Pre, a.Listing:
			if n.FirstChild == nil {
				// Ignore a newline at the start of a <pre> block.
				if d != "" && d[0] == '\r' {
					d = d[1:]
				}
				if d != "" && d[0] == '\n' {
					d = d[1:]
				}
			}
		}
		// NUL bytes are dropped from body text.
		d = strings.Replace(d, "\x00", "", -1)
		if d == "" {
			return true
		}
		p.reconstructActiveFormattingElements()
		p.addText(d)
		if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
			// There were non-whitespace characters inserted.
			p.framesetOK = false
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			if p.oe.contains(a.Template) {
				return true
			}
			// Merge any new attributes into the existing root element.
			copyAttributes(p.oe[0], p.tok)
		case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title:
			return inHeadIM(p)
		case a.Body:
			if p.oe.contains(a.Template) {
				return true
			}
			if len(p.oe) >= 2 {
				// Merge any new attributes into the existing body element,
				// which is expected at index 1 of the stack.
				body := p.oe[1]
				if body.Type == ElementNode && body.DataAtom == a.Body {
					p.framesetOK = false
					copyAttributes(body, p.tok)
				}
			}
		case a.Frameset:
			if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
				// Ignore the token.
				return true
			}
			// Replace the body element with the frameset: detach the body
			// from the tree and drop everything above the root from the stack.
			body := p.oe[1]
			if body.Parent != nil {
				body.Parent.RemoveChild(body)
			}
			p.oe = p.oe[:1]
			p.addElement()
			p.im = inFramesetIM
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul:
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			p.popUntil(buttonScope, a.P)
			// Headings do not nest: close a heading that is the current node.
			switch n := p.top(); n.DataAtom {
			case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
				p.oe.pop()
			}
			p.addElement()
		case a.Pre, a.Listing:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			// The newline, if any, will be dealt with by the TextToken case.
			p.framesetOK = false
		case a.Form:
			if p.form != nil && !p.oe.contains(a.Template) {
				// Ignore the token
				return true
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
			if !p.oe.contains(a.Template) {
				p.form = p.top()
			}
		case a.Li:
			p.framesetOK = false
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Li:
					// Close the open <li> and everything above it.
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				// Reached after closing an <li> or on hitting a special
				// element: stop scanning the stack.
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Dd, a.Dt:
			p.framesetOK = false
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Dd, a.Dt:
					// Close the open <dd>/<dt> and everything above it.
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				// Reached after closing a <dd>/<dt> or on hitting a special
				// element: stop scanning the stack.
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Plaintext:
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Button:
			p.popUntil(defaultScope, a.Button)
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
		case a.A:
			// An <a> already in the list of active formatting elements (after
			// the last scope marker) is closed before the new one is opened.
			for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
				if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
					p.inBodyEndTagFormatting(a.A, "a")
					p.oe.remove(n)
					p.afe.remove(n)
					break
				}
			}
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.Nobr:
			p.reconstructActiveFormattingElements()
			if p.elementInScope(defaultScope, a.Nobr) {
				p.inBodyEndTagFormatting(a.Nobr, "nobr")
				p.reconstructActiveFormattingElements()
			}
			p.addFormattingElement()
		case a.Applet, a.Marquee, a.Object:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
		case a.Table:
			if !p.quirks {
				p.popUntil(buttonScope, a.P)
			}
			p.addElement()
			p.framesetOK = false
			p.im = inTableIM
			return true
		case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			if p.tok.DataAtom == a.Input {
				for _, t := range p.tok.Attr {
					if t.Key == "type" {
						if strings.ToLower(t.Val) == "hidden" {
							// Skip setting framesetOK = false
							return true
						}
					}
				}
			}
			p.framesetOK = false
		case a.Param, a.Source, a.Track:
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
		case a.Hr:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			p.framesetOK = false
		case a.Image:
			// Rewrite <image> as <img> and report the token as not consumed.
			p.tok.DataAtom = a.Img
			p.tok.Data = a.Img.String()
			return false
		case a.Isindex:
			if p.form != nil {
				// Ignore the token.
				return true
			}
			// Expand <isindex> into a form containing hr, label, input, hr.
			action := ""
			prompt := "This is a searchable index. Enter search keywords: "
			attr := []Attribute{{Key: "name", Val: "isindex"}}
			for _, t := range p.tok.Attr {
				switch t.Key {
				case "action":
					action = t.Val
				case "name":
					// Ignore the attribute.
				case "prompt":
					prompt = t.Val
				default:
					attr = append(attr, t)
				}
			}
			p.acknowledgeSelfClosingTag()
			p.popUntil(buttonScope, a.P)
			p.parseImpliedToken(StartTagToken, a.Form, a.Form.String())
			if p.form == nil {
				// NOTE: The 'isindex' element has been removed,
				// and the 'template' element has not been designed to be
				// collaborative with the index element.
				//
				// Ignore the token.
				return true
			}
			if action != "" {
				p.form.Attr = []Attribute{{Key: "action", Val: action}}
			}
			p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
			p.parseImpliedToken(StartTagToken, a.Label, a.Label.String())
			p.addText(prompt)
			p.addChild(&Node{
				Type:     ElementNode,
				DataAtom: a.Input,
				Data:     a.Input.String(),
				Attr:     attr,
			})
			p.oe.pop()
			p.parseImpliedToken(EndTagToken, a.Label, a.Label.String())
			p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
			p.parseImpliedToken(EndTagToken, a.Form, a.Form.String())
		case a.Textarea:
			p.addElement()
			p.setOriginalIM()
			p.framesetOK = false
			p.im = textIM
		case a.Xmp:
			p.popUntil(buttonScope, a.P)
			p.reconstructActiveFormattingElements()
			p.framesetOK = false
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
		case a.Iframe:
			p.framesetOK = false
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
		case a.Noembed, a.Noscript:
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
		case a.Select:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
			p.im = inSelectIM
			return true
		case a.Optgroup, a.Option:
			if p.top().DataAtom == a.Option {
				p.oe.pop()
			}
			p.reconstructActiveFormattingElements()
			p.addElement()
		case a.Rb, a.Rtc:
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags()
			}
			p.addElement()
		case a.Rp, a.Rt:
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags("rtc")
			}
			p.addElement()
		case a.Math, a.Svg:
			p.reconstructActiveFormattingElements()
			if p.tok.DataAtom == a.Math {
				adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
			} else {
				adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
			}
			adjustForeignAttributes(p.tok.Attr)
			p.addElement()
			// The new element's namespace is the tag name ("math" or "svg").
			p.top().Namespace = p.tok.Data
			if p.hasSelfClosingToken {
				p.oe.pop()
				p.acknowledgeSelfClosingTag()
			}
			return true
		case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
			// Ignore the token.
		default:
			p.reconstructActiveFormattingElements()
			p.addElement()
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Body:
			if p.elementInScope(defaultScope, a.Body) {
				p.im = afterBodyIM
			}
		case a.Html:
			if p.elementInScope(defaultScope, a.Body) {
				p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
				return false
			}
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.Form:
			if p.oe.contains(a.Template) {
				// Inside a template the form element pointer is not used.
				i := p.indexOfElementInScope(defaultScope, a.Form)
				if i == -1 {
					// Ignore the token.
					return true
				}
				p.generateImpliedEndTags()
				if p.oe[i].DataAtom != a.Form {
					// Ignore the token.
					return true
				}
				p.popUntil(defaultScope, a.Form)
			} else {
				// The form element pointer is cleared even when the token
				// ends up being ignored.
				node := p.form
				p.form = nil
				i := p.indexOfElementInScope(defaultScope, a.Form)
				if node == nil || i == -1 || p.oe[i] != node {
					// Ignore the token.
					return true
				}
				p.generateImpliedEndTags()
				p.oe.remove(node)
			}
		case a.P:
			if !p.elementInScope(buttonScope, a.P) {
				// No <p> to close: open one first so that an empty
				// paragraph is produced.
				p.parseImpliedToken(StartTagToken, a.P, a.P.String())
			}
			p.popUntil(buttonScope, a.P)
		case a.Li:
			p.popUntil(listItemScope, a.Li)
		case a.Dd, a.Dt:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			// Any open heading level is closed by any heading end tag.
			p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
		case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.inBodyEndTagFormatting(p.tok.DataAtom, p.tok.Data)
		case a.Applet, a.Marquee, a.Object:
			if p.popUntil(defaultScope, p.tok.DataAtom) {
				p.clearActiveFormattingElements()
			}
		case a.Br:
			// </br> is turned into a <br> start tag and reported as not
			// consumed.
			p.tok.Type = StartTagToken
			return false
		case a.Template:
			return inHeadIM(p)
		default:
			p.inBodyEndTagOther(p.tok.DataAtom, p.tok.Data)
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
	case ErrorToken:
		// TODO: remove this divergence from the HTML5 spec.
		if len(p.templateStack) > 0 {
			p.im = inTemplateIM
			return false
		} else {
			// Both outcomes of this scan return true; the loop only
			// distinguishes open elements that may be left unclosed at EOF.
			for _, e := range p.oe {
				switch e.DataAtom {
				case a.Dd, a.Dt, a.Li, a.Optgroup, a.Option, a.P, a.Rb, a.Rp, a.Rt, a.Rtc, a.Tbody, a.Td, a.Tfoot, a.Th,
					a.Thead, a.Tr, a.Body, a.Html:
				default:
					return true
				}
			}
		}
	}

	return true
}

// inBodyEndTagFormatting handles an end tag for a formatting element in the
// "in body" insertion mode. tagAtom and tagName identify the end tag; when no
// matching entry is found in the list of active formatting elements the tag
// is handed to inBodyEndTagOther.
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom, tagName string) {
	// This is the "adoption agency" algorithm, described at
	// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency

	// TODO: this is a fairly literal line-by-line translation of that algorithm.
	// Once the code successfully parses the comprehensive test suite, we should
	// refactor this code to be more idiomatic.

	// Steps 1-4. The outer loop.
	for i := 0; i < 8; i++ {
		// Step 5. Find the formatting element.
		var formattingElement *Node
		for j := len(p.afe) - 1; j >= 0; j-- {
			if p.afe[j].Type == scopeMarkerNode {
				break
			}
			if p.afe[j].DataAtom == tagAtom {
				formattingElement = p.afe[j]
				break
			}
		}
		if formattingElement == nil {
			p.inBodyEndTagOther(tagAtom, tagName)
			return
		}
		feIndex := p.oe.index(formattingElement)
		if feIndex == -1 {
			// The formatting element is no longer open: just forget it.
			p.afe.remove(formattingElement)
			return
		}
		if !p.elementInScope(defaultScope, tagAtom) {
			// Ignore the tag.
			return
		}

		// Steps 9-10. Find the furthest block.
		var furthestBlock *Node
		for _, e := range p.oe[feIndex:] {
			if isSpecialElement(e) {
				furthestBlock = e
				break
			}
		}
		if furthestBlock == nil {
			// No special element above the formatting element: pop up to and
			// including it, and drop it from the active formatting list.
			e := p.oe.pop()
			for e != formattingElement {
				e = p.oe.pop()
			}
			p.afe.remove(e)
			return
		}

		// Steps 11-12. Find the common ancestor and bookmark node.
		commonAncestor := p.oe[feIndex-1]
		bookmark := p.afe.index(formattingElement)

		// Step 13. The inner loop. Find the lastNode to reparent.
		lastNode := furthestBlock
		node := furthestBlock
		x := p.oe.index(node)
		// Steps 13.1-13.2
		for j := 0; j < 3; j++ {
			// Step 13.3.
			x--
			node = p.oe[x]
			// Step 13.4 - 13.5.
			if p.afe.index(node) == -1 {
				p.oe.remove(node)
				continue
			}
			// Step 13.6.
			if node == formattingElement {
				break
			}
			// Step 13.7.
			clone := node.clone()
			p.afe[p.afe.index(node)] = clone
			p.oe[p.oe.index(node)] = clone
			node = clone
			// Step 13.8.
			if lastNode == furthestBlock {
				bookmark = p.afe.index(node) + 1
			}
			// Step 13.9.
			if lastNode.Parent != nil {
				lastNode.Parent.RemoveChild(lastNode)
			}
			node.AppendChild(lastNode)
			// Step 13.10.
			lastNode = node
		}

		// Step 14. Reparent lastNode to the common ancestor,
		// or for misnested table nodes, to the foster parent.
		if lastNode.Parent != nil {
			lastNode.Parent.RemoveChild(lastNode)
		}
		switch commonAncestor.DataAtom {
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
			p.fosterParent(lastNode)
		default:
			commonAncestor.AppendChild(lastNode)
		}

		// Steps 15-17. Reparent nodes from the furthest block's children
		// to a clone of the formatting element.
		clone := formattingElement.clone()
		reparentChildren(clone, furthestBlock)
		furthestBlock.AppendChild(clone)

		// Step 18. Fix up the list of active formatting elements.
		if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
			// Move the bookmark with the rest of the list.
			bookmark--
		}
		p.afe.remove(formattingElement)
		p.afe.insert(bookmark, clone)

		// Step 19. Fix up the stack of open elements.
		p.oe.remove(formattingElement)
		p.oe.insert(p.oe.index(furthestBlock)+1, clone)
	}
}

// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
// "Any other end tag" handling from 12.2.6.5 The rules for parsing tokens in foreign content
// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
//
// The stack of open elements is scanned from the top: the first element with
// the given tag is popped together with everything above it, and the scan
// gives up without changes on reaching a special element first.
func (p *parser) inBodyEndTagOther(tagAtom a.Atom, tagName string) {
	for i := len(p.oe) - 1; i >= 0; i-- {
		// Two element nodes have the same tag if they have the same Data (a
		// string-typed field). As an optimization, for common HTML tags, each
		// Data string is assigned a unique, non-zero DataAtom (a uint32-typed
		// field), since integer comparison is faster than string comparison.
		// Uncommon (custom) tags get a zero DataAtom.
		//
		// The if condition here is equivalent to (p.oe[i].Data == tagName).
		if (p.oe[i].DataAtom == tagAtom) &&
			((tagAtom != 0) || (p.oe[i].Data == tagName)) {
			p.oe = p.oe[:i]
			break
		}
		if isSpecialElement(p.oe[i]) {
			break
		}
	}
}

// textIM handles the "text" insertion mode. Text tokens are appended to the
// current node (minus one leading newline at the start of a <textarea>) and
// consumed. An end tag or EOF pops the current node and restores the
// insertion mode saved in originalIM; only the end tag is reported as
// consumed. Any other token type also restores the saved insertion mode
// without popping, and is reported as not consumed.
//
// Section 12.2.6.4.8.
func textIM(p *parser) bool {
	switch p.tok.Type {
	case ErrorToken:
		p.oe.pop()
	case TextToken:
		d := p.tok.Data
		if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
			// Ignore a newline at the start of a <textarea> block.
			if d != "" && d[0] == '\r' {
				d = d[1:]
			}
			if d != "" && d[0] == '\n' {
				d = d[1:]
			}
		}
		if d == "" {
			return true
		}
		p.addText(d)
		return true
	case EndTagToken:
		p.oe.pop()
	}
	// Leave text mode: restore the saved insertion mode and clear it so that
	// setOriginalIM can be used again.
	p.im = p.originalIM
	p.originalIM = nil
	return p.tok.Type == EndTagToken
}

// Section 12.2.6.4.9.
1341 func inTableIM(p *parser) bool { 1342 switch p.tok.Type { 1343 case TextToken: 1344 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1) 1345 switch p.oe.top().DataAtom { 1346 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1347 if strings.Trim(p.tok.Data, whitespace) == "" { 1348 p.addText(p.tok.Data) 1349 return true 1350 } 1351 } 1352 case StartTagToken: 1353 switch p.tok.DataAtom { 1354 case a.Caption: 1355 p.clearStackToContext(tableScope) 1356 p.afe = append(p.afe, &scopeMarker) 1357 p.addElement() 1358 p.im = inCaptionIM 1359 return true 1360 case a.Colgroup: 1361 p.clearStackToContext(tableScope) 1362 p.addElement() 1363 p.im = inColumnGroupIM 1364 return true 1365 case a.Col: 1366 p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String()) 1367 return false 1368 case a.Tbody, a.Tfoot, a.Thead: 1369 p.clearStackToContext(tableScope) 1370 p.addElement() 1371 p.im = inTableBodyIM 1372 return true 1373 case a.Td, a.Th, a.Tr: 1374 p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String()) 1375 return false 1376 case a.Table: 1377 if p.popUntil(tableScope, a.Table) { 1378 p.resetInsertionMode() 1379 return false 1380 } 1381 // Ignore the token. 1382 return true 1383 case a.Style, a.Script, a.Template: 1384 return inHeadIM(p) 1385 case a.Input: 1386 for _, t := range p.tok.Attr { 1387 if t.Key == "type" && strings.ToLower(t.Val) == "hidden" { 1388 p.addElement() 1389 p.oe.pop() 1390 return true 1391 } 1392 } 1393 // Otherwise drop down to the default action. 1394 case a.Form: 1395 if p.oe.contains(a.Template) || p.form != nil { 1396 // Ignore the token. 
1397 return true 1398 } 1399 p.addElement() 1400 p.form = p.oe.pop() 1401 case a.Select: 1402 p.reconstructActiveFormattingElements() 1403 switch p.top().DataAtom { 1404 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1405 p.fosterParenting = true 1406 } 1407 p.addElement() 1408 p.fosterParenting = false 1409 p.framesetOK = false 1410 p.im = inSelectInTableIM 1411 return true 1412 } 1413 case EndTagToken: 1414 switch p.tok.DataAtom { 1415 case a.Table: 1416 if p.popUntil(tableScope, a.Table) { 1417 p.resetInsertionMode() 1418 return true 1419 } 1420 // Ignore the token. 1421 return true 1422 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1423 // Ignore the token. 1424 return true 1425 case a.Template: 1426 return inHeadIM(p) 1427 } 1428 case CommentToken: 1429 p.addChild(&Node{ 1430 Type: CommentNode, 1431 Data: p.tok.Data, 1432 }) 1433 return true 1434 case DoctypeToken: 1435 // Ignore the token. 1436 return true 1437 case ErrorToken: 1438 return inBodyIM(p) 1439 } 1440 1441 p.fosterParenting = true 1442 defer func() { p.fosterParenting = false }() 1443 1444 return inBodyIM(p) 1445 } 1446 1447 // Section 12.2.6.4.11. 1448 func inCaptionIM(p *parser) bool { 1449 switch p.tok.Type { 1450 case StartTagToken: 1451 switch p.tok.DataAtom { 1452 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr: 1453 if p.popUntil(tableScope, a.Caption) { 1454 p.clearActiveFormattingElements() 1455 p.im = inTableIM 1456 return false 1457 } else { 1458 // Ignore the token. 
1459 return true 1460 } 1461 case a.Select: 1462 p.reconstructActiveFormattingElements() 1463 p.addElement() 1464 p.framesetOK = false 1465 p.im = inSelectInTableIM 1466 return true 1467 } 1468 case EndTagToken: 1469 switch p.tok.DataAtom { 1470 case a.Caption: 1471 if p.popUntil(tableScope, a.Caption) { 1472 p.clearActiveFormattingElements() 1473 p.im = inTableIM 1474 } 1475 return true 1476 case a.Table: 1477 if p.popUntil(tableScope, a.Caption) { 1478 p.clearActiveFormattingElements() 1479 p.im = inTableIM 1480 return false 1481 } else { 1482 // Ignore the token. 1483 return true 1484 } 1485 case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1486 // Ignore the token. 1487 return true 1488 } 1489 } 1490 return inBodyIM(p) 1491 } 1492 1493 // Section 12.2.6.4.12. 1494 func inColumnGroupIM(p *parser) bool { 1495 switch p.tok.Type { 1496 case TextToken: 1497 s := strings.TrimLeft(p.tok.Data, whitespace) 1498 if len(s) < len(p.tok.Data) { 1499 // Add the initial whitespace to the current node. 1500 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 1501 if s == "" { 1502 return true 1503 } 1504 p.tok.Data = s 1505 } 1506 case CommentToken: 1507 p.addChild(&Node{ 1508 Type: CommentNode, 1509 Data: p.tok.Data, 1510 }) 1511 return true 1512 case DoctypeToken: 1513 // Ignore the token. 1514 return true 1515 case StartTagToken: 1516 switch p.tok.DataAtom { 1517 case a.Html: 1518 return inBodyIM(p) 1519 case a.Col: 1520 p.addElement() 1521 p.oe.pop() 1522 p.acknowledgeSelfClosingTag() 1523 return true 1524 case a.Template: 1525 return inHeadIM(p) 1526 } 1527 case EndTagToken: 1528 switch p.tok.DataAtom { 1529 case a.Colgroup: 1530 if p.oe.top().DataAtom == a.Colgroup { 1531 p.oe.pop() 1532 p.im = inTableIM 1533 } 1534 return true 1535 case a.Col: 1536 // Ignore the token. 
1537 return true 1538 case a.Template: 1539 return inHeadIM(p) 1540 } 1541 case ErrorToken: 1542 return inBodyIM(p) 1543 } 1544 if p.oe.top().DataAtom != a.Colgroup { 1545 return true 1546 } 1547 p.oe.pop() 1548 p.im = inTableIM 1549 return false 1550 } 1551 1552 // Section 12.2.6.4.13. 1553 func inTableBodyIM(p *parser) bool { 1554 switch p.tok.Type { 1555 case StartTagToken: 1556 switch p.tok.DataAtom { 1557 case a.Tr: 1558 p.clearStackToContext(tableBodyScope) 1559 p.addElement() 1560 p.im = inRowIM 1561 return true 1562 case a.Td, a.Th: 1563 p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String()) 1564 return false 1565 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1566 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1567 p.im = inTableIM 1568 return false 1569 } 1570 // Ignore the token. 1571 return true 1572 } 1573 case EndTagToken: 1574 switch p.tok.DataAtom { 1575 case a.Tbody, a.Tfoot, a.Thead: 1576 if p.elementInScope(tableScope, p.tok.DataAtom) { 1577 p.clearStackToContext(tableBodyScope) 1578 p.oe.pop() 1579 p.im = inTableIM 1580 } 1581 return true 1582 case a.Table: 1583 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1584 p.im = inTableIM 1585 return false 1586 } 1587 // Ignore the token. 1588 return true 1589 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr: 1590 // Ignore the token. 1591 return true 1592 } 1593 case CommentToken: 1594 p.addChild(&Node{ 1595 Type: CommentNode, 1596 Data: p.tok.Data, 1597 }) 1598 return true 1599 } 1600 1601 return inTableIM(p) 1602 } 1603 1604 // Section 12.2.6.4.14. 
1605 func inRowIM(p *parser) bool { 1606 switch p.tok.Type { 1607 case StartTagToken: 1608 switch p.tok.DataAtom { 1609 case a.Td, a.Th: 1610 p.clearStackToContext(tableRowScope) 1611 p.addElement() 1612 p.afe = append(p.afe, &scopeMarker) 1613 p.im = inCellIM 1614 return true 1615 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1616 if p.popUntil(tableScope, a.Tr) { 1617 p.im = inTableBodyIM 1618 return false 1619 } 1620 // Ignore the token. 1621 return true 1622 } 1623 case EndTagToken: 1624 switch p.tok.DataAtom { 1625 case a.Tr: 1626 if p.popUntil(tableScope, a.Tr) { 1627 p.im = inTableBodyIM 1628 return true 1629 } 1630 // Ignore the token. 1631 return true 1632 case a.Table: 1633 if p.popUntil(tableScope, a.Tr) { 1634 p.im = inTableBodyIM 1635 return false 1636 } 1637 // Ignore the token. 1638 return true 1639 case a.Tbody, a.Tfoot, a.Thead: 1640 if p.elementInScope(tableScope, p.tok.DataAtom) { 1641 p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String()) 1642 return false 1643 } 1644 // Ignore the token. 1645 return true 1646 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th: 1647 // Ignore the token. 1648 return true 1649 } 1650 } 1651 1652 return inTableIM(p) 1653 } 1654 1655 // Section 12.2.6.4.15. 1656 func inCellIM(p *parser) bool { 1657 switch p.tok.Type { 1658 case StartTagToken: 1659 switch p.tok.DataAtom { 1660 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1661 if p.popUntil(tableScope, a.Td, a.Th) { 1662 // Close the cell and reprocess. 1663 p.clearActiveFormattingElements() 1664 p.im = inRowIM 1665 return false 1666 } 1667 // Ignore the token. 1668 return true 1669 case a.Select: 1670 p.reconstructActiveFormattingElements() 1671 p.addElement() 1672 p.framesetOK = false 1673 p.im = inSelectInTableIM 1674 return true 1675 } 1676 case EndTagToken: 1677 switch p.tok.DataAtom { 1678 case a.Td, a.Th: 1679 if !p.popUntil(tableScope, p.tok.DataAtom) { 1680 // Ignore the token. 
1681 return true 1682 } 1683 p.clearActiveFormattingElements() 1684 p.im = inRowIM 1685 return true 1686 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html: 1687 // Ignore the token. 1688 return true 1689 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1690 if !p.elementInScope(tableScope, p.tok.DataAtom) { 1691 // Ignore the token. 1692 return true 1693 } 1694 // Close the cell and reprocess. 1695 p.popUntil(tableScope, a.Td, a.Th) 1696 p.clearActiveFormattingElements() 1697 p.im = inRowIM 1698 return false 1699 } 1700 } 1701 return inBodyIM(p) 1702 } 1703 1704 // Section 12.2.6.4.16. 1705 func inSelectIM(p *parser) bool { 1706 switch p.tok.Type { 1707 case TextToken: 1708 p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) 1709 case StartTagToken: 1710 switch p.tok.DataAtom { 1711 case a.Html: 1712 return inBodyIM(p) 1713 case a.Option: 1714 if p.top().DataAtom == a.Option { 1715 p.oe.pop() 1716 } 1717 p.addElement() 1718 case a.Optgroup: 1719 if p.top().DataAtom == a.Option { 1720 p.oe.pop() 1721 } 1722 if p.top().DataAtom == a.Optgroup { 1723 p.oe.pop() 1724 } 1725 p.addElement() 1726 case a.Select: 1727 if p.popUntil(selectScope, a.Select) { 1728 p.resetInsertionMode() 1729 } else { 1730 // Ignore the token. 1731 return true 1732 } 1733 case a.Input, a.Keygen, a.Textarea: 1734 if p.elementInScope(selectScope, a.Select) { 1735 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) 1736 return false 1737 } 1738 // In order to properly ignore <textarea>, we need to change the tokenizer mode. 1739 p.tokenizer.NextIsNotRawText() 1740 // Ignore the token. 
1741 return true 1742 case a.Script, a.Template: 1743 return inHeadIM(p) 1744 } 1745 case EndTagToken: 1746 switch p.tok.DataAtom { 1747 case a.Option: 1748 if p.top().DataAtom == a.Option { 1749 p.oe.pop() 1750 } 1751 case a.Optgroup: 1752 i := len(p.oe) - 1 1753 if p.oe[i].DataAtom == a.Option { 1754 i-- 1755 } 1756 if p.oe[i].DataAtom == a.Optgroup { 1757 p.oe = p.oe[:i] 1758 } 1759 case a.Select: 1760 if p.popUntil(selectScope, a.Select) { 1761 p.resetInsertionMode() 1762 } else { 1763 // Ignore the token. 1764 return true 1765 } 1766 case a.Template: 1767 return inHeadIM(p) 1768 } 1769 case CommentToken: 1770 p.addChild(&Node{ 1771 Type: CommentNode, 1772 Data: p.tok.Data, 1773 }) 1774 case DoctypeToken: 1775 // Ignore the token. 1776 return true 1777 case ErrorToken: 1778 return inBodyIM(p) 1779 } 1780 1781 return true 1782 } 1783 1784 // Section 12.2.6.4.17. 1785 func inSelectInTableIM(p *parser) bool { 1786 switch p.tok.Type { 1787 case StartTagToken, EndTagToken: 1788 switch p.tok.DataAtom { 1789 case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th: 1790 if p.tok.Type == EndTagToken && !p.elementInScope(tableScope, p.tok.DataAtom) { 1791 // Ignore the token. 1792 return true 1793 } 1794 // This is like p.popUntil(selectScope, a.Select), but it also 1795 // matches <math select>, not just <select>. Matching the MathML 1796 // tag is arguably incorrect (conceptually), but it mimics what 1797 // Chromium does. 1798 for i := len(p.oe) - 1; i >= 0; i-- { 1799 if n := p.oe[i]; n.DataAtom == a.Select { 1800 p.oe = p.oe[:i] 1801 break 1802 } 1803 } 1804 p.resetInsertionMode() 1805 return false 1806 } 1807 } 1808 return inSelectIM(p) 1809 } 1810 1811 // Section 12.2.6.4.18. 
1812 func inTemplateIM(p *parser) bool { 1813 switch p.tok.Type { 1814 case TextToken, CommentToken, DoctypeToken: 1815 return inBodyIM(p) 1816 case StartTagToken: 1817 switch p.tok.DataAtom { 1818 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Template, a.Title: 1819 return inHeadIM(p) 1820 case a.Caption, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1821 p.templateStack.pop() 1822 p.templateStack = append(p.templateStack, inTableIM) 1823 p.im = inTableIM 1824 return false 1825 case a.Col: 1826 p.templateStack.pop() 1827 p.templateStack = append(p.templateStack, inColumnGroupIM) 1828 p.im = inColumnGroupIM 1829 return false 1830 case a.Tr: 1831 p.templateStack.pop() 1832 p.templateStack = append(p.templateStack, inTableBodyIM) 1833 p.im = inTableBodyIM 1834 return false 1835 case a.Td, a.Th: 1836 p.templateStack.pop() 1837 p.templateStack = append(p.templateStack, inRowIM) 1838 p.im = inRowIM 1839 return false 1840 default: 1841 p.templateStack.pop() 1842 p.templateStack = append(p.templateStack, inBodyIM) 1843 p.im = inBodyIM 1844 return false 1845 } 1846 case EndTagToken: 1847 switch p.tok.DataAtom { 1848 case a.Template: 1849 return inHeadIM(p) 1850 default: 1851 // Ignore the token. 1852 return true 1853 } 1854 case ErrorToken: 1855 if !p.oe.contains(a.Template) { 1856 // Ignore the token. 1857 return true 1858 } 1859 // TODO: remove this divergence from the HTML5 spec. 1860 // 1861 // See https://bugs.chromium.org/p/chromium/issues/detail?id=829668 1862 p.generateImpliedEndTags() 1863 for i := len(p.oe) - 1; i >= 0; i-- { 1864 if n := p.oe[i]; n.Namespace == "" && n.DataAtom == a.Template { 1865 p.oe = p.oe[:i] 1866 break 1867 } 1868 } 1869 p.clearActiveFormattingElements() 1870 p.templateStack.pop() 1871 p.resetInsertionMode() 1872 return false 1873 } 1874 return false 1875 } 1876 1877 // Section 12.2.6.4.19. 1878 func afterBodyIM(p *parser) bool { 1879 switch p.tok.Type { 1880 case ErrorToken: 1881 // Stop parsing. 
1882 return true 1883 case TextToken: 1884 s := strings.TrimLeft(p.tok.Data, whitespace) 1885 if len(s) == 0 { 1886 // It was all whitespace. 1887 return inBodyIM(p) 1888 } 1889 case StartTagToken: 1890 if p.tok.DataAtom == a.Html { 1891 return inBodyIM(p) 1892 } 1893 case EndTagToken: 1894 if p.tok.DataAtom == a.Html { 1895 if !p.fragment { 1896 p.im = afterAfterBodyIM 1897 } 1898 return true 1899 } 1900 case CommentToken: 1901 // The comment is attached to the <html> element. 1902 if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html { 1903 panic("html: bad parser state: <html> element not found, in the after-body insertion mode") 1904 } 1905 p.oe[0].AppendChild(&Node{ 1906 Type: CommentNode, 1907 Data: p.tok.Data, 1908 }) 1909 return true 1910 } 1911 p.im = inBodyIM 1912 return false 1913 } 1914 1915 // Section 12.2.6.4.20. 1916 func inFramesetIM(p *parser) bool { 1917 switch p.tok.Type { 1918 case CommentToken: 1919 p.addChild(&Node{ 1920 Type: CommentNode, 1921 Data: p.tok.Data, 1922 }) 1923 case TextToken: 1924 // Ignore all text but whitespace. 1925 s := strings.Map(func(c rune) rune { 1926 switch c { 1927 case ' ', '\t', '\n', '\f', '\r': 1928 return c 1929 } 1930 return -1 1931 }, p.tok.Data) 1932 if s != "" { 1933 p.addText(s) 1934 } 1935 case StartTagToken: 1936 switch p.tok.DataAtom { 1937 case a.Html: 1938 return inBodyIM(p) 1939 case a.Frameset: 1940 p.addElement() 1941 case a.Frame: 1942 p.addElement() 1943 p.oe.pop() 1944 p.acknowledgeSelfClosingTag() 1945 case a.Noframes: 1946 return inHeadIM(p) 1947 } 1948 case EndTagToken: 1949 switch p.tok.DataAtom { 1950 case a.Frameset: 1951 if p.oe.top().DataAtom != a.Html { 1952 p.oe.pop() 1953 if p.oe.top().DataAtom != a.Frameset { 1954 p.im = afterFramesetIM 1955 return true 1956 } 1957 } 1958 } 1959 default: 1960 // Ignore the token. 1961 } 1962 return true 1963 } 1964 1965 // Section 12.2.6.4.21. 
1966 func afterFramesetIM(p *parser) bool { 1967 switch p.tok.Type { 1968 case CommentToken: 1969 p.addChild(&Node{ 1970 Type: CommentNode, 1971 Data: p.tok.Data, 1972 }) 1973 case TextToken: 1974 // Ignore all text but whitespace. 1975 s := strings.Map(func(c rune) rune { 1976 switch c { 1977 case ' ', '\t', '\n', '\f', '\r': 1978 return c 1979 } 1980 return -1 1981 }, p.tok.Data) 1982 if s != "" { 1983 p.addText(s) 1984 } 1985 case StartTagToken: 1986 switch p.tok.DataAtom { 1987 case a.Html: 1988 return inBodyIM(p) 1989 case a.Noframes: 1990 return inHeadIM(p) 1991 } 1992 case EndTagToken: 1993 switch p.tok.DataAtom { 1994 case a.Html: 1995 p.im = afterAfterFramesetIM 1996 return true 1997 } 1998 default: 1999 // Ignore the token. 2000 } 2001 return true 2002 } 2003 2004 // Section 12.2.6.4.22. 2005 func afterAfterBodyIM(p *parser) bool { 2006 switch p.tok.Type { 2007 case ErrorToken: 2008 // Stop parsing. 2009 return true 2010 case TextToken: 2011 s := strings.TrimLeft(p.tok.Data, whitespace) 2012 if len(s) == 0 { 2013 // It was all whitespace. 2014 return inBodyIM(p) 2015 } 2016 case StartTagToken: 2017 if p.tok.DataAtom == a.Html { 2018 return inBodyIM(p) 2019 } 2020 case CommentToken: 2021 p.doc.AppendChild(&Node{ 2022 Type: CommentNode, 2023 Data: p.tok.Data, 2024 }) 2025 return true 2026 case DoctypeToken: 2027 return inBodyIM(p) 2028 } 2029 p.im = inBodyIM 2030 return false 2031 } 2032 2033 // Section 12.2.6.4.23. 2034 func afterAfterFramesetIM(p *parser) bool { 2035 switch p.tok.Type { 2036 case CommentToken: 2037 p.doc.AppendChild(&Node{ 2038 Type: CommentNode, 2039 Data: p.tok.Data, 2040 }) 2041 case TextToken: 2042 // Ignore all text but whitespace. 
2043 s := strings.Map(func(c rune) rune { 2044 switch c { 2045 case ' ', '\t', '\n', '\f', '\r': 2046 return c 2047 } 2048 return -1 2049 }, p.tok.Data) 2050 if s != "" { 2051 p.tok.Data = s 2052 return inBodyIM(p) 2053 } 2054 case StartTagToken: 2055 switch p.tok.DataAtom { 2056 case a.Html: 2057 return inBodyIM(p) 2058 case a.Noframes: 2059 return inHeadIM(p) 2060 } 2061 case DoctypeToken: 2062 return inBodyIM(p) 2063 default: 2064 // Ignore the token. 2065 } 2066 return true 2067 } 2068 2069 const whitespaceOrNUL = whitespace + "\x00" 2070 2071 // Section 12.2.6.5 2072 func parseForeignContent(p *parser) bool { 2073 switch p.tok.Type { 2074 case TextToken: 2075 if p.framesetOK { 2076 p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" 2077 } 2078 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) 2079 p.addText(p.tok.Data) 2080 case CommentToken: 2081 p.addChild(&Node{ 2082 Type: CommentNode, 2083 Data: p.tok.Data, 2084 }) 2085 case StartTagToken: 2086 b := breakout[p.tok.Data] 2087 if p.tok.DataAtom == a.Font { 2088 loop: 2089 for _, attr := range p.tok.Attr { 2090 switch attr.Key { 2091 case "color", "face", "size": 2092 b = true 2093 break loop 2094 } 2095 } 2096 } 2097 if b { 2098 for i := len(p.oe) - 1; i >= 0; i-- { 2099 n := p.oe[i] 2100 if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { 2101 p.oe = p.oe[:i+1] 2102 break 2103 } 2104 } 2105 return false 2106 } 2107 switch p.top().Namespace { 2108 case "math": 2109 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) 2110 case "svg": 2111 // Adjust SVG tag names. The tokenizer lower-cases tag names, but 2112 // SVG wants e.g. "foreignObject" with a capital second "O". 
2113 if x := svgTagNameAdjustments[p.tok.Data]; x != "" { 2114 p.tok.DataAtom = a.Lookup([]byte(x)) 2115 p.tok.Data = x 2116 } 2117 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) 2118 default: 2119 panic("html: bad parser state: unexpected namespace") 2120 } 2121 adjustForeignAttributes(p.tok.Attr) 2122 namespace := p.top().Namespace 2123 p.addElement() 2124 p.top().Namespace = namespace 2125 if namespace != "" { 2126 // Don't let the tokenizer go into raw text mode in foreign content 2127 // (e.g. in an SVG <title> tag). 2128 p.tokenizer.NextIsNotRawText() 2129 } 2130 if p.hasSelfClosingToken { 2131 p.oe.pop() 2132 p.acknowledgeSelfClosingTag() 2133 } 2134 case EndTagToken: 2135 for i := len(p.oe) - 1; i >= 0; i-- { 2136 if p.oe[i].Namespace == "" { 2137 return p.im(p) 2138 } 2139 if strings.EqualFold(p.oe[i].Data, p.tok.Data) { 2140 p.oe = p.oe[:i] 2141 break 2142 } 2143 } 2144 return true 2145 default: 2146 // Ignore the token. 2147 } 2148 return true 2149 } 2150 2151 // Section 12.2.6. 2152 func (p *parser) inForeignContent() bool { 2153 if len(p.oe) == 0 { 2154 return false 2155 } 2156 n := p.oe[len(p.oe)-1] 2157 if n.Namespace == "" { 2158 return false 2159 } 2160 if mathMLTextIntegrationPoint(n) { 2161 if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark { 2162 return false 2163 } 2164 if p.tok.Type == TextToken { 2165 return false 2166 } 2167 } 2168 if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg { 2169 return false 2170 } 2171 if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) { 2172 return false 2173 } 2174 if p.tok.Type == ErrorToken { 2175 return false 2176 } 2177 return true 2178 } 2179 2180 // parseImpliedToken parses a token as though it had appeared in the parser's 2181 // input. 
2182 func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) { 2183 realToken, selfClosing := p.tok, p.hasSelfClosingToken 2184 p.tok = Token{ 2185 Type: t, 2186 DataAtom: dataAtom, 2187 Data: data, 2188 } 2189 p.hasSelfClosingToken = false 2190 p.parseCurrentToken() 2191 p.tok, p.hasSelfClosingToken = realToken, selfClosing 2192 } 2193 2194 // parseCurrentToken runs the current token through the parsing routines 2195 // until it is consumed. 2196 func (p *parser) parseCurrentToken() { 2197 if p.tok.Type == SelfClosingTagToken { 2198 p.hasSelfClosingToken = true 2199 p.tok.Type = StartTagToken 2200 } 2201 2202 consumed := false 2203 for !consumed { 2204 if p.inForeignContent() { 2205 consumed = parseForeignContent(p) 2206 } else { 2207 consumed = p.im(p) 2208 } 2209 } 2210 2211 if p.hasSelfClosingToken { 2212 // This is a parse error, but ignore it. 2213 p.hasSelfClosingToken = false 2214 } 2215 } 2216 2217 func (p *parser) parse() error { 2218 // Iterate until EOF. Any other error will cause an early return. 2219 var err error 2220 for err != io.EOF { 2221 // CDATA sections are allowed only in foreign content. 2222 n := p.oe.top() 2223 p.tokenizer.AllowCDATA(n != nil && n.Namespace != "") 2224 // Read and parse the next token. 2225 p.tokenizer.Next() 2226 p.tok = p.tokenizer.Token() 2227 if p.tok.Type == ErrorToken { 2228 err = p.tokenizer.Err() 2229 if err != nil && err != io.EOF { 2230 return err 2231 } 2232 } 2233 p.parseCurrentToken() 2234 } 2235 return nil 2236 } 2237 2238 // Parse returns the parse tree for the HTML from the given Reader. 2239 // 2240 // It implements the HTML5 parsing algorithm 2241 // (https://html.spec.whatwg.org/multipage/syntax.html#tree-construction), 2242 // which is very complicated. The resultant tree can contain implicitly created 2243 // nodes that have no explicit <tag> listed in r's data, and nodes' parents can 2244 // differ from the nesting implied by a naive processing of start and end 2245 // <tag>s. 
Conversely, explicit <tag>s in r's data can be silently dropped, 2246 // with no corresponding node in the resulting tree. 2247 // 2248 // The input is assumed to be UTF-8 encoded. 2249 func Parse(r io.Reader) (*Node, error) { 2250 p := &parser{ 2251 tokenizer: NewTokenizer(r), 2252 doc: &Node{ 2253 Type: DocumentNode, 2254 }, 2255 scripting: true, 2256 framesetOK: true, 2257 im: initialIM, 2258 } 2259 err := p.parse() 2260 if err != nil { 2261 return nil, err 2262 } 2263 return p.doc, nil 2264 } 2265 2266 // ParseFragment parses a fragment of HTML and returns the nodes that were 2267 // found. If the fragment is the InnerHTML for an existing element, pass that 2268 // element in context. 2269 // 2270 // It has the same intricacies as Parse. 2271 func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { 2272 contextTag := "" 2273 if context != nil { 2274 if context.Type != ElementNode { 2275 return nil, errors.New("html: ParseFragment of non-element Node") 2276 } 2277 // The next check isn't just context.DataAtom.String() == context.Data because 2278 // it is valid to pass an element whose tag isn't a known atom. For example, 2279 // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. 
2280 if context.DataAtom != a.Lookup([]byte(context.Data)) { 2281 return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) 2282 } 2283 contextTag = context.DataAtom.String() 2284 } 2285 p := &parser{ 2286 tokenizer: NewTokenizerFragment(r, contextTag), 2287 doc: &Node{ 2288 Type: DocumentNode, 2289 }, 2290 scripting: true, 2291 fragment: true, 2292 context: context, 2293 } 2294 2295 root := &Node{ 2296 Type: ElementNode, 2297 DataAtom: a.Html, 2298 Data: a.Html.String(), 2299 } 2300 p.doc.AppendChild(root) 2301 p.oe = nodeStack{root} 2302 if context != nil && context.DataAtom == a.Template { 2303 p.templateStack = append(p.templateStack, inTemplateIM) 2304 } 2305 p.resetInsertionMode() 2306 2307 for n := context; n != nil; n = n.Parent { 2308 if n.Type == ElementNode && n.DataAtom == a.Form { 2309 p.form = n 2310 break 2311 } 2312 } 2313 2314 err := p.parse() 2315 if err != nil { 2316 return nil, err 2317 } 2318 2319 parent := p.doc 2320 if context != nil { 2321 parent = root 2322 } 2323 2324 var result []*Node 2325 for c := parent.FirstChild; c != nil; { 2326 next := c.NextSibling 2327 parent.RemoveChild(c) 2328 result = append(result, c) 2329 c = next 2330 } 2331 return result, nil 2332 }