// github.com/stackdocker/rkt@v0.10.1-0.20151109095037-1aa827478248/Godeps/_workspace/src/golang.org/x/net/html/parse.go

// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package html

import (
	"errors"
	"fmt"
	"io"
	"strings"

	a "github.com/coreos/rkt/Godeps/_workspace/src/golang.org/x/net/html/atom"
)

// A parser implements the HTML5 parsing algorithm:
// https://html.spec.whatwg.org/multipage/syntax.html#tree-construction
type parser struct {
	// tokenizer provides the tokens for the parser.
	tokenizer *Tokenizer
	// tok is the most recently read token.
	tok Token
	// Self-closing tags like <hr/> are treated as start tags, except that
	// hasSelfClosingToken is set while they are being processed.
	hasSelfClosingToken bool
	// doc is the document root element.
	doc *Node
	// The stack of open elements (section 12.2.3.2) and active formatting
	// elements (section 12.2.3.3).
	oe, afe nodeStack
	// Element pointers (section 12.2.3.4).
	head, form *Node
	// Other parsing state flags (section 12.2.3.5).
	scripting, framesetOK bool
	// im is the current insertion mode.
	im insertionMode
	// originalIM is the insertion mode to go back to after completing a text
	// or inTableText insertion mode.
	originalIM insertionMode
	// fosterParenting is whether new elements should be inserted according to
	// the foster parenting rules (section 12.2.5.3).
	fosterParenting bool
	// quirks is whether the parser is operating in "quirks mode."
	quirks bool
	// fragment is whether the parser is parsing an HTML fragment.
	fragment bool
	// context is the context element when parsing an HTML fragment
	// (section 12.4).
	context *Node
}

// top returns the top node of the stack of open elements, or the document
// root when p.oe.top() yields nil.
func (p *parser) top() *Node {
	if n := p.oe.top(); n != nil {
		return n
	}
	return p.doc
}

// Stop tags for use in popUntil. These come from section 12.2.3.2.
var (
	// defaultScopeStopTags is keyed by element namespace; the empty key holds
	// the stop tags for elements whose Namespace is "".
	defaultScopeStopTags = map[string][]a.Atom{
		"":     {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template},
		"math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext},
		"svg":  {a.Desc, a.ForeignObject, a.Title},
	}
)

// A scope selects the set of stop tags used by indexOfElementInScope (and
// therefore popUntil and elementInScope), or the set of elements at which
// clearStackToContext stops.
type scope int

const (
	defaultScope scope = iota
	listItemScope
	buttonScope
	tableScope
	tableRowScope
	tableBodyScope
	selectScope
)

// popUntil pops the stack of open elements at the highest element whose tag
// is in matchTags, provided there is no higher element in the scope's stop
// tags (as defined in section 12.2.3.2). It returns whether or not there was
// such an element. If there was not, popUntil leaves the stack unchanged.
//
// For example, the set of stop tags for table scope is: "html", "table". If
// the stack was:
//	["html", "body", "font", "table", "b", "i", "u"]
// then popUntil(tableScope, "font") would return false, but
// popUntil(tableScope, "i") would return true and the stack would become:
//	["html", "body", "font", "table", "b"]
//
// If an element's tag is in both the stop tags and matchTags, then the stack
// will be popped and the function returns true (provided, of course, there was
// no higher element in the stack that was also in the stop tags). For example,
// popUntil(tableScope, "table") returns true and leaves:
//	["html", "body", "font"]
func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool {
	if i := p.indexOfElementInScope(s, matchTags...); i != -1 {
		// Truncating at i removes the matched element itself as well as
		// everything above it.
		p.oe = p.oe[:i]
		return true
	}
	return false
}

// indexOfElementInScope returns the index in p.oe of the highest element whose
// tag is in matchTags that is in scope. If no matching element is in scope, it
// returns -1.
//
// Only defaultScope, listItemScope, buttonScope, tableScope and selectScope
// are handled; any other scope panics as soon as an element with an empty
// Namespace fails to match.
func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int {
	// Walk from the top of the stack (highest index) downwards.
	for i := len(p.oe) - 1; i >= 0; i-- {
		tagAtom := p.oe[i].DataAtom
		// Matching and the scope-specific stop tags apply only to elements
		// with an empty Namespace.
		if p.oe[i].Namespace == "" {
			// The match is tested before the stop tags, so a tag that is in
			// both sets counts as found.
			for _, t := range matchTags {
				if t == tagAtom {
					return i
				}
			}
			switch s {
			case defaultScope:
				// No-op.
			case listItemScope:
				if tagAtom == a.Ol || tagAtom == a.Ul {
					return -1
				}
			case buttonScope:
				if tagAtom == a.Button {
					return -1
				}
			case tableScope:
				if tagAtom == a.Html || tagAtom == a.Table {
					return -1
				}
			case selectScope:
				// In select scope every tag other than optgroup and option
				// is a stop tag.
				if tagAtom != a.Optgroup && tagAtom != a.Option {
					return -1
				}
			default:
				panic("unreachable")
			}
		}
		switch s {
		case defaultScope, listItemScope, buttonScope:
			// These three scopes additionally stop at the default stop tags
			// registered for the element's namespace.
			for _, t := range defaultScopeStopTags[p.oe[i].Namespace] {
				if t == tagAtom {
					return -1
				}
			}
		}
	}
	return -1
}

// elementInScope is like popUntil, except that it doesn't modify the stack of
// open elements.
func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool {
	return p.indexOfElementInScope(s, matchTags...) != -1
}

// clearStackToContext pops elements off the stack of open elements until a
// scope-defined element is found.
160 func (p *parser) clearStackToContext(s scope) { 161 for i := len(p.oe) - 1; i >= 0; i-- { 162 tagAtom := p.oe[i].DataAtom 163 switch s { 164 case tableScope: 165 if tagAtom == a.Html || tagAtom == a.Table { 166 p.oe = p.oe[:i+1] 167 return 168 } 169 case tableRowScope: 170 if tagAtom == a.Html || tagAtom == a.Tr { 171 p.oe = p.oe[:i+1] 172 return 173 } 174 case tableBodyScope: 175 if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead { 176 p.oe = p.oe[:i+1] 177 return 178 } 179 default: 180 panic("unreachable") 181 } 182 } 183 } 184 185 // generateImpliedEndTags pops nodes off the stack of open elements as long as 186 // the top node has a tag name of dd, dt, li, option, optgroup, p, rp, or rt. 187 // If exceptions are specified, nodes with that name will not be popped off. 188 func (p *parser) generateImpliedEndTags(exceptions ...string) { 189 var i int 190 loop: 191 for i = len(p.oe) - 1; i >= 0; i-- { 192 n := p.oe[i] 193 if n.Type == ElementNode { 194 switch n.DataAtom { 195 case a.Dd, a.Dt, a.Li, a.Option, a.Optgroup, a.P, a.Rp, a.Rt: 196 for _, except := range exceptions { 197 if n.Data == except { 198 break loop 199 } 200 } 201 continue 202 } 203 } 204 break 205 } 206 207 p.oe = p.oe[:i+1] 208 } 209 210 // addChild adds a child node n to the top element, and pushes n onto the stack 211 // of open elements if it is an element node. 212 func (p *parser) addChild(n *Node) { 213 if p.shouldFosterParent() { 214 p.fosterParent(n) 215 } else { 216 p.top().AppendChild(n) 217 } 218 219 if n.Type == ElementNode { 220 p.oe = append(p.oe, n) 221 } 222 } 223 224 // shouldFosterParent returns whether the next node to be added should be 225 // foster parented. 
func (p *parser) shouldFosterParent() bool {
	// Foster parenting applies only while the flag is set and the current
	// top node is a table, tbody, tfoot, thead or tr.
	if p.fosterParenting {
		switch p.top().DataAtom {
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
			return true
		}
	}
	return false
}

// fosterParent adds a child node according to the foster parenting rules.
// Section 12.2.5.3, "foster parenting".
func (p *parser) fosterParent(n *Node) {
	var table, parent, prev *Node
	var i int
	// Find the highest <table> on the stack of open elements; i is left at
	// its index.
	for i = len(p.oe) - 1; i >= 0; i-- {
		if p.oe[i].DataAtom == a.Table {
			table = p.oe[i]
			break
		}
	}

	if table == nil {
		// The foster parent is the html element.
		parent = p.oe[0]
	} else {
		parent = table.Parent
	}
	if parent == nil {
		// The table has no parent node: fall back to the element just below
		// it on the stack of open elements.
		parent = p.oe[i-1]
	}

	// prev is the node that n would end up immediately after.
	if table != nil {
		prev = table.PrevSibling
	} else {
		prev = parent.LastChild
	}
	// Merge adjacent text rather than creating a second text node.
	if prev != nil && prev.Type == TextNode && n.Type == TextNode {
		prev.Data += n.Data
		return
	}

	parent.InsertBefore(n, table)
}

// addText adds text to the preceding node if it is a text node, or else it
// calls addChild with a new text node.
func (p *parser) addText(text string) {
	if text == "" {
		return
	}

	if p.shouldFosterParent() {
		// fosterParent does its own merging with a preceding text node.
		p.fosterParent(&Node{
			Type: TextNode,
			Data: text,
		})
		return
	}

	t := p.top()
	if n := t.LastChild; n != nil && n.Type == TextNode {
		n.Data += text
		return
	}
	p.addChild(&Node{
		Type: TextNode,
		Data: text,
	})
}

// addElement adds a child element based on the current token.
func (p *parser) addElement() {
	p.addChild(&Node{
		Type:     ElementNode,
		DataAtom: p.tok.DataAtom,
		Data:     p.tok.Data,
		Attr:     p.tok.Attr,
	})
}

// addFormattingElement adds an element for the current token and appends it
// to the list of active formatting elements, first evicting an older
// identical entry if too many are already present.
// Section 12.2.3.3.
func (p *parser) addFormattingElement() {
	tagAtom, attr := p.tok.DataAtom, p.tok.Attr
	p.addElement()

	// Implement the Noah's Ark clause, but with three per family instead of two.
	identicalElements := 0
findIdenticalElements:
	// Scan back from the end of the list, stopping at the last scope marker.
	for i := len(p.afe) - 1; i >= 0; i-- {
		n := p.afe[i]
		if n.Type == scopeMarkerNode {
			break
		}
		if n.Type != ElementNode {
			continue
		}
		if n.Namespace != "" {
			continue
		}
		if n.DataAtom != tagAtom {
			continue
		}
		if len(n.Attr) != len(attr) {
			continue
		}
	compareAttributes:
		for _, t0 := range n.Attr {
			for _, t1 := range attr {
				if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
					// Found a match for this attribute, continue with the next attribute.
					continue compareAttributes
				}
			}
			// If we get here, there is no attribute that matches t0.
			// Therefore the element is not identical to the new one.
			continue findIdenticalElements
		}

		identicalElements++
		if identicalElements >= 3 {
			p.afe.remove(n)
		}
	}

	// p.top() is the element that addElement just pushed.
	p.afe = append(p.afe, p.top())
}

// clearActiveFormattingElements pops entries off the list of active
// formatting elements until it has popped a scope marker or the list is
// empty.
// Section 12.2.3.3.
func (p *parser) clearActiveFormattingElements() {
	for {
		n := p.afe.pop()
		if len(p.afe) == 0 || n.Type == scopeMarkerNode {
			return
		}
	}
}

// reconstructActiveFormattingElements re-opens, as clones, the trailing
// active formatting elements that are no longer on the stack of open
// elements.
// Section 12.2.3.3.
func (p *parser) reconstructActiveFormattingElements() {
	n := p.afe.top()
	if n == nil {
		return
	}
	// Nothing to do if the last entry is a marker or is still open.
	if n.Type == scopeMarkerNode || p.oe.index(n) != -1 {
		return
	}
	// Rewind i to the last entry that is a marker or is still open, or to -1
	// if the start of the list is reached first.
	i := len(p.afe) - 1
	for n.Type != scopeMarkerNode && p.oe.index(n) == -1 {
		if i == 0 {
			i = -1
			break
		}
		i--
		n = p.afe[i]
	}
	// Advance from the entry after that point to the end of the list,
	// replacing each entry with a freshly inserted clone.
	for {
		i++
		clone := p.afe[i].clone()
		p.addChild(clone)
		p.afe[i] = clone
		if i == len(p.afe)-1 {
			break
		}
	}
}

// acknowledgeSelfClosingTag clears the hasSelfClosingToken flag.
// Section 12.2.4.
func (p *parser) acknowledgeSelfClosingTag() {
	p.hasSelfClosingToken = false
}

// An insertion mode (section 12.2.3.1) is the state transition function from
// a particular state in the HTML5 parser's state machine. It updates the
// parser's fields depending on parser.tok (where ErrorToken means EOF).
// It returns whether the token was consumed.
type insertionMode func(*parser) bool

// setOriginalIM sets the insertion mode to return to after completing a text or
// inTableText insertion mode.
// Section 12.2.3.1, "using the rules for".
func (p *parser) setOriginalIM() {
	if p.originalIM != nil {
		panic("html: bad parser state: originalIM was set twice")
	}
	p.originalIM = p.im
}

// resetInsertionMode picks the insertion mode from the highest element on the
// stack of open elements that determines one, defaulting to inBodyIM.
// Section 12.2.3.1, "reset the insertion mode".
func (p *parser) resetInsertionMode() {
	for i := len(p.oe) - 1; i >= 0; i-- {
		n := p.oe[i]
		// At the bottom of the stack the context element, when set, stands
		// in for the element actually on the stack.
		if i == 0 && p.context != nil {
			n = p.context
		}

		switch n.DataAtom {
		case a.Select:
			p.im = inSelectIM
		case a.Td, a.Th:
			p.im = inCellIM
		case a.Tr:
			p.im = inRowIM
		case a.Tbody, a.Thead, a.Tfoot:
			p.im = inTableBodyIM
		case a.Caption:
			p.im = inCaptionIM
		case a.Colgroup:
			p.im = inColumnGroupIM
		case a.Table:
			p.im = inTableIM
		case a.Head:
			p.im = inBodyIM
		case a.Body:
			p.im = inBodyIM
		case a.Frameset:
			p.im = inFramesetIM
		case a.Html:
			p.im = beforeHeadIM
		default:
			// This element does not determine a mode; look further down.
			continue
		}
		return
	}
	p.im = inBodyIM
}

// whitespace is the cutset passed to strings.TrimLeft / strings.Trim by the
// insertion modes when stripping or testing for whitespace-only text.
const whitespace = " \t\r\n\f"

// initialIM ignores leading whitespace, records comments and the doctype on
// the document node, and otherwise switches to beforeHTMLIM in quirks mode
// without consuming the token.
// Section 12.2.5.4.1.
func initialIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace)
		if len(p.tok.Data) == 0 {
			// It was all whitespace, so ignore it.
			return true
		}
	case CommentToken:
		p.doc.AppendChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		n, quirks := parseDoctype(p.tok.Data)
		p.doc.AppendChild(n)
		p.quirks = quirks
		p.im = beforeHTMLIM
		return true
	}
	// Anything else means there was no doctype before the content.
	p.quirks = true
	p.im = beforeHTMLIM
	return false
}

// Section 12.2.5.4.2.
483 func beforeHTMLIM(p *parser) bool { 484 switch p.tok.Type { 485 case DoctypeToken: 486 // Ignore the token. 487 return true 488 case TextToken: 489 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 490 if len(p.tok.Data) == 0 { 491 // It was all whitespace, so ignore it. 492 return true 493 } 494 case StartTagToken: 495 if p.tok.DataAtom == a.Html { 496 p.addElement() 497 p.im = beforeHeadIM 498 return true 499 } 500 case EndTagToken: 501 switch p.tok.DataAtom { 502 case a.Head, a.Body, a.Html, a.Br: 503 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) 504 return false 505 default: 506 // Ignore the token. 507 return true 508 } 509 case CommentToken: 510 p.doc.AppendChild(&Node{ 511 Type: CommentNode, 512 Data: p.tok.Data, 513 }) 514 return true 515 } 516 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) 517 return false 518 } 519 520 // Section 12.2.5.4.3. 521 func beforeHeadIM(p *parser) bool { 522 switch p.tok.Type { 523 case TextToken: 524 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 525 if len(p.tok.Data) == 0 { 526 // It was all whitespace, so ignore it. 527 return true 528 } 529 case StartTagToken: 530 switch p.tok.DataAtom { 531 case a.Head: 532 p.addElement() 533 p.head = p.top() 534 p.im = inHeadIM 535 return true 536 case a.Html: 537 return inBodyIM(p) 538 } 539 case EndTagToken: 540 switch p.tok.DataAtom { 541 case a.Head, a.Body, a.Html, a.Br: 542 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) 543 return false 544 default: 545 // Ignore the token. 546 return true 547 } 548 case CommentToken: 549 p.addChild(&Node{ 550 Type: CommentNode, 551 Data: p.tok.Data, 552 }) 553 return true 554 case DoctypeToken: 555 // Ignore the token. 556 return true 557 } 558 559 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) 560 return false 561 } 562 563 // Section 12.2.5.4.4. 
func inHeadIM(p *parser) bool {
	// Tokens that fall out of this switch imply a </head> end tag and are
	// then reprocessed.
	switch p.tok.Type {
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) < len(p.tok.Data) {
			// Add the initial whitespace to the current node.
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
			if s == "" {
				return true
			}
			p.tok.Data = s
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta:
			// The element is added and immediately popped: it stays in the
			// tree but not on the stack of open elements.
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			return true
		case a.Script, a.Title, a.Noscript, a.Noframes, a.Style:
			// Switch to textIM, remembering the current mode to return to.
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
			return true
		case a.Head:
			// Ignore the token.
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Head:
			n := p.oe.pop()
			if n.DataAtom != a.Head {
				panic("html: bad parser state: <head> element not found, in the in-head insertion mode")
			}
			p.im = afterHeadIM
			return true
		case a.Body, a.Html, a.Br:
			p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
			return false
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	p.parseImpliedToken(EndTagToken, a.Head, a.Head.String())
	return false
}

// Section 12.2.5.4.6.
func afterHeadIM(p *parser) bool {
	// Tokens that fall out of this switch imply a <body> start tag and are
	// then reprocessed.
	switch p.tok.Type {
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) < len(p.tok.Data) {
			// Add the initial whitespace to the current node.
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
			if s == "" {
				return true
			}
			p.tok.Data = s
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Body:
			p.addElement()
			p.framesetOK = false
			p.im = inBodyIM
			return true
		case a.Frameset:
			p.addElement()
			p.im = inFramesetIM
			return true
		case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title:
			// Temporarily push the head element back onto the stack so that
			// inHeadIM inserts into it; the deferred remove takes it off
			// again once inHeadIM has returned.
			p.oe = append(p.oe, p.head)
			defer p.oe.remove(p.head)
			return inHeadIM(p)
		case a.Head:
			// Ignore the token.
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Body, a.Html, a.Br:
			// Drop down to creating an implied <body> tag.
		default:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	p.parseImpliedToken(StartTagToken, a.Body, a.Body.String())
	p.framesetOK = true
	return false
}

// copyAttributes copies attributes of src not found on dst to dst.
// Attributes are compared by Key only; the first occurrence of a key wins.
func copyAttributes(dst *Node, src Token) {
	if len(src.Attr) == 0 {
		return
	}
	// attr records the keys already present on dst, including the ones
	// appended by this call.
	attr := map[string]string{}
	for _, t := range dst.Attr {
		attr[t.Key] = t.Val
	}
	for _, t := range src.Attr {
		if _, ok := attr[t.Key]; !ok {
			dst.Attr = append(dst.Attr, t)
			attr[t.Key] = t.Val
		}
	}
}

// Section 12.2.5.4.7.
func inBodyIM(p *parser) bool {
	// Unless a case returns explicitly, the token is reported as consumed by
	// the final "return true". Doctype tokens and any other token type not
	// listed fall straight through to it.
	switch p.tok.Type {
	case TextToken:
		d := p.tok.Data
		switch n := p.oe.top(); n.DataAtom {
		case a.Pre, a.Listing:
			if n.FirstChild == nil {
				// Ignore a newline at the start of a <pre> block.
				if d != "" && d[0] == '\r' {
					d = d[1:]
				}
				if d != "" && d[0] == '\n' {
					d = d[1:]
				}
			}
		}
		// NUL bytes are dropped from body text.
		d = strings.Replace(d, "\x00", "", -1)
		if d == "" {
			return true
		}
		p.reconstructActiveFormattingElements()
		p.addText(d)
		if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
			// There were non-whitespace characters inserted.
			p.framesetOK = false
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			copyAttributes(p.oe[0], p.tok)
		case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title:
			return inHeadIM(p)
		case a.Body:
			// A second <body> only contributes attributes to the existing
			// body element, which is expected at index 1 of the stack.
			if len(p.oe) >= 2 {
				body := p.oe[1]
				if body.Type == ElementNode && body.DataAtom == a.Body {
					p.framesetOK = false
					copyAttributes(body, p.tok)
				}
			}
		case a.Frameset:
			if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
				// Ignore the token.
				return true
			}
			// Detach the body, cut the stack back to its bottom element and
			// insert the frameset there instead.
			body := p.oe[1]
			if body.Parent != nil {
				body.Parent.RemoveChild(body)
			}
			p.oe = p.oe[:1]
			p.addElement()
			p.im = inFramesetIM
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul:
			// Close an open <p> in button scope before opening the element.
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			p.popUntil(buttonScope, a.P)
			// Headings do not nest: pop a heading that is the current node.
			switch n := p.top(); n.DataAtom {
			case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
				p.oe.pop()
			}
			p.addElement()
		case a.Pre, a.Listing:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			// The newline, if any, will be dealt with by the TextToken case.
			p.framesetOK = false
		case a.Form:
			// A <form> is ignored while another form element is recorded.
			if p.form == nil {
				p.popUntil(buttonScope, a.P)
				p.addElement()
				p.form = p.top()
			}
		case a.Li:
			p.framesetOK = false
			// Walk down the stack: an open <li> is popped (with everything
			// above it); address, div, p and non-special elements are
			// skipped; any other special element ends the search.
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Li:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Dd, a.Dt:
			p.framesetOK = false
			// Same search as for <li>, but looking for an open dd or dt.
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Dd, a.Dt:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Plaintext:
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Button:
			p.popUntil(defaultScope, a.Button)
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
		case a.A:
			// If an <a> is already among the active formatting elements
			// after the last marker, close it first and drop it from both
			// lists.
			for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
				if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
					p.inBodyEndTagFormatting(a.A)
					p.oe.remove(n)
					p.afe.remove(n)
					break
				}
			}
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.Nobr:
			p.reconstructActiveFormattingElements()
			if p.elementInScope(defaultScope, a.Nobr) {
				p.inBodyEndTagFormatting(a.Nobr)
				p.reconstructActiveFormattingElements()
			}
			p.addFormattingElement()
		case a.Applet, a.Marquee, a.Object:
			p.reconstructActiveFormattingElements()
			p.addElement()
			// Push a marker onto the list of active formatting elements.
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
		case a.Table:
			// Outside quirks mode a <table> closes an open <p>.
			if !p.quirks {
				p.popUntil(buttonScope, a.P)
			}
			p.addElement()
			p.framesetOK = false
			p.im = inTableIM
			return true
		case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			if p.tok.DataAtom == a.Input {
				for _, t := range p.tok.Attr {
					if t.Key == "type" {
						if strings.ToLower(t.Val) == "hidden" {
							// Skip setting framesetOK = false
							return true
						}
					}
				}
			}
			p.framesetOK = false
		case a.Param, a.Source, a.Track:
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
		case a.Hr:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			p.framesetOK = false
		case a.Image:
			// Rewrite the token as <img> and have it reprocessed.
			p.tok.DataAtom = a.Img
			p.tok.Data = a.Img.String()
			return false
		case a.Isindex:
			if p.form != nil {
				// Ignore the token.
				return true
			}
			// <isindex> is expanded into a form holding hr, label (prompt
			// text plus an input named "isindex") and hr.
			action := ""
			prompt := "This is a searchable index. Enter search keywords: "
			attr := []Attribute{{Key: "name", Val: "isindex"}}
			for _, t := range p.tok.Attr {
				switch t.Key {
				case "action":
					action = t.Val
				case "name":
					// Ignore the attribute.
				case "prompt":
					prompt = t.Val
				default:
					attr = append(attr, t)
				}
			}
			p.acknowledgeSelfClosingTag()
			p.popUntil(buttonScope, a.P)
			p.parseImpliedToken(StartTagToken, a.Form, a.Form.String())
			if action != "" {
				p.form.Attr = []Attribute{{Key: "action", Val: action}}
			}
			p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
			p.parseImpliedToken(StartTagToken, a.Label, a.Label.String())
			p.addText(prompt)
			p.addChild(&Node{
				Type:     ElementNode,
				DataAtom: a.Input,
				Data:     a.Input.String(),
				Attr:     attr,
			})
			p.oe.pop()
			p.parseImpliedToken(EndTagToken, a.Label, a.Label.String())
			p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
			p.parseImpliedToken(EndTagToken, a.Form, a.Form.String())
		case a.Textarea:
			p.addElement()
			p.setOriginalIM()
			p.framesetOK = false
			p.im = textIM
		case a.Xmp:
			p.popUntil(buttonScope, a.P)
			p.reconstructActiveFormattingElements()
			p.framesetOK = false
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
		case a.Iframe:
			p.framesetOK = false
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
		case a.Noembed, a.Noscript:
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
		case a.Select:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
			p.im = inSelectIM
			return true
		case a.Optgroup, a.Option:
			// An <option> that is the current node is closed implicitly.
			if p.top().DataAtom == a.Option {
				p.oe.pop()
			}
			p.reconstructActiveFormattingElements()
			p.addElement()
		case a.Rp, a.Rt:
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags()
			}
			p.addElement()
		case a.Math, a.Svg:
			p.reconstructActiveFormattingElements()
			if p.tok.DataAtom == a.Math {
				adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
			} else {
				adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
			}
			adjustForeignAttributes(p.tok.Attr)
			p.addElement()
			// The new element's namespace is the token's tag name.
			p.top().Namespace = p.tok.Data
			if p.hasSelfClosingToken {
				p.oe.pop()
				p.acknowledgeSelfClosingTag()
			}
			return true
		case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
			// Ignore the token.
		default:
			p.reconstructActiveFormattingElements()
			p.addElement()
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Body:
			if p.elementInScope(defaultScope, a.Body) {
				p.im = afterBodyIM
			}
		case a.Html:
			// Treat </html> as </body> followed by reprocessing, provided a
			// body is in scope.
			if p.elementInScope(defaultScope, a.Body) {
				p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
				return false
			}
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.Form:
			// The form pointer is cleared even when the token ends up being
			// ignored.
			node := p.form
			p.form = nil
			i := p.indexOfElementInScope(defaultScope, a.Form)
			if node == nil || i == -1 || p.oe[i] != node {
				// Ignore the token.
				return true
			}
			p.generateImpliedEndTags()
			p.oe.remove(node)
		case a.P:
			// A </p> with no <p> in button scope first creates an empty one.
			if !p.elementInScope(buttonScope, a.P) {
				p.parseImpliedToken(StartTagToken, a.P, a.P.String())
			}
			p.popUntil(buttonScope, a.P)
		case a.Li:
			p.popUntil(listItemScope, a.Li)
		case a.Dd, a.Dt:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			// Any heading end tag closes whichever heading is open.
			p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
		case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.inBodyEndTagFormatting(p.tok.DataAtom)
		case a.Applet, a.Marquee, a.Object:
			if p.popUntil(defaultScope, p.tok.DataAtom) {
				p.clearActiveFormattingElements()
			}
		case a.Br:
			// </br> is reprocessed as a <br> start tag.
			p.tok.Type = StartTagToken
			return false
		default:
			p.inBodyEndTagOther(p.tok.DataAtom)
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
	}

	return true
}

// inBodyEndTagFormatting handles an end tag for the formatting element named
// by tagAtom while in inBodyIM.
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
	// This is the "adoption agency" algorithm, described at
	// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency

	// TODO: this is a fairly literal line-by-line translation of that algorithm.
	// Once the code successfully parses the comprehensive test suite, we should
	// refactor this code to be more idiomatic.

	// Steps 1-4. The outer loop.
	for i := 0; i < 8; i++ {
		// Step 5. Find the formatting element.
		var formattingElement *Node
		for j := len(p.afe) - 1; j >= 0; j-- {
			if p.afe[j].Type == scopeMarkerNode {
				break
			}
			if p.afe[j].DataAtom == tagAtom {
				formattingElement = p.afe[j]
				break
			}
		}
		if formattingElement == nil {
			// No such active formatting element: handle it like any other
			// end tag.
			p.inBodyEndTagOther(tagAtom)
			return
		}
		feIndex := p.oe.index(formattingElement)
		if feIndex == -1 {
			// The element is no longer open; just forget about it.
			p.afe.remove(formattingElement)
			return
		}
		if !p.elementInScope(defaultScope, tagAtom) {
			// Ignore the tag.
			return
		}

		// Steps 9-10. Find the furthest block.
		var furthestBlock *Node
		for _, e := range p.oe[feIndex:] {
			if isSpecialElement(e) {
				furthestBlock = e
				break
			}
		}
		if furthestBlock == nil {
			// No special element at or above the formatting element: pop up
			// to and including it and drop it from the active list.
			e := p.oe.pop()
			for e != formattingElement {
				e = p.oe.pop()
			}
			p.afe.remove(e)
			return
		}

		// Steps 11-12. Find the common ancestor and bookmark node.
		commonAncestor := p.oe[feIndex-1]
		bookmark := p.afe.index(formattingElement)

		// Step 13. The inner loop. Find the lastNode to reparent.
		lastNode := furthestBlock
		node := furthestBlock
		x := p.oe.index(node)
		// Steps 13.1-13.2
		for j := 0; j < 3; j++ {
			// Step 13.3.
			x--
			node = p.oe[x]
			// Step 13.4 - 13.5.
			if p.afe.index(node) == -1 {
				p.oe.remove(node)
				continue
			}
			// Step 13.6.
			if node == formattingElement {
				break
			}
			// Step 13.7.
			clone := node.clone()
			p.afe[p.afe.index(node)] = clone
			p.oe[p.oe.index(node)] = clone
			node = clone
			// Step 13.8.
			if lastNode == furthestBlock {
				bookmark = p.afe.index(node) + 1
			}
			// Step 13.9.
			if lastNode.Parent != nil {
				lastNode.Parent.RemoveChild(lastNode)
			}
			node.AppendChild(lastNode)
			// Step 13.10.
			lastNode = node
		}

		// Step 14. Reparent lastNode to the common ancestor,
		// or for misnested table nodes, to the foster parent.
		if lastNode.Parent != nil {
			lastNode.Parent.RemoveChild(lastNode)
		}
		switch commonAncestor.DataAtom {
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
			p.fosterParent(lastNode)
		default:
			commonAncestor.AppendChild(lastNode)
		}

		// Steps 15-17. Reparent nodes from the furthest block's children
		// to a clone of the formatting element.
		clone := formattingElement.clone()
		reparentChildren(clone, furthestBlock)
		furthestBlock.AppendChild(clone)

		// Step 18. Fix up the list of active formatting elements.
		if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
			// Move the bookmark with the rest of the list.
			bookmark--
		}
		p.afe.remove(formattingElement)
		p.afe.insert(bookmark, clone)

		// Step 19. Fix up the stack of open elements.
		p.oe.remove(formattingElement)
		p.oe.insert(p.oe.index(furthestBlock)+1, clone)
	}
}

// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM.
// "Any other end tag" handling from 12.2.5.5 The rules for parsing tokens in foreign content
// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign
func (p *parser) inBodyEndTagOther(tagAtom a.Atom) {
	// Search down the stack for a matching element and pop through it, but
	// give up (leaving the stack untouched) at the first special element.
	for i := len(p.oe) - 1; i >= 0; i-- {
		if p.oe[i].DataAtom == tagAtom {
			p.oe = p.oe[:i]
			break
		}
		if isSpecialElement(p.oe[i]) {
			break
		}
	}
}

// textIM appends text tokens to the current node. For any other token it
// restores the original insertion mode, first popping the current node on an
// end tag or ErrorToken. Only text and end tags are reported as consumed.
// Section 12.2.5.4.8.
func textIM(p *parser) bool {
	switch p.tok.Type {
	case ErrorToken:
		p.oe.pop()
	case TextToken:
		d := p.tok.Data
		if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil {
			// Ignore a newline at the start of a <textarea> block.
			if d != "" && d[0] == '\r' {
				d = d[1:]
			}
			if d != "" && d[0] == '\n' {
				d = d[1:]
			}
		}
		if d == "" {
			return true
		}
		p.addText(d)
		return true
	case EndTagToken:
		p.oe.pop()
	}
	// Leave text mode; originalIM is cleared so that setOriginalIM can be
	// called again without panicking.
	p.im = p.originalIM
	p.originalIM = nil
	return p.tok.Type == EndTagToken
}

// Section 12.2.5.4.9.
func inTableIM(p *parser) bool {
	// Tokens that fall out of this switch are processed by inBodyIM with
	// foster parenting enabled.
	switch p.tok.Type {
	case ErrorToken:
		// Stop parsing.
		return true
	case TextToken:
		p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1)
		switch p.oe.top().DataAtom {
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
			// Whitespace-only text is added directly; other text falls
			// through to the foster-parented inBodyIM call.
			if strings.Trim(p.tok.Data, whitespace) == "" {
				p.addText(p.tok.Data)
				return true
			}
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Caption:
			p.clearStackToContext(tableScope)
			p.afe = append(p.afe, &scopeMarker)
			p.addElement()
			p.im = inCaptionIM
			return true
		case a.Colgroup:
			p.clearStackToContext(tableScope)
			p.addElement()
			p.im = inColumnGroupIM
			return true
		case a.Col:
			// Imply a <colgroup> and reprocess the <col>.
			p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String())
			return false
		case a.Tbody, a.Tfoot, a.Thead:
			p.clearStackToContext(tableScope)
			p.addElement()
			p.im = inTableBodyIM
			return true
		case a.Td, a.Th, a.Tr:
			// Imply a <tbody> and reprocess the token.
			p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String())
			return false
		case a.Table:
			// A nested <table> start tag closes the open table and is then
			// reprocessed in the reset insertion mode.
			if p.popUntil(tableScope, a.Table) {
				p.resetInsertionMode()
				return false
			}
			// Ignore the token.
			return true
		case a.Style, a.Script:
			return inHeadIM(p)
		case a.Input:
			for _, t := range p.tok.Attr {
				if t.Key == "type" && strings.ToLower(t.Val) == "hidden" {
					p.addElement()
					p.oe.pop()
					return true
				}
			}
			// Otherwise drop down to the default action.
		case a.Form:
			if p.form != nil {
				// Ignore the token.
				return true
			}
			// The form is inserted and immediately popped, but remembered
			// as the form element pointer.
			p.addElement()
			p.form = p.oe.pop()
		case a.Select:
			p.reconstructActiveFormattingElements()
			switch p.top().DataAtom {
			case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
				p.fosterParenting = true
			}
			p.addElement()
			p.fosterParenting = false
			p.framesetOK = false
			p.im = inSelectInTableIM
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Table:
			if p.popUntil(tableScope, a.Table) {
				p.resetInsertionMode()
				return true
			}
			// Ignore the token.
			return true
		case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
			// Ignore the token.
			return true
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	}

	// Enable foster parenting for the duration of the inBodyIM call only.
	p.fosterParenting = true
	defer func() { p.fosterParenting = false }()

	return inBodyIM(p)
}

// Section 12.2.5.4.11.
func inCaptionIM(p *parser) bool {
	// Tokens that fall out of this switch are processed by inBodyIM.
	switch p.tok.Type {
	case StartTagToken:
		switch p.tok.DataAtom {
		// NOTE(review): a.Th is absent from this list although it appears in
		// the end-tag ignore list below, so a <th> start tag is handed to
		// inBodyIM — confirm against the spec whether that is intended.
		case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr:
			// Close the caption and reprocess the token in inTableIM.
			if p.popUntil(tableScope, a.Caption) {
				p.clearActiveFormattingElements()
				p.im = inTableIM
				return false
			} else {
				// Ignore the token.
				return true
			}
		case a.Select:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
			p.im = inSelectInTableIM
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Caption:
			if p.popUntil(tableScope, a.Caption) {
				p.clearActiveFormattingElements()
				p.im = inTableIM
			}
			return true
		case a.Table:
			// </table> closes the caption and is then reprocessed.
			if p.popUntil(tableScope, a.Caption) {
				p.clearActiveFormattingElements()
				p.im = inTableIM
				return false
			} else {
				// Ignore the token.
				return true
			}
		case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
			// Ignore the token.
			return true
		}
	}
	return inBodyIM(p)
}

// Section 12.2.5.4.12.
func inColumnGroupIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		s := strings.TrimLeft(p.tok.Data, whitespace)
		if len(s) < len(p.tok.Data) {
			// Add the initial whitespace to the current node.
			p.addText(p.tok.Data[:len(p.tok.Data)-len(s)])
			if s == "" {
				return true
			}
			p.tok.Data = s
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
		return true
	case DoctypeToken:
		// Ignore the token.
		return true
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			return inBodyIM(p)
		case a.Col:
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			return true
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Colgroup:
			// The token is consumed either way; the pop only happens when
			// the current node is not the html element.
			if p.oe.top().DataAtom != a.Html {
				p.oe.pop()
				p.im = inTableIM
			}
			return true
		case a.Col:
			// Ignore the token.
			return true
		}
	}
	// Anything else: close the column group and reprocess the token in
	// inTableIM, unless the current node is the html element, in which case
	// the token is dropped.
	if p.oe.top().DataAtom != a.Html {
		p.oe.pop()
		p.im = inTableIM
		return false
	}
	return true
}

// Section 12.2.5.4.13.
1412 func inTableBodyIM(p *parser) bool { 1413 switch p.tok.Type { 1414 case StartTagToken: 1415 switch p.tok.DataAtom { 1416 case a.Tr: 1417 p.clearStackToContext(tableBodyScope) 1418 p.addElement() 1419 p.im = inRowIM 1420 return true 1421 case a.Td, a.Th: 1422 p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String()) 1423 return false 1424 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1425 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1426 p.im = inTableIM 1427 return false 1428 } 1429 // Ignore the token. 1430 return true 1431 } 1432 case EndTagToken: 1433 switch p.tok.DataAtom { 1434 case a.Tbody, a.Tfoot, a.Thead: 1435 if p.elementInScope(tableScope, p.tok.DataAtom) { 1436 p.clearStackToContext(tableBodyScope) 1437 p.oe.pop() 1438 p.im = inTableIM 1439 } 1440 return true 1441 case a.Table: 1442 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1443 p.im = inTableIM 1444 return false 1445 } 1446 // Ignore the token. 1447 return true 1448 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr: 1449 // Ignore the token. 1450 return true 1451 } 1452 case CommentToken: 1453 p.addChild(&Node{ 1454 Type: CommentNode, 1455 Data: p.tok.Data, 1456 }) 1457 return true 1458 } 1459 1460 return inTableIM(p) 1461 } 1462 1463 // Section 12.2.5.4.14. 1464 func inRowIM(p *parser) bool { 1465 switch p.tok.Type { 1466 case StartTagToken: 1467 switch p.tok.DataAtom { 1468 case a.Td, a.Th: 1469 p.clearStackToContext(tableRowScope) 1470 p.addElement() 1471 p.afe = append(p.afe, &scopeMarker) 1472 p.im = inCellIM 1473 return true 1474 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1475 if p.popUntil(tableScope, a.Tr) { 1476 p.im = inTableBodyIM 1477 return false 1478 } 1479 // Ignore the token. 1480 return true 1481 } 1482 case EndTagToken: 1483 switch p.tok.DataAtom { 1484 case a.Tr: 1485 if p.popUntil(tableScope, a.Tr) { 1486 p.im = inTableBodyIM 1487 return true 1488 } 1489 // Ignore the token. 
1490 return true 1491 case a.Table: 1492 if p.popUntil(tableScope, a.Tr) { 1493 p.im = inTableBodyIM 1494 return false 1495 } 1496 // Ignore the token. 1497 return true 1498 case a.Tbody, a.Tfoot, a.Thead: 1499 if p.elementInScope(tableScope, p.tok.DataAtom) { 1500 p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String()) 1501 return false 1502 } 1503 // Ignore the token. 1504 return true 1505 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th: 1506 // Ignore the token. 1507 return true 1508 } 1509 } 1510 1511 return inTableIM(p) 1512 } 1513 1514 // Section 12.2.5.4.15. 1515 func inCellIM(p *parser) bool { 1516 switch p.tok.Type { 1517 case StartTagToken: 1518 switch p.tok.DataAtom { 1519 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1520 if p.popUntil(tableScope, a.Td, a.Th) { 1521 // Close the cell and reprocess. 1522 p.clearActiveFormattingElements() 1523 p.im = inRowIM 1524 return false 1525 } 1526 // Ignore the token. 1527 return true 1528 case a.Select: 1529 p.reconstructActiveFormattingElements() 1530 p.addElement() 1531 p.framesetOK = false 1532 p.im = inSelectInTableIM 1533 return true 1534 } 1535 case EndTagToken: 1536 switch p.tok.DataAtom { 1537 case a.Td, a.Th: 1538 if !p.popUntil(tableScope, p.tok.DataAtom) { 1539 // Ignore the token. 1540 return true 1541 } 1542 p.clearActiveFormattingElements() 1543 p.im = inRowIM 1544 return true 1545 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html: 1546 // Ignore the token. 1547 return true 1548 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1549 if !p.elementInScope(tableScope, p.tok.DataAtom) { 1550 // Ignore the token. 1551 return true 1552 } 1553 // Close the cell and reprocess. 1554 p.popUntil(tableScope, a.Td, a.Th) 1555 p.clearActiveFormattingElements() 1556 p.im = inRowIM 1557 return false 1558 } 1559 } 1560 return inBodyIM(p) 1561 } 1562 1563 // Section 12.2.5.4.16. 
1564 func inSelectIM(p *parser) bool { 1565 switch p.tok.Type { 1566 case ErrorToken: 1567 // Stop parsing. 1568 return true 1569 case TextToken: 1570 p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) 1571 case StartTagToken: 1572 switch p.tok.DataAtom { 1573 case a.Html: 1574 return inBodyIM(p) 1575 case a.Option: 1576 if p.top().DataAtom == a.Option { 1577 p.oe.pop() 1578 } 1579 p.addElement() 1580 case a.Optgroup: 1581 if p.top().DataAtom == a.Option { 1582 p.oe.pop() 1583 } 1584 if p.top().DataAtom == a.Optgroup { 1585 p.oe.pop() 1586 } 1587 p.addElement() 1588 case a.Select: 1589 p.tok.Type = EndTagToken 1590 return false 1591 case a.Input, a.Keygen, a.Textarea: 1592 if p.elementInScope(selectScope, a.Select) { 1593 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) 1594 return false 1595 } 1596 // In order to properly ignore <textarea>, we need to change the tokenizer mode. 1597 p.tokenizer.NextIsNotRawText() 1598 // Ignore the token. 1599 return true 1600 case a.Script: 1601 return inHeadIM(p) 1602 } 1603 case EndTagToken: 1604 switch p.tok.DataAtom { 1605 case a.Option: 1606 if p.top().DataAtom == a.Option { 1607 p.oe.pop() 1608 } 1609 case a.Optgroup: 1610 i := len(p.oe) - 1 1611 if p.oe[i].DataAtom == a.Option { 1612 i-- 1613 } 1614 if p.oe[i].DataAtom == a.Optgroup { 1615 p.oe = p.oe[:i] 1616 } 1617 case a.Select: 1618 if p.popUntil(selectScope, a.Select) { 1619 p.resetInsertionMode() 1620 } 1621 } 1622 case CommentToken: 1623 p.addChild(&Node{ 1624 Type: CommentNode, 1625 Data: p.tok.Data, 1626 }) 1627 case DoctypeToken: 1628 // Ignore the token. 1629 return true 1630 } 1631 1632 return true 1633 } 1634 1635 // Section 12.2.5.4.17. 
1636 func inSelectInTableIM(p *parser) bool { 1637 switch p.tok.Type { 1638 case StartTagToken, EndTagToken: 1639 switch p.tok.DataAtom { 1640 case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th: 1641 if p.tok.Type == StartTagToken || p.elementInScope(tableScope, p.tok.DataAtom) { 1642 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) 1643 return false 1644 } else { 1645 // Ignore the token. 1646 return true 1647 } 1648 } 1649 } 1650 return inSelectIM(p) 1651 } 1652 1653 // Section 12.2.5.4.18. 1654 func afterBodyIM(p *parser) bool { 1655 switch p.tok.Type { 1656 case ErrorToken: 1657 // Stop parsing. 1658 return true 1659 case TextToken: 1660 s := strings.TrimLeft(p.tok.Data, whitespace) 1661 if len(s) == 0 { 1662 // It was all whitespace. 1663 return inBodyIM(p) 1664 } 1665 case StartTagToken: 1666 if p.tok.DataAtom == a.Html { 1667 return inBodyIM(p) 1668 } 1669 case EndTagToken: 1670 if p.tok.DataAtom == a.Html { 1671 if !p.fragment { 1672 p.im = afterAfterBodyIM 1673 } 1674 return true 1675 } 1676 case CommentToken: 1677 // The comment is attached to the <html> element. 1678 if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html { 1679 panic("html: bad parser state: <html> element not found, in the after-body insertion mode") 1680 } 1681 p.oe[0].AppendChild(&Node{ 1682 Type: CommentNode, 1683 Data: p.tok.Data, 1684 }) 1685 return true 1686 } 1687 p.im = inBodyIM 1688 return false 1689 } 1690 1691 // Section 12.2.5.4.19. 1692 func inFramesetIM(p *parser) bool { 1693 switch p.tok.Type { 1694 case CommentToken: 1695 p.addChild(&Node{ 1696 Type: CommentNode, 1697 Data: p.tok.Data, 1698 }) 1699 case TextToken: 1700 // Ignore all text but whitespace. 
1701 s := strings.Map(func(c rune) rune { 1702 switch c { 1703 case ' ', '\t', '\n', '\f', '\r': 1704 return c 1705 } 1706 return -1 1707 }, p.tok.Data) 1708 if s != "" { 1709 p.addText(s) 1710 } 1711 case StartTagToken: 1712 switch p.tok.DataAtom { 1713 case a.Html: 1714 return inBodyIM(p) 1715 case a.Frameset: 1716 p.addElement() 1717 case a.Frame: 1718 p.addElement() 1719 p.oe.pop() 1720 p.acknowledgeSelfClosingTag() 1721 case a.Noframes: 1722 return inHeadIM(p) 1723 } 1724 case EndTagToken: 1725 switch p.tok.DataAtom { 1726 case a.Frameset: 1727 if p.oe.top().DataAtom != a.Html { 1728 p.oe.pop() 1729 if p.oe.top().DataAtom != a.Frameset { 1730 p.im = afterFramesetIM 1731 return true 1732 } 1733 } 1734 } 1735 default: 1736 // Ignore the token. 1737 } 1738 return true 1739 } 1740 1741 // Section 12.2.5.4.20. 1742 func afterFramesetIM(p *parser) bool { 1743 switch p.tok.Type { 1744 case CommentToken: 1745 p.addChild(&Node{ 1746 Type: CommentNode, 1747 Data: p.tok.Data, 1748 }) 1749 case TextToken: 1750 // Ignore all text but whitespace. 1751 s := strings.Map(func(c rune) rune { 1752 switch c { 1753 case ' ', '\t', '\n', '\f', '\r': 1754 return c 1755 } 1756 return -1 1757 }, p.tok.Data) 1758 if s != "" { 1759 p.addText(s) 1760 } 1761 case StartTagToken: 1762 switch p.tok.DataAtom { 1763 case a.Html: 1764 return inBodyIM(p) 1765 case a.Noframes: 1766 return inHeadIM(p) 1767 } 1768 case EndTagToken: 1769 switch p.tok.DataAtom { 1770 case a.Html: 1771 p.im = afterAfterFramesetIM 1772 return true 1773 } 1774 default: 1775 // Ignore the token. 1776 } 1777 return true 1778 } 1779 1780 // Section 12.2.5.4.21. 1781 func afterAfterBodyIM(p *parser) bool { 1782 switch p.tok.Type { 1783 case ErrorToken: 1784 // Stop parsing. 1785 return true 1786 case TextToken: 1787 s := strings.TrimLeft(p.tok.Data, whitespace) 1788 if len(s) == 0 { 1789 // It was all whitespace. 
1790 return inBodyIM(p) 1791 } 1792 case StartTagToken: 1793 if p.tok.DataAtom == a.Html { 1794 return inBodyIM(p) 1795 } 1796 case CommentToken: 1797 p.doc.AppendChild(&Node{ 1798 Type: CommentNode, 1799 Data: p.tok.Data, 1800 }) 1801 return true 1802 case DoctypeToken: 1803 return inBodyIM(p) 1804 } 1805 p.im = inBodyIM 1806 return false 1807 } 1808 1809 // Section 12.2.5.4.22. 1810 func afterAfterFramesetIM(p *parser) bool { 1811 switch p.tok.Type { 1812 case CommentToken: 1813 p.doc.AppendChild(&Node{ 1814 Type: CommentNode, 1815 Data: p.tok.Data, 1816 }) 1817 case TextToken: 1818 // Ignore all text but whitespace. 1819 s := strings.Map(func(c rune) rune { 1820 switch c { 1821 case ' ', '\t', '\n', '\f', '\r': 1822 return c 1823 } 1824 return -1 1825 }, p.tok.Data) 1826 if s != "" { 1827 p.tok.Data = s 1828 return inBodyIM(p) 1829 } 1830 case StartTagToken: 1831 switch p.tok.DataAtom { 1832 case a.Html: 1833 return inBodyIM(p) 1834 case a.Noframes: 1835 return inHeadIM(p) 1836 } 1837 case DoctypeToken: 1838 return inBodyIM(p) 1839 default: 1840 // Ignore the token. 1841 } 1842 return true 1843 } 1844 1845 const whitespaceOrNUL = whitespace + "\x00" 1846 1847 // Section 12.2.5.5. 
1848 func parseForeignContent(p *parser) bool { 1849 switch p.tok.Type { 1850 case TextToken: 1851 if p.framesetOK { 1852 p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" 1853 } 1854 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) 1855 p.addText(p.tok.Data) 1856 case CommentToken: 1857 p.addChild(&Node{ 1858 Type: CommentNode, 1859 Data: p.tok.Data, 1860 }) 1861 case StartTagToken: 1862 b := breakout[p.tok.Data] 1863 if p.tok.DataAtom == a.Font { 1864 loop: 1865 for _, attr := range p.tok.Attr { 1866 switch attr.Key { 1867 case "color", "face", "size": 1868 b = true 1869 break loop 1870 } 1871 } 1872 } 1873 if b { 1874 for i := len(p.oe) - 1; i >= 0; i-- { 1875 n := p.oe[i] 1876 if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { 1877 p.oe = p.oe[:i+1] 1878 break 1879 } 1880 } 1881 return false 1882 } 1883 switch p.top().Namespace { 1884 case "math": 1885 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) 1886 case "svg": 1887 // Adjust SVG tag names. The tokenizer lower-cases tag names, but 1888 // SVG wants e.g. "foreignObject" with a capital second "O". 1889 if x := svgTagNameAdjustments[p.tok.Data]; x != "" { 1890 p.tok.DataAtom = a.Lookup([]byte(x)) 1891 p.tok.Data = x 1892 } 1893 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) 1894 default: 1895 panic("html: bad parser state: unexpected namespace") 1896 } 1897 adjustForeignAttributes(p.tok.Attr) 1898 namespace := p.top().Namespace 1899 p.addElement() 1900 p.top().Namespace = namespace 1901 if namespace != "" { 1902 // Don't let the tokenizer go into raw text mode in foreign content 1903 // (e.g. in an SVG <title> tag). 
1904 p.tokenizer.NextIsNotRawText() 1905 } 1906 if p.hasSelfClosingToken { 1907 p.oe.pop() 1908 p.acknowledgeSelfClosingTag() 1909 } 1910 case EndTagToken: 1911 for i := len(p.oe) - 1; i >= 0; i-- { 1912 if p.oe[i].Namespace == "" { 1913 return p.im(p) 1914 } 1915 if strings.EqualFold(p.oe[i].Data, p.tok.Data) { 1916 p.oe = p.oe[:i] 1917 break 1918 } 1919 } 1920 return true 1921 default: 1922 // Ignore the token. 1923 } 1924 return true 1925 } 1926 1927 // Section 12.2.5. 1928 func (p *parser) inForeignContent() bool { 1929 if len(p.oe) == 0 { 1930 return false 1931 } 1932 n := p.oe[len(p.oe)-1] 1933 if n.Namespace == "" { 1934 return false 1935 } 1936 if mathMLTextIntegrationPoint(n) { 1937 if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark { 1938 return false 1939 } 1940 if p.tok.Type == TextToken { 1941 return false 1942 } 1943 } 1944 if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg { 1945 return false 1946 } 1947 if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) { 1948 return false 1949 } 1950 if p.tok.Type == ErrorToken { 1951 return false 1952 } 1953 return true 1954 } 1955 1956 // parseImpliedToken parses a token as though it had appeared in the parser's 1957 // input. 1958 func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) { 1959 realToken, selfClosing := p.tok, p.hasSelfClosingToken 1960 p.tok = Token{ 1961 Type: t, 1962 DataAtom: dataAtom, 1963 Data: data, 1964 } 1965 p.hasSelfClosingToken = false 1966 p.parseCurrentToken() 1967 p.tok, p.hasSelfClosingToken = realToken, selfClosing 1968 } 1969 1970 // parseCurrentToken runs the current token through the parsing routines 1971 // until it is consumed. 
1972 func (p *parser) parseCurrentToken() { 1973 if p.tok.Type == SelfClosingTagToken { 1974 p.hasSelfClosingToken = true 1975 p.tok.Type = StartTagToken 1976 } 1977 1978 consumed := false 1979 for !consumed { 1980 if p.inForeignContent() { 1981 consumed = parseForeignContent(p) 1982 } else { 1983 consumed = p.im(p) 1984 } 1985 } 1986 1987 if p.hasSelfClosingToken { 1988 // This is a parse error, but ignore it. 1989 p.hasSelfClosingToken = false 1990 } 1991 } 1992 1993 func (p *parser) parse() error { 1994 // Iterate until EOF. Any other error will cause an early return. 1995 var err error 1996 for err != io.EOF { 1997 // CDATA sections are allowed only in foreign content. 1998 n := p.oe.top() 1999 p.tokenizer.AllowCDATA(n != nil && n.Namespace != "") 2000 // Read and parse the next token. 2001 p.tokenizer.Next() 2002 p.tok = p.tokenizer.Token() 2003 if p.tok.Type == ErrorToken { 2004 err = p.tokenizer.Err() 2005 if err != nil && err != io.EOF { 2006 return err 2007 } 2008 } 2009 p.parseCurrentToken() 2010 } 2011 return nil 2012 } 2013 2014 // Parse returns the parse tree for the HTML from the given Reader. 2015 // The input is assumed to be UTF-8 encoded. 2016 func Parse(r io.Reader) (*Node, error) { 2017 p := &parser{ 2018 tokenizer: NewTokenizer(r), 2019 doc: &Node{ 2020 Type: DocumentNode, 2021 }, 2022 scripting: true, 2023 framesetOK: true, 2024 im: initialIM, 2025 } 2026 err := p.parse() 2027 if err != nil { 2028 return nil, err 2029 } 2030 return p.doc, nil 2031 } 2032 2033 // ParseFragment parses a fragment of HTML and returns the nodes that were 2034 // found. If the fragment is the InnerHTML for an existing element, pass that 2035 // element in context. 
2036 func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { 2037 contextTag := "" 2038 if context != nil { 2039 if context.Type != ElementNode { 2040 return nil, errors.New("html: ParseFragment of non-element Node") 2041 } 2042 // The next check isn't just context.DataAtom.String() == context.Data because 2043 // it is valid to pass an element whose tag isn't a known atom. For example, 2044 // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. 2045 if context.DataAtom != a.Lookup([]byte(context.Data)) { 2046 return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) 2047 } 2048 contextTag = context.DataAtom.String() 2049 } 2050 p := &parser{ 2051 tokenizer: NewTokenizerFragment(r, contextTag), 2052 doc: &Node{ 2053 Type: DocumentNode, 2054 }, 2055 scripting: true, 2056 fragment: true, 2057 context: context, 2058 } 2059 2060 root := &Node{ 2061 Type: ElementNode, 2062 DataAtom: a.Html, 2063 Data: a.Html.String(), 2064 } 2065 p.doc.AppendChild(root) 2066 p.oe = nodeStack{root} 2067 p.resetInsertionMode() 2068 2069 for n := context; n != nil; n = n.Parent { 2070 if n.Type == ElementNode && n.DataAtom == a.Form { 2071 p.form = n 2072 break 2073 } 2074 } 2075 2076 err := p.parse() 2077 if err != nil { 2078 return nil, err 2079 } 2080 2081 parent := p.doc 2082 if context != nil { 2083 parent = root 2084 } 2085 2086 var result []*Node 2087 for c := parent.FirstChild; c != nil; { 2088 next := c.NextSibling 2089 parent.RemoveChild(c) 2090 result = append(result, c) 2091 c = next 2092 } 2093 return result, nil 2094 }