github.com/slspeek/camlistore_namedsearch@v0.0.0-20140519202248-ed6f70f7721a/third_party/code.google.com/p/go.net/html/parse.go (about) 1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package html 6 7 import ( 8 "errors" 9 "fmt" 10 "io" 11 "strings" 12 13 a "camlistore.org/third_party/code.google.com/p/go.net/html/atom" 14 ) 15 16 // A parser implements the HTML5 parsing algorithm: 17 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction 18 type parser struct { 19 // tokenizer provides the tokens for the parser. 20 tokenizer *Tokenizer 21 // tok is the most recently read token. 22 tok Token 23 // Self-closing tags like <hr/> are treated as start tags, except that 24 // hasSelfClosingToken is set while they are being processed. 25 hasSelfClosingToken bool 26 // doc is the document root element. 27 doc *Node 28 // The stack of open elements (section 12.2.3.2) and active formatting 29 // elements (section 12.2.3.3). 30 oe, afe nodeStack 31 // Element pointers (section 12.2.3.4). 32 head, form *Node 33 // Other parsing state flags (section 12.2.3.5). 34 scripting, framesetOK bool 35 // im is the current insertion mode. 36 im insertionMode 37 // originalIM is the insertion mode to go back to after completing a text 38 // or inTableText insertion mode. 39 originalIM insertionMode 40 // fosterParenting is whether new elements should be inserted according to 41 // the foster parenting rules (section 12.2.5.3). 42 fosterParenting bool 43 // quirks is whether the parser is operating in "quirks mode." 44 quirks bool 45 // fragment is whether the parser is parsing an HTML fragment. 46 fragment bool 47 // context is the context element when parsing an HTML fragment 48 // (section 12.4). 49 context *Node 50 } 51 52 func (p *parser) top() *Node { 53 if n := p.oe.top(); n != nil { 54 return n 55 } 56 return p.doc 57 } 58 59 // Stop tags for use in popUntil. These come from section 12.2.3.2. 60 var ( 61 defaultScopeStopTags = map[string][]a.Atom{ 62 "": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object}, 63 "math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext}, 64 "svg": {a.Desc, a.ForeignObject, a.Title}, 65 } 66 ) 67 68 type scope int 69 70 const ( 71 defaultScope scope = iota 72 listItemScope 73 buttonScope 74 tableScope 75 tableRowScope 76 tableBodyScope 77 selectScope 78 ) 79 80 // popUntil pops the stack of open elements at the highest element whose tag 81 // is in matchTags, provided there is no higher element in the scope's stop 82 // tags (as defined in section 12.2.3.2). It returns whether or not there was 83 // such an element. If there was not, popUntil leaves the stack unchanged. 84 // 85 // For example, the set of stop tags for table scope is: "html", "table". If 86 // the stack was: 87 // ["html", "body", "font", "table", "b", "i", "u"] 88 // then popUntil(tableScope, "font") would return false, but 89 // popUntil(tableScope, "i") would return true and the stack would become: 90 // ["html", "body", "font", "table", "b"] 91 // 92 // If an element's tag is in both the stop tags and matchTags, then the stack 93 // will be popped and the function returns true (provided, of course, there was 94 // no higher element in the stack that was also in the stop tags). For example, 95 // popUntil(tableScope, "table") returns true and leaves: 96 // ["html", "body", "font"] 97 func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool { 98 if i := p.indexOfElementInScope(s, matchTags...); i != -1 { 99 p.oe = p.oe[:i] 100 return true 101 } 102 return false 103 } 104 105 // indexOfElementInScope returns the index in p.oe of the highest element whose 106 // tag is in matchTags that is in scope. If no matching element is in scope, it 107 // returns -1. 108 func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int { 109 for i := len(p.oe) - 1; i >= 0; i-- { 110 tagAtom := p.oe[i].DataAtom 111 if p.oe[i].Namespace == "" { 112 for _, t := range matchTags { 113 if t == tagAtom { 114 return i 115 } 116 } 117 switch s { 118 case defaultScope: 119 // No-op. 120 case listItemScope: 121 if tagAtom == a.Ol || tagAtom == a.Ul { 122 return -1 123 } 124 case buttonScope: 125 if tagAtom == a.Button { 126 return -1 127 } 128 case tableScope: 129 if tagAtom == a.Html || tagAtom == a.Table { 130 return -1 131 } 132 case selectScope: 133 if tagAtom != a.Optgroup && tagAtom != a.Option { 134 return -1 135 } 136 default: 137 panic("unreachable") 138 } 139 } 140 switch s { 141 case defaultScope, listItemScope, buttonScope: 142 for _, t := range defaultScopeStopTags[p.oe[i].Namespace] { 143 if t == tagAtom { 144 return -1 145 } 146 } 147 } 148 } 149 return -1 150 } 151 152 // elementInScope is like popUntil, except that it doesn't modify the stack of 153 // open elements. 154 func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool { 155 return p.indexOfElementInScope(s, matchTags...) != -1 156 } 157 158 // clearStackToContext pops elements off the stack of open elements until a 159 // scope-defined element is found. 160 func (p *parser) clearStackToContext(s scope) { 161 for i := len(p.oe) - 1; i >= 0; i-- { 162 tagAtom := p.oe[i].DataAtom 163 switch s { 164 case tableScope: 165 if tagAtom == a.Html || tagAtom == a.Table { 166 p.oe = p.oe[:i+1] 167 return 168 } 169 case tableRowScope: 170 if tagAtom == a.Html || tagAtom == a.Tr { 171 p.oe = p.oe[:i+1] 172 return 173 } 174 case tableBodyScope: 175 if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead { 176 p.oe = p.oe[:i+1] 177 return 178 } 179 default: 180 panic("unreachable") 181 } 182 } 183 } 184 185 // generateImpliedEndTags pops nodes off the stack of open elements as long as 186 // the top node has a tag name of dd, dt, li, option, optgroup, p, rp, or rt. 187 // If exceptions are specified, nodes with that name will not be popped off. 188 func (p *parser) generateImpliedEndTags(exceptions ...string) { 189 var i int 190 loop: 191 for i = len(p.oe) - 1; i >= 0; i-- { 192 n := p.oe[i] 193 if n.Type == ElementNode { 194 switch n.DataAtom { 195 case a.Dd, a.Dt, a.Li, a.Option, a.Optgroup, a.P, a.Rp, a.Rt: 196 for _, except := range exceptions { 197 if n.Data == except { 198 break loop 199 } 200 } 201 continue 202 } 203 } 204 break 205 } 206 207 p.oe = p.oe[:i+1] 208 } 209 210 // addChild adds a child node n to the top element, and pushes n onto the stack 211 // of open elements if it is an element node. 212 func (p *parser) addChild(n *Node) { 213 if p.shouldFosterParent() { 214 p.fosterParent(n) 215 } else { 216 p.top().AppendChild(n) 217 } 218 219 if n.Type == ElementNode { 220 p.oe = append(p.oe, n) 221 } 222 } 223 224 // shouldFosterParent returns whether the next node to be added should be 225 // foster parented. 226 func (p *parser) shouldFosterParent() bool { 227 if p.fosterParenting { 228 switch p.top().DataAtom { 229 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 230 return true 231 } 232 } 233 return false 234 } 235 236 // fosterParent adds a child node according to the foster parenting rules. 237 // Section 12.2.5.3, "foster parenting". 238 func (p *parser) fosterParent(n *Node) { 239 var table, parent, prev *Node 240 var i int 241 for i = len(p.oe) - 1; i >= 0; i-- { 242 if p.oe[i].DataAtom == a.Table { 243 table = p.oe[i] 244 break 245 } 246 } 247 248 if table == nil { 249 // The foster parent is the html element. 250 parent = p.oe[0] 251 } else { 252 parent = table.Parent 253 } 254 if parent == nil { 255 parent = p.oe[i-1] 256 } 257 258 if table != nil { 259 prev = table.PrevSibling 260 } else { 261 prev = parent.LastChild 262 } 263 if prev != nil && prev.Type == TextNode && n.Type == TextNode { 264 prev.Data += n.Data 265 return 266 } 267 268 parent.InsertBefore(n, table) 269 } 270 271 // addText adds text to the preceding node if it is a text node, or else it 272 // calls addChild with a new text node. 273 func (p *parser) addText(text string) { 274 if text == "" { 275 return 276 } 277 278 if p.shouldFosterParent() { 279 p.fosterParent(&Node{ 280 Type: TextNode, 281 Data: text, 282 }) 283 return 284 } 285 286 t := p.top() 287 if n := t.LastChild; n != nil && n.Type == TextNode { 288 n.Data += text 289 return 290 } 291 p.addChild(&Node{ 292 Type: TextNode, 293 Data: text, 294 }) 295 } 296 297 // addElement adds a child element based on the current token. 298 func (p *parser) addElement() { 299 p.addChild(&Node{ 300 Type: ElementNode, 301 DataAtom: p.tok.DataAtom, 302 Data: p.tok.Data, 303 Attr: p.tok.Attr, 304 }) 305 } 306 307 // Section 12.2.3.3. 308 func (p *parser) addFormattingElement() { 309 tagAtom, attr := p.tok.DataAtom, p.tok.Attr 310 p.addElement() 311 312 // Implement the Noah's Ark clause, but with three per family instead of two. 313 identicalElements := 0 314 findIdenticalElements: 315 for i := len(p.afe) - 1; i >= 0; i-- { 316 n := p.afe[i] 317 if n.Type == scopeMarkerNode { 318 break 319 } 320 if n.Type != ElementNode { 321 continue 322 } 323 if n.Namespace != "" { 324 continue 325 } 326 if n.DataAtom != tagAtom { 327 continue 328 } 329 if len(n.Attr) != len(attr) { 330 continue 331 } 332 compareAttributes: 333 for _, t0 := range n.Attr { 334 for _, t1 := range attr { 335 if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val { 336 // Found a match for this attribute, continue with the next attribute. 337 continue compareAttributes 338 } 339 } 340 // If we get here, there is no attribute that matches a. 341 // Therefore the element is not identical to the new one. 342 continue findIdenticalElements 343 } 344 345 identicalElements++ 346 if identicalElements >= 3 { 347 p.afe.remove(n) 348 } 349 } 350 351 p.afe = append(p.afe, p.top()) 352 } 353 354 // Section 12.2.3.3. 355 func (p *parser) clearActiveFormattingElements() { 356 for { 357 n := p.afe.pop() 358 if len(p.afe) == 0 || n.Type == scopeMarkerNode { 359 return 360 } 361 } 362 } 363 364 // Section 12.2.3.3. 365 func (p *parser) reconstructActiveFormattingElements() { 366 n := p.afe.top() 367 if n == nil { 368 return 369 } 370 if n.Type == scopeMarkerNode || p.oe.index(n) != -1 { 371 return 372 } 373 i := len(p.afe) - 1 374 for n.Type != scopeMarkerNode && p.oe.index(n) == -1 { 375 if i == 0 { 376 i = -1 377 break 378 } 379 i-- 380 n = p.afe[i] 381 } 382 for { 383 i++ 384 clone := p.afe[i].clone() 385 p.addChild(clone) 386 p.afe[i] = clone 387 if i == len(p.afe)-1 { 388 break 389 } 390 } 391 } 392 393 // Section 12.2.4. 394 func (p *parser) acknowledgeSelfClosingTag() { 395 p.hasSelfClosingToken = false 396 } 397 398 // An insertion mode (section 12.2.3.1) is the state transition function from 399 // a particular state in the HTML5 parser's state machine. It updates the 400 // parser's fields depending on parser.tok (where ErrorToken means EOF). 401 // It returns whether the token was consumed. 402 type insertionMode func(*parser) bool 403 404 // setOriginalIM sets the insertion mode to return to after completing a text or 405 // inTableText insertion mode. 406 // Section 12.2.3.1, "using the rules for". 407 func (p *parser) setOriginalIM() { 408 if p.originalIM != nil { 409 panic("html: bad parser state: originalIM was set twice") 410 } 411 p.originalIM = p.im 412 } 413 414 // Section 12.2.3.1, "reset the insertion mode". 415 func (p *parser) resetInsertionMode() { 416 for i := len(p.oe) - 1; i >= 0; i-- { 417 n := p.oe[i] 418 if i == 0 && p.context != nil { 419 n = p.context 420 } 421 422 switch n.DataAtom { 423 case a.Select: 424 p.im = inSelectIM 425 case a.Td, a.Th: 426 p.im = inCellIM 427 case a.Tr: 428 p.im = inRowIM 429 case a.Tbody, a.Thead, a.Tfoot: 430 p.im = inTableBodyIM 431 case a.Caption: 432 p.im = inCaptionIM 433 case a.Colgroup: 434 p.im = inColumnGroupIM 435 case a.Table: 436 p.im = inTableIM 437 case a.Head: 438 p.im = inBodyIM 439 case a.Body: 440 p.im = inBodyIM 441 case a.Frameset: 442 p.im = inFramesetIM 443 case a.Html: 444 p.im = beforeHeadIM 445 default: 446 continue 447 } 448 return 449 } 450 p.im = inBodyIM 451 } 452 453 const whitespace = " \t\r\n\f" 454 455 // Section 12.2.5.4.1. 456 func initialIM(p *parser) bool { 457 switch p.tok.Type { 458 case TextToken: 459 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 460 if len(p.tok.Data) == 0 { 461 // It was all whitespace, so ignore it. 462 return true 463 } 464 case CommentToken: 465 p.doc.AppendChild(&Node{ 466 Type: CommentNode, 467 Data: p.tok.Data, 468 }) 469 return true 470 case DoctypeToken: 471 n, quirks := parseDoctype(p.tok.Data) 472 p.doc.AppendChild(n) 473 p.quirks = quirks 474 p.im = beforeHTMLIM 475 return true 476 } 477 p.quirks = true 478 p.im = beforeHTMLIM 479 return false 480 } 481 482 // Section 12.2.5.4.2. 483 func beforeHTMLIM(p *parser) bool { 484 switch p.tok.Type { 485 case DoctypeToken: 486 // Ignore the token. 487 return true 488 case TextToken: 489 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 490 if len(p.tok.Data) == 0 { 491 // It was all whitespace, so ignore it. 492 return true 493 } 494 case StartTagToken: 495 if p.tok.DataAtom == a.Html { 496 p.addElement() 497 p.im = beforeHeadIM 498 return true 499 } 500 case EndTagToken: 501 switch p.tok.DataAtom { 502 case a.Head, a.Body, a.Html, a.Br: 503 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) 504 return false 505 default: 506 // Ignore the token. 507 return true 508 } 509 case CommentToken: 510 p.doc.AppendChild(&Node{ 511 Type: CommentNode, 512 Data: p.tok.Data, 513 }) 514 return true 515 } 516 p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) 517 return false 518 } 519 520 // Section 12.2.5.4.3. 521 func beforeHeadIM(p *parser) bool { 522 switch p.tok.Type { 523 case TextToken: 524 p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) 525 if len(p.tok.Data) == 0 { 526 // It was all whitespace, so ignore it. 527 return true 528 } 529 case StartTagToken: 530 switch p.tok.DataAtom { 531 case a.Head: 532 p.addElement() 533 p.head = p.top() 534 p.im = inHeadIM 535 return true 536 case a.Html: 537 return inBodyIM(p) 538 } 539 case EndTagToken: 540 switch p.tok.DataAtom { 541 case a.Head, a.Body, a.Html, a.Br: 542 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) 543 return false 544 default: 545 // Ignore the token. 546 return true 547 } 548 case CommentToken: 549 p.addChild(&Node{ 550 Type: CommentNode, 551 Data: p.tok.Data, 552 }) 553 return true 554 case DoctypeToken: 555 // Ignore the token. 556 return true 557 } 558 559 p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) 560 return false 561 } 562 563 // Section 12.2.5.4.4. 564 func inHeadIM(p *parser) bool { 565 switch p.tok.Type { 566 case TextToken: 567 s := strings.TrimLeft(p.tok.Data, whitespace) 568 if len(s) < len(p.tok.Data) { 569 // Add the initial whitespace to the current node. 570 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 571 if s == "" { 572 return true 573 } 574 p.tok.Data = s 575 } 576 case StartTagToken: 577 switch p.tok.DataAtom { 578 case a.Html: 579 return inBodyIM(p) 580 case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta: 581 p.addElement() 582 p.oe.pop() 583 p.acknowledgeSelfClosingTag() 584 return true 585 case a.Script, a.Title, a.Noscript, a.Noframes, a.Style: 586 p.addElement() 587 p.setOriginalIM() 588 p.im = textIM 589 return true 590 case a.Head: 591 // Ignore the token. 592 return true 593 } 594 case EndTagToken: 595 switch p.tok.DataAtom { 596 case a.Head: 597 n := p.oe.pop() 598 if n.DataAtom != a.Head { 599 panic("html: bad parser state: <head> element not found, in the in-head insertion mode") 600 } 601 p.im = afterHeadIM 602 return true 603 case a.Body, a.Html, a.Br: 604 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) 605 return false 606 default: 607 // Ignore the token. 608 return true 609 } 610 case CommentToken: 611 p.addChild(&Node{ 612 Type: CommentNode, 613 Data: p.tok.Data, 614 }) 615 return true 616 case DoctypeToken: 617 // Ignore the token. 618 return true 619 } 620 621 p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) 622 return false 623 } 624 625 // Section 12.2.5.4.6. 626 func afterHeadIM(p *parser) bool { 627 switch p.tok.Type { 628 case TextToken: 629 s := strings.TrimLeft(p.tok.Data, whitespace) 630 if len(s) < len(p.tok.Data) { 631 // Add the initial whitespace to the current node. 632 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 633 if s == "" { 634 return true 635 } 636 p.tok.Data = s 637 } 638 case StartTagToken: 639 switch p.tok.DataAtom { 640 case a.Html: 641 return inBodyIM(p) 642 case a.Body: 643 p.addElement() 644 p.framesetOK = false 645 p.im = inBodyIM 646 return true 647 case a.Frameset: 648 p.addElement() 649 p.im = inFramesetIM 650 return true 651 case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title: 652 p.oe = append(p.oe, p.head) 653 defer p.oe.remove(p.head) 654 return inHeadIM(p) 655 case a.Head: 656 // Ignore the token. 657 return true 658 } 659 case EndTagToken: 660 switch p.tok.DataAtom { 661 case a.Body, a.Html, a.Br: 662 // Drop down to creating an implied <body> tag. 663 default: 664 // Ignore the token. 665 return true 666 } 667 case CommentToken: 668 p.addChild(&Node{ 669 Type: CommentNode, 670 Data: p.tok.Data, 671 }) 672 return true 673 case DoctypeToken: 674 // Ignore the token. 675 return true 676 } 677 678 p.parseImpliedToken(StartTagToken, a.Body, a.Body.String()) 679 p.framesetOK = true 680 return false 681 } 682 683 // copyAttributes copies attributes of src not found on dst to dst. 684 func copyAttributes(dst *Node, src Token) { 685 if len(src.Attr) == 0 { 686 return 687 } 688 attr := map[string]string{} 689 for _, t := range dst.Attr { 690 attr[t.Key] = t.Val 691 } 692 for _, t := range src.Attr { 693 if _, ok := attr[t.Key]; !ok { 694 dst.Attr = append(dst.Attr, t) 695 attr[t.Key] = t.Val 696 } 697 } 698 } 699 700 // Section 12.2.5.4.7. 701 func inBodyIM(p *parser) bool { 702 switch p.tok.Type { 703 case TextToken: 704 d := p.tok.Data 705 switch n := p.oe.top(); n.DataAtom { 706 case a.Pre, a.Listing: 707 if n.FirstChild == nil { 708 // Ignore a newline at the start of a <pre> block. 709 if d != "" && d[0] == '\r' { 710 d = d[1:] 711 } 712 if d != "" && d[0] == '\n' { 713 d = d[1:] 714 } 715 } 716 } 717 d = strings.Replace(d, "\x00", "", -1) 718 if d == "" { 719 return true 720 } 721 p.reconstructActiveFormattingElements() 722 p.addText(d) 723 if p.framesetOK && strings.TrimLeft(d, whitespace) != "" { 724 // There were non-whitespace characters inserted. 725 p.framesetOK = false 726 } 727 case StartTagToken: 728 switch p.tok.DataAtom { 729 case a.Html: 730 copyAttributes(p.oe[0], p.tok) 731 case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title: 732 return inHeadIM(p) 733 case a.Body: 734 if len(p.oe) >= 2 { 735 body := p.oe[1] 736 if body.Type == ElementNode && body.DataAtom == a.Body { 737 p.framesetOK = false 738 copyAttributes(body, p.tok) 739 } 740 } 741 case a.Frameset: 742 if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body { 743 // Ignore the token. 744 return true 745 } 746 body := p.oe[1] 747 if body.Parent != nil { 748 body.Parent.RemoveChild(body) 749 } 750 p.oe = p.oe[:1] 751 p.addElement() 752 p.im = inFramesetIM 753 return true 754 case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul: 755 p.popUntil(buttonScope, a.P) 756 p.addElement() 757 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: 758 p.popUntil(buttonScope, a.P) 759 switch n := p.top(); n.DataAtom { 760 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: 761 p.oe.pop() 762 } 763 p.addElement() 764 case a.Pre, a.Listing: 765 p.popUntil(buttonScope, a.P) 766 p.addElement() 767 // The newline, if any, will be dealt with by the TextToken case. 768 p.framesetOK = false 769 case a.Form: 770 if p.form == nil { 771 p.popUntil(buttonScope, a.P) 772 p.addElement() 773 p.form = p.top() 774 } 775 case a.Li: 776 p.framesetOK = false 777 for i := len(p.oe) - 1; i >= 0; i-- { 778 node := p.oe[i] 779 switch node.DataAtom { 780 case a.Li: 781 p.oe = p.oe[:i] 782 case a.Address, a.Div, a.P: 783 continue 784 default: 785 if !isSpecialElement(node) { 786 continue 787 } 788 } 789 break 790 } 791 p.popUntil(buttonScope, a.P) 792 p.addElement() 793 case a.Dd, a.Dt: 794 p.framesetOK = false 795 for i := len(p.oe) - 1; i >= 0; i-- { 796 node := p.oe[i] 797 switch node.DataAtom { 798 case a.Dd, a.Dt: 799 p.oe = p.oe[:i] 800 case a.Address, a.Div, a.P: 801 continue 802 default: 803 if !isSpecialElement(node) { 804 continue 805 } 806 } 807 break 808 } 809 p.popUntil(buttonScope, a.P) 810 p.addElement() 811 case a.Plaintext: 812 p.popUntil(buttonScope, a.P) 813 p.addElement() 814 case a.Button: 815 p.popUntil(defaultScope, a.Button) 816 p.reconstructActiveFormattingElements() 817 p.addElement() 818 p.framesetOK = false 819 case a.A: 820 for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- { 821 if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A { 822 p.inBodyEndTagFormatting(a.A) 823 p.oe.remove(n) 824 p.afe.remove(n) 825 break 826 } 827 } 828 p.reconstructActiveFormattingElements() 829 p.addFormattingElement() 830 case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: 831 p.reconstructActiveFormattingElements() 832 p.addFormattingElement() 833 case a.Nobr: 834 p.reconstructActiveFormattingElements() 835 if p.elementInScope(defaultScope, a.Nobr) { 836 p.inBodyEndTagFormatting(a.Nobr) 837 p.reconstructActiveFormattingElements() 838 } 839 p.addFormattingElement() 840 case a.Applet, a.Marquee, a.Object: 841 p.reconstructActiveFormattingElements() 842 p.addElement() 843 p.afe = append(p.afe, &scopeMarker) 844 p.framesetOK = false 845 case a.Table: 846 if !p.quirks { 847 p.popUntil(buttonScope, a.P) 848 } 849 p.addElement() 850 p.framesetOK = false 851 p.im = inTableIM 852 return true 853 case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr: 854 p.reconstructActiveFormattingElements() 855 p.addElement() 856 p.oe.pop() 857 p.acknowledgeSelfClosingTag() 858 if p.tok.DataAtom == a.Input { 859 for _, t := range p.tok.Attr { 860 if t.Key == "type" { 861 if strings.ToLower(t.Val) == "hidden" { 862 // Skip setting framesetOK = false 863 return true 864 } 865 } 866 } 867 } 868 p.framesetOK = false 869 case a.Param, a.Source, a.Track: 870 p.addElement() 871 p.oe.pop() 872 p.acknowledgeSelfClosingTag() 873 case a.Hr: 874 p.popUntil(buttonScope, a.P) 875 p.addElement() 876 p.oe.pop() 877 p.acknowledgeSelfClosingTag() 878 p.framesetOK = false 879 case a.Image: 880 p.tok.DataAtom = a.Img 881 p.tok.Data = a.Img.String() 882 return false 883 case a.Isindex: 884 if p.form != nil { 885 // Ignore the token. 886 return true 887 } 888 action := "" 889 prompt := "This is a searchable index. Enter search keywords: " 890 attr := []Attribute{{Key: "name", Val: "isindex"}} 891 for _, t := range p.tok.Attr { 892 switch t.Key { 893 case "action": 894 action = t.Val 895 case "name": 896 // Ignore the attribute. 897 case "prompt": 898 prompt = t.Val 899 default: 900 attr = append(attr, t) 901 } 902 } 903 p.acknowledgeSelfClosingTag() 904 p.popUntil(buttonScope, a.P) 905 p.parseImpliedToken(StartTagToken, a.Form, a.Form.String()) 906 if action != "" { 907 p.form.Attr = []Attribute{{Key: "action", Val: action}} 908 } 909 p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String()) 910 p.parseImpliedToken(StartTagToken, a.Label, a.Label.String()) 911 p.addText(prompt) 912 p.addChild(&Node{ 913 Type: ElementNode, 914 DataAtom: a.Input, 915 Data: a.Input.String(), 916 Attr: attr, 917 }) 918 p.oe.pop() 919 p.parseImpliedToken(EndTagToken, a.Label, a.Label.String()) 920 p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String()) 921 p.parseImpliedToken(EndTagToken, a.Form, a.Form.String()) 922 case a.Textarea: 923 p.addElement() 924 p.setOriginalIM() 925 p.framesetOK = false 926 p.im = textIM 927 case a.Xmp: 928 p.popUntil(buttonScope, a.P) 929 p.reconstructActiveFormattingElements() 930 p.framesetOK = false 931 p.addElement() 932 p.setOriginalIM() 933 p.im = textIM 934 case a.Iframe: 935 p.framesetOK = false 936 p.addElement() 937 p.setOriginalIM() 938 p.im = textIM 939 case a.Noembed, a.Noscript: 940 p.addElement() 941 p.setOriginalIM() 942 p.im = textIM 943 case a.Select: 944 p.reconstructActiveFormattingElements() 945 p.addElement() 946 p.framesetOK = false 947 p.im = inSelectIM 948 return true 949 case a.Optgroup, a.Option: 950 if p.top().DataAtom == a.Option { 951 p.oe.pop() 952 } 953 p.reconstructActiveFormattingElements() 954 p.addElement() 955 case a.Rp, a.Rt: 956 if p.elementInScope(defaultScope, a.Ruby) { 957 p.generateImpliedEndTags() 958 } 959 p.addElement() 960 case a.Math, a.Svg: 961 p.reconstructActiveFormattingElements() 962 if p.tok.DataAtom == a.Math { 963 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) 964 } else { 965 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) 966 } 967 adjustForeignAttributes(p.tok.Attr) 968 p.addElement() 969 p.top().Namespace = p.tok.Data 970 if p.hasSelfClosingToken { 971 p.oe.pop() 972 p.acknowledgeSelfClosingTag() 973 } 974 return true 975 case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 976 // Ignore the token. 977 default: 978 p.reconstructActiveFormattingElements() 979 p.addElement() 980 } 981 case EndTagToken: 982 switch p.tok.DataAtom { 983 case a.Body: 984 if p.elementInScope(defaultScope, a.Body) { 985 p.im = afterBodyIM 986 } 987 case a.Html: 988 if p.elementInScope(defaultScope, a.Body) { 989 p.parseImpliedToken(EndTagToken, a.Body, a.Body.String()) 990 return false 991 } 992 return true 993 case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul: 994 p.popUntil(defaultScope, p.tok.DataAtom) 995 case a.Form: 996 node := p.form 997 p.form = nil 998 i := p.indexOfElementInScope(defaultScope, a.Form) 999 if node == nil || i == -1 || p.oe[i] != node { 1000 // Ignore the token. 1001 return true 1002 } 1003 p.generateImpliedEndTags() 1004 p.oe.remove(node) 1005 case a.P: 1006 if !p.elementInScope(buttonScope, a.P) { 1007 p.parseImpliedToken(StartTagToken, a.P, a.P.String()) 1008 } 1009 p.popUntil(buttonScope, a.P) 1010 case a.Li: 1011 p.popUntil(listItemScope, a.Li) 1012 case a.Dd, a.Dt: 1013 p.popUntil(defaultScope, p.tok.DataAtom) 1014 case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6: 1015 p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6) 1016 case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U: 1017 p.inBodyEndTagFormatting(p.tok.DataAtom) 1018 case a.Applet, a.Marquee, a.Object: 1019 if p.popUntil(defaultScope, p.tok.DataAtom) { 1020 p.clearActiveFormattingElements() 1021 } 1022 case a.Br: 1023 p.tok.Type = StartTagToken 1024 return false 1025 default: 1026 p.inBodyEndTagOther(p.tok.DataAtom) 1027 } 1028 case CommentToken: 1029 p.addChild(&Node{ 1030 Type: CommentNode, 1031 Data: p.tok.Data, 1032 }) 1033 } 1034 1035 return true 1036 } 1037 1038 func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) { 1039 // This is the "adoption agency" algorithm, described at 1040 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency 1041 1042 // TODO: this is a fairly literal line-by-line translation of that algorithm. 1043 // Once the code successfully parses the comprehensive test suite, we should 1044 // refactor this code to be more idiomatic. 1045 1046 // Steps 1-3. The outer loop. 1047 for i := 0; i < 8; i++ { 1048 // Step 4. Find the formatting element. 1049 var formattingElement *Node 1050 for j := len(p.afe) - 1; j >= 0; j-- { 1051 if p.afe[j].Type == scopeMarkerNode { 1052 break 1053 } 1054 if p.afe[j].DataAtom == tagAtom { 1055 formattingElement = p.afe[j] 1056 break 1057 } 1058 } 1059 if formattingElement == nil { 1060 p.inBodyEndTagOther(tagAtom) 1061 return 1062 } 1063 feIndex := p.oe.index(formattingElement) 1064 if feIndex == -1 { 1065 p.afe.remove(formattingElement) 1066 return 1067 } 1068 if !p.elementInScope(defaultScope, tagAtom) { 1069 // Ignore the tag. 1070 return 1071 } 1072 1073 // Steps 5-6. Find the furthest block. 1074 var furthestBlock *Node 1075 for _, e := range p.oe[feIndex:] { 1076 if isSpecialElement(e) { 1077 furthestBlock = e 1078 break 1079 } 1080 } 1081 if furthestBlock == nil { 1082 e := p.oe.pop() 1083 for e != formattingElement { 1084 e = p.oe.pop() 1085 } 1086 p.afe.remove(e) 1087 return 1088 } 1089 1090 // Steps 7-8. Find the common ancestor and bookmark node. 1091 commonAncestor := p.oe[feIndex-1] 1092 bookmark := p.afe.index(formattingElement) 1093 1094 // Step 9. The inner loop. Find the lastNode to reparent. 1095 lastNode := furthestBlock 1096 node := furthestBlock 1097 x := p.oe.index(node) 1098 // Steps 9.1-9.3. 1099 for j := 0; j < 3; j++ { 1100 // Step 9.4. 1101 x-- 1102 node = p.oe[x] 1103 // Step 9.5. 1104 if p.afe.index(node) == -1 { 1105 p.oe.remove(node) 1106 continue 1107 } 1108 // Step 9.6. 1109 if node == formattingElement { 1110 break 1111 } 1112 // Step 9.7. 1113 clone := node.clone() 1114 p.afe[p.afe.index(node)] = clone 1115 p.oe[p.oe.index(node)] = clone 1116 node = clone 1117 // Step 9.8. 1118 if lastNode == furthestBlock { 1119 bookmark = p.afe.index(node) + 1 1120 } 1121 // Step 9.9. 1122 if lastNode.Parent != nil { 1123 lastNode.Parent.RemoveChild(lastNode) 1124 } 1125 node.AppendChild(lastNode) 1126 // Step 9.10. 1127 lastNode = node 1128 } 1129 1130 // Step 10. Reparent lastNode to the common ancestor, 1131 // or for misnested table nodes, to the foster parent. 1132 if lastNode.Parent != nil { 1133 lastNode.Parent.RemoveChild(lastNode) 1134 } 1135 switch commonAncestor.DataAtom { 1136 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1137 p.fosterParent(lastNode) 1138 default: 1139 commonAncestor.AppendChild(lastNode) 1140 } 1141 1142 // Steps 11-13. Reparent nodes from the furthest block's children 1143 // to a clone of the formatting element. 1144 clone := formattingElement.clone() 1145 reparentChildren(clone, furthestBlock) 1146 furthestBlock.AppendChild(clone) 1147 1148 // Step 14. Fix up the list of active formatting elements. 1149 if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark { 1150 // Move the bookmark with the rest of the list. 1151 bookmark-- 1152 } 1153 p.afe.remove(formattingElement) 1154 p.afe.insert(bookmark, clone) 1155 1156 // Step 15. Fix up the stack of open elements. 1157 p.oe.remove(formattingElement) 1158 p.oe.insert(p.oe.index(furthestBlock)+1, clone) 1159 } 1160 } 1161 1162 // inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM. 1163 func (p *parser) inBodyEndTagOther(tagAtom a.Atom) { 1164 for i := len(p.oe) - 1; i >= 0; i-- { 1165 if p.oe[i].DataAtom == tagAtom { 1166 p.oe = p.oe[:i] 1167 break 1168 } 1169 if isSpecialElement(p.oe[i]) { 1170 break 1171 } 1172 } 1173 } 1174 1175 // Section 12.2.5.4.8. 1176 func textIM(p *parser) bool { 1177 switch p.tok.Type { 1178 case ErrorToken: 1179 p.oe.pop() 1180 case TextToken: 1181 d := p.tok.Data 1182 if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil { 1183 // Ignore a newline at the start of a <textarea> block. 1184 if d != "" && d[0] == '\r' { 1185 d = d[1:] 1186 } 1187 if d != "" && d[0] == '\n' { 1188 d = d[1:] 1189 } 1190 } 1191 if d == "" { 1192 return true 1193 } 1194 p.addText(d) 1195 return true 1196 case EndTagToken: 1197 p.oe.pop() 1198 } 1199 p.im = p.originalIM 1200 p.originalIM = nil 1201 return p.tok.Type == EndTagToken 1202 } 1203 1204 // Section 12.2.5.4.9. 1205 func inTableIM(p *parser) bool { 1206 switch p.tok.Type { 1207 case ErrorToken: 1208 // Stop parsing. 1209 return true 1210 case TextToken: 1211 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1) 1212 switch p.oe.top().DataAtom { 1213 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1214 if strings.Trim(p.tok.Data, whitespace) == "" { 1215 p.addText(p.tok.Data) 1216 return true 1217 } 1218 } 1219 case StartTagToken: 1220 switch p.tok.DataAtom { 1221 case a.Caption: 1222 p.clearStackToContext(tableScope) 1223 p.afe = append(p.afe, &scopeMarker) 1224 p.addElement() 1225 p.im = inCaptionIM 1226 return true 1227 case a.Colgroup: 1228 p.clearStackToContext(tableScope) 1229 p.addElement() 1230 p.im = inColumnGroupIM 1231 return true 1232 case a.Col: 1233 p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String()) 1234 return false 1235 case a.Tbody, a.Tfoot, a.Thead: 1236 p.clearStackToContext(tableScope) 1237 p.addElement() 1238 p.im = inTableBodyIM 1239 return true 1240 case a.Td, a.Th, a.Tr: 1241 p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String()) 1242 return false 1243 case a.Table: 1244 if p.popUntil(tableScope, a.Table) { 1245 p.resetInsertionMode() 1246 return false 1247 } 1248 // Ignore the token. 1249 return true 1250 case a.Style, a.Script: 1251 return inHeadIM(p) 1252 case a.Input: 1253 for _, t := range p.tok.Attr { 1254 if t.Key == "type" && strings.ToLower(t.Val) == "hidden" { 1255 p.addElement() 1256 p.oe.pop() 1257 return true 1258 } 1259 } 1260 // Otherwise drop down to the default action. 1261 case a.Form: 1262 if p.form != nil { 1263 // Ignore the token. 1264 return true 1265 } 1266 p.addElement() 1267 p.form = p.oe.pop() 1268 case a.Select: 1269 p.reconstructActiveFormattingElements() 1270 switch p.top().DataAtom { 1271 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1272 p.fosterParenting = true 1273 } 1274 p.addElement() 1275 p.fosterParenting = false 1276 p.framesetOK = false 1277 p.im = inSelectInTableIM 1278 return true 1279 } 1280 case EndTagToken: 1281 switch p.tok.DataAtom { 1282 case a.Table: 1283 if p.popUntil(tableScope, a.Table) { 1284 p.resetInsertionMode() 1285 return true 1286 } 1287 // Ignore the token. 1288 return true 1289 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1290 // Ignore the token. 1291 return true 1292 } 1293 case CommentToken: 1294 p.addChild(&Node{ 1295 Type: CommentNode, 1296 Data: p.tok.Data, 1297 }) 1298 return true 1299 case DoctypeToken: 1300 // Ignore the token. 1301 return true 1302 } 1303 1304 p.fosterParenting = true 1305 defer func() { p.fosterParenting = false }() 1306 1307 return inBodyIM(p) 1308 } 1309 1310 // Section 12.2.5.4.11. 1311 func inCaptionIM(p *parser) bool { 1312 switch p.tok.Type { 1313 case StartTagToken: 1314 switch p.tok.DataAtom { 1315 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr: 1316 if p.popUntil(tableScope, a.Caption) { 1317 p.clearActiveFormattingElements() 1318 p.im = inTableIM 1319 return false 1320 } else { 1321 // Ignore the token. 1322 return true 1323 } 1324 case a.Select: 1325 p.reconstructActiveFormattingElements() 1326 p.addElement() 1327 p.framesetOK = false 1328 p.im = inSelectInTableIM 1329 return true 1330 } 1331 case EndTagToken: 1332 switch p.tok.DataAtom { 1333 case a.Caption: 1334 if p.popUntil(tableScope, a.Caption) { 1335 p.clearActiveFormattingElements() 1336 p.im = inTableIM 1337 } 1338 return true 1339 case a.Table: 1340 if p.popUntil(tableScope, a.Caption) { 1341 p.clearActiveFormattingElements() 1342 p.im = inTableIM 1343 return false 1344 } else { 1345 // Ignore the token. 1346 return true 1347 } 1348 case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1349 // Ignore the token. 1350 return true 1351 } 1352 } 1353 return inBodyIM(p) 1354 } 1355 1356 // Section 12.2.5.4.12. 1357 func inColumnGroupIM(p *parser) bool { 1358 switch p.tok.Type { 1359 case TextToken: 1360 s := strings.TrimLeft(p.tok.Data, whitespace) 1361 if len(s) < len(p.tok.Data) { 1362 // Add the initial whitespace to the current node. 1363 p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) 1364 if s == "" { 1365 return true 1366 } 1367 p.tok.Data = s 1368 } 1369 case CommentToken: 1370 p.addChild(&Node{ 1371 Type: CommentNode, 1372 Data: p.tok.Data, 1373 }) 1374 return true 1375 case DoctypeToken: 1376 // Ignore the token. 1377 return true 1378 case StartTagToken: 1379 switch p.tok.DataAtom { 1380 case a.Html: 1381 return inBodyIM(p) 1382 case a.Col: 1383 p.addElement() 1384 p.oe.pop() 1385 p.acknowledgeSelfClosingTag() 1386 return true 1387 } 1388 case EndTagToken: 1389 switch p.tok.DataAtom { 1390 case a.Colgroup: 1391 if p.oe.top().DataAtom != a.Html { 1392 p.oe.pop() 1393 p.im = inTableIM 1394 } 1395 return true 1396 case a.Col: 1397 // Ignore the token. 1398 return true 1399 } 1400 } 1401 if p.oe.top().DataAtom != a.Html { 1402 p.oe.pop() 1403 p.im = inTableIM 1404 return false 1405 } 1406 return true 1407 } 1408 1409 // Section 12.2.5.4.13. 1410 func inTableBodyIM(p *parser) bool { 1411 switch p.tok.Type { 1412 case StartTagToken: 1413 switch p.tok.DataAtom { 1414 case a.Tr: 1415 p.clearStackToContext(tableBodyScope) 1416 p.addElement() 1417 p.im = inRowIM 1418 return true 1419 case a.Td, a.Th: 1420 p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String()) 1421 return false 1422 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: 1423 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1424 p.im = inTableIM 1425 return false 1426 } 1427 // Ignore the token. 1428 return true 1429 } 1430 case EndTagToken: 1431 switch p.tok.DataAtom { 1432 case a.Tbody, a.Tfoot, a.Thead: 1433 if p.elementInScope(tableScope, p.tok.DataAtom) { 1434 p.clearStackToContext(tableBodyScope) 1435 p.oe.pop() 1436 p.im = inTableIM 1437 } 1438 return true 1439 case a.Table: 1440 if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { 1441 p.im = inTableIM 1442 return false 1443 } 1444 // Ignore the token. 1445 return true 1446 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr: 1447 // Ignore the token. 1448 return true 1449 } 1450 case CommentToken: 1451 p.addChild(&Node{ 1452 Type: CommentNode, 1453 Data: p.tok.Data, 1454 }) 1455 return true 1456 } 1457 1458 return inTableIM(p) 1459 } 1460 1461 // Section 12.2.5.4.14. 1462 func inRowIM(p *parser) bool { 1463 switch p.tok.Type { 1464 case StartTagToken: 1465 switch p.tok.DataAtom { 1466 case a.Td, a.Th: 1467 p.clearStackToContext(tableRowScope) 1468 p.addElement() 1469 p.afe = append(p.afe, &scopeMarker) 1470 p.im = inCellIM 1471 return true 1472 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1473 if p.popUntil(tableScope, a.Tr) { 1474 p.im = inTableBodyIM 1475 return false 1476 } 1477 // Ignore the token. 1478 return true 1479 } 1480 case EndTagToken: 1481 switch p.tok.DataAtom { 1482 case a.Tr: 1483 if p.popUntil(tableScope, a.Tr) { 1484 p.im = inTableBodyIM 1485 return true 1486 } 1487 // Ignore the token. 1488 return true 1489 case a.Table: 1490 if p.popUntil(tableScope, a.Tr) { 1491 p.im = inTableBodyIM 1492 return false 1493 } 1494 // Ignore the token. 1495 return true 1496 case a.Tbody, a.Tfoot, a.Thead: 1497 if p.elementInScope(tableScope, p.tok.DataAtom) { 1498 p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String()) 1499 return false 1500 } 1501 // Ignore the token. 1502 return true 1503 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th: 1504 // Ignore the token. 1505 return true 1506 } 1507 } 1508 1509 return inTableIM(p) 1510 } 1511 1512 // Section 12.2.5.4.15. 1513 func inCellIM(p *parser) bool { 1514 switch p.tok.Type { 1515 case StartTagToken: 1516 switch p.tok.DataAtom { 1517 case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: 1518 if p.popUntil(tableScope, a.Td, a.Th) { 1519 // Close the cell and reprocess. 1520 p.clearActiveFormattingElements() 1521 p.im = inRowIM 1522 return false 1523 } 1524 // Ignore the token. 1525 return true 1526 case a.Select: 1527 p.reconstructActiveFormattingElements() 1528 p.addElement() 1529 p.framesetOK = false 1530 p.im = inSelectInTableIM 1531 return true 1532 } 1533 case EndTagToken: 1534 switch p.tok.DataAtom { 1535 case a.Td, a.Th: 1536 if !p.popUntil(tableScope, p.tok.DataAtom) { 1537 // Ignore the token. 1538 return true 1539 } 1540 p.clearActiveFormattingElements() 1541 p.im = inRowIM 1542 return true 1543 case a.Body, a.Caption, a.Col, a.Colgroup, a.Html: 1544 // Ignore the token. 1545 return true 1546 case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: 1547 if !p.elementInScope(tableScope, p.tok.DataAtom) { 1548 // Ignore the token. 1549 return true 1550 } 1551 // Close the cell and reprocess. 1552 p.popUntil(tableScope, a.Td, a.Th) 1553 p.clearActiveFormattingElements() 1554 p.im = inRowIM 1555 return false 1556 } 1557 } 1558 return inBodyIM(p) 1559 } 1560 1561 // Section 12.2.5.4.16. 1562 func inSelectIM(p *parser) bool { 1563 switch p.tok.Type { 1564 case ErrorToken: 1565 // Stop parsing. 1566 return true 1567 case TextToken: 1568 p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) 1569 case StartTagToken: 1570 switch p.tok.DataAtom { 1571 case a.Html: 1572 return inBodyIM(p) 1573 case a.Option: 1574 if p.top().DataAtom == a.Option { 1575 p.oe.pop() 1576 } 1577 p.addElement() 1578 case a.Optgroup: 1579 if p.top().DataAtom == a.Option { 1580 p.oe.pop() 1581 } 1582 if p.top().DataAtom == a.Optgroup { 1583 p.oe.pop() 1584 } 1585 p.addElement() 1586 case a.Select: 1587 p.tok.Type = EndTagToken 1588 return false 1589 case a.Input, a.Keygen, a.Textarea: 1590 if p.elementInScope(selectScope, a.Select) { 1591 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) 1592 return false 1593 } 1594 // In order to properly ignore <textarea>, we need to change the tokenizer mode. 1595 p.tokenizer.NextIsNotRawText() 1596 // Ignore the token. 1597 return true 1598 case a.Script: 1599 return inHeadIM(p) 1600 } 1601 case EndTagToken: 1602 switch p.tok.DataAtom { 1603 case a.Option: 1604 if p.top().DataAtom == a.Option { 1605 p.oe.pop() 1606 } 1607 case a.Optgroup: 1608 i := len(p.oe) - 1 1609 if p.oe[i].DataAtom == a.Option { 1610 i-- 1611 } 1612 if p.oe[i].DataAtom == a.Optgroup { 1613 p.oe = p.oe[:i] 1614 } 1615 case a.Select: 1616 if p.popUntil(selectScope, a.Select) { 1617 p.resetInsertionMode() 1618 } 1619 } 1620 case CommentToken: 1621 p.doc.AppendChild(&Node{ 1622 Type: CommentNode, 1623 Data: p.tok.Data, 1624 }) 1625 case DoctypeToken: 1626 // Ignore the token. 1627 return true 1628 } 1629 1630 return true 1631 } 1632 1633 // Section 12.2.5.4.17. 1634 func inSelectInTableIM(p *parser) bool { 1635 switch p.tok.Type { 1636 case StartTagToken, EndTagToken: 1637 switch p.tok.DataAtom { 1638 case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th: 1639 if p.tok.Type == StartTagToken || p.elementInScope(tableScope, p.tok.DataAtom) { 1640 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) 1641 return false 1642 } else { 1643 // Ignore the token. 1644 return true 1645 } 1646 } 1647 } 1648 return inSelectIM(p) 1649 } 1650 1651 // Section 12.2.5.4.18. 1652 func afterBodyIM(p *parser) bool { 1653 switch p.tok.Type { 1654 case ErrorToken: 1655 // Stop parsing. 1656 return true 1657 case TextToken: 1658 s := strings.TrimLeft(p.tok.Data, whitespace) 1659 if len(s) == 0 { 1660 // It was all whitespace. 1661 return inBodyIM(p) 1662 } 1663 case StartTagToken: 1664 if p.tok.DataAtom == a.Html { 1665 return inBodyIM(p) 1666 } 1667 case EndTagToken: 1668 if p.tok.DataAtom == a.Html { 1669 if !p.fragment { 1670 p.im = afterAfterBodyIM 1671 } 1672 return true 1673 } 1674 case CommentToken: 1675 // The comment is attached to the <html> element. 1676 if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html { 1677 panic("html: bad parser state: <html> element not found, in the after-body insertion mode") 1678 } 1679 p.oe[0].AppendChild(&Node{ 1680 Type: CommentNode, 1681 Data: p.tok.Data, 1682 }) 1683 return true 1684 } 1685 p.im = inBodyIM 1686 return false 1687 } 1688 1689 // Section 12.2.5.4.19. 1690 func inFramesetIM(p *parser) bool { 1691 switch p.tok.Type { 1692 case CommentToken: 1693 p.addChild(&Node{ 1694 Type: CommentNode, 1695 Data: p.tok.Data, 1696 }) 1697 case TextToken: 1698 // Ignore all text but whitespace. 1699 s := strings.Map(func(c rune) rune { 1700 switch c { 1701 case ' ', '\t', '\n', '\f', '\r': 1702 return c 1703 } 1704 return -1 1705 }, p.tok.Data) 1706 if s != "" { 1707 p.addText(s) 1708 } 1709 case StartTagToken: 1710 switch p.tok.DataAtom { 1711 case a.Html: 1712 return inBodyIM(p) 1713 case a.Frameset: 1714 p.addElement() 1715 case a.Frame: 1716 p.addElement() 1717 p.oe.pop() 1718 p.acknowledgeSelfClosingTag() 1719 case a.Noframes: 1720 return inHeadIM(p) 1721 } 1722 case EndTagToken: 1723 switch p.tok.DataAtom { 1724 case a.Frameset: 1725 if p.oe.top().DataAtom != a.Html { 1726 p.oe.pop() 1727 if p.oe.top().DataAtom != a.Frameset { 1728 p.im = afterFramesetIM 1729 return true 1730 } 1731 } 1732 } 1733 default: 1734 // Ignore the token. 1735 } 1736 return true 1737 } 1738 1739 // Section 12.2.5.4.20. 1740 func afterFramesetIM(p *parser) bool { 1741 switch p.tok.Type { 1742 case CommentToken: 1743 p.addChild(&Node{ 1744 Type: CommentNode, 1745 Data: p.tok.Data, 1746 }) 1747 case TextToken: 1748 // Ignore all text but whitespace. 1749 s := strings.Map(func(c rune) rune { 1750 switch c { 1751 case ' ', '\t', '\n', '\f', '\r': 1752 return c 1753 } 1754 return -1 1755 }, p.tok.Data) 1756 if s != "" { 1757 p.addText(s) 1758 } 1759 case StartTagToken: 1760 switch p.tok.DataAtom { 1761 case a.Html: 1762 return inBodyIM(p) 1763 case a.Noframes: 1764 return inHeadIM(p) 1765 } 1766 case EndTagToken: 1767 switch p.tok.DataAtom { 1768 case a.Html: 1769 p.im = afterAfterFramesetIM 1770 return true 1771 } 1772 default: 1773 // Ignore the token. 1774 } 1775 return true 1776 } 1777 1778 // Section 12.2.5.4.21. 1779 func afterAfterBodyIM(p *parser) bool { 1780 switch p.tok.Type { 1781 case ErrorToken: 1782 // Stop parsing. 1783 return true 1784 case TextToken: 1785 s := strings.TrimLeft(p.tok.Data, whitespace) 1786 if len(s) == 0 { 1787 // It was all whitespace. 1788 return inBodyIM(p) 1789 } 1790 case StartTagToken: 1791 if p.tok.DataAtom == a.Html { 1792 return inBodyIM(p) 1793 } 1794 case CommentToken: 1795 p.doc.AppendChild(&Node{ 1796 Type: CommentNode, 1797 Data: p.tok.Data, 1798 }) 1799 return true 1800 case DoctypeToken: 1801 return inBodyIM(p) 1802 } 1803 p.im = inBodyIM 1804 return false 1805 } 1806 1807 // Section 12.2.5.4.22. 1808 func afterAfterFramesetIM(p *parser) bool { 1809 switch p.tok.Type { 1810 case CommentToken: 1811 p.doc.AppendChild(&Node{ 1812 Type: CommentNode, 1813 Data: p.tok.Data, 1814 }) 1815 case TextToken: 1816 // Ignore all text but whitespace. 1817 s := strings.Map(func(c rune) rune { 1818 switch c { 1819 case ' ', '\t', '\n', '\f', '\r': 1820 return c 1821 } 1822 return -1 1823 }, p.tok.Data) 1824 if s != "" { 1825 p.tok.Data = s 1826 return inBodyIM(p) 1827 } 1828 case StartTagToken: 1829 switch p.tok.DataAtom { 1830 case a.Html: 1831 return inBodyIM(p) 1832 case a.Noframes: 1833 return inHeadIM(p) 1834 } 1835 case DoctypeToken: 1836 return inBodyIM(p) 1837 default: 1838 // Ignore the token. 1839 } 1840 return true 1841 } 1842 1843 const whitespaceOrNUL = whitespace + "\x00" 1844 1845 // Section 12.2.5.5. 1846 func parseForeignContent(p *parser) bool { 1847 switch p.tok.Type { 1848 case TextToken: 1849 if p.framesetOK { 1850 p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" 1851 } 1852 p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) 1853 p.addText(p.tok.Data) 1854 case CommentToken: 1855 p.addChild(&Node{ 1856 Type: CommentNode, 1857 Data: p.tok.Data, 1858 }) 1859 case StartTagToken: 1860 b := breakout[p.tok.Data] 1861 if p.tok.DataAtom == a.Font { 1862 loop: 1863 for _, attr := range p.tok.Attr { 1864 switch attr.Key { 1865 case "color", "face", "size": 1866 b = true 1867 break loop 1868 } 1869 } 1870 } 1871 if b { 1872 for i := len(p.oe) - 1; i >= 0; i-- { 1873 n := p.oe[i] 1874 if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { 1875 p.oe = p.oe[:i+1] 1876 break 1877 } 1878 } 1879 return false 1880 } 1881 switch p.top().Namespace { 1882 case "math": 1883 adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) 1884 case "svg": 1885 // Adjust SVG tag names. The tokenizer lower-cases tag names, but 1886 // SVG wants e.g. "foreignObject" with a capital second "O". 1887 if x := svgTagNameAdjustments[p.tok.Data]; x != "" { 1888 p.tok.DataAtom = a.Lookup([]byte(x)) 1889 p.tok.Data = x 1890 } 1891 adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) 1892 default: 1893 panic("html: bad parser state: unexpected namespace") 1894 } 1895 adjustForeignAttributes(p.tok.Attr) 1896 namespace := p.top().Namespace 1897 p.addElement() 1898 p.top().Namespace = namespace 1899 if namespace != "" { 1900 // Don't let the tokenizer go into raw text mode in foreign content 1901 // (e.g. in an SVG <title> tag). 1902 p.tokenizer.NextIsNotRawText() 1903 } 1904 if p.hasSelfClosingToken { 1905 p.oe.pop() 1906 p.acknowledgeSelfClosingTag() 1907 } 1908 case EndTagToken: 1909 for i := len(p.oe) - 1; i >= 0; i-- { 1910 if p.oe[i].Namespace == "" { 1911 return p.im(p) 1912 } 1913 if strings.EqualFold(p.oe[i].Data, p.tok.Data) { 1914 p.oe = p.oe[:i] 1915 break 1916 } 1917 } 1918 return true 1919 default: 1920 // Ignore the token. 1921 } 1922 return true 1923 } 1924 1925 // Section 12.2.5. 1926 func (p *parser) inForeignContent() bool { 1927 if len(p.oe) == 0 { 1928 return false 1929 } 1930 n := p.oe[len(p.oe)-1] 1931 if n.Namespace == "" { 1932 return false 1933 } 1934 if mathMLTextIntegrationPoint(n) { 1935 if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark { 1936 return false 1937 } 1938 if p.tok.Type == TextToken { 1939 return false 1940 } 1941 } 1942 if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg { 1943 return false 1944 } 1945 if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) { 1946 return false 1947 } 1948 if p.tok.Type == ErrorToken { 1949 return false 1950 } 1951 return true 1952 } 1953 1954 // parseImpliedToken parses a token as though it had appeared in the parser's 1955 // input. 1956 func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) { 1957 realToken, selfClosing := p.tok, p.hasSelfClosingToken 1958 p.tok = Token{ 1959 Type: t, 1960 DataAtom: dataAtom, 1961 Data: data, 1962 } 1963 p.hasSelfClosingToken = false 1964 p.parseCurrentToken() 1965 p.tok, p.hasSelfClosingToken = realToken, selfClosing 1966 } 1967 1968 // parseCurrentToken runs the current token through the parsing routines 1969 // until it is consumed. 1970 func (p *parser) parseCurrentToken() { 1971 if p.tok.Type == SelfClosingTagToken { 1972 p.hasSelfClosingToken = true 1973 p.tok.Type = StartTagToken 1974 } 1975 1976 consumed := false 1977 for !consumed { 1978 if p.inForeignContent() { 1979 consumed = parseForeignContent(p) 1980 } else { 1981 consumed = p.im(p) 1982 } 1983 } 1984 1985 if p.hasSelfClosingToken { 1986 // This is a parse error, but ignore it. 1987 p.hasSelfClosingToken = false 1988 } 1989 } 1990 1991 func (p *parser) parse() error { 1992 // Iterate until EOF. Any other error will cause an early return. 1993 var err error 1994 for err != io.EOF { 1995 // CDATA sections are allowed only in foreign content. 1996 n := p.oe.top() 1997 p.tokenizer.AllowCDATA(n != nil && n.Namespace != "") 1998 // Read and parse the next token. 1999 p.tokenizer.Next() 2000 p.tok = p.tokenizer.Token() 2001 if p.tok.Type == ErrorToken { 2002 err = p.tokenizer.Err() 2003 if err != nil && err != io.EOF { 2004 return err 2005 } 2006 } 2007 p.parseCurrentToken() 2008 } 2009 return nil 2010 } 2011 2012 // Parse returns the parse tree for the HTML from the given Reader. 2013 // The input is assumed to be UTF-8 encoded. 2014 func Parse(r io.Reader) (*Node, error) { 2015 p := &parser{ 2016 tokenizer: NewTokenizer(r), 2017 doc: &Node{ 2018 Type: DocumentNode, 2019 }, 2020 scripting: true, 2021 framesetOK: true, 2022 im: initialIM, 2023 } 2024 err := p.parse() 2025 if err != nil { 2026 return nil, err 2027 } 2028 return p.doc, nil 2029 } 2030 2031 // ParseFragment parses a fragment of HTML and returns the nodes that were 2032 // found. If the fragment is the InnerHTML for an existing element, pass that 2033 // element in context. 2034 func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { 2035 contextTag := "" 2036 if context != nil { 2037 if context.Type != ElementNode { 2038 return nil, errors.New("html: ParseFragment of non-element Node") 2039 } 2040 // The next check isn't just context.DataAtom.String() == context.Data because 2041 // it is valid to pass an element whose tag isn't a known atom. For example, 2042 // DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. 2043 if context.DataAtom != a.Lookup([]byte(context.Data)) { 2044 return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) 2045 } 2046 contextTag = context.DataAtom.String() 2047 } 2048 p := &parser{ 2049 tokenizer: NewTokenizerFragment(r, contextTag), 2050 doc: &Node{ 2051 Type: DocumentNode, 2052 }, 2053 scripting: true, 2054 fragment: true, 2055 context: context, 2056 } 2057 2058 root := &Node{ 2059 Type: ElementNode, 2060 DataAtom: a.Html, 2061 Data: a.Html.String(), 2062 } 2063 p.doc.AppendChild(root) 2064 p.oe = nodeStack{root} 2065 p.resetInsertionMode() 2066 2067 for n := context; n != nil; n = n.Parent { 2068 if n.Type == ElementNode && n.DataAtom == a.Form { 2069 p.form = n 2070 break 2071 } 2072 } 2073 2074 err := p.parse() 2075 if err != nil { 2076 return nil, err 2077 } 2078 2079 parent := p.doc 2080 if context != nil { 2081 parent = root 2082 } 2083 2084 var result []*Node 2085 for c := parent.FirstChild; c != nil; { 2086 next := c.NextSibling 2087 parent.RemoveChild(c) 2088 result = append(result, c) 2089 c = next 2090 } 2091 return result, nil 2092 }