golang.org/x/net@v0.25.1-0.20240516223405-c87a5b62e243/html/parse_test.go (about) 1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package html 6 7 import ( 8 "bufio" 9 "bytes" 10 "errors" 11 "fmt" 12 "io" 13 "io/ioutil" 14 "os" 15 "path/filepath" 16 "runtime" 17 "sort" 18 "strings" 19 "testing" 20 21 "golang.org/x/net/html/atom" 22 ) 23 24 type testAttrs struct { 25 text, want, context string 26 scripting bool 27 } 28 29 // readParseTest reads a single test case from r. 30 func readParseTest(r *bufio.Reader) (*testAttrs, error) { 31 ta := &testAttrs{scripting: true} 32 line, err := r.ReadSlice('\n') 33 if err != nil { 34 return nil, err 35 } 36 var b []byte 37 38 // Read the HTML. 39 if string(line) != "#data\n" { 40 return nil, fmt.Errorf(`got %q want "#data\n"`, line) 41 } 42 for { 43 line, err = r.ReadSlice('\n') 44 if err != nil { 45 return nil, err 46 } 47 if line[0] == '#' { 48 break 49 } 50 b = append(b, line...) 51 } 52 ta.text = strings.TrimSuffix(string(b), "\n") 53 b = b[:0] 54 55 // Skip the error list. 56 if string(line) != "#errors\n" { 57 return nil, fmt.Errorf(`got %q want "#errors\n"`, line) 58 } 59 for { 60 line, err = r.ReadSlice('\n') 61 if err != nil { 62 return nil, err 63 } 64 if line[0] == '#' { 65 break 66 } 67 } 68 69 // Skip the new-errors list. 70 if string(line) == "#new-errors\n" { 71 for { 72 line, err = r.ReadSlice('\n') 73 if err != nil { 74 return nil, err 75 } 76 if line[0] == '#' { 77 break 78 } 79 } 80 } 81 82 if ls := string(line); strings.HasPrefix(ls, "#script-") { 83 switch { 84 case strings.HasSuffix(ls, "-on\n"): 85 ta.scripting = true 86 case strings.HasSuffix(ls, "-off\n"): 87 ta.scripting = false 88 default: 89 return nil, fmt.Errorf(`got %q, want "#script-on" or "#script-off"`, line) 90 } 91 for { 92 line, err = r.ReadSlice('\n') 93 if err != nil { 94 return nil, err 95 } 96 if line[0] == '#' { 97 break 98 } 99 } 100 } 101 102 if string(line) == "#document-fragment\n" { 103 line, err = r.ReadSlice('\n') 104 if err != nil { 105 return nil, err 106 } 107 ta.context = strings.TrimSpace(string(line)) 108 line, err = r.ReadSlice('\n') 109 if err != nil { 110 return nil, err 111 } 112 } 113 114 // Read the dump of what the parse tree should be. 115 if string(line) != "#document\n" { 116 return nil, fmt.Errorf(`got %q want "#document\n"`, line) 117 } 118 inQuote := false 119 for { 120 line, err = r.ReadSlice('\n') 121 if err != nil && err != io.EOF { 122 return nil, err 123 } 124 trimmed := bytes.Trim(line, "| \n") 125 if len(trimmed) > 0 { 126 if line[0] == '|' && trimmed[0] == '"' { 127 inQuote = true 128 } 129 if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) { 130 inQuote = false 131 } 132 } 133 if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote { 134 break 135 } 136 b = append(b, line...) 137 } 138 ta.want = string(b) 139 return ta, nil 140 } 141 142 func dumpIndent(w io.Writer, level int) { 143 io.WriteString(w, "| ") 144 for i := 0; i < level; i++ { 145 io.WriteString(w, " ") 146 } 147 } 148 149 type sortedAttributes []Attribute 150 151 func (a sortedAttributes) Len() int { 152 return len(a) 153 } 154 155 func (a sortedAttributes) Less(i, j int) bool { 156 if a[i].Namespace != a[j].Namespace { 157 return a[i].Namespace < a[j].Namespace 158 } 159 return a[i].Key < a[j].Key 160 } 161 162 func (a sortedAttributes) Swap(i, j int) { 163 a[i], a[j] = a[j], a[i] 164 } 165 166 func dumpLevel(w io.Writer, n *Node, level int) error { 167 dumpIndent(w, level) 168 level++ 169 switch n.Type { 170 case ErrorNode: 171 return errors.New("unexpected ErrorNode") 172 case DocumentNode: 173 return errors.New("unexpected DocumentNode") 174 case ElementNode: 175 if n.Namespace != "" { 176 fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data) 177 } else { 178 fmt.Fprintf(w, "<%s>", n.Data) 179 } 180 attr := sortedAttributes(n.Attr) 181 sort.Sort(attr) 182 for _, a := range attr { 183 io.WriteString(w, "\n") 184 dumpIndent(w, level) 185 if a.Namespace != "" { 186 fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val) 187 } else { 188 fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val) 189 } 190 } 191 if n.Namespace == "" && n.DataAtom == atom.Template { 192 io.WriteString(w, "\n") 193 dumpIndent(w, level) 194 level++ 195 io.WriteString(w, "content") 196 } 197 case TextNode: 198 fmt.Fprintf(w, `"%s"`, n.Data) 199 case CommentNode: 200 fmt.Fprintf(w, "<!-- %s -->", n.Data) 201 case DoctypeNode: 202 fmt.Fprintf(w, "<!DOCTYPE %s", n.Data) 203 if n.Attr != nil { 204 var p, s string 205 for _, a := range n.Attr { 206 switch a.Key { 207 case "public": 208 p = a.Val 209 case "system": 210 s = a.Val 211 } 212 } 213 if p != "" || s != "" { 214 fmt.Fprintf(w, ` "%s"`, p) 215 fmt.Fprintf(w, ` "%s"`, s) 216 } 217 } 218 io.WriteString(w, ">") 219 case scopeMarkerNode: 220 return errors.New("unexpected scopeMarkerNode") 221 default: 222 return errors.New("unknown node type") 223 } 224 io.WriteString(w, "\n") 225 for c := n.FirstChild; c != nil; c = c.NextSibling { 226 if err := dumpLevel(w, c, level); err != nil { 227 return err 228 } 229 } 230 return nil 231 } 232 233 func dump(n *Node) (string, error) { 234 if n == nil || n.FirstChild == nil { 235 return "", nil 236 } 237 var b bytes.Buffer 238 for c := n.FirstChild; c != nil; c = c.NextSibling { 239 if err := dumpLevel(&b, c, 0); err != nil { 240 return "", err 241 } 242 } 243 return b.String(), nil 244 } 245 246 var testDataDirs = []string{"testdata/webkit/", "testdata/go/"} 247 248 func TestParser(t *testing.T) { 249 for _, testDataDir := range testDataDirs { 250 testFiles, err := filepath.Glob(testDataDir + "*.dat") 251 if err != nil { 252 t.Fatal(err) 253 } 254 for _, tf := range testFiles { 255 f, err := os.Open(tf) 256 if err != nil { 257 t.Fatal(err) 258 } 259 defer f.Close() 260 r := bufio.NewReader(f) 261 262 for i := 0; ; i++ { 263 ta, err := readParseTest(r) 264 if err == io.EOF { 265 break 266 } 267 if err != nil { 268 t.Fatal(err) 269 } 270 if parseTestBlacklist[ta.text] { 271 continue 272 } 273 274 err = testParseCase(ta.text, ta.want, ta.context, ParseOptionEnableScripting(ta.scripting)) 275 276 if err != nil { 277 t.Errorf("%s test #%d %q, %s", tf, i, ta.text, err) 278 } 279 } 280 } 281 } 282 } 283 284 // Issue 16318 285 func TestParserWithoutScripting(t *testing.T) { 286 text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>` 287 want := `| <html> 288 | <head> 289 | <noscript> 290 | <body> 291 | <img> 292 | src="https://golang.org/doc/gopher/frontpage.png" 293 | <p> 294 | <img> 295 | src="https://golang.org/doc/gopher/doc.png" 296 ` 297 298 if err := testParseCase(text, want, "", ParseOptionEnableScripting(false)); err != nil { 299 t.Errorf("test with scripting is disabled, %q, %s", text, err) 300 } 301 } 302 303 // testParseCase tests one test case from the test files. If the test does not 304 // pass, it returns an error that explains the failure. 305 // text is the HTML to be parsed, want is a dump of the correct parse tree, 306 // and context is the name of the context node, if any. 307 func testParseCase(text, want, context string, opts ...ParseOption) (err error) { 308 defer func() { 309 if x := recover(); x != nil { 310 switch e := x.(type) { 311 case error: 312 err = e 313 default: 314 err = fmt.Errorf("%v", e) 315 } 316 } 317 }() 318 319 var doc *Node 320 if context == "" { 321 doc, err = ParseWithOptions(strings.NewReader(text), opts...) 322 if err != nil { 323 return err 324 } 325 } else { 326 namespace := "" 327 if i := strings.IndexByte(context, ' '); i >= 0 { 328 namespace, context = context[:i], context[i+1:] 329 } 330 contextNode := &Node{ 331 Data: context, 332 DataAtom: atom.Lookup([]byte(context)), 333 Namespace: namespace, 334 Type: ElementNode, 335 } 336 nodes, err := ParseFragmentWithOptions(strings.NewReader(text), contextNode, opts...) 337 if err != nil { 338 return err 339 } 340 doc = &Node{ 341 Type: DocumentNode, 342 } 343 for _, n := range nodes { 344 doc.AppendChild(n) 345 } 346 } 347 348 if err := checkTreeConsistency(doc); err != nil { 349 return err 350 } 351 352 got, err := dump(doc) 353 if err != nil { 354 return err 355 } 356 // Compare the parsed tree to the #document section. 357 if got != want { 358 return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want) 359 } 360 361 if renderTestBlacklist[text] || context != "" { 362 return nil 363 } 364 365 // Check that rendering and re-parsing results in an identical tree. 366 pr, pw := io.Pipe() 367 go func() { 368 pw.CloseWithError(Render(pw, doc)) 369 }() 370 doc1, err := ParseWithOptions(pr, opts...) 371 if err != nil { 372 return err 373 } 374 got1, err := dump(doc1) 375 if err != nil { 376 return err 377 } 378 if got != got1 { 379 return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1) 380 } 381 382 return nil 383 } 384 385 // Some test inputs are simply skipped - we would otherwise fail the test. We 386 // blacklist such inputs from the parse test. 387 var parseTestBlacklist = map[string]bool{ 388 // See the a.Template TODO in inHeadIM. 389 `<math><template><mo><template>`: true, 390 `<template><svg><foo><template><foreignObject><div></template><div>`: true, 391 } 392 393 // Some test input result in parse trees are not 'well-formed' despite 394 // following the HTML5 recovery algorithms. Rendering and re-parsing such a 395 // tree will not result in an exact clone of that tree. We blacklist such 396 // inputs from the render test. 397 var renderTestBlacklist = map[string]bool{ 398 // The second <a> will be reparented to the first <table>'s parent. This 399 // results in an <a> whose parent is an <a>, which is not 'well-formed'. 400 `<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true, 401 // The same thing with a <p>: 402 `<p><table></p>`: true, 403 // More cases of <a> being reparented: 404 `<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true, 405 `<a><table><a></table><p><a><div><a>`: true, 406 `<a><table><td><a><table></table><a></tr><a></table><a>`: true, 407 `<template><a><table><a>`: true, 408 // A similar reparenting situation involving <nobr>: 409 `<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true, 410 // A <plaintext> element is reparented, putting it before a table. 411 // A <plaintext> element can't have anything after it in HTML. 412 `<table><plaintext><td>`: true, 413 `<!doctype html><table><plaintext></plaintext>`: true, 414 `<!doctype html><table><tbody><plaintext></plaintext>`: true, 415 `<!doctype html><table><tbody><tr><plaintext></plaintext>`: true, 416 // A form inside a table inside a form doesn't work either. 417 `<!doctype html><form><table></form><form></table></form>`: true, 418 // A script that ends at EOF may escape its own closing tag when rendered. 419 `<!doctype html><script><!--<script `: true, 420 `<!doctype html><script><!--<script <`: true, 421 `<!doctype html><script><!--<script <a`: true, 422 `<!doctype html><script><!--<script </`: true, 423 `<!doctype html><script><!--<script </s`: true, 424 `<!doctype html><script><!--<script </script`: true, 425 `<!doctype html><script><!--<script </scripta`: true, 426 `<!doctype html><script><!--<script -`: true, 427 `<!doctype html><script><!--<script -a`: true, 428 `<!doctype html><script><!--<script -<`: true, 429 `<!doctype html><script><!--<script --`: true, 430 `<!doctype html><script><!--<script --a`: true, 431 `<!doctype html><script><!--<script --<`: true, 432 `<script><!--<script `: true, 433 `<script><!--<script <a`: true, 434 `<script><!--<script </script`: true, 435 `<script><!--<script </scripta`: true, 436 `<script><!--<script -`: true, 437 `<script><!--<script -a`: true, 438 `<script><!--<script --`: true, 439 `<script><!--<script --a`: true, 440 `<script><!--<script <`: true, 441 `<script><!--<script </`: true, 442 `<script><!--<script </s`: true, 443 // Reconstructing the active formatting elements results in a <plaintext> 444 // element that contains an <a> element. 445 `<!doctype html><p><a><plaintext>b`: true, 446 `<table><math><select><mi><select></table>`: true, 447 `<!doctype html><table><colgroup><plaintext></plaintext>`: true, 448 `<!doctype html><svg><plaintext>a</plaintext>b`: true, 449 } 450 451 func TestNodeConsistency(t *testing.T) { 452 // inconsistentNode is a Node whose DataAtom and Data do not agree. 453 inconsistentNode := &Node{ 454 Type: ElementNode, 455 DataAtom: atom.Frameset, 456 Data: "table", 457 } 458 if _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode); err == nil { 459 t.Errorf("got nil error, want non-nil") 460 } 461 } 462 463 func TestParseFragmentWithNilContext(t *testing.T) { 464 // This shouldn't panic. 465 ParseFragment(strings.NewReader("<p>hello</p>"), nil) 466 } 467 468 func TestParseFragmentForeignContentTemplates(t *testing.T) { 469 srcs := []string{ 470 "<math><html><template><mn><template></template></template>", 471 "<math><math><head><mi><template>", 472 } 473 for _, src := range srcs { 474 // The next line shouldn't infinite-loop. 475 ParseFragment(strings.NewReader(src), nil) 476 } 477 } 478 479 func BenchmarkParser(b *testing.B) { 480 buf, err := ioutil.ReadFile("testdata/go1.html") 481 if err != nil { 482 b.Fatalf("could not read testdata/go1.html: %v", err) 483 } 484 b.SetBytes(int64(len(buf))) 485 runtime.GC() 486 b.ReportAllocs() 487 b.ResetTimer() 488 for i := 0; i < b.N; i++ { 489 Parse(bytes.NewBuffer(buf)) 490 } 491 }