github.com/Andyfoo/golang/x/net@v0.0.0-20190901054642-57c1bf301704/html/parse_test.go (about) 1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package html 6 7 import ( 8 "bufio" 9 "bytes" 10 "errors" 11 "fmt" 12 "io" 13 "io/ioutil" 14 "os" 15 "path/filepath" 16 "runtime" 17 "sort" 18 "strings" 19 "testing" 20 21 "github.com/Andyfoo/golang/x/net/html/atom" 22 ) 23 24 // readParseTest reads a single test case from r. 25 func readParseTest(r *bufio.Reader) (text, want, context string, err error) { 26 line, err := r.ReadSlice('\n') 27 if err != nil { 28 return "", "", "", err 29 } 30 var b []byte 31 32 // Read the HTML. 33 if string(line) != "#data\n" { 34 return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line) 35 } 36 for { 37 line, err = r.ReadSlice('\n') 38 if err != nil { 39 return "", "", "", err 40 } 41 if line[0] == '#' { 42 break 43 } 44 b = append(b, line...) 45 } 46 text = strings.TrimSuffix(string(b), "\n") 47 b = b[:0] 48 49 // Skip the error list. 50 if string(line) != "#errors\n" { 51 return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line) 52 } 53 for { 54 line, err = r.ReadSlice('\n') 55 if err != nil { 56 return "", "", "", err 57 } 58 if line[0] == '#' { 59 break 60 } 61 } 62 63 if string(line) == "#document-fragment\n" { 64 line, err = r.ReadSlice('\n') 65 if err != nil { 66 return "", "", "", err 67 } 68 context = strings.TrimSpace(string(line)) 69 line, err = r.ReadSlice('\n') 70 if err != nil { 71 return "", "", "", err 72 } 73 } 74 75 // Read the dump of what the parse tree should be. 76 if string(line) != "#document\n" { 77 return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line) 78 } 79 inQuote := false 80 for { 81 line, err = r.ReadSlice('\n') 82 if err != nil && err != io.EOF { 83 return "", "", "", err 84 } 85 trimmed := bytes.Trim(line, "| \n") 86 if len(trimmed) > 0 { 87 if line[0] == '|' && trimmed[0] == '"' { 88 inQuote = true 89 } 90 if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) { 91 inQuote = false 92 } 93 } 94 if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote { 95 break 96 } 97 b = append(b, line...) 98 } 99 return text, string(b), context, nil 100 } 101 102 func dumpIndent(w io.Writer, level int) { 103 io.WriteString(w, "| ") 104 for i := 0; i < level; i++ { 105 io.WriteString(w, " ") 106 } 107 } 108 109 type sortedAttributes []Attribute 110 111 func (a sortedAttributes) Len() int { 112 return len(a) 113 } 114 115 func (a sortedAttributes) Less(i, j int) bool { 116 if a[i].Namespace != a[j].Namespace { 117 return a[i].Namespace < a[j].Namespace 118 } 119 return a[i].Key < a[j].Key 120 } 121 122 func (a sortedAttributes) Swap(i, j int) { 123 a[i], a[j] = a[j], a[i] 124 } 125 126 func dumpLevel(w io.Writer, n *Node, level int) error { 127 dumpIndent(w, level) 128 level++ 129 switch n.Type { 130 case ErrorNode: 131 return errors.New("unexpected ErrorNode") 132 case DocumentNode: 133 return errors.New("unexpected DocumentNode") 134 case ElementNode: 135 if n.Namespace != "" { 136 fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data) 137 } else { 138 fmt.Fprintf(w, "<%s>", n.Data) 139 } 140 attr := sortedAttributes(n.Attr) 141 sort.Sort(attr) 142 for _, a := range attr { 143 io.WriteString(w, "\n") 144 dumpIndent(w, level) 145 if a.Namespace != "" { 146 fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val) 147 } else { 148 fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val) 149 } 150 } 151 if n.Namespace == "" && n.DataAtom == atom.Template { 152 io.WriteString(w, "\n") 153 dumpIndent(w, level) 154 level++ 155 io.WriteString(w, "content") 156 } 157 case TextNode: 158 fmt.Fprintf(w, `"%s"`, n.Data) 159 case CommentNode: 160 fmt.Fprintf(w, "<!-- %s -->", n.Data) 161 case DoctypeNode: 162 fmt.Fprintf(w, "<!DOCTYPE %s", n.Data) 163 if n.Attr != nil { 164 var p, s string 165 for _, a := range n.Attr { 166 switch a.Key { 167 case "public": 168 p = a.Val 169 case "system": 170 s = a.Val 171 } 172 } 173 if p != "" || s != "" { 174 fmt.Fprintf(w, ` "%s"`, p) 175 fmt.Fprintf(w, ` "%s"`, s) 176 } 177 } 178 io.WriteString(w, ">") 179 case scopeMarkerNode: 180 return errors.New("unexpected scopeMarkerNode") 181 default: 182 return errors.New("unknown node type") 183 } 184 io.WriteString(w, "\n") 185 for c := n.FirstChild; c != nil; c = c.NextSibling { 186 if err := dumpLevel(w, c, level); err != nil { 187 return err 188 } 189 } 190 return nil 191 } 192 193 func dump(n *Node) (string, error) { 194 if n == nil || n.FirstChild == nil { 195 return "", nil 196 } 197 var b bytes.Buffer 198 for c := n.FirstChild; c != nil; c = c.NextSibling { 199 if err := dumpLevel(&b, c, 0); err != nil { 200 return "", err 201 } 202 } 203 return b.String(), nil 204 } 205 206 var testDataDirs = []string{"testdata/webkit/", "testdata/go/"} 207 208 func TestParser(t *testing.T) { 209 for _, testDataDir := range testDataDirs { 210 testFiles, err := filepath.Glob(testDataDir + "*.dat") 211 if err != nil { 212 t.Fatal(err) 213 } 214 for _, tf := range testFiles { 215 f, err := os.Open(tf) 216 if err != nil { 217 t.Fatal(err) 218 } 219 defer f.Close() 220 r := bufio.NewReader(f) 221 222 for i := 0; ; i++ { 223 text, want, context, err := readParseTest(r) 224 if err == io.EOF { 225 break 226 } 227 if err != nil { 228 t.Fatal(err) 229 } 230 231 err = testParseCase(text, want, context) 232 233 if err != nil { 234 t.Errorf("%s test #%d %q, %s", tf, i, text, err) 235 } 236 } 237 } 238 } 239 } 240 241 // Issue 16318 242 func TestParserWithoutScripting(t *testing.T) { 243 text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>` 244 want := `| <html> 245 | <head> 246 | <noscript> 247 | <body> 248 | "<img src='https://golang.org/doc/gopher/frontpage.png' />" 249 | <p> 250 | <img> 251 | src="https://golang.org/doc/gopher/doc.png" 252 ` 253 err := testParseCase(text, want, "", ParseOptionEnableScripting(false)) 254 255 if err != nil { 256 t.Errorf("test with scripting is disabled, %q, %s", text, err) 257 } 258 } 259 260 // testParseCase tests one test case from the test files. If the test does not 261 // pass, it returns an error that explains the failure. 262 // text is the HTML to be parsed, want is a dump of the correct parse tree, 263 // and context is the name of the context node, if any. 264 func testParseCase(text, want, context string, opts ...ParseOption) (err error) { 265 defer func() { 266 if x := recover(); x != nil { 267 switch e := x.(type) { 268 case error: 269 err = e 270 default: 271 err = fmt.Errorf("%v", e) 272 } 273 } 274 }() 275 276 var doc *Node 277 if context == "" { 278 doc, err = ParseWithOptions(strings.NewReader(text), opts...) 279 if err != nil { 280 return err 281 } 282 } else { 283 contextNode := &Node{ 284 Type: ElementNode, 285 DataAtom: atom.Lookup([]byte(context)), 286 Data: context, 287 } 288 nodes, err := ParseFragmentWithOptions(strings.NewReader(text), contextNode, opts...) 289 if err != nil { 290 return err 291 } 292 doc = &Node{ 293 Type: DocumentNode, 294 } 295 for _, n := range nodes { 296 doc.AppendChild(n) 297 } 298 } 299 300 if err := checkTreeConsistency(doc); err != nil { 301 return err 302 } 303 304 got, err := dump(doc) 305 if err != nil { 306 return err 307 } 308 // Compare the parsed tree to the #document section. 309 if got != want { 310 return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want) 311 } 312 313 if renderTestBlacklist[text] || context != "" { 314 return nil 315 } 316 317 // Check that rendering and re-parsing results in an identical tree. 318 pr, pw := io.Pipe() 319 go func() { 320 pw.CloseWithError(Render(pw, doc)) 321 }() 322 doc1, err := Parse(pr) 323 if err != nil { 324 return err 325 } 326 got1, err := dump(doc1) 327 if err != nil { 328 return err 329 } 330 if got != got1 { 331 return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1) 332 } 333 334 return nil 335 } 336 337 // Some test input result in parse trees are not 'well-formed' despite 338 // following the HTML5 recovery algorithms. Rendering and re-parsing such a 339 // tree will not result in an exact clone of that tree. We blacklist such 340 // inputs from the render test. 341 var renderTestBlacklist = map[string]bool{ 342 // The second <a> will be reparented to the first <table>'s parent. This 343 // results in an <a> whose parent is an <a>, which is not 'well-formed'. 344 `<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true, 345 // The same thing with a <p>: 346 `<p><table></p>`: true, 347 // More cases of <a> being reparented: 348 `<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true, 349 `<a><table><a></table><p><a><div><a>`: true, 350 `<a><table><td><a><table></table><a></tr><a></table><a>`: true, 351 `<template><a><table><a>`: true, 352 // A similar reparenting situation involving <nobr>: 353 `<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true, 354 // A <plaintext> element is reparented, putting it before a table. 355 // A <plaintext> element can't have anything after it in HTML. 356 `<table><plaintext><td>`: true, 357 `<!doctype html><table><plaintext></plaintext>`: true, 358 `<!doctype html><table><tbody><plaintext></plaintext>`: true, 359 `<!doctype html><table><tbody><tr><plaintext></plaintext>`: true, 360 // A form inside a table inside a form doesn't work either. 361 `<!doctype html><form><table></form><form></table></form>`: true, 362 // A script that ends at EOF may escape its own closing tag when rendered. 363 `<!doctype html><script><!--<script `: true, 364 `<!doctype html><script><!--<script <`: true, 365 `<!doctype html><script><!--<script <a`: true, 366 `<!doctype html><script><!--<script </`: true, 367 `<!doctype html><script><!--<script </s`: true, 368 `<!doctype html><script><!--<script </script`: true, 369 `<!doctype html><script><!--<script </scripta`: true, 370 `<!doctype html><script><!--<script -`: true, 371 `<!doctype html><script><!--<script -a`: true, 372 `<!doctype html><script><!--<script -<`: true, 373 `<!doctype html><script><!--<script --`: true, 374 `<!doctype html><script><!--<script --a`: true, 375 `<!doctype html><script><!--<script --<`: true, 376 `<script><!--<script `: true, 377 `<script><!--<script <a`: true, 378 `<script><!--<script </script`: true, 379 `<script><!--<script </scripta`: true, 380 `<script><!--<script -`: true, 381 `<script><!--<script -a`: true, 382 `<script><!--<script --`: true, 383 `<script><!--<script --a`: true, 384 `<script><!--<script <`: true, 385 `<script><!--<script </`: true, 386 `<script><!--<script </s`: true, 387 // Reconstructing the active formatting elements results in a <plaintext> 388 // element that contains an <a> element. 389 `<!doctype html><p><a><plaintext>b`: true, 390 `<table><math><select><mi><select></table>`: true, 391 } 392 393 func TestNodeConsistency(t *testing.T) { 394 // inconsistentNode is a Node whose DataAtom and Data do not agree. 395 inconsistentNode := &Node{ 396 Type: ElementNode, 397 DataAtom: atom.Frameset, 398 Data: "table", 399 } 400 _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode) 401 if err == nil { 402 t.Errorf("got nil error, want non-nil") 403 } 404 } 405 406 func TestParseFragmentWithNilContext(t *testing.T) { 407 // This shouldn't panic. 408 ParseFragment(strings.NewReader("<p>hello</p>"), nil) 409 } 410 411 func BenchmarkParser(b *testing.B) { 412 buf, err := ioutil.ReadFile("testdata/go1.html") 413 if err != nil { 414 b.Fatalf("could not read testdata/go1.html: %v", err) 415 } 416 b.SetBytes(int64(len(buf))) 417 runtime.GC() 418 b.ReportAllocs() 419 b.ResetTimer() 420 for i := 0; i < b.N; i++ { 421 Parse(bytes.NewBuffer(buf)) 422 } 423 }