github.com/lianghucheng/zrddz@v0.0.0-20200923083010-c71f680932e2/src/golang.org/x/net/html/parse_test.go (about) 1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package html 6 7 import ( 8 "bufio" 9 "bytes" 10 "errors" 11 "fmt" 12 "io" 13 "io/ioutil" 14 "os" 15 "path/filepath" 16 "runtime" 17 "sort" 18 "strings" 19 "testing" 20 21 "golang.org/x/net/html/atom" 22 ) 23 24 // readParseTest reads a single test case from r. 25 func readParseTest(r *bufio.Reader) (text, want, context string, err error) { 26 line, err := r.ReadSlice('\n') 27 if err != nil { 28 return "", "", "", err 29 } 30 var b []byte 31 32 // Read the HTML. 33 if string(line) != "#data\n" { 34 return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line) 35 } 36 for { 37 line, err = r.ReadSlice('\n') 38 if err != nil { 39 return "", "", "", err 40 } 41 if line[0] == '#' { 42 break 43 } 44 b = append(b, line...) 45 } 46 text = strings.TrimSuffix(string(b), "\n") 47 b = b[:0] 48 49 // Skip the error list. 50 if string(line) != "#errors\n" { 51 return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line) 52 } 53 for { 54 line, err = r.ReadSlice('\n') 55 if err != nil { 56 return "", "", "", err 57 } 58 if line[0] == '#' { 59 break 60 } 61 } 62 63 if string(line) == "#document-fragment\n" { 64 line, err = r.ReadSlice('\n') 65 if err != nil { 66 return "", "", "", err 67 } 68 context = strings.TrimSpace(string(line)) 69 line, err = r.ReadSlice('\n') 70 if err != nil { 71 return "", "", "", err 72 } 73 } 74 75 // Read the dump of what the parse tree should be. 76 if string(line) != "#document\n" { 77 return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line) 78 } 79 inQuote := false 80 for { 81 line, err = r.ReadSlice('\n') 82 if err != nil && err != io.EOF { 83 return "", "", "", err 84 } 85 trimmed := bytes.Trim(line, "| \n") 86 if len(trimmed) > 0 { 87 if line[0] == '|' && trimmed[0] == '"' { 88 inQuote = true 89 } 90 if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) { 91 inQuote = false 92 } 93 } 94 if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote { 95 break 96 } 97 b = append(b, line...) 98 } 99 return text, string(b), context, nil 100 } 101 102 func dumpIndent(w io.Writer, level int) { 103 io.WriteString(w, "| ") 104 for i := 0; i < level; i++ { 105 io.WriteString(w, " ") 106 } 107 } 108 109 type sortedAttributes []Attribute 110 111 func (a sortedAttributes) Len() int { 112 return len(a) 113 } 114 115 func (a sortedAttributes) Less(i, j int) bool { 116 if a[i].Namespace != a[j].Namespace { 117 return a[i].Namespace < a[j].Namespace 118 } 119 return a[i].Key < a[j].Key 120 } 121 122 func (a sortedAttributes) Swap(i, j int) { 123 a[i], a[j] = a[j], a[i] 124 } 125 126 func dumpLevel(w io.Writer, n *Node, level int) error { 127 dumpIndent(w, level) 128 level++ 129 switch n.Type { 130 case ErrorNode: 131 return errors.New("unexpected ErrorNode") 132 case DocumentNode: 133 return errors.New("unexpected DocumentNode") 134 case ElementNode: 135 if n.Namespace != "" { 136 fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data) 137 } else { 138 fmt.Fprintf(w, "<%s>", n.Data) 139 } 140 attr := sortedAttributes(n.Attr) 141 sort.Sort(attr) 142 for _, a := range attr { 143 io.WriteString(w, "\n") 144 dumpIndent(w, level) 145 if a.Namespace != "" { 146 fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val) 147 } else { 148 fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val) 149 } 150 } 151 if n.Namespace == "" && n.DataAtom == atom.Template { 152 io.WriteString(w, "\n") 153 dumpIndent(w, level) 154 level++ 155 io.WriteString(w, "content") 156 } 157 case TextNode: 158 fmt.Fprintf(w, `"%s"`, n.Data) 159 case CommentNode: 160 fmt.Fprintf(w, "<!-- %s -->", n.Data) 161 case DoctypeNode: 162 fmt.Fprintf(w, "<!DOCTYPE %s", n.Data) 163 if n.Attr != nil { 164 var p, s string 165 for _, a := range n.Attr { 166 switch a.Key { 167 case "public": 168 p = a.Val 169 case "system": 170 s = a.Val 171 } 172 } 173 if p != "" || s != "" { 174 fmt.Fprintf(w, ` "%s"`, p) 175 fmt.Fprintf(w, ` "%s"`, s) 176 } 177 } 178 io.WriteString(w, ">") 179 case scopeMarkerNode: 180 return errors.New("unexpected scopeMarkerNode") 181 default: 182 return errors.New("unknown node type") 183 } 184 io.WriteString(w, "\n") 185 for c := n.FirstChild; c != nil; c = c.NextSibling { 186 if err := dumpLevel(w, c, level); err != nil { 187 return err 188 } 189 } 190 return nil 191 } 192 193 func dump(n *Node) (string, error) { 194 if n == nil || n.FirstChild == nil { 195 return "", nil 196 } 197 var b bytes.Buffer 198 for c := n.FirstChild; c != nil; c = c.NextSibling { 199 if err := dumpLevel(&b, c, 0); err != nil { 200 return "", err 201 } 202 } 203 return b.String(), nil 204 } 205 206 var testDataDirs = []string{"testdata/webkit/", "testdata/go/"} 207 208 func TestParser(t *testing.T) { 209 for _, testDataDir := range testDataDirs { 210 testFiles, err := filepath.Glob(testDataDir + "*.dat") 211 if err != nil { 212 t.Fatal(err) 213 } 214 for _, tf := range testFiles { 215 f, err := os.Open(tf) 216 if err != nil { 217 t.Fatal(err) 218 } 219 defer f.Close() 220 r := bufio.NewReader(f) 221 222 for i := 0; ; i++ { 223 text, want, context, err := readParseTest(r) 224 if err == io.EOF { 225 break 226 } 227 if err != nil { 228 t.Fatal(err) 229 } 230 231 err = testParseCase(text, want, context) 232 233 if err != nil { 234 t.Errorf("%s test #%d %q, %s", tf, i, text, err) 235 } 236 } 237 } 238 } 239 } 240 241 // testParseCase tests one test case from the test files. If the test does not 242 // pass, it returns an error that explains the failure. 243 // text is the HTML to be parsed, want is a dump of the correct parse tree, 244 // and context is the name of the context node, if any. 245 func testParseCase(text, want, context string) (err error) { 246 defer func() { 247 if x := recover(); x != nil { 248 switch e := x.(type) { 249 case error: 250 err = e 251 default: 252 err = fmt.Errorf("%v", e) 253 } 254 } 255 }() 256 257 var doc *Node 258 if context == "" { 259 doc, err = Parse(strings.NewReader(text)) 260 if err != nil { 261 return err 262 } 263 } else { 264 contextNode := &Node{ 265 Type: ElementNode, 266 DataAtom: atom.Lookup([]byte(context)), 267 Data: context, 268 } 269 nodes, err := ParseFragment(strings.NewReader(text), contextNode) 270 if err != nil { 271 return err 272 } 273 doc = &Node{ 274 Type: DocumentNode, 275 } 276 for _, n := range nodes { 277 doc.AppendChild(n) 278 } 279 } 280 281 if err := checkTreeConsistency(doc); err != nil { 282 return err 283 } 284 285 got, err := dump(doc) 286 if err != nil { 287 return err 288 } 289 // Compare the parsed tree to the #document section. 290 if got != want { 291 return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want) 292 } 293 294 if renderTestBlacklist[text] || context != "" { 295 return nil 296 } 297 298 // Check that rendering and re-parsing results in an identical tree. 299 pr, pw := io.Pipe() 300 go func() { 301 pw.CloseWithError(Render(pw, doc)) 302 }() 303 doc1, err := Parse(pr) 304 if err != nil { 305 return err 306 } 307 got1, err := dump(doc1) 308 if err != nil { 309 return err 310 } 311 if got != got1 { 312 return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1) 313 } 314 315 return nil 316 } 317 318 // Some test input result in parse trees are not 'well-formed' despite 319 // following the HTML5 recovery algorithms. Rendering and re-parsing such a 320 // tree will not result in an exact clone of that tree. We blacklist such 321 // inputs from the render test. 322 var renderTestBlacklist = map[string]bool{ 323 // The second <a> will be reparented to the first <table>'s parent. This 324 // results in an <a> whose parent is an <a>, which is not 'well-formed'. 325 `<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true, 326 // The same thing with a <p>: 327 `<p><table></p>`: true, 328 // More cases of <a> being reparented: 329 `<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true, 330 `<a><table><a></table><p><a><div><a>`: true, 331 `<a><table><td><a><table></table><a></tr><a></table><a>`: true, 332 `<template><a><table><a>`: true, 333 // A similar reparenting situation involving <nobr>: 334 `<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true, 335 // A <plaintext> element is reparented, putting it before a table. 336 // A <plaintext> element can't have anything after it in HTML. 337 `<table><plaintext><td>`: true, 338 `<!doctype html><table><plaintext></plaintext>`: true, 339 `<!doctype html><table><tbody><plaintext></plaintext>`: true, 340 `<!doctype html><table><tbody><tr><plaintext></plaintext>`: true, 341 // A form inside a table inside a form doesn't work either. 342 `<!doctype html><form><table></form><form></table></form>`: true, 343 // A script that ends at EOF may escape its own closing tag when rendered. 344 `<!doctype html><script><!--<script `: true, 345 `<!doctype html><script><!--<script <`: true, 346 `<!doctype html><script><!--<script <a`: true, 347 `<!doctype html><script><!--<script </`: true, 348 `<!doctype html><script><!--<script </s`: true, 349 `<!doctype html><script><!--<script </script`: true, 350 `<!doctype html><script><!--<script </scripta`: true, 351 `<!doctype html><script><!--<script -`: true, 352 `<!doctype html><script><!--<script -a`: true, 353 `<!doctype html><script><!--<script -<`: true, 354 `<!doctype html><script><!--<script --`: true, 355 `<!doctype html><script><!--<script --a`: true, 356 `<!doctype html><script><!--<script --<`: true, 357 `<script><!--<script `: true, 358 `<script><!--<script <a`: true, 359 `<script><!--<script </script`: true, 360 `<script><!--<script </scripta`: true, 361 `<script><!--<script -`: true, 362 `<script><!--<script -a`: true, 363 `<script><!--<script --`: true, 364 `<script><!--<script --a`: true, 365 `<script><!--<script <`: true, 366 `<script><!--<script </`: true, 367 `<script><!--<script </s`: true, 368 // Reconstructing the active formatting elements results in a <plaintext> 369 // element that contains an <a> element. 370 `<!doctype html><p><a><plaintext>b`: true, 371 `<table><math><select><mi><select></table>`: true, 372 } 373 374 func TestNodeConsistency(t *testing.T) { 375 // inconsistentNode is a Node whose DataAtom and Data do not agree. 376 inconsistentNode := &Node{ 377 Type: ElementNode, 378 DataAtom: atom.Frameset, 379 Data: "table", 380 } 381 _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode) 382 if err == nil { 383 t.Errorf("got nil error, want non-nil") 384 } 385 } 386 387 func TestParseFragmentWithNilContext(t *testing.T) { 388 // This shouldn't panic. 389 ParseFragment(strings.NewReader("<p>hello</p>"), nil) 390 } 391 392 func BenchmarkParser(b *testing.B) { 393 buf, err := ioutil.ReadFile("testdata/go1.html") 394 if err != nil { 395 b.Fatalf("could not read testdata/go1.html: %v", err) 396 } 397 b.SetBytes(int64(len(buf))) 398 runtime.GC() 399 b.ReportAllocs() 400 b.ResetTimer() 401 for i := 0; i < b.N; i++ { 402 Parse(bytes.NewBuffer(buf)) 403 } 404 }