github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/net/html/parse_test.go (about) 1 // Copyright 2010 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package html 6 7 import ( 8 "bufio" 9 "bytes" 10 "errors" 11 "fmt" 12 "io" 13 "io/ioutil" 14 "os" 15 "path/filepath" 16 "runtime" 17 "sort" 18 "strings" 19 "testing" 20 21 "golang.org/x/net/html/atom" 22 ) 23 24 // readParseTest reads a single test case from r. 25 func readParseTest(r *bufio.Reader) (text, want, context string, err error) { 26 line, err := r.ReadSlice('\n') 27 if err != nil { 28 return "", "", "", err 29 } 30 var b []byte 31 32 // Read the HTML. 33 if string(line) != "#data\n" { 34 return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line) 35 } 36 for { 37 line, err = r.ReadSlice('\n') 38 if err != nil { 39 return "", "", "", err 40 } 41 if line[0] == '#' { 42 break 43 } 44 b = append(b, line...) 45 } 46 text = strings.TrimSuffix(string(b), "\n") 47 b = b[:0] 48 49 // Skip the error list. 50 if string(line) != "#errors\n" { 51 return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line) 52 } 53 for { 54 line, err = r.ReadSlice('\n') 55 if err != nil { 56 return "", "", "", err 57 } 58 if line[0] == '#' { 59 break 60 } 61 } 62 63 if string(line) == "#document-fragment\n" { 64 line, err = r.ReadSlice('\n') 65 if err != nil { 66 return "", "", "", err 67 } 68 context = strings.TrimSpace(string(line)) 69 line, err = r.ReadSlice('\n') 70 if err != nil { 71 return "", "", "", err 72 } 73 } 74 75 // Read the dump of what the parse tree should be. 76 if string(line) != "#document\n" { 77 return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line) 78 } 79 inQuote := false 80 for { 81 line, err = r.ReadSlice('\n') 82 if err != nil && err != io.EOF { 83 return "", "", "", err 84 } 85 trimmed := bytes.Trim(line, "| \n") 86 if len(trimmed) > 0 { 87 if line[0] == '|' && trimmed[0] == '"' { 88 inQuote = true 89 } 90 if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) { 91 inQuote = false 92 } 93 } 94 if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote { 95 break 96 } 97 b = append(b, line...) 98 } 99 return text, string(b), context, nil 100 } 101 102 func dumpIndent(w io.Writer, level int) { 103 io.WriteString(w, "| ") 104 for i := 0; i < level; i++ { 105 io.WriteString(w, " ") 106 } 107 } 108 109 type sortedAttributes []Attribute 110 111 func (a sortedAttributes) Len() int { 112 return len(a) 113 } 114 115 func (a sortedAttributes) Less(i, j int) bool { 116 if a[i].Namespace != a[j].Namespace { 117 return a[i].Namespace < a[j].Namespace 118 } 119 return a[i].Key < a[j].Key 120 } 121 122 func (a sortedAttributes) Swap(i, j int) { 123 a[i], a[j] = a[j], a[i] 124 } 125 126 func dumpLevel(w io.Writer, n *Node, level int) error { 127 dumpIndent(w, level) 128 switch n.Type { 129 case ErrorNode: 130 return errors.New("unexpected ErrorNode") 131 case DocumentNode: 132 return errors.New("unexpected DocumentNode") 133 case ElementNode: 134 if n.Namespace != "" { 135 fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data) 136 } else { 137 fmt.Fprintf(w, "<%s>", n.Data) 138 } 139 attr := sortedAttributes(n.Attr) 140 sort.Sort(attr) 141 for _, a := range attr { 142 io.WriteString(w, "\n") 143 dumpIndent(w, level+1) 144 if a.Namespace != "" { 145 fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val) 146 } else { 147 fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val) 148 } 149 } 150 case TextNode: 151 fmt.Fprintf(w, `"%s"`, n.Data) 152 case CommentNode: 153 fmt.Fprintf(w, "<!-- %s -->", n.Data) 154 case DoctypeNode: 155 fmt.Fprintf(w, "<!DOCTYPE %s", n.Data) 156 if n.Attr != nil { 157 var p, s string 158 for _, a := range n.Attr { 159 switch a.Key { 160 case "public": 161 p = a.Val 162 case "system": 163 s = a.Val 164 } 165 } 166 if p != "" || s != "" { 167 fmt.Fprintf(w, ` "%s"`, p) 168 fmt.Fprintf(w, ` "%s"`, s) 169 } 170 } 171 io.WriteString(w, ">") 172 case scopeMarkerNode: 173 return errors.New("unexpected scopeMarkerNode") 174 default: 175 return errors.New("unknown node type") 176 } 177 io.WriteString(w, "\n") 178 for c := n.FirstChild; c != nil; c = c.NextSibling { 179 if err := dumpLevel(w, c, level+1); err != nil { 180 return err 181 } 182 } 183 return nil 184 } 185 186 func dump(n *Node) (string, error) { 187 if n == nil || n.FirstChild == nil { 188 return "", nil 189 } 190 var b bytes.Buffer 191 for c := n.FirstChild; c != nil; c = c.NextSibling { 192 if err := dumpLevel(&b, c, 0); err != nil { 193 return "", err 194 } 195 } 196 return b.String(), nil 197 } 198 199 const testDataDir = "testdata/webkit/" 200 201 func TestParser(t *testing.T) { 202 testFiles, err := filepath.Glob(testDataDir + "*.dat") 203 if err != nil { 204 t.Fatal(err) 205 } 206 for _, tf := range testFiles { 207 f, err := os.Open(tf) 208 if err != nil { 209 t.Fatal(err) 210 } 211 defer f.Close() 212 r := bufio.NewReader(f) 213 214 for i := 0; ; i++ { 215 text, want, context, err := readParseTest(r) 216 if err == io.EOF { 217 break 218 } 219 if err != nil { 220 t.Fatal(err) 221 } 222 223 err = testParseCase(text, want, context) 224 225 if err != nil { 226 t.Errorf("%s test #%d %q, %s", tf, i, text, err) 227 } 228 } 229 } 230 } 231 232 // testParseCase tests one test case from the test files. If the test does not 233 // pass, it returns an error that explains the failure. 234 // text is the HTML to be parsed, want is a dump of the correct parse tree, 235 // and context is the name of the context node, if any. 236 func testParseCase(text, want, context string) (err error) { 237 defer func() { 238 if x := recover(); x != nil { 239 switch e := x.(type) { 240 case error: 241 err = e 242 default: 243 err = fmt.Errorf("%v", e) 244 } 245 } 246 }() 247 248 var doc *Node 249 if context == "" { 250 doc, err = Parse(strings.NewReader(text)) 251 if err != nil { 252 return err 253 } 254 } else { 255 contextNode := &Node{ 256 Type: ElementNode, 257 DataAtom: atom.Lookup([]byte(context)), 258 Data: context, 259 } 260 nodes, err := ParseFragment(strings.NewReader(text), contextNode) 261 if err != nil { 262 return err 263 } 264 doc = &Node{ 265 Type: DocumentNode, 266 } 267 for _, n := range nodes { 268 doc.AppendChild(n) 269 } 270 } 271 272 if err := checkTreeConsistency(doc); err != nil { 273 return err 274 } 275 276 got, err := dump(doc) 277 if err != nil { 278 return err 279 } 280 // Compare the parsed tree to the #document section. 281 if got != want { 282 return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want) 283 } 284 285 if renderTestBlacklist[text] || context != "" { 286 return nil 287 } 288 289 // Check that rendering and re-parsing results in an identical tree. 290 pr, pw := io.Pipe() 291 go func() { 292 pw.CloseWithError(Render(pw, doc)) 293 }() 294 doc1, err := Parse(pr) 295 if err != nil { 296 return err 297 } 298 got1, err := dump(doc1) 299 if err != nil { 300 return err 301 } 302 if got != got1 { 303 return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1) 304 } 305 306 return nil 307 } 308 309 // Some test input result in parse trees are not 'well-formed' despite 310 // following the HTML5 recovery algorithms. Rendering and re-parsing such a 311 // tree will not result in an exact clone of that tree. We blacklist such 312 // inputs from the render test. 313 var renderTestBlacklist = map[string]bool{ 314 // The second <a> will be reparented to the first <table>'s parent. This 315 // results in an <a> whose parent is an <a>, which is not 'well-formed'. 316 `<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true, 317 // The same thing with a <p>: 318 `<p><table></p>`: true, 319 // More cases of <a> being reparented: 320 `<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true, 321 `<a><table><a></table><p><a><div><a>`: true, 322 `<a><table><td><a><table></table><a></tr><a></table><a>`: true, 323 // A similar reparenting situation involving <nobr>: 324 `<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true, 325 // A <plaintext> element is reparented, putting it before a table. 326 // A <plaintext> element can't have anything after it in HTML. 327 `<table><plaintext><td>`: true, 328 `<!doctype html><table><plaintext></plaintext>`: true, 329 `<!doctype html><table><tbody><plaintext></plaintext>`: true, 330 `<!doctype html><table><tbody><tr><plaintext></plaintext>`: true, 331 // A form inside a table inside a form doesn't work either. 332 `<!doctype html><form><table></form><form></table></form>`: true, 333 // A script that ends at EOF may escape its own closing tag when rendered. 334 `<!doctype html><script><!--<script `: true, 335 `<!doctype html><script><!--<script <`: true, 336 `<!doctype html><script><!--<script <a`: true, 337 `<!doctype html><script><!--<script </`: true, 338 `<!doctype html><script><!--<script </s`: true, 339 `<!doctype html><script><!--<script </script`: true, 340 `<!doctype html><script><!--<script </scripta`: true, 341 `<!doctype html><script><!--<script -`: true, 342 `<!doctype html><script><!--<script -a`: true, 343 `<!doctype html><script><!--<script -<`: true, 344 `<!doctype html><script><!--<script --`: true, 345 `<!doctype html><script><!--<script --a`: true, 346 `<!doctype html><script><!--<script --<`: true, 347 `<script><!--<script `: true, 348 `<script><!--<script <a`: true, 349 `<script><!--<script </script`: true, 350 `<script><!--<script </scripta`: true, 351 `<script><!--<script -`: true, 352 `<script><!--<script -a`: true, 353 `<script><!--<script --`: true, 354 `<script><!--<script --a`: true, 355 `<script><!--<script <`: true, 356 `<script><!--<script </`: true, 357 `<script><!--<script </s`: true, 358 // Reconstructing the active formatting elements results in a <plaintext> 359 // element that contains an <a> element. 360 `<!doctype html><p><a><plaintext>b`: true, 361 } 362 363 func TestNodeConsistency(t *testing.T) { 364 // inconsistentNode is a Node whose DataAtom and Data do not agree. 365 inconsistentNode := &Node{ 366 Type: ElementNode, 367 DataAtom: atom.Frameset, 368 Data: "table", 369 } 370 _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode) 371 if err == nil { 372 t.Errorf("got nil error, want non-nil") 373 } 374 } 375 376 func BenchmarkParser(b *testing.B) { 377 buf, err := ioutil.ReadFile("testdata/go1.html") 378 if err != nil { 379 b.Fatalf("could not read testdata/go1.html: %v", err) 380 } 381 b.SetBytes(int64(len(buf))) 382 runtime.GC() 383 b.ReportAllocs() 384 b.ResetTimer() 385 for i := 0; i < b.N; i++ { 386 Parse(bytes.NewBuffer(buf)) 387 } 388 }