github.com/Andyfoo/golang/x/net@v0.0.0-20190901054642-57c1bf301704/html/parse_test.go

github.com/Andyfoo/golang/x/net@v0.0.0-20190901054642-57c1bf301704/html/parse_test.go (about)

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package html
     6  
     7  import (
     8  	"bufio"
     9  	"bytes"
    10  	"errors"
    11  	"fmt"
    12  	"io"
    13  	"io/ioutil"
    14  	"os"
    15  	"path/filepath"
    16  	"runtime"
    17  	"sort"
    18  	"strings"
    19  	"testing"
    20  
    21  	"github.com/Andyfoo/golang/x/net/html/atom"
    22  )
    23  
    24  // readParseTest reads a single test case from r.
    25  func readParseTest(r *bufio.Reader) (text, want, context string, err error) {
    26  	line, err := r.ReadSlice('\n')
    27  	if err != nil {
    28  		return "", "", "", err
    29  	}
    30  	var b []byte
    31  
    32  	// Read the HTML.
    33  	if string(line) != "#data\n" {
    34  		return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line)
    35  	}
    36  	for {
    37  		line, err = r.ReadSlice('\n')
    38  		if err != nil {
    39  			return "", "", "", err
    40  		}
    41  		if line[0] == '#' {
    42  			break
    43  		}
    44  		b = append(b, line...)
    45  	}
    46  	text = strings.TrimSuffix(string(b), "\n")
    47  	b = b[:0]
    48  
    49  	// Skip the error list.
    50  	if string(line) != "#errors\n" {
    51  		return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
    52  	}
    53  	for {
    54  		line, err = r.ReadSlice('\n')
    55  		if err != nil {
    56  			return "", "", "", err
    57  		}
    58  		if line[0] == '#' {
    59  			break
    60  		}
    61  	}
    62  
    63  	if string(line) == "#document-fragment\n" {
    64  		line, err = r.ReadSlice('\n')
    65  		if err != nil {
    66  			return "", "", "", err
    67  		}
    68  		context = strings.TrimSpace(string(line))
    69  		line, err = r.ReadSlice('\n')
    70  		if err != nil {
    71  			return "", "", "", err
    72  		}
    73  	}
    74  
    75  	// Read the dump of what the parse tree should be.
    76  	if string(line) != "#document\n" {
    77  		return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line)
    78  	}
    79  	inQuote := false
    80  	for {
    81  		line, err = r.ReadSlice('\n')
    82  		if err != nil && err != io.EOF {
    83  			return "", "", "", err
    84  		}
    85  		trimmed := bytes.Trim(line, "| \n")
    86  		if len(trimmed) > 0 {
    87  			if line[0] == '|' && trimmed[0] == '"' {
    88  				inQuote = true
    89  			}
    90  			if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) {
    91  				inQuote = false
    92  			}
    93  		}
    94  		if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote {
    95  			break
    96  		}
    97  		b = append(b, line...)
    98  	}
    99  	return text, string(b), context, nil
   100  }
   101  
   102  func dumpIndent(w io.Writer, level int) {
   103  	io.WriteString(w, "| ")
   104  	for i := 0; i < level; i++ {
   105  		io.WriteString(w, "  ")
   106  	}
   107  }
   108  
   109  type sortedAttributes []Attribute
   110  
   111  func (a sortedAttributes) Len() int {
   112  	return len(a)
   113  }
   114  
   115  func (a sortedAttributes) Less(i, j int) bool {
   116  	if a[i].Namespace != a[j].Namespace {
   117  		return a[i].Namespace < a[j].Namespace
   118  	}
   119  	return a[i].Key < a[j].Key
   120  }
   121  
   122  func (a sortedAttributes) Swap(i, j int) {
   123  	a[i], a[j] = a[j], a[i]
   124  }
   125  
   126  func dumpLevel(w io.Writer, n *Node, level int) error {
   127  	dumpIndent(w, level)
   128  	level++
   129  	switch n.Type {
   130  	case ErrorNode:
   131  		return errors.New("unexpected ErrorNode")
   132  	case DocumentNode:
   133  		return errors.New("unexpected DocumentNode")
   134  	case ElementNode:
   135  		if n.Namespace != "" {
   136  			fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
   137  		} else {
   138  			fmt.Fprintf(w, "<%s>", n.Data)
   139  		}
   140  		attr := sortedAttributes(n.Attr)
   141  		sort.Sort(attr)
   142  		for _, a := range attr {
   143  			io.WriteString(w, "\n")
   144  			dumpIndent(w, level)
   145  			if a.Namespace != "" {
   146  				fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
   147  			} else {
   148  				fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
   149  			}
   150  		}
   151  		if n.Namespace == "" && n.DataAtom == atom.Template {
   152  			io.WriteString(w, "\n")
   153  			dumpIndent(w, level)
   154  			level++
   155  			io.WriteString(w, "content")
   156  		}
   157  	case TextNode:
   158  		fmt.Fprintf(w, `"%s"`, n.Data)
   159  	case CommentNode:
   160  		fmt.Fprintf(w, "<!-- %s -->", n.Data)
   161  	case DoctypeNode:
   162  		fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
   163  		if n.Attr != nil {
   164  			var p, s string
   165  			for _, a := range n.Attr {
   166  				switch a.Key {
   167  				case "public":
   168  					p = a.Val
   169  				case "system":
   170  					s = a.Val
   171  				}
   172  			}
   173  			if p != "" || s != "" {
   174  				fmt.Fprintf(w, ` "%s"`, p)
   175  				fmt.Fprintf(w, ` "%s"`, s)
   176  			}
   177  		}
   178  		io.WriteString(w, ">")
   179  	case scopeMarkerNode:
   180  		return errors.New("unexpected scopeMarkerNode")
   181  	default:
   182  		return errors.New("unknown node type")
   183  	}
   184  	io.WriteString(w, "\n")
   185  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   186  		if err := dumpLevel(w, c, level); err != nil {
   187  			return err
   188  		}
   189  	}
   190  	return nil
   191  }
   192  
   193  func dump(n *Node) (string, error) {
   194  	if n == nil || n.FirstChild == nil {
   195  		return "", nil
   196  	}
   197  	var b bytes.Buffer
   198  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   199  		if err := dumpLevel(&b, c, 0); err != nil {
   200  			return "", err
   201  		}
   202  	}
   203  	return b.String(), nil
   204  }
   205  
// testDataDirs lists the directories that TestParser scans for "*.dat" test
// files. Each entry ends in a slash because the glob pattern is appended to it
// directly.
var testDataDirs = []string{"testdata/webkit/", "testdata/go/"}
   207  
   208  func TestParser(t *testing.T) {
   209  	for _, testDataDir := range testDataDirs {
   210  		testFiles, err := filepath.Glob(testDataDir + "*.dat")
   211  		if err != nil {
   212  			t.Fatal(err)
   213  		}
   214  		for _, tf := range testFiles {
   215  			f, err := os.Open(tf)
   216  			if err != nil {
   217  				t.Fatal(err)
   218  			}
   219  			defer f.Close()
   220  			r := bufio.NewReader(f)
   221  
   222  			for i := 0; ; i++ {
   223  				text, want, context, err := readParseTest(r)
   224  				if err == io.EOF {
   225  					break
   226  				}
   227  				if err != nil {
   228  					t.Fatal(err)
   229  				}
   230  
   231  				err = testParseCase(text, want, context)
   232  
   233  				if err != nil {
   234  					t.Errorf("%s test #%d %q, %s", tf, i, text, err)
   235  				}
   236  			}
   237  		}
   238  	}
   239  }
   240  
   241  // Issue 16318
   242  func TestParserWithoutScripting(t *testing.T) {
   243  	text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>`
   244  	want := `| <html>
   245  |   <head>
   246  |     <noscript>
   247  |   <body>
   248  |     "<img src='https://golang.org/doc/gopher/frontpage.png' />"
   249  |     <p>
   250  |       <img>
   251  |         src="https://golang.org/doc/gopher/doc.png"
   252  `
   253  	err := testParseCase(text, want, "", ParseOptionEnableScripting(false))
   254  
   255  	if err != nil {
   256  		t.Errorf("test with scripting is disabled, %q, %s", text, err)
   257  	}
   258  }
   259  
   260  // testParseCase tests one test case from the test files. If the test does not
   261  // pass, it returns an error that explains the failure.
   262  // text is the HTML to be parsed, want is a dump of the correct parse tree,
   263  // and context is the name of the context node, if any.
   264  func testParseCase(text, want, context string, opts ...ParseOption) (err error) {
   265  	defer func() {
   266  		if x := recover(); x != nil {
   267  			switch e := x.(type) {
   268  			case error:
   269  				err = e
   270  			default:
   271  				err = fmt.Errorf("%v", e)
   272  			}
   273  		}
   274  	}()
   275  
   276  	var doc *Node
   277  	if context == "" {
   278  		doc, err = ParseWithOptions(strings.NewReader(text), opts...)
   279  		if err != nil {
   280  			return err
   281  		}
   282  	} else {
   283  		contextNode := &Node{
   284  			Type:     ElementNode,
   285  			DataAtom: atom.Lookup([]byte(context)),
   286  			Data:     context,
   287  		}
   288  		nodes, err := ParseFragmentWithOptions(strings.NewReader(text), contextNode, opts...)
   289  		if err != nil {
   290  			return err
   291  		}
   292  		doc = &Node{
   293  			Type: DocumentNode,
   294  		}
   295  		for _, n := range nodes {
   296  			doc.AppendChild(n)
   297  		}
   298  	}
   299  
   300  	if err := checkTreeConsistency(doc); err != nil {
   301  		return err
   302  	}
   303  
   304  	got, err := dump(doc)
   305  	if err != nil {
   306  		return err
   307  	}
   308  	// Compare the parsed tree to the #document section.
   309  	if got != want {
   310  		return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
   311  	}
   312  
   313  	if renderTestBlacklist[text] || context != "" {
   314  		return nil
   315  	}
   316  
   317  	// Check that rendering and re-parsing results in an identical tree.
   318  	pr, pw := io.Pipe()
   319  	go func() {
   320  		pw.CloseWithError(Render(pw, doc))
   321  	}()
   322  	doc1, err := Parse(pr)
   323  	if err != nil {
   324  		return err
   325  	}
   326  	got1, err := dump(doc1)
   327  	if err != nil {
   328  		return err
   329  	}
   330  	if got != got1 {
   331  		return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
   332  	}
   333  
   334  	return nil
   335  }
   336  
// Some test inputs result in parse trees that are not 'well-formed' despite
// following the HTML5 recovery algorithms. Rendering and re-parsing such a
// tree will not result in an exact clone of that tree. We blacklist such
// inputs from the render test: testParseCase skips its render-and-re-parse
// check for any input that is a key of this map.
var renderTestBlacklist = map[string]bool{
	// The second <a> will be reparented to the first <table>'s parent. This
	// results in an <a> whose parent is an <a>, which is not 'well-formed'.
	`<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,
	// The same thing with a <p>:
	`<p><table></p>`: true,
	// More cases of <a> being reparented:
	`<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
	`<a><table><a></table><p><a><div><a>`:                                     true,
	`<a><table><td><a><table></table><a></tr><a></table><a>`:                  true,
	`<template><a><table><a>`:                                                 true,
	// A similar reparenting situation involving <nobr>:
	`<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,
	// A <plaintext> element is reparented, putting it before a table.
	// A <plaintext> element can't have anything after it in HTML.
	`<table><plaintext><td>`:                                   true,
	`<!doctype html><table><plaintext></plaintext>`:            true,
	`<!doctype html><table><tbody><plaintext></plaintext>`:     true,
	`<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,
	// A form inside a table inside a form doesn't work either.
	`<!doctype html><form><table></form><form></table></form>`: true,
	// A script that ends at EOF may escape its own closing tag when rendered.
	`<!doctype html><script><!--<script `:          true,
	`<!doctype html><script><!--<script <`:         true,
	`<!doctype html><script><!--<script <a`:        true,
	`<!doctype html><script><!--<script </`:        true,
	`<!doctype html><script><!--<script </s`:       true,
	`<!doctype html><script><!--<script </script`:  true,
	`<!doctype html><script><!--<script </scripta`: true,
	`<!doctype html><script><!--<script -`:         true,
	`<!doctype html><script><!--<script -a`:        true,
	`<!doctype html><script><!--<script -<`:        true,
	`<!doctype html><script><!--<script --`:        true,
	`<!doctype html><script><!--<script --a`:       true,
	`<!doctype html><script><!--<script --<`:       true,
	`<script><!--<script `:                         true,
	`<script><!--<script <a`:                       true,
	`<script><!--<script </script`:                 true,
	`<script><!--<script </scripta`:                true,
	`<script><!--<script -`:                        true,
	`<script><!--<script -a`:                       true,
	`<script><!--<script --`:                       true,
	`<script><!--<script --a`:                      true,
	`<script><!--<script <`:                        true,
	`<script><!--<script </`:                       true,
	`<script><!--<script </s`:                      true,
	// Reconstructing the active formatting elements results in a <plaintext>
	// element that contains an <a> element.
	`<!doctype html><p><a><plaintext>b`:         true,
	`<table><math><select><mi><select></table>`: true,
}
   392  
   393  func TestNodeConsistency(t *testing.T) {
   394  	// inconsistentNode is a Node whose DataAtom and Data do not agree.
   395  	inconsistentNode := &Node{
   396  		Type:     ElementNode,
   397  		DataAtom: atom.Frameset,
   398  		Data:     "table",
   399  	}
   400  	_, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode)
   401  	if err == nil {
   402  		t.Errorf("got nil error, want non-nil")
   403  	}
   404  }
   405  
   406  func TestParseFragmentWithNilContext(t *testing.T) {
   407  	// This shouldn't panic.
   408  	ParseFragment(strings.NewReader("<p>hello</p>"), nil)
   409  }
   410  
   411  func BenchmarkParser(b *testing.B) {
   412  	buf, err := ioutil.ReadFile("testdata/go1.html")
   413  	if err != nil {
   414  		b.Fatalf("could not read testdata/go1.html: %v", err)
   415  	}
   416  	b.SetBytes(int64(len(buf)))
   417  	runtime.GC()
   418  	b.ReportAllocs()
   419  	b.ResetTimer()
   420  	for i := 0; i < b.N; i++ {
   421  		Parse(bytes.NewBuffer(buf))
   422  	}
   423  }