github.com/graybobo/golang.org-package-offline-cache@v0.0.0-20200626051047-6608995c132f/x/net/html/parse_test.go (about)

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package html
     6  
     7  import (
     8  	"bufio"
     9  	"bytes"
    10  	"errors"
    11  	"fmt"
    12  	"io"
    13  	"io/ioutil"
    14  	"os"
    15  	"path/filepath"
    16  	"runtime"
    17  	"sort"
    18  	"strings"
    19  	"testing"
    20  
    21  	"golang.org/x/net/html/atom"
    22  )
    23  
    24  // readParseTest reads a single test case from r.
    25  func readParseTest(r *bufio.Reader) (text, want, context string, err error) {
    26  	line, err := r.ReadSlice('\n')
    27  	if err != nil {
    28  		return "", "", "", err
    29  	}
    30  	var b []byte
    31  
    32  	// Read the HTML.
    33  	if string(line) != "#data\n" {
    34  		return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line)
    35  	}
    36  	for {
    37  		line, err = r.ReadSlice('\n')
    38  		if err != nil {
    39  			return "", "", "", err
    40  		}
    41  		if line[0] == '#' {
    42  			break
    43  		}
    44  		b = append(b, line...)
    45  	}
    46  	text = strings.TrimSuffix(string(b), "\n")
    47  	b = b[:0]
    48  
    49  	// Skip the error list.
    50  	if string(line) != "#errors\n" {
    51  		return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
    52  	}
    53  	for {
    54  		line, err = r.ReadSlice('\n')
    55  		if err != nil {
    56  			return "", "", "", err
    57  		}
    58  		if line[0] == '#' {
    59  			break
    60  		}
    61  	}
    62  
    63  	if string(line) == "#document-fragment\n" {
    64  		line, err = r.ReadSlice('\n')
    65  		if err != nil {
    66  			return "", "", "", err
    67  		}
    68  		context = strings.TrimSpace(string(line))
    69  		line, err = r.ReadSlice('\n')
    70  		if err != nil {
    71  			return "", "", "", err
    72  		}
    73  	}
    74  
    75  	// Read the dump of what the parse tree should be.
    76  	if string(line) != "#document\n" {
    77  		return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line)
    78  	}
    79  	inQuote := false
    80  	for {
    81  		line, err = r.ReadSlice('\n')
    82  		if err != nil && err != io.EOF {
    83  			return "", "", "", err
    84  		}
    85  		trimmed := bytes.Trim(line, "| \n")
    86  		if len(trimmed) > 0 {
    87  			if line[0] == '|' && trimmed[0] == '"' {
    88  				inQuote = true
    89  			}
    90  			if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) {
    91  				inQuote = false
    92  			}
    93  		}
    94  		if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote {
    95  			break
    96  		}
    97  		b = append(b, line...)
    98  	}
    99  	return text, string(b), context, nil
   100  }
   101  
   102  func dumpIndent(w io.Writer, level int) {
   103  	io.WriteString(w, "| ")
   104  	for i := 0; i < level; i++ {
   105  		io.WriteString(w, "  ")
   106  	}
   107  }
   108  
   109  type sortedAttributes []Attribute
   110  
   111  func (a sortedAttributes) Len() int {
   112  	return len(a)
   113  }
   114  
   115  func (a sortedAttributes) Less(i, j int) bool {
   116  	if a[i].Namespace != a[j].Namespace {
   117  		return a[i].Namespace < a[j].Namespace
   118  	}
   119  	return a[i].Key < a[j].Key
   120  }
   121  
   122  func (a sortedAttributes) Swap(i, j int) {
   123  	a[i], a[j] = a[j], a[i]
   124  }
   125  
   126  func dumpLevel(w io.Writer, n *Node, level int) error {
   127  	dumpIndent(w, level)
   128  	switch n.Type {
   129  	case ErrorNode:
   130  		return errors.New("unexpected ErrorNode")
   131  	case DocumentNode:
   132  		return errors.New("unexpected DocumentNode")
   133  	case ElementNode:
   134  		if n.Namespace != "" {
   135  			fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
   136  		} else {
   137  			fmt.Fprintf(w, "<%s>", n.Data)
   138  		}
   139  		attr := sortedAttributes(n.Attr)
   140  		sort.Sort(attr)
   141  		for _, a := range attr {
   142  			io.WriteString(w, "\n")
   143  			dumpIndent(w, level+1)
   144  			if a.Namespace != "" {
   145  				fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
   146  			} else {
   147  				fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
   148  			}
   149  		}
   150  	case TextNode:
   151  		fmt.Fprintf(w, `"%s"`, n.Data)
   152  	case CommentNode:
   153  		fmt.Fprintf(w, "<!-- %s -->", n.Data)
   154  	case DoctypeNode:
   155  		fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
   156  		if n.Attr != nil {
   157  			var p, s string
   158  			for _, a := range n.Attr {
   159  				switch a.Key {
   160  				case "public":
   161  					p = a.Val
   162  				case "system":
   163  					s = a.Val
   164  				}
   165  			}
   166  			if p != "" || s != "" {
   167  				fmt.Fprintf(w, ` "%s"`, p)
   168  				fmt.Fprintf(w, ` "%s"`, s)
   169  			}
   170  		}
   171  		io.WriteString(w, ">")
   172  	case scopeMarkerNode:
   173  		return errors.New("unexpected scopeMarkerNode")
   174  	default:
   175  		return errors.New("unknown node type")
   176  	}
   177  	io.WriteString(w, "\n")
   178  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   179  		if err := dumpLevel(w, c, level+1); err != nil {
   180  			return err
   181  		}
   182  	}
   183  	return nil
   184  }
   185  
   186  func dump(n *Node) (string, error) {
   187  	if n == nil || n.FirstChild == nil {
   188  		return "", nil
   189  	}
   190  	var b bytes.Buffer
   191  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   192  		if err := dumpLevel(&b, c, 0); err != nil {
   193  			return "", err
   194  		}
   195  	}
   196  	return b.String(), nil
   197  }
   198  
// testDataDir is the path prefix under which TestParser globs for "*.dat"
// test files. The glob pattern is built by plain string concatenation, so
// the trailing slash is required.
const testDataDir = "testdata/webkit/"
   200  
   201  func TestParser(t *testing.T) {
   202  	testFiles, err := filepath.Glob(testDataDir + "*.dat")
   203  	if err != nil {
   204  		t.Fatal(err)
   205  	}
   206  	for _, tf := range testFiles {
   207  		f, err := os.Open(tf)
   208  		if err != nil {
   209  			t.Fatal(err)
   210  		}
   211  		defer f.Close()
   212  		r := bufio.NewReader(f)
   213  
   214  		for i := 0; ; i++ {
   215  			text, want, context, err := readParseTest(r)
   216  			if err == io.EOF {
   217  				break
   218  			}
   219  			if err != nil {
   220  				t.Fatal(err)
   221  			}
   222  
   223  			err = testParseCase(text, want, context)
   224  
   225  			if err != nil {
   226  				t.Errorf("%s test #%d %q, %s", tf, i, text, err)
   227  			}
   228  		}
   229  	}
   230  }
   231  
   232  // testParseCase tests one test case from the test files. If the test does not
   233  // pass, it returns an error that explains the failure.
   234  // text is the HTML to be parsed, want is a dump of the correct parse tree,
   235  // and context is the name of the context node, if any.
   236  func testParseCase(text, want, context string) (err error) {
   237  	defer func() {
   238  		if x := recover(); x != nil {
   239  			switch e := x.(type) {
   240  			case error:
   241  				err = e
   242  			default:
   243  				err = fmt.Errorf("%v", e)
   244  			}
   245  		}
   246  	}()
   247  
   248  	var doc *Node
   249  	if context == "" {
   250  		doc, err = Parse(strings.NewReader(text))
   251  		if err != nil {
   252  			return err
   253  		}
   254  	} else {
   255  		contextNode := &Node{
   256  			Type:     ElementNode,
   257  			DataAtom: atom.Lookup([]byte(context)),
   258  			Data:     context,
   259  		}
   260  		nodes, err := ParseFragment(strings.NewReader(text), contextNode)
   261  		if err != nil {
   262  			return err
   263  		}
   264  		doc = &Node{
   265  			Type: DocumentNode,
   266  		}
   267  		for _, n := range nodes {
   268  			doc.AppendChild(n)
   269  		}
   270  	}
   271  
   272  	if err := checkTreeConsistency(doc); err != nil {
   273  		return err
   274  	}
   275  
   276  	got, err := dump(doc)
   277  	if err != nil {
   278  		return err
   279  	}
   280  	// Compare the parsed tree to the #document section.
   281  	if got != want {
   282  		return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
   283  	}
   284  
   285  	if renderTestBlacklist[text] || context != "" {
   286  		return nil
   287  	}
   288  
   289  	// Check that rendering and re-parsing results in an identical tree.
   290  	pr, pw := io.Pipe()
   291  	go func() {
   292  		pw.CloseWithError(Render(pw, doc))
   293  	}()
   294  	doc1, err := Parse(pr)
   295  	if err != nil {
   296  		return err
   297  	}
   298  	got1, err := dump(doc1)
   299  	if err != nil {
   300  		return err
   301  	}
   302  	if got != got1 {
   303  		return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
   304  	}
   305  
   306  	return nil
   307  }
   308  
// Some test inputs result in parse trees that are not 'well-formed' despite
// following the HTML5 recovery algorithms. Rendering and re-parsing such a
// tree will not result in an exact clone of that tree. We blacklist such
// inputs from the render test.
//
// The keys are the exact input text of the affected test cases;
// testParseCase skips its render-and-re-parse check for any text found here.
var renderTestBlacklist = map[string]bool{
	// The second <a> will be reparented to the first <table>'s parent. This
	// results in an <a> whose parent is an <a>, which is not 'well-formed'.
	`<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,
	// The same thing with a <p>:
	`<p><table></p>`: true,
	// More cases of <a> being reparented:
	`<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
	`<a><table><a></table><p><a><div><a>`:                                     true,
	`<a><table><td><a><table></table><a></tr><a></table><a>`:                  true,
	// A similar reparenting situation involving <nobr>:
	`<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,
	// A <plaintext> element is reparented, putting it before a table.
	// A <plaintext> element can't have anything after it in HTML.
	`<table><plaintext><td>`:                                   true,
	`<!doctype html><table><plaintext></plaintext>`:            true,
	`<!doctype html><table><tbody><plaintext></plaintext>`:     true,
	`<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,
	// A form inside a table inside a form doesn't work either.
	`<!doctype html><form><table></form><form></table></form>`: true,
	// A script that ends at EOF may escape its own closing tag when rendered.
	`<!doctype html><script><!--<script `:          true,
	`<!doctype html><script><!--<script <`:         true,
	`<!doctype html><script><!--<script <a`:        true,
	`<!doctype html><script><!--<script </`:        true,
	`<!doctype html><script><!--<script </s`:       true,
	`<!doctype html><script><!--<script </script`:  true,
	`<!doctype html><script><!--<script </scripta`: true,
	`<!doctype html><script><!--<script -`:         true,
	`<!doctype html><script><!--<script -a`:        true,
	`<!doctype html><script><!--<script -<`:        true,
	`<!doctype html><script><!--<script --`:        true,
	`<!doctype html><script><!--<script --a`:       true,
	`<!doctype html><script><!--<script --<`:       true,
	`<script><!--<script `:                         true,
	`<script><!--<script <a`:                       true,
	`<script><!--<script </script`:                 true,
	`<script><!--<script </scripta`:                true,
	`<script><!--<script -`:                        true,
	`<script><!--<script -a`:                       true,
	`<script><!--<script --`:                       true,
	`<script><!--<script --a`:                      true,
	`<script><!--<script <`:                        true,
	`<script><!--<script </`:                       true,
	`<script><!--<script </s`:                      true,
	// Reconstructing the active formatting elements results in a <plaintext>
	// element that contains an <a> element.
	`<!doctype html><p><a><plaintext>b`: true,
}
   362  
   363  func TestNodeConsistency(t *testing.T) {
   364  	// inconsistentNode is a Node whose DataAtom and Data do not agree.
   365  	inconsistentNode := &Node{
   366  		Type:     ElementNode,
   367  		DataAtom: atom.Frameset,
   368  		Data:     "table",
   369  	}
   370  	_, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode)
   371  	if err == nil {
   372  		t.Errorf("got nil error, want non-nil")
   373  	}
   374  }
   375  
   376  func BenchmarkParser(b *testing.B) {
   377  	buf, err := ioutil.ReadFile("testdata/go1.html")
   378  	if err != nil {
   379  		b.Fatalf("could not read testdata/go1.html: %v", err)
   380  	}
   381  	b.SetBytes(int64(len(buf)))
   382  	runtime.GC()
   383  	b.ReportAllocs()
   384  	b.ResetTimer()
   385  	for i := 0; i < b.N; i++ {
   386  		Parse(bytes.NewBuffer(buf))
   387  	}
   388  }