golang.org/x/net@v0.25.1-0.20240516223405-c87a5b62e243/html/parse_test.go (about)

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package html
     6  
     7  import (
     8  	"bufio"
     9  	"bytes"
    10  	"errors"
    11  	"fmt"
    12  	"io"
    13  	"io/ioutil"
    14  	"os"
    15  	"path/filepath"
    16  	"runtime"
    17  	"sort"
    18  	"strings"
    19  	"testing"
    20  
    21  	"golang.org/x/net/html/atom"
    22  )
    23  
    24  type testAttrs struct {
    25  	text, want, context string
    26  	scripting           bool
    27  }
    28  
    29  // readParseTest reads a single test case from r.
    30  func readParseTest(r *bufio.Reader) (*testAttrs, error) {
    31  	ta := &testAttrs{scripting: true}
    32  	line, err := r.ReadSlice('\n')
    33  	if err != nil {
    34  		return nil, err
    35  	}
    36  	var b []byte
    37  
    38  	// Read the HTML.
    39  	if string(line) != "#data\n" {
    40  		return nil, fmt.Errorf(`got %q want "#data\n"`, line)
    41  	}
    42  	for {
    43  		line, err = r.ReadSlice('\n')
    44  		if err != nil {
    45  			return nil, err
    46  		}
    47  		if line[0] == '#' {
    48  			break
    49  		}
    50  		b = append(b, line...)
    51  	}
    52  	ta.text = strings.TrimSuffix(string(b), "\n")
    53  	b = b[:0]
    54  
    55  	// Skip the error list.
    56  	if string(line) != "#errors\n" {
    57  		return nil, fmt.Errorf(`got %q want "#errors\n"`, line)
    58  	}
    59  	for {
    60  		line, err = r.ReadSlice('\n')
    61  		if err != nil {
    62  			return nil, err
    63  		}
    64  		if line[0] == '#' {
    65  			break
    66  		}
    67  	}
    68  
    69  	// Skip the new-errors list.
    70  	if string(line) == "#new-errors\n" {
    71  		for {
    72  			line, err = r.ReadSlice('\n')
    73  			if err != nil {
    74  				return nil, err
    75  			}
    76  			if line[0] == '#' {
    77  				break
    78  			}
    79  		}
    80  	}
    81  
    82  	if ls := string(line); strings.HasPrefix(ls, "#script-") {
    83  		switch {
    84  		case strings.HasSuffix(ls, "-on\n"):
    85  			ta.scripting = true
    86  		case strings.HasSuffix(ls, "-off\n"):
    87  			ta.scripting = false
    88  		default:
    89  			return nil, fmt.Errorf(`got %q, want "#script-on" or "#script-off"`, line)
    90  		}
    91  		for {
    92  			line, err = r.ReadSlice('\n')
    93  			if err != nil {
    94  				return nil, err
    95  			}
    96  			if line[0] == '#' {
    97  				break
    98  			}
    99  		}
   100  	}
   101  
   102  	if string(line) == "#document-fragment\n" {
   103  		line, err = r.ReadSlice('\n')
   104  		if err != nil {
   105  			return nil, err
   106  		}
   107  		ta.context = strings.TrimSpace(string(line))
   108  		line, err = r.ReadSlice('\n')
   109  		if err != nil {
   110  			return nil, err
   111  		}
   112  	}
   113  
   114  	// Read the dump of what the parse tree should be.
   115  	if string(line) != "#document\n" {
   116  		return nil, fmt.Errorf(`got %q want "#document\n"`, line)
   117  	}
   118  	inQuote := false
   119  	for {
   120  		line, err = r.ReadSlice('\n')
   121  		if err != nil && err != io.EOF {
   122  			return nil, err
   123  		}
   124  		trimmed := bytes.Trim(line, "| \n")
   125  		if len(trimmed) > 0 {
   126  			if line[0] == '|' && trimmed[0] == '"' {
   127  				inQuote = true
   128  			}
   129  			if trimmed[len(trimmed)-1] == '"' && !(line[0] == '|' && len(trimmed) == 1) {
   130  				inQuote = false
   131  			}
   132  		}
   133  		if len(line) == 0 || len(line) == 1 && line[0] == '\n' && !inQuote {
   134  			break
   135  		}
   136  		b = append(b, line...)
   137  	}
   138  	ta.want = string(b)
   139  	return ta, nil
   140  }
   141  
   142  func dumpIndent(w io.Writer, level int) {
   143  	io.WriteString(w, "| ")
   144  	for i := 0; i < level; i++ {
   145  		io.WriteString(w, "  ")
   146  	}
   147  }
   148  
   149  type sortedAttributes []Attribute
   150  
   151  func (a sortedAttributes) Len() int {
   152  	return len(a)
   153  }
   154  
   155  func (a sortedAttributes) Less(i, j int) bool {
   156  	if a[i].Namespace != a[j].Namespace {
   157  		return a[i].Namespace < a[j].Namespace
   158  	}
   159  	return a[i].Key < a[j].Key
   160  }
   161  
   162  func (a sortedAttributes) Swap(i, j int) {
   163  	a[i], a[j] = a[j], a[i]
   164  }
   165  
   166  func dumpLevel(w io.Writer, n *Node, level int) error {
   167  	dumpIndent(w, level)
   168  	level++
   169  	switch n.Type {
   170  	case ErrorNode:
   171  		return errors.New("unexpected ErrorNode")
   172  	case DocumentNode:
   173  		return errors.New("unexpected DocumentNode")
   174  	case ElementNode:
   175  		if n.Namespace != "" {
   176  			fmt.Fprintf(w, "<%s %s>", n.Namespace, n.Data)
   177  		} else {
   178  			fmt.Fprintf(w, "<%s>", n.Data)
   179  		}
   180  		attr := sortedAttributes(n.Attr)
   181  		sort.Sort(attr)
   182  		for _, a := range attr {
   183  			io.WriteString(w, "\n")
   184  			dumpIndent(w, level)
   185  			if a.Namespace != "" {
   186  				fmt.Fprintf(w, `%s %s="%s"`, a.Namespace, a.Key, a.Val)
   187  			} else {
   188  				fmt.Fprintf(w, `%s="%s"`, a.Key, a.Val)
   189  			}
   190  		}
   191  		if n.Namespace == "" && n.DataAtom == atom.Template {
   192  			io.WriteString(w, "\n")
   193  			dumpIndent(w, level)
   194  			level++
   195  			io.WriteString(w, "content")
   196  		}
   197  	case TextNode:
   198  		fmt.Fprintf(w, `"%s"`, n.Data)
   199  	case CommentNode:
   200  		fmt.Fprintf(w, "<!-- %s -->", n.Data)
   201  	case DoctypeNode:
   202  		fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
   203  		if n.Attr != nil {
   204  			var p, s string
   205  			for _, a := range n.Attr {
   206  				switch a.Key {
   207  				case "public":
   208  					p = a.Val
   209  				case "system":
   210  					s = a.Val
   211  				}
   212  			}
   213  			if p != "" || s != "" {
   214  				fmt.Fprintf(w, ` "%s"`, p)
   215  				fmt.Fprintf(w, ` "%s"`, s)
   216  			}
   217  		}
   218  		io.WriteString(w, ">")
   219  	case scopeMarkerNode:
   220  		return errors.New("unexpected scopeMarkerNode")
   221  	default:
   222  		return errors.New("unknown node type")
   223  	}
   224  	io.WriteString(w, "\n")
   225  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   226  		if err := dumpLevel(w, c, level); err != nil {
   227  			return err
   228  		}
   229  	}
   230  	return nil
   231  }
   232  
   233  func dump(n *Node) (string, error) {
   234  	if n == nil || n.FirstChild == nil {
   235  		return "", nil
   236  	}
   237  	var b bytes.Buffer
   238  	for c := n.FirstChild; c != nil; c = c.NextSibling {
   239  		if err := dumpLevel(&b, c, 0); err != nil {
   240  			return "", err
   241  		}
   242  	}
   243  	return b.String(), nil
   244  }
   245  
   246  var testDataDirs = []string{"testdata/webkit/", "testdata/go/"}
   247  
   248  func TestParser(t *testing.T) {
   249  	for _, testDataDir := range testDataDirs {
   250  		testFiles, err := filepath.Glob(testDataDir + "*.dat")
   251  		if err != nil {
   252  			t.Fatal(err)
   253  		}
   254  		for _, tf := range testFiles {
   255  			f, err := os.Open(tf)
   256  			if err != nil {
   257  				t.Fatal(err)
   258  			}
   259  			defer f.Close()
   260  			r := bufio.NewReader(f)
   261  
   262  			for i := 0; ; i++ {
   263  				ta, err := readParseTest(r)
   264  				if err == io.EOF {
   265  					break
   266  				}
   267  				if err != nil {
   268  					t.Fatal(err)
   269  				}
   270  				if parseTestBlacklist[ta.text] {
   271  					continue
   272  				}
   273  
   274  				err = testParseCase(ta.text, ta.want, ta.context, ParseOptionEnableScripting(ta.scripting))
   275  
   276  				if err != nil {
   277  					t.Errorf("%s test #%d %q, %s", tf, i, ta.text, err)
   278  				}
   279  			}
   280  		}
   281  	}
   282  }
   283  
   284  // Issue 16318
   285  func TestParserWithoutScripting(t *testing.T) {
   286  	text := `<noscript><img src='https://golang.org/doc/gopher/frontpage.png' /></noscript><p><img src='https://golang.org/doc/gopher/doc.png' /></p>`
   287  	want := `| <html>
   288  |   <head>
   289  |     <noscript>
   290  |   <body>
   291  |     <img>
   292  |       src="https://golang.org/doc/gopher/frontpage.png"
   293  |     <p>
   294  |       <img>
   295  |         src="https://golang.org/doc/gopher/doc.png"
   296  `
   297  
   298  	if err := testParseCase(text, want, "", ParseOptionEnableScripting(false)); err != nil {
   299  		t.Errorf("test with scripting is disabled, %q, %s", text, err)
   300  	}
   301  }
   302  
   303  // testParseCase tests one test case from the test files. If the test does not
   304  // pass, it returns an error that explains the failure.
   305  // text is the HTML to be parsed, want is a dump of the correct parse tree,
   306  // and context is the name of the context node, if any.
   307  func testParseCase(text, want, context string, opts ...ParseOption) (err error) {
   308  	defer func() {
   309  		if x := recover(); x != nil {
   310  			switch e := x.(type) {
   311  			case error:
   312  				err = e
   313  			default:
   314  				err = fmt.Errorf("%v", e)
   315  			}
   316  		}
   317  	}()
   318  
   319  	var doc *Node
   320  	if context == "" {
   321  		doc, err = ParseWithOptions(strings.NewReader(text), opts...)
   322  		if err != nil {
   323  			return err
   324  		}
   325  	} else {
   326  		namespace := ""
   327  		if i := strings.IndexByte(context, ' '); i >= 0 {
   328  			namespace, context = context[:i], context[i+1:]
   329  		}
   330  		contextNode := &Node{
   331  			Data:      context,
   332  			DataAtom:  atom.Lookup([]byte(context)),
   333  			Namespace: namespace,
   334  			Type:      ElementNode,
   335  		}
   336  		nodes, err := ParseFragmentWithOptions(strings.NewReader(text), contextNode, opts...)
   337  		if err != nil {
   338  			return err
   339  		}
   340  		doc = &Node{
   341  			Type: DocumentNode,
   342  		}
   343  		for _, n := range nodes {
   344  			doc.AppendChild(n)
   345  		}
   346  	}
   347  
   348  	if err := checkTreeConsistency(doc); err != nil {
   349  		return err
   350  	}
   351  
   352  	got, err := dump(doc)
   353  	if err != nil {
   354  		return err
   355  	}
   356  	// Compare the parsed tree to the #document section.
   357  	if got != want {
   358  		return fmt.Errorf("got vs want:\n----\n%s----\n%s----", got, want)
   359  	}
   360  
   361  	if renderTestBlacklist[text] || context != "" {
   362  		return nil
   363  	}
   364  
   365  	// Check that rendering and re-parsing results in an identical tree.
   366  	pr, pw := io.Pipe()
   367  	go func() {
   368  		pw.CloseWithError(Render(pw, doc))
   369  	}()
   370  	doc1, err := ParseWithOptions(pr, opts...)
   371  	if err != nil {
   372  		return err
   373  	}
   374  	got1, err := dump(doc1)
   375  	if err != nil {
   376  		return err
   377  	}
   378  	if got != got1 {
   379  		return fmt.Errorf("got vs got1:\n----\n%s----\n%s----", got, got1)
   380  	}
   381  
   382  	return nil
   383  }
   384  
   385  // Some test inputs are simply skipped - we would otherwise fail the test. We
   386  // blacklist such inputs from the parse test.
   387  var parseTestBlacklist = map[string]bool{
   388  	// See the a.Template TODO in inHeadIM.
   389  	`<math><template><mo><template>`:                                     true,
   390  	`<template><svg><foo><template><foreignObject><div></template><div>`: true,
   391  }
   392  
   393  // Some test input result in parse trees are not 'well-formed' despite
   394  // following the HTML5 recovery algorithms. Rendering and re-parsing such a
   395  // tree will not result in an exact clone of that tree. We blacklist such
   396  // inputs from the render test.
   397  var renderTestBlacklist = map[string]bool{
   398  	// The second <a> will be reparented to the first <table>'s parent. This
   399  	// results in an <a> whose parent is an <a>, which is not 'well-formed'.
   400  	`<a><table><td><a><table></table><a></tr><a></table><b>X</b>C<a>Y`: true,
   401  	// The same thing with a <p>:
   402  	`<p><table></p>`: true,
   403  	// More cases of <a> being reparented:
   404  	`<a href="blah">aba<table><a href="foo">br<tr><td></td></tr>x</table>aoe`: true,
   405  	`<a><table><a></table><p><a><div><a>`:                                     true,
   406  	`<a><table><td><a><table></table><a></tr><a></table><a>`:                  true,
   407  	`<template><a><table><a>`:                                                 true,
   408  	// A similar reparenting situation involving <nobr>:
   409  	`<!DOCTYPE html><body><b><nobr>1<table><nobr></b><i><nobr>2<nobr></i>3`: true,
   410  	// A <plaintext> element is reparented, putting it before a table.
   411  	// A <plaintext> element can't have anything after it in HTML.
   412  	`<table><plaintext><td>`:                                   true,
   413  	`<!doctype html><table><plaintext></plaintext>`:            true,
   414  	`<!doctype html><table><tbody><plaintext></plaintext>`:     true,
   415  	`<!doctype html><table><tbody><tr><plaintext></plaintext>`: true,
   416  	// A form inside a table inside a form doesn't work either.
   417  	`<!doctype html><form><table></form><form></table></form>`: true,
   418  	// A script that ends at EOF may escape its own closing tag when rendered.
   419  	`<!doctype html><script><!--<script `:          true,
   420  	`<!doctype html><script><!--<script <`:         true,
   421  	`<!doctype html><script><!--<script <a`:        true,
   422  	`<!doctype html><script><!--<script </`:        true,
   423  	`<!doctype html><script><!--<script </s`:       true,
   424  	`<!doctype html><script><!--<script </script`:  true,
   425  	`<!doctype html><script><!--<script </scripta`: true,
   426  	`<!doctype html><script><!--<script -`:         true,
   427  	`<!doctype html><script><!--<script -a`:        true,
   428  	`<!doctype html><script><!--<script -<`:        true,
   429  	`<!doctype html><script><!--<script --`:        true,
   430  	`<!doctype html><script><!--<script --a`:       true,
   431  	`<!doctype html><script><!--<script --<`:       true,
   432  	`<script><!--<script `:                         true,
   433  	`<script><!--<script <a`:                       true,
   434  	`<script><!--<script </script`:                 true,
   435  	`<script><!--<script </scripta`:                true,
   436  	`<script><!--<script -`:                        true,
   437  	`<script><!--<script -a`:                       true,
   438  	`<script><!--<script --`:                       true,
   439  	`<script><!--<script --a`:                      true,
   440  	`<script><!--<script <`:                        true,
   441  	`<script><!--<script </`:                       true,
   442  	`<script><!--<script </s`:                      true,
   443  	// Reconstructing the active formatting elements results in a <plaintext>
   444  	// element that contains an <a> element.
   445  	`<!doctype html><p><a><plaintext>b`:                       true,
   446  	`<table><math><select><mi><select></table>`:               true,
   447  	`<!doctype html><table><colgroup><plaintext></plaintext>`: true,
   448  	`<!doctype html><svg><plaintext>a</plaintext>b`:           true,
   449  }
   450  
   451  func TestNodeConsistency(t *testing.T) {
   452  	// inconsistentNode is a Node whose DataAtom and Data do not agree.
   453  	inconsistentNode := &Node{
   454  		Type:     ElementNode,
   455  		DataAtom: atom.Frameset,
   456  		Data:     "table",
   457  	}
   458  	if _, err := ParseFragment(strings.NewReader("<p>hello</p>"), inconsistentNode); err == nil {
   459  		t.Errorf("got nil error, want non-nil")
   460  	}
   461  }
   462  
   463  func TestParseFragmentWithNilContext(t *testing.T) {
   464  	// This shouldn't panic.
   465  	ParseFragment(strings.NewReader("<p>hello</p>"), nil)
   466  }
   467  
   468  func TestParseFragmentForeignContentTemplates(t *testing.T) {
   469  	srcs := []string{
   470  		"<math><html><template><mn><template></template></template>",
   471  		"<math><math><head><mi><template>",
   472  	}
   473  	for _, src := range srcs {
   474  		// The next line shouldn't infinite-loop.
   475  		ParseFragment(strings.NewReader(src), nil)
   476  	}
   477  }
   478  
   479  func BenchmarkParser(b *testing.B) {
   480  	buf, err := ioutil.ReadFile("testdata/go1.html")
   481  	if err != nil {
   482  		b.Fatalf("could not read testdata/go1.html: %v", err)
   483  	}
   484  	b.SetBytes(int64(len(buf)))
   485  	runtime.GC()
   486  	b.ReportAllocs()
   487  	b.ResetTimer()
   488  	for i := 0; i < b.N; i++ {
   489  		Parse(bytes.NewBuffer(buf))
   490  	}
   491  }