golang.org/x/net@v0.25.1-0.20240516223405-c87a5b62e243/html/comment_test.go (about)

     1  // Copyright 2023 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package html
     6  
     7  import (
     8  	"bytes"
     9  	"strings"
    10  	"testing"
    11  )
    12  
    13  // TestComments exhaustively tests every 'interesting' N-byte string is
    14  // correctly parsed as a comment. N ranges from 4+1 to 4+maxSuffixLen
    15  // inclusive. 4 is the length of the "<!--" prefix that starts an HTML comment.
    16  //
    17  // 'Interesting' means that the N-4 byte suffix consists entirely of bytes
    18  // sampled from the interestingCommentBytes const string, below. These cover
    19  // all of the possible state transitions from comment-related parser states, as
    20  // listed in the HTML spec (https://html.spec.whatwg.org/#comment-start-state
    21  // and subsequent sections).
    22  //
    23  // The spec is written as an explicit state machine that, as a side effect,
    24  // accumulates "the comment token's data" to a separate buffer.
    25  // Tokenizer.readComment in this package does not have an explicit state
    26  // machine and usually returns the comment text as a sub-slice of the input,
    27  // between the opening '<' and closing '>' or EOF. This test confirms that the
    28  // two algorithms match.
    29  func TestComments(t *testing.T) {
    30  	const prefix = "<!--"
    31  	const maxSuffixLen = 6
    32  	buffer := make([]byte, 0, len(prefix)+maxSuffixLen)
    33  	testAllComments(t, append(buffer, prefix...))
    34  }
    35  
    36  // NUL isn't in this list, even though the HTML spec sections 13.2.5.43 -
    37  // 13.2.5.52 mentions it. It's not interesting in terms of state transitions.
    38  // It's equivalent to any other non-interesting byte (other than being replaced
    39  // by U+FFFD REPLACEMENT CHARACTER).
    40  //
    41  // EOF isn't in this list. The HTML spec treats EOF as "an input character" but
    42  // testOneComment below breaks the loop instead.
    43  //
    44  // 'x' represents all other "non-interesting" comment bytes.
    45  var interestingCommentBytes = [...]byte{
    46  	'!', '-', '<', '>', 'x',
    47  }
    48  
    49  // testAllComments recursively fills in buffer[len(buffer):cap(buffer)] with
    50  // interesting bytes and then tests that this package's tokenization matches
    51  // the HTML spec.
    52  //
    53  // Precondition: len(buffer) < cap(buffer)
    54  // Precondition: string(buffer[:4]) == "<!--"
    55  func testAllComments(t *testing.T, buffer []byte) {
    56  	for _, interesting := range interestingCommentBytes {
    57  		b := append(buffer, interesting)
    58  		testOneComment(t, b)
    59  		if len(b) < cap(b) {
    60  			testAllComments(t, b)
    61  		}
    62  	}
    63  }
    64  
    65  func testOneComment(t *testing.T, b []byte) {
    66  	z := NewTokenizer(bytes.NewReader(b))
    67  	if next := z.Next(); next != CommentToken {
    68  		t.Fatalf("Next(%q): got %v, want %v", b, next, CommentToken)
    69  	}
    70  	gotRemainder := string(b[len(z.Raw()):])
    71  	gotComment := string(z.Text())
    72  
    73  	i := len("<!--")
    74  	wantBuffer := []byte(nil)
    75  loop:
    76  	for state := 43; ; {
    77  		// Consume the next input character, handling EOF.
    78  		if i >= len(b) {
    79  			break
    80  		}
    81  		nextInputCharacter := b[i]
    82  		i++
    83  
    84  		switch state {
    85  		case 43: // 13.2.5.43 Comment start state.
    86  			switch nextInputCharacter {
    87  			case '-':
    88  				state = 44
    89  			case '>':
    90  				break loop
    91  			default:
    92  				i-- // Reconsume.
    93  				state = 45
    94  			}
    95  
    96  		case 44: // 13.2.5.44 Comment start dash state.
    97  			switch nextInputCharacter {
    98  			case '-':
    99  				state = 51
   100  			case '>':
   101  				break loop
   102  			default:
   103  				wantBuffer = append(wantBuffer, '-')
   104  				i-- // Reconsume.
   105  				state = 45
   106  			}
   107  
   108  		case 45: // 13.2.5.45 Comment state.
   109  			switch nextInputCharacter {
   110  			case '-':
   111  				state = 50
   112  			case '<':
   113  				wantBuffer = append(wantBuffer, '<')
   114  				state = 46
   115  			default:
   116  				wantBuffer = append(wantBuffer, nextInputCharacter)
   117  			}
   118  
   119  		case 46: // 13.2.5.46 Comment less-than sign state.
   120  			switch nextInputCharacter {
   121  			case '!':
   122  				wantBuffer = append(wantBuffer, '!')
   123  				state = 47
   124  			case '<':
   125  				wantBuffer = append(wantBuffer, '<')
   126  				state = 46
   127  			default:
   128  				i-- // Reconsume.
   129  				state = 45
   130  			}
   131  
   132  		case 47: // 13.2.5.47 Comment less-than sign bang state.
   133  			switch nextInputCharacter {
   134  			case '-':
   135  				state = 48
   136  			default:
   137  				i-- // Reconsume.
   138  				state = 45
   139  			}
   140  
   141  		case 48: // 13.2.5.48 Comment less-than sign bang dash state.
   142  			switch nextInputCharacter {
   143  			case '-':
   144  				state = 49
   145  			default:
   146  				i-- // Reconsume.
   147  				state = 50
   148  			}
   149  
   150  		case 49: // 13.2.5.49 Comment less-than sign bang dash dash state.
   151  			switch nextInputCharacter {
   152  			case '>':
   153  				break loop
   154  			default:
   155  				i-- // Reconsume.
   156  				state = 51
   157  			}
   158  
   159  		case 50: // 13.2.5.50 Comment end dash state.
   160  			switch nextInputCharacter {
   161  			case '-':
   162  				state = 51
   163  			default:
   164  				wantBuffer = append(wantBuffer, '-')
   165  				i-- // Reconsume.
   166  				state = 45
   167  			}
   168  
   169  		case 51: // 13.2.5.51 Comment end state.
   170  			switch nextInputCharacter {
   171  			case '!':
   172  				state = 52
   173  			case '-':
   174  				wantBuffer = append(wantBuffer, '-')
   175  			case '>':
   176  				break loop
   177  			default:
   178  				wantBuffer = append(wantBuffer, "--"...)
   179  				i-- // Reconsume.
   180  				state = 45
   181  			}
   182  
   183  		case 52: // 13.2.5.52 Comment end bang state.
   184  			switch nextInputCharacter {
   185  			case '-':
   186  				wantBuffer = append(wantBuffer, "--!"...)
   187  				state = 50
   188  			case '>':
   189  				break loop
   190  			default:
   191  				wantBuffer = append(wantBuffer, "--!"...)
   192  				i-- // Reconsume.
   193  				state = 45
   194  			}
   195  
   196  		default:
   197  			t.Fatalf("input=%q: unexpected state %d", b, state)
   198  		}
   199  	}
   200  
   201  	wantRemainder := ""
   202  	if i < len(b) {
   203  		wantRemainder = string(b[i:])
   204  	}
   205  	wantComment := string(wantBuffer)
   206  	if (gotComment != wantComment) || (gotRemainder != wantRemainder) {
   207  		t.Errorf("input=%q\ngot:  %q + %q\nwant: %q + %q",
   208  			b, gotComment, gotRemainder, wantComment, wantRemainder)
   209  		return
   210  	}
   211  
   212  	// suffix is the "N-4 byte suffix" per the TestComments comment.
   213  	suffix := string(b[4:])
   214  
   215  	// Test that a round trip, rendering (escaped) and re-parsing, of a comment
   216  	// token (with that suffix as the Token.Data) preserves that string.
   217  	tok := Token{
   218  		Type: CommentToken,
   219  		Data: suffix,
   220  	}
   221  	z2 := NewTokenizer(strings.NewReader(tok.String()))
   222  	if next := z2.Next(); next != CommentToken {
   223  		t.Fatalf("round-trip Next(%q): got %v, want %v", suffix, next, CommentToken)
   224  	}
   225  	gotComment2 := string(z2.Text())
   226  	if gotComment2 != suffix {
   227  		t.Errorf("round-trip\ngot:  %q\nwant: %q", gotComment2, suffix)
   228  		return
   229  	}
   230  }
   231  
   232  // This table below summarizes the HTML-comment-related state machine from
   233  // 13.2.5.43 "Comment start state" and subsequent sections.
   234  // https://html.spec.whatwg.org/#comment-start-state
   235  //
   236  // Get to state 13.2.5.43 after seeing "<!--". Specifically, starting from the
   237  // initial 13.2.5.1 "Data state":
   238  //   - "<"  moves to 13.2.5.6  "Tag open state",
   239  //   - "!"  moves to 13.2.5.42 "Markup declaration open state",
   240  //   - "--" moves to 13.2.5.43 "Comment start state".
   241  // Each of these transitions is the only way to get to the 6/42/43 states.
   242  //
   243  // State   !         -         <         >         NUL       EOF       default   HTML spec section
   244  // 43      ...       s44       ...       s01.T.E0  ...       ...       r45       13.2.5.43 Comment start state
   245  // 44      ...       s51       ...       s01.T.E0  ...       T.Z.E1    r45.A-    13.2.5.44 Comment start dash state
   246  // 45      ...       s50       s46.A<    ...       t45.A?.E2 T.Z.E1    t45.Ax    13.2.5.45 Comment state
   247  // 46      s47.A!    ...       t46.A<    ...       ...       ...       r45       13.2.5.46 Comment less-than sign state
   248  // 47      ...       s48       ...       ...       ...       ...       r45       13.2.5.47 Comment less-than sign bang state
   249  // 48      ...       s49       ...       ...       ...       ...       r50       13.2.5.48 Comment less-than sign bang dash state
   250  // 49      ...       ...       ...       s01.T     ...       T.Z.E1    r51.E3    13.2.5.49 Comment less-than sign bang dash dash state
   251  // 50      ...       s51       ...       ...       ...       T.Z.E1    r45.A-    13.2.5.50 Comment end dash state
   252  // 51      s52       t51.A-    ...       s01.T     ...       T.Z.E1    r45.A--   13.2.5.51 Comment end state
   253  // 52      ...       s50.A--!  ...       s01.T.E4  ...       T.Z.E1    r45.A--!  13.2.5.52 Comment end bang state
   254  //
   255  // State 43 is the "Comment start state" meaning that we've only seen "<!--"
   256  // and nothing else. Similarly, state 44 means that we've only seen "<!---",
   257  // with three dashes, and nothing else. For the other states, we deduce
   258  // (working backwards) that the immediate prior input must be:
   259  //   - 45  something that's not '-'
   260  //   - 46  "<"
   261  //   - 47  "<!"
   262  //   - 48  "<!-"
   263  //   - 49  "<!--"  not including the opening "<!--"
   264  //   - 50  "-"     not including the opening "<!--" and also not "--"
   265  //   - 51  "--"    not including the opening "<!--"
   266  //   - 52  "--!"
   267  //
   268  // The table cell actions:
   269  //   - ...   do the default action
   270  //   - A!    append "!"      to the comment token's data.
   271  //   - A-    append "-"      to the comment token's data.
   272  //   - A--   append "--"     to the comment token's data.
   273  //   - A--!  append "--!"    to the comment token's data.
   274  //   - A<    append "<"      to the comment token's data.
   275  //   - A?    append "\uFFFD" to the comment token's data.
   276  //   - Ax    append the current input character to the comment token's data.
   277  //   - E0    parse error (abrupt-closing-of-empty-comment).
   278  //   - E1    parse error (eof-in-comment).
   279  //   - E2    parse error (unexpected-null-character).
   280  //   - E3    parse error (nested-comment).
   281  //   - E4    parse error (incorrectly-closed-comment).
   282  //   - T     emit the current comment token.
   283  //   - Z     emit an end-of-file token.
   284  //   - rNN   reconsume in the 13.2.5.NN     state (after any A* or E* operations).
   285  //   - s01   switch to the    13.2.5.1 Data state (after any A* or E* operations).
   286  //   - sNN   switch to the    13.2.5.NN     state (after any A* or E* operations).
   287  //   - tNN   stay in the      13.2.5.NN     state (after any A* or E* operations).
   288  //
   289  // The E* actions are called errors in the HTML spec but they are not fatal
   290  // (https://html.spec.whatwg.org/#parse-errors says "may [but not must] abort
   291  // the parser"). They are warnings that, in practice, browsers simply ignore.