golang.org/x/net@v0.25.1-0.20240516223405-c87a5b62e243/html/comment_test.go (about) 1 // Copyright 2023 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package html 6 7 import ( 8 "bytes" 9 "strings" 10 "testing" 11 ) 12 13 // TestComments exhaustively tests every 'interesting' N-byte string is 14 // correctly parsed as a comment. N ranges from 4+1 to 4+maxSuffixLen 15 // inclusive. 4 is the length of the "<!--" prefix that starts an HTML comment. 16 // 17 // 'Interesting' means that the N-4 byte suffix consists entirely of bytes 18 // sampled from the interestingCommentBytes const string, below. These cover 19 // all of the possible state transitions from comment-related parser states, as 20 // listed in the HTML spec (https://html.spec.whatwg.org/#comment-start-state 21 // and subsequent sections). 22 // 23 // The spec is written as an explicit state machine that, as a side effect, 24 // accumulates "the comment token's data" to a separate buffer. 25 // Tokenizer.readComment in this package does not have an explicit state 26 // machine and usually returns the comment text as a sub-slice of the input, 27 // between the opening '<' and closing '>' or EOF. This test confirms that the 28 // two algorithms match. 29 func TestComments(t *testing.T) { 30 const prefix = "<!--" 31 const maxSuffixLen = 6 32 buffer := make([]byte, 0, len(prefix)+maxSuffixLen) 33 testAllComments(t, append(buffer, prefix...)) 34 } 35 36 // NUL isn't in this list, even though the HTML spec sections 13.2.5.43 - 37 // 13.2.5.52 mentions it. It's not interesting in terms of state transitions. 38 // It's equivalent to any other non-interesting byte (other than being replaced 39 // by U+FFFD REPLACEMENT CHARACTER). 40 // 41 // EOF isn't in this list. The HTML spec treats EOF as "an input character" but 42 // testOneComment below breaks the loop instead. 43 // 44 // 'x' represents all other "non-interesting" comment bytes. 45 var interestingCommentBytes = [...]byte{ 46 '!', '-', '<', '>', 'x', 47 } 48 49 // testAllComments recursively fills in buffer[len(buffer):cap(buffer)] with 50 // interesting bytes and then tests that this package's tokenization matches 51 // the HTML spec. 52 // 53 // Precondition: len(buffer) < cap(buffer) 54 // Precondition: string(buffer[:4]) == "<!--" 55 func testAllComments(t *testing.T, buffer []byte) { 56 for _, interesting := range interestingCommentBytes { 57 b := append(buffer, interesting) 58 testOneComment(t, b) 59 if len(b) < cap(b) { 60 testAllComments(t, b) 61 } 62 } 63 } 64 65 func testOneComment(t *testing.T, b []byte) { 66 z := NewTokenizer(bytes.NewReader(b)) 67 if next := z.Next(); next != CommentToken { 68 t.Fatalf("Next(%q): got %v, want %v", b, next, CommentToken) 69 } 70 gotRemainder := string(b[len(z.Raw()):]) 71 gotComment := string(z.Text()) 72 73 i := len("<!--") 74 wantBuffer := []byte(nil) 75 loop: 76 for state := 43; ; { 77 // Consume the next input character, handling EOF. 78 if i >= len(b) { 79 break 80 } 81 nextInputCharacter := b[i] 82 i++ 83 84 switch state { 85 case 43: // 13.2.5.43 Comment start state. 86 switch nextInputCharacter { 87 case '-': 88 state = 44 89 case '>': 90 break loop 91 default: 92 i-- // Reconsume. 93 state = 45 94 } 95 96 case 44: // 13.2.5.44 Comment start dash state. 97 switch nextInputCharacter { 98 case '-': 99 state = 51 100 case '>': 101 break loop 102 default: 103 wantBuffer = append(wantBuffer, '-') 104 i-- // Reconsume. 105 state = 45 106 } 107 108 case 45: // 13.2.5.45 Comment state. 109 switch nextInputCharacter { 110 case '-': 111 state = 50 112 case '<': 113 wantBuffer = append(wantBuffer, '<') 114 state = 46 115 default: 116 wantBuffer = append(wantBuffer, nextInputCharacter) 117 } 118 119 case 46: // 13.2.5.46 Comment less-than sign state. 120 switch nextInputCharacter { 121 case '!': 122 wantBuffer = append(wantBuffer, '!') 123 state = 47 124 case '<': 125 wantBuffer = append(wantBuffer, '<') 126 state = 46 127 default: 128 i-- // Reconsume. 129 state = 45 130 } 131 132 case 47: // 13.2.5.47 Comment less-than sign bang state. 133 switch nextInputCharacter { 134 case '-': 135 state = 48 136 default: 137 i-- // Reconsume. 138 state = 45 139 } 140 141 case 48: // 13.2.5.48 Comment less-than sign bang dash state. 142 switch nextInputCharacter { 143 case '-': 144 state = 49 145 default: 146 i-- // Reconsume. 147 state = 50 148 } 149 150 case 49: // 13.2.5.49 Comment less-than sign bang dash dash state. 151 switch nextInputCharacter { 152 case '>': 153 break loop 154 default: 155 i-- // Reconsume. 156 state = 51 157 } 158 159 case 50: // 13.2.5.50 Comment end dash state. 160 switch nextInputCharacter { 161 case '-': 162 state = 51 163 default: 164 wantBuffer = append(wantBuffer, '-') 165 i-- // Reconsume. 166 state = 45 167 } 168 169 case 51: // 13.2.5.51 Comment end state. 170 switch nextInputCharacter { 171 case '!': 172 state = 52 173 case '-': 174 wantBuffer = append(wantBuffer, '-') 175 case '>': 176 break loop 177 default: 178 wantBuffer = append(wantBuffer, "--"...) 179 i-- // Reconsume. 180 state = 45 181 } 182 183 case 52: // 13.2.5.52 Comment end bang state. 184 switch nextInputCharacter { 185 case '-': 186 wantBuffer = append(wantBuffer, "--!"...) 187 state = 50 188 case '>': 189 break loop 190 default: 191 wantBuffer = append(wantBuffer, "--!"...) 192 i-- // Reconsume. 193 state = 45 194 } 195 196 default: 197 t.Fatalf("input=%q: unexpected state %d", b, state) 198 } 199 } 200 201 wantRemainder := "" 202 if i < len(b) { 203 wantRemainder = string(b[i:]) 204 } 205 wantComment := string(wantBuffer) 206 if (gotComment != wantComment) || (gotRemainder != wantRemainder) { 207 t.Errorf("input=%q\ngot: %q + %q\nwant: %q + %q", 208 b, gotComment, gotRemainder, wantComment, wantRemainder) 209 return 210 } 211 212 // suffix is the "N-4 byte suffix" per the TestComments comment. 213 suffix := string(b[4:]) 214 215 // Test that a round trip, rendering (escaped) and re-parsing, of a comment 216 // token (with that suffix as the Token.Data) preserves that string. 217 tok := Token{ 218 Type: CommentToken, 219 Data: suffix, 220 } 221 z2 := NewTokenizer(strings.NewReader(tok.String())) 222 if next := z2.Next(); next != CommentToken { 223 t.Fatalf("round-trip Next(%q): got %v, want %v", suffix, next, CommentToken) 224 } 225 gotComment2 := string(z2.Text()) 226 if gotComment2 != suffix { 227 t.Errorf("round-trip\ngot: %q\nwant: %q", gotComment2, suffix) 228 return 229 } 230 } 231 232 // This table below summarizes the HTML-comment-related state machine from 233 // 13.2.5.43 "Comment start state" and subsequent sections. 234 // https://html.spec.whatwg.org/#comment-start-state 235 // 236 // Get to state 13.2.5.43 after seeing "<!--". Specifically, starting from the 237 // initial 13.2.5.1 "Data state": 238 // - "<" moves to 13.2.5.6 "Tag open state", 239 // - "!" moves to 13.2.5.42 "Markup declaration open state", 240 // - "--" moves to 13.2.5.43 "Comment start state". 241 // Each of these transitions are the only way to get to the 6/42/43 states. 242 // 243 // State ! - < > NUL EOF default HTML spec section 244 // 43 ... s44 ... s01.T.E0 ... ... r45 13.2.5.43 Comment start state 245 // 44 ... s51 ... s01.T.E0 ... T.Z.E1 r45.A- 13.2.5.44 Comment start dash state 246 // 45 ... s50 s46.A< ... t45.A?.E2 T.Z.E1 t45.Ax 13.2.5.45 Comment state 247 // 46 s47.A! ... t46.A< ... ... ... r45 13.2.5.46 Comment less-than sign state 248 // 47 ... s48 ... ... ... ... r45 13.2.5.47 Comment less-than sign bang state 249 // 48 ... s49 ... ... ... ... r50 13.2.5.48 Comment less-than sign bang dash state 250 // 49 ... ... ... s01.T ... T.Z.E1 r51.E3 13.2.5.49 Comment less-than sign bang dash dash state 251 // 50 ... s51 ... ... ... T.Z.E1 r45.A- 13.2.5.50 Comment end dash state 252 // 51 s52 t51.A- ... s01.T ... T.Z.E1 r45.A-- 13.2.5.51 Comment end state 253 // 52 ... s50.A--! ... s01.T.E4 ... T.Z.E1 r45.A--! 13.2.5.52 Comment end bang state 254 // 255 // State 43 is the "Comment start state" meaning that we've only seen "<!--" 256 // and nothing else. Similarly, state 44 means that we've only seen "<!---", 257 // with three dashes, and nothing else. For the other states, we deduce 258 // (working backwards) that the immediate prior input must be: 259 // - 45 something that's not '-' 260 // - 46 "<" 261 // - 47 "<!" 262 // - 48 "<!-" 263 // - 49 "<!--" not including the opening "<!--" 264 // - 50 "-" not including the opening "<!--" and also not "--" 265 // - 51 "--" not including the opening "<!--" 266 // - 52 "--!" 267 // 268 // The table cell actions: 269 // - ... do the default action 270 // - A! append "!" to the comment token's data. 271 // - A- append "-" to the comment token's data. 272 // - A-- append "--" to the comment token's data. 273 // - A--! append "--!" to the comment token's data. 274 // - A< append "<" to the comment token's data. 275 // - A? append "\uFFFD" to the comment token's data. 276 // - Ax append the current input character to the comment token's data. 277 // - E0 parse error (abrupt-closing-of-empty-comment). 278 // - E1 parse error (eof-in-comment). 279 // - E2 parse error (unexpected-null-character). 280 // - E3 parse error (nested-comment). 281 // - E4 parse error (incorrectly-closed-comment). 282 // - T emit the current comment token. 283 // - Z emit an end-of-file token. 284 // - rNN reconsume in the 13.2.5.NN state (after any A* or E* operations). 285 // - s01 switch to the 13.2.5.1 Data state (after any A* or E* operations). 286 // - sNN switch to the 13.2.5.NN state (after any A* or E* operations). 287 // - tNN stay in the 13.2.5.NN state (after any A* or E* operations). 288 // 289 // The E* actions are called errors in the HTML spec but they are not fatal 290 // (https://html.spec.whatwg.org/#parse-errors says "may [but not must] abort 291 // the parser"). They are warnings that, in practice, browsers simply ignore.