github.com/jhump/protocompile@v0.0.0-20221021153901-4f6f732835e8/parser/lexer_test.go (about) 1 package parser 2 3 import ( 4 "io" 5 "math" 6 "strings" 7 "testing" 8 9 "github.com/stretchr/testify/assert" 10 "github.com/stretchr/testify/require" 11 12 "github.com/jhump/protocompile/ast" 13 "github.com/jhump/protocompile/reporter" 14 ) 15 16 func TestLexer(t *testing.T) { 17 handler := reporter.NewHandler(nil) 18 l := newTestLexer(t, strings.NewReader(` 19 // comment 20 21 /* 22 * block comment 23 */ /* inline comment */ 24 25 int32 "\032\x16\n\rfoobar\"zap" 'another\tstring\'s\t' 26 foo 27 28 // another comment 29 // more and more... 30 31 service rpc message 32 .type 33 .f.q.n 34 name 35 f.q.n 36 37 .01 38 .01e12 39 .01e+5 40 .033e-1 41 42 12345 43 -12345 44 123.1234 45 0.123 46 012345 47 0x2134abcdef30 48 -0543 49 -0xff76 50 101.0102 51 202.0203e1 52 304.0304e-10 53 3.1234e+12 54 55 { } + - , ; 56 57 [option=foo] 58 syntax = "proto2"; 59 60 // some strange cases 61 1.543 g12 /* trailing line comment */ 62 000.000 63 0.1234 .5678 . 64 12e12 1.2345e123412341234 65 66 Random_identifier_with_numbers_0123456789_and_letters... 67 // this is a trailing comment 68 // that spans multiple lines 69 // over two in fact! 70 /* 71 * this is a detached comment 72 * with lots of extra words and stuff... 73 */ 74 75 // this is an attached leading comment 76 foo 77 78 1.23e+20+20 79 // a trailing comment for last element 80 81 // comment attached to no tokens (upcoming token is EOF!) 82 /* another comment followed by some final whitespace*/ 83 84 85 `), handler) 86 87 var prev ast.Node 88 var sym protoSymType 89 expected := []struct { 90 t int 91 line, col int 92 span int 93 v interface{} 94 comments []string 95 trailCount int 96 }{ 97 {t: _INT32, line: 8, col: 9, span: 5, v: "int32", comments: []string{"// comment\n", "/*\n\t * block comment\n\t */", "/* inline comment */"}}, 98 {t: _STRING_LIT, line: 8, col: 16, span: 25, v: "\032\x16\n\rfoobar\"zap"}, 99 {t: _STRING_LIT, line: 8, col: 57, span: 22, v: "another\tstring's\t"}, 100 {t: _NAME, line: 9, col: 1, span: 3, v: "foo"}, 101 {t: _SERVICE, line: 14, col: 9, span: 7, v: "service", comments: []string{"// another comment\n", "// more and more...\n"}}, 102 {t: _RPC, line: 14, col: 17, span: 3, v: "rpc"}, 103 {t: _MESSAGE, line: 14, col: 21, span: 7, v: "message"}, 104 {t: '.', line: 15, col: 9, span: 1}, 105 {t: _NAME, line: 15, col: 10, span: 4, v: "type"}, 106 {t: '.', line: 16, col: 9, span: 1}, 107 {t: _NAME, line: 16, col: 10, span: 1, v: "f"}, 108 {t: '.', line: 16, col: 11, span: 1}, 109 {t: _NAME, line: 16, col: 12, span: 1, v: "q"}, 110 {t: '.', line: 16, col: 13, span: 1}, 111 {t: _NAME, line: 16, col: 14, span: 1, v: "n"}, 112 {t: _NAME, line: 17, col: 9, span: 4, v: "name"}, 113 {t: _NAME, line: 18, col: 9, span: 1, v: "f"}, 114 {t: '.', line: 18, col: 10, span: 1}, 115 {t: _NAME, line: 18, col: 11, span: 1, v: "q"}, 116 {t: '.', line: 18, col: 12, span: 1}, 117 {t: _NAME, line: 18, col: 13, span: 1, v: "n"}, 118 {t: _FLOAT_LIT, line: 20, col: 9, span: 3, v: 0.01}, 119 {t: _FLOAT_LIT, line: 21, col: 9, span: 6, v: 0.01e12}, 120 {t: _FLOAT_LIT, line: 22, col: 9, span: 6, v: 0.01e5}, 121 {t: _FLOAT_LIT, line: 23, col: 9, span: 7, v: 0.033e-1}, 122 {t: _INT_LIT, line: 25, col: 9, span: 5, v: uint64(12345)}, 123 {t: '-', line: 26, col: 9, span: 1, v: nil}, 124 {t: _INT_LIT, line: 26, col: 10, span: 5, v: uint64(12345)}, 125 {t: _FLOAT_LIT, line: 27, col: 9, span: 8, v: 123.1234}, 126 {t: _FLOAT_LIT, line: 28, col: 9, span: 5, v: 0.123}, 127 {t: _INT_LIT, line: 29, col: 9, span: 6, v: uint64(012345)}, 128 {t: _INT_LIT, line: 30, col: 9, span: 14, v: uint64(0x2134abcdef30)}, 129 {t: '-', line: 31, col: 9, span: 1, v: nil}, 130 {t: _INT_LIT, line: 31, col: 10, span: 4, v: uint64(0543)}, 131 {t: '-', line: 32, col: 9, span: 1, v: nil}, 132 {t: _INT_LIT, line: 32, col: 10, span: 6, v: uint64(0xff76)}, 133 {t: _FLOAT_LIT, line: 33, col: 9, span: 8, v: 101.0102}, 134 {t: _FLOAT_LIT, line: 34, col: 9, span: 10, v: 202.0203e1}, 135 {t: _FLOAT_LIT, line: 35, col: 9, span: 12, v: 304.0304e-10}, 136 {t: _FLOAT_LIT, line: 36, col: 9, span: 10, v: 3.1234e+12}, 137 {t: '{', line: 38, col: 9, span: 1, v: nil}, 138 {t: '}', line: 38, col: 11, span: 1, v: nil}, 139 {t: '+', line: 38, col: 13, span: 1, v: nil}, 140 {t: '-', line: 38, col: 15, span: 1, v: nil}, 141 {t: ',', line: 38, col: 17, span: 1, v: nil}, 142 {t: ';', line: 38, col: 19, span: 1, v: nil}, 143 {t: '[', line: 40, col: 9, span: 1, v: nil}, 144 {t: _OPTION, line: 40, col: 10, span: 6, v: "option"}, 145 {t: '=', line: 40, col: 16, span: 1, v: nil}, 146 {t: _NAME, line: 40, col: 17, span: 3, v: "foo"}, 147 {t: ']', line: 40, col: 20, span: 1, v: nil}, 148 {t: _SYNTAX, line: 41, col: 9, span: 6, v: "syntax"}, 149 {t: '=', line: 41, col: 16, span: 1, v: nil}, 150 {t: _STRING_LIT, line: 41, col: 18, span: 8, v: "proto2"}, 151 {t: ';', line: 41, col: 26, span: 1, v: nil}, 152 {t: _FLOAT_LIT, line: 44, col: 9, span: 5, v: 1.543, comments: []string{"// some strange cases\n"}}, 153 {t: _NAME, line: 44, col: 15, span: 3, v: "g12"}, 154 {t: _FLOAT_LIT, line: 45, col: 9, span: 7, v: 0.0, comments: []string{"/* trailing line comment */"}, trailCount: 1}, 155 {t: _FLOAT_LIT, line: 46, col: 9, span: 6, v: 0.1234}, 156 {t: _FLOAT_LIT, line: 46, col: 16, span: 5, v: 0.5678}, 157 {t: '.', line: 46, col: 22, span: 1, v: nil}, 158 {t: _FLOAT_LIT, line: 47, col: 9, span: 5, v: 12e12}, 159 {t: _FLOAT_LIT, line: 47, col: 15, span: 19, v: math.Inf(1)}, 160 {t: _NAME, line: 49, col: 9, span: 53, v: "Random_identifier_with_numbers_0123456789_and_letters"}, 161 {t: '.', line: 49, col: 62, span: 1, v: nil}, 162 {t: '.', line: 49, col: 63, span: 1, v: nil}, 163 {t: '.', line: 49, col: 64, span: 1, v: nil}, 164 {t: _NAME, line: 59, col: 9, span: 3, v: "foo", comments: []string{"// this is a trailing comment\n", "// that spans multiple lines\n", "// over two in fact!\n", "/*\n\t * this is a detached comment\n\t * with lots of extra words and stuff...\n\t */", "// this is an attached leading comment\n"}, trailCount: 3}, 165 {t: _FLOAT_LIT, line: 61, col: 9, span: 8, v: 1.23e+20}, 166 {t: '+', line: 61, col: 17, span: 1, v: nil}, 167 {t: _INT_LIT, line: 61, col: 18, span: 2, v: uint64(20)}, 168 } 169 170 for i, exp := range expected { 171 tok := l.Lex(&sym) 172 if tok == 0 { 173 t.Fatalf("lexer reported EOF but should have returned %v", exp) 174 } 175 var n ast.Node 176 var val interface{} 177 switch tok { 178 case _SYNTAX, _OPTION, _INT32, _SERVICE, _RPC, _MESSAGE, _NAME: 179 n = sym.id 180 val = sym.id.Val 181 case _STRING_LIT: 182 n = sym.s 183 val = sym.s.Val 184 case _INT_LIT: 185 n = sym.i 186 val = sym.i.Val 187 case _FLOAT_LIT: 188 n = sym.f 189 val = sym.f.Val 190 case _ERROR: 191 val = sym.err 192 default: 193 n = sym.b 194 val = nil 195 } 196 if !assert.Equal(t, exp.t, tok, "case %d: wrong token type (expecting %+v, got %+v)", i, exp.v, val) { 197 break 198 } 199 if !assert.Equal(t, exp.v, val, "case %d: wrong token value", i) { 200 break 201 } 202 nodeInfo := l.info.NodeInfo(n) 203 var prevNodeInfo ast.NodeInfo 204 if prev != nil { 205 prevNodeInfo = l.info.NodeInfo(prev) 206 } 207 assert.Equal(t, exp.line, nodeInfo.Start().Line, "case %d: wrong line number", i) 208 assert.Equal(t, exp.col, nodeInfo.Start().Col, "case %d: wrong column number (on line %d)", i, exp.line) 209 assert.Equal(t, exp.line, nodeInfo.End().Line, "case %d: wrong end line number", i) 210 assert.Equal(t, exp.col+exp.span, nodeInfo.End().Col, "case %d: wrong end column number", i) 211 actualTrailCount := 0 212 if prev != nil { 213 actualTrailCount = prevNodeInfo.TrailingComments().Len() 214 } 215 assert.Equal(t, exp.trailCount, actualTrailCount, "case %d: wrong number of trailing comments", i) 216 assert.Equal(t, len(exp.comments)-exp.trailCount, nodeInfo.LeadingComments().Len(), "case %d: wrong number of comments", i) 217 for ci := range exp.comments { 218 var c ast.Comment 219 if ci < exp.trailCount { 220 c = prevNodeInfo.TrailingComments().Index(ci) 221 } else { 222 c = nodeInfo.LeadingComments().Index(ci - exp.trailCount) 223 } 224 assert.Equal(t, exp.comments[ci], c.RawText(), "case %d, comment #%d: unexpected text", i, ci+1) 225 } 226 prev = n 227 } 228 if tok := l.Lex(&sym); tok != 0 { 229 t.Fatalf("lexer reported symbol after what should have been EOF: %d", tok) 230 } 231 require.NoError(t, handler.Error()) 232 // Now we check final state of lexer for unattached comments and final whitespace 233 // One of the final comments get associated as trailing comment for final token 234 prevNodeInfo := l.info.NodeInfo(prev) 235 assert.Equal(t, 1, prevNodeInfo.TrailingComments().Len(), "last token: wrong number of trailing comments") 236 eofNodeInfo := l.info.TokenInfo(l.eof) 237 finalComments := eofNodeInfo.LeadingComments() 238 if assert.Equal(t, 2, finalComments.Len(), "wrong number of final remaining comments") { 239 assert.Equal(t, "// comment attached to no tokens (upcoming token is EOF!)\n", finalComments.Index(0).RawText(), "incorrect final comment text") 240 assert.Equal(t, "/* another comment followed by some final whitespace*/", finalComments.Index(1).RawText(), "incorrect final comment text") 241 } 242 assert.Equal(t, "\n\n\t\n\t", eofNodeInfo.LeadingWhitespace(), "incorrect final whitespace") 243 } 244 245 func TestLexerErrors(t *testing.T) { 246 testCases := []struct { 247 str string 248 errMsg string 249 }{ 250 {str: `0xffffffffffffffffffff`, errMsg: "value out of range"}, 251 {str: `"foobar`, errMsg: "unexpected EOF"}, 252 {str: `"foobar\J"`, errMsg: "invalid escape sequence"}, 253 {str: `"foobar\xgfoo"`, errMsg: "invalid hex escape"}, 254 {str: `"foobar\u09gafoo"`, errMsg: "invalid unicode escape"}, 255 {str: `"foobar\U0010005zfoo"`, errMsg: "invalid unicode escape"}, 256 {str: `"foobar\U00110000foo"`, errMsg: "unicode escape is out of range"}, 257 {str: "'foobar\nbaz'", errMsg: "encountered end-of-line"}, 258 {str: "'foobar\000baz'", errMsg: "null character ('\\0') not allowed"}, 259 {str: `1.543g12`, errMsg: "invalid syntax"}, 260 {str: `0.1234.5678.`, errMsg: "invalid syntax"}, 261 {str: `0x987.345aaf`, errMsg: "invalid syntax"}, 262 {str: `0.987.345`, errMsg: "invalid syntax"}, 263 {str: `0.987e34e-20`, errMsg: "invalid syntax"}, 264 {str: `0.987e-345e20`, errMsg: "invalid syntax"}, 265 {str: `.987to123`, errMsg: "invalid syntax"}, 266 {str: `0b0111`, errMsg: "invalid syntax"}, 267 {str: `0o765432`, errMsg: "invalid syntax"}, 268 {str: `1_000_000`, errMsg: "invalid syntax"}, 269 {str: `1_000.000_001e6`, errMsg: "invalid syntax"}, 270 {str: `0X1F_FFP-16`, errMsg: "invalid syntax"}, 271 {str: "09", errMsg: "invalid syntax in octal integer value: 09"}, 272 {str: "0f", errMsg: "invalid syntax in octal integer value: 0f"}, 273 {str: `/* foobar`, errMsg: "unexpected EOF"}, 274 {str: "\x00", errMsg: "invalid control character"}, 275 {str: "\x03", errMsg: "invalid control character"}, 276 {str: "\x1B", errMsg: "invalid control character"}, 277 {str: "\x7F", errMsg: "invalid control character"}, 278 {str: "#", errMsg: "invalid character"}, 279 {str: "?", errMsg: "invalid character"}, 280 {str: "^", errMsg: "invalid character"}, 281 {str: "\uAAAA", errMsg: "invalid character"}, 282 {str: "\U0010FFFF", errMsg: "invalid character"}, 283 {str: "// foo \x00", errMsg: "invalid control character"}, 284 {str: "/* foo \x00", errMsg: "invalid control character"}, 285 } 286 for i, tc := range testCases { 287 handler := reporter.NewHandler(nil) 288 l := newTestLexer(t, strings.NewReader(tc.str), handler) 289 var sym protoSymType 290 tok := l.Lex(&sym) 291 if assert.Equal(t, _ERROR, tok) { 292 assert.True(t, sym.err != nil) 293 assert.True(t, strings.Contains(sym.err.Error(), tc.errMsg), "case %d: expected message to contain %q but does not: %q", i, tc.errMsg, sym.err.Error()) 294 t.Logf("case %d: %v", i, sym.err) 295 } 296 } 297 } 298 299 func newTestLexer(t *testing.T, in io.Reader, h *reporter.Handler) *protoLex { 300 lexer, err := newLexer(in, "test.proto", h) 301 require.NoError(t, err) 302 return lexer 303 }