github.com/jhump/protocompile@v0.0.0-20221021153901-4f6f732835e8/parser/lexer_test.go (about)

     1  package parser
     2  
     3  import (
     4  	"io"
     5  	"math"
     6  	"strings"
     7  	"testing"
     8  
     9  	"github.com/stretchr/testify/assert"
    10  	"github.com/stretchr/testify/require"
    11  
    12  	"github.com/jhump/protocompile/ast"
    13  	"github.com/jhump/protocompile/reporter"
    14  )
    15  
    16  func TestLexer(t *testing.T) {
    17  	handler := reporter.NewHandler(nil)
    18  	l := newTestLexer(t, strings.NewReader(`
    19  	// comment
    20  
    21  	/*
    22  	 * block comment
    23  	 */ /* inline comment */
    24  
    25  	int32  "\032\x16\n\rfoobar\"zap"		'another\tstring\'s\t'
    26  foo
    27  
    28  	// another comment
    29  	// more and more...
    30  
    31  	service rpc message
    32  	.type
    33  	.f.q.n
    34  	name
    35  	f.q.n
    36  
    37  	.01
    38  	.01e12
    39  	.01e+5
    40  	.033e-1
    41  
    42  	12345
    43  	-12345
    44  	123.1234
    45  	0.123
    46  	012345
    47  	0x2134abcdef30
    48  	-0543
    49  	-0xff76
    50  	101.0102
    51  	202.0203e1
    52  	304.0304e-10
    53  	3.1234e+12
    54  
    55  	{ } + - , ;
    56  
    57  	[option=foo]
    58  	syntax = "proto2";
    59  
    60  	// some strange cases
    61  	1.543 g12 /* trailing line comment */
    62  	000.000
    63  	0.1234 .5678 .
    64  	12e12 1.2345e123412341234
    65  
    66  	Random_identifier_with_numbers_0123456789_and_letters...
    67  	// this is a trailing comment
    68  	// that spans multiple lines
    69  	// over two in fact!
    70  	/*
    71  	 * this is a detached comment
    72  	 * with lots of extra words and stuff...
    73  	 */
    74  
    75  	// this is an attached leading comment
    76  	foo
    77  
    78  	1.23e+20+20
    79  	// a trailing comment for last element
    80  
    81  	// comment attached to no tokens (upcoming token is EOF!)
    82  	/* another comment followed by some final whitespace*/
    83  
    84  	
    85  	`), handler)
    86  
    87  	var prev ast.Node
    88  	var sym protoSymType
    89  	expected := []struct {
    90  		t          int
    91  		line, col  int
    92  		span       int
    93  		v          interface{}
    94  		comments   []string
    95  		trailCount int
    96  	}{
    97  		{t: _INT32, line: 8, col: 9, span: 5, v: "int32", comments: []string{"// comment\n", "/*\n\t * block comment\n\t */", "/* inline comment */"}},
    98  		{t: _STRING_LIT, line: 8, col: 16, span: 25, v: "\032\x16\n\rfoobar\"zap"},
    99  		{t: _STRING_LIT, line: 8, col: 57, span: 22, v: "another\tstring's\t"},
   100  		{t: _NAME, line: 9, col: 1, span: 3, v: "foo"},
   101  		{t: _SERVICE, line: 14, col: 9, span: 7, v: "service", comments: []string{"// another comment\n", "// more and more...\n"}},
   102  		{t: _RPC, line: 14, col: 17, span: 3, v: "rpc"},
   103  		{t: _MESSAGE, line: 14, col: 21, span: 7, v: "message"},
   104  		{t: '.', line: 15, col: 9, span: 1},
   105  		{t: _NAME, line: 15, col: 10, span: 4, v: "type"},
   106  		{t: '.', line: 16, col: 9, span: 1},
   107  		{t: _NAME, line: 16, col: 10, span: 1, v: "f"},
   108  		{t: '.', line: 16, col: 11, span: 1},
   109  		{t: _NAME, line: 16, col: 12, span: 1, v: "q"},
   110  		{t: '.', line: 16, col: 13, span: 1},
   111  		{t: _NAME, line: 16, col: 14, span: 1, v: "n"},
   112  		{t: _NAME, line: 17, col: 9, span: 4, v: "name"},
   113  		{t: _NAME, line: 18, col: 9, span: 1, v: "f"},
   114  		{t: '.', line: 18, col: 10, span: 1},
   115  		{t: _NAME, line: 18, col: 11, span: 1, v: "q"},
   116  		{t: '.', line: 18, col: 12, span: 1},
   117  		{t: _NAME, line: 18, col: 13, span: 1, v: "n"},
   118  		{t: _FLOAT_LIT, line: 20, col: 9, span: 3, v: 0.01},
   119  		{t: _FLOAT_LIT, line: 21, col: 9, span: 6, v: 0.01e12},
   120  		{t: _FLOAT_LIT, line: 22, col: 9, span: 6, v: 0.01e5},
   121  		{t: _FLOAT_LIT, line: 23, col: 9, span: 7, v: 0.033e-1},
   122  		{t: _INT_LIT, line: 25, col: 9, span: 5, v: uint64(12345)},
   123  		{t: '-', line: 26, col: 9, span: 1, v: nil},
   124  		{t: _INT_LIT, line: 26, col: 10, span: 5, v: uint64(12345)},
   125  		{t: _FLOAT_LIT, line: 27, col: 9, span: 8, v: 123.1234},
   126  		{t: _FLOAT_LIT, line: 28, col: 9, span: 5, v: 0.123},
   127  		{t: _INT_LIT, line: 29, col: 9, span: 6, v: uint64(012345)},
   128  		{t: _INT_LIT, line: 30, col: 9, span: 14, v: uint64(0x2134abcdef30)},
   129  		{t: '-', line: 31, col: 9, span: 1, v: nil},
   130  		{t: _INT_LIT, line: 31, col: 10, span: 4, v: uint64(0543)},
   131  		{t: '-', line: 32, col: 9, span: 1, v: nil},
   132  		{t: _INT_LIT, line: 32, col: 10, span: 6, v: uint64(0xff76)},
   133  		{t: _FLOAT_LIT, line: 33, col: 9, span: 8, v: 101.0102},
   134  		{t: _FLOAT_LIT, line: 34, col: 9, span: 10, v: 202.0203e1},
   135  		{t: _FLOAT_LIT, line: 35, col: 9, span: 12, v: 304.0304e-10},
   136  		{t: _FLOAT_LIT, line: 36, col: 9, span: 10, v: 3.1234e+12},
   137  		{t: '{', line: 38, col: 9, span: 1, v: nil},
   138  		{t: '}', line: 38, col: 11, span: 1, v: nil},
   139  		{t: '+', line: 38, col: 13, span: 1, v: nil},
   140  		{t: '-', line: 38, col: 15, span: 1, v: nil},
   141  		{t: ',', line: 38, col: 17, span: 1, v: nil},
   142  		{t: ';', line: 38, col: 19, span: 1, v: nil},
   143  		{t: '[', line: 40, col: 9, span: 1, v: nil},
   144  		{t: _OPTION, line: 40, col: 10, span: 6, v: "option"},
   145  		{t: '=', line: 40, col: 16, span: 1, v: nil},
   146  		{t: _NAME, line: 40, col: 17, span: 3, v: "foo"},
   147  		{t: ']', line: 40, col: 20, span: 1, v: nil},
   148  		{t: _SYNTAX, line: 41, col: 9, span: 6, v: "syntax"},
   149  		{t: '=', line: 41, col: 16, span: 1, v: nil},
   150  		{t: _STRING_LIT, line: 41, col: 18, span: 8, v: "proto2"},
   151  		{t: ';', line: 41, col: 26, span: 1, v: nil},
   152  		{t: _FLOAT_LIT, line: 44, col: 9, span: 5, v: 1.543, comments: []string{"// some strange cases\n"}},
   153  		{t: _NAME, line: 44, col: 15, span: 3, v: "g12"},
   154  		{t: _FLOAT_LIT, line: 45, col: 9, span: 7, v: 0.0, comments: []string{"/* trailing line comment */"}, trailCount: 1},
   155  		{t: _FLOAT_LIT, line: 46, col: 9, span: 6, v: 0.1234},
   156  		{t: _FLOAT_LIT, line: 46, col: 16, span: 5, v: 0.5678},
   157  		{t: '.', line: 46, col: 22, span: 1, v: nil},
   158  		{t: _FLOAT_LIT, line: 47, col: 9, span: 5, v: 12e12},
   159  		{t: _FLOAT_LIT, line: 47, col: 15, span: 19, v: math.Inf(1)},
   160  		{t: _NAME, line: 49, col: 9, span: 53, v: "Random_identifier_with_numbers_0123456789_and_letters"},
   161  		{t: '.', line: 49, col: 62, span: 1, v: nil},
   162  		{t: '.', line: 49, col: 63, span: 1, v: nil},
   163  		{t: '.', line: 49, col: 64, span: 1, v: nil},
   164  		{t: _NAME, line: 59, col: 9, span: 3, v: "foo", comments: []string{"// this is a trailing comment\n", "// that spans multiple lines\n", "// over two in fact!\n", "/*\n\t * this is a detached comment\n\t * with lots of extra words and stuff...\n\t */", "// this is an attached leading comment\n"}, trailCount: 3},
   165  		{t: _FLOAT_LIT, line: 61, col: 9, span: 8, v: 1.23e+20},
   166  		{t: '+', line: 61, col: 17, span: 1, v: nil},
   167  		{t: _INT_LIT, line: 61, col: 18, span: 2, v: uint64(20)},
   168  	}
   169  
   170  	for i, exp := range expected {
   171  		tok := l.Lex(&sym)
   172  		if tok == 0 {
   173  			t.Fatalf("lexer reported EOF but should have returned %v", exp)
   174  		}
   175  		var n ast.Node
   176  		var val interface{}
   177  		switch tok {
   178  		case _SYNTAX, _OPTION, _INT32, _SERVICE, _RPC, _MESSAGE, _NAME:
   179  			n = sym.id
   180  			val = sym.id.Val
   181  		case _STRING_LIT:
   182  			n = sym.s
   183  			val = sym.s.Val
   184  		case _INT_LIT:
   185  			n = sym.i
   186  			val = sym.i.Val
   187  		case _FLOAT_LIT:
   188  			n = sym.f
   189  			val = sym.f.Val
   190  		case _ERROR:
   191  			val = sym.err
   192  		default:
   193  			n = sym.b
   194  			val = nil
   195  		}
   196  		if !assert.Equal(t, exp.t, tok, "case %d: wrong token type (expecting %+v, got %+v)", i, exp.v, val) {
   197  			break
   198  		}
   199  		if !assert.Equal(t, exp.v, val, "case %d: wrong token value", i) {
   200  			break
   201  		}
   202  		nodeInfo := l.info.NodeInfo(n)
   203  		var prevNodeInfo ast.NodeInfo
   204  		if prev != nil {
   205  			prevNodeInfo = l.info.NodeInfo(prev)
   206  		}
   207  		assert.Equal(t, exp.line, nodeInfo.Start().Line, "case %d: wrong line number", i)
   208  		assert.Equal(t, exp.col, nodeInfo.Start().Col, "case %d: wrong column number (on line %d)", i, exp.line)
   209  		assert.Equal(t, exp.line, nodeInfo.End().Line, "case %d: wrong end line number", i)
   210  		assert.Equal(t, exp.col+exp.span, nodeInfo.End().Col, "case %d: wrong end column number", i)
   211  		actualTrailCount := 0
   212  		if prev != nil {
   213  			actualTrailCount = prevNodeInfo.TrailingComments().Len()
   214  		}
   215  		assert.Equal(t, exp.trailCount, actualTrailCount, "case %d: wrong number of trailing comments", i)
   216  		assert.Equal(t, len(exp.comments)-exp.trailCount, nodeInfo.LeadingComments().Len(), "case %d: wrong number of comments", i)
   217  		for ci := range exp.comments {
   218  			var c ast.Comment
   219  			if ci < exp.trailCount {
   220  				c = prevNodeInfo.TrailingComments().Index(ci)
   221  			} else {
   222  				c = nodeInfo.LeadingComments().Index(ci - exp.trailCount)
   223  			}
   224  			assert.Equal(t, exp.comments[ci], c.RawText(), "case %d, comment #%d: unexpected text", i, ci+1)
   225  		}
   226  		prev = n
   227  	}
   228  	if tok := l.Lex(&sym); tok != 0 {
   229  		t.Fatalf("lexer reported symbol after what should have been EOF: %d", tok)
   230  	}
   231  	require.NoError(t, handler.Error())
   232  	// Now we check final state of lexer for unattached comments and final whitespace
   233  	// One of the final comments get associated as trailing comment for final token
   234  	prevNodeInfo := l.info.NodeInfo(prev)
   235  	assert.Equal(t, 1, prevNodeInfo.TrailingComments().Len(), "last token: wrong number of trailing comments")
   236  	eofNodeInfo := l.info.TokenInfo(l.eof)
   237  	finalComments := eofNodeInfo.LeadingComments()
   238  	if assert.Equal(t, 2, finalComments.Len(), "wrong number of final remaining comments") {
   239  		assert.Equal(t, "// comment attached to no tokens (upcoming token is EOF!)\n", finalComments.Index(0).RawText(), "incorrect final comment text")
   240  		assert.Equal(t, "/* another comment followed by some final whitespace*/", finalComments.Index(1).RawText(), "incorrect final comment text")
   241  	}
   242  	assert.Equal(t, "\n\n\t\n\t", eofNodeInfo.LeadingWhitespace(), "incorrect final whitespace")
   243  }
   244  
   245  func TestLexerErrors(t *testing.T) {
   246  	testCases := []struct {
   247  		str    string
   248  		errMsg string
   249  	}{
   250  		{str: `0xffffffffffffffffffff`, errMsg: "value out of range"},
   251  		{str: `"foobar`, errMsg: "unexpected EOF"},
   252  		{str: `"foobar\J"`, errMsg: "invalid escape sequence"},
   253  		{str: `"foobar\xgfoo"`, errMsg: "invalid hex escape"},
   254  		{str: `"foobar\u09gafoo"`, errMsg: "invalid unicode escape"},
   255  		{str: `"foobar\U0010005zfoo"`, errMsg: "invalid unicode escape"},
   256  		{str: `"foobar\U00110000foo"`, errMsg: "unicode escape is out of range"},
   257  		{str: "'foobar\nbaz'", errMsg: "encountered end-of-line"},
   258  		{str: "'foobar\000baz'", errMsg: "null character ('\\0') not allowed"},
   259  		{str: `1.543g12`, errMsg: "invalid syntax"},
   260  		{str: `0.1234.5678.`, errMsg: "invalid syntax"},
   261  		{str: `0x987.345aaf`, errMsg: "invalid syntax"},
   262  		{str: `0.987.345`, errMsg: "invalid syntax"},
   263  		{str: `0.987e34e-20`, errMsg: "invalid syntax"},
   264  		{str: `0.987e-345e20`, errMsg: "invalid syntax"},
   265  		{str: `.987to123`, errMsg: "invalid syntax"},
   266  		{str: `0b0111`, errMsg: "invalid syntax"},
   267  		{str: `0o765432`, errMsg: "invalid syntax"},
   268  		{str: `1_000_000`, errMsg: "invalid syntax"},
   269  		{str: `1_000.000_001e6`, errMsg: "invalid syntax"},
   270  		{str: `0X1F_FFP-16`, errMsg: "invalid syntax"},
   271  		{str: "09", errMsg: "invalid syntax in octal integer value: 09"},
   272  		{str: "0f", errMsg: "invalid syntax in octal integer value: 0f"},
   273  		{str: `/* foobar`, errMsg: "unexpected EOF"},
   274  		{str: "\x00", errMsg: "invalid control character"},
   275  		{str: "\x03", errMsg: "invalid control character"},
   276  		{str: "\x1B", errMsg: "invalid control character"},
   277  		{str: "\x7F", errMsg: "invalid control character"},
   278  		{str: "#", errMsg: "invalid character"},
   279  		{str: "?", errMsg: "invalid character"},
   280  		{str: "^", errMsg: "invalid character"},
   281  		{str: "\uAAAA", errMsg: "invalid character"},
   282  		{str: "\U0010FFFF", errMsg: "invalid character"},
   283  		{str: "// foo \x00", errMsg: "invalid control character"},
   284  		{str: "/* foo \x00", errMsg: "invalid control character"},
   285  	}
   286  	for i, tc := range testCases {
   287  		handler := reporter.NewHandler(nil)
   288  		l := newTestLexer(t, strings.NewReader(tc.str), handler)
   289  		var sym protoSymType
   290  		tok := l.Lex(&sym)
   291  		if assert.Equal(t, _ERROR, tok) {
   292  			assert.True(t, sym.err != nil)
   293  			assert.True(t, strings.Contains(sym.err.Error(), tc.errMsg), "case %d: expected message to contain %q but does not: %q", i, tc.errMsg, sym.err.Error())
   294  			t.Logf("case %d: %v", i, sym.err)
   295  		}
   296  	}
   297  }
   298  
   299  func newTestLexer(t *testing.T, in io.Reader, h *reporter.Handler) *protoLex {
   300  	lexer, err := newLexer(in, "test.proto", h)
   301  	require.NoError(t, err)
   302  	return lexer
   303  }