github.com/benhoyt/goawk@v1.8.1/lexer/lexer_test.go (about)

     1  // Test GoAWK Lexer
     2  
     3  package lexer_test
     4  
     5  import (
     6  	"fmt"
     7  	"strconv"
     8  	"strings"
     9  	"testing"
    10  
    11  	. "github.com/benhoyt/goawk/lexer"
    12  )
    13  
    14  func TestLexer(t *testing.T) {
    15  	tests := []struct {
    16  		input  string
    17  		output string
    18  	}{
    19  		// Comments, whitespace, line continuations
    20  		{"+# foo \n- #foo", `1:1 + "", 1:8 <newline> "", 2:1 - ""`},
    21  		{"+\\\n-", `1:1 + "", 2:1 - ""`},
    22  		{"+\\\r\n-", `1:1 + "", 2:1 - ""`},
    23  		{"+\\-", `1:1 + "", 1:3 <illegal> "expected \\n after \\ line continuation", 1:3 - ""`},
    24  
    25  		// Names and keywords
    26  		{"x", `1:1 name "x"`},
    27  		{"x y0", `1:1 name "x", 1:3 name "y0"`},
    28  		{"x 0y", `1:1 name "x", 1:3 number "0", 1:4 name "y"`},
    29  		{"sub SUB", `1:1 sub "", 1:5 name "SUB"`},
    30  
    31  		// String tokens
    32  		{`"foo"`, `1:1 string "foo"`},
    33  		{`"a\t\r\n\z\'\"\a\b\f\vb"`, `1:1 string "a\t\r\nz'\"\a\b\f\vb"`},
    34  		{`"x`, `1:3 <illegal> "didn't find end quote in string"`},
    35  		{"\"x\n\"", `1:3 <illegal> "can't have newline in string", 1:3 <newline> "", 2:2 <illegal> "didn't find end quote in string"`},
    36  		{`'foo'`, `1:1 string "foo"`},
    37  		{`'a\t\r\n\z\'\"b'`, `1:1 string "a\t\r\nz'\"b"`},
    38  		{`'x`, `1:3 <illegal> "didn't find end quote in string"`},
    39  		{"'x\n'", `1:3 <illegal> "can't have newline in string", 1:3 <newline> "", 2:2 <illegal> "didn't find end quote in string"`},
    40  		{`"\x0.\x00.\x0A\x10\xff\xFF\x41"`, `1:1 string "\x00.\x00.\n\x10\xff\xffA"`},
    41  		{`"\xg"`, `1:4 <illegal> "1 or 2 hex digits expected", 1:4 name "g", 1:6 <illegal> "didn't find end quote in string"`},
    42  		{`"\0\78\7\77\777\0 \141 "`, `1:1 string "\x00\a8\a?\xff\x00 a "`},
    43  
    44  		// Number tokens
    45  		{"0", `1:1 number "0"`},
    46  		{"9", `1:1 number "9"`},
    47  		{" 0 ", `1:2 number "0"`},
    48  		{"\n  1", `1:1 <newline> "", 2:3 number "1"`},
    49  		{"1234", `1:1 number "1234"`},
    50  		{".5", `1:1 number ".5"`},
    51  		{".5e1", `1:1 number ".5e1"`},
    52  		{"5e+1", `1:1 number "5e+1"`},
    53  		{"5e-1", `1:1 number "5e-1"`},
    54  		{"0.", `1:1 number "0."`},
    55  		{"42e", `1:1 number "42e"`},
    56  		{"4.2e", `1:1 number "4.2e"`},
    57  		{"1.e3", `1:1 number "1.e3"`},
    58  		{"1.e3", `1:1 number "1.e3"`},
    59  		{"1e3foo", `1:1 number "1e3", 1:4 name "foo"`},
    60  		{"1e3+", `1:1 number "1e3", 1:4 + ""`},
    61  		{"1e3.4", `1:1 number "1e3", 1:4 number ".4"`},
    62  		{"1e-", `1:4 <illegal> "expected digits"`},
    63  		{"1e+", `1:4 <illegal> "expected digits"`},
    64  		{"42@", `1:1 number "42", 1:3 <illegal> "unexpected char"`},
    65  		{"0..", `1:1 number "0.", 1:4 <illegal> "expected digits"`},
    66  		{".", `1:2 <illegal> "expected digits"`},
    67  
    68  		// Misc errors
    69  		{"&=", `1:2 <illegal> "unexpected char after '&'", 1:2 = ""`},
    70  	}
    71  	for _, test := range tests {
    72  		t.Run(test.input, func(t *testing.T) {
    73  			l := NewLexer([]byte(test.input))
    74  			strs := []string{}
    75  			for {
    76  				pos, tok, val := l.Scan()
    77  				if tok == EOF {
    78  					break
    79  				}
    80  				if tok == NUMBER {
    81  					// Ensure ParseFloat() works, as that's what our
    82  					// parser uses to convert
    83  					trimmed := strings.TrimRight(val, "eE")
    84  					_, err := strconv.ParseFloat(trimmed, 64)
    85  					if err != nil {
    86  						t.Fatalf("couldn't parse float: %q", val)
    87  					}
    88  				}
    89  				strs = append(strs, fmt.Sprintf("%d:%d %s %q", pos.Line, pos.Column, tok, val))
    90  			}
    91  			output := strings.Join(strs, ", ")
    92  			if output != test.output {
    93  				t.Errorf("expected %q, got %q", test.output, output)
    94  			}
    95  		})
    96  	}
    97  }
    98  
    99  func TestRegex(t *testing.T) {
   100  	tests := []struct {
   101  		input  string
   102  		output string
   103  	}{
   104  		{`/foo/`, `1:1 regex "foo"`},
   105  		{`/=foo/`, `1:1 regex "=foo"`},
   106  		{`/a\/b/`, `1:1 regex "a/b"`},
   107  		{`/a\/\zb/`, `1:1 regex "a/\\zb"`},
   108  		{`/a`, `1:3 <illegal> "didn't find end slash in regex"`},
   109  		{"/a\n", `1:3 <illegal> "can't have newline in regex"`},
   110  		{`foo/`, `1:4 <illegal> "unexpected name preceding regex"`},
   111  	}
   112  	for _, test := range tests {
   113  		t.Run(test.input, func(t *testing.T) {
   114  			l := NewLexer([]byte(test.input))
   115  			l.Scan() // Scan first token (probably DIV)
   116  			pos, tok, val := l.ScanRegex()
   117  			output := fmt.Sprintf("%d:%d %s %q", pos.Line, pos.Column, tok, val)
   118  			if output != test.output {
   119  				t.Errorf("expected %q, got %q", test.output, output)
   120  			}
   121  		})
   122  	}
   123  }
   124  
   125  func TestHadSpace(t *testing.T) {
   126  	tests := []struct {
   127  		input  string
   128  		tokens []Token
   129  		spaces []bool
   130  	}{
   131  		{`foo(x)`, []Token{NAME, LPAREN, NAME, RPAREN}, []bool{false, false, false, false}},
   132  		{`foo (x) `, []Token{NAME, LPAREN, NAME, RPAREN}, []bool{false, true, false, false}},
   133  		{` foo ( x ) `, []Token{NAME, LPAREN, NAME, RPAREN}, []bool{true, true, true, true}},
   134  	}
   135  	for _, test := range tests {
   136  		t.Run(test.input, func(t *testing.T) {
   137  			l := NewLexer([]byte(test.input))
   138  			for i := 0; ; i++ {
   139  				_, tok, _ := l.Scan()
   140  				if tok == EOF {
   141  					break
   142  				}
   143  				if tok != test.tokens[i] {
   144  					t.Errorf("expected %s for token %d, got %s", test.tokens[i], i, tok)
   145  				}
   146  				if l.HadSpace() != test.spaces[i] {
   147  					t.Errorf("expected %v for space %d, got %v", test.spaces[i], i, l.HadSpace())
   148  				}
   149  			}
   150  		})
   151  	}
   152  }
   153  
   154  func TestKeywordToken(t *testing.T) {
   155  	tests := []struct {
   156  		name string
   157  		tok  Token
   158  	}{
   159  		{"print", PRINT},
   160  		{"split", F_SPLIT},
   161  		{"BEGIN", BEGIN},
   162  		{"foo", ILLEGAL},
   163  		{"GoAWK", ILLEGAL},
   164  	}
   165  	for _, test := range tests {
   166  		t.Run(test.name, func(t *testing.T) {
   167  			tok := KeywordToken(test.name)
   168  			if tok != test.tok {
   169  				t.Errorf("expected %v, got %v", test.tok, tok)
   170  			}
   171  		})
   172  	}
   173  }
   174  
   175  func TestAllTokens(t *testing.T) {
   176  	input := "# comment line\n" +
   177  		"+ += && = : , -- /\n/= $ == >= > >> ++ { [ < ( #\n" +
   178  		"<= ~ % %= * *= !~ ! != | || ^ ^= ** **= ? } ] ) ; - -= " +
   179  		"BEGIN break continue delete do else END exit " +
   180  		"for function getline if in next print printf return while " +
   181  		"atan2 close cos exp fflush gsub index int length log match rand " +
   182  		"sin split sprintf sqrt srand sub substr system tolower toupper " +
   183  		"x \"str\\n\" 1234\n" +
   184  		"@ ."
   185  
   186  	strs := make([]string, 0, LAST+1)
   187  	seen := make([]bool, LAST+1)
   188  	l := NewLexer([]byte(input))
   189  	for {
   190  		_, tok, _ := l.Scan()
   191  		strs = append(strs, tok.String())
   192  		seen[int(tok)] = true
   193  		if tok == EOF {
   194  			break
   195  		}
   196  	}
   197  	output := strings.Join(strs, " ")
   198  
   199  	expected := "<newline> " +
   200  		"+ += && = : , -- / <newline> /= $ == >= > >> ++ { [ < ( <newline> " +
   201  		"<= ~ % %= * *= !~ ! != | || ^ ^= ^ ^= ? } ] ) ; - -= " +
   202  		"BEGIN break continue delete do else END exit " +
   203  		"for function getline if in next print printf return while " +
   204  		"atan2 close cos exp fflush gsub index int length log match rand " +
   205  		"sin split sprintf sqrt srand sub substr system tolower toupper " +
   206  		"name string number <newline> " +
   207  		"<illegal> <illegal> EOF"
   208  	if output != expected {
   209  		t.Errorf("expected %q, got %q", expected, output)
   210  	}
   211  
   212  	for i, s := range seen {
   213  		if !s && Token(i) != CONCAT && Token(i) != REGEX {
   214  			t.Errorf("token %s (%d) not seen", Token(i), i)
   215  		}
   216  	}
   217  
   218  	l = NewLexer([]byte(`/foo/`))
   219  	_, tok1, _ := l.Scan()
   220  	_, tok2, val := l.ScanRegex()
   221  	if tok1 != Token(DIV) || tok2 != Token(REGEX) || val != "foo" {
   222  		t.Errorf(`expected / regex "foo", got %s %s %q`, tok1, tok2, val)
   223  	}
   224  
   225  	l = NewLexer([]byte(`/=foo/`))
   226  	_, tok1, _ = l.Scan()
   227  	_, tok2, val = l.ScanRegex()
   228  	if tok1 != Token(DIV_ASSIGN) || tok2 != Token(REGEX) || val != "=foo" {
   229  		t.Errorf(`expected /= regex "=foo", got %s %s %q`, tok1, tok2, val)
   230  	}
   231  }
   232  
   233  func benchmarkLexer(b *testing.B, repeat int, source string) {
   234  	fullSource := []byte(strings.Repeat(source+"\n", repeat))
   235  	b.ResetTimer()
   236  	for i := 0; i < b.N; i++ {
   237  		l := NewLexer(fullSource)
   238  		for {
   239  			_, tok, _ := l.Scan()
   240  			if tok == EOF || tok == ILLEGAL {
   241  				break
   242  			}
   243  		}
   244  	}
   245  }
   246  
   247  func BenchmarkProgram(b *testing.B) {
   248  	benchmarkLexer(b, 5, `{ print $1, ($3+$4)*$5 }`)
   249  }
   250  
   251  func BenchmarkNames(b *testing.B) {
   252  	benchmarkLexer(b, 5, `x y i foobar abcdefghij0123456789 _`)
   253  }
   254  
   255  func BenchmarkKeywords(b *testing.B) {
   256  	benchmarkLexer(b, 5, `BEGIN END print sub if length`)
   257  }
   258  
   259  func BenchmarkSimpleTokens(b *testing.B) {
   260  	benchmarkLexer(b, 5, "\n : , { [ ( } ] ) ~ ? ; $")
   261  }
   262  
   263  func BenchmarkChoiceTokens(b *testing.B) {
   264  	benchmarkLexer(b, 5, `/ /=  % %= + ++ += * ** **= *= = == ^ ^= ! != !~ < <= > >= >> && | ||`)
   265  }
   266  
   267  func BenchmarkNumbers(b *testing.B) {
   268  	benchmarkLexer(b, 5, `0 1 .5 1234 1234567890 1234.56789e-50`)
   269  }
   270  
   271  func BenchmarkStrings(b *testing.B) {
   272  	benchmarkLexer(b, 5, `"x" "y" "xyz" "foo" "foo bar baz" "foo\tbar\rbaz\n"`)
   273  }
   274  
   275  func BenchmarkRegex(b *testing.B) {
   276  	source := `/x/ /./ /foo/ /bar/ /=equals=/ /\/\/\/\//`
   277  	fullSource := []byte(strings.Repeat(source+" ", 5))
   278  	b.ResetTimer()
   279  	for i := 0; i < b.N; i++ {
   280  		l := NewLexer(fullSource)
   281  		for {
   282  			_, tok, _ := l.Scan()
   283  			if tok == EOF {
   284  				break
   285  			}
   286  			if tok != DIV && tok != DIV_ASSIGN {
   287  				b.Fatalf("expected / or /=, got %s", tok)
   288  			}
   289  			_, tok, _ = l.ScanRegex()
   290  			if tok != REGEX {
   291  				b.Fatalf("expected regex, got %s", tok)
   292  			}
   293  		}
   294  	}
   295  }
   296  
   297  func Example() {
   298  	lexer := NewLexer([]byte(`$0 { print $1 }`))
   299  	for {
   300  		pos, tok, val := lexer.Scan()
   301  		if tok == EOF {
   302  			break
   303  		}
   304  		fmt.Printf("%d:%d %s %q\n", pos.Line, pos.Column, tok, val)
   305  	}
   306  	// Output:
   307  	// 1:1 $ ""
   308  	// 1:2 number "0"
   309  	// 1:4 { ""
   310  	// 1:6 print ""
   311  	// 1:12 $ ""
   312  	// 1:13 number "1"
   313  	// 1:15 } ""
   314  }