go.starlark.net@v0.0.0-20231101134539-556fd59b42f6/syntax/scan_test.go (about)

     1  // Copyright 2017 The Bazel Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package syntax
     6  
     7  import (
     8  	"bytes"
     9  	"fmt"
    10  	"go/build"
    11  	"os"
    12  	"path/filepath"
    13  	"strings"
    14  	"testing"
    15  )
    16  
    17  func scan(src interface{}) (tokens string, err error) {
    18  	sc, err := newScanner("foo.star", src, false)
    19  	if err != nil {
    20  		return "", err
    21  	}
    22  
    23  	defer sc.recover(&err)
    24  
    25  	var buf bytes.Buffer
    26  	var val tokenValue
    27  	for {
    28  		tok := sc.nextToken(&val)
    29  
    30  		if buf.Len() > 0 {
    31  			buf.WriteByte(' ')
    32  		}
    33  		switch tok {
    34  		case EOF:
    35  			buf.WriteString("EOF")
    36  		case IDENT:
    37  			buf.WriteString(val.raw)
    38  		case INT:
    39  			if val.bigInt != nil {
    40  				fmt.Fprintf(&buf, "%d", val.bigInt)
    41  			} else {
    42  				fmt.Fprintf(&buf, "%d", val.int)
    43  			}
    44  		case FLOAT:
    45  			fmt.Fprintf(&buf, "%e", val.float)
    46  		case STRING, BYTES:
    47  			buf.WriteString(Quote(val.string, tok == BYTES))
    48  		default:
    49  			buf.WriteString(tok.String())
    50  		}
    51  		if tok == EOF {
    52  			break
    53  		}
    54  	}
    55  	return buf.String(), nil
    56  }
    57  
    58  func TestScanner(t *testing.T) {
    59  	for _, test := range []struct {
    60  		input, want string
    61  	}{
    62  		{``, "EOF"},
    63  		{`123`, "123 EOF"},
    64  		{`x.y`, "x . y EOF"},
    65  		{`chocolate.éclair`, `chocolate . éclair EOF`},
    66  		{`123 "foo" hello x.y`, `123 "foo" hello x . y EOF`},
    67  		{`print(x)`, "print ( x ) EOF"},
    68  		{`print(x); print(y)`, "print ( x ) ; print ( y ) EOF"},
    69  		{"\nprint(\n1\n)\n", "print ( 1 ) newline EOF"}, // final \n is at toplevel on non-blank line => token
    70  		{`/ // /= //= ///=`, "/ // /= //= // /= EOF"},
    71  		{`# hello
    72  print(x)`, "print ( x ) EOF"},
    73  		{`# hello
    74  print(1)
    75  cc_binary(name="foo")
    76  def f(x):
    77  		return x+1
    78  print(1)
    79  `,
    80  			`print ( 1 ) newline ` +
    81  				`cc_binary ( name = "foo" ) newline ` +
    82  				`def f ( x ) : newline ` +
    83  				`indent return x + 1 newline ` +
    84  				`outdent print ( 1 ) newline ` +
    85  				`EOF`},
    86  		// EOF should act line an implicit newline.
    87  		{`def f(): pass`,
    88  			"def f ( ) : pass EOF"},
    89  		{`def f():
    90  	pass`,
    91  			"def f ( ) : newline indent pass newline outdent EOF"},
    92  		{`def f():
    93  	pass
    94  # oops`,
    95  			"def f ( ) : newline indent pass newline outdent EOF"},
    96  		{`def f():
    97  	pass \
    98  `,
    99  			"def f ( ) : newline indent pass newline outdent EOF"},
   100  		{`def f():
   101  	pass
   102  `,
   103  			"def f ( ) : newline indent pass newline outdent EOF"},
   104  		{`pass
   105  
   106  
   107  pass`, "pass newline pass EOF"}, // consecutive newlines are consolidated
   108  		{`def f():
   109      pass
   110      `, "def f ( ) : newline indent pass newline outdent EOF"},
   111  		{`def f():
   112      pass
   113      ` + "\n", "def f ( ) : newline indent pass newline outdent EOF"},
   114  		{"pass", "pass EOF"},
   115  		{"pass\n", "pass newline EOF"},
   116  		{"pass\n ", "pass newline EOF"},
   117  		{"pass\n \n", "pass newline EOF"},
   118  		{"if x:\n  pass\n ", "if x : newline indent pass newline outdent EOF"},
   119  		{`x = 1 + \
   120  2`, `x = 1 + 2 EOF`},
   121  		{`x = 'a\nb'`, `x = "a\nb" EOF`},
   122  		{`x = r'a\nb'`, `x = "a\\nb" EOF`},
   123  		{"x = 'a\\\nb'", `x = "ab" EOF`},
   124  		{`x = '\''`, `x = "'" EOF`},
   125  		{`x = "\""`, `x = "\"" EOF`},
   126  		{`x = r'\''`, `x = "\\'" EOF`},
   127  		{`x = '''\''''`, `x = "'" EOF`},
   128  		{`x = r'''\''''`, `x = "\\'" EOF`},
   129  		{`x = ''''a'b'c'''`, `x = "'a'b'c" EOF`},
   130  		{"x = '''a\nb'''", `x = "a\nb" EOF`},
   131  		{"x = '''a\rb'''", `x = "a\nb" EOF`},
   132  		{"x = '''a\r\nb'''", `x = "a\nb" EOF`},
   133  		{"x = '''a\n\rb'''", `x = "a\n\nb" EOF`},
   134  		{"x = r'a\\\nb'", `x = "a\\\nb" EOF`},
   135  		{"x = r'a\\\rb'", `x = "a\\\nb" EOF`},
   136  		{"x = r'a\\\r\nb'", `x = "a\\\nb" EOF`},
   137  		{"a\rb", `a newline b EOF`},
   138  		{"a\nb", `a newline b EOF`},
   139  		{"a\r\nb", `a newline b EOF`},
   140  		{"a\n\nb", `a newline b EOF`},
   141  		// numbers
   142  		{"0", `0 EOF`},
   143  		{"00", `0 EOF`},
   144  		{"0.", `0.000000e+00 EOF`},
   145  		{"0.e1", `0.000000e+00 EOF`},
   146  		{".0", `0.000000e+00 EOF`},
   147  		{"0.0", `0.000000e+00 EOF`},
   148  		{".e1", `. e1 EOF`},
   149  		{"1", `1 EOF`},
   150  		{"1.", `1.000000e+00 EOF`},
   151  		{".1", `1.000000e-01 EOF`},
   152  		{".1e1", `1.000000e+00 EOF`},
   153  		{".1e+1", `1.000000e+00 EOF`},
   154  		{".1e-1", `1.000000e-02 EOF`},
   155  		{"1e1", `1.000000e+01 EOF`},
   156  		{"1e+1", `1.000000e+01 EOF`},
   157  		{"1e-1", `1.000000e-01 EOF`},
   158  		{"123", `123 EOF`},
   159  		{"123e45", `1.230000e+47 EOF`},
   160  		{"999999999999999999999999999999999999999999999999999", `999999999999999999999999999999999999999999999999999 EOF`},
   161  		{"12345678901234567890", `12345678901234567890 EOF`},
   162  		// hex
   163  		{"0xA", `10 EOF`},
   164  		{"0xAAG", `170 G EOF`},
   165  		{"0xG", `foo.star:1:1: invalid hex literal`},
   166  		{"0XA", `10 EOF`},
   167  		{"0XG", `foo.star:1:1: invalid hex literal`},
   168  		{"0xA.", `10 . EOF`},
   169  		{"0xA.e1", `10 . e1 EOF`},
   170  		{"0x12345678deadbeef12345678", `5634002672576678570168178296 EOF`},
   171  		// binary
   172  		{"0b1010", `10 EOF`},
   173  		{"0B111101", `61 EOF`},
   174  		{"0b3", `foo.star:1:3: invalid binary literal`},
   175  		{"0b1010201", `10 201 EOF`},
   176  		{"0b1010.01", `10 1.000000e-02 EOF`},
   177  		{"0b0000", `0 EOF`},
   178  		// octal
   179  		{"0o123", `83 EOF`},
   180  		{"0o12834", `10 834 EOF`},
   181  		{"0o12934", `10 934 EOF`},
   182  		{"0o12934.", `10 9.340000e+02 EOF`},
   183  		{"0o12934.1", `10 9.341000e+02 EOF`},
   184  		{"0o12934e1", `10 9.340000e+03 EOF`},
   185  		{"0o123.", `83 . EOF`},
   186  		{"0o123.1", `83 1.000000e-01 EOF`},
   187  		{"0123", `foo.star:1:5: obsolete form of octal literal; use 0o123`},
   188  		{"012834", `foo.star:1:1: invalid int literal`},
   189  		{"012934", `foo.star:1:1: invalid int literal`},
   190  		{"i = 012934", `foo.star:1:5: invalid int literal`},
   191  		// octal escapes in string literals
   192  		{`"\037"`, `"\x1f" EOF`},
   193  		{`"\377"`, `foo.star:1:1: non-ASCII octal escape \377 (use \u00FF for the UTF-8 encoding of U+00FF)`},
   194  		{`"\378"`, `"\x1f8" EOF`},                               // = '\37' + '8'
   195  		{`"\400"`, `foo.star:1:1: non-ASCII octal escape \400`}, // unlike Python 2 and 3
   196  		// hex escapes
   197  		{`"\x00\x20\x09\x41\x7e\x7f"`, `"\x00 \tA~\x7f" EOF`}, // DEL is non-printable
   198  		{`"\x80"`, `foo.star:1:1: non-ASCII hex escape`},
   199  		{`"\xff"`, `foo.star:1:1: non-ASCII hex escape`},
   200  		{`"\xFf"`, `foo.star:1:1: non-ASCII hex escape`},
   201  		{`"\xF"`, `foo.star:1:1: truncated escape sequence \xF`},
   202  		{`"\x"`, `foo.star:1:1: truncated escape sequence \x`},
   203  		{`"\xfg"`, `foo.star:1:1: invalid escape sequence \xfg`},
   204  		// Unicode escapes
   205  		// \uXXXX
   206  		{`"\u0400"`, `"Ѐ" EOF`},
   207  		{`"\u100"`, `foo.star:1:1: truncated escape sequence \u100`},
   208  		{`"\u04000"`, `"Ѐ0" EOF`}, // = U+0400 + '0'
   209  		{`"\u100g"`, `foo.star:1:1: invalid escape sequence \u100g`},
   210  		{`"\u4E16"`, `"世" EOF`},
   211  		{`"\udc00"`, `foo.star:1:1: invalid Unicode code point U+DC00`}, // surrogate
   212  		// \UXXXXXXXX
   213  		{`"\U00000400"`, `"Ѐ" EOF`},
   214  		{`"\U0000400"`, `foo.star:1:1: truncated escape sequence \U0000400`},
   215  		{`"\U000004000"`, `"Ѐ0" EOF`}, // = U+0400 + '0'
   216  		{`"\U1000000g"`, `foo.star:1:1: invalid escape sequence \U1000000g`},
   217  		{`"\U0010FFFF"`, `"\U0010ffff" EOF`},
   218  		{`"\U00110000"`, `foo.star:1:1: code point out of range: \U00110000 (max \U00110000)`},
   219  		{`"\U0001F63F"`, `"😿" EOF`},
   220  		{`"\U0000dc00"`, `foo.star:1:1: invalid Unicode code point U+DC00`}, // surrogate
   221  
   222  		// backslash escapes
   223  		// As in Go, a backslash must escape something.
   224  		// (Python started issuing a deprecation warning in 3.6.)
   225  		{`"foo\(bar"`, `foo.star:1:1: invalid escape sequence \(`},
   226  		{`"\+"`, `foo.star:1:1: invalid escape sequence \+`},
   227  		{`"\w"`, `foo.star:1:1: invalid escape sequence \w`},
   228  		{`"\""`, `"\"" EOF`},
   229  		{`"\'"`, `"'" EOF`},
   230  		{`'\w'`, `foo.star:1:1: invalid escape sequence \w`},
   231  		{`'\''`, `"'" EOF`},
   232  		{`'\"'`, `"\"" EOF`},
   233  		{`"""\w"""`, `foo.star:1:1: invalid escape sequence \w`},
   234  		{`"""\""""`, `"\"" EOF`},
   235  		{`"""\'"""`, `"'" EOF`},
   236  		{`'''\w'''`, `foo.star:1:1: invalid escape sequence \w`},
   237  		{`'''\''''`, `"'" EOF`},
   238  		{`'''\"'''`, `"\"" EOF`},
   239  		{`r"\w"`, `"\\w" EOF`},
   240  		{`r"\""`, `"\\\"" EOF`},
   241  		{`r"\'"`, `"\\'" EOF`},
   242  		{`r'\w'`, `"\\w" EOF`},
   243  		{`r'\''`, `"\\'" EOF`},
   244  		{`r'\"'`, `"\\\"" EOF`},
   245  		{`'a\zb'`, `foo.star:1:1: invalid escape sequence \z`},
   246  		{`"\o123"`, `foo.star:1:1: invalid escape sequence \o`},
   247  		// bytes literals (where they differ from text strings)
   248  		{`b"AЀ世😿"`, `b"AЀ世😿`},                                       // 1-4 byte encodings, literal
   249  		{`b"\x41\u0400\u4e16\U0001F63F"`, `b"AЀ世😿"`},                // same, as escapes
   250  		{`b"\377\378\x80\xff\xFf"`, `b"\xff\x1f8\x80\xff\xff" EOF`}, // hex/oct escapes allow non-ASCII
   251  		{`b"\400"`, `foo.star:1:2: invalid escape sequence \400`},
   252  		{`b"\udc00"`, `foo.star:1:2: invalid Unicode code point U+DC00`}, // (same as string)
   253  		// floats starting with octal digits
   254  		{"012934.", `1.293400e+04 EOF`},
   255  		{"012934.1", `1.293410e+04 EOF`},
   256  		{"012934e1", `1.293400e+05 EOF`},
   257  		{"0123.", `1.230000e+02 EOF`},
   258  		{"0123.1", `1.231000e+02 EOF`},
   259  		// github.com/google/skylark/issues/16
   260  		{"x ! 0", "foo.star:1:3: unexpected input character '!'"},
   261  		// github.com/google/starlark-go/issues/80
   262  		{"([{<>}])", "( [ { < > } ] ) EOF"},
   263  		{"f();", "f ( ) ; EOF"},
   264  		// github.com/google/starlark-go/issues/104
   265  		{"def f():\n  if x:\n    pass\n  ", `def f ( ) : newline indent if x : newline indent pass newline outdent outdent EOF`},
   266  		{`while cond: pass`, "while cond : pass EOF"},
   267  		// github.com/google/starlark-go/issues/107
   268  		{"~= ~= 5", "~ = ~ = 5 EOF"},
   269  		{"0in", "0 in EOF"},
   270  		{"0or", "foo.star:1:3: invalid octal literal"},
   271  		{"6in", "6 in EOF"},
   272  		{"6or", "6 or EOF"},
   273  	} {
   274  		got, err := scan(test.input)
   275  		if err != nil {
   276  			got = err.(Error).Error()
   277  		}
   278  		// Prefix match allows us to truncate errors in expectations.
   279  		// Success cases all end in EOF.
   280  		if !strings.HasPrefix(got, test.want) {
   281  			t.Errorf("scan `%s` = [%s], want [%s]", test.input, got, test.want)
   282  		}
   283  	}
   284  }
   285  
   286  // dataFile is the same as starlarktest.DataFile.
   287  // We make a copy to avoid a dependency cycle.
   288  var dataFile = func(pkgdir, filename string) string {
   289  	return filepath.Join(build.Default.GOPATH, "src/go.starlark.net", pkgdir, filename)
   290  }
   291  
   292  func BenchmarkScan(b *testing.B) {
   293  	filename := dataFile("syntax", "testdata/scan.star")
   294  	b.StopTimer()
   295  	data, err := os.ReadFile(filename)
   296  	if err != nil {
   297  		b.Fatal(err)
   298  	}
   299  	b.StartTimer()
   300  
   301  	for i := 0; i < b.N; i++ {
   302  		sc, err := newScanner(filename, data, false)
   303  		if err != nil {
   304  			b.Fatal(err)
   305  		}
   306  		var val tokenValue
   307  		for sc.nextToken(&val) != EOF {
   308  		}
   309  	}
   310  }