github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/opt/optgen/lang/scanner.go (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package lang
    12  
    13  import (
    14  	"bufio"
    15  	"bytes"
    16  	"io"
    17  	"unicode"
    18  )
    19  
    20  //go:generate stringer -type=Token scanner.go
    21  
// Token is the kind of lexical token returned by the scanner (string,
// parentheses, comment, etc). Its zero value is ILLEGAL.
type Token int
    25  
const (
	// ILLEGAL is the invalid token that indicates the scanner has encountered
	// an invalid lexical pattern.
	ILLEGAL Token = iota
	// ERROR indicates that the scanner encountered an error while reading from
	// the input. The text of the error can be accessed via the Literal method.
	ERROR
	// EOF indicates the scanner has reached the end of the input.
	EOF
	// IDENT is an identifier that starts with a Unicode letter or underscore
	// and continues with Unicode letters, Unicode digits, or underscores:
	// (UnicodeLetter | '_') (UnicodeLetter | UnicodeDigit | '_')*
	IDENT
	// STRING is a literal string. It is either delimited by double quotes, in
	// which case it cannot extend past the end of a line: " [^"\n]* "
	// or delimited by backticks, in which case it may span multiple lines:
	// ` [^`]* `
	// The token's literal includes the delimiters.
	STRING
	// NUMBER is a numeric literal composed of Unicode numeric digits:
	// UnicodeDigit+
	NUMBER
	// WHITESPACE is any sequence of Unicode whitespace characters.
	WHITESPACE
	// COMMENT is a code comment that extends to end of line: # .* EOL
	// The terminating newline is not part of the comment's literal.
	COMMENT
	// LPAREN is the open parentheses rune: (
	LPAREN
	// RPAREN is the close parentheses rune: )
	RPAREN
	// LBRACKET is the open square bracket rune: [
	LBRACKET
	// RBRACKET is the close square bracket rune: ]
	RBRACKET
	// LBRACE is the open curly brace rune: {
	LBRACE
	// RBRACE is the close curly brace rune: }
	RBRACE
	// DOLLAR is the dollar sign rune: $
	DOLLAR
	// COLON is the colon rune: :
	COLON
	// ASTERISK is the asterisk rune: *
	ASTERISK
	// EQUALS is the equals sign rune: =
	EQUALS
	// ARROW is an equals sign followed by a greater than sign: =>
	ARROW
	// AMPERSAND is the ampersand rune: &
	AMPERSAND
	// COMMA is the comma rune: ,
	COMMA
	// CARET is the caret rune: ^
	CARET
	// ELLIPSES is three periods in succession: ...
	ELLIPSES
	// PIPE is the vertical line rune: |
	PIPE
)
    82  
const (
	// errRune is the sentinel returned by Scanner.read when reading from the
	// input fails with an error other than io.EOF.
	errRune = rune(-1)
	// eofRune is the sentinel returned by Scanner.read once the end of the
	// input is reached.
	// NOTE(review): a literal NUL rune in the input compares equal to this
	// sentinel, so callers cannot tell it apart from end of input.
	eofRune = rune(0)
)
    87  
// Scanner breaks a sequence of characters into a sequence of lexical tokens
// that are consumed by the parser in order to construct an Optgen AST. Each
// token is associated with a literal that is the string representation of that
// token. For many tokens, its literal is a constant. But for other tokens,
// like string and identifier tokens, the literal is the custom text that was
// scanned from the input. Once an I/O error other than io.EOF occurs, the
// scanner stays in the error state and every subsequent Scan returns ERROR.
type Scanner struct {
	// r is the buffered reader that wraps the input being tokenized.
	r   *bufio.Reader
	// tok is the last token scanned; it is returned by Token.
	tok Token
	// lit is the literal of the last token scanned; it is returned by Literal.
	lit string
	// err is the first I/O error (other than io.EOF) hit while reading. Once
	// it is set, read always returns errRune and unread is a no-op.
	err error

	// lineLoc tracks the current line and position within the current file
	// being scanned.
	lineLoc struct {
		// line is the 0-based line number; it is incremented each time a
		// newline rune is read.
		line int
		// pos is the 0-based position within the current line; it is reset
		// to 0 each time a newline rune is read.
		pos  int
		// prev is the value pos had before the most recent read, so that
		// unread can restore it. It is -1 right after an unread.
		prev int
	}
}
   109  
   110  // NewScanner constructs a new scanner that will tokenize the given input.
   111  func NewScanner(r io.Reader) *Scanner {
   112  	return &Scanner{r: bufio.NewReader(r)}
   113  }
   114  
// Token returns the last token that was scanned. Before the first call to
// Scan it returns the zero Token, ILLEGAL.
func (s *Scanner) Token() Token {
	return s.tok
}
   119  
// Literal returns the literal associated with the last token that was scanned.
// For an ERROR token this is the text of the underlying I/O error, and for an
// EOF token it is the empty string.
func (s *Scanner) Literal() string {
	return s.lit
}
   124  
   125  // LineLoc returns the current 0-based line number and column position of the
   126  // scanner in the current file.
   127  func (s *Scanner) LineLoc() (line, pos int) {
   128  	return s.lineLoc.line, s.lineLoc.pos
   129  }
   130  
   131  // Scan reads the next token from the input and returns it. The Token, Literal,
   132  // and LineLoc methods are also initialized with information about the token
   133  // that was read.
   134  func (s *Scanner) Scan() Token {
   135  	// Read the next rune.
   136  	ch := s.read()
   137  
   138  	// If we see whitespace then consume all contiguous whitespace.
   139  	if unicode.IsSpace(ch) {
   140  		s.unread()
   141  		return s.scanWhitespace()
   142  	}
   143  
   144  	// If we see a letter or underscore then consume as an identifier or keyword.
   145  	if unicode.IsLetter(ch) || ch == '_' {
   146  		s.unread()
   147  		return s.scanIdentifier()
   148  	}
   149  
   150  	// If we see a digit then consume as a numeric literal.
   151  	if unicode.IsDigit(ch) {
   152  		s.unread()
   153  		return s.scanNumericLiteral()
   154  	}
   155  
   156  	// Otherwise read the individual character.
   157  	switch ch {
   158  	case errRune:
   159  		s.tok = ERROR
   160  		s.lit = s.err.Error()
   161  
   162  	case eofRune:
   163  		s.tok = EOF
   164  		s.lit = ""
   165  
   166  	case '(':
   167  		s.tok = LPAREN
   168  		s.lit = "("
   169  
   170  	case ')':
   171  		s.tok = RPAREN
   172  		s.lit = ")"
   173  
   174  	case '[':
   175  		s.tok = LBRACKET
   176  		s.lit = "["
   177  
   178  	case ']':
   179  		s.tok = RBRACKET
   180  		s.lit = "]"
   181  
   182  	case '{':
   183  		s.tok = LBRACE
   184  		s.lit = "{"
   185  
   186  	case '}':
   187  		s.tok = RBRACE
   188  		s.lit = "}"
   189  
   190  	case '$':
   191  		s.tok = DOLLAR
   192  		s.lit = "$"
   193  
   194  	case ':':
   195  		s.tok = COLON
   196  		s.lit = ":"
   197  
   198  	case '*':
   199  		s.tok = ASTERISK
   200  		s.lit = "*"
   201  
   202  	case ',':
   203  		s.tok = COMMA
   204  		s.lit = ","
   205  
   206  	case '^':
   207  		s.tok = CARET
   208  		s.lit = "^"
   209  
   210  	case '|':
   211  		s.tok = PIPE
   212  		s.lit = "|"
   213  
   214  	case '&':
   215  		s.tok = AMPERSAND
   216  		s.lit = "&"
   217  
   218  	case '=':
   219  		if s.read() == '>' {
   220  			s.tok = ARROW
   221  			s.lit = "=>"
   222  			break
   223  		}
   224  
   225  		s.unread()
   226  		s.tok = EQUALS
   227  		s.lit = "="
   228  
   229  	case '.':
   230  		if s.read() == '.' {
   231  			if s.read() == '.' {
   232  				s.tok = ELLIPSES
   233  				s.lit = "..."
   234  				break
   235  			}
   236  		}
   237  
   238  		s.tok = ILLEGAL
   239  		s.lit = "."
   240  
   241  	case '"':
   242  		s.unread()
   243  		return s.scanStringLiteral('"', false /* multiLine */)
   244  
   245  	case '`':
   246  		s.unread()
   247  		return s.scanStringLiteral('`', true /* multiLine */)
   248  
   249  	case '#':
   250  		s.unread()
   251  		return s.scanComment()
   252  
   253  	default:
   254  		s.tok = ILLEGAL
   255  		s.lit = string(ch)
   256  	}
   257  
   258  	return s.tok
   259  }
   260  
   261  // read reads the next rune from the buffered reader. If no reader has yet been
   262  // created, or if the current reader is exhausted, then the reader is reset to
   263  // point to the next file. read returns errRune if there is an I/O error and
   264  // eofRune once there are no more files to read.
   265  func (s *Scanner) read() rune {
   266  	// Once the scanner gets in the error state, it stays there.
   267  	if s.err != nil {
   268  		return errRune
   269  	}
   270  
   271  	ch, _, err := s.r.ReadRune()
   272  	if err == io.EOF {
   273  		return eofRune
   274  	}
   275  
   276  	if err != nil {
   277  		s.err = err
   278  		return errRune
   279  	}
   280  
   281  	s.lineLoc.prev = s.lineLoc.pos
   282  	if ch == '\n' {
   283  		s.lineLoc.line++
   284  		s.lineLoc.pos = 0
   285  	} else {
   286  		s.lineLoc.pos++
   287  	}
   288  
   289  	return ch
   290  }
   291  
   292  // unread places the previously read rune back on the reader.
   293  func (s *Scanner) unread() {
   294  	// Once the scanner gets in the error state, it stays there.
   295  	if s.err != nil {
   296  		return
   297  	}
   298  
   299  	err := s.r.UnreadRune()
   300  	if err != nil {
   301  		// Last read wasn't a rune (probably an eof), so no-op.
   302  		return
   303  	}
   304  
   305  	s.tok = ILLEGAL
   306  	s.lit = ""
   307  
   308  	if s.lineLoc.prev == -1 {
   309  		panic("unread cannot be called twice in succession")
   310  	}
   311  
   312  	if s.lineLoc.pos == 0 {
   313  		s.lineLoc.line--
   314  	}
   315  
   316  	s.lineLoc.pos = s.lineLoc.prev
   317  	s.lineLoc.prev = -1
   318  }
   319  
   320  // scanWhitespace consumes the current rune and all contiguous whitespace.
   321  func (s *Scanner) scanWhitespace() Token {
   322  	// Create a buffer and read the current character into it.
   323  	var buf bytes.Buffer
   324  	buf.WriteRune(s.read())
   325  
   326  	// Read every subsequent whitespace character into the buffer.
   327  	// Non-whitespace characters and EOF will cause the loop to exit.
   328  	for {
   329  		ch := s.read()
   330  		if ch == eofRune {
   331  			break
   332  		}
   333  
   334  		if !unicode.IsSpace(ch) {
   335  			s.unread()
   336  			break
   337  		}
   338  
   339  		buf.WriteRune(ch)
   340  	}
   341  
   342  	s.tok = WHITESPACE
   343  	s.lit = buf.String()
   344  	return WHITESPACE
   345  }
   346  
   347  // scanIdentifier consumes the current rune and all contiguous identifier runes.
   348  func (s *Scanner) scanIdentifier() Token {
   349  	// Create a buffer and read the current character into it.
   350  	var buf bytes.Buffer
   351  	buf.WriteRune(s.read())
   352  
   353  	// Read every subsequent ident character into the buffer.
   354  	// Non-ident characters and EOF will cause the loop to exit.
   355  	for {
   356  		ch := s.read()
   357  		if ch == eofRune {
   358  			break
   359  		}
   360  
   361  		if !unicode.IsLetter(ch) && !unicode.IsDigit(ch) && ch != '_' {
   362  			s.unread()
   363  			break
   364  		}
   365  
   366  		buf.WriteRune(ch)
   367  	}
   368  
   369  	s.tok = IDENT
   370  	s.lit = buf.String()
   371  	return s.tok
   372  }
   373  
   374  func (s *Scanner) scanStringLiteral(endChar rune, multiLine bool) Token {
   375  	// Create a buffer and read the current character into it.
   376  	var buf bytes.Buffer
   377  	buf.WriteRune(s.read())
   378  
   379  	// Read characters until the closing quote is found, or until either error,
   380  	// newline, or EOF is read.
   381  	for {
   382  		ch := s.read()
   383  		if ch == errRune || ch == eofRune || (!multiLine && ch == '\n') {
   384  			s.unread()
   385  			s.tok = ILLEGAL
   386  			break
   387  		}
   388  
   389  		buf.WriteRune(ch)
   390  
   391  		if ch == endChar {
   392  			s.tok = STRING
   393  			break
   394  		}
   395  	}
   396  
   397  	s.lit = buf.String()
   398  	return s.tok
   399  }
   400  
   401  func (s *Scanner) scanNumericLiteral() Token {
   402  	// Create a buffer and read the current character into it.
   403  	var buf bytes.Buffer
   404  	buf.WriteRune(s.read())
   405  
   406  	// Read every subsequent Unicode digit character into the buffer.
   407  	// Non-digit characters and EOF will cause the loop to exit.
   408  	for {
   409  		ch := s.read()
   410  		if ch == eofRune {
   411  			break
   412  		}
   413  
   414  		if !unicode.IsDigit(ch) {
   415  			s.unread()
   416  			break
   417  		}
   418  
   419  		buf.WriteRune(ch)
   420  	}
   421  
   422  	s.tok = NUMBER
   423  	s.lit = buf.String()
   424  	return s.tok
   425  }
   426  
   427  // scanComment consumes the current rune and all characters until newline.
   428  func (s *Scanner) scanComment() Token {
   429  	// Create a buffer and read the current character into it.
   430  	var buf bytes.Buffer
   431  	buf.WriteRune(s.read())
   432  
   433  	// Read every subsequent character into the buffer until either error,
   434  	// newline, or EOF is read.
   435  	for {
   436  		ch := s.read()
   437  		if ch == errRune || ch == eofRune || ch == '\n' {
   438  			s.unread()
   439  			break
   440  		}
   441  
   442  		buf.WriteRune(ch)
   443  	}
   444  
   445  	s.tok = COMMENT
   446  	s.lit = buf.String()
   447  	return COMMENT
   448  }