github.com/mithrandie/csvq@v1.18.1/lib/parser/scanner.go (about)

     1  package parser
     2  
     3  import (
     4  	"bytes"
     5  	"errors"
     6  	"fmt"
     7  	"strconv"
     8  	"strings"
     9  	"unicode"
    10  
    11  	"github.com/mithrandie/csvq/lib/option"
    12  
    13  	"github.com/mithrandie/ternary"
    14  )
    15  
// Token values produced by the scanner itself. Both are negative: iota makes
// EOF equal to -1 and Uncategorized equal to -2.
const (
	// EOF is what the peek helpers return once the source is exhausted.
	EOF = -(iota + 1)
	// Uncategorized is assigned by Scan to an unrecognized multi-rune
	// operator and to a constant whose syntax is invalid.
	Uncategorized
)
    20  
// Inclusive bounds of token value ranges.
//
// TokenFrom..TokenTo is the range for which TokenLiteral looks the name up in
// yyToknames; KeywordFrom..KeywordTo is the range that KeywordLiteral accepts
// and that searchKeyword iterates over.
const (
	TokenFrom   = IDENTIFIER
	TokenTo     = SUBSTITUTION_OP
	KeywordFrom = SELECT
	KeywordTo   = JSON_OBJECT
)
    27  
// Runes and operators that have a special meaning for the scanner.
const (
	// VariableSign starts a variable. When it is directly followed by
	// EnvironmentVariableSign, RuntimeInformationSign or a second
	// VariableSign, Scan produces an environment variable, a runtime
	// information or a flag token instead.
	VariableSign            = '@'
	EnvironmentVariableSign = '%'
	// ExternalCommandSign starts an external command, which Scan reads up to
	// the next unquoted ';' or the end of the source.
	ExternalCommandSign     = '$'
	RuntimeInformationSign  = '#'

	// SubstitutionOperator is the operator literal mapped to SUBSTITUTION_OP.
	SubstitutionOperator = ":="

	// BeginExpression and EndExpression delimit an expression embedded in an
	// external command; the opening rune is only recognized right after
	// ExternalCommandSign.
	BeginExpression = '{'
	EndExpression   = '}'

	// IdentifierDelimiter is the rune that ConstantDelimiter is built from.
	IdentifierDelimiter = ':'
)
    41  
// errTokenIsNotKeyword is returned by KeywordLiteral for a token value outside
// the keyword range.
var errTokenIsNotKeyword = errors.New("token is not keyword")

// errInvalidConstantSyntax is returned by scanConstant when no identifier rune
// follows the part of the constant that has already been read.
var errInvalidConstantSyntax = errors.New("invalid constant syntax")
    44  
// comparisonOperators lists the operator literals that Scan reports as
// COMPARISON_OP. Matching is exact.
var comparisonOperators = []string{
	">",
	"<",
	">=",
	"<=",
	"<>",
	"!=",
	"==",
}

// stringOperators lists the operator literals that Scan reports as STRING_OP.
// Matching is exact.
var stringOperators = []string{
	"||",
}
    58  
// runesNotIncludedInUrl lists the runes that end a URL token: scanUrl stops in
// front of any of them, just as it does in front of whitespace.
var runesNotIncludedInUrl = []rune{
	'{',
	'}',
	'|',
	'\\',
	'^',
	'[',
	']',
	'`',
}
    69  
// aggregateFunctions lists the identifiers that Scan reports as
// AGGREGATE_FUNCTION. Names are compared case-insensitively.
var aggregateFunctions = []string{
	"MIN",
	"MAX",
	"SUM",
	"AVG",
	"STDEV",
	"STDEVP",
	"VARP",
	"MEDIAN",
}

// listFunctions lists the identifiers that Scan reports as LIST_FUNCTION.
// Names are compared case-insensitively.
var listFunctions = []string{
	"LISTAGG",
	"JSON_AGG",
}

// analyticFunctions lists the identifiers that Scan reports as
// ANALYTIC_FUNCTION. Names are compared case-insensitively.
var analyticFunctions = []string{
	"ROW_NUMBER",
	"RANK",
	"DENSE_RANK",
	"CUME_DIST",
	"PERCENT_RANK",
	"NTILE",
}

// functionsNth lists the identifiers that Scan reports as FUNCTION_NTH.
// Names are compared case-insensitively.
var functionsNth = []string{
	"FIRST_VALUE",
	"LAST_VALUE",
	"NTH_VALUE",
}

// functionsWithIgnoreNulls lists the identifiers that Scan reports as
// FUNCTION_WITH_INS. Names are compared case-insensitively.
var functionsWithIgnoreNulls = []string{
	"LAG",
	"LEAD",
}
   105  
// ConstantDelimiter is IdentifierDelimiter repeated twice, i.e. "::".
var ConstantDelimiter = string(IdentifierDelimiter) + string(IdentifierDelimiter)
   107  
   108  func TokenLiteral(token int) string {
   109  	if TokenFrom <= token && token <= TokenTo {
   110  		return yyToknames[token-TokenFrom+3]
   111  	}
   112  	return string(rune(token))
   113  }
   114  
   115  func KeywordLiteral(token int) (string, error) {
   116  	if KeywordFrom <= token && token <= KeywordTo {
   117  		return yyToknames[token-TokenFrom+3], nil
   118  	}
   119  	return string(rune(token)), errTokenIsNotKeyword
   120  }
   121  
// Scanner splits a source text into tokens. It must be prepared with Init
// before Scan is called.
type Scanner struct {
	// src is the source text as runes and srcPos the index of the next rune
	// to be read.
	src     []rune
	srcPos  int
	// literal is the scratch buffer in which the scan helpers accumulate the
	// text of the token being read.
	literal bytes.Buffer

	// line starts at 1 and is incremented at every line break; char counts
	// the runes consumed on the current line and is reset to 0 at a line
	// break. sourceFile is copied verbatim into every Token.
	line       int
	char       int
	sourceFile string

	// forPrepared enables placeholder recognition ('?' and ":name").
	// ansiQuotes makes a double quote delimit an identifier rather than a
	// string.
	forPrepared bool
	ansiQuotes  bool

	// holderOrdinal is incremented for every placeholder read. holderNames
	// holds each distinct named placeholder once. holderNumber counts every
	// '?' placeholder plus every distinct named placeholder.
	holderOrdinal int
	holderNames   []string
	holderNumber  int
}
   138  
   139  func (s *Scanner) Init(src string, sourceFile string, forPrepared bool, ansiQuotes bool) *Scanner {
   140  	s.src = []rune(src)
   141  	s.srcPos = 0
   142  	s.line = 1
   143  	s.char = 0
   144  	s.sourceFile = sourceFile
   145  	s.forPrepared = forPrepared
   146  	s.ansiQuotes = ansiQuotes
   147  	s.holderOrdinal = 0
   148  	s.holderNames = make([]string, 0, 10)
   149  	s.holderNumber = 0
   150  	return s
   151  }
   152  
   153  func (s *Scanner) HolderNumber() int {
   154  	return s.holderNumber
   155  }
   156  
   157  func (s *Scanner) holderNameExists(name string) bool {
   158  	for _, v := range s.holderNames {
   159  		if name == v {
   160  			return true
   161  		}
   162  	}
   163  	return false
   164  }
   165  
   166  func (s *Scanner) peek() rune {
   167  	return s.peekFurtherAhead(1)
   168  }
   169  
   170  func (s *Scanner) peekFurtherAhead(n int) rune {
   171  	pos := n - 1 + s.srcPos
   172  
   173  	if len(s.src) <= pos {
   174  		return EOF
   175  	}
   176  
   177  	return s.src[pos]
   178  }
   179  
   180  func (s *Scanner) peekNextLetter(n int) rune {
   181  	for unicode.IsSpace(s.peekFurtherAhead(n)) {
   182  		n = n + 1
   183  	}
   184  	return s.peekFurtherAhead(n)
   185  }
   186  
   187  func (s *Scanner) next() rune {
   188  	ch := s.peek()
   189  	if ch == EOF {
   190  		return ch
   191  	}
   192  
   193  	s.srcPos++
   194  	s.char++
   195  
   196  	ch = s.checkNewLine(ch)
   197  
   198  	return ch
   199  }
   200  
   201  func (s *Scanner) checkNewLine(ch rune) rune {
   202  	if ch != '\r' && ch != '\n' {
   203  		return ch
   204  	}
   205  
   206  	if ch == '\r' && s.peek() == '\n' {
   207  		s.srcPos++
   208  	}
   209  
   210  	s.line++
   211  	s.char = 0
   212  	return s.src[s.srcPos-1]
   213  }
   214  
// Scan skips leading whitespace and comments, reads the next token from the
// source and returns it together with any lexical error found while reading.
//
// The Line and Char of the returned Token are the scanner's position right
// after the first rune of the token has been consumed. When the source is
// exhausted the token value is EOF.
func (s *Scanner) Scan() (Token, error) {
	// Discard the whitespace in front of the token.
	for unicode.IsSpace(s.peek()) {
		s.next()
	}

	// Defaults: unless a branch below overrides them, the token value is the
	// rune itself and the literal is that single rune.
	ch := s.next()
	token := ch
	literal := string(ch)
	quoted := false
	line := s.line
	char := s.char
	var err error

	// Placeholders are only recognized while scanning a prepared statement.
	if s.forPrepared {
		switch ch {
		case '?':
			// Positional placeholder: every occurrence is counted.
			s.holderOrdinal++
			s.holderNumber++
			return Token{Token: PLACEHOLDER, Literal: literal, HolderOrdinal: s.holderOrdinal, Line: line, Char: char, SourceFile: s.sourceFile}, err
		case ':':
			// Named placeholder. The name keeps its leading ':' because ch
			// is passed as the head of the identifier. A name is counted in
			// holderNumber only the first time it is seen. A ':' that is not
			// followed by an identifier rune falls through to the regular
			// handling below.
			if s.isIdentRune(s.peek()) {
				s.scanIdentifier(ch)
				holderName := s.literal.String()
				s.holderOrdinal++
				if !s.holderNameExists(holderName) {
					s.holderNames = append(s.holderNames, holderName)
					s.holderNumber++
				}
				return Token{Token: PLACEHOLDER, Literal: holderName, HolderOrdinal: s.holderOrdinal, Line: line, Char: char, SourceFile: s.sourceFile}, err
			}
		}
	}

	switch {
	case s.isDecimal(ch):
		token, err = s.scanNumber(ch)
		literal = s.literal.String()
	case s.isIdentRune(ch):
		s.scanIdentifier(ch)

		// Classify the word. The order matters: ternary literals first, then
		// keywords, then the function families, and only then the forms that
		// depend on a following ':'.
		literal = s.literal.String()
		if _, e := ternary.ConvertFromString(literal); e == nil {
			token = TERNARY
		} else if t, e := s.searchKeyword(literal); e == nil {
			token = rune(t)
		} else if s.isAggregateFunctions(literal) {
			token = AGGREGATE_FUNCTION
		} else if s.isListaggFunctions(literal) {
			token = LIST_FUNCTION
		} else if s.isAnalyticFunctions(literal) {
			token = ANALYTIC_FUNCTION
		} else if s.isFunctionsNth(literal) {
			token = FUNCTION_NTH
		} else if s.isFunctionsWithIgnoreNulls(literal) {
			token = FUNCTION_WITH_INS
		} else {
			if unicode.IsLetter(ch) && s.peek() == ':' {
				if s.peekFurtherAhead(2) == ':' {
					if s.peekNextLetter(3) == '(' {
						// name::( ... : the "::" is consumed but not added
						// to the literal.
						s.next()
						s.next()
						token = TABLE_FUNCTION
					} else {
						// name::ident : the "::" and the following
						// identifier are appended to the literal.
						s.literal.WriteRune(s.next())
						s.literal.WriteRune(s.next())
						err = s.scanConstant()
						literal = s.literal.String()
						token = CONSTANT
						if err != nil {
							token = Uncategorized
						}
					}
				} else {
					// name: followed by anything else is read as a URL.
					s.literal.WriteRune(s.next())
					s.scanUrl()
					literal = s.literal.String()
					token = URL
				}
			} else {
				token = IDENTIFIER
			}
		}
	case s.isOperatorRune(ch):
		s.scanOperator(ch)

		// A single operator rune that matches none of the tables keeps the
		// rune itself as its token value; a longer unknown run is
		// Uncategorized.
		literal = s.literal.String()
		if s.isComparisonOperators(literal) {
			token = COMPARISON_OP
		} else if s.isStringOperators(literal) {
			token = STRING_OP
		} else if literal == SubstitutionOperator {
			token = SUBSTITUTION_OP
		} else if 1 < len(literal) {
			token = Uncategorized
		}
	case ch == VariableSign:
		// The rune after '@' selects the kind of variable; the selector rune
		// is consumed and is not part of the literal.
		switch s.peek() {
		case EnvironmentVariableSign:
			s.next()
			token = ENVIRONMENT_VARIABLE
		case RuntimeInformationSign:
			s.next()
			token = RUNTIME_INFORMATION
		case VariableSign:
			s.next()
			token = FLAG
		default:
			token = VARIABLE
		}

		// Only an environment variable may have its name quoted with
		// backticks; every other name must be a plain identifier.
		if token == ENVIRONMENT_VARIABLE && s.peek() == '`' {
			err = s.scanString(s.next())
			literal = option.UnescapeIdentifier(s.literal.String(), '`')
			quoted = true
		} else {
			if s.isIdentRune(s.peek()) {
				s.scanIdentifier(s.next())
				literal = s.literal.String()
			} else {
				literal = ""
			}
		}

		// An empty name is an error. Note that this replaces any error
		// reported by scanString above.
		if len(literal) < 1 {
			err = errors.New("invalid variable symbol")
		}
	case ch == ExternalCommandSign:
		s.scanExternalCommand()
		literal = s.literal.String()
		token = EXTERNAL_COMMAND
	case s.isCommentRune(ch):
		// isCommentRune has already consumed the '*' of "/*". Skip the
		// comment and scan again.
		s.scanComment()
		return s.Scan()
	case s.isLineCommentRune(ch):
		// isLineCommentRune has already consumed the second '-' of "--".
		// Skip the rest of the line and scan again.
		s.scanLineComment()
		return s.Scan()
	default:
		// Quoted literals. A single quote always delimits a string and a
		// backtick always delimits an identifier; a double quote delimits a
		// string unless ansiQuotes is set, in which case it delimits an
		// identifier. Any other rune keeps the defaults set above.
		if ch == '\'' || (!s.ansiQuotes && ch == '"') {
			err = s.scanString(ch)
			literal = option.UnescapeString(s.literal.String(), ch)
			token = STRING
		} else if ch == '`' || (s.ansiQuotes && ch == '"') {
			err = s.scanString(ch)
			literal = option.UnescapeIdentifier(s.literal.String(), ch)
			token = IDENTIFIER
			quoted = true
		}
	}

	return Token{Token: int(token), Literal: literal, Quoted: quoted, Line: line, Char: char, SourceFile: s.sourceFile}, err
}
   366  
   367  func (s *Scanner) scanString(quote rune) error {
   368  	s.literal.Reset()
   369  
   370  	for {
   371  		ch := s.next()
   372  
   373  		if ch == EOF {
   374  			return errors.New("literal not terminated")
   375  		}
   376  
   377  		if ch == quote {
   378  			if s.peek() == quote {
   379  				s.literal.WriteRune(ch)
   380  				ch = s.next()
   381  			} else {
   382  				break
   383  			}
   384  		}
   385  
   386  		if ch == '\\' {
   387  			switch s.peek() {
   388  			case '\\', quote:
   389  				s.literal.WriteRune(ch)
   390  				ch = s.next()
   391  			}
   392  		}
   393  		s.literal.WriteRune(ch)
   394  	}
   395  	return nil
   396  }
   397  
   398  func (s *Scanner) scanIdentifier(head rune) {
   399  	s.literal.Reset()
   400  
   401  	s.literal.WriteRune(head)
   402  	for s.isIdentRune(s.peek()) {
   403  		s.literal.WriteRune(s.next())
   404  	}
   405  }
   406  
   407  func (s *Scanner) scanConstant() error {
   408  	if !s.isIdentRune(s.peek()) {
   409  		return errInvalidConstantSyntax
   410  	}
   411  	s.literal.WriteRune(s.next())
   412  	for s.isIdentRune(s.peek()) {
   413  		s.literal.WriteRune(s.next())
   414  	}
   415  	return nil
   416  }
   417  
   418  func (s *Scanner) scanUrl() int {
   419  	oldPos := s.srcPos
   420  	for !unicode.IsSpace(s.peek()) && !s.isRuneNotIncludedInUrl(s.peek()) && s.peek() != EOF {
   421  		s.literal.WriteRune(s.next())
   422  	}
   423  	return s.srcPos - oldPos
   424  }
   425  
   426  func (s *Scanner) isRuneNotIncludedInUrl(ch rune) bool {
   427  	for _, r := range runesNotIncludedInUrl {
   428  		if r == ch {
   429  			return true
   430  		}
   431  	}
   432  	return false
   433  }
   434  
   435  func (s *Scanner) isIdentRune(ch rune) bool {
   436  	return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch)
   437  }
   438  
   439  func (s *Scanner) isDecimal(ch rune) bool {
   440  	return '0' <= ch && ch <= '9'
   441  }
   442  
   443  func (s *Scanner) scanNumber(head rune) (rune, error) {
   444  	s.literal.Reset()
   445  	var numType rune = INTEGER
   446  
   447  	s.literal.WriteRune(head)
   448  	for s.isDecimal(s.peek()) {
   449  		s.literal.WriteRune(s.next())
   450  	}
   451  
   452  	if s.peek() == '.' {
   453  		numType = FLOAT
   454  
   455  		s.literal.WriteRune(s.next())
   456  		for s.isDecimal(s.peek()) {
   457  			s.literal.WriteRune(s.next())
   458  		}
   459  	}
   460  
   461  	if s.peek() == 'e' || s.peek() == 'E' {
   462  		numType = FLOAT
   463  
   464  		s.literal.WriteRune(s.next())
   465  		if s.peek() == '+' || s.peek() == '-' {
   466  			s.literal.WriteRune(s.next())
   467  		}
   468  		for s.isDecimal(s.peek()) {
   469  			s.literal.WriteRune(s.next())
   470  		}
   471  	}
   472  
   473  	if numType == INTEGER {
   474  		if _, err := strconv.ParseInt(s.literal.String(), 10, 64); err == nil {
   475  			return numType, nil
   476  		}
   477  		numType = FLOAT
   478  	}
   479  
   480  	if _, err := strconv.ParseFloat(s.literal.String(), 64); err == nil {
   481  		return numType, nil
   482  	}
   483  
   484  	return numType, errors.New(fmt.Sprintf("cound not convert %q to a number", s.literal.String()))
   485  }
   486  
   487  func (s *Scanner) scanOperator(head rune) {
   488  	s.literal.Reset()
   489  
   490  	s.literal.WriteRune(head)
   491  	for s.isOperatorRune(s.peek()) {
   492  		s.literal.WriteRune(s.next())
   493  	}
   494  }
   495  
   496  func (s *Scanner) isOperatorRune(ch rune) bool {
   497  	switch ch {
   498  	case '=', '>', '<', '!', '|', ':':
   499  		return true
   500  	}
   501  	return false
   502  }
   503  
   504  func (s *Scanner) searchKeyword(str string) (int, error) {
   505  	for i := KeywordFrom; i <= KeywordTo; i++ {
   506  		if strings.EqualFold(TokenLiteral(i), str) {
   507  			return i, nil
   508  		}
   509  	}
   510  	return IDENTIFIER, errors.New(fmt.Sprintf("%q is not a keyword", str))
   511  }
   512  
   513  func (s *Scanner) isAggregateFunctions(str string) bool {
   514  	for _, v := range aggregateFunctions {
   515  		if strings.EqualFold(v, str) {
   516  			return true
   517  		}
   518  	}
   519  	return false
   520  }
   521  
   522  func (s *Scanner) isListaggFunctions(str string) bool {
   523  	for _, v := range listFunctions {
   524  		if strings.EqualFold(v, str) {
   525  			return true
   526  		}
   527  	}
   528  	return false
   529  }
   530  
   531  func (s *Scanner) isAnalyticFunctions(str string) bool {
   532  	for _, v := range analyticFunctions {
   533  		if strings.EqualFold(v, str) {
   534  			return true
   535  		}
   536  	}
   537  	return false
   538  }
   539  
   540  func (s *Scanner) isFunctionsNth(str string) bool {
   541  	for _, v := range functionsNth {
   542  		if strings.EqualFold(v, str) {
   543  			return true
   544  		}
   545  	}
   546  	return false
   547  }
   548  
   549  func (s *Scanner) isFunctionsWithIgnoreNulls(str string) bool {
   550  	for _, v := range functionsWithIgnoreNulls {
   551  		if strings.EqualFold(v, str) {
   552  			return true
   553  		}
   554  	}
   555  	return false
   556  }
   557  
   558  func (s *Scanner) isComparisonOperators(str string) bool {
   559  	for _, v := range comparisonOperators {
   560  		if v == str {
   561  			return true
   562  		}
   563  	}
   564  	return false
   565  }
   566  
   567  func (s *Scanner) isStringOperators(str string) bool {
   568  	for _, v := range stringOperators {
   569  		if v == str {
   570  			return true
   571  		}
   572  	}
   573  	return false
   574  }
   575  
   576  func (s *Scanner) isCommentRune(ch rune) bool {
   577  	if ch == '/' && s.peek() == '*' {
   578  		s.next()
   579  		return true
   580  	}
   581  	return false
   582  }
   583  
   584  func (s *Scanner) scanComment() {
   585  	for {
   586  		ch := s.next()
   587  		if ch == EOF {
   588  			break
   589  		} else if ch == '*' {
   590  			if s.peek() == '/' {
   591  				s.next()
   592  				break
   593  			}
   594  		}
   595  	}
   596  }
   597  
   598  func (s *Scanner) isLineCommentRune(ch rune) bool {
   599  	if ch == '-' && s.peek() == '-' {
   600  		s.next()
   601  		return true
   602  	}
   603  	return false
   604  }
   605  
   606  func (s *Scanner) scanLineComment() {
   607  	for {
   608  		ch := s.peek()
   609  		if ch == '\r' || ch == '\n' || ch == EOF {
   610  			break
   611  		}
   612  		s.next()
   613  	}
   614  }
   615  
   616  func (s *Scanner) scanExternalCommand() {
   617  	s.literal.Reset()
   618  
   619  	for {
   620  		ch := s.peek()
   621  		if ch == ';' || ch == EOF {
   622  			break
   623  		}
   624  
   625  		s.literal.WriteRune(s.next())
   626  
   627  		if ch == '"' || ch == '\'' || ch == '`' {
   628  			s.scanExternalCommandQuotedString(ch)
   629  			continue
   630  		}
   631  
   632  		if ch == ExternalCommandSign && s.peek() == BeginExpression {
   633  			s.literal.WriteRune(s.next())
   634  			s.scanExternalCommandCSVQExpression()
   635  		}
   636  	}
   637  }
   638  
   639  func (s *Scanner) scanExternalCommandQuotedString(quote rune) {
   640  	for {
   641  		ch := s.peek()
   642  
   643  		if ch == EOF {
   644  			break
   645  		}
   646  
   647  		s.literal.WriteRune(s.next())
   648  
   649  		if ch == quote {
   650  			break
   651  		}
   652  
   653  		if ch == '\\' {
   654  			switch s.peek() {
   655  			case '\\', quote:
   656  				s.literal.WriteRune(s.next())
   657  			}
   658  		}
   659  	}
   660  }
   661  
   662  func (s *Scanner) scanExternalCommandCSVQExpression() {
   663  	for {
   664  		ch := s.peek()
   665  
   666  		if ch == EOF {
   667  			break
   668  		}
   669  
   670  		s.literal.WriteRune(s.next())
   671  
   672  		if ch == EndExpression {
   673  			break
   674  		}
   675  
   676  		if ch == '\\' {
   677  			switch s.peek() {
   678  			case '\\', BeginExpression, EndExpression:
   679  				s.literal.WriteRune(s.next())
   680  			}
   681  		}
   682  	}
   683  }