github.com/mithrandie/csvq@v1.18.1/lib/parser/scanner.go (about) 1 package parser 2 3 import ( 4 "bytes" 5 "errors" 6 "fmt" 7 "strconv" 8 "strings" 9 "unicode" 10 11 "github.com/mithrandie/csvq/lib/option" 12 13 "github.com/mithrandie/ternary" 14 ) 15 16 const ( 17 EOF = -(iota + 1) 18 Uncategorized 19 ) 20 21 const ( 22 TokenFrom = IDENTIFIER 23 TokenTo = SUBSTITUTION_OP 24 KeywordFrom = SELECT 25 KeywordTo = JSON_OBJECT 26 ) 27 28 const ( 29 VariableSign = '@' 30 EnvironmentVariableSign = '%' 31 ExternalCommandSign = '$' 32 RuntimeInformationSign = '#' 33 34 SubstitutionOperator = ":=" 35 36 BeginExpression = '{' 37 EndExpression = '}' 38 39 IdentifierDelimiter = ':' 40 ) 41 42 var errTokenIsNotKeyword = errors.New("token is not keyword") 43 var errInvalidConstantSyntax = errors.New("invalid constant syntax") 44 45 var comparisonOperators = []string{ 46 ">", 47 "<", 48 ">=", 49 "<=", 50 "<>", 51 "!=", 52 "==", 53 } 54 55 var stringOperators = []string{ 56 "||", 57 } 58 59 var runesNotIncludedInUrl = []rune{ 60 '{', 61 '}', 62 '|', 63 '\\', 64 '^', 65 '[', 66 ']', 67 '`', 68 } 69 70 var aggregateFunctions = []string{ 71 "MIN", 72 "MAX", 73 "SUM", 74 "AVG", 75 "STDEV", 76 "STDEVP", 77 "VARP", 78 "MEDIAN", 79 } 80 81 var listFunctions = []string{ 82 "LISTAGG", 83 "JSON_AGG", 84 } 85 86 var analyticFunctions = []string{ 87 "ROW_NUMBER", 88 "RANK", 89 "DENSE_RANK", 90 "CUME_DIST", 91 "PERCENT_RANK", 92 "NTILE", 93 } 94 95 var functionsNth = []string{ 96 "FIRST_VALUE", 97 "LAST_VALUE", 98 "NTH_VALUE", 99 } 100 101 var functionsWithIgnoreNulls = []string{ 102 "LAG", 103 "LEAD", 104 } 105 106 var ConstantDelimiter = string(IdentifierDelimiter) + string(IdentifierDelimiter) 107 108 func TokenLiteral(token int) string { 109 if TokenFrom <= token && token <= TokenTo { 110 return yyToknames[token-TokenFrom+3] 111 } 112 return string(rune(token)) 113 } 114 115 func KeywordLiteral(token int) (string, error) { 116 if KeywordFrom <= token && token <= KeywordTo { 117 return yyToknames[token-TokenFrom+3], nil 118 } 119 return string(rune(token)), errTokenIsNotKeyword 120 } 121 122 type Scanner struct { 123 src []rune 124 srcPos int 125 literal bytes.Buffer 126 127 line int 128 char int 129 sourceFile string 130 131 forPrepared bool 132 ansiQuotes bool 133 134 holderOrdinal int 135 holderNames []string 136 holderNumber int 137 } 138 139 func (s *Scanner) Init(src string, sourceFile string, forPrepared bool, ansiQuotes bool) *Scanner { 140 s.src = []rune(src) 141 s.srcPos = 0 142 s.line = 1 143 s.char = 0 144 s.sourceFile = sourceFile 145 s.forPrepared = forPrepared 146 s.ansiQuotes = ansiQuotes 147 s.holderOrdinal = 0 148 s.holderNames = make([]string, 0, 10) 149 s.holderNumber = 0 150 return s 151 } 152 153 func (s *Scanner) HolderNumber() int { 154 return s.holderNumber 155 } 156 157 func (s *Scanner) holderNameExists(name string) bool { 158 for _, v := range s.holderNames { 159 if name == v { 160 return true 161 } 162 } 163 return false 164 } 165 166 func (s *Scanner) peek() rune { 167 return s.peekFurtherAhead(1) 168 } 169 170 func (s *Scanner) peekFurtherAhead(n int) rune { 171 pos := n - 1 + s.srcPos 172 173 if len(s.src) <= pos { 174 return EOF 175 } 176 177 return s.src[pos] 178 } 179 180 func (s *Scanner) peekNextLetter(n int) rune { 181 for unicode.IsSpace(s.peekFurtherAhead(n)) { 182 n = n + 1 183 } 184 return s.peekFurtherAhead(n) 185 } 186 187 func (s *Scanner) next() rune { 188 ch := s.peek() 189 if ch == EOF { 190 return ch 191 } 192 193 s.srcPos++ 194 s.char++ 195 196 ch = s.checkNewLine(ch) 197 198 return ch 199 } 200 201 func (s *Scanner) checkNewLine(ch rune) rune { 202 if ch != '\r' && ch != '\n' { 203 return ch 204 } 205 206 if ch == '\r' && s.peek() == '\n' { 207 s.srcPos++ 208 } 209 210 s.line++ 211 s.char = 0 212 return s.src[s.srcPos-1] 213 } 214 215 func (s *Scanner) Scan() (Token, error) { 216 for unicode.IsSpace(s.peek()) { 217 s.next() 218 } 219 220 ch := s.next() 221 token := ch 222 literal := string(ch) 223 quoted := false 224 line := s.line 225 char := s.char 226 var err error 227 228 if s.forPrepared { 229 switch ch { 230 case '?': 231 s.holderOrdinal++ 232 s.holderNumber++ 233 return Token{Token: PLACEHOLDER, Literal: literal, HolderOrdinal: s.holderOrdinal, Line: line, Char: char, SourceFile: s.sourceFile}, err 234 case ':': 235 if s.isIdentRune(s.peek()) { 236 s.scanIdentifier(ch) 237 holderName := s.literal.String() 238 s.holderOrdinal++ 239 if !s.holderNameExists(holderName) { 240 s.holderNames = append(s.holderNames, holderName) 241 s.holderNumber++ 242 } 243 return Token{Token: PLACEHOLDER, Literal: holderName, HolderOrdinal: s.holderOrdinal, Line: line, Char: char, SourceFile: s.sourceFile}, err 244 } 245 } 246 } 247 248 switch { 249 case s.isDecimal(ch): 250 token, err = s.scanNumber(ch) 251 literal = s.literal.String() 252 case s.isIdentRune(ch): 253 s.scanIdentifier(ch) 254 255 literal = s.literal.String() 256 if _, e := ternary.ConvertFromString(literal); e == nil { 257 token = TERNARY 258 } else if t, e := s.searchKeyword(literal); e == nil { 259 token = rune(t) 260 } else if s.isAggregateFunctions(literal) { 261 token = AGGREGATE_FUNCTION 262 } else if s.isListaggFunctions(literal) { 263 token = LIST_FUNCTION 264 } else if s.isAnalyticFunctions(literal) { 265 token = ANALYTIC_FUNCTION 266 } else if s.isFunctionsNth(literal) { 267 token = FUNCTION_NTH 268 } else if s.isFunctionsWithIgnoreNulls(literal) { 269 token = FUNCTION_WITH_INS 270 } else { 271 if unicode.IsLetter(ch) && s.peek() == ':' { 272 if s.peekFurtherAhead(2) == ':' { 273 if s.peekNextLetter(3) == '(' { 274 s.next() 275 s.next() 276 token = TABLE_FUNCTION 277 } else { 278 s.literal.WriteRune(s.next()) 279 s.literal.WriteRune(s.next()) 280 err = s.scanConstant() 281 literal = s.literal.String() 282 token = CONSTANT 283 if err != nil { 284 token = Uncategorized 285 } 286 } 287 } else { 288 s.literal.WriteRune(s.next()) 289 s.scanUrl() 290 literal = s.literal.String() 291 token = URL 292 } 293 } else { 294 token = IDENTIFIER 295 } 296 } 297 case s.isOperatorRune(ch): 298 s.scanOperator(ch) 299 300 literal = s.literal.String() 301 if s.isComparisonOperators(literal) { 302 token = COMPARISON_OP 303 } else if s.isStringOperators(literal) { 304 token = STRING_OP 305 } else if literal == SubstitutionOperator { 306 token = SUBSTITUTION_OP 307 } else if 1 < len(literal) { 308 token = Uncategorized 309 } 310 case ch == VariableSign: 311 switch s.peek() { 312 case EnvironmentVariableSign: 313 s.next() 314 token = ENVIRONMENT_VARIABLE 315 case RuntimeInformationSign: 316 s.next() 317 token = RUNTIME_INFORMATION 318 case VariableSign: 319 s.next() 320 token = FLAG 321 default: 322 token = VARIABLE 323 } 324 325 if token == ENVIRONMENT_VARIABLE && s.peek() == '`' { 326 err = s.scanString(s.next()) 327 literal = option.UnescapeIdentifier(s.literal.String(), '`') 328 quoted = true 329 } else { 330 if s.isIdentRune(s.peek()) { 331 s.scanIdentifier(s.next()) 332 literal = s.literal.String() 333 } else { 334 literal = "" 335 } 336 } 337 338 if len(literal) < 1 { 339 err = errors.New("invalid variable symbol") 340 } 341 case ch == ExternalCommandSign: 342 s.scanExternalCommand() 343 literal = s.literal.String() 344 token = EXTERNAL_COMMAND 345 case s.isCommentRune(ch): 346 s.scanComment() 347 return s.Scan() 348 case s.isLineCommentRune(ch): 349 s.scanLineComment() 350 return s.Scan() 351 default: 352 if ch == '\'' || (!s.ansiQuotes && ch == '"') { 353 err = s.scanString(ch) 354 literal = option.UnescapeString(s.literal.String(), ch) 355 token = STRING 356 } else if ch == '`' || (s.ansiQuotes && ch == '"') { 357 err = s.scanString(ch) 358 literal = option.UnescapeIdentifier(s.literal.String(), ch) 359 token = IDENTIFIER 360 quoted = true 361 } 362 } 363 364 return Token{Token: int(token), Literal: literal, Quoted: quoted, Line: line, Char: char, SourceFile: s.sourceFile}, err 365 } 366 367 func (s *Scanner) scanString(quote rune) error { 368 s.literal.Reset() 369 370 for { 371 ch := s.next() 372 373 if ch == EOF { 374 return errors.New("literal not terminated") 375 } 376 377 if ch == quote { 378 if s.peek() == quote { 379 s.literal.WriteRune(ch) 380 ch = s.next() 381 } else { 382 break 383 } 384 } 385 386 if ch == '\\' { 387 switch s.peek() { 388 case '\\', quote: 389 s.literal.WriteRune(ch) 390 ch = s.next() 391 } 392 } 393 s.literal.WriteRune(ch) 394 } 395 return nil 396 } 397 398 func (s *Scanner) scanIdentifier(head rune) { 399 s.literal.Reset() 400 401 s.literal.WriteRune(head) 402 for s.isIdentRune(s.peek()) { 403 s.literal.WriteRune(s.next()) 404 } 405 } 406 407 func (s *Scanner) scanConstant() error { 408 if !s.isIdentRune(s.peek()) { 409 return errInvalidConstantSyntax 410 } 411 s.literal.WriteRune(s.next()) 412 for s.isIdentRune(s.peek()) { 413 s.literal.WriteRune(s.next()) 414 } 415 return nil 416 } 417 418 func (s *Scanner) scanUrl() int { 419 oldPos := s.srcPos 420 for !unicode.IsSpace(s.peek()) && !s.isRuneNotIncludedInUrl(s.peek()) && s.peek() != EOF { 421 s.literal.WriteRune(s.next()) 422 } 423 return s.srcPos - oldPos 424 } 425 426 func (s *Scanner) isRuneNotIncludedInUrl(ch rune) bool { 427 for _, r := range runesNotIncludedInUrl { 428 if r == ch { 429 return true 430 } 431 } 432 return false 433 } 434 435 func (s *Scanner) isIdentRune(ch rune) bool { 436 return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) 437 } 438 439 func (s *Scanner) isDecimal(ch rune) bool { 440 return '0' <= ch && ch <= '9' 441 } 442 443 func (s *Scanner) scanNumber(head rune) (rune, error) { 444 s.literal.Reset() 445 var numType rune = INTEGER 446 447 s.literal.WriteRune(head) 448 for s.isDecimal(s.peek()) { 449 s.literal.WriteRune(s.next()) 450 } 451 452 if s.peek() == '.' { 453 numType = FLOAT 454 455 s.literal.WriteRune(s.next()) 456 for s.isDecimal(s.peek()) { 457 s.literal.WriteRune(s.next()) 458 } 459 } 460 461 if s.peek() == 'e' || s.peek() == 'E' { 462 numType = FLOAT 463 464 s.literal.WriteRune(s.next()) 465 if s.peek() == '+' || s.peek() == '-' { 466 s.literal.WriteRune(s.next()) 467 } 468 for s.isDecimal(s.peek()) { 469 s.literal.WriteRune(s.next()) 470 } 471 } 472 473 if numType == INTEGER { 474 if _, err := strconv.ParseInt(s.literal.String(), 10, 64); err == nil { 475 return numType, nil 476 } 477 numType = FLOAT 478 } 479 480 if _, err := strconv.ParseFloat(s.literal.String(), 64); err == nil { 481 return numType, nil 482 } 483 484 return numType, errors.New(fmt.Sprintf("cound not convert %q to a number", s.literal.String())) 485 } 486 487 func (s *Scanner) scanOperator(head rune) { 488 s.literal.Reset() 489 490 s.literal.WriteRune(head) 491 for s.isOperatorRune(s.peek()) { 492 s.literal.WriteRune(s.next()) 493 } 494 } 495 496 func (s *Scanner) isOperatorRune(ch rune) bool { 497 switch ch { 498 case '=', '>', '<', '!', '|', ':': 499 return true 500 } 501 return false 502 } 503 504 func (s *Scanner) searchKeyword(str string) (int, error) { 505 for i := KeywordFrom; i <= KeywordTo; i++ { 506 if strings.EqualFold(TokenLiteral(i), str) { 507 return i, nil 508 } 509 } 510 return IDENTIFIER, errors.New(fmt.Sprintf("%q is not a keyword", str)) 511 } 512 513 func (s *Scanner) isAggregateFunctions(str string) bool { 514 for _, v := range aggregateFunctions { 515 if strings.EqualFold(v, str) { 516 return true 517 } 518 } 519 return false 520 } 521 522 func (s *Scanner) isListaggFunctions(str string) bool { 523 for _, v := range listFunctions { 524 if strings.EqualFold(v, str) { 525 return true 526 } 527 } 528 return false 529 } 530 531 func (s *Scanner) isAnalyticFunctions(str string) bool { 532 for _, v := range analyticFunctions { 533 if strings.EqualFold(v, str) { 534 return true 535 } 536 } 537 return false 538 } 539 540 func (s *Scanner) isFunctionsNth(str string) bool { 541 for _, v := range functionsNth { 542 if strings.EqualFold(v, str) { 543 return true 544 } 545 } 546 return false 547 } 548 549 func (s *Scanner) isFunctionsWithIgnoreNulls(str string) bool { 550 for _, v := range functionsWithIgnoreNulls { 551 if strings.EqualFold(v, str) { 552 return true 553 } 554 } 555 return false 556 } 557 558 func (s *Scanner) isComparisonOperators(str string) bool { 559 for _, v := range comparisonOperators { 560 if v == str { 561 return true 562 } 563 } 564 return false 565 } 566 567 func (s *Scanner) isStringOperators(str string) bool { 568 for _, v := range stringOperators { 569 if v == str { 570 return true 571 } 572 } 573 return false 574 } 575 576 func (s *Scanner) isCommentRune(ch rune) bool { 577 if ch == '/' && s.peek() == '*' { 578 s.next() 579 return true 580 } 581 return false 582 } 583 584 func (s *Scanner) scanComment() { 585 for { 586 ch := s.next() 587 if ch == EOF { 588 break 589 } else if ch == '*' { 590 if s.peek() == '/' { 591 s.next() 592 break 593 } 594 } 595 } 596 } 597 598 func (s *Scanner) isLineCommentRune(ch rune) bool { 599 if ch == '-' && s.peek() == '-' { 600 s.next() 601 return true 602 } 603 return false 604 } 605 606 func (s *Scanner) scanLineComment() { 607 for { 608 ch := s.peek() 609 if ch == '\r' || ch == '\n' || ch == EOF { 610 break 611 } 612 s.next() 613 } 614 } 615 616 func (s *Scanner) scanExternalCommand() { 617 s.literal.Reset() 618 619 for { 620 ch := s.peek() 621 if ch == ';' || ch == EOF { 622 break 623 } 624 625 s.literal.WriteRune(s.next()) 626 627 if ch == '"' || ch == '\'' || ch == '`' { 628 s.scanExternalCommandQuotedString(ch) 629 continue 630 } 631 632 if ch == ExternalCommandSign && s.peek() == BeginExpression { 633 s.literal.WriteRune(s.next()) 634 s.scanExternalCommandCSVQExpression() 635 } 636 } 637 } 638 639 func (s *Scanner) scanExternalCommandQuotedString(quote rune) { 640 for { 641 ch := s.peek() 642 643 if ch == EOF { 644 break 645 } 646 647 s.literal.WriteRune(s.next()) 648 649 if ch == quote { 650 break 651 } 652 653 if ch == '\\' { 654 switch s.peek() { 655 case '\\', quote: 656 s.literal.WriteRune(s.next()) 657 } 658 } 659 } 660 } 661 662 func (s *Scanner) scanExternalCommandCSVQExpression() { 663 for { 664 ch := s.peek() 665 666 if ch == EOF { 667 break 668 } 669 670 s.literal.WriteRune(s.next()) 671 672 if ch == EndExpression { 673 break 674 } 675 676 if ch == '\\' { 677 switch s.peek() { 678 case '\\', BeginExpression, EndExpression: 679 s.literal.WriteRune(s.next()) 680 } 681 } 682 } 683 }