github.com/viant/toolbox@v0.34.5/tokenizer.go (about) 1 package toolbox 2 3 import ( 4 "fmt" 5 "strings" 6 "unicode" 7 ) 8 9 //Matcher represents a matcher, that matches input from offset position, it returns number of characters matched. 10 type Matcher interface { 11 //Match matches input starting from offset, it return number of characters matched 12 Match(input string, offset int) (matched int) 13 } 14 15 //Token a matchable input 16 type Token struct { 17 Token int 18 Matched string 19 } 20 21 //Tokenizer represents a token scanner. 22 type Tokenizer struct { 23 matchers map[int]Matcher 24 Input string 25 Index int 26 InvalidToken int 27 EndOfFileToken int 28 } 29 30 //Nexts matches the first of the candidates 31 func (t *Tokenizer) Nexts(candidates ...int) *Token { 32 for _, candidate := range candidates { 33 result := t.Next(candidate) 34 if result.Token != t.InvalidToken { 35 return result 36 37 } 38 } 39 return &Token{t.InvalidToken, ""} 40 } 41 42 //Next tries to match a candidate, it returns token if imatching is successful. 43 func (t *Tokenizer) Next(candidate int) *Token { 44 offset := t.Index 45 if !(offset < len(t.Input)) { 46 return &Token{t.EndOfFileToken, ""} 47 } 48 49 if candidate == t.EndOfFileToken { 50 return &Token{t.InvalidToken, ""} 51 } 52 if matcher, ok := t.matchers[candidate]; ok { 53 matchedSize := matcher.Match(t.Input, offset) 54 if matchedSize > 0 { 55 t.Index = t.Index + matchedSize 56 return &Token{candidate, t.Input[offset : offset+matchedSize]} 57 } 58 59 } else { 60 panic(fmt.Sprintf("failed to lookup matcher for %v", candidate)) 61 } 62 return &Token{t.InvalidToken, ""} 63 } 64 65 //NewTokenizer creates a new NewTokenizer, it takes input, invalidToken, endOfFileToeken, and matchers. 66 func NewTokenizer(input string, invalidToken int, endOfFileToken int, matcher map[int]Matcher) *Tokenizer { 67 return &Tokenizer{ 68 matchers: matcher, 69 Input: input, 70 Index: 0, 71 InvalidToken: invalidToken, 72 EndOfFileToken: endOfFileToken, 73 } 74 } 75 76 //CharactersMatcher represents a matcher, that matches any of Chars. 77 type CharactersMatcher struct { 78 Chars string //characters to be matched 79 } 80 81 //Match matches any characters defined in Chars in the input, returns 1 if character has been matched 82 func (m CharactersMatcher) Match(input string, offset int) int { 83 var matched = 0 84 if offset >= len(input) { 85 return matched 86 } 87 outer: 88 for _, r := range input[offset:] { 89 for _, candidate := range m.Chars { 90 if candidate == r { 91 matched++ 92 continue outer 93 } 94 } 95 break 96 } 97 return matched 98 } 99 100 //NewCharactersMatcher creates a new character matcher 101 func NewCharactersMatcher(chars string) Matcher { 102 return &CharactersMatcher{Chars: chars} 103 } 104 105 //EOFMatcher represents end of input matcher 106 type EOFMatcher struct { 107 } 108 109 //Match returns 1 if end of input has been reached otherwise 0 110 func (m EOFMatcher) Match(input string, offset int) int { 111 if offset+1 == len(input) { 112 return 1 113 } 114 return 0 115 } 116 117 //IntMatcher represents a matcher that finds any int in the input 118 type IntMatcher struct{} 119 120 //Match matches a literal in the input, it returns number of character matched. 121 func (m IntMatcher) Match(input string, offset int) int { 122 var matched = 0 123 if offset >= len(input) { 124 return matched 125 } 126 for _, r := range input[offset:] { 127 if !unicode.IsDigit(r) { 128 break 129 } 130 matched++ 131 } 132 return matched 133 } 134 135 //NewIntMatcher returns a new integer matcher 136 func NewIntMatcher() Matcher { 137 return &IntMatcher{} 138 } 139 140 var dotRune = rune('.') 141 var underscoreRune = rune('_') 142 143 //LiteralMatcher represents a matcher that finds any literals in the input 144 type LiteralMatcher struct{} 145 146 //Match matches a literal in the input, it returns number of character matched. 147 func (m LiteralMatcher) Match(input string, offset int) int { 148 var matched = 0 149 if offset >= len(input) { 150 return matched 151 } 152 for i, r := range input[offset:] { 153 if i == 0 { 154 if !unicode.IsLetter(r) { 155 break 156 } 157 } else if !(unicode.IsLetter(r) || unicode.IsDigit(r) || r == dotRune || r == underscoreRune) { 158 break 159 } 160 matched++ 161 } 162 return matched 163 } 164 165 //LiteralMatcher represents a matcher that finds any literals in the input 166 type IdMatcher struct{} 167 168 //Match matches a literal in the input, it returns number of character matched. 169 func (m IdMatcher) Match(input string, offset int) int { 170 var matched = 0 171 if offset >= len(input) { 172 return matched 173 } 174 for i, r := range input[offset:] { 175 if i == 0 { 176 if !(unicode.IsLetter(r) || unicode.IsDigit(r)) { 177 break 178 } 179 } else if !(unicode.IsLetter(r) || unicode.IsDigit(r) || r == dotRune || r == underscoreRune) { 180 break 181 } 182 matched++ 183 } 184 return matched 185 } 186 187 //SequenceMatcher represents a matcher that finds any sequence until find provided terminators 188 type SequenceMatcher struct { 189 Terminators []string 190 CaseSensitive bool 191 matchAllIfNoTerminator bool 192 runeTerminators []rune 193 } 194 195 func (m *SequenceMatcher) hasTerminator(candidate string) bool { 196 var candidateLength = len(candidate) 197 for _, terminator := range m.Terminators { 198 terminatorLength := len(terminator) 199 if len(terminator) > candidateLength { 200 continue 201 } 202 if !m.CaseSensitive { 203 if strings.ToLower(terminator) == strings.ToLower(string(candidate[:terminatorLength])) { 204 return true 205 } 206 } 207 if terminator == string(candidate[:terminatorLength]) { 208 return true 209 } 210 } 211 return false 212 } 213 214 //Match matches a literal in the input, it returns number of character matched. 215 func (m *SequenceMatcher) Match(input string, offset int) int { 216 var matched = 0 217 hasTerminator := false 218 if offset >= len(input) { 219 return matched 220 } 221 if len(m.runeTerminators) > 0 { 222 return m.matchSingleTerminator(input, offset) 223 } 224 var i = 0 225 for ; i < len(input)-offset; i++ { 226 if m.hasTerminator(string(input[offset+i:])) { 227 hasTerminator = true 228 break 229 } 230 } 231 if !hasTerminator && !m.matchAllIfNoTerminator { 232 return 0 233 } 234 return i 235 } 236 237 func (m *SequenceMatcher) matchSingleTerminator(input string, offset int) int { 238 matched := 0 239 hasTerminator := false 240 outer: 241 for i, r := range input[offset:] { 242 for _, terminator := range m.runeTerminators { 243 terminator = unicode.ToLower(terminator) 244 if m.CaseSensitive { 245 r = unicode.ToLower(r) 246 terminator = unicode.ToLower(terminator) 247 } 248 if r == terminator { 249 hasTerminator = true 250 matched = i 251 break outer 252 } 253 } 254 255 } 256 if !hasTerminator && !m.matchAllIfNoTerminator { 257 return 0 258 } 259 return matched 260 } 261 262 //NewSequenceMatcher creates a new matcher that finds all sequence until find at least one of the provided terminators 263 func NewSequenceMatcher(terminators ...string) Matcher { 264 result := &SequenceMatcher{ 265 matchAllIfNoTerminator: true, 266 Terminators: terminators, 267 runeTerminators: []rune{}, 268 } 269 for _, terminator := range terminators { 270 if len(terminator) != 1 { 271 result.runeTerminators = []rune{} 272 break 273 } 274 result.runeTerminators = append(result.runeTerminators, rune(terminator[0])) 275 } 276 return result 277 } 278 279 //NewTerminatorMatcher creates a new matcher that finds any sequence until find at least one of the provided terminators 280 func NewTerminatorMatcher(terminators ...string) Matcher { 281 result := &SequenceMatcher{ 282 Terminators: terminators, 283 runeTerminators: []rune{}, 284 } 285 for _, terminator := range terminators { 286 if len(terminator) != 1 { 287 result.runeTerminators = []rune{} 288 break 289 } 290 result.runeTerminators = append(result.runeTerminators, rune(terminator[0])) 291 } 292 return result 293 } 294 295 //remainingSequenceMatcher represents a matcher that matches all reamining input 296 type remainingSequenceMatcher struct{} 297 298 //Match matches a literal in the input, it returns number of character matched. 299 func (m *remainingSequenceMatcher) Match(input string, offset int) (matched int) { 300 return len(input) - offset 301 } 302 303 //Creates a matcher that matches all remaining input 304 func NewRemainingSequenceMatcher() Matcher { 305 return &remainingSequenceMatcher{} 306 } 307 308 //CustomIdMatcher represents a matcher that finds any literals with additional custom set of characters in the input 309 type customIdMatcher struct { 310 Allowed map[rune]bool 311 } 312 313 func (m *customIdMatcher) isValid(r rune) bool { 314 if unicode.IsLetter(r) || unicode.IsDigit(r) { 315 return true 316 } 317 return m.Allowed[r] 318 } 319 320 //Match matches a literal in the input, it returns number of character matched. 321 func (m *customIdMatcher) Match(input string, offset int) int { 322 var matched = 0 323 if offset >= len(input) { 324 return matched 325 } 326 for _, r := range input[offset:] { 327 if !m.isValid(r) { 328 break 329 } 330 matched++ 331 } 332 return matched 333 } 334 335 //NewCustomIdMatcher creates new custom matcher 336 func NewCustomIdMatcher(allowedChars ...string) Matcher { 337 var result = &customIdMatcher{ 338 Allowed: make(map[rune]bool), 339 } 340 if len(allowedChars) == 1 && len(allowedChars[0]) > 0 { 341 for _, allowed := range allowedChars[0] { 342 result.Allowed[rune(allowed)] = true 343 } 344 } 345 for _, allowed := range allowedChars { 346 result.Allowed[rune(allowed[0])] = true 347 } 348 return result 349 } 350 351 //LiteralMatcher represents a matcher that finds any literals in the input 352 type BodyMatcher struct { 353 Begin string 354 End string 355 } 356 357 //Match matches a literal in the input, it returns number of character matched. 358 func (m *BodyMatcher) Match(input string, offset int) (matched int) { 359 beginLen := len(m.Begin) 360 endLen := len(m.End) 361 uniEnclosed := m.Begin == m.End 362 363 if offset+beginLen >= len(input) { 364 return 0 365 } 366 if input[offset:offset+beginLen] != m.Begin { 367 return 0 368 } 369 var depth = 1 370 var i = 1 371 for ; i < len(input)-offset; i++ { 372 canCheckEnd := offset+i+endLen <= len(input) 373 if !canCheckEnd { 374 return 0 375 } 376 if !uniEnclosed { 377 canCheckBegin := offset+i+beginLen <= len(input) 378 if canCheckBegin { 379 if string(input[offset+i:offset+i+beginLen]) == m.Begin { 380 depth++ 381 } 382 } 383 } 384 if string(input[offset+i:offset+i+endLen]) == m.End { 385 depth-- 386 } 387 if depth == 0 { 388 i += endLen 389 break 390 } 391 } 392 return i 393 } 394 395 //NewBodyMatcher creates a new body matcher 396 func NewBodyMatcher(begin, end string) Matcher { 397 return &BodyMatcher{Begin: begin, End: end} 398 } 399 400 // Parses SQL Begin End blocks 401 func NewBlockMatcher(caseSensitive bool, sequenceStart string, sequenceTerminator string, nestedSequences []string, ignoredTerminators []string) Matcher { 402 return &BlockMatcher{ 403 CaseSensitive: caseSensitive, 404 SequenceStart: sequenceStart, 405 SequenceTerminator: sequenceTerminator, 406 NestedSequences: nestedSequences, 407 IgnoredTerminators: ignoredTerminators, 408 } 409 } 410 411 type BlockMatcher struct { 412 CaseSensitive bool 413 SequenceStart string 414 SequenceTerminator string 415 NestedSequences []string 416 IgnoredTerminators []string 417 } 418 419 func (m *BlockMatcher) Match(input string, offset int) (matched int) { 420 421 sequenceStart := m.SequenceStart 422 terminator := m.SequenceTerminator 423 nestedSequences := m.NestedSequences 424 ignoredTerminators := m.IgnoredTerminators 425 in := input 426 427 starterLen := len(sequenceStart) 428 terminatorLen := len(terminator) 429 430 if !m.CaseSensitive { 431 sequenceStart = strings.ToLower(sequenceStart) 432 terminator = strings.ToLower(terminator) 433 for i, seq := range nestedSequences { 434 nestedSequences[i] = strings.ToLower(seq) 435 } 436 for i, term := range ignoredTerminators { 437 ignoredTerminators[i] = strings.ToLower(term) 438 } 439 in = strings.ToLower(input) 440 } 441 442 if offset+starterLen >= len(in) { 443 return 0 444 } 445 if in[offset:offset+starterLen] != sequenceStart { 446 return 0 447 } 448 var depth = 1 449 var i = 1 450 for ; i < len(in)-offset; i++ { 451 canCheckEnd := offset+i+terminatorLen <= len(in) 452 if !canCheckEnd { 453 return 0 454 } 455 canCheckBegin := offset+i+starterLen <= len(in) 456 if canCheckBegin { 457 beginning := in[offset+i : offset+i+starterLen] 458 459 if beginning == sequenceStart { 460 depth++ 461 } else { 462 for _, nestedSeq := range nestedSequences { 463 nestedLen := len(nestedSeq) 464 if offset+i+nestedLen >= len(in) { 465 continue 466 } 467 468 beginning := in[offset+i : offset+i+nestedLen] 469 if beginning == nestedSeq { 470 depth++ 471 break 472 } 473 } 474 } 475 } 476 ignored := false 477 for _, ignoredTerm := range ignoredTerminators { 478 termLen := len(ignoredTerm) 479 if offset+i+termLen >= len(in) { 480 continue 481 } 482 483 ending := in[offset+i : offset+i+termLen] 484 if ending == ignoredTerm { 485 ignored = true 486 break 487 } 488 } 489 if !ignored && in[offset+i:offset+i+terminatorLen] == terminator && unicode.IsSpace(rune(in[offset+i-1])) { 490 depth-- 491 } 492 if depth == 0 { 493 i += terminatorLen 494 break 495 } 496 } 497 return i 498 } 499 500 //KeywordMatcher represents a keyword matcher 501 type KeywordMatcher struct { 502 Keyword string 503 CaseSensitive bool 504 } 505 506 //Match matches keyword in the input, it returns number of character matched. 507 func (m KeywordMatcher) Match(input string, offset int) (matched int) { 508 if !(offset+len(m.Keyword)-1 < len(input)) { 509 return 0 510 } 511 if m.CaseSensitive { 512 if input[offset:offset+len(m.Keyword)] == m.Keyword { 513 return len(m.Keyword) 514 } 515 } else { 516 if strings.ToLower(input[offset:offset+len(m.Keyword)]) == strings.ToLower(m.Keyword) { 517 return len(m.Keyword) 518 } 519 } 520 return 0 521 } 522 523 //KeywordsMatcher represents a matcher that finds any of specified keywords in the input 524 type KeywordsMatcher struct { 525 Keywords []string 526 CaseSensitive bool 527 } 528 529 //Match matches any specified keyword, it returns number of character matched. 530 func (m KeywordsMatcher) Match(input string, offset int) (matched int) { 531 for _, keyword := range m.Keywords { 532 if len(input)-offset < len(keyword) { 533 continue 534 } 535 if m.CaseSensitive { 536 if input[offset:offset+len(keyword)] == keyword { 537 return len(keyword) 538 } 539 } else { 540 if strings.ToLower(input[offset:offset+len(keyword)]) == strings.ToLower(keyword) { 541 return len(keyword) 542 } 543 } 544 } 545 return 0 546 } 547 548 //NewKeywordsMatcher returns a matcher for supplied keywords 549 func NewKeywordsMatcher(caseSensitive bool, keywords ...string) Matcher { 550 return &KeywordsMatcher{CaseSensitive: caseSensitive, Keywords: keywords} 551 } 552 553 //IllegalTokenError represents illegal token error 554 type IllegalTokenError struct { 555 Illegal *Token 556 Message string 557 Expected []int 558 Position int 559 } 560 561 func (e *IllegalTokenError) Error() string { 562 return fmt.Sprintf("%v; illegal token at %v [%v], expected %v, but had: %v", e.Message, e.Position, e.Illegal.Matched, e.Expected, e.Illegal.Token) 563 } 564 565 //NewIllegalTokenError create a new illegal token error 566 func NewIllegalTokenError(message string, expected []int, position int, found *Token) error { 567 return &IllegalTokenError{ 568 Message: message, 569 Illegal: found, 570 Expected: expected, 571 Position: position, 572 } 573 } 574 575 //ExpectTokenOptionallyFollowedBy returns second matched token or error if first and second group was not matched 576 func ExpectTokenOptionallyFollowedBy(tokenizer *Tokenizer, first int, errorMessage string, second ...int) (*Token, error) { 577 _, _ = ExpectToken(tokenizer, "", first) 578 return ExpectToken(tokenizer, errorMessage, second...) 579 } 580 581 //ExpectToken returns the matched token or error 582 func ExpectToken(tokenizer *Tokenizer, errorMessage string, candidates ...int) (*Token, error) { 583 token := tokenizer.Nexts(candidates...) 584 hasMatch := HasSliceAnyElements(candidates, token.Token) 585 if !hasMatch { 586 return nil, NewIllegalTokenError(errorMessage, candidates, tokenizer.Index, token) 587 } 588 return token, nil 589 }