github.com/viant/toolbox@v0.34.5/tokenizer.go

github.com/viant/toolbox@v0.34.5/tokenizer.go (about)

     1  package toolbox
     2  
     3  import (
     4  	"fmt"
     5  	"strings"
     6  	"unicode"
     7  )
     8  
//Matcher represents a matcher that inspects input starting at the offset position
//and reports how much of it has been matched.
type Matcher interface {
	//Match matches input starting from offset and returns the matched size.
	//The Tokenizer in this file treats a result greater than zero as a successful match
	//and uses it as a length when slicing the input and advancing its index.
	Match(input string, offset int) (matched int)
}
    14  
//Token represents a scan result: a token type identifier together with the matched input fragment.
type Token struct {
	//Token is the token type identifier (the matched candidate, or the tokenizer's invalid/end-of-file identifier).
	Token   int
	//Matched is the fragment of the input that was matched; it is empty for invalid and end-of-file tokens.
	Matched string
}
    20  
//Tokenizer represents a token scanner that walks Input with the help of registered matchers.
type Tokenizer struct {
	//matchers maps a token type identifier to the matcher recognising it.
	matchers       map[int]Matcher
	//Input is the text being scanned.
	Input          string
	//Index is the current scan position in Input; it only advances on a successful match.
	Index          int
	//InvalidToken is the token type reported when a candidate does not match.
	InvalidToken   int
	//EndOfFileToken is the token type reported once Index has reached the end of Input.
	EndOfFileToken int
}
    29  
    30  //Nexts matches the first of the candidates
    31  func (t *Tokenizer) Nexts(candidates ...int) *Token {
    32  	for _, candidate := range candidates {
    33  		result := t.Next(candidate)
    34  		if result.Token != t.InvalidToken {
    35  			return result
    36  
    37  		}
    38  	}
    39  	return &Token{t.InvalidToken, ""}
    40  }
    41  
    42  //Next tries to match a candidate, it returns token if imatching is successful.
    43  func (t *Tokenizer) Next(candidate int) *Token {
    44  	offset := t.Index
    45  	if !(offset < len(t.Input)) {
    46  		return &Token{t.EndOfFileToken, ""}
    47  	}
    48  
    49  	if candidate == t.EndOfFileToken {
    50  		return &Token{t.InvalidToken, ""}
    51  	}
    52  	if matcher, ok := t.matchers[candidate]; ok {
    53  		matchedSize := matcher.Match(t.Input, offset)
    54  		if matchedSize > 0 {
    55  			t.Index = t.Index + matchedSize
    56  			return &Token{candidate, t.Input[offset : offset+matchedSize]}
    57  		}
    58  
    59  	} else {
    60  		panic(fmt.Sprintf("failed to lookup matcher for %v", candidate))
    61  	}
    62  	return &Token{t.InvalidToken, ""}
    63  }
    64  
    65  //NewTokenizer creates a new NewTokenizer, it takes input, invalidToken, endOfFileToeken, and matchers.
    66  func NewTokenizer(input string, invalidToken int, endOfFileToken int, matcher map[int]Matcher) *Tokenizer {
    67  	return &Tokenizer{
    68  		matchers:       matcher,
    69  		Input:          input,
    70  		Index:          0,
    71  		InvalidToken:   invalidToken,
    72  		EndOfFileToken: endOfFileToken,
    73  	}
    74  }
    75  
    76  //CharactersMatcher represents a matcher, that matches any of Chars.
    77  type CharactersMatcher struct {
    78  	Chars string //characters to be matched
    79  }
    80  
    81  //Match matches any characters defined in Chars in the input, returns 1 if character has been matched
    82  func (m CharactersMatcher) Match(input string, offset int) int {
    83  	var matched = 0
    84  	if offset >= len(input) {
    85  		return matched
    86  	}
    87  outer:
    88  	for _, r := range input[offset:] {
    89  		for _, candidate := range m.Chars {
    90  			if candidate == r {
    91  				matched++
    92  				continue outer
    93  			}
    94  		}
    95  		break
    96  	}
    97  	return matched
    98  }
    99  
   100  //NewCharactersMatcher creates a new character matcher
   101  func NewCharactersMatcher(chars string) Matcher {
   102  	return &CharactersMatcher{Chars: chars}
   103  }
   104  
   105  //EOFMatcher represents end of input matcher
   106  type EOFMatcher struct {
   107  }
   108  
   109  //Match returns 1 if end of input has been reached otherwise 0
   110  func (m EOFMatcher) Match(input string, offset int) int {
   111  	if offset+1 == len(input) {
   112  		return 1
   113  	}
   114  	return 0
   115  }
   116  
   117  //IntMatcher represents a matcher that finds any int in the input
   118  type IntMatcher struct{}
   119  
   120  //Match matches a literal in the input, it returns number of character matched.
   121  func (m IntMatcher) Match(input string, offset int) int {
   122  	var matched = 0
   123  	if offset >= len(input) {
   124  		return matched
   125  	}
   126  	for _, r := range input[offset:] {
   127  		if !unicode.IsDigit(r) {
   128  			break
   129  		}
   130  		matched++
   131  	}
   132  	return matched
   133  }
   134  
   135  //NewIntMatcher returns a new integer matcher
   136  func NewIntMatcher() Matcher {
   137  	return &IntMatcher{}
   138  }
   139  
   140  var dotRune = rune('.')
   141  var underscoreRune = rune('_')
   142  
   143  //LiteralMatcher represents a matcher that finds any literals in the input
   144  type LiteralMatcher struct{}
   145  
   146  //Match matches a literal in the input, it returns number of character matched.
   147  func (m LiteralMatcher) Match(input string, offset int) int {
   148  	var matched = 0
   149  	if offset >= len(input) {
   150  		return matched
   151  	}
   152  	for i, r := range input[offset:] {
   153  		if i == 0 {
   154  			if !unicode.IsLetter(r) {
   155  				break
   156  			}
   157  		} else if !(unicode.IsLetter(r) || unicode.IsDigit(r) || r == dotRune || r == underscoreRune) {
   158  			break
   159  		}
   160  		matched++
   161  	}
   162  	return matched
   163  }
   164  
   165  //LiteralMatcher represents a matcher that finds any literals in the input
   166  type IdMatcher struct{}
   167  
   168  //Match matches a literal in the input, it returns number of character matched.
   169  func (m IdMatcher) Match(input string, offset int) int {
   170  	var matched = 0
   171  	if offset >= len(input) {
   172  		return matched
   173  	}
   174  	for i, r := range input[offset:] {
   175  		if i == 0 {
   176  			if !(unicode.IsLetter(r) || unicode.IsDigit(r)) {
   177  				break
   178  			}
   179  		} else if !(unicode.IsLetter(r) || unicode.IsDigit(r) || r == dotRune || r == underscoreRune) {
   180  			break
   181  		}
   182  		matched++
   183  	}
   184  	return matched
   185  }
   186  
   187  //SequenceMatcher represents a matcher that finds any sequence until find provided terminators
   188  type SequenceMatcher struct {
   189  	Terminators            []string
   190  	CaseSensitive          bool
   191  	matchAllIfNoTerminator bool
   192  	runeTerminators        []rune
   193  }
   194  
   195  func (m *SequenceMatcher) hasTerminator(candidate string) bool {
   196  	var candidateLength = len(candidate)
   197  	for _, terminator := range m.Terminators {
   198  		terminatorLength := len(terminator)
   199  		if len(terminator) > candidateLength {
   200  			continue
   201  		}
   202  		if !m.CaseSensitive {
   203  			if strings.ToLower(terminator) == strings.ToLower(string(candidate[:terminatorLength])) {
   204  				return true
   205  			}
   206  		}
   207  		if terminator == string(candidate[:terminatorLength]) {
   208  			return true
   209  		}
   210  	}
   211  	return false
   212  }
   213  
   214  //Match matches a literal in the input, it returns number of character matched.
   215  func (m *SequenceMatcher) Match(input string, offset int) int {
   216  	var matched = 0
   217  	hasTerminator := false
   218  	if offset >= len(input) {
   219  		return matched
   220  	}
   221  	if len(m.runeTerminators) > 0 {
   222  		return m.matchSingleTerminator(input, offset)
   223  	}
   224  	var i = 0
   225  	for ; i < len(input)-offset; i++ {
   226  		if m.hasTerminator(string(input[offset+i:])) {
   227  			hasTerminator = true
   228  			break
   229  		}
   230  	}
   231  	if !hasTerminator && !m.matchAllIfNoTerminator {
   232  		return 0
   233  	}
   234  	return i
   235  }
   236  
   237  func (m *SequenceMatcher) matchSingleTerminator(input string, offset int) int {
   238  	matched := 0
   239  	hasTerminator := false
   240  outer:
   241  	for i, r := range input[offset:] {
   242  		for _, terminator := range m.runeTerminators {
   243  			terminator = unicode.ToLower(terminator)
   244  			if m.CaseSensitive {
   245  				r = unicode.ToLower(r)
   246  				terminator = unicode.ToLower(terminator)
   247  			}
   248  			if r == terminator {
   249  				hasTerminator = true
   250  				matched = i
   251  				break outer
   252  			}
   253  		}
   254  
   255  	}
   256  	if !hasTerminator && !m.matchAllIfNoTerminator {
   257  		return 0
   258  	}
   259  	return matched
   260  }
   261  
   262  //NewSequenceMatcher creates a new matcher that finds all sequence until find at least one of the provided terminators
   263  func NewSequenceMatcher(terminators ...string) Matcher {
   264  	result := &SequenceMatcher{
   265  		matchAllIfNoTerminator: true,
   266  		Terminators:            terminators,
   267  		runeTerminators:        []rune{},
   268  	}
   269  	for _, terminator := range terminators {
   270  		if len(terminator) != 1 {
   271  			result.runeTerminators = []rune{}
   272  			break
   273  		}
   274  		result.runeTerminators = append(result.runeTerminators, rune(terminator[0]))
   275  	}
   276  	return result
   277  }
   278  
   279  //NewTerminatorMatcher creates a new matcher that finds any sequence until find at least one of the provided terminators
   280  func NewTerminatorMatcher(terminators ...string) Matcher {
   281  	result := &SequenceMatcher{
   282  		Terminators:     terminators,
   283  		runeTerminators: []rune{},
   284  	}
   285  	for _, terminator := range terminators {
   286  		if len(terminator) != 1 {
   287  			result.runeTerminators = []rune{}
   288  			break
   289  		}
   290  		result.runeTerminators = append(result.runeTerminators, rune(terminator[0]))
   291  	}
   292  	return result
   293  }
   294  
   295  //remainingSequenceMatcher represents a matcher that matches all reamining input
   296  type remainingSequenceMatcher struct{}
   297  
   298  //Match matches a literal in the input, it returns number of character matched.
   299  func (m *remainingSequenceMatcher) Match(input string, offset int) (matched int) {
   300  	return len(input) - offset
   301  }
   302  
   303  //Creates a matcher that matches all remaining input
   304  func NewRemainingSequenceMatcher() Matcher {
   305  	return &remainingSequenceMatcher{}
   306  }
   307  
   308  //CustomIdMatcher represents a matcher that finds any literals with additional custom set of characters in the input
   309  type customIdMatcher struct {
   310  	Allowed map[rune]bool
   311  }
   312  
   313  func (m *customIdMatcher) isValid(r rune) bool {
   314  	if unicode.IsLetter(r) || unicode.IsDigit(r) {
   315  		return true
   316  	}
   317  	return m.Allowed[r]
   318  }
   319  
   320  //Match matches a literal in the input, it returns number of character matched.
   321  func (m *customIdMatcher) Match(input string, offset int) int {
   322  	var matched = 0
   323  	if offset >= len(input) {
   324  		return matched
   325  	}
   326  	for _, r := range input[offset:] {
   327  		if !m.isValid(r) {
   328  			break
   329  		}
   330  		matched++
   331  	}
   332  	return matched
   333  }
   334  
   335  //NewCustomIdMatcher creates new custom matcher
   336  func NewCustomIdMatcher(allowedChars ...string) Matcher {
   337  	var result = &customIdMatcher{
   338  		Allowed: make(map[rune]bool),
   339  	}
   340  	if len(allowedChars) == 1 && len(allowedChars[0]) > 0 {
   341  		for _, allowed := range allowedChars[0] {
   342  			result.Allowed[rune(allowed)] = true
   343  		}
   344  	}
   345  	for _, allowed := range allowedChars {
   346  		result.Allowed[rune(allowed[0])] = true
   347  	}
   348  	return result
   349  }
   350  
   351  //LiteralMatcher represents a matcher that finds any literals in the input
   352  type BodyMatcher struct {
   353  	Begin string
   354  	End   string
   355  }
   356  
   357  //Match matches a literal in the input, it returns number of character matched.
   358  func (m *BodyMatcher) Match(input string, offset int) (matched int) {
   359  	beginLen := len(m.Begin)
   360  	endLen := len(m.End)
   361  	uniEnclosed := m.Begin == m.End
   362  
   363  	if offset+beginLen >= len(input) {
   364  		return 0
   365  	}
   366  	if input[offset:offset+beginLen] != m.Begin {
   367  		return 0
   368  	}
   369  	var depth = 1
   370  	var i = 1
   371  	for ; i < len(input)-offset; i++ {
   372  		canCheckEnd := offset+i+endLen <= len(input)
   373  		if !canCheckEnd {
   374  			return 0
   375  		}
   376  		if !uniEnclosed {
   377  			canCheckBegin := offset+i+beginLen <= len(input)
   378  			if canCheckBegin {
   379  				if string(input[offset+i:offset+i+beginLen]) == m.Begin {
   380  					depth++
   381  				}
   382  			}
   383  		}
   384  		if string(input[offset+i:offset+i+endLen]) == m.End {
   385  			depth--
   386  		}
   387  		if depth == 0 {
   388  			i += endLen
   389  			break
   390  		}
   391  	}
   392  	return i
   393  }
   394  
   395  //NewBodyMatcher creates a new body matcher
   396  func NewBodyMatcher(begin, end string) Matcher {
   397  	return &BodyMatcher{Begin: begin, End: end}
   398  }
   399  
   400  // Parses SQL Begin End blocks
   401  func NewBlockMatcher(caseSensitive bool, sequenceStart string, sequenceTerminator string, nestedSequences []string, ignoredTerminators []string) Matcher {
   402  	return &BlockMatcher{
   403  		CaseSensitive:      caseSensitive,
   404  		SequenceStart:      sequenceStart,
   405  		SequenceTerminator: sequenceTerminator,
   406  		NestedSequences:    nestedSequences,
   407  		IgnoredTerminators: ignoredTerminators,
   408  	}
   409  }
   410  
   411  type BlockMatcher struct {
   412  	CaseSensitive      bool
   413  	SequenceStart      string
   414  	SequenceTerminator string
   415  	NestedSequences    []string
   416  	IgnoredTerminators []string
   417  }
   418  
   419  func (m *BlockMatcher) Match(input string, offset int) (matched int) {
   420  
   421  	sequenceStart := m.SequenceStart
   422  	terminator := m.SequenceTerminator
   423  	nestedSequences := m.NestedSequences
   424  	ignoredTerminators := m.IgnoredTerminators
   425  	in := input
   426  
   427  	starterLen := len(sequenceStart)
   428  	terminatorLen := len(terminator)
   429  
   430  	if !m.CaseSensitive {
   431  		sequenceStart = strings.ToLower(sequenceStart)
   432  		terminator = strings.ToLower(terminator)
   433  		for i, seq := range nestedSequences {
   434  			nestedSequences[i] = strings.ToLower(seq)
   435  		}
   436  		for i, term := range ignoredTerminators {
   437  			ignoredTerminators[i] = strings.ToLower(term)
   438  		}
   439  		in = strings.ToLower(input)
   440  	}
   441  
   442  	if offset+starterLen >= len(in) {
   443  		return 0
   444  	}
   445  	if in[offset:offset+starterLen] != sequenceStart {
   446  		return 0
   447  	}
   448  	var depth = 1
   449  	var i = 1
   450  	for ; i < len(in)-offset; i++ {
   451  		canCheckEnd := offset+i+terminatorLen <= len(in)
   452  		if !canCheckEnd {
   453  			return 0
   454  		}
   455  		canCheckBegin := offset+i+starterLen <= len(in)
   456  		if canCheckBegin {
   457  			beginning := in[offset+i : offset+i+starterLen]
   458  
   459  			if beginning == sequenceStart {
   460  				depth++
   461  			} else {
   462  				for _, nestedSeq := range nestedSequences {
   463  					nestedLen := len(nestedSeq)
   464  					if offset+i+nestedLen >= len(in) {
   465  						continue
   466  					}
   467  
   468  					beginning := in[offset+i : offset+i+nestedLen]
   469  					if beginning == nestedSeq {
   470  						depth++
   471  						break
   472  					}
   473  				}
   474  			}
   475  		}
   476  		ignored := false
   477  		for _, ignoredTerm := range ignoredTerminators {
   478  			termLen := len(ignoredTerm)
   479  			if offset+i+termLen >= len(in) {
   480  				continue
   481  			}
   482  
   483  			ending := in[offset+i : offset+i+termLen]
   484  			if ending == ignoredTerm {
   485  				ignored = true
   486  				break
   487  			}
   488  		}
   489  		if !ignored && in[offset+i:offset+i+terminatorLen] == terminator && unicode.IsSpace(rune(in[offset+i-1])) {
   490  			depth--
   491  		}
   492  		if depth == 0 {
   493  			i += terminatorLen
   494  			break
   495  		}
   496  	}
   497  	return i
   498  }
   499  
   500  //KeywordMatcher represents a keyword matcher
   501  type KeywordMatcher struct {
   502  	Keyword       string
   503  	CaseSensitive bool
   504  }
   505  
   506  //Match matches keyword in the input,  it returns number of character matched.
   507  func (m KeywordMatcher) Match(input string, offset int) (matched int) {
   508  	if !(offset+len(m.Keyword)-1 < len(input)) {
   509  		return 0
   510  	}
   511  	if m.CaseSensitive {
   512  		if input[offset:offset+len(m.Keyword)] == m.Keyword {
   513  			return len(m.Keyword)
   514  		}
   515  	} else {
   516  		if strings.ToLower(input[offset:offset+len(m.Keyword)]) == strings.ToLower(m.Keyword) {
   517  			return len(m.Keyword)
   518  		}
   519  	}
   520  	return 0
   521  }
   522  
   523  //KeywordsMatcher represents a matcher that finds any of specified keywords in the input
   524  type KeywordsMatcher struct {
   525  	Keywords      []string
   526  	CaseSensitive bool
   527  }
   528  
   529  //Match matches any specified keyword,  it returns number of character matched.
   530  func (m KeywordsMatcher) Match(input string, offset int) (matched int) {
   531  	for _, keyword := range m.Keywords {
   532  		if len(input)-offset < len(keyword) {
   533  			continue
   534  		}
   535  		if m.CaseSensitive {
   536  			if input[offset:offset+len(keyword)] == keyword {
   537  				return len(keyword)
   538  			}
   539  		} else {
   540  			if strings.ToLower(input[offset:offset+len(keyword)]) == strings.ToLower(keyword) {
   541  				return len(keyword)
   542  			}
   543  		}
   544  	}
   545  	return 0
   546  }
   547  
   548  //NewKeywordsMatcher returns a matcher for supplied keywords
   549  func NewKeywordsMatcher(caseSensitive bool, keywords ...string) Matcher {
   550  	return &KeywordsMatcher{CaseSensitive: caseSensitive, Keywords: keywords}
   551  }
   552  
   553  //IllegalTokenError represents illegal token error
   554  type IllegalTokenError struct {
   555  	Illegal  *Token
   556  	Message  string
   557  	Expected []int
   558  	Position int
   559  }
   560  
   561  func (e *IllegalTokenError) Error() string {
   562  	return fmt.Sprintf("%v; illegal token at %v [%v], expected %v, but had: %v", e.Message, e.Position, e.Illegal.Matched, e.Expected, e.Illegal.Token)
   563  }
   564  
   565  //NewIllegalTokenError create a new illegal token error
   566  func NewIllegalTokenError(message string, expected []int, position int, found *Token) error {
   567  	return &IllegalTokenError{
   568  		Message:  message,
   569  		Illegal:  found,
   570  		Expected: expected,
   571  		Position: position,
   572  	}
   573  }
   574  
   575  //ExpectTokenOptionallyFollowedBy returns second matched token or error if first and second group was not matched
   576  func ExpectTokenOptionallyFollowedBy(tokenizer *Tokenizer, first int, errorMessage string, second ...int) (*Token, error) {
   577  	_, _ = ExpectToken(tokenizer, "", first)
   578  	return ExpectToken(tokenizer, errorMessage, second...)
   579  }
   580  
   581  //ExpectToken returns the matched token or error
   582  func ExpectToken(tokenizer *Tokenizer, errorMessage string, candidates ...int) (*Token, error) {
   583  	token := tokenizer.Nexts(candidates...)
   584  	hasMatch := HasSliceAnyElements(candidates, token.Token)
   585  	if !hasMatch {
   586  		return nil, NewIllegalTokenError(errorMessage, candidates, tokenizer.Index, token)
   587  	}
   588  	return token, nil
   589  }