github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/branch_control/expr_parser.go (about)

     1  // Copyright 2022 Dolthub, Inc.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package branch_control
    16  
    17  import (
    18  	"math"
    19  	"sync"
    20  	"unicode/utf8"
    21  
    22  	flatbuffers "github.com/dolthub/flatbuffers/v23/go"
    23  	"github.com/dolthub/go-mysql-server/sql"
    24  
    25  	"github.com/dolthub/dolt/go/gen/fb/serial"
    26  )
    27  
// Sentinel values that take the place of a collation sort order inside MatchExpression.SortOrders. ParseExpression
// emits singleMatch and anyMatch for unescaped wildcards, and Matches interprets them when testing a sort order.
// All of them are negative; presumably collation sort orders are never negative, so the two cannot collide —
// confirm against sql.CollationID.Sorter.
const (
	singleMatch  = -1 // Equivalent to the single match character '_', which matches exactly one character
	anyMatch     = -2 // Equivalent to the any-length match character '%', which matches zero or more characters
	columnMarker = -3 // Marks the start of a new column
)
    33  
    34  // invalidMatchExpression is a match expression that does not match anything
    35  var invalidMatchExpression = MatchExpression{math.MaxUint32, nil}
    36  
// matchExprPool is a pool for MatchExpression slices. Provides a significant performance benefit. Match takes its
// scratch slices from this pool and puts them back before returning. Freshly created slices are empty and have room
// for 32 expressions.
//
// NOTE(review): the pool stores slice values rather than pointers, so each Put has to box the slice header into an
// interface (staticcheck SA6002). Worth measuring whether pooling a pointer to the slice would be cheaper.
var matchExprPool = &sync.Pool{
	New: func() any {
		return make([]MatchExpression, 0, 32)
	},
}
    43  
// indexPool is a pool for index slices (such as those returned by Match). Provides a decent performance benefit.
// Match only ever takes slices out of this pool; presumably the callers of Match put the returned slice back once
// they are done with it — confirm against the call sites. Freshly created slices are empty and have room for 32
// indexes.
var indexPool = &sync.Pool{
	New: func() any {
		return make([]uint32, 0, 32)
	},
}
    50  
// MatchExpression represents a parsed expression that may be matched against. It contains a list of sort orders, which
// each represent a comparable value to determine whether any given character is a match. A character's sort order is
// obtained from a collation. Also contains its index in the table. MatchExpression contents are not meant to be
// comparable to one another, therefore please use the index to compare equivalence.
//
// Matching consumes the sort orders from the front: Matches returns expressions that keep the same CollectionIndex
// but hold a shorter tail of SortOrders. A CollectionIndex equal to the maximum uint32 value marks an invalid
// expression (see IsValid).
type MatchExpression struct {
	CollectionIndex uint32  // CollectionIndex represents this expression's index in its parent slice.
	SortOrders      []int32 // These are the sort orders that will be compared against when matching a given rune. Negative sentinel values stand in for the wildcards.
}
    59  
    60  // FoldExpression folds the given expression into its smallest form. Expressions have two wildcard operators:
    61  // '_' and '%'. '_' matches exactly one character, and it can be any character. '%' can match zero or more of any
    62  // character. Taking these two ops into account, the configurations "%_" and "_%" both resolve to matching one or more
    63  // of any character. However, the "_%" form is more economical, as you enforce the single match first before checking
    64  // for remaining matches. Similarly, "%%" is equivalent to a single '%'. Both of these rules are applied in this
    65  // function, guaranteeing that the returned expression is the smallest form that still exactly represents the original.
    66  //
    67  // This also assumes that '\' is the escape character.
    68  func FoldExpression(str string) string {
    69  	// This loop only terminates when we complete a run where no substitutions were made. Substitutions are applied
    70  	// linearly, therefore it's possible that one substitution may create an opportunity for another substitution.
    71  	// To keep the code simple, we continue looping until we have nothing more to do.
    72  	for true {
    73  		newStrRunes := make([]rune, 0, len(str))
    74  		// Skip next is set whenever we encounter the escape character, which is used to explicitly match against '_' and '%'
    75  		skipNext := false
    76  		// Consider next is set whenever we encounter an unescaped '%', indicating we may need to apply the substitutions
    77  		considerNext := false
    78  		for _, r := range str {
    79  			if skipNext {
    80  				skipNext = false
    81  				newStrRunes = append(newStrRunes, r)
    82  				continue
    83  			} else if considerNext {
    84  				considerNext = false
    85  				switch r {
    86  				case '\\':
    87  					newStrRunes = append(newStrRunes, '%', r) // False alarm, reinsert % before this rune
    88  					skipNext = true                           // We also need to ignore the next rune
    89  				case '_':
    90  					newStrRunes = append(newStrRunes, r, '%') // Replacing %_ with _%
    91  				case '%':
    92  					newStrRunes = append(newStrRunes, r) // Replacing %% with %
    93  				default:
    94  					newStrRunes = append(newStrRunes, '%', r) // False alarm, reinsert % before this rune
    95  				}
    96  				continue
    97  			}
    98  
    99  			switch r {
   100  			case '\\':
   101  				newStrRunes = append(newStrRunes, r)
   102  				skipNext = true
   103  			case '%':
   104  				considerNext = true
   105  			default:
   106  				newStrRunes = append(newStrRunes, r)
   107  			}
   108  		}
   109  		// If the very last rune is '%', then this will be true and we need to append it to the end
   110  		if considerNext {
   111  			newStrRunes = append(newStrRunes, '%')
   112  		}
   113  		newStr := string(newStrRunes)
   114  		if str == newStr {
   115  			break
   116  		}
   117  		str = newStr
   118  	}
   119  	return str
   120  }
   121  
   122  // ParseExpression parses the given string expression into a slice of sort ints, which will be used in a MatchExpression.
   123  // Returns nil if the string is too long. Assumes that the given string expression has already been folded.
   124  func ParseExpression(str string, collation sql.CollationID) []int32 {
   125  	if len(str) > math.MaxUint16 {
   126  		return nil
   127  	}
   128  
   129  	sortFunc := collation.Sorter()
   130  	var orders []int32
   131  	escaped := false
   132  	for _, r := range str {
   133  		if escaped {
   134  			escaped = false
   135  			orders = append(orders, sortFunc(r))
   136  		} else {
   137  			switch r {
   138  			case '\\':
   139  				escaped = true
   140  			case '%':
   141  				orders = append(orders, anyMatch)
   142  			case '_':
   143  				orders = append(orders, singleMatch)
   144  			default:
   145  				orders = append(orders, sortFunc(r))
   146  			}
   147  		}
   148  	}
   149  	return orders
   150  }
   151  
   152  // Match takes the match expression collection, and returns a slice of which collection indexes matched against the
   153  // given string. The given indices may be used to further reduce the match expression collection, which will also reduce
   154  // the total number of comparisons as they're narrowed down.
   155  //
   156  // It is vastly more performant to return a slice of collection indexes here, rather than a slice of match expressions.
   157  // This is true even when the match expressions are pooled. The reason is unknown, but as we only need the collection
   158  // indexes anyway, we discard the match expressions and return only their indexes.
   159  func Match(matchExprCollection []MatchExpression, str string, collation sql.CollationID) []uint32 {
   160  	sortFunc := collation.Sorter()
   161  	// Grab the first rune and also remove it from the string
   162  	r, rSize := utf8.DecodeRuneInString(str)
   163  	str = str[rSize:]
   164  	// Grab a slice from the pool, which reduces the GC pressure.
   165  	matchSubset := matchExprPool.Get().([]MatchExpression)[:0]
   166  	// We do a pass using the first rune over all expressions to get the subset that we'll be testing against
   167  	for _, testExpr := range matchExprCollection {
   168  		if matched, next, extra := testExpr.Matches(sortFunc(r)); matched {
   169  			if extra.IsValid() {
   170  				matchSubset = append(matchSubset, next, extra)
   171  			} else {
   172  				matchSubset = append(matchSubset, next)
   173  			}
   174  		}
   175  	}
   176  	// Bail early if there are no matches here
   177  	if len(matchSubset) == 0 {
   178  		matchExprPool.Put(matchSubset)
   179  		// We return a slice from the index pool as we later will return it to the pool. We don't want to stick a
   180  		// nil/empty slice into the pool.
   181  		return indexPool.Get().([]uint32)[:0]
   182  	}
   183  
   184  	// This is the slice that we'll put matches into. This will also flip to become the match subset. This way we reuse
   185  	// the underlying arrays. We also grab this from the pool.
   186  	matches := matchExprPool.Get().([]MatchExpression)[:0]
   187  	// Now that we have our set of expressions to test, we loop over the remainder of the input string
   188  	for _, r = range str {
   189  		for _, testExpr := range matchSubset {
   190  			if matched, next, extra := testExpr.Matches(sortFunc(r)); matched {
   191  				if extra.IsValid() {
   192  					matches = append(matches, next, extra)
   193  				} else {
   194  					matches = append(matches, next)
   195  				}
   196  			}
   197  		}
   198  		// Swap the two, and put the slice of matches to be at the beginning of the previous subset array to reuse it
   199  		matches, matchSubset = matchSubset[:0], matches
   200  	}
   201  	matchExprPool.Put(matches)
   202  
   203  	// Grab the indices of all valid matches
   204  	validMatches := indexPool.Get().([]uint32)[:0]
   205  	for _, match := range matchSubset {
   206  		if match.IsAtEnd() && (len(validMatches) == 0 ||
   207  			(len(validMatches) > 0 && match.CollectionIndex != validMatches[len(validMatches)-1])) {
   208  			validMatches = append(validMatches, match.CollectionIndex)
   209  		}
   210  	}
   211  	matchExprPool.Put(matchSubset)
   212  	return validMatches
   213  }
   214  
   215  // Matches returns true when the given sort order matches the expectation of the calling match expression. Returns a
   216  // reduced match expression as `next`, which should take the place of the calling match function. In the event of a
   217  // branch, returns the branching match expression as `extra`.
   218  //
   219  // Branches occur when the '%' operator sees that the given sort order matches the sort order after the '%'. As it
   220  // cannot be determined which path is the correct one (whether to consume the '%' or continue using it), a branch is
   221  // created. The `extra` should be checked for validity by calling IsValid.
   222  func (matchExpr MatchExpression) Matches(sortOrder int32) (matched bool, next MatchExpression, extra MatchExpression) {
   223  	if len(matchExpr.SortOrders) == 0 {
   224  		return false, invalidMatchExpression, invalidMatchExpression
   225  	}
   226  	switch matchExpr.SortOrders[0] {
   227  	case singleMatch:
   228  		if sortOrder < singleMatch {
   229  			return false, invalidMatchExpression, invalidMatchExpression
   230  		}
   231  		return true, MatchExpression{matchExpr.CollectionIndex, matchExpr.SortOrders[1:]}, invalidMatchExpression
   232  	case anyMatch:
   233  		if len(matchExpr.SortOrders) > 1 && matchExpr.SortOrders[1] == sortOrder {
   234  			return true, matchExpr, MatchExpression{matchExpr.CollectionIndex, matchExpr.SortOrders[2:]}
   235  		}
   236  		return true, matchExpr, invalidMatchExpression
   237  	default:
   238  		if sortOrder == matchExpr.SortOrders[0] {
   239  			return true, MatchExpression{matchExpr.CollectionIndex, matchExpr.SortOrders[1:]}, invalidMatchExpression
   240  		} else {
   241  			return false, invalidMatchExpression, invalidMatchExpression
   242  		}
   243  	}
   244  }
   245  
   246  // IsValid returns whether the match expression is valid. An invalid MatchExpression will have a collection index that
   247  // is at the maximum value for an uint32.
   248  func (matchExpr MatchExpression) IsValid() bool {
   249  	return matchExpr.CollectionIndex < math.MaxUint32
   250  }
   251  
   252  // IsAtEnd returns whether the match expression has matched every character. There is a special case where, if the last
   253  // character is '%', it is considered to be at the end.
   254  func (matchExpr MatchExpression) IsAtEnd() bool {
   255  	return len(matchExpr.SortOrders) == 0 || (len(matchExpr.SortOrders) == 1 && matchExpr.SortOrders[0] == anyMatch)
   256  }
   257  
   258  // Serialize returns the offset for the MatchExpression written to the given builder.
   259  func (matchExpr MatchExpression) Serialize(b *flatbuffers.Builder) flatbuffers.UOffsetT {
   260  	_ = serial.BranchControlMatchExpressionStartSortOrdersVector(b, len(matchExpr.SortOrders))
   261  	for i := len(matchExpr.SortOrders) - 1; i >= 0; i-- {
   262  		b.PrependInt32(matchExpr.SortOrders[i])
   263  	}
   264  	sortOrdersOffset := b.EndVector(len(matchExpr.SortOrders))
   265  
   266  	serial.BranchControlMatchExpressionStart(b)
   267  	serial.BranchControlMatchExpressionAddIndex(b, matchExpr.CollectionIndex)
   268  	serial.BranchControlMatchExpressionAddSortOrders(b, sortOrdersOffset)
   269  	return serial.BranchControlMatchExpressionEnd(b)
   270  }
   271  
   272  // deserializeMatchExpression populates the MatchExpression with the data from the flatbuffers representation.
   273  func deserializeMatchExpression(fb *serial.BranchControlMatchExpression) MatchExpression {
   274  	matchExpr := MatchExpression{
   275  		CollectionIndex: fb.Index(),
   276  		SortOrders:      make([]int32, fb.SortOrdersLength()),
   277  	}
   278  	for i := 0; i < fb.SortOrdersLength(); i++ {
   279  		matchExpr.SortOrders[i] = fb.SortOrders(i)
   280  	}
   281  	return matchExpr
   282  }