github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/branch_control/expr_parser.go (about) 1 // Copyright 2022 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package branch_control 16 17 import ( 18 "math" 19 "sync" 20 "unicode/utf8" 21 22 flatbuffers "github.com/dolthub/flatbuffers/v23/go" 23 "github.com/dolthub/go-mysql-server/sql" 24 25 "github.com/dolthub/dolt/go/gen/fb/serial" 26 ) 27 28 const ( 29 singleMatch = -1 // Equivalent to the single match character '_' 30 anyMatch = -2 // Equivalent to the any-length match character '%' 31 columnMarker = -3 // Marks the start of a new column 32 ) 33 34 // invalidMatchExpression is a match expression that does not match anything 35 var invalidMatchExpression = MatchExpression{math.MaxUint32, nil} 36 37 // matchExprPool is a pool for MatchExpression slices. Provides a significant performance benefit. 38 var matchExprPool = &sync.Pool{ 39 New: func() any { 40 return make([]MatchExpression, 0, 32) 41 }, 42 } 43 44 // indexPool is a pool for index slices (such as those returned by Match). Provides a decent performance benefit. 45 var indexPool = &sync.Pool{ 46 New: func() any { 47 return make([]uint32, 0, 32) 48 }, 49 } 50 51 // MatchExpression represents a parsed expression that may be matched against. It contains a list of sort orders, which 52 // each represent a comparable value to determine whether any given character is a match. 
// A character's sort order is obtained from a collation. Also contains its index in the table. MatchExpression
// contents are not meant to be comparable to one another, therefore please use the index to compare equivalence.
type MatchExpression struct {
	CollectionIndex uint32  // CollectionIndex represents this expression's index in its parent slice.
	SortOrders      []int32 // These are the sort orders that will be compared against when matching a given rune.
}

// FoldExpression folds the given expression into its smallest form. Expressions have two wildcard operators:
// '_' and '%'. '_' matches exactly one character, and it can be any character. '%' can match zero or more of any
// character. Taking these two ops into account, the configurations "%_" and "_%" both resolve to matching one or more
// of any character. However, the "_%" form is more economical, as you enforce the single match first before checking
// for remaining matches. Similarly, "%%" is equivalent to a single '%'. Both of these rules are applied in this
// function, guaranteeing that the returned expression is the smallest form that still exactly represents the original.
//
// This also assumes that '\' is the escape character. An escaped rune (the rune directly following an unescaped '\')
// is always copied verbatim and never takes part in folding.
//
// The fold is done in a single pass. Inside any run of consecutive unescaped wildcards the folded form is every '_'
// of the run, in order, followed by one '%' if the run contained at least one '%'. It is therefore enough to emit
// each '_' immediately and to hold back a single pending '%' until the run ends. Repeatedly rewriting "%_" to "_%"
// and "%%" to "%" until nothing changes produces the same result, but needs one full pass over the string for every
// '_' that has to move in front of a '%'.
//
// The string is iterated rune by rune, so any invalid UTF-8 byte comes back as U+FFFD in the result.
func FoldExpression(str string) string {
	// The output never holds more runes than the input, and len(str) (a byte count) bounds the input rune count.
	folded := make([]rune, 0, len(str))
	// escaped is set right after an unescaped '\', meaning the next rune must be copied as-is.
	escaped := false
	// pendingAny is set while the current run of unescaped wildcards has seen a '%' that has not been written yet.
	pendingAny := false
	for _, r := range str {
		if escaped {
			// A pending '%' was already flushed when the '\' itself was written.
			escaped = false
			folded = append(folded, r)
			continue
		}
		switch r {
		case '%':
			// Collapses "%%" into a single '%', which is written once the run ends.
			pendingAny = true
		case '_':
			// Written ahead of any pending '%', which turns "%_" into "_%".
			folded = append(folded, r)
		default:
			// Any other rune (including the escape character) ends the wildcard run.
			if pendingAny {
				folded = append(folded, '%')
				pendingAny = false
			}
			folded = append(folded, r)
			escaped = r == '\\'
		}
	}
	// If the expression ends inside a wildcard run that contained '%', it still has to be written.
	if pendingAny {
		folded = append(folded, '%')
	}
	return string(folded)
}

// ParseExpression parses the given string expression into a slice of sort ints, which will be used in a MatchExpression.
// Returns nil if the string is too long. Assumes that the given string expression has already been folded.
124 func ParseExpression(str string, collation sql.CollationID) []int32 { 125 if len(str) > math.MaxUint16 { 126 return nil 127 } 128 129 sortFunc := collation.Sorter() 130 var orders []int32 131 escaped := false 132 for _, r := range str { 133 if escaped { 134 escaped = false 135 orders = append(orders, sortFunc(r)) 136 } else { 137 switch r { 138 case '\\': 139 escaped = true 140 case '%': 141 orders = append(orders, anyMatch) 142 case '_': 143 orders = append(orders, singleMatch) 144 default: 145 orders = append(orders, sortFunc(r)) 146 } 147 } 148 } 149 return orders 150 } 151 152 // Match takes the match expression collection, and returns a slice of which collection indexes matched against the 153 // given string. The given indices may be used to further reduce the match expression collection, which will also reduce 154 // the total number of comparisons as they're narrowed down. 155 // 156 // It is vastly more performant to return a slice of collection indexes here, rather than a slice of match expressions. 157 // This is true even when the match expressions are pooled. The reason is unknown, but as we only need the collection 158 // indexes anyway, we discard the match expressions and return only their indexes. 159 func Match(matchExprCollection []MatchExpression, str string, collation sql.CollationID) []uint32 { 160 sortFunc := collation.Sorter() 161 // Grab the first rune and also remove it from the string 162 r, rSize := utf8.DecodeRuneInString(str) 163 str = str[rSize:] 164 // Grab a slice from the pool, which reduces the GC pressure. 
165 matchSubset := matchExprPool.Get().([]MatchExpression)[:0] 166 // We do a pass using the first rune over all expressions to get the subset that we'll be testing against 167 for _, testExpr := range matchExprCollection { 168 if matched, next, extra := testExpr.Matches(sortFunc(r)); matched { 169 if extra.IsValid() { 170 matchSubset = append(matchSubset, next, extra) 171 } else { 172 matchSubset = append(matchSubset, next) 173 } 174 } 175 } 176 // Bail early if there are no matches here 177 if len(matchSubset) == 0 { 178 matchExprPool.Put(matchSubset) 179 // We return a slice from the index pool as we later will return it to the pool. We don't want to stick a 180 // nil/empty slice into the pool. 181 return indexPool.Get().([]uint32)[:0] 182 } 183 184 // This is the slice that we'll put matches into. This will also flip to become the match subset. This way we reuse 185 // the underlying arrays. We also grab this from the pool. 186 matches := matchExprPool.Get().([]MatchExpression)[:0] 187 // Now that we have our set of expressions to test, we loop over the remainder of the input string 188 for _, r = range str { 189 for _, testExpr := range matchSubset { 190 if matched, next, extra := testExpr.Matches(sortFunc(r)); matched { 191 if extra.IsValid() { 192 matches = append(matches, next, extra) 193 } else { 194 matches = append(matches, next) 195 } 196 } 197 } 198 // Swap the two, and put the slice of matches to be at the beginning of the previous subset array to reuse it 199 matches, matchSubset = matchSubset[:0], matches 200 } 201 matchExprPool.Put(matches) 202 203 // Grab the indices of all valid matches 204 validMatches := indexPool.Get().([]uint32)[:0] 205 for _, match := range matchSubset { 206 if match.IsAtEnd() && (len(validMatches) == 0 || 207 (len(validMatches) > 0 && match.CollectionIndex != validMatches[len(validMatches)-1])) { 208 validMatches = append(validMatches, match.CollectionIndex) 209 } 210 } 211 matchExprPool.Put(matchSubset) 212 return 
validMatches 213 } 214 215 // Matches returns true when the given sort order matches the expectation of the calling match expression. Returns a 216 // reduced match expression as `next`, which should take the place of the calling match function. In the event of a 217 // branch, returns the branching match expression as `extra`. 218 // 219 // Branches occur when the '%' operator sees that the given sort order matches the sort order after the '%'. As it 220 // cannot be determined which path is the correct one (whether to consume the '%' or continue using it), a branch is 221 // created. The `extra` should be checked for validity by calling IsValid. 222 func (matchExpr MatchExpression) Matches(sortOrder int32) (matched bool, next MatchExpression, extra MatchExpression) { 223 if len(matchExpr.SortOrders) == 0 { 224 return false, invalidMatchExpression, invalidMatchExpression 225 } 226 switch matchExpr.SortOrders[0] { 227 case singleMatch: 228 if sortOrder < singleMatch { 229 return false, invalidMatchExpression, invalidMatchExpression 230 } 231 return true, MatchExpression{matchExpr.CollectionIndex, matchExpr.SortOrders[1:]}, invalidMatchExpression 232 case anyMatch: 233 if len(matchExpr.SortOrders) > 1 && matchExpr.SortOrders[1] == sortOrder { 234 return true, matchExpr, MatchExpression{matchExpr.CollectionIndex, matchExpr.SortOrders[2:]} 235 } 236 return true, matchExpr, invalidMatchExpression 237 default: 238 if sortOrder == matchExpr.SortOrders[0] { 239 return true, MatchExpression{matchExpr.CollectionIndex, matchExpr.SortOrders[1:]}, invalidMatchExpression 240 } else { 241 return false, invalidMatchExpression, invalidMatchExpression 242 } 243 } 244 } 245 246 // IsValid returns whether the match expression is valid. An invalid MatchExpression will have a collection index that 247 // is at the maximum value for an uint32. 
248 func (matchExpr MatchExpression) IsValid() bool { 249 return matchExpr.CollectionIndex < math.MaxUint32 250 } 251 252 // IsAtEnd returns whether the match expression has matched every character. There is a special case where, if the last 253 // character is '%', it is considered to be at the end. 254 func (matchExpr MatchExpression) IsAtEnd() bool { 255 return len(matchExpr.SortOrders) == 0 || (len(matchExpr.SortOrders) == 1 && matchExpr.SortOrders[0] == anyMatch) 256 } 257 258 // Serialize returns the offset for the MatchExpression written to the given builder. 259 func (matchExpr MatchExpression) Serialize(b *flatbuffers.Builder) flatbuffers.UOffsetT { 260 _ = serial.BranchControlMatchExpressionStartSortOrdersVector(b, len(matchExpr.SortOrders)) 261 for i := len(matchExpr.SortOrders) - 1; i >= 0; i-- { 262 b.PrependInt32(matchExpr.SortOrders[i]) 263 } 264 sortOrdersOffset := b.EndVector(len(matchExpr.SortOrders)) 265 266 serial.BranchControlMatchExpressionStart(b) 267 serial.BranchControlMatchExpressionAddIndex(b, matchExpr.CollectionIndex) 268 serial.BranchControlMatchExpressionAddSortOrders(b, sortOrdersOffset) 269 return serial.BranchControlMatchExpressionEnd(b) 270 } 271 272 // deserializeMatchExpression populates the MatchExpression with the data from the flatbuffers representation. 273 func deserializeMatchExpression(fb *serial.BranchControlMatchExpression) MatchExpression { 274 matchExpr := MatchExpression{ 275 CollectionIndex: fb.Index(), 276 SortOrders: make([]int32, fb.SortOrdersLength()), 277 } 278 for i := 0; i < fb.SortOrdersLength(); i++ { 279 matchExpr.SortOrders[i] = fb.SortOrders(i) 280 } 281 return matchExpr 282 }