github.com/dolthub/go-mysql-server@v0.18.0/sql/expression/like.go (about) 1 // Copyright 2020-2021 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package expression 16 17 import ( 18 "bytes" 19 "fmt" 20 "regexp" 21 "strings" 22 "sync" 23 "unicode/utf8" 24 25 "github.com/dolthub/go-mysql-server/internal/regex" 26 "github.com/dolthub/go-mysql-server/sql" 27 "github.com/dolthub/go-mysql-server/sql/types" 28 ) 29 30 func newDefaultLikeMatcher(likeStr string) (regex.DisposableMatcher, error) { 31 return regex.NewDisposableMatcher("go", likeStr) 32 } 33 34 // Like performs pattern matching against two strings. 35 type Like struct { 36 BinaryExpressionStub 37 Escape sql.Expression 38 pool *sync.Pool 39 once sync.Once 40 cached bool 41 } 42 43 var _ sql.Expression = (*Like)(nil) 44 var _ sql.CollationCoercible = (*Like)(nil) 45 46 type likeMatcherErrTuple struct { 47 matcher LikeMatcher 48 err error 49 } 50 51 // NewLike creates a new LIKE expression. 52 func NewLike(left, right, escape sql.Expression) sql.Expression { 53 var cached = true 54 sql.Inspect(right, func(e sql.Expression) bool { 55 if _, ok := e.(*GetField); ok { 56 cached = false 57 } 58 return true 59 }) 60 61 return &Like{ 62 BinaryExpressionStub: BinaryExpressionStub{left, right}, 63 Escape: escape, 64 pool: nil, 65 once: sync.Once{}, 66 cached: cached, 67 } 68 } 69 70 // Type implements the sql.Expression interface. 71 func (l *Like) Type() sql.Type { return types.Boolean } 72 73 // CollationCoercibility implements the interface sql.CollationCoercible. 74 func (l *Like) CollationCoercibility(ctx *sql.Context) (collation sql.CollationID, coercibility byte) { 75 leftCollation, leftCoercibility := sql.GetCoercibility(ctx, l.LeftChild) 76 rightCollation, rightCoercibility := sql.GetCoercibility(ctx, l.RightChild) 77 return sql.ResolveCoercibility(leftCollation, leftCoercibility, rightCollation, rightCoercibility) 78 } 79 80 // Eval implements the sql.Expression interface. 81 func (l *Like) Eval(ctx *sql.Context, row sql.Row) (interface{}, error) { 82 span, ctx := ctx.Span("expression.Like") 83 defer span.End() 84 85 left, err := l.LeftChild.Eval(ctx, row) 86 if err != nil || left == nil { 87 return nil, err 88 } 89 if _, ok := left.(string); !ok { 90 left, _, err = types.LongText.Convert(left) 91 if err != nil { 92 return nil, err 93 } 94 } 95 96 var lm LikeMatcher 97 if !l.cached { 98 // for non-cached regex every time create a new matcher 99 right, escape, rerr := l.evalRight(ctx, row) 100 if rerr != nil { 101 return nil, rerr 102 } 103 if right == nil { 104 return nil, nil 105 } 106 collation, _ := l.CollationCoercibility(ctx) 107 lm, err = ConstructLikeMatcher(collation, *right, escape) 108 } else { 109 l.once.Do(func() { 110 right, escape, err := l.evalRight(ctx, row) 111 l.pool = &sync.Pool{ 112 New: func() interface{} { 113 if err != nil || right == nil { 114 return likeMatcherErrTuple{LikeMatcher{}, err} 115 } 116 collation, _ := l.CollationCoercibility(ctx) 117 m, e := ConstructLikeMatcher(collation, *right, escape) 118 return likeMatcherErrTuple{m, e} 119 }, 120 } 121 }) 122 tpl := l.pool.Get().(likeMatcherErrTuple) 123 lm, err = tpl.matcher, tpl.err 124 } 125 if err != nil { 126 return nil, err 127 } 128 if lm.collation == sql.Collation_Unspecified { 129 return false, nil 130 } 131 132 ok := lm.Match(left.(string)) 133 if l.cached { 134 l.pool.Put(likeMatcherErrTuple{lm, nil}) 135 } 136 return ok, nil 137 } 138 139 func (l *Like) evalRight(ctx *sql.Context, row sql.Row) (right *string, escape rune, err error) { 140 rightVal, err := l.RightChild.Eval(ctx, row) 141 if err != nil || rightVal == nil { 142 return nil, 0, err 143 } 144 if _, ok := rightVal.(string); !ok { 145 rightVal, _, err = types.LongText.Convert(rightVal) 146 if err != nil { 147 return nil, 0, err 148 } 149 } 150 151 var escapeVal interface{} 152 if l.Escape != nil { 153 escapeVal, err = l.Escape.Eval(ctx, row) 154 if err != nil { 155 return nil, 0, err 156 } 157 if escapeVal == nil { 158 escapeVal = `\` 159 } 160 if _, ok := escapeVal.(string); !ok { 161 escapeVal, _, err = types.LongText.Convert(escapeVal) 162 if err != nil { 163 return nil, 0, err 164 } 165 } 166 if utf8.RuneCountInString(escapeVal.(string)) > 1 { 167 return nil, 0, sql.ErrInvalidArgument.New("ESCAPE") 168 } 169 } else { 170 escapeVal = `\` 171 } 172 173 rightStr := rightVal.(string) 174 return &rightStr, []rune(escapeVal.(string))[0], nil 175 } 176 177 func (l *Like) String() string { 178 return fmt.Sprintf("%s LIKE %s", l.LeftChild, l.RightChild) 179 } 180 181 // WithChildren implements the Expression interface. 182 func (l *Like) WithChildren(children ...sql.Expression) (sql.Expression, error) { 183 if len(children) != 2 { 184 return nil, sql.ErrInvalidChildrenNumber.New(l, len(children), 2) 185 } 186 return NewLike(children[0], children[1], l.Escape), nil 187 } 188 189 func patternToGoRegex(pattern string) string { 190 var buf bytes.Buffer 191 buf.WriteString("(?s)") 192 buf.WriteRune('^') 193 var escaped bool 194 for _, r := range strings.Replace(regexp.QuoteMeta(pattern), `\\`, `\`, -1) { 195 switch r { 196 case '_': 197 if escaped { 198 buf.WriteRune(r) 199 } else { 200 buf.WriteRune('.') 201 } 202 case '%': 203 if escaped { 204 buf.WriteRune(r) 205 } else { 206 buf.WriteString(".*") 207 } 208 case '\\': 209 if escaped { 210 buf.WriteString(`\\`) 211 } else { 212 escaped = true 213 continue 214 } 215 default: 216 if escaped { 217 buf.WriteString(`\`) 218 } 219 buf.WriteRune(r) 220 } 221 222 if escaped { 223 escaped = false 224 } 225 } 226 227 buf.WriteRune('$') 228 return buf.String() 229 } 230 231 func patternToGoRegexWithEscape(pattern, escape string) string { 232 var buf bytes.Buffer 233 buf.WriteString("(?s)") 234 buf.WriteRune('^') 235 var escaped bool 236 237 for _, r := range strings.Replace(strings.Replace(regexp.QuoteMeta(pattern), `\\`, `\`, -1), regexp.QuoteMeta(escape), escape, -1) { 238 switch r { 239 case rune(escape[0]): 240 if escaped { 241 buf.WriteString(regexp.QuoteMeta(escape)) 242 } else { 243 escaped = true 244 continue 245 } 246 case '_': 247 if escaped { 248 buf.WriteRune(r) 249 } else { 250 buf.WriteRune('.') 251 } 252 case '%': 253 if escaped { 254 buf.WriteRune(r) 255 } else { 256 buf.WriteString(".*") 257 } 258 case '\\': 259 if escaped { 260 buf.WriteString(`\\`) 261 } else { 262 escaped = true 263 continue 264 } 265 default: 266 if escaped { 267 buf.WriteString(`\`) 268 } 269 buf.WriteRune(r) 270 } 271 272 if escaped { 273 escaped = false 274 } 275 } 276 277 buf.WriteRune('$') 278 return buf.String() 279 } 280 281 // LikeMatcher is a collation-supported matcher for LIKE expressions. 282 type LikeMatcher struct { 283 nodes []likeMatcherNode 284 collation sql.CollationID 285 escape rune 286 } 287 288 // ConstructLikeMatcher returns a new LikeMatcher. 289 func ConstructLikeMatcher(collation sql.CollationID, pattern string, escape rune) (LikeMatcher, error) { 290 charsetEncoder := collation.CharacterSet().Encoder() 291 sorter := collation.Sorter() 292 matcher := LikeMatcher{nil, collation, escape} 293 for i := 0; i < len(pattern); { 294 nextRune, advance := charsetEncoder.NextRune(pattern[i:]) 295 if nextRune == utf8.RuneError { 296 return LikeMatcher{}, sql.ErrCharSetInvalidString.New(collation.CharacterSet().Name(), pattern) 297 } 298 i += advance 299 300 switch nextRune { 301 case '_': // Matches any single character 302 matcher.nodes = append(matcher.nodes, likeMatcherRune{'_', -1}) 303 case '%': // Matches any sequence of characters, including the empty sequence 304 matcher.nodes = append(matcher.nodes, likeMatcherAny{}) 305 case escape: // States that the next character should be taken literally 306 nextRune, advance = charsetEncoder.NextRune(pattern[i:]) 307 if nextRune == utf8.RuneError { 308 return LikeMatcher{}, sql.ErrCharSetInvalidString.New(collation.CharacterSet().Name(), pattern) 309 } 310 i += advance 311 matcher.nodes = append(matcher.nodes, likeMatcherRune{nextRune, sorter(nextRune)}) 312 default: // A regular character that we'll match against 313 matcher.nodes = append(matcher.nodes, likeMatcherRune{nextRune, sorter(nextRune)}) 314 } 315 } 316 return matcher, nil 317 } 318 319 // Match returns whether the given string conforms to the nodes contained in this matcher. 320 func (l LikeMatcher) Match(s string) bool { 321 if len(l.nodes) == 0 { 322 if len(s) == 0 { 323 return true 324 } 325 return false 326 } 327 328 charsetEncoder := l.collation.CharacterSet().Encoder() 329 stringIndex := 0 330 nodeIndex := 0 331 nodeNextIndex := make([]int, 0, len(l.nodes)) 332 for { 333 // If both indexes equal their lengths, we've fully matched the string with all nodes 334 if stringIndex == len(s) && nodeIndex == len(l.nodes) { 335 return true 336 } 337 // If all nodes have found a match but we still have runes left in the string, we backtrack to allow earlier 338 // nodes to match more runes. If we're unable to backtrack, then the string does not match. 339 if stringIndex < len(s) && nodeIndex == len(l.nodes) { 340 var matched bool 341 matched, nodeIndex = l.backtrack(s, nodeIndex-1, nodeNextIndex) 342 if !matched { 343 return false 344 } 345 nodeNextIndex = nodeNextIndex[:nodeIndex] 346 stringIndex = nodeNextIndex[nodeIndex-1] 347 continue 348 } 349 // If all runes have found a match but we still have nodes left in the matcher, we check if the remaining nodes 350 // are all "any sequence" nodes. If they're not, then the string is too short and does not match. 351 if stringIndex == len(s) && nodeIndex < len(l.nodes) { 352 for ; nodeIndex < len(l.nodes); nodeIndex++ { 353 if _, ok := l.nodes[nodeIndex].(likeMatcherAny); !ok { 354 return false 355 } 356 } 357 return true 358 } 359 360 nextRune, advance := charsetEncoder.NextRune(s[stringIndex:]) 361 if nextRune == utf8.RuneError { 362 return false 363 } 364 matched, consumed := l.nodes[nodeIndex].Match(l.collation, nextRune) 365 if consumed { 366 stringIndex += advance 367 } 368 if matched { 369 nodeNextIndex = append(nodeNextIndex, stringIndex) 370 nodeIndex++ 371 } else { 372 // If we didn't match on this rune, we backtrack to allow earlier nodes to match more runes 373 matched, nodeIndex = l.backtrack(s, nodeIndex, nodeNextIndex) 374 if !matched { 375 return false 376 } 377 nodeNextIndex = nodeNextIndex[:nodeIndex] 378 stringIndex = nodeNextIndex[nodeIndex-1] 379 continue 380 } 381 } 382 // Must return something here to compile, but the above loop will handle all return cases 383 return false 384 } 385 386 // String returns the string form of this LIKE expression. If an Escape character was provided, it is used instead of 387 // the default. 388 func (l LikeMatcher) String() string { 389 sb := strings.Builder{} 390 for _, node := range l.nodes { 391 switch node := node.(type) { 392 case likeMatcherRune: 393 if node.original == '%' { 394 sb.WriteRune(l.escape) 395 sb.WriteRune('%') 396 } else if node.original == '_' { 397 if node.sortOrder != -1 { 398 sb.WriteRune(l.escape) 399 } 400 sb.WriteRune('_') 401 } else { 402 sb.WriteRune(node.original) 403 } 404 case likeMatcherAny: 405 sb.WriteRune('%') 406 } 407 } 408 return sb.String() 409 } 410 411 // backtrack unwinds the stack until we can find a node that can match the next rune compared to the rune that it last 412 // matched against. The returned node index is the index to use for the next match. 413 func (l LikeMatcher) backtrack(s string, nodeIndex int, nodeNextIndex []int) (matched bool, newNodeIndex int) { 414 charsetEncoder := l.collation.CharacterSet().Encoder() 415 // If the slice doesn't contain an entry for the node, then that node was never matched (and therefore we can't 416 // backtrack over it). 417 if nodeIndex >= len(nodeNextIndex) { 418 nodeIndex = len(nodeNextIndex) - 1 419 } 420 for ; nodeIndex >= 0; nodeIndex-- { 421 stringIndex := nodeNextIndex[nodeIndex] 422 nextRune, advance := charsetEncoder.NextRune(s[stringIndex:]) 423 if nextRune == utf8.RuneError { 424 return false, 0 425 } 426 if l.nodes[nodeIndex].MatchNext(l.collation, nextRune) { 427 nodeNextIndex[nodeIndex] = stringIndex + advance 428 return true, nodeIndex + 1 429 } 430 } 431 // We exhausted all nodes, no nodes may match further 432 return false, 0 433 } 434 435 // likeMatcherNode handles the match characteristics for a particular character from the pattern. 436 type likeMatcherNode interface { 437 // Match returns whether the given rune is matched on the initial match, and also whether this rune is consumed. If 438 // not consumed, the same rune will be given to the next node. It is assumed that consuming a rune always matches 439 // the rune. 440 Match(collation sql.CollationID, r rune) (matched bool, consumed bool) 441 // MatchNext returns whether the given rune is matched on a subsequent match. Only the first match may optionally 442 // consume a rune, all subsequent matches will consume the rune. 443 MatchNext(collation sql.CollationID, r rune) bool 444 } 445 446 // likeMatcherRune matches exactly one rune. If the sort order is negative, then this matches any rune (but still only 447 // a single rune). 448 type likeMatcherRune struct { 449 original rune 450 sortOrder int32 451 } 452 453 var _ likeMatcherNode = likeMatcherRune{} 454 455 // Match implements the interface likeMatcherNode. 456 func (l likeMatcherRune) Match(collation sql.CollationID, r rune) (matched bool, consumed bool) { 457 if l.sortOrder < 0 || collation.Sorter()(r) == l.sortOrder { 458 return true, true 459 } 460 return false, false 461 } 462 463 // MatchNext implements the interface likeMatcherNode. As this only matches a single rune, all subsequent matches will 464 // fail. 465 func (l likeMatcherRune) MatchNext(collation sql.CollationID, r rune) bool { 466 return false 467 } 468 469 // likeMatcherAny matches any sequence of characters, including the empty sequence. 470 type likeMatcherAny struct{} 471 472 var _ likeMatcherNode = likeMatcherAny{} 473 474 // Match implements the interface likeMatcherNode. This node is a reluctant matcher, meaning it attempts to match as few 475 // runes as possible. As this will always match the empty sequence first, we'll return true on the match, but will not 476 // consume the given rune. 477 func (l likeMatcherAny) Match(collation sql.CollationID, r rune) (matched bool, consumed bool) { 478 return true, false 479 } 480 481 // MatchNext implements the interface likeMatcherNode. 482 func (l likeMatcherAny) MatchNext(collation sql.CollationID, r rune) bool { 483 return true 484 }