github.com/matrixorigin/matrixone@v0.7.0/pkg/vectorize/like/like.go (about)

     1  // Copyright 2021 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package like
    16  
    17  import (
    18  	"bytes"
    19  	"fmt"
    20  	"regexp"
    21  	"unicode/utf8"
    22  	"unsafe"
    23  
    24  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    25  	"github.com/matrixorigin/matrixone/pkg/container/nulls"
    26  )
    27  
const (
	// DEFAULT_ESCAPE_CHAR is the escape character this file uses when
	// interpreting LIKE patterns: the fast paths pass it to removeEscapeChar
	// so that the byte following it is taken literally.
	DEFAULT_ESCAPE_CHAR = '\\'
)
    31  
    32  // <source column> like 'rule'
    33  // XXX: rs here is the selection list.
    34  func BtSliceAndConst(xs []string, expr []byte, rs []bool) ([]bool, error) {
    35  	return BtSliceNullAndConst(xs, expr, nil, rs)
    36  }
    37  
    38  func isNotNull(n *nulls.Nulls, i uint64) bool {
    39  	if n == nil {
    40  		return true
    41  	}
    42  	return !n.Contains(i)
    43  }
    44  
    45  func removeEscapeChar(src []byte, escapeChar byte) []byte {
    46  	var target []byte
    47  	max := len(src)
    48  	for i := 0; i < max; i++ {
    49  		if src[i] == escapeChar && i+1 < max {
    50  			i = i + 1
    51  		}
    52  		target = append(target, src[i])
    53  	}
    54  	return target
    55  }
    56  
    57  func BtSliceNullAndConst(xs []string, expr []byte, ns *nulls.Nulls, rs []bool) ([]bool, error) {
    58  	// Opt Rule #1: if expr is empty string, only empty string like empty string.
    59  	n := uint32(len(expr))
    60  	if n == 0 {
    61  		for i, s := range xs {
    62  			rs[i] = isNotNull(ns, uint64(i)) && len(s) == 0
    63  		}
    64  		return rs, nil
    65  	}
    66  
    67  	// Opt Rule #2: anything matches %
    68  	if n == 1 && expr[0] == '%' {
    69  		for i := range xs {
    70  			rs[i] = isNotNull(ns, uint64(i))
    71  		}
    72  		return rs, nil
    73  	}
    74  
    75  	// Opt Rule #3: single char matches _.
    76  	// XXX in UTF8 world, should we do single RUNE matches _?
    77  	if n == 1 && expr[0] == '_' {
    78  		for i, s := range xs {
    79  			rs[i] = isNotNull(ns, uint64(i)) && len(s) == 1
    80  		}
    81  		return rs, nil
    82  	}
    83  
    84  	// Opt Rule #3.1: single char, no wild card, so it is a simple compare eq.
    85  	if n == 1 && expr[0] != '_' && expr[0] != '%' {
    86  		for i, s := range xs {
    87  			rs[i] = isNotNull(ns, uint64(i)) && len(s) == 1 && s[0] == expr[0]
    88  		}
    89  		return rs, nil
    90  	}
    91  
    92  	// Opt Rule #4.  [_%]somethingInBetween[_%]
    93  	if n > 1 && !bytes.ContainsAny(expr[1:len(expr)-1], "_%") {
    94  		c0 := expr[0]   // first character
    95  		c1 := expr[n-1] // last character
    96  		if n > 2 && expr[n-2] == DEFAULT_ESCAPE_CHAR {
    97  			c1 = DEFAULT_ESCAPE_CHAR
    98  		}
    99  		switch {
   100  		case !(c0 == '%' || c0 == '_') && !(c1 == '%' || c1 == '_'):
   101  			// Rule 4.1: no wild card, so it is a simple compare eq.
   102  			for i, s := range xs {
   103  				rs[i] = isNotNull(ns, uint64(i)) && uint32(len(s)) == n && bytes.Equal(expr, []byte(s))
   104  			}
   105  			return rs, nil
   106  		case c0 == '_' && !(c1 == '%' || c1 == '_'):
   107  			// Rule 4.2: _foobarzoo,
   108  			for i, s := range xs {
   109  				rs[i] = isNotNull(ns, uint64(i)) && uint32(len(s)) == n && bytes.Equal(expr[1:], []byte(s)[1:])
   110  			}
   111  			return rs, nil
   112  		case c0 == '%' && !(c1 == '%' || c1 == '_'):
   113  			// Rule 4.3, %foobarzoo, it turns into a suffix match.
   114  			suffix := removeEscapeChar(expr[1:], DEFAULT_ESCAPE_CHAR)
   115  			for i, s := range xs {
   116  				rs[i] = isNotNull(ns, uint64(i)) && bytes.HasSuffix([]byte(s), suffix)
   117  			}
   118  			return rs, nil
   119  		case c1 == '_' && !(c0 == '%' || c0 == '_'):
   120  			// Rule 4.4, foobarzoo_, it turns into eq ingoring last char.
   121  			prefix := removeEscapeChar(expr[:n-1], DEFAULT_ESCAPE_CHAR)
   122  			for i, s := range xs {
   123  				rs[i] = isNotNull(ns, uint64(i)) && uint32(len(s)) == n && bytes.Equal(prefix, []byte(s)[:n-1])
   124  			}
   125  			return rs, nil
   126  		case c1 == '%' && !(c0 == '%' || c0 == '_'):
   127  			// Rule 4.5 foobarzoo%, prefix match
   128  			prefix := removeEscapeChar(expr[:n-1], DEFAULT_ESCAPE_CHAR)
   129  			for i, s := range xs {
   130  				rs[i] = isNotNull(ns, uint64(i)) && bytes.HasPrefix([]byte(s), prefix)
   131  			}
   132  			return rs, nil
   133  		case c0 == '%' && c1 == '%':
   134  			// Rule 4.6 %foobarzoo%, now it is contains
   135  			substr := removeEscapeChar(expr[1:n-1], DEFAULT_ESCAPE_CHAR)
   136  			for i, s := range xs {
   137  				rs[i] = isNotNull(ns, uint64(i)) && bytes.Contains([]byte(s), substr)
   138  			}
   139  			return rs, nil
   140  		case c0 == '%' && c1 == '_':
   141  			// Rule 4.7 %foobarzoo_,
   142  			suffix := removeEscapeChar(expr[1:n-1], DEFAULT_ESCAPE_CHAR)
   143  			for i, s := range xs {
   144  				bs := []byte(s)
   145  				rs[i] = isNotNull(ns, uint64(i)) && len(s) > 0 && bytes.HasSuffix(bs[:len(bs)-1], suffix)
   146  			}
   147  			return rs, nil
   148  		case c0 == '_' && c1 == '%':
   149  			// Rule 4.8 _foobarzoo%
   150  			prefix := removeEscapeChar(expr[1:n-1], DEFAULT_ESCAPE_CHAR)
   151  			for i, s := range xs {
   152  				rs[i] = isNotNull(ns, uint64(i)) && len(s) > 0 && bytes.HasPrefix([]byte(s)[1:], prefix)
   153  			}
   154  			return rs, nil
   155  		}
   156  	}
   157  
   158  	// Done opt rules, fall back to regexp
   159  	reg, err := regexp.Compile(convert(expr))
   160  	if err != nil {
   161  		return nil, err
   162  	}
   163  	for i, s := range xs {
   164  		rs[i] = isNotNull(ns, uint64(i)) && reg.MatchString(s)
   165  	}
   166  	return rs, nil
   167  }
   168  
   169  // 'source' like 'rule'
   170  func BtConstAndConst(s string, expr []byte) (bool, error) {
   171  	ss := []string{s}
   172  	rs := []bool{false}
   173  	rs, err := BtSliceAndConst(ss, expr, rs)
   174  	if err != nil {
   175  		return false, err
   176  	}
   177  	return rs[0], nil
   178  }
   179  
   180  // <source column> like <rule column>
   181  func BtSliceAndSlice(xs []string, exprs [][]byte, rs []bool) ([]bool, error) {
   182  	if len(xs) != len(exprs) {
   183  		return nil, moerr.NewInternalErrorNoCtx("unexpected error when LIKE operator")
   184  	}
   185  
   186  	for i := range xs {
   187  		isLike, err := BtConstAndConst(xs[i], exprs[i])
   188  		if err != nil {
   189  			return nil, err
   190  		}
   191  		rs[i] = isLike
   192  	}
   193  	return rs, nil
   194  }
   195  
   196  // 'source' like <rule column>
   197  func BtConstAndSliceNull(p string, exprs [][]byte, ns *nulls.Nulls, rs []bool) ([]bool, error) {
   198  	for i, ex := range exprs {
   199  		rs[i] = false
   200  		if isNotNull(ns, uint64(i)) {
   201  			k, err := BtConstAndConst(p, ex)
   202  			if err != nil {
   203  				return nil, err
   204  			}
   205  			rs[i] = k
   206  		}
   207  	}
   208  	return rs, nil
   209  }
   210  
   211  // <source column may contains null> like
   212  func BtSliceNullAndSliceNull(xs []string, exprs [][]byte, ns *nulls.Nulls, rs []bool) ([]bool, error) {
   213  	for i := range xs {
   214  		rs[i] = false
   215  		if isNotNull(ns, uint64(i)) {
   216  			k, err := BtConstAndConst(xs[i], exprs[i])
   217  			if err != nil {
   218  				return nil, err
   219  			}
   220  			rs[i] = k
   221  		}
   222  	}
   223  	return rs, nil
   224  }
   225  
   226  func convert(expr []byte) string {
   227  	return fmt.Sprintf("^(?s:%s)$", replace(*(*string)(unsafe.Pointer(&expr))))
   228  }
   229  
   230  func replace(s string) string {
   231  	var oldCharactor rune
   232  
   233  	r := make([]byte, len(s)*2)
   234  	w := 0
   235  	start := 0
   236  	for len(s) > start {
   237  		character, wid := utf8.DecodeRuneInString(s[start:])
   238  		if oldCharactor == '\\' {
   239  			w += copy(r[w:], s[start:start+wid])
   240  			start += wid
   241  			oldCharactor = 0
   242  			continue
   243  		}
   244  		switch character {
   245  		case '_':
   246  			w += copy(r[w:], []byte{'.'})
   247  		case '%':
   248  			w += copy(r[w:], []byte{'.', '*'})
   249  		case '(':
   250  			w += copy(r[w:], []byte{'\\', '('})
   251  		case ')':
   252  			w += copy(r[w:], []byte{'\\', ')'})
   253  		case '\\':
   254  		default:
   255  			w += copy(r[w:], s[start:start+wid])
   256  		}
   257  		start += wid
   258  		oldCharactor = character
   259  	}
   260  	return string(r[:w])
   261  }