github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/compile/fuzzyCheck.go (about)

     1  // Copyright 2023 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package compile
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"encoding/hex"
    21  	"fmt"
    22  	"strconv"
    23  	"strings"
    24  	"time"
    25  
    26  	"github.com/matrixorigin/matrixone/pkg/common/reuse"
    27  
    28  	"github.com/matrixorigin/matrixone/pkg/catalog"
    29  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    30  	"github.com/matrixorigin/matrixone/pkg/container/batch"
    31  	"github.com/matrixorigin/matrixone/pkg/container/nulls"
    32  	"github.com/matrixorigin/matrixone/pkg/container/types"
    33  	"github.com/matrixorigin/matrixone/pkg/container/vector"
    34  	"github.com/matrixorigin/matrixone/pkg/logutil"
    35  	"github.com/matrixorigin/matrixone/pkg/pb/plan"
    36  )
    37  
    38  /*
    39  fuzzyCheck use to contains some info to run a background SQL when
    40  fuzzy filter can not draw a definite conclusion for duplicate check
    41  */
    42  
    43  func newFuzzyCheck(n *plan.Node) (*fuzzyCheck, error) {
    44  	tblName := n.TableDef.GetName()
    45  	dbName := n.ObjRef.GetSchemaName()
    46  
    47  	if tblName == "" || dbName == "" {
    48  		return nil, moerr.NewInternalErrorNoCtx("fuzzyfilter failed to get the db/tbl name")
    49  	}
    50  
    51  	f := reuse.Alloc[fuzzyCheck](nil)
    52  	f.tbl = tblName
    53  	f.db = dbName
    54  	f.attr = n.TableDef.Pkey.PkeyColName
    55  
    56  	for _, c := range n.TableDef.Cols {
    57  		if c.Name == n.TableDef.Pkey.PkeyColName {
    58  			f.col = c
    59  		}
    60  	}
    61  
    62  	// compound key could be primary key(a, b, c...) or unique key(a, b, c ...)
    63  	// for Decimal type, we need colDef to get the scale
    64  	if n.TableDef.Pkey.PkeyColName == catalog.CPrimaryKeyColName {
    65  		f.isCompound = true
    66  		f.compoundCols = f.sortColDef(n.TableDef.Pkey.Names, n.TableDef.Cols)
    67  	}
    68  
    69  	// for the case like create unique index for existed table,
    70  	// We can only get the table definition of the hidden table.
    71  	// How the original table defines this unique index (for example, which columns are used and whether it is composite) can NOT be confirmed,
    72  	// that introduces some strange logic, and obscures the meaning of some fields, such as fuzzyCheck.isCompound
    73  	if catalog.IsHiddenTable(tblName) && n.Fuzzymessage == nil {
    74  		f.onlyInsertHidden = true
    75  	}
    76  
    77  	if n.Fuzzymessage != nil {
    78  		if len(n.Fuzzymessage.ParentUniqueCols) > 1 {
    79  			f.isCompound = true
    80  			f.tbl = n.Fuzzymessage.ParentTableName
    81  			f.compoundCols = n.Fuzzymessage.ParentUniqueCols
    82  		} else {
    83  			f.col = n.Fuzzymessage.ParentUniqueCols[0]
    84  		}
    85  	}
    86  
    87  	return f, nil
    88  }
    89  
    90  func (f fuzzyCheck) TypeName() string {
    91  	return "compile.fuzzyCheck"
    92  }
    93  
    94  func (f *fuzzyCheck) reset() {
    95  	f.db = ""
    96  	f.tbl = ""
    97  	f.attr = ""
    98  	f.condition = ""
    99  	f.isCompound = false
   100  	f.onlyInsertHidden = false
   101  	f.col = nil
   102  	f.compoundCols = nil
   103  	f.cnt = 0
   104  }
   105  
   106  func (f *fuzzyCheck) release() {
   107  	reuse.Free[fuzzyCheck](f, nil)
   108  }
   109  
   110  // fill will generate condition for background SQL to check if duplicate constraint is satisfied
   111  // for compound primary key, the condtion will look like : "attr_1=x1 and attr_2=y1 and ... attr_n = yn or attr1=x2 and attr=y2 ... "
   112  // and its background SQL will look like : select pkAttrs, count(*) as cnt from db.tbl where **cont** having cnt > 1;
   113  //
   114  // for non compound primary key, cond will look like : "x1, x2, x3, ..."
   115  // and its background SQL will look like :  select pkAttr, count(*) as cnt from db.tbl where pkAttr in (**cond**) having cnt > 1;
   116  //
   117  // for more info, refer func backgroundSQLCheck
   118  func (f *fuzzyCheck) fill(ctx context.Context, bat *batch.Batch) error {
   119  	var collision [][]string
   120  	var err error
   121  
   122  	toCheck := bat.GetVector(0)
   123  	f.cnt = bat.RowCount()
   124  
   125  	f.isCompound, err = f.recheckIfCompound(toCheck)
   126  	if err != nil {
   127  		return err
   128  	}
   129  
   130  	if !f.onlyInsertHidden {
   131  		if err := f.firstlyCheck(ctx, toCheck); err != nil {
   132  			return err // fail to pass duplicate constraint
   133  		}
   134  	}
   135  
   136  	collision, err = f.genCollsionKeys(toCheck)
   137  	if err != nil {
   138  		return err
   139  	}
   140  
   141  	// generate codition used in background SQL
   142  	if !f.onlyInsertHidden {
   143  		if !f.isCompound {
   144  			f.condition = strings.Join(collision[0], ", ")
   145  		} else {
   146  			// not using __mo_cpkey_col search directly because efficiency considerations,
   147  			// HOWEVER This part of the code is still redundant, because currently plan can not support sql like
   148  			//
   149  			// SELECT pk1, pk2, COUNT(*) AS cnt
   150  			// FROM tbl
   151  			// WHERE (pk1, pk2) IN ((1, 1), (2, 1))
   152  			// GROUP BY pk1, pk2
   153  			// HAVING cnt > 1;
   154  			//
   155  			// otherwise, the code will be much more cleaner
   156  			var all bytes.Buffer
   157  			var one bytes.Buffer
   158  			var i int
   159  			var j int
   160  
   161  			lastRow := len(collision[0]) - 1
   162  
   163  			cAttrs := make([]string, len(f.compoundCols))
   164  			for k, c := range f.compoundCols {
   165  				if c == nil {
   166  					panic("compoundCols should not have nil element")
   167  				}
   168  				cAttrs[k] = c.Name
   169  			}
   170  
   171  			for i = 0; i < lastRow; i++ {
   172  				one.Reset()
   173  				one.WriteByte('(')
   174  				// one compound primary key has multiple conditions, use "and" to join them
   175  				for j = 0; j < len(cAttrs)-1; j++ {
   176  					one.WriteString(fmt.Sprintf("%s = %s and ", cAttrs[j], collision[j][i]))
   177  				}
   178  
   179  				// the last condition does not need to be followed by "and"
   180  				one.WriteString(fmt.Sprintf("%s = %s", cAttrs[j], collision[j][i]))
   181  				one.WriteByte(')')
   182  				if _, err = one.WriteTo(&all); err != nil {
   183  					return err
   184  				}
   185  
   186  				// use or join each compound primary keys
   187  				all.WriteString(" or ")
   188  			}
   189  
   190  			// if only have one collision key, there will no "or", same as the last collision key
   191  			for j = 0; j < len(cAttrs)-1; j++ {
   192  				one.WriteString(fmt.Sprintf("%s = %s and ", cAttrs[j], collision[j][lastRow]))
   193  			}
   194  
   195  			one.WriteString(fmt.Sprintf("%s = %s", cAttrs[j], collision[j][lastRow]))
   196  			if _, err = one.WriteTo(&all); err != nil {
   197  				return err
   198  			}
   199  			f.condition = all.String()
   200  		}
   201  	} else {
   202  		keys := collision[0]
   203  		f.condition = strings.Join(keys, ", ")
   204  	}
   205  
   206  	return nil
   207  }
   208  
   209  func (f *fuzzyCheck) firstlyCheck(ctx context.Context, toCheck *vector.Vector) error {
   210  	kcnt := make(map[string]int)
   211  
   212  	if !f.isCompound {
   213  		pkey, err := f.format(toCheck)
   214  		if err != nil {
   215  			return err
   216  		}
   217  		for _, k := range pkey {
   218  			kcnt[k]++
   219  		}
   220  	} else {
   221  		for i := 0; i < toCheck.Length(); i++ {
   222  			b := toCheck.GetRawBytesAt(i)
   223  			t, err := types.Unpack(b)
   224  			if err != nil {
   225  				return err
   226  			}
   227  			scales := make([]int32, len(f.compoundCols))
   228  			for i, c := range f.compoundCols {
   229  				if c == nil {
   230  					panic("compoundCols should not have nil element")
   231  				}
   232  				scales[i] = c.Typ.Scale
   233  			}
   234  			es := t.ErrString(scales)
   235  			kcnt[es]++
   236  		}
   237  	}
   238  
   239  	// firstly check if contains duplicate
   240  	for k, cnt := range kcnt {
   241  		if cnt > 1 {
   242  			ds, e := strconv.Unquote(k)
   243  			if e != nil {
   244  				return moerr.NewDuplicateEntry(ctx, k, f.attr)
   245  			} else {
   246  				return moerr.NewDuplicateEntry(ctx, ds, f.attr)
   247  			}
   248  		}
   249  	}
   250  	return nil
   251  }
   252  
   253  // genCollsionKeys return [][]string to store the string of collsion keys, it will check if
   254  // collision keys are duplicates with each other, if do dup, no need to run background SQL
   255  func (f *fuzzyCheck) genCollsionKeys(toCheck *vector.Vector) ([][]string, error) {
   256  	var keys [][]string
   257  	if !f.isCompound {
   258  		keys = make([][]string, 1)
   259  	} else {
   260  		keys = make([][]string, len(f.compoundCols))
   261  	}
   262  
   263  	if !f.onlyInsertHidden {
   264  		if !f.isCompound {
   265  			pkey, err := f.format(toCheck)
   266  			if err != nil {
   267  				return nil, err
   268  			}
   269  			keys[0] = pkey
   270  		} else {
   271  			scales := make([]int32, len(f.compoundCols))
   272  			for i, c := range f.compoundCols {
   273  				scales[i] = c.Typ.Scale
   274  			}
   275  			for i := 0; i < toCheck.Length(); i++ {
   276  				b := toCheck.GetRawBytesAt(i)
   277  				t, err := types.Unpack(b)
   278  				if err != nil {
   279  					return nil, err
   280  				}
   281  				s := t.SQLStrings(scales)
   282  				for j := 0; j < len(s); j++ {
   283  					keys[j] = append(keys[j], s[j])
   284  				}
   285  			}
   286  		}
   287  	} else {
   288  		pkey, err := f.format(toCheck)
   289  		if err != nil {
   290  			return nil, err
   291  		}
   292  		keys[0] = pkey
   293  	}
   294  
   295  	return keys, nil
   296  }
   297  
   298  // backgroundSQLCheck launches a background SQL to check if there are any duplicates
   299  func (f *fuzzyCheck) backgroundSQLCheck(c *Compile) error {
   300  	var duplicateCheckSql string
   301  
   302  	if !f.onlyInsertHidden {
   303  		if !f.isCompound {
   304  			duplicateCheckSql = fmt.Sprintf(fuzzyNonCompoundCheck, f.attr, f.db, f.tbl, f.attr, f.condition, f.attr)
   305  		} else {
   306  			cAttrs := make([]string, len(f.compoundCols))
   307  			for k, c := range f.compoundCols {
   308  				cAttrs[k] = c.Name
   309  			}
   310  			attrs := strings.Join(cAttrs, ", ")
   311  			duplicateCheckSql = fmt.Sprintf(fuzzyCompoundCheck, attrs, f.db, f.tbl, f.condition, attrs)
   312  		}
   313  	} else {
   314  		duplicateCheckSql = fmt.Sprintf(fuzzyNonCompoundCheck, f.attr, f.db, f.tbl, f.attr, f.condition, f.attr)
   315  	}
   316  
   317  	res, err := c.runSqlWithResult(duplicateCheckSql)
   318  	if err != nil {
   319  		logutil.Errorf("The sql that caused the fuzzy check background SQL failed is %s, and generated background sql is %s", c.sql, duplicateCheckSql)
   320  		return err
   321  	}
   322  	defer res.Close()
   323  
   324  	if res.Batches != nil {
   325  		vs := res.Batches[0].Vecs
   326  		if vs != nil && vs[0].Length() > 0 { // do dup
   327  			toCheck := vs[0]
   328  			if !f.isCompound {
   329  				f.adjustDecimalScale(toCheck)
   330  				if dupKey, e := f.format(toCheck); e != nil {
   331  					err = e
   332  				} else {
   333  					ds, e := strconv.Unquote(dupKey[0])
   334  					if e != nil {
   335  						err = moerr.NewDuplicateEntry(c.ctx, dupKey[0], f.attr)
   336  					} else {
   337  						err = moerr.NewDuplicateEntry(c.ctx, ds, f.attr)
   338  					}
   339  				}
   340  			} else {
   341  				if t, e := types.Unpack(toCheck.GetBytesAt(0)); e != nil {
   342  					err = e
   343  				} else {
   344  					scales := make([]int32, len(f.compoundCols))
   345  					for i, col := range f.compoundCols {
   346  						if col != nil {
   347  							scales[i] = col.Typ.Scale
   348  						} else {
   349  							scales[i] = 0
   350  						}
   351  					}
   352  					err = moerr.NewDuplicateEntry(c.ctx, t.ErrString(scales), f.attr)
   353  				}
   354  			}
   355  		}
   356  	}
   357  
   358  	return err
   359  }
   360  
   361  // -----------------------------utils-----------------------------------
   362  
   363  // make sure that the attr sort by define way
   364  func (f *fuzzyCheck) sortColDef(toSelect []string, cols []*plan.ColDef) []*plan.ColDef {
   365  	ccols := make([]*plan.ColDef, len(toSelect))
   366  	nmap := make(map[string]int)
   367  	for i, n := range toSelect {
   368  		nmap[n] = i
   369  	}
   370  	for _, c := range cols {
   371  		if i, ok := nmap[c.Name]; ok {
   372  			ccols[i] = c
   373  		}
   374  	}
   375  	return ccols
   376  }
   377  
   378  // format format strings from vector
   379  func (f *fuzzyCheck) format(toCheck *vector.Vector) ([]string, error) {
   380  	var ss []string
   381  	typ := toCheck.GetType()
   382  
   383  	for i := 0; i < toCheck.Length(); i++ {
   384  		s, err := vectorToString(toCheck, i)
   385  		if err != nil {
   386  			return nil, err
   387  		}
   388  		ss = append(ss, s)
   389  	}
   390  
   391  	if f.isCompound {
   392  		for i, s := range ss {
   393  			ss[i] = "unhex('" + hex.EncodeToString([]byte(s)) + "')"
   394  		}
   395  		return ss, nil
   396  	}
   397  
   398  	// for table that like CREATE TABLE t1( b CHAR(10), PRIMARY KEY);
   399  	// with : insert into t1 values('ab'), ('ab');
   400  	// background SQL condition should be b='ab' instead of b=ab, as well as time types
   401  	switch typ.Oid {
   402  	// date and time
   403  	case types.T_date, types.T_time, types.T_datetime, types.T_timestamp:
   404  		for i, str := range ss {
   405  			ss[i] = strconv.Quote(str)
   406  		}
   407  		return ss, nil
   408  
   409  	// string family but not include binary
   410  	case types.T_char, types.T_varchar, types.T_varbinary, types.T_text, types.T_uuid, types.T_binary:
   411  		for i, str := range ss {
   412  			ss[i] = strconv.Quote(str)
   413  		}
   414  		return ss, nil
   415  	default:
   416  		return ss, nil
   417  	}
   418  }
   419  
   420  func vectorToString(vec *vector.Vector, rowIndex int) (string, error) {
   421  	if nulls.Any(vec.GetNulls()) {
   422  		return "", nil
   423  	}
   424  	switch vec.GetType().Oid {
   425  	case types.T_bool:
   426  		flag := vector.GetFixedAt[bool](vec, rowIndex)
   427  		if flag {
   428  			return "true", nil
   429  		}
   430  		return "false", nil
   431  	case types.T_bit:
   432  		return fmt.Sprintf("%v", vector.GetFixedAt[uint64](vec, rowIndex)), nil
   433  	case types.T_int8:
   434  		return fmt.Sprintf("%v", vector.GetFixedAt[int8](vec, rowIndex)), nil
   435  	case types.T_int16:
   436  		return fmt.Sprintf("%v", vector.GetFixedAt[int16](vec, rowIndex)), nil
   437  	case types.T_int32:
   438  		return fmt.Sprintf("%v", vector.GetFixedAt[int32](vec, rowIndex)), nil
   439  	case types.T_int64:
   440  		return fmt.Sprintf("%v", vector.GetFixedAt[int64](vec, rowIndex)), nil
   441  	case types.T_uint8:
   442  		return fmt.Sprintf("%v", vector.GetFixedAt[uint8](vec, rowIndex)), nil
   443  	case types.T_uint16:
   444  		return fmt.Sprintf("%v", vector.GetFixedAt[uint16](vec, rowIndex)), nil
   445  	case types.T_uint32:
   446  		return fmt.Sprintf("%v", vector.GetFixedAt[uint32](vec, rowIndex)), nil
   447  	case types.T_uint64:
   448  		return fmt.Sprintf("%v", vector.GetFixedAt[uint64](vec, rowIndex)), nil
   449  	case types.T_float32:
   450  		return fmt.Sprintf("%v", vector.GetFixedAt[float32](vec, rowIndex)), nil
   451  	case types.T_float64:
   452  		return fmt.Sprintf("%v", vector.GetFixedAt[float64](vec, rowIndex)), nil
   453  	case types.T_char, types.T_varchar, types.T_binary, types.T_varbinary, types.T_text, types.T_blob:
   454  		return vec.GetStringAt(rowIndex), nil
   455  	case types.T_array_float32:
   456  		return types.ArrayToString[float32](vector.GetArrayAt[float32](vec, rowIndex)), nil
   457  	case types.T_array_float64:
   458  		return types.ArrayToString[float64](vector.GetArrayAt[float64](vec, rowIndex)), nil
   459  	case types.T_decimal64:
   460  		val := vector.GetFixedAt[types.Decimal64](vec, rowIndex)
   461  		return val.Format(vec.GetType().Scale), nil
   462  	case types.T_decimal128:
   463  		val := vector.GetFixedAt[types.Decimal128](vec, rowIndex)
   464  		return val.Format(vec.GetType().Scale), nil
   465  	case types.T_json:
   466  		val := vec.GetBytesAt(rowIndex)
   467  		byteJson := types.DecodeJson(val)
   468  		return byteJson.String(), nil
   469  	case types.T_uuid:
   470  		val := vector.GetFixedAt[types.Uuid](vec, rowIndex)
   471  		return val.ToString(), nil
   472  	case types.T_date:
   473  		val := vector.GetFixedAt[types.Date](vec, rowIndex)
   474  		return val.String(), nil
   475  	case types.T_time:
   476  		val := vector.GetFixedAt[types.Time](vec, rowIndex)
   477  		return val.String(), nil
   478  	case types.T_timestamp:
   479  		loc := time.Local
   480  		val := vector.GetFixedAt[types.Timestamp](vec, rowIndex)
   481  		return val.String2(loc, vec.GetType().Scale), nil
   482  	case types.T_datetime:
   483  		val := vector.GetFixedAt[types.Datetime](vec, rowIndex)
   484  		return val.String2(vec.GetType().Scale), nil
   485  	case types.T_enum:
   486  		return fmt.Sprintf("%v", vector.GetFixedAt[uint16](vec, rowIndex)), nil
   487  	default:
   488  		return "", moerr.NewInternalErrorNoCtx("fuzzy filter can not parse correct string for type id : %d", vec.GetType().Oid)
   489  	}
   490  }
   491  
   492  // for decimal type in batch.vector that read from pipeline, its scale is empty
   493  // so we have to fill it with tableDef from plan
   494  func (f *fuzzyCheck) adjustDecimalScale(toCheck *vector.Vector) {
   495  	typ := toCheck.GetType()
   496  	switch typ.Oid {
   497  	case types.T_decimal64, types.T_decimal128, types.T_decimal256:
   498  		if typ.Scale == 0 {
   499  			typ.Scale = f.col.Typ.Scale
   500  		}
   501  	default:
   502  	}
   503  }
   504  
   505  func (f *fuzzyCheck) recheckIfCompound(toCheck *vector.Vector) (bool, error) {
   506  
   507  	if f.onlyInsertHidden {
   508  		// for the case that create compound unique index for existed table
   509  		// can only detact if it is compound by the schema of the vector
   510  
   511  		foo, err := vectorToString(toCheck, 0)
   512  		if err != nil {
   513  			return false, err
   514  		}
   515  
   516  		if _, _, schema, err := types.DecodeTuple([]byte(foo)); err == nil {
   517  			f.compoundCols = make([]*plan.ColDef, len(schema))
   518  			return len(schema) > 1, nil
   519  		}
   520  	}
   521  
   522  	return f.isCompound, nil
   523  }