github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/compile/fuzzyCheck.go (about) 1 // Copyright 2023 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package compile 16 17 import ( 18 "bytes" 19 "context" 20 "encoding/hex" 21 "fmt" 22 "strconv" 23 "strings" 24 "time" 25 26 "github.com/matrixorigin/matrixone/pkg/common/reuse" 27 28 "github.com/matrixorigin/matrixone/pkg/catalog" 29 "github.com/matrixorigin/matrixone/pkg/common/moerr" 30 "github.com/matrixorigin/matrixone/pkg/container/batch" 31 "github.com/matrixorigin/matrixone/pkg/container/nulls" 32 "github.com/matrixorigin/matrixone/pkg/container/types" 33 "github.com/matrixorigin/matrixone/pkg/container/vector" 34 "github.com/matrixorigin/matrixone/pkg/logutil" 35 "github.com/matrixorigin/matrixone/pkg/pb/plan" 36 ) 37 38 /* 39 fuzzyCheck use to contains some info to run a background SQL when 40 fuzzy filter can not draw a definite conclusion for duplicate check 41 */ 42 43 func newFuzzyCheck(n *plan.Node) (*fuzzyCheck, error) { 44 tblName := n.TableDef.GetName() 45 dbName := n.ObjRef.GetSchemaName() 46 47 if tblName == "" || dbName == "" { 48 return nil, moerr.NewInternalErrorNoCtx("fuzzyfilter failed to get the db/tbl name") 49 } 50 51 f := reuse.Alloc[fuzzyCheck](nil) 52 f.tbl = tblName 53 f.db = dbName 54 f.attr = n.TableDef.Pkey.PkeyColName 55 56 for _, c := range n.TableDef.Cols { 57 if c.Name == n.TableDef.Pkey.PkeyColName { 58 f.col = c 59 } 60 } 61 62 // compound key could be primary key(a, b, c...) or unique key(a, b, c ...) 63 // for Decimal type, we need colDef to get the scale 64 if n.TableDef.Pkey.PkeyColName == catalog.CPrimaryKeyColName { 65 f.isCompound = true 66 f.compoundCols = f.sortColDef(n.TableDef.Pkey.Names, n.TableDef.Cols) 67 } 68 69 // for the case like create unique index for existed table, 70 // We can only get the table definition of the hidden table. 71 // How the original table defines this unique index (for example, which columns are used and whether it is composite) can NOT be confirmed, 72 // that introduces some strange logic, and obscures the meaning of some fields, such as fuzzyCheck.isCompound 73 if catalog.IsHiddenTable(tblName) && n.Fuzzymessage == nil { 74 f.onlyInsertHidden = true 75 } 76 77 if n.Fuzzymessage != nil { 78 if len(n.Fuzzymessage.ParentUniqueCols) > 1 { 79 f.isCompound = true 80 f.tbl = n.Fuzzymessage.ParentTableName 81 f.compoundCols = n.Fuzzymessage.ParentUniqueCols 82 } else { 83 f.col = n.Fuzzymessage.ParentUniqueCols[0] 84 } 85 } 86 87 return f, nil 88 } 89 90 func (f fuzzyCheck) TypeName() string { 91 return "compile.fuzzyCheck" 92 } 93 94 func (f *fuzzyCheck) reset() { 95 f.db = "" 96 f.tbl = "" 97 f.attr = "" 98 f.condition = "" 99 f.isCompound = false 100 f.onlyInsertHidden = false 101 f.col = nil 102 f.compoundCols = nil 103 f.cnt = 0 104 } 105 106 func (f *fuzzyCheck) release() { 107 reuse.Free[fuzzyCheck](f, nil) 108 } 109 110 // fill will generate condition for background SQL to check if duplicate constraint is satisfied 111 // for compound primary key, the condtion will look like : "attr_1=x1 and attr_2=y1 and ... attr_n = yn or attr1=x2 and attr=y2 ... " 112 // and its background SQL will look like : select pkAttrs, count(*) as cnt from db.tbl where **cont** having cnt > 1; 113 // 114 // for non compound primary key, cond will look like : "x1, x2, x3, ..." 115 // and its background SQL will look like : select pkAttr, count(*) as cnt from db.tbl where pkAttr in (**cond**) having cnt > 1; 116 // 117 // for more info, refer func backgroundSQLCheck 118 func (f *fuzzyCheck) fill(ctx context.Context, bat *batch.Batch) error { 119 var collision [][]string 120 var err error 121 122 toCheck := bat.GetVector(0) 123 f.cnt = bat.RowCount() 124 125 f.isCompound, err = f.recheckIfCompound(toCheck) 126 if err != nil { 127 return err 128 } 129 130 if !f.onlyInsertHidden { 131 if err := f.firstlyCheck(ctx, toCheck); err != nil { 132 return err // fail to pass duplicate constraint 133 } 134 } 135 136 collision, err = f.genCollsionKeys(toCheck) 137 if err != nil { 138 return err 139 } 140 141 // generate codition used in background SQL 142 if !f.onlyInsertHidden { 143 if !f.isCompound { 144 f.condition = strings.Join(collision[0], ", ") 145 } else { 146 // not using __mo_cpkey_col search directly because efficiency considerations, 147 // HOWEVER This part of the code is still redundant, because currently plan can not support sql like 148 // 149 // SELECT pk1, pk2, COUNT(*) AS cnt 150 // FROM tbl 151 // WHERE (pk1, pk2) IN ((1, 1), (2, 1)) 152 // GROUP BY pk1, pk2 153 // HAVING cnt > 1; 154 // 155 // otherwise, the code will be much more cleaner 156 var all bytes.Buffer 157 var one bytes.Buffer 158 var i int 159 var j int 160 161 lastRow := len(collision[0]) - 1 162 163 cAttrs := make([]string, len(f.compoundCols)) 164 for k, c := range f.compoundCols { 165 if c == nil { 166 panic("compoundCols should not have nil element") 167 } 168 cAttrs[k] = c.Name 169 } 170 171 for i = 0; i < lastRow; i++ { 172 one.Reset() 173 one.WriteByte('(') 174 // one compound primary key has multiple conditions, use "and" to join them 175 for j = 0; j < len(cAttrs)-1; j++ { 176 one.WriteString(fmt.Sprintf("%s = %s and ", cAttrs[j], collision[j][i])) 177 } 178 179 // the last condition does not need to be followed by "and" 180 one.WriteString(fmt.Sprintf("%s = %s", cAttrs[j], collision[j][i])) 181 one.WriteByte(')') 182 if _, err = one.WriteTo(&all); err != nil { 183 return err 184 } 185 186 // use or join each compound primary keys 187 all.WriteString(" or ") 188 } 189 190 // if only have one collision key, there will no "or", same as the last collision key 191 for j = 0; j < len(cAttrs)-1; j++ { 192 one.WriteString(fmt.Sprintf("%s = %s and ", cAttrs[j], collision[j][lastRow])) 193 } 194 195 one.WriteString(fmt.Sprintf("%s = %s", cAttrs[j], collision[j][lastRow])) 196 if _, err = one.WriteTo(&all); err != nil { 197 return err 198 } 199 f.condition = all.String() 200 } 201 } else { 202 keys := collision[0] 203 f.condition = strings.Join(keys, ", ") 204 } 205 206 return nil 207 } 208 209 func (f *fuzzyCheck) firstlyCheck(ctx context.Context, toCheck *vector.Vector) error { 210 kcnt := make(map[string]int) 211 212 if !f.isCompound { 213 pkey, err := f.format(toCheck) 214 if err != nil { 215 return err 216 } 217 for _, k := range pkey { 218 kcnt[k]++ 219 } 220 } else { 221 for i := 0; i < toCheck.Length(); i++ { 222 b := toCheck.GetRawBytesAt(i) 223 t, err := types.Unpack(b) 224 if err != nil { 225 return err 226 } 227 scales := make([]int32, len(f.compoundCols)) 228 for i, c := range f.compoundCols { 229 if c == nil { 230 panic("compoundCols should not have nil element") 231 } 232 scales[i] = c.Typ.Scale 233 } 234 es := t.ErrString(scales) 235 kcnt[es]++ 236 } 237 } 238 239 // firstly check if contains duplicate 240 for k, cnt := range kcnt { 241 if cnt > 1 { 242 ds, e := strconv.Unquote(k) 243 if e != nil { 244 return moerr.NewDuplicateEntry(ctx, k, f.attr) 245 } else { 246 return moerr.NewDuplicateEntry(ctx, ds, f.attr) 247 } 248 } 249 } 250 return nil 251 } 252 253 // genCollsionKeys return [][]string to store the string of collsion keys, it will check if 254 // collision keys are duplicates with each other, if do dup, no need to run background SQL 255 func (f *fuzzyCheck) genCollsionKeys(toCheck *vector.Vector) ([][]string, error) { 256 var keys [][]string 257 if !f.isCompound { 258 keys = make([][]string, 1) 259 } else { 260 keys = make([][]string, len(f.compoundCols)) 261 } 262 263 if !f.onlyInsertHidden { 264 if !f.isCompound { 265 pkey, err := f.format(toCheck) 266 if err != nil { 267 return nil, err 268 } 269 keys[0] = pkey 270 } else { 271 scales := make([]int32, len(f.compoundCols)) 272 for i, c := range f.compoundCols { 273 scales[i] = c.Typ.Scale 274 } 275 for i := 0; i < toCheck.Length(); i++ { 276 b := toCheck.GetRawBytesAt(i) 277 t, err := types.Unpack(b) 278 if err != nil { 279 return nil, err 280 } 281 s := t.SQLStrings(scales) 282 for j := 0; j < len(s); j++ { 283 keys[j] = append(keys[j], s[j]) 284 } 285 } 286 } 287 } else { 288 pkey, err := f.format(toCheck) 289 if err != nil { 290 return nil, err 291 } 292 keys[0] = pkey 293 } 294 295 return keys, nil 296 } 297 298 // backgroundSQLCheck launches a background SQL to check if there are any duplicates 299 func (f *fuzzyCheck) backgroundSQLCheck(c *Compile) error { 300 var duplicateCheckSql string 301 302 if !f.onlyInsertHidden { 303 if !f.isCompound { 304 duplicateCheckSql = fmt.Sprintf(fuzzyNonCompoundCheck, f.attr, f.db, f.tbl, f.attr, f.condition, f.attr) 305 } else { 306 cAttrs := make([]string, len(f.compoundCols)) 307 for k, c := range f.compoundCols { 308 cAttrs[k] = c.Name 309 } 310 attrs := strings.Join(cAttrs, ", ") 311 duplicateCheckSql = fmt.Sprintf(fuzzyCompoundCheck, attrs, f.db, f.tbl, f.condition, attrs) 312 } 313 } else { 314 duplicateCheckSql = fmt.Sprintf(fuzzyNonCompoundCheck, f.attr, f.db, f.tbl, f.attr, f.condition, f.attr) 315 } 316 317 res, err := c.runSqlWithResult(duplicateCheckSql) 318 if err != nil { 319 logutil.Errorf("The sql that caused the fuzzy check background SQL failed is %s, and generated background sql is %s", c.sql, duplicateCheckSql) 320 return err 321 } 322 defer res.Close() 323 324 if res.Batches != nil { 325 vs := res.Batches[0].Vecs 326 if vs != nil && vs[0].Length() > 0 { // do dup 327 toCheck := vs[0] 328 if !f.isCompound { 329 f.adjustDecimalScale(toCheck) 330 if dupKey, e := f.format(toCheck); e != nil { 331 err = e 332 } else { 333 ds, e := strconv.Unquote(dupKey[0]) 334 if e != nil { 335 err = moerr.NewDuplicateEntry(c.ctx, dupKey[0], f.attr) 336 } else { 337 err = moerr.NewDuplicateEntry(c.ctx, ds, f.attr) 338 } 339 } 340 } else { 341 if t, e := types.Unpack(toCheck.GetBytesAt(0)); e != nil { 342 err = e 343 } else { 344 scales := make([]int32, len(f.compoundCols)) 345 for i, col := range f.compoundCols { 346 if col != nil { 347 scales[i] = col.Typ.Scale 348 } else { 349 scales[i] = 0 350 } 351 } 352 err = moerr.NewDuplicateEntry(c.ctx, t.ErrString(scales), f.attr) 353 } 354 } 355 } 356 } 357 358 return err 359 } 360 361 // -----------------------------utils----------------------------------- 362 363 // make sure that the attr sort by define way 364 func (f *fuzzyCheck) sortColDef(toSelect []string, cols []*plan.ColDef) []*plan.ColDef { 365 ccols := make([]*plan.ColDef, len(toSelect)) 366 nmap := make(map[string]int) 367 for i, n := range toSelect { 368 nmap[n] = i 369 } 370 for _, c := range cols { 371 if i, ok := nmap[c.Name]; ok { 372 ccols[i] = c 373 } 374 } 375 return ccols 376 } 377 378 // format format strings from vector 379 func (f *fuzzyCheck) format(toCheck *vector.Vector) ([]string, error) { 380 var ss []string 381 typ := toCheck.GetType() 382 383 for i := 0; i < toCheck.Length(); i++ { 384 s, err := vectorToString(toCheck, i) 385 if err != nil { 386 return nil, err 387 } 388 ss = append(ss, s) 389 } 390 391 if f.isCompound { 392 for i, s := range ss { 393 ss[i] = "unhex('" + hex.EncodeToString([]byte(s)) + "')" 394 } 395 return ss, nil 396 } 397 398 // for table that like CREATE TABLE t1( b CHAR(10), PRIMARY KEY); 399 // with : insert into t1 values('ab'), ('ab'); 400 // background SQL condition should be b='ab' instead of b=ab, as well as time types 401 switch typ.Oid { 402 // date and time 403 case types.T_date, types.T_time, types.T_datetime, types.T_timestamp: 404 for i, str := range ss { 405 ss[i] = strconv.Quote(str) 406 } 407 return ss, nil 408 409 // string family but not include binary 410 case types.T_char, types.T_varchar, types.T_varbinary, types.T_text, types.T_uuid, types.T_binary: 411 for i, str := range ss { 412 ss[i] = strconv.Quote(str) 413 } 414 return ss, nil 415 default: 416 return ss, nil 417 } 418 } 419 420 func vectorToString(vec *vector.Vector, rowIndex int) (string, error) { 421 if nulls.Any(vec.GetNulls()) { 422 return "", nil 423 } 424 switch vec.GetType().Oid { 425 case types.T_bool: 426 flag := vector.GetFixedAt[bool](vec, rowIndex) 427 if flag { 428 return "true", nil 429 } 430 return "false", nil 431 case types.T_bit: 432 return fmt.Sprintf("%v", vector.GetFixedAt[uint64](vec, rowIndex)), nil 433 case types.T_int8: 434 return fmt.Sprintf("%v", vector.GetFixedAt[int8](vec, rowIndex)), nil 435 case types.T_int16: 436 return fmt.Sprintf("%v", vector.GetFixedAt[int16](vec, rowIndex)), nil 437 case types.T_int32: 438 return fmt.Sprintf("%v", vector.GetFixedAt[int32](vec, rowIndex)), nil 439 case types.T_int64: 440 return fmt.Sprintf("%v", vector.GetFixedAt[int64](vec, rowIndex)), nil 441 case types.T_uint8: 442 return fmt.Sprintf("%v", vector.GetFixedAt[uint8](vec, rowIndex)), nil 443 case types.T_uint16: 444 return fmt.Sprintf("%v", vector.GetFixedAt[uint16](vec, rowIndex)), nil 445 case types.T_uint32: 446 return fmt.Sprintf("%v", vector.GetFixedAt[uint32](vec, rowIndex)), nil 447 case types.T_uint64: 448 return fmt.Sprintf("%v", vector.GetFixedAt[uint64](vec, rowIndex)), nil 449 case types.T_float32: 450 return fmt.Sprintf("%v", vector.GetFixedAt[float32](vec, rowIndex)), nil 451 case types.T_float64: 452 return fmt.Sprintf("%v", vector.GetFixedAt[float64](vec, rowIndex)), nil 453 case types.T_char, types.T_varchar, types.T_binary, types.T_varbinary, types.T_text, types.T_blob: 454 return vec.GetStringAt(rowIndex), nil 455 case types.T_array_float32: 456 return types.ArrayToString[float32](vector.GetArrayAt[float32](vec, rowIndex)), nil 457 case types.T_array_float64: 458 return types.ArrayToString[float64](vector.GetArrayAt[float64](vec, rowIndex)), nil 459 case types.T_decimal64: 460 val := vector.GetFixedAt[types.Decimal64](vec, rowIndex) 461 return val.Format(vec.GetType().Scale), nil 462 case types.T_decimal128: 463 val := vector.GetFixedAt[types.Decimal128](vec, rowIndex) 464 return val.Format(vec.GetType().Scale), nil 465 case types.T_json: 466 val := vec.GetBytesAt(rowIndex) 467 byteJson := types.DecodeJson(val) 468 return byteJson.String(), nil 469 case types.T_uuid: 470 val := vector.GetFixedAt[types.Uuid](vec, rowIndex) 471 return val.ToString(), nil 472 case types.T_date: 473 val := vector.GetFixedAt[types.Date](vec, rowIndex) 474 return val.String(), nil 475 case types.T_time: 476 val := vector.GetFixedAt[types.Time](vec, rowIndex) 477 return val.String(), nil 478 case types.T_timestamp: 479 loc := time.Local 480 val := vector.GetFixedAt[types.Timestamp](vec, rowIndex) 481 return val.String2(loc, vec.GetType().Scale), nil 482 case types.T_datetime: 483 val := vector.GetFixedAt[types.Datetime](vec, rowIndex) 484 return val.String2(vec.GetType().Scale), nil 485 case types.T_enum: 486 return fmt.Sprintf("%v", vector.GetFixedAt[uint16](vec, rowIndex)), nil 487 default: 488 return "", moerr.NewInternalErrorNoCtx("fuzzy filter can not parse correct string for type id : %d", vec.GetType().Oid) 489 } 490 } 491 492 // for decimal type in batch.vector that read from pipeline, its scale is empty 493 // so we have to fill it with tableDef from plan 494 func (f *fuzzyCheck) adjustDecimalScale(toCheck *vector.Vector) { 495 typ := toCheck.GetType() 496 switch typ.Oid { 497 case types.T_decimal64, types.T_decimal128, types.T_decimal256: 498 if typ.Scale == 0 { 499 typ.Scale = f.col.Typ.Scale 500 } 501 default: 502 } 503 } 504 505 func (f *fuzzyCheck) recheckIfCompound(toCheck *vector.Vector) (bool, error) { 506 507 if f.onlyInsertHidden { 508 // for the case that create compound unique index for existed table 509 // can only detact if it is compound by the schema of the vector 510 511 foo, err := vectorToString(toCheck, 0) 512 if err != nil { 513 return false, err 514 } 515 516 if _, _, schema, err := types.DecodeTuple([]byte(foo)); err == nil { 517 f.compoundCols = make([]*plan.ColDef, len(schema)) 518 return len(schema) > 1, nil 519 } 520 } 521 522 return f.isCompound, nil 523 }