github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/compile/ddl_index_algo.go (about) 1 // Copyright 2023 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package compile 16 17 import ( 18 "fmt" 19 "strconv" 20 21 "github.com/matrixorigin/matrixone/pkg/catalog" 22 "github.com/matrixorigin/matrixone/pkg/common/moerr" 23 "github.com/matrixorigin/matrixone/pkg/container/vector" 24 "github.com/matrixorigin/matrixone/pkg/pb/plan" 25 "github.com/matrixorigin/matrixone/pkg/util/executor" 26 ) 27 28 const ( 29 ivfFlatIndexFlag = "experimental_ivf_index" 30 masterIndexFlag = "experimental_master_index" 31 ) 32 33 func (s *Scope) handleUniqueIndexTable(c *Compile, 34 indexDef *plan.IndexDef, qryDatabase string, 35 originalTableDef *plan.TableDef, indexInfo *plan.CreateTable) error { 36 37 // the logic of detecting whether the unique constraint is violated does not need to be done separately, 38 // it will be processed when inserting into the hidden table. 39 40 return s.createAndInsertForUniqueOrRegularIndexTable(c, indexDef, qryDatabase, originalTableDef, indexInfo) 41 } 42 43 func (s *Scope) handleRegularSecondaryIndexTable(c *Compile, 44 indexDef *plan.IndexDef, qryDatabase string, 45 originalTableDef *plan.TableDef, indexInfo *plan.CreateTable) error { 46 47 return s.createAndInsertForUniqueOrRegularIndexTable(c, indexDef, qryDatabase, originalTableDef, indexInfo) 48 } 49 50 func (s *Scope) createAndInsertForUniqueOrRegularIndexTable(c *Compile, indexDef *plan.IndexDef, 51 qryDatabase string, originalTableDef *plan.TableDef, indexInfo *plan.CreateTable) error { 52 53 if len(indexInfo.GetIndexTables()) != 1 { 54 return moerr.NewInternalErrorNoCtx("index table count not equal to 1") 55 } 56 57 def := indexInfo.GetIndexTables()[0] 58 createSQL := genCreateIndexTableSql(def, indexDef, qryDatabase) 59 err := c.runSql(createSQL) 60 if err != nil { 61 return err 62 } 63 64 insertSQL := genInsertIndexTableSql(originalTableDef, indexDef, qryDatabase, indexDef.Unique) 65 err = c.runSql(insertSQL) 66 if err != nil { 67 return err 68 } 69 return nil 70 } 71 func (s *Scope) handleMasterIndexTable(c *Compile, indexDef *plan.IndexDef, qryDatabase string, 72 originalTableDef *plan.TableDef, indexInfo *plan.CreateTable) error { 73 74 if ok, err := s.isExperimentalEnabled(c, masterIndexFlag); err != nil { 75 return err 76 } else if !ok { 77 return moerr.NewInternalErrorNoCtx("Master index is not enabled") 78 } 79 80 if len(indexInfo.GetIndexTables()) != 1 { 81 return moerr.NewInternalErrorNoCtx("index table count not equal to 1") 82 } 83 84 def := indexInfo.GetIndexTables()[0] 85 createSQL := genCreateIndexTableSql(def, indexDef, qryDatabase) 86 err := c.runSql(createSQL) 87 if err != nil { 88 return err 89 } 90 91 insertSQLs := genInsertIndexTableSqlForMasterIndex(originalTableDef, indexDef, qryDatabase) 92 for _, insertSQL := range insertSQLs { 93 err = c.runSql(insertSQL) 94 if err != nil { 95 return err 96 } 97 } 98 return nil 99 } 100 101 func (s *Scope) handleIndexColCount(c *Compile, indexDef *plan.IndexDef, qryDatabase string, originalTableDef *plan.TableDef) (int64, error) { 102 103 indexColumnName := indexDef.Parts[0] 104 countTotalSql := fmt.Sprintf("select count(`%s`) from `%s`.`%s`;", 105 indexColumnName, 106 qryDatabase, 107 originalTableDef.Name) 108 rs, err := c.runSqlWithResult(countTotalSql) 109 if err != nil { 110 return 0, err 111 } 112 113 var totalCnt int64 114 rs.ReadRows(func(_ int, cols []*vector.Vector) bool { 115 totalCnt = executor.GetFixedRows[int64](cols[0])[0] 116 return false 117 }) 118 rs.Close() 119 120 return totalCnt, nil 121 } 122 123 func (s *Scope) handleIvfIndexMetaTable(c *Compile, indexDef *plan.IndexDef, qryDatabase string) error { 124 125 /* 126 The meta table will contain version number for now. In the future, it can contain `index progress` etc. 127 The version number is incremented monotonically for each re-index. 128 NOTE: We don't handle version number overflow as BIGINT has a large upper bound. 129 130 Sample SQL: 131 132 CREATE TABLE meta ( `key` VARCHAR(255), `value` VARCHAR(255), PRIMARY KEY (`key`)); 133 INSERT INTO meta (`key`, `value`) VALUES ('version', '0') ON DUPLICATE KEY UPDATE `value` = CAST( (cast(`value` AS BIGINT) + 1) AS CHAR); 134 */ 135 136 insertSQL := fmt.Sprintf("insert into `%s`.`%s` (`%s`, `%s`) values('version', '0')"+ 137 "ON DUPLICATE KEY UPDATE `%s` = CAST( (CAST(`%s` AS BIGINT) + 1) AS CHAR);", 138 qryDatabase, 139 indexDef.IndexTableName, 140 catalog.SystemSI_IVFFLAT_TblCol_Metadata_key, 141 catalog.SystemSI_IVFFLAT_TblCol_Metadata_val, 142 143 catalog.SystemSI_IVFFLAT_TblCol_Metadata_val, 144 catalog.SystemSI_IVFFLAT_TblCol_Metadata_val, 145 ) 146 147 err := c.runSql(insertSQL) 148 if err != nil { 149 return err 150 } 151 152 return nil 153 } 154 155 func (s *Scope) handleIvfIndexCentroidsTable(c *Compile, indexDef *plan.IndexDef, 156 qryDatabase string, originalTableDef *plan.TableDef, totalCnt int64, metadataTableName string) error { 157 158 // 1.a algo params 159 params, err := catalog.IndexParamsStringToMap(indexDef.IndexAlgoParams) 160 if err != nil { 161 return err 162 } 163 centroidParamsLists, err := strconv.Atoi(params[catalog.IndexAlgoParamLists]) 164 if err != nil { 165 return err 166 } 167 centroidParamsDistFn := catalog.ToLower(params[catalog.IndexAlgoParamOpType]) 168 kmeansInitType := "kmeansplusplus" 169 kmeansNormalize := "true" 170 171 // 1.b init centroids table with default centroid, if centroids are not enough. 172 // NOTE: we can run re-index to improve the centroid quality. 173 if totalCnt == 0 || totalCnt < int64(centroidParamsLists) { 174 initSQL := fmt.Sprintf("INSERT INTO `%s`.`%s` (`%s`, `%s`, `%s`) "+ 175 "SELECT "+ 176 "(SELECT CAST(`%s` AS BIGINT) FROM `%s`.`%s` WHERE `%s` = 'version'), "+ 177 "1, NULL;", 178 qryDatabase, 179 indexDef.IndexTableName, 180 catalog.SystemSI_IVFFLAT_TblCol_Centroids_version, 181 catalog.SystemSI_IVFFLAT_TblCol_Centroids_id, 182 catalog.SystemSI_IVFFLAT_TblCol_Centroids_centroid, 183 184 catalog.SystemSI_IVFFLAT_TblCol_Metadata_val, 185 qryDatabase, 186 metadataTableName, 187 catalog.SystemSI_IVFFLAT_TblCol_Metadata_key, 188 ) 189 err := c.runSql(initSQL) 190 if err != nil { 191 return err 192 } 193 return nil 194 } 195 196 // 2. Sampling SQL Logic 197 sampleCnt := catalog.CalcSampleCount(int64(centroidParamsLists), totalCnt) 198 indexColumnName := indexDef.Parts[0] 199 sampleSQL := fmt.Sprintf("(select sample(`%s`, %d rows, 'row') as `%s` from `%s`.`%s`)", 200 indexColumnName, 201 sampleCnt, 202 indexColumnName, 203 qryDatabase, 204 originalTableDef.Name, 205 ) 206 207 // 3. Insert into centroids table 208 insertSQL := fmt.Sprintf("insert into `%s`.`%s` (`%s`, `%s`, `%s`)", 209 qryDatabase, 210 indexDef.IndexTableName, 211 catalog.SystemSI_IVFFLAT_TblCol_Centroids_version, 212 catalog.SystemSI_IVFFLAT_TblCol_Centroids_id, 213 catalog.SystemSI_IVFFLAT_TblCol_Centroids_centroid) 214 215 /* 216 Sample SQL: 217 218 SELECT 219 (SELECT CAST(`value` AS BIGINT) FROM meta WHERE `key` = 'version'), 220 ROW_NUMBER() OVER(), 221 cast(`__mo_index_unnest_cols`.`value` as VARCHAR) 222 FROM 223 (SELECT cluster_centers(`embedding` kmeans '2,vector_l2_ops') AS `__mo_index_centroids_string` FROM (select sample(embedding, 10 rows) as embedding from tbl) ) AS `__mo_index_centroids_tbl` 224 CROSS JOIN 225 UNNEST(`__mo_index_centroids_tbl`.`__mo_index_centroids_string`) AS `__mo_index_unnest_cols`; 226 */ 227 // 4. final SQL 228 clusterCentersSQL := fmt.Sprintf("%s "+ 229 "SELECT "+ 230 "(SELECT CAST(`%s` AS BIGINT) FROM `%s` WHERE `%s` = 'version'), "+ 231 "ROW_NUMBER() OVER(), "+ 232 "cast(`__mo_index_unnest_cols`.`value` as VARCHAR) "+ 233 "FROM "+ 234 "(SELECT cluster_centers(`%s` kmeans '%d,%s,%s,%s') AS `__mo_index_centroids_string` FROM %s ) AS `__mo_index_centroids_tbl` "+ 235 "CROSS JOIN "+ 236 "UNNEST(`__mo_index_centroids_tbl`.`__mo_index_centroids_string`) AS `__mo_index_unnest_cols`;", 237 insertSQL, 238 239 catalog.SystemSI_IVFFLAT_TblCol_Metadata_val, 240 metadataTableName, 241 catalog.SystemSI_IVFFLAT_TblCol_Metadata_key, 242 243 indexColumnName, 244 centroidParamsLists, 245 centroidParamsDistFn, 246 kmeansInitType, 247 kmeansNormalize, 248 sampleSQL, 249 ) 250 251 err = s.logTimestamp(c, qryDatabase, metadataTableName, "clustering_start") 252 if err != nil { 253 return err 254 } 255 256 err = c.runSql(clusterCentersSQL) 257 if err != nil { 258 return err 259 } 260 261 err = s.logTimestamp(c, qryDatabase, metadataTableName, "clustering_end") 262 if err != nil { 263 return err 264 } 265 266 return nil 267 } 268 269 func (s *Scope) handleIvfIndexEntriesTable(c *Compile, indexDef *plan.IndexDef, qryDatabase string, originalTableDef *plan.TableDef, 270 metadataTableName string, 271 centroidsTableName string) error { 272 273 // 1. algo params 274 params, err := catalog.IndexParamsStringToMap(indexDef.IndexAlgoParams) 275 if err != nil { 276 return err 277 } 278 algoParamsDistFn := catalog.ToLower(params[catalog.IndexAlgoParamOpType]) 279 ops := make(map[string]string) 280 ops[catalog.IndexAlgoParamOpType_l2] = "l2_distance" 281 //ops[catalog.IndexAlgoParamOpType_ip] = "inner_product" //TODO: verify this is correct @arjun 282 //ops[catalog.IndexAlgoParamOpType_cos] = "cosine_distance" 283 algoParamsDistFn = ops[algoParamsDistFn] 284 285 // 2. Original table's pkey name and value 286 var originalTblPkColsCommaSeperated string 287 var originalTblPkColMaySerial string 288 var originalTblPkColMaySerialColNameAlias = "__mo_org_tbl_pk_may_serial_col" 289 if originalTableDef.Pkey.PkeyColName == catalog.CPrimaryKeyColName { 290 for i, part := range originalTableDef.Pkey.Names { 291 if i > 0 { 292 originalTblPkColsCommaSeperated += "," 293 } 294 originalTblPkColsCommaSeperated += fmt.Sprintf("`%s`.`%s`", originalTableDef.Name, part) 295 } 296 originalTblPkColMaySerial = fmt.Sprintf("serial(%s)", originalTblPkColsCommaSeperated) 297 } else { 298 originalTblPkColsCommaSeperated = fmt.Sprintf("`%s`.`%s`", originalTableDef.Name, originalTableDef.Pkey.PkeyColName) 299 originalTblPkColMaySerial = originalTblPkColsCommaSeperated 300 } 301 302 // 3. insert into entries table 303 indexColumnName := indexDef.Parts[0] 304 insertSQL := fmt.Sprintf("insert into `%s`.`%s` (`%s`, `%s`, `%s`, `%s`) ", 305 qryDatabase, 306 indexDef.IndexTableName, 307 catalog.SystemSI_IVFFLAT_TblCol_Entries_version, 308 catalog.SystemSI_IVFFLAT_TblCol_Entries_id, 309 catalog.SystemSI_IVFFLAT_TblCol_Entries_pk, 310 catalog.SystemSI_IVFFLAT_TblCol_Entries_entry, 311 ) 312 313 // 4. centroids table with latest version 314 centroidsTableForCurrentVersionSql := fmt.Sprintf("(select * from "+ 315 "`%s`.`%s` where `%s` = "+ 316 "(select CAST(%s as BIGINT) from `%s`.`%s` where `%s` = 'version')) as `%s`", 317 qryDatabase, 318 centroidsTableName, 319 catalog.SystemSI_IVFFLAT_TblCol_Centroids_version, 320 321 catalog.SystemSI_IVFFLAT_TblCol_Metadata_val, 322 qryDatabase, 323 metadataTableName, 324 catalog.SystemSI_IVFFLAT_TblCol_Metadata_key, 325 centroidsTableName, 326 ) 327 328 // 5. original table with normalized SK 329 normalizedVecColName := "__mo_org_tbl_norm_vec_col" 330 originalTableWithNormalizedSkSql := fmt.Sprintf("(select "+ 331 "%s as `%s`, "+ 332 "normalize_l2(`%s`.`%s`) as `%s` "+ 333 " from `%s`.`%s`) as `%s`", 334 originalTblPkColMaySerial, 335 originalTblPkColMaySerialColNameAlias, 336 337 originalTableDef.Name, 338 indexColumnName, 339 normalizedVecColName, 340 341 qryDatabase, 342 originalTableDef.Name, 343 originalTableDef.Name, 344 ) 345 346 // 6. find centroid_version, centroid_id and tbl_pk 347 joinedCentroidsId := "__mo_index_joined_centroid_id" 348 joinedNode := "__mo_index_tbl_join_centroids" 349 tblCrossJoinCentroids := fmt.Sprintf("(select "+ 350 "`%s`.`%s` as `%s`, "+ 351 "serial_extract( min( serial_full( %s(`%s`.`%s`, `%s`.`%s`), `%s`.`%s`)), 1 as bigint) as `%s`, "+ 352 "`%s`"+ 353 "from %s CROSS JOIN %s group by `%s`, %s ) as `%s`", 354 centroidsTableName, 355 catalog.SystemSI_IVFFLAT_TblCol_Centroids_version, 356 catalog.SystemSI_IVFFLAT_TblCol_Centroids_version, 357 358 algoParamsDistFn, 359 centroidsTableName, 360 catalog.SystemSI_IVFFLAT_TblCol_Centroids_centroid, 361 originalTableDef.Name, 362 normalizedVecColName, 363 centroidsTableName, 364 catalog.SystemSI_IVFFLAT_TblCol_Centroids_id, 365 joinedCentroidsId, 366 367 originalTblPkColMaySerialColNameAlias, // NOTE: no need to add tableName here, because it could be serial() 368 369 originalTableWithNormalizedSkSql, 370 centroidsTableForCurrentVersionSql, 371 catalog.SystemSI_IVFFLAT_TblCol_Centroids_version, 372 originalTblPkColMaySerialColNameAlias, 373 joinedNode, 374 ) 375 376 // 7. from table, get tbl_pk and embedding 377 originalTableWithPkAndEmbeddingSql := fmt.Sprintf("(select "+ 378 "%s as `%s`, "+ 379 "`%s`.`%s`"+ 380 " from `%s`.`%s`) as `%s`", 381 originalTblPkColMaySerial, 382 originalTblPkColMaySerialColNameAlias, 383 384 originalTableDef.Name, 385 indexColumnName, 386 387 qryDatabase, 388 originalTableDef.Name, 389 originalTableDef.Name, 390 ) 391 392 /* 393 Sample SQL: 394 INSERT INTO `a`.`__mo_index_secondary_018ebbd4-ebb7-7898-b0bb-3b133af1905e` 395 ( 396 `__mo_index_centroid_fk_version`, 397 `__mo_index_centroid_fk_id`, 398 `__mo_index_pri_col`, 399 `__mo_index_centroid_fk_entry` 400 ) 401 SELECT `__mo_index_tbl_join_centroids`.`__mo_index_centroid_version` , 402 `__mo_index_tbl_join_centroids`.`__mo_index_joined_centroid_id` , 403 `__mo_index_tbl_join_centroids`.`__mo_org_tbl_pk_may_serial_col` , 404 `t1`.`b` 405 FROM ( 406 SELECT `t1`.`a` AS `__mo_org_tbl_pk_may_serial_col`, 407 `t1`.`b` 408 FROM `a`.`t1`) AS `t1` 409 INNER JOIN 410 ( 411 SELECT `centroids`.`__mo_index_centroid_version` AS `__mo_index_centroid_version`, 412 serial_extract( min( serial_full( l2_distance(`centroids`.`__mo_index_centroid`, `t1`.`__mo_org_tbl_norm_vec_col`), `centroids`.`__mo_index_centroid_id`)), 1 AS bigint) AS `__mo_index_joined_centroid_id`, 413 `__mo_org_tbl_pk_may_serial_col` 414 FROM ( 415 SELECT `t1`.`a` AS `__mo_org_tbl_pk_may_serial_col`, 416 normalize_l2(`t1`.`b`) AS `__mo_org_tbl_norm_vec_col`, 417 FROM `a`.`t1` 418 ) AS `t1` 419 CROSS JOIN 420 ( 421 SELECT * 422 FROM `a`.`centroids` 423 WHERE `__mo_index_centroid_version` = ( SELECT cast(__mo_index_val AS bigint) FROM `a`.`meta` WHERE `__mo_index_key` = 'version') 424 ) AS `centroids` 425 GROUP BY `__mo_index_centroid_version`, 426 __mo_org_tbl_pk_may_serial_col 427 ) AS `__mo_index_tbl_join_centroids` 428 429 ON `__mo_index_tbl_join_centroids`.`__mo_org_tbl_pk_may_serial_col` = `t1`.`__mo_org_tbl_pk_may_serial_col`; 430 */ 431 // 8. final SQL 432 tblInnerJoinPrevJoin := fmt.Sprintf("%s "+ 433 "select "+ 434 "`%s`.`%s` , "+ 435 "`%s`.`%s` , "+ 436 "`%s`.`%s` , "+ 437 "`%s`.`%s` "+ 438 "from %s inner join %s "+ 439 "on `%s`.`%s` = `%s`.`%s` "+ 440 "order by `%s`.`%s`, `%s`.`%s` ;", 441 insertSQL, 442 443 joinedNode, 444 catalog.SystemSI_IVFFLAT_TblCol_Centroids_version, 445 446 joinedNode, 447 joinedCentroidsId, 448 449 joinedNode, 450 originalTblPkColMaySerialColNameAlias, 451 452 originalTableDef.Name, 453 indexColumnName, 454 455 originalTableWithPkAndEmbeddingSql, 456 tblCrossJoinCentroids, 457 458 joinedNode, 459 originalTblPkColMaySerialColNameAlias, 460 originalTableDef.Name, 461 originalTblPkColMaySerialColNameAlias, 462 463 // Without ORDER BY, we get 20QPS 464 // With ORDER BY, we get 60QPS 465 // I think it's because there are lesser number of segments to scan during JOIN. 466 //TODO: need to revisit this once we have BlockFilter applied on the TableScan. 467 joinedNode, 468 catalog.SystemSI_IVFFLAT_TblCol_Centroids_version, 469 joinedNode, 470 joinedCentroidsId, 471 ) 472 473 err = s.logTimestamp(c, qryDatabase, metadataTableName, "mapping_start") 474 if err != nil { 475 return err 476 } 477 478 err = c.runSql(tblInnerJoinPrevJoin) 479 if err != nil { 480 return err 481 } 482 483 err = s.logTimestamp(c, qryDatabase, metadataTableName, "mapping_end") 484 if err != nil { 485 return err 486 } 487 488 return nil 489 } 490 491 func (s *Scope) logTimestamp(c *Compile, qryDatabase, metadataTableName, metrics string) error { 492 return c.runSql(fmt.Sprintf("INSERT INTO `%s`.`%s` (%s, %s) "+ 493 " VALUES ('%s', NOW()) "+ 494 " ON DUPLICATE KEY UPDATE %s = NOW();", 495 qryDatabase, 496 metadataTableName, 497 catalog.SystemSI_IVFFLAT_TblCol_Metadata_key, 498 catalog.SystemSI_IVFFLAT_TblCol_Metadata_val, 499 500 metrics, 501 502 catalog.SystemSI_IVFFLAT_TblCol_Metadata_val, 503 )) 504 } 505 506 func (s *Scope) isExperimentalEnabled(c *Compile, flag string) (bool, error) { 507 508 val, err := c.proc.GetResolveVariableFunc()(flag, true, false) 509 if err != nil { 510 return false, err 511 } 512 513 if val == nil { 514 return false, nil 515 } 516 517 return fmt.Sprintf("%v", val) == "1", nil 518 } 519 520 func (s *Scope) handleIvfIndexDeleteOldEntries(c *Compile, 521 metadataTableName string, 522 centroidsTableName string, 523 entriesTableName string, 524 qryDatabase string) error { 525 526 pruneCentroidsTbl := fmt.Sprintf("DELETE FROM `%s`.`%s` WHERE `%s` < "+ 527 "(SELECT CAST(`%s` AS BIGINT) FROM `%s`.`%s` WHERE `%s` = 'version');", 528 qryDatabase, 529 centroidsTableName, 530 catalog.SystemSI_IVFFLAT_TblCol_Centroids_version, 531 532 catalog.SystemSI_IVFFLAT_TblCol_Metadata_val, 533 qryDatabase, 534 metadataTableName, 535 catalog.SystemSI_IVFFLAT_TblCol_Metadata_key, 536 ) 537 538 pruneEntriesTbl := fmt.Sprintf("DELETE FROM `%s`.`%s` WHERE `%s` < "+ 539 "(SELECT CAST(`%s` AS BIGINT) FROM `%s`.`%s` WHERE `%s` = 'version');", 540 qryDatabase, 541 entriesTableName, 542 catalog.SystemSI_IVFFLAT_TblCol_Entries_version, 543 544 catalog.SystemSI_IVFFLAT_TblCol_Metadata_val, 545 qryDatabase, 546 metadataTableName, 547 catalog.SystemSI_IVFFLAT_TblCol_Metadata_key, 548 ) 549 550 err := s.logTimestamp(c, qryDatabase, metadataTableName, "pruning_start") 551 if err != nil { 552 return err 553 } 554 555 err = c.runSql(pruneCentroidsTbl) 556 if err != nil { 557 return err 558 } 559 560 err = c.runSql(pruneEntriesTbl) 561 if err != nil { 562 return err 563 } 564 565 err = s.logTimestamp(c, qryDatabase, metadataTableName, "pruning_end") 566 if err != nil { 567 return err 568 } 569 570 return nil 571 }