github.com/matrixorigin/matrixone@v1.2.0/pkg/vm/engine/disttae/txn.go (about) 1 // Copyright 2022 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package disttae 16 17 import ( 18 "context" 19 "encoding/hex" 20 "math" 21 "sync" 22 "time" 23 24 "github.com/matrixorigin/matrixone/pkg/catalog" 25 "github.com/matrixorigin/matrixone/pkg/common/moerr" 26 "github.com/matrixorigin/matrixone/pkg/container/batch" 27 "github.com/matrixorigin/matrixone/pkg/container/types" 28 "github.com/matrixorigin/matrixone/pkg/container/vector" 29 "github.com/matrixorigin/matrixone/pkg/logutil" 30 "github.com/matrixorigin/matrixone/pkg/objectio" 31 "github.com/matrixorigin/matrixone/pkg/pb/plan" 32 "github.com/matrixorigin/matrixone/pkg/pb/timestamp" 33 "github.com/matrixorigin/matrixone/pkg/pb/txn" 34 "github.com/matrixorigin/matrixone/pkg/sql/colexec" 35 "github.com/matrixorigin/matrixone/pkg/txn/client" 36 "github.com/matrixorigin/matrixone/pkg/txn/trace" 37 v2 "github.com/matrixorigin/matrixone/pkg/util/metric/v2" 38 "github.com/matrixorigin/matrixone/pkg/vm/engine" 39 "github.com/matrixorigin/matrixone/pkg/vm/engine/disttae/cache" 40 "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/common" 41 ) 42 43 //func (txn *Transaction) getObjInfos( 44 // ctx context.Context, 45 // tbl *txnTable, 46 //) (objs []logtailreplay.ObjectEntry, err error) { 47 // ts := types.TimestampToTS(txn.op.SnapshotTS()) 48 // state, err := tbl.getPartitionState(ctx) 49 // if err != nil { 50 // return nil, err 51 // } 52 // iter, err := state.NewObjectsIter(ts) 53 // if err != nil { 54 // return nil, err 55 // } 56 // for iter.Next() { 57 // entry := iter.Entry() 58 // objs = append(objs, entry) 59 // } 60 // iter.Close() 61 // return 62 //} 63 64 // detecting whether a transaction is a read-only transaction 65 func (txn *Transaction) ReadOnly() bool { 66 return txn.readOnly.Load() 67 } 68 69 // WriteBatch used to write data to the transaction buffer 70 // insert/delete/update all use this api 71 // insertBatchHasRowId : it denotes the batch has Rowid when the typ is INSERT. 72 // if typ is not INSERT, it is always false. 73 // truncate : it denotes the batch with typ DELETE on mo_tables is generated when Truncating 74 // a table. 75 func (txn *Transaction) WriteBatch( 76 typ int, 77 accountId uint32, 78 databaseId uint64, 79 tableId uint64, 80 databaseName string, 81 tableName string, 82 bat *batch.Batch, 83 tnStore DNStore, 84 primaryIdx int, // pass -1 to indicate no primary key or disable primary key checking 85 insertBatchHasRowId bool, 86 truncate bool) error { 87 start := time.Now() 88 seq := txn.op.NextSequence() 89 trace.GetService().AddTxnDurationAction( 90 txn.op, 91 client.WorkspaceWriteEvent, 92 seq, 93 tableId, 94 0, 95 nil) 96 defer func() { 97 trace.GetService().AddTxnDurationAction( 98 txn.op, 99 client.WorkspaceWriteEvent, 100 seq, 101 tableId, 102 time.Since(start), 103 nil) 104 }() 105 106 txn.readOnly.Store(false) 107 bat.Cnt = 1 108 txn.Lock() 109 defer txn.Unlock() 110 if typ == INSERT || typ == INSERT_TXN { 111 if !insertBatchHasRowId { 112 txn.genBlock() 113 len := bat.RowCount() 114 vec := txn.proc.GetVector(types.T_Rowid.ToType()) 115 for i := 0; i < len; i++ { 116 if err := vector.AppendFixed(vec, txn.genRowId(), false, 117 txn.proc.Mp()); err != nil { 118 return err 119 } 120 } 121 bat.Vecs = append([]*vector.Vector{vec}, bat.Vecs...) 122 bat.Attrs = append([]string{catalog.Row_ID}, bat.Attrs...) 123 } 124 if tableId != catalog.MO_DATABASE_ID && 125 tableId != catalog.MO_TABLES_ID && tableId != catalog.MO_COLUMNS_ID { 126 txn.workspaceSize += uint64(bat.Size()) 127 } 128 } 129 e := Entry{ 130 typ: typ, 131 accountId: accountId, 132 bat: bat, 133 tableId: tableId, 134 databaseId: databaseId, 135 tableName: tableName, 136 databaseName: databaseName, 137 tnStore: tnStore, 138 truncate: truncate, 139 } 140 txn.writes = append(txn.writes, e) 141 txn.pkCount += bat.RowCount() 142 143 trace.GetService().TxnWrite(txn.op, tableId, typesNames[typ], bat) 144 return nil 145 } 146 147 func (txn *Transaction) dumpBatch(offset int) error { 148 txn.Lock() 149 defer txn.Unlock() 150 return txn.dumpBatchLocked(offset) 151 } 152 153 func checkPKDupGeneric[T comparable]( 154 mp map[any]bool, 155 t *types.Type, 156 vals []T, 157 start, count int) (bool, string) { 158 for _, v := range vals[start : start+count] { 159 if _, ok := mp[v]; ok { 160 entry := common.TypeStringValue(*t, v, false) 161 return true, entry 162 } 163 mp[v] = true 164 } 165 return false, "" 166 } 167 168 func checkPKDup( 169 mp map[any]bool, 170 pk *vector.Vector, 171 start, count int) (bool, string) { 172 colType := pk.GetType() 173 switch colType.Oid { 174 case types.T_bool: 175 vs := vector.MustFixedCol[bool](pk) 176 return checkPKDupGeneric[bool](mp, colType, vs, start, count) 177 case types.T_bit: 178 vs := vector.MustFixedCol[uint64](pk) 179 return checkPKDupGeneric[uint64](mp, colType, vs, start, count) 180 case types.T_int8: 181 vs := vector.MustFixedCol[int8](pk) 182 return checkPKDupGeneric[int8](mp, colType, vs, start, count) 183 case types.T_int16: 184 vs := vector.MustFixedCol[int16](pk) 185 return checkPKDupGeneric[int16](mp, colType, vs, start, count) 186 case types.T_int32: 187 vs := vector.MustFixedCol[int32](pk) 188 return checkPKDupGeneric[int32](mp, colType, vs, start, count) 189 case types.T_int64: 190 vs := vector.MustFixedCol[int64](pk) 191 return checkPKDupGeneric[int64](mp, colType, vs, start, count) 192 case types.T_uint8: 193 vs := vector.MustFixedCol[uint8](pk) 194 return checkPKDupGeneric[uint8](mp, colType, vs, start, count) 195 case types.T_uint16: 196 vs := vector.MustFixedCol[uint16](pk) 197 return checkPKDupGeneric[uint16](mp, colType, vs, start, count) 198 case types.T_uint32: 199 vs := vector.MustFixedCol[uint32](pk) 200 return checkPKDupGeneric[uint32](mp, colType, vs, start, count) 201 case types.T_uint64: 202 vs := vector.MustFixedCol[uint64](pk) 203 return checkPKDupGeneric[uint64](mp, colType, vs, start, count) 204 case types.T_decimal64: 205 vs := vector.MustFixedCol[types.Decimal64](pk) 206 return checkPKDupGeneric[types.Decimal64](mp, colType, vs, start, count) 207 case types.T_decimal128: 208 vs := vector.MustFixedCol[types.Decimal128](pk) 209 return checkPKDupGeneric[types.Decimal128](mp, colType, vs, start, count) 210 case types.T_uuid: 211 vs := vector.MustFixedCol[types.Uuid](pk) 212 return checkPKDupGeneric[types.Uuid](mp, colType, vs, start, count) 213 case types.T_float32: 214 vs := vector.MustFixedCol[float32](pk) 215 return checkPKDupGeneric[float32](mp, colType, vs, start, count) 216 case types.T_float64: 217 vs := vector.MustFixedCol[float64](pk) 218 return checkPKDupGeneric[float64](mp, colType, vs, start, count) 219 case types.T_date: 220 vs := vector.MustFixedCol[types.Date](pk) 221 return checkPKDupGeneric[types.Date](mp, colType, vs, start, count) 222 case types.T_timestamp: 223 vs := vector.MustFixedCol[types.Timestamp](pk) 224 return checkPKDupGeneric[types.Timestamp](mp, colType, vs, start, count) 225 case types.T_time: 226 vs := vector.MustFixedCol[types.Time](pk) 227 return checkPKDupGeneric[types.Time](mp, colType, vs, start, count) 228 case types.T_datetime: 229 vs := vector.MustFixedCol[types.Datetime](pk) 230 return checkPKDupGeneric[types.Datetime](mp, colType, vs, start, count) 231 case types.T_enum: 232 vs := vector.MustFixedCol[types.Enum](pk) 233 return checkPKDupGeneric[types.Enum](mp, colType, vs, start, count) 234 case types.T_TS: 235 vs := vector.MustFixedCol[types.TS](pk) 236 return checkPKDupGeneric[types.TS](mp, colType, vs, start, count) 237 case types.T_Rowid: 238 vs := vector.MustFixedCol[types.Rowid](pk) 239 return checkPKDupGeneric[types.Rowid](mp, colType, vs, start, count) 240 case types.T_Blockid: 241 vs := vector.MustFixedCol[types.Blockid](pk) 242 return checkPKDupGeneric[types.Blockid](mp, colType, vs, start, count) 243 case types.T_char, types.T_varchar, types.T_json, 244 types.T_binary, types.T_varbinary, types.T_blob, types.T_text: 245 for i := start; i < start+count; i++ { 246 v := pk.GetStringAt(i) 247 if _, ok := mp[v]; ok { 248 entry := common.TypeStringValue(*colType, []byte(v), false) 249 return true, entry 250 } 251 mp[v] = true 252 } 253 case types.T_array_float32: 254 for i := start; i < start+count; i++ { 255 v := types.ArrayToString[float32](vector.GetArrayAt[float32](pk, i)) 256 if _, ok := mp[v]; ok { 257 entry := common.TypeStringValue(*colType, pk.GetBytesAt(i), false) 258 return true, entry 259 } 260 mp[v] = true 261 } 262 case types.T_array_float64: 263 for i := start; i < start+count; i++ { 264 v := types.ArrayToString[float64](vector.GetArrayAt[float64](pk, i)) 265 if _, ok := mp[v]; ok { 266 entry := common.TypeStringValue(*colType, pk.GetBytesAt(i), false) 267 return true, entry 268 } 269 mp[v] = true 270 } 271 default: 272 panic(moerr.NewInternalErrorNoCtx("%s not supported", pk.GetType().String())) 273 } 274 return false, "" 275 } 276 277 // checkDup check whether the txn.writes has duplicate pk entry 278 func (txn *Transaction) checkDup() error { 279 start := time.Now() 280 defer func() { 281 v2.TxnCheckPKDupDurationHistogram.Observe(time.Since(start).Seconds()) 282 }() 283 //table id is global unique 284 tablesDef := make(map[uint64]*plan.TableDef) 285 pkIndex := make(map[uint64]int) 286 insertPks := make(map[uint64]map[any]bool) 287 delPks := make(map[uint64]map[any]bool) 288 289 for _, e := range txn.writes { 290 if e.bat == nil || e.bat.RowCount() == 0 { 291 continue 292 } 293 if e.fileName != "" { 294 continue 295 } 296 if (e.typ == DELETE || e.typ == DELETE_TXN || e.typ == UPDATE) && 297 e.databaseId == catalog.MO_CATALOG_ID && 298 e.tableId == catalog.MO_COLUMNS_ID { 299 continue 300 } 301 302 dbkey := genDatabaseKey(e.accountId, e.databaseName) 303 if _, ok := txn.deletedDatabaseMap.Load(dbkey); ok { 304 continue 305 } 306 307 tableKey := genTableKey(e.accountId, e.tableName, e.databaseId) 308 if _, ok := txn.deletedTableMap.Load(tableKey); ok { 309 continue 310 } 311 if e.typ == INSERT || e.typ == INSERT_TXN { 312 if _, ok := tablesDef[e.tableId]; !ok { 313 tbl, err := txn.getTable(e.accountId, e.databaseName, e.tableName) 314 if err != nil { 315 return err 316 } 317 tablesDef[e.tableId] = tbl.GetTableDef(txn.proc.Ctx) 318 } 319 tableDef := tablesDef[e.tableId] 320 if _, ok := pkIndex[e.tableId]; !ok { 321 for idx, colDef := range tableDef.Cols { 322 if colDef.Name == tableDef.Pkey.PkeyColName { 323 if colDef.Name == catalog.FakePrimaryKeyColName { 324 pkIndex[e.tableId] = -1 325 } else { 326 pkIndex[e.tableId] = idx 327 } 328 break 329 } 330 } 331 } 332 bat := e.bat 333 if index, ok := pkIndex[e.tableId]; ok && index != -1 { 334 if *bat.Vecs[0].GetType() == types.T_Rowid.ToType() { 335 newBat := batch.NewWithSize(len(bat.Vecs) - 1) 336 newBat.SetAttributes(bat.Attrs[1:]) 337 newBat.Vecs = bat.Vecs[1:] 338 newBat.SetRowCount(bat.Vecs[0].Length()) 339 bat = newBat 340 } 341 if _, ok := insertPks[e.tableId]; !ok { 342 insertPks[e.tableId] = make(map[any]bool) 343 } 344 if dup, pk := checkPKDup( 345 insertPks[e.tableId], 346 bat.Vecs[index], 347 0, 348 bat.RowCount()); dup { 349 logutil.Errorf("txn:%s wants to insert duplicate primary key:%s in table:[%v-%v:%s-%s]", 350 hex.EncodeToString(txn.op.Txn().ID), 351 pk, 352 e.databaseId, 353 e.tableId, 354 e.databaseName, 355 e.tableName) 356 return moerr.NewDuplicateEntryNoCtx(pk, bat.Attrs[index]) 357 } 358 } 359 continue 360 } 361 //if entry.tyep is DELETE, then e.bat.Vecs[0] is rowid,e.bat.Vecs[1] is PK 362 if e.typ == DELETE || e.typ == DELETE_TXN { 363 if len(e.bat.Vecs) < 2 { 364 logutil.Warnf("delete has no pk, database:%s, table:%s", 365 e.databaseName, e.tableName) 366 continue 367 } 368 if _, ok := delPks[e.tableId]; !ok { 369 delPks[e.tableId] = make(map[any]bool) 370 } 371 if dup, pk := checkPKDup( 372 delPks[e.tableId], 373 e.bat.Vecs[1], 374 0, 375 e.bat.RowCount()); dup { 376 logutil.Errorf("txn:%s wants to delete duplicate primary key:%s in table:[%v-%v:%s-%s]", 377 hex.EncodeToString(txn.op.Txn().ID), 378 pk, 379 e.databaseId, 380 e.tableId, 381 e.databaseName, 382 e.tableName) 383 return moerr.NewDuplicateEntryNoCtx(pk, e.bat.Attrs[1]) 384 } 385 } 386 } 387 return nil 388 } 389 390 // dumpBatch if txn.workspaceSize is larger than threshold, cn will write workspace to s3 391 // start from write offset. Pass in offset -1 to dump all. Note that dump all will 392 // modify txn.writes, so it can only be called right before txn.commit. 393 func (txn *Transaction) dumpBatchLocked(offset int) error { 394 var size uint64 395 var pkCount int 396 if txn.workspaceSize < WorkspaceThreshold { 397 return nil 398 } 399 400 dumpAll := offset < 0 401 if dumpAll { 402 offset = 0 403 } 404 405 if !dumpAll { 406 for i := offset; i < len(txn.writes); i++ { 407 if txn.writes[i].tableId == catalog.MO_DATABASE_ID || 408 txn.writes[i].tableId == catalog.MO_TABLES_ID || 409 txn.writes[i].tableId == catalog.MO_COLUMNS_ID { 410 continue 411 } 412 if txn.writes[i].bat == nil || txn.writes[i].bat.RowCount() == 0 { 413 continue 414 } 415 if txn.writes[i].typ == INSERT && txn.writes[i].fileName == "" { 416 size += uint64(txn.writes[i].bat.Size()) 417 } 418 } 419 if size < WorkspaceThreshold { 420 return nil 421 } 422 size = 0 423 } 424 txn.hasS3Op.Store(true) 425 mp := make(map[tableKey][]*batch.Batch) 426 427 lastTxnWritesIndex := offset 428 for i := offset; i < len(txn.writes); i++ { 429 if txn.writes[i].tableId == catalog.MO_DATABASE_ID || 430 txn.writes[i].tableId == catalog.MO_TABLES_ID || 431 txn.writes[i].tableId == catalog.MO_COLUMNS_ID { 432 txn.writes[lastTxnWritesIndex] = txn.writes[i] 433 lastTxnWritesIndex++ 434 continue 435 } 436 if txn.writes[i].bat == nil || txn.writes[i].bat.RowCount() == 0 { 437 txn.writes[lastTxnWritesIndex] = txn.writes[i] 438 lastTxnWritesIndex++ 439 continue 440 } 441 442 keepElement := true 443 if txn.writes[i].typ == INSERT && txn.writes[i].fileName == "" { 444 tbKey := tableKey{ 445 accountId: txn.writes[i].accountId, 446 databaseId: txn.writes[i].databaseId, 447 dbName: txn.writes[i].databaseName, 448 name: txn.writes[i].tableName, 449 } 450 bat := txn.writes[i].bat 451 size += uint64(bat.Size()) 452 pkCount += bat.RowCount() 453 // skip rowid 454 newBat := batch.NewWithSize(len(bat.Vecs) - 1) 455 newBat.SetAttributes(bat.Attrs[1:]) 456 newBat.Vecs = bat.Vecs[1:] 457 newBat.SetRowCount(bat.Vecs[0].Length()) 458 mp[tbKey] = append(mp[tbKey], newBat) 459 txn.toFreeBatches[tbKey] = append(txn.toFreeBatches[tbKey], bat) 460 461 keepElement = false 462 } 463 464 if keepElement { 465 txn.writes[lastTxnWritesIndex] = txn.writes[i] 466 lastTxnWritesIndex++ 467 } 468 } 469 txn.writes = txn.writes[:lastTxnWritesIndex] 470 471 for tbKey := range mp { 472 // scenario 2 for cn write s3, more info in the comment of S3Writer 473 tbl, err := txn.getTable(tbKey.accountId, tbKey.dbName, tbKey.name) 474 if err != nil { 475 return err 476 } 477 478 tableDef := tbl.GetTableDef(txn.proc.Ctx) 479 480 s3Writer, err := colexec.AllocS3Writer(txn.proc, tableDef) 481 if err != nil { 482 return err 483 } 484 defer s3Writer.Free(txn.proc) 485 486 s3Writer.InitBuffers(txn.proc, mp[tbKey][0]) 487 for i := 0; i < len(mp[tbKey]); i++ { 488 s3Writer.Put(mp[tbKey][i], txn.proc) 489 } 490 err = s3Writer.SortAndFlush(txn.proc) 491 492 if err != nil { 493 return err 494 } 495 blockInfo := s3Writer.GetBlockInfoBat() 496 497 lenVecs := len(blockInfo.Attrs) 498 // only remain the metaLoc col and object stats 499 blockInfo.Vecs = blockInfo.Vecs[lenVecs-2:] 500 blockInfo.Attrs = blockInfo.Attrs[lenVecs-2:] 501 blockInfo.SetRowCount(blockInfo.Vecs[0].Length()) 502 503 table := tbl.(*txnTable) 504 fileName := objectio.DecodeBlockInfo( 505 blockInfo.Vecs[0].GetBytesAt(0)). 506 MetaLocation().Name().String() 507 err = table.getTxn().WriteFileLocked( 508 INSERT, 509 table.accountId, 510 table.db.databaseId, 511 table.tableId, 512 table.db.databaseName, 513 table.tableName, 514 fileName, 515 blockInfo, 516 table.getTxn().tnStores[0], 517 ) 518 if err != nil { 519 return err 520 } 521 } 522 523 if dumpAll { 524 txn.workspaceSize = 0 525 txn.pkCount -= pkCount 526 // modifies txn.writes. 527 writes := txn.writes[:0] 528 for i, write := range txn.writes { 529 if write.bat != nil { 530 writes = append(writes, txn.writes[i]) 531 } 532 } 533 txn.writes = writes 534 } else { 535 txn.workspaceSize -= size 536 txn.pkCount -= pkCount 537 } 538 return nil 539 } 540 541 func (txn *Transaction) getTable(id uint32, dbName string, tbName string) (engine.Relation, error) { 542 database, err := txn.engine.DatabaseByAccountID(id, dbName, txn.proc.TxnOperator) 543 if err != nil { 544 return nil, err 545 } 546 tbl, err := database.(*txnDatabase).RelationByAccountID(id, tbName, nil) 547 if err != nil { 548 return nil, err 549 } 550 return tbl, nil 551 } 552 553 // vec contains block infos. 554 func (txn *Transaction) insertPosForCNBlock( 555 vec *vector.Vector, 556 id uint32, 557 b *batch.Batch, 558 dbName string, 559 tbName string) error { 560 blks := vector.MustBytesCol(vec) 561 for i, blk := range blks { 562 blkInfo := *objectio.DecodeBlockInfo(blk) 563 txn.cnBlkId_Pos[blkInfo.BlockID] = Pos{ 564 bat: b, 565 accountId: id, 566 dbName: dbName, 567 tbName: tbName, 568 offset: int64(i), 569 blkInfo: blkInfo} 570 } 571 return nil 572 } 573 574 func (txn *Transaction) WriteFileLocked( 575 typ int, 576 accountId uint32, 577 databaseId, 578 tableId uint64, 579 databaseName, 580 tableName string, 581 fileName string, 582 bat *batch.Batch, 583 tnStore DNStore) error { 584 txn.hasS3Op.Store(true) 585 newBat := bat 586 if typ == INSERT { 587 newBat = batch.NewWithSize(len(bat.Vecs)) 588 newBat.SetAttributes([]string{catalog.BlockMeta_MetaLoc, catalog.ObjectMeta_ObjectStats}) 589 590 for idx := 0; idx < newBat.VectorCount(); idx++ { 591 newBat.SetVector(int32(idx), vector.NewVec(*bat.Vecs[idx].GetType())) 592 } 593 594 blkInfosVec := bat.Vecs[0] 595 for idx := 0; idx < blkInfosVec.Length(); idx++ { 596 blkInfo := *objectio.DecodeBlockInfo(blkInfosVec.GetBytesAt(idx)) 597 vector.AppendBytes(newBat.Vecs[0], []byte(blkInfo.MetaLocation().String()), 598 false, txn.proc.Mp()) 599 colexec.Get().PutCnSegment(&blkInfo.SegmentID, colexec.CnBlockIdType) 600 } 601 602 // append obj stats, may multiple 603 statsListVec := bat.Vecs[1] 604 for idx := 0; idx < statsListVec.Length(); idx++ { 605 vector.AppendBytes(newBat.Vecs[1], statsListVec.GetBytesAt(idx), false, txn.proc.Mp()) 606 } 607 newBat.SetRowCount(bat.Vecs[0].Length()) 608 609 txn.insertPosForCNBlock( 610 bat.GetVector(0), 611 accountId, 612 newBat, 613 databaseName, 614 tableName) 615 } 616 txn.readOnly.Store(false) 617 entry := Entry{ 618 typ: typ, 619 accountId: accountId, 620 tableId: tableId, 621 databaseId: databaseId, 622 tableName: tableName, 623 databaseName: databaseName, 624 fileName: fileName, 625 bat: newBat, 626 tnStore: tnStore, 627 } 628 txn.writes = append(txn.writes, entry) 629 return nil 630 } 631 632 // WriteFile used to add a s3 file information to the transaction buffer 633 // insert/delete/update all use this api 634 func (txn *Transaction) WriteFile( 635 typ int, 636 accountId uint32, 637 databaseId, 638 tableId uint64, 639 databaseName, 640 tableName string, 641 fileName string, 642 bat *batch.Batch, 643 tnStore DNStore) error { 644 txn.Lock() 645 defer txn.Unlock() 646 return txn.WriteFileLocked( 647 typ, accountId, databaseId, tableId, 648 databaseName, tableName, fileName, bat, tnStore) 649 } 650 651 func (txn *Transaction) deleteBatch(bat *batch.Batch, 652 databaseId, tableId uint64) *batch.Batch { 653 start := time.Now() 654 seq := txn.op.NextSequence() 655 trace.GetService().AddTxnDurationAction( 656 txn.op, 657 client.WorkspaceWriteEvent, 658 seq, 659 tableId, 660 0, 661 nil) 662 defer func() { 663 trace.GetService().AddTxnDurationAction( 664 txn.op, 665 client.WorkspaceWriteEvent, 666 seq, 667 tableId, 668 time.Since(start), 669 nil) 670 }() 671 672 trace.GetService().TxnWrite(txn.op, tableId, typesNames[DELETE], bat) 673 674 mp := make(map[types.Rowid]uint8) 675 deleteBlkId := make(map[types.Blockid]bool) 676 rowids := vector.MustFixedCol[types.Rowid](bat.GetVector(0)) 677 min1 := uint32(math.MaxUint32) 678 max1 := uint32(0) 679 cnRowIdOffsets := make([]int64, 0, len(rowids)) 680 for i, rowid := range rowids { 681 // process cn block deletes 682 uid := rowid.BorrowSegmentID() 683 blkid := rowid.CloneBlockID() 684 deleteBlkId[blkid] = true 685 mp[rowid] = 0 686 rowOffset := rowid.GetRowOffset() 687 if colexec.Get() != nil && colexec.Get().GetCnSegmentType(uid) == colexec.CnBlockIdType { 688 txn.deletedBlocks.addDeletedBlocks(&blkid, []int64{int64(rowOffset)}) 689 cnRowIdOffsets = append(cnRowIdOffsets, int64(i)) 690 continue 691 } 692 if rowOffset < (min1) { 693 min1 = rowOffset 694 } 695 696 if rowOffset > max1 { 697 max1 = rowOffset 698 } 699 // update workspace 700 } 701 // cn rowId antiShrink 702 bat.Shrink(cnRowIdOffsets, true) 703 if bat.RowCount() == 0 { 704 return bat 705 } 706 sels := txn.proc.Mp().GetSels() 707 txn.deleteTableWrites(databaseId, tableId, sels, deleteBlkId, min1, max1, mp) 708 709 sels = sels[:0] 710 rowids = vector.MustFixedCol[types.Rowid](bat.GetVector(0)) 711 for k, rowid := range rowids { 712 // put rowid to be deleted into sels. 713 if mp[rowid] != 0 { 714 sels = append(sels, int64(k)) 715 } 716 } 717 bat.Shrink(sels, true) 718 txn.proc.Mp().PutSels(sels) 719 return bat 720 } 721 722 // Delete rows belongs to uncommitted raw data batch in txn's workspace. 723 func (txn *Transaction) deleteTableWrites( 724 databaseId uint64, 725 tableId uint64, 726 sels []int64, 727 deleteBlkId map[types.Blockid]bool, 728 min, max uint32, 729 mp map[types.Rowid]uint8, 730 ) { 731 txn.Lock() 732 defer txn.Unlock() 733 734 // txn worksapce will have four batch type: 735 // 1.RawBatch 2.DN Block RowId(mixed rowid from different block) 736 // 3.CN block Meta batch(record block meta generated by cn insert write s3) 737 // 4.DN delete Block Meta batch(record block meta generated by cn delete write s3) 738 for _, e := range txn.writes { 739 // nil batch will generated by comapction or dumpBatch 740 if e.bat == nil { 741 continue 742 } 743 if e.typ == UPDATE || e.typ == ALTER { 744 continue 745 } 746 // for 3 and 4 above. 747 if e.bat.Attrs[0] == catalog.BlockMeta_MetaLoc || 748 e.bat.Attrs[0] == catalog.BlockMeta_DeltaLoc { 749 continue 750 } 751 sels = sels[:0] 752 if e.tableId == tableId && e.databaseId == databaseId { 753 vs := vector.MustFixedCol[types.Rowid](e.bat.GetVector(0)) 754 if len(vs) == 0 { 755 continue 756 } 757 // skip 2 above 758 if !vs[0].BorrowSegmentID().Eq(txn.segId) { 759 continue 760 } 761 // Now, e.bat is uncommitted raw data batch which belongs to only one block allocated by CN. 762 // so if e.bat is not to be deleted,skip it. 763 if !deleteBlkId[vs[0].CloneBlockID()] { 764 continue 765 } 766 min2 := vs[0].GetRowOffset() 767 max2 := vs[len(vs)-1].GetRowOffset() 768 if min > max2 || max < min2 { 769 continue 770 } 771 for k, v := range vs { 772 if _, ok := mp[v]; !ok { 773 // if the v is not to be deleted, then add its index into the sels. 774 sels = append(sels, int64(k)) 775 } else { 776 mp[v]++ 777 } 778 } 779 if len(sels) != len(vs) { 780 txn.batchSelectList[e.bat] = append(txn.batchSelectList[e.bat], sels...) 781 } 782 } 783 } 784 } 785 786 func (txn *Transaction) allocateID(ctx context.Context) (uint64, error) { 787 ctx, cancel := context.WithTimeout(ctx, time.Minute) 788 defer cancel() 789 return txn.idGen.AllocateID(ctx) 790 } 791 792 func (txn *Transaction) genBlock() { 793 txn.rowId[4]++ 794 txn.rowId[5] = INIT_ROWID_OFFSET 795 } 796 797 func (txn *Transaction) genRowId() types.Rowid { 798 if txn.rowId[5] != INIT_ROWID_OFFSET { 799 txn.rowId[5]++ 800 } else { 801 txn.rowId[5] = 0 802 } 803 return types.DecodeFixed[types.Rowid](types.EncodeSlice(txn.rowId[:])) 804 } 805 806 func (txn *Transaction) mergeTxnWorkspaceLocked() error { 807 if len(txn.batchSelectList) > 0 { 808 for _, e := range txn.writes { 809 if sels, ok := txn.batchSelectList[e.bat]; ok { 810 e.bat.Shrink(sels, false) 811 delete(txn.batchSelectList, e.bat) 812 } 813 } 814 } 815 return txn.compactionBlksLocked() 816 } 817 818 // CN blocks compaction for txn 819 func (txn *Transaction) compactionBlksLocked() error { 820 compactedBlks := make(map[tableKey]map[objectio.ObjectLocation][]int64) 821 compactedEntries := make(map[*batch.Batch][]int64) 822 defer func() { 823 //txn.deletedBlocks = nil 824 txn.deletedBlocks.clean() 825 }() 826 txn.deletedBlocks.iter( 827 func(blkId *types.Blockid, offsets []int64) bool { 828 pos := txn.cnBlkId_Pos[*blkId] 829 if v, ok := compactedBlks[tableKey{ 830 accountId: pos.accountId, 831 dbName: pos.dbName, 832 name: pos.tbName, 833 }]; ok { 834 v[pos.blkInfo.MetaLoc] = offsets 835 } else { 836 compactedBlks[tableKey{ 837 accountId: pos.accountId, 838 dbName: pos.dbName, 839 name: pos.tbName, 840 }] = 841 map[objectio.ObjectLocation][]int64{pos.blkInfo.MetaLoc: offsets} 842 } 843 compactedEntries[pos.bat] = append(compactedEntries[pos.bat], pos.offset) 844 //delete(txn.cnBlkId_Pos, *blkId) 845 return true 846 }) 847 848 for tbKey, blks := range compactedBlks { 849 rel, err := txn.getTable(tbKey.accountId, tbKey.dbName, tbKey.name) 850 if err != nil { 851 return err 852 } 853 //TODO::do parallel compaction for table 854 tbl := rel.(*txnTable) 855 createdBlks, stats, err := tbl.compaction(blks) 856 if err != nil { 857 return err 858 } 859 if len(createdBlks) > 0 { 860 bat := batch.NewWithSize(2) 861 bat.Attrs = []string{catalog.BlockMeta_BlockInfo, catalog.ObjectMeta_ObjectStats} 862 bat.SetVector(0, vector.NewVec(types.T_text.ToType())) 863 bat.SetVector(1, vector.NewVec(types.T_binary.ToType())) 864 for _, blkInfo := range createdBlks { 865 vector.AppendBytes( 866 bat.GetVector(0), 867 objectio.EncodeBlockInfo(blkInfo), 868 false, 869 tbl.getTxn().proc.GetMPool()) 870 } 871 872 // append the object stats to bat 873 for idx := 0; idx < len(stats); idx++ { 874 if stats[idx].IsZero() { 875 continue 876 } 877 if err = vector.AppendBytes(bat.Vecs[1], stats[idx].Marshal(), 878 false, tbl.getTxn().proc.GetMPool()); err != nil { 879 return err 880 } 881 } 882 883 bat.SetRowCount(len(createdBlks)) 884 defer func() { 885 bat.Clean(tbl.getTxn().proc.GetMPool()) 886 }() 887 888 err := txn.WriteFileLocked( 889 INSERT, 890 tbl.accountId, 891 tbl.db.databaseId, 892 tbl.tableId, 893 tbl.db.databaseName, 894 tbl.tableName, 895 createdBlks[0].MetaLocation().Name().String(), 896 bat, 897 tbl.getTxn().tnStores[0], 898 ) 899 if err != nil { 900 return err 901 } 902 } 903 } 904 905 //compaction for txn.writes 906 for i, entry := range txn.writes { 907 if entry.bat == nil || entry.bat.IsEmpty() { 908 continue 909 } 910 911 if entry.typ == INSERT_TXN { 912 continue 913 } 914 915 if entry.typ != INSERT || 916 entry.bat.Attrs[0] != catalog.BlockMeta_MetaLoc { 917 continue 918 } 919 entry.bat.Shrink(compactedEntries[entry.bat], true) 920 if entry.bat.RowCount() == 0 { 921 txn.writes[i].bat.Clean(txn.proc.GetMPool()) 922 txn.writes[i].bat = nil 923 } 924 } 925 return nil 926 } 927 928 func (txn *Transaction) hasDeletesOnUncommitedObject() bool { 929 return !txn.deletedBlocks.isEmpty() 930 } 931 932 func (txn *Transaction) hasUncommittedDeletesOnBlock(id *types.Blockid) bool { 933 return txn.deletedBlocks.hasDeletes(id) 934 } 935 936 // TODO:: refactor in next PR, to make it more efficient and include persisted deletes in S3 937 func (txn *Transaction) forEachTableHasDeletesLocked(f func(tbl *txnTable) error) error { 938 tables := make(map[uint64]*txnTable) 939 for i := 0; i < len(txn.writes); i++ { 940 e := txn.writes[i] 941 if e.typ != DELETE || e.fileName != "" { 942 continue 943 } 944 if _, ok := tables[e.tableId]; ok { 945 continue 946 } 947 db, err := txn.engine.Database(txn.proc.Ctx, e.databaseName, txn.op) 948 if err != nil { 949 return err 950 } 951 rel, err := db.Relation(txn.proc.Ctx, e.tableName, nil) 952 if err != nil { 953 return err 954 } 955 tables[e.tableId] = rel.(*txnTable) 956 } 957 for _, tbl := range tables { 958 if err := f(tbl); err != nil { 959 return err 960 } 961 } 962 return nil 963 } 964 965 func (txn *Transaction) forEachTableWrites(databaseId uint64, tableId uint64, offset int, f func(Entry)) { 966 txn.Lock() 967 defer txn.Unlock() 968 for i := 0; i < offset; i++ { 969 e := txn.writes[i] 970 if e.databaseId != databaseId { 971 continue 972 } 973 if e.tableId != tableId { 974 continue 975 } 976 f(e) 977 } 978 } 979 980 // getCachedTable returns the cached table in this transaction if it exists, nil otherwise. 981 // Before it gets the cached table, it checks whether the table is deleted by another 982 // transaction by go through the delete tables slice, and advance its cachedIndex. 983 func (txn *Transaction) getCachedTable( 984 ctx context.Context, 985 k tableKey, 986 ) *txnTable { 987 var tbl *txnTable 988 if v, ok := txn.tableCache.tableMap.Load(k); ok { 989 tbl = v.(*txnTable) 990 991 tblKey := cache.TableKey{ 992 AccountId: k.accountId, 993 DatabaseId: k.databaseId, 994 Name: k.name, 995 } 996 var catache *cache.CatalogCache 997 var err error 998 if !txn.op.IsSnapOp() { 999 catache = txn.engine.getLatestCatalogCache() 1000 } else { 1001 catache, err = txn.engine.getOrCreateSnapCatalogCache( 1002 ctx, 1003 types.TimestampToTS(txn.op.SnapshotTS())) 1004 if err != nil { 1005 return nil 1006 } 1007 } 1008 val := catache.GetSchemaVersion(tblKey) 1009 if val != nil { 1010 if val.Ts.Greater(tbl.lastTS) && val.Version != tbl.version { 1011 txn.tableCache.tableMap.Delete(genTableKey(k.accountId, k.name, k.databaseId)) 1012 return nil 1013 } 1014 } 1015 1016 } 1017 return tbl 1018 } 1019 1020 func (txn *Transaction) Commit(ctx context.Context) ([]txn.TxnRequest, error) { 1021 logDebugf(txn.op.Txn(), "Transaction.Commit") 1022 txn.IncrStatementID(ctx, true) 1023 defer txn.delTransaction() 1024 if txn.readOnly.Load() { 1025 return nil, nil 1026 } 1027 if err := txn.mergeTxnWorkspaceLocked(); err != nil { 1028 return nil, err 1029 } 1030 if err := txn.dumpBatchLocked(-1); err != nil { 1031 return nil, err 1032 } 1033 1034 txn.traceWorkspaceLocked(true) 1035 1036 if !txn.hasS3Op.Load() && 1037 txn.op.TxnOptions().CheckDupEnabled() { 1038 if err := txn.checkDup(); err != nil { 1039 return nil, err 1040 } 1041 } 1042 reqs, err := genWriteReqs(ctx, txn.writes, txn.op) 1043 if err != nil { 1044 return nil, err 1045 } 1046 return reqs, nil 1047 } 1048 1049 func (txn *Transaction) Rollback(ctx context.Context) error { 1050 logDebugf(txn.op.Txn(), "Transaction.Rollback") 1051 //to gc the s3 objs 1052 if err := txn.gcObjs(0); err != nil { 1053 panic("Rollback txn failed: to gc objects generated by CN failed") 1054 } 1055 txn.delTransaction() 1056 return nil 1057 } 1058 1059 func (txn *Transaction) delTransaction() { 1060 if txn.removed { 1061 return 1062 } 1063 for i := range txn.writes { 1064 if txn.writes[i].bat == nil { 1065 continue 1066 } 1067 txn.proc.PutBatch(txn.writes[i].bat) 1068 } 1069 txn.tableCache.cachedIndex = -1 1070 txn.tableCache.tableMap = nil 1071 txn.createMap = nil 1072 txn.databaseMap = nil 1073 txn.deletedDatabaseMap = nil 1074 txn.deletedTableMap = nil 1075 txn.blockId_tn_delete_metaLoc_batch.data = nil 1076 txn.deletedBlocks = nil 1077 segmentnames := make([]objectio.Segmentid, 0, len(txn.cnBlkId_Pos)+1) 1078 segmentnames = append(segmentnames, txn.segId) 1079 for blkId := range txn.cnBlkId_Pos { 1080 // blkId: 1081 // |------|----------|----------| 1082 // uuid filelen blkoffset 1083 // 16 2 2 1084 segmentnames = append(segmentnames, *blkId.Segment()) 1085 } 1086 colexec.Get().DeleteTxnSegmentIds(segmentnames) 1087 txn.cnBlkId_Pos = nil 1088 txn.hasS3Op.Store(false) 1089 txn.removed = true 1090 } 1091 1092 func (txn *Transaction) addCreateTable( 1093 key tableKey, 1094 value *txnTable) { 1095 txn.Lock() 1096 defer txn.Unlock() 1097 value.createByStatementID = txn.statementID 1098 txn.createMap.Store(key, value) 1099 } 1100 1101 func (txn *Transaction) rollbackCreateTableLocked() { 1102 txn.createMap.Range(func(key, value any) bool { 1103 if value.(*txnTable).createByStatementID == txn.statementID { 1104 txn.createMap.Delete(key) 1105 } 1106 return true 1107 }) 1108 } 1109 1110 func (txn *Transaction) clearTableCache() { 1111 txn.tableCache.tableMap.Range(func(key, value any) bool { 1112 txn.tableCache.tableMap.Delete(key) 1113 return true 1114 }) 1115 } 1116 1117 func (txn *Transaction) GetSnapshotWriteOffset() int { 1118 txn.Lock() 1119 defer txn.Unlock() 1120 return txn.snapshotWriteOffset 1121 } 1122 1123 func (txn *Transaction) transferDeletesLocked() error { 1124 txn.timestamps = append(txn.timestamps, txn.op.SnapshotTS()) 1125 if txn.statementID > 0 && txn.op.Txn().IsRCIsolation() { 1126 var ts timestamp.Timestamp 1127 if txn.statementID == 1 { 1128 ts = txn.timestamps[0] 1129 } else { 1130 //statementID > 1 1131 ts = txn.timestamps[txn.statementID-2] 1132 } 1133 return txn.forEachTableHasDeletesLocked(func(tbl *txnTable) error { 1134 ctx := tbl.proc.Load().Ctx 1135 state, err := tbl.getPartitionState(ctx) 1136 if err != nil { 1137 return err 1138 } 1139 deleteObjs, createObjs := state.GetChangedObjsBetween(types.TimestampToTS(ts), 1140 types.TimestampToTS(tbl.db.op.SnapshotTS())) 1141 1142 trace.GetService().ApplyFlush( 1143 tbl.db.op.Txn().ID, 1144 tbl.tableId, 1145 ts, 1146 tbl.db.op.SnapshotTS(), 1147 len(deleteObjs)) 1148 1149 if len(deleteObjs) > 0 { 1150 if err := tbl.transferDeletes(ctx, state, deleteObjs, createObjs); err != nil { 1151 return err 1152 } 1153 } 1154 return nil 1155 }) 1156 } 1157 return nil 1158 } 1159 1160 func (txn *Transaction) UpdateSnapshotWriteOffset() { 1161 txn.Lock() 1162 defer txn.Unlock() 1163 txn.snapshotWriteOffset = len(txn.writes) 1164 } 1165 1166 func (txn *Transaction) CloneSnapshotWS() client.Workspace { 1167 ws := &Transaction{ 1168 proc: txn.proc, 1169 engine: txn.engine, 1170 tnStores: txn.tnStores, 1171 1172 tableCache: struct { 1173 cachedIndex int 1174 tableMap *sync.Map 1175 }{tableMap: new(sync.Map)}, 1176 databaseMap: new(sync.Map), 1177 deletedDatabaseMap: new(sync.Map), 1178 createMap: new(sync.Map), 1179 deletedTableMap: new(sync.Map), 1180 deletedBlocks: &deletedBlocks{ 1181 offsets: map[types.Blockid][]int64{}, 1182 }, 1183 cnBlkId_Pos: map[types.Blockid]Pos{}, 1184 batchSelectList: make(map[*batch.Batch][]int64), 1185 toFreeBatches: make(map[tableKey][]*batch.Batch), 1186 } 1187 1188 ws.blockId_tn_delete_metaLoc_batch = struct { 1189 sync.RWMutex 1190 data map[types.Blockid][]*batch.Batch 1191 }{data: make(map[types.Blockid][]*batch.Batch)} 1192 1193 ws.readOnly.Store(true) 1194 1195 return ws 1196 } 1197 1198 func (txn *Transaction) BindTxnOp(op client.TxnOperator) { 1199 txn.op = op 1200 }