// Copyright 2023 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package logtailreplay

import (
	"bytes"
	"context"
	"fmt"
	"runtime/trace"
	"sync"
	"sync/atomic"
	"unsafe"

	"github.com/matrixorigin/matrixone/pkg/container/batch"
	"github.com/matrixorigin/matrixone/pkg/container/types"
	"github.com/matrixorigin/matrixone/pkg/container/vector"
	"github.com/matrixorigin/matrixone/pkg/fileservice"
	"github.com/matrixorigin/matrixone/pkg/logutil"
	"github.com/matrixorigin/matrixone/pkg/objectio"
	"github.com/matrixorigin/matrixone/pkg/pb/api"
	"github.com/matrixorigin/matrixone/pkg/perfcounter"
	txnTrace "github.com/matrixorigin/matrixone/pkg/txn/trace"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/blockio"
	"github.com/tidwall/btree"
)

// PartitionState is an in-memory, btree-backed snapshot of a table
// partition, maintained by replaying logtail entries (and lazily-consumed
// checkpoints) on CN.
type PartitionState struct {
	// also modify the Copy method if adding fields

	// data
	rows *btree.BTreeG[RowEntry] // use value type to avoid locking on elements
	//table data objects
	dataObjects *btree.BTreeG[ObjectEntry]
	//TODO:: It's transient, should be removed in future PR.
	blockDeltas *btree.BTreeG[BlockDeltaEntry]
	// checkpoints pending consumption; see AppendCheckpoint/consumeCheckpoints.
	checkpoints []string
	// start/end bound the cached checkpoint duration; see CacheCkpDuration.
	start types.TS
	end   types.TS

	// index
	primaryIndex *btree.BTreeG[*PrimaryIndexEntry]
	//for non-appendable block's memory deletes, used to getting dirty
	// non-appendable blocks quickly.
	//TODO::remove it
	dirtyBlocks *btree.BTreeG[types.Blockid]
	//index for objects by timestamp.
	objectIndexByTS *btree.BTreeG[ObjectIndexByTSEntry]

	// noData indicates whether to retain data batch
	// for primary key dedup, reading data is not required
	noData bool

	// some data need to be shared between all states
	// should have been in the Partition structure, but doing that requires much more codes changes
	// so just put it here.
	shared *sharedStates

	// blocks deleted before minTS is hard deleted.
	// partition state can't serve txn with snapshotTS less than minTS
	minTS types.TS
}

// sharedStates is shared among all PartitionStates
type sharedStates struct {
	sync.Mutex
	// last block flush timestamp for table
	lastFlushTimestamp types.TS
}

// RowEntry represents a version of a row
type RowEntry struct {
	BlockID types.Blockid // we need to iter by block id, so put it first to allow faster iteration
	RowID   types.Rowid
	Time    types.TS

	ID                int64 // a unique version id, for primary index building and validating
	Deleted           bool
	Batch             *batch.Batch
	Offset            int64
	PrimaryIndexBytes []byte
}

// Less orders RowEntry by (BlockID asc, RowID asc, Time desc), so that
// for one row the newest version is encountered first when seeking.
func (r RowEntry) Less(than RowEntry) bool {
	// asc
	cmp := r.BlockID.Compare(than.BlockID)
	if cmp < 0 {
		return true
	}
	if cmp > 0 {
		return false
	}
	// asc
	if r.RowID.Less(than.RowID) {
		return true
	}
	if than.RowID.Less(r.RowID) {
		return false
	}
	// desc
	if than.Time.Less(&r.Time) {
		return true
	}
	if r.Time.Less(&than.Time) {
		return false
	}
	return false
}

// BlockEntry is a block's metadata plus its create/delete timestamps.
type BlockEntry struct {
	objectio.BlockInfo

	CreateTime types.TS
	DeleteTime types.TS
}

// Less orders BlockEntry by block id ascending.
func (b BlockEntry) Less(than BlockEntry) bool {
	return b.BlockID.Compare(than.BlockID) < 0
}

// BlockDeltaEntry records the latest delta location (persisted deletes)
// known for a block, together with its commit timestamp.
type BlockDeltaEntry struct {
	BlockID types.Blockid

	CommitTs types.TS
	DeltaLoc objectio.ObjectLocation
}

// Less orders BlockDeltaEntry by block id ascending.
func (b BlockDeltaEntry) Less(than BlockDeltaEntry) bool {
	return b.BlockID.Compare(than.BlockID) < 0
}

// DeltaLocation returns the delta location as a Location slice view.
func (b BlockDeltaEntry) DeltaLocation() objectio.Location {
	return b.DeltaLoc[:]
}

// ObjectInfo carries an object's stats plus its lifecycle metadata.
type ObjectInfo struct {
	objectio.ObjectStats

	EntryState  bool // true for appendable objects
	Sorted      bool
	HasDeltaLoc bool
	CommitTS    types.TS
	CreateTime  types.TS
	DeleteTime  types.TS
}

// String renders the object stats and all lifecycle fields for logging.
func (o ObjectInfo) String() string {
	return fmt.Sprintf(
		"%s; entryState: %v; sorted: %v; hasDeltaLoc: %v; commitTS: %s; createTS: %s; deleteTS: %s",
		o.ObjectStats.String(), o.EntryState,
		o.Sorted, o.HasDeltaLoc, o.CommitTS.ToString(),
		o.CreateTime.ToString(), o.DeleteTime.ToString())
}

// Location returns the object's location derived from its stats.
func (o ObjectInfo) Location() objectio.Location {
	return o.ObjectLocation()
}

// ObjectEntry is the dataObjects btree element; keyed by object short name.
type ObjectEntry struct {
	ObjectInfo
}

// Less orders ObjectEntry by the object short name bytes ascending.
func (o ObjectEntry) Less(than ObjectEntry) bool {
	return bytes.Compare((*o.ObjectShortName())[:], (*than.ObjectShortName())[:]) < 0
}

// IsEmpty reports whether the entry carries no object stats payload.
func (o ObjectEntry) IsEmpty() bool {
	return o.Size() == 0
}

// Visible reports whether the object exists at snapshot ts:
// created at or before ts and not yet deleted at ts.
func (o *ObjectEntry) Visible(ts types.TS) bool {
	return o.CreateTime.LessEq(&ts) &&
		(o.DeleteTime.IsEmpty() || ts.Less(&o.DeleteTime))
}

// Location returns the object's location derived from its stats.
func (o ObjectEntry) Location() objectio.Location {
	return o.ObjectLocation()
}

// StatsValid reports whether the stats have been filled in
// (a zero row count marks a placeholder created from block metadata).
func (o ObjectInfo) StatsValid() bool {
	return o.ObjectStats.Rows() != 0
}

// ObjectIndexByCreateTSEntry indexes objects by creation timestamp.
type ObjectIndexByCreateTSEntry struct {
	ObjectInfo
}

// Less orders by (CreateTime asc, object short name asc).
func (o ObjectIndexByCreateTSEntry) Less(than ObjectIndexByCreateTSEntry) bool {
	//asc
	if o.CreateTime.Less(&than.CreateTime) {
		return true
	}
	if than.CreateTime.Less(&o.CreateTime) {
		return false
	}

	cmp := bytes.Compare(o.ObjectShortName()[:], than.ObjectShortName()[:])
	if cmp < 0 {
		return true
	}
	if cmp > 0 {
		return false
	}
	return false
}

// Visible reports whether the object exists at snapshot ts.
func (o *ObjectIndexByCreateTSEntry) Visible(ts types.TS) bool {
	return o.CreateTime.LessEq(&ts) &&
		(o.DeleteTime.IsEmpty() || ts.Less(&o.DeleteTime))
}

// PrimaryIndexEntry maps an encoded primary key to a row version.
type PrimaryIndexEntry struct {
	Bytes      []byte
	RowEntryID int64

	// fields for validating
	BlockID types.Blockid
	RowID   types.Rowid
	Time    types.TS
}

// Less orders by (primary key bytes asc, RowEntryID asc) so multiple
// versions of the same key coexist in the index.
func (p *PrimaryIndexEntry) Less(than *PrimaryIndexEntry) bool {
	if res := bytes.Compare(p.Bytes, than.Bytes); res < 0 {
		return true
	} else if res > 0 {
		return false
	}
	return p.RowEntryID < than.RowEntryID
}

// ObjectIndexByTSEntry records an object insert or delete event at a
// timestamp; used by truncate to GC dead objects.
type ObjectIndexByTSEntry struct {
	Time         types.TS // insert or delete time
	ShortObjName objectio.ObjectNameShort

	IsDelete     bool
	IsAppendable bool
}

// Less orders by (Time asc, short object name asc).
func (b ObjectIndexByTSEntry) Less(than ObjectIndexByTSEntry) bool {
	// asc
	if b.Time.Less(&than.Time) {
		return true
	}
	if than.Time.Less(&b.Time) {
		return false
	}

	cmp := bytes.Compare(b.ShortObjName[:], than.ShortObjName[:])
	if cmp < 0 {
		return true
	}
	if cmp > 0 {
		return false
	}

	//if b.IsDelete && !than.IsDelete {
	//	return true
	//}
	//if !b.IsDelete && than.IsDelete {
	//	return false
	//}

	return false
}

// NewPartitionState returns an empty state; noData controls whether row
// batches are retained (false) or dropped after indexing (true).
func NewPartitionState(noData bool) *PartitionState {
	opts := btree.Options{
		Degree: 64,
	}
	return &PartitionState{
		noData:          noData,
		rows:            btree.NewBTreeGOptions((RowEntry).Less, opts),
		dataObjects:     btree.NewBTreeGOptions((ObjectEntry).Less, opts),
		blockDeltas:     btree.NewBTreeGOptions((BlockDeltaEntry).Less, opts),
		primaryIndex:    btree.NewBTreeGOptions((*PrimaryIndexEntry).Less, opts),
		dirtyBlocks:     btree.NewBTreeGOptions((types.Blockid).Less, opts),
		objectIndexByTS: btree.NewBTreeGOptions((ObjectIndexByTSEntry).Less, opts),
		shared:          new(sharedStates),
	}
}

// Copy returns a snapshot of the state. The btrees are copy-on-write
// copies; shared is aliased (by design), and checkpoints are deep-copied.
func (p *PartitionState) Copy() *PartitionState {
	state := PartitionState{
		rows:            p.rows.Copy(),
		dataObjects:     p.dataObjects.Copy(),
		blockDeltas:     p.blockDeltas.Copy(),
		primaryIndex:    p.primaryIndex.Copy(),
		noData:          p.noData,
		dirtyBlocks:     p.dirtyBlocks.Copy(),
		objectIndexByTS: p.objectIndexByTS.Copy(),
		shared:          p.shared,
		start:           p.start,
		end:             p.end,
	}
	if len(p.checkpoints) > 0 {
		state.checkpoints = make([]string, len(p.checkpoints))
		copy(state.checkpoints, p.checkpoints)
	}
	return &state
}

// RowExists reports whether rowID has a live (not deleted) version
// visible at snapshot ts, judged by the newest version with Time <= ts.
func (p *PartitionState) RowExists(rowID types.Rowid, ts types.TS) bool {
	iter := p.rows.Iter()
	defer iter.Release()

	blockID := rowID.CloneBlockID()
	for ok := iter.Seek(RowEntry{
		BlockID: blockID,
		RowID:   rowID,
		Time:    ts,
	}); ok; ok = iter.Next() {
		entry := iter.Item()
		if entry.BlockID != blockID {
			break
		}
		if entry.RowID != rowID {
			break
		}
		if entry.Time.Greater(&ts) {
			// not visible
			continue
		}
		if entry.Deleted {
			// deleted
			return false
		}
		return true
	}

	return false
}

// HandleLogtailEntry dispatches one logtail entry to the matching
// handler, routed by entry type and by table-name kind (block-meta
// table, object table, or user rows).
func (p *PartitionState) HandleLogtailEntry(
	ctx context.Context,
	fs fileservice.FileService,
	entry *api.Entry,
	primarySeqnum int,
	packer *types.Packer,
) {
	txnTrace.GetService().ApplyLogtail(entry, 1)
	switch entry.EntryType {
	case api.Entry_Insert:
		if IsBlkTable(entry.TableName) {
			p.HandleMetadataInsert(ctx, fs, entry.Bat)
		} else if IsObjTable(entry.TableName) {
			p.HandleObjectInsert(ctx, entry.Bat, fs)
		} else {
			p.HandleRowsInsert(ctx, entry.Bat, primarySeqnum, packer)
		}
	case api.Entry_Delete:
		if IsBlkTable(entry.TableName) {
			p.HandleMetadataDelete(ctx, entry.TableId, entry.Bat)
		} else if IsObjTable(entry.TableName) {
			p.HandleObjectDelete(entry.TableId, entry.Bat)
		} else {
			p.HandleRowsDelete(ctx, entry.Bat, packer)
		}
	default:
		panic("unknown entry type")
	}
}

// HandleObjectDelete applies object soft-deletes from a logtail batch.
// NOTE(review): the column indices (2,3,4,7,8,11) mirror the object
// table layout produced by TN — confirm against the batch schema.
func (p *PartitionState) HandleObjectDelete(
	tableID uint64,
	bat *api.Batch) {
	statsVec := mustVectorFromProto(bat.Vecs[2])
	stateCol := vector.MustFixedCol[bool](mustVectorFromProto(bat.Vecs[3]))
	sortedCol := vector.MustFixedCol[bool](mustVectorFromProto(bat.Vecs[4]))
	createTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[7]))
	deleteTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[8]))
	commitTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[11]))

	for idx := 0; idx < len(stateCol); idx++ {
		var objEntry ObjectEntry

		objEntry.ObjectStats = objectio.ObjectStats(statsVec.GetBytesAt(idx))

		// skip placeholder entries with empty stats
		if objEntry.ObjectStats.BlkCnt() == 0 || objEntry.ObjectStats.Rows() == 0 {
			continue
		}

		objEntry.EntryState = stateCol[idx]
		objEntry.CreateTime = createTSCol[idx]
		objEntry.DeleteTime = deleteTSCol[idx]
		objEntry.CommitTS = commitTSCol[idx]
		objEntry.Sorted = sortedCol[idx]
		p.objectDeleteHelper(tableID, objEntry, deleteTSCol[idx])
	}
}

// HandleObjectInsert applies object inserts from a logtail batch:
// updates lastFlushTimestamp, upserts dataObjects and objectIndexByTS,
// prefetches object meta, and GCs in-memory rows that the flushed
// object now covers (up to the truncate point).
func (p *PartitionState) HandleObjectInsert(ctx context.Context, bat *api.Batch, fs fileservice.FileService) {

	var numDeleted, blockDeleted, scanCnt int64
	statsVec := mustVectorFromProto(bat.Vecs[2])
	stateCol := vector.MustFixedCol[bool](mustVectorFromProto(bat.Vecs[3]))
	sortedCol := vector.MustFixedCol[bool](mustVectorFromProto(bat.Vecs[4]))
	createTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[7]))
	deleteTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[8]))
	startTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[9]))
	commitTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[11]))

	for idx := 0; idx < len(stateCol); idx++ {
		// track the newest commit ts as the table's last flush timestamp
		p.shared.Lock()
		if t := commitTSCol[idx]; t.Greater(&p.shared.lastFlushTimestamp) {
			p.shared.lastFlushTimestamp = t
		}
		p.shared.Unlock()
		var objEntry ObjectEntry

		objEntry.ObjectStats = objectio.ObjectStats(statsVec.GetBytesAt(idx))
		if objEntry.ObjectStats.BlkCnt() == 0 || objEntry.ObjectStats.Rows() == 0 {
			logutil.Errorf("skip empty object stats when HandleObjectInsert, %s\n", objEntry.String())
			continue
		}

		objEntry.EntryState = stateCol[idx]
		objEntry.CreateTime = createTSCol[idx]
		objEntry.DeleteTime = deleteTSCol[idx]
		objEntry.CommitTS = commitTSCol[idx]
		objEntry.Sorted = sortedCol[idx]

		old, exist := p.dataObjects.Get(objEntry)
		if exist {
			// carry over HasDeltaLoc, which only HandleMetadataInsert sets
			objEntry.HasDeltaLoc = old.HasDeltaLoc
		}
		if exist && !old.IsEmpty() {
			// why check the deleteTime here? consider this situation:
			// 1. insert on an object, then these insert operations recorded into a CKP.
			// 2. and delete this object, this operation recorded into WAL.
			// 3. restart
			// 4. replay CKP(lazily) into partition state --> replay WAL into partition state
			// the delete record in WAL could be overwritten by insert record in CKP,
			// causing logic err of the objects' visibility(dead object back to life!!).
			//
			// if this happened, just skip this object will be fine, why chose to
			// update the object Stats and leave others unchanged?
			//
			// in single txn, the pushed log tail has orders: meta insert, object insert.
			// as long as delta location generated, there will be meta insert followed by object insert pushed to cn.
			// in the normal case, the handleMetaInsert will construct objects with empty stats(rows = 0)
			// and will be updated by HandleObjectInsert later. if we skip this object in such case (non-above situation),
			// the object stats will be remained empty, has potential impact on where the stats.rows be used.
			//
			// so the final logic is that only update the object stats
			// when an object already exists in the partition state and has the deleteTime value.
			if !old.DeleteTime.IsEmpty() {
				// leave these field unchanged
				objEntry.DeleteTime = old.DeleteTime
				objEntry.CommitTS = old.CommitTS
				objEntry.EntryState = old.EntryState
				objEntry.CreateTime = old.CreateTime
				objEntry.Sorted = old.Sorted

				// only update object stats
			}
		} else {
			// first sighting: record the insert event in the TS index
			e := ObjectIndexByTSEntry{
				Time:         createTSCol[idx],
				ShortObjName: *objEntry.ObjectShortName(),
				IsDelete:     false,

				IsAppendable: objEntry.EntryState,
			}
			p.objectIndexByTS.Set(e)
		}
		//prefetch the object meta
		if err := blockio.PrefetchMeta(fs, objEntry.Location()); err != nil {
			logutil.Errorf("prefetch object meta failed. %v", err)
		}

		p.dataObjects.Set(objEntry)
		{
			//Need to insert an entry in objectIndexByTS, when soft delete appendable object.
			e := ObjectIndexByTSEntry{
				ShortObjName: *objEntry.ObjectShortName(),

				IsAppendable: objEntry.EntryState,
			}
			if !deleteTSCol[idx].IsEmpty() {
				e.Time = deleteTSCol[idx]
				e.IsDelete = true
				p.objectIndexByTS.Set(e)
			}
		}

		// appendable objects only appear here once flushed, i.e. soft-deleted;
		// an appendable object with no delete time violates that invariant
		if objEntry.EntryState && objEntry.DeleteTime.IsEmpty() {
			panic("logic error")
		}
		// for appendable object, gc rows when delete object
		iter := p.rows.Copy().Iter()
		objID := objEntry.ObjectStats.ObjectName().ObjectId()
		trunctPoint := startTSCol[idx]
		blkCnt := objEntry.ObjectStats.BlkCnt()
		for i := uint32(0); i < blkCnt; i++ {

			blkID := objectio.NewBlockidWithObjectID(objID, uint16(i))
			pivot := RowEntry{
				// aobj has only one blk
				BlockID: *blkID,
			}
			for ok := iter.Seek(pivot); ok; ok = iter.Next() {
				entry := iter.Item()
				if entry.BlockID != *blkID {
					break
				}
				scanCnt++

				// if the inserting block is appendable, need to delete the rows for it;
				// if the inserting block is non-appendable and has delta location, need to delete
				// the deletes for it.
				if objEntry.EntryState {
					if entry.Time.LessEq(&trunctPoint) {
						// delete the row
						p.rows.Delete(entry)

						// delete the row's primary index
						if objEntry.EntryState && len(entry.PrimaryIndexBytes) > 0 {
							p.primaryIndex.Delete(&PrimaryIndexEntry{
								Bytes:      entry.PrimaryIndexBytes,
								RowEntryID: entry.ID,
							})
						}
						numDeleted++
						blockDeleted++
					}
				}

				//it's tricky here.
				//Due to consuming lazily the checkpoint,
				//we have to take the following scenario into account:
				//1. CN receives deletes for a non-appendable block from the log tail,
				//   then apply the deletes into PartitionState.rows.
				//2. CN receives block meta of the above non-appendable block to be inserted
				//   from the checkpoint, then apply the block meta into PartitionState.blocks.
				// So , if the above scenario happens, we need to set the non-appendable block into
				// PartitionState.dirtyBlocks.
				if !objEntry.EntryState && !objEntry.HasDeltaLoc {
					p.dirtyBlocks.Set(entry.BlockID)
					break
				}
			}
			// NOTE(review): iter is released inside the blkCnt loop but seeked
			// again on the next iteration — benign for appendable objects
			// (single block), but worth confirming for multi-block objects.
			iter.Release()

			// if there are no rows for the block, delete the block from the dirty
			if objEntry.EntryState && scanCnt == blockDeleted && p.dirtyBlocks.Len() > 0 {
				p.dirtyBlocks.Delete(*blkID)
			}
		}
	}
	perfcounter.Update(ctx, func(c *perfcounter.CounterSet) {
		c.DistTAE.Logtail.ActiveRows.Add(-numDeleted)
	})
}

// nextRowEntryID hands out unique RowEntry.ID values; incremented atomically.
var nextRowEntryID = int64(1)

// HandleRowsInsert applies row inserts from a logtail batch into rows and
// primaryIndex, and returns the encoded primary key per input row.
func (p *PartitionState) HandleRowsInsert(
	ctx context.Context,
	input *api.Batch,
	primarySeqnum int,
	packer *types.Packer,
) (
	primaryKeys [][]byte,
) {
	ctx, task := trace.NewTask(ctx, "PartitionState.HandleRowsInsert")
	defer task.End()

	rowIDVector := vector.MustFixedCol[types.Rowid](mustVectorFromProto(input.Vecs[0]))
	timeVector := vector.MustFixedCol[types.TS](mustVectorFromProto(input.Vecs[1]))
	batch, err := batch.ProtoBatchToBatch(input)
	if err != nil {
		panic(err)
	}
	// columns 0/1 are rowid/ts, so the primary key sits at 2+primarySeqnum
	primaryKeys = EncodePrimaryKeyVector(
		batch.Vecs[2+primarySeqnum],
		packer,
	)

	var numInserted int64
	for i, rowID := range rowIDVector {

		blockID := rowID.CloneBlockID()
		pivot := RowEntry{
			BlockID: blockID,
			RowID:   rowID,
			Time:    timeVector[i],
		}
		entry, ok := p.rows.Get(pivot)
		if !ok {
			entry = pivot
			entry.ID = atomic.AddInt64(&nextRowEntryID, 1)
			numInserted++
		}

		if !p.noData {
			entry.Batch = batch
			entry.Offset = int64(i)
		}
		entry.PrimaryIndexBytes = primaryKeys[i]
		p.rows.Set(entry)

		{
			entry := &PrimaryIndexEntry{
				Bytes:      primaryKeys[i],
				RowEntryID: entry.ID,
				BlockID:    blockID,
				RowID:      rowID,
				Time:       entry.Time,
			}
			p.primaryIndex.Set(entry)
		}
	}

	perfcounter.Update(ctx, func(c *perfcounter.CounterSet) {
		c.DistTAE.Logtail.Entries.Add(1)
		c.DistTAE.Logtail.InsertEntries.Add(1)
		c.DistTAE.Logtail.InsertRows.Add(numInserted)
		c.DistTAE.Logtail.ActiveRows.Add(numInserted)
	})

	return
}

// HandleRowsDelete applies row deletes from a logtail batch: marks row
// versions Deleted, records the block as dirty, and indexes the primary
// key when the batch carries one.
func (p *PartitionState) HandleRowsDelete(
	ctx context.Context,
	input *api.Batch,
	packer *types.Packer,
) {
	ctx, task := trace.NewTask(ctx, "PartitionState.HandleRowsDelete")
	defer task.End()

	rowIDVector := vector.MustFixedCol[types.Rowid](mustVectorFromProto(input.Vecs[0]))
	timeVector := vector.MustFixedCol[types.TS](mustVectorFromProto(input.Vecs[1]))
	batch, err := batch.ProtoBatchToBatch(input)
	if err != nil {
		panic(err)
	}

	var primaryKeys [][]byte
	if len(input.Vecs) > 2 {
		// has primary key
		primaryKeys = EncodePrimaryKeyVector(
			batch.Vecs[2],
			packer,
		)
	}

	numDeletes := int64(0)
	for i, rowID := range rowIDVector {

		blockID := rowID.CloneBlockID()
		pivot := RowEntry{
			BlockID: blockID,
			RowID:   rowID,
			Time:    timeVector[i],
		}
		entry, ok := p.rows.Get(pivot)
		if !ok {
			entry = pivot
			entry.ID = atomic.AddInt64(&nextRowEntryID, 1)
			numDeletes++
		}

		entry.Deleted = true
		if i < len(primaryKeys) {
			entry.PrimaryIndexBytes = primaryKeys[i]
		}
		if !p.noData {
			entry.Batch = batch
			entry.Offset = int64(i)
		}
		p.rows.Set(entry)

		//handle memory deletes for non-appendable block.
		p.dirtyBlocks.Set(blockID)

		// primary key
		if i < len(primaryKeys) && len(primaryKeys[i]) > 0 {
			entry := &PrimaryIndexEntry{
				Bytes:      primaryKeys[i],
				RowEntryID: entry.ID,
				BlockID:    blockID,
				RowID:      rowID,
				Time:       entry.Time,
			}
			p.primaryIndex.Set(entry)
		}

	}

	perfcounter.Update(ctx, func(c *perfcounter.CounterSet) {
		c.DistTAE.Logtail.Entries.Add(1)
		c.DistTAE.Logtail.DeleteEntries.Add(1)
		c.DistTAE.Logtail.DeleteRows.Add(numDeletes)
	})
}

// HandleMetadataInsert applies block-metadata inserts: updates
// lastFlushTimestamp and blockDeltas (delta locations), GCs covered
// in-memory rows, maintains dirtyBlocks, and creates/updates the
// corresponding ObjectEntry (possibly as an empty-stats placeholder).
func (p *PartitionState) HandleMetadataInsert(
	ctx context.Context,
	fs fileservice.FileService,
	input *api.Batch) {
	ctx, task := trace.NewTask(ctx, "PartitionState.HandleMetadataInsert")
	defer task.End()

	createTimeVector := vector.MustFixedCol[types.TS](mustVectorFromProto(input.Vecs[1]))
	blockIDVector := vector.MustFixedCol[types.Blockid](mustVectorFromProto(input.Vecs[2]))
	entryStateVector := vector.MustFixedCol[bool](mustVectorFromProto(input.Vecs[3]))
	sortedStateVector := vector.MustFixedCol[bool](mustVectorFromProto(input.Vecs[4]))
	metaLocationVector := mustVectorFromProto(input.Vecs[5])
	deltaLocationVector := mustVectorFromProto(input.Vecs[6])
	commitTimeVector := vector.MustFixedCol[types.TS](mustVectorFromProto(input.Vecs[7]))
	//segmentIDVector := vector.MustFixedCol[types.Uuid](mustVectorFromProto(input.Vecs[8]))
	memTruncTSVector := vector.MustFixedCol[types.TS](mustVectorFromProto(input.Vecs[9]))

	var numInserted, numDeleted int64
	for i, blockID := range blockIDVector {
		// track the newest commit ts as the table's last flush timestamp
		p.shared.Lock()
		if t := commitTimeVector[i]; t.Greater(&p.shared.lastFlushTimestamp) {
			p.shared.lastFlushTimestamp = t
		}
		p.shared.Unlock()

		pivot := BlockDeltaEntry{
			BlockID: blockID,
		}
		blockEntry, ok := p.blockDeltas.Get(pivot)
		if !ok {
			blockEntry = pivot
			numInserted++
		} else if blockEntry.CommitTs.GreaterEq(&commitTimeVector[i]) {
			// it possible to get an older version blk from lazy loaded checkpoint
			continue
		}

		// the following codes handle block which be inserted or updated by a newer delta location.
		// Notice that only delta location can be updated by a newer delta location.
		if location := objectio.Location(deltaLocationVector.GetBytesAt(i)); !location.IsEmpty() {
			blockEntry.DeltaLoc = *(*[objectio.LocationLen]byte)(unsafe.Pointer(&location[0]))
		}
		if t := commitTimeVector[i]; !t.IsEmpty() {
			blockEntry.CommitTs = t
		}

		isAppendable := entryStateVector[i]
		isEmptyDelta := blockEntry.DeltaLocation().IsEmpty()

		if !isEmptyDelta {
			p.blockDeltas.Set(blockEntry)
		}

		{
			scanCnt := int64(0)
			blockDeleted := int64(0)
			trunctPoint := memTruncTSVector[i]
			iter := p.rows.Copy().Iter()
			pivot := RowEntry{
				BlockID: blockID,
			}
			for ok := iter.Seek(pivot); ok; ok = iter.Next() {
				entry := iter.Item()
				if entry.BlockID != blockID {
					break
				}
				scanCnt++
				//it's tricky here.
				//Due to consuming lazily the checkpoint,
				//we have to take the following scenario into account:
				//1. CN receives deletes for a non-appendable block from the log tail,
				//   then apply the deletes into PartitionState.rows.
				//2. CN receives block meta of the above non-appendable block to be inserted
				//   from the checkpoint, then apply the block meta into PartitionState.blocks.
				// So , if the above scenario happens, we need to set the non-appendable block into
				// PartitionState.dirtyBlocks.
				if !isAppendable && isEmptyDelta {
					p.dirtyBlocks.Set(blockID)
					break
				}

				// if the inserting block is appendable, need to delete the rows for it;
				// if the inserting block is non-appendable and has delta location, need to delete
				// the deletes for it.
				if isAppendable || (!isAppendable && !isEmptyDelta) {
					if entry.Time.LessEq(&trunctPoint) {
						// delete the row
						p.rows.Delete(entry)

						// delete the row's primary index
						if isAppendable && len(entry.PrimaryIndexBytes) > 0 {
							p.primaryIndex.Delete(&PrimaryIndexEntry{
								Bytes:      entry.PrimaryIndexBytes,
								RowEntryID: entry.ID,
							})
						}
						numDeleted++
						blockDeleted++
					}
				}
			}
			iter.Release()

			// if there are no rows for the block, delete the block from the dirty
			if scanCnt == blockDeleted && p.dirtyBlocks.Len() > 0 {
				p.dirtyBlocks.Delete(blockID)
			}
		}

		//create object by block insert to set objEntry.HasDeltaLoc
		//when lazy load, maybe deltalocation is consumed before object is created
		{
			objPivot := ObjectEntry{}
			if metaLoc := objectio.Location(metaLocationVector.GetBytesAt(i)); !metaLoc.IsEmpty() {
				objectio.SetObjectStatsLocation(&objPivot.ObjectStats, metaLoc)
			} else {
				// After block is removed,
				// HandleMetadataInsert only handle deltaloc.
				// Meta location is empty.
				objID := blockID.Object()
				objName := objectio.BuildObjectNameWithObjectID(objID)
				objectio.SetObjectStatsObjectName(&objPivot.ObjectStats, objName)
			}
			objEntry, ok := p.dataObjects.Get(objPivot)
			if ok {
				// don't need to update objEntry, except for HasDeltaLoc and blkCnt
				if !isEmptyDelta {
					objEntry.HasDeltaLoc = true
				}

				blkCnt := blockID.Sequence() + 1
				if uint32(blkCnt) > objEntry.BlkCnt() {
					objectio.SetObjectStatsBlkCnt(&objEntry.ObjectStats, uint32(blkCnt))
				}
				p.dataObjects.Set(objEntry)
				// For deltaloc batch after block is removed,
				// objEntry.CreateTime is empty.
				// and it's temporary.
				// Related dataObjectsByCreateTS will be set in HandleObjectInsert.
				//
				// the created ts index have been removed now
				//if !objEntry.CreateTime.IsEmpty() {
				//	p.dataObjectsByCreateTS.Set(ObjectIndexByCreateTSEntry(objEntry))
				//}
			} else {

				objEntry = objPivot
				objEntry.EntryState = entryStateVector[i]
				objEntry.Sorted = sortedStateVector[i]
				if !isEmptyDelta {
					objEntry.HasDeltaLoc = true
				}
				objEntry.CommitTS = commitTimeVector[i]
				createTS := createTimeVector[i]
				// after blk is removed, create ts is empty
				if !createTS.IsEmpty() {
					objEntry.CreateTime = createTS
				}

				blkCnt := blockID.Sequence() + 1
				if uint32(blkCnt) > objEntry.BlkCnt() {
					objectio.SetObjectStatsBlkCnt(&objEntry.ObjectStats, uint32(blkCnt))
				}

				p.dataObjects.Set(objEntry)

				{
					e := ObjectIndexByTSEntry{
						Time:         createTimeVector[i],
						ShortObjName: *objEntry.ObjectShortName(),
						IsDelete:     false,

						IsAppendable: objEntry.EntryState,
					}
					p.objectIndexByTS.Set(e)
				}
			}
		}

	}

	perfcounter.Update(ctx, func(c *perfcounter.CounterSet) {
		c.DistTAE.Logtail.Entries.Add(1)
		c.DistTAE.Logtail.MetadataInsertEntries.Add(1)
		c.DistTAE.Logtail.ActiveRows.Add(-numDeleted)
		c.DistTAE.Logtail.InsertBlocks.Add(numInserted)
	})
}

// objectDeleteHelper marks an existing object deleted at deleteTime,
// keeping dataObjects and objectIndexByTS consistent. A later delete may
// lower the recorded delete time (checkpoint/WAL replay ordering).
func (p *PartitionState) objectDeleteHelper(
	tableID uint64,
	pivot ObjectEntry,
	deleteTime types.TS) {
	objEntry, ok := p.dataObjects.Get(pivot)
	//TODO non-appendable block' delete maybe arrive before its insert?
	if !ok {
		panic(fmt.Sprintf("invalid block id. %v", pivot.String()))
	}

	if objEntry.DeleteTime.IsEmpty() {
		// apply first delete
		objEntry.DeleteTime = deleteTime
		p.dataObjects.Set(objEntry)

		{
			e := ObjectIndexByTSEntry{
				Time:         objEntry.DeleteTime,
				ShortObjName: *objEntry.ObjectShortName(),
				IsDelete:     true,

				IsAppendable: objEntry.EntryState,
			}
			txnTrace.GetService().ApplyDeleteObject(
				tableID,
				objEntry.DeleteTime.ToTimestamp(),
				"",
				"delete-object")
			p.objectIndexByTS.Set(e)
		}
	} else {
		// update deletetime, if incoming delete ts is less
		if objEntry.DeleteTime.Greater(&deleteTime) {
			old := ObjectIndexByTSEntry{
				Time:         objEntry.DeleteTime,
				ShortObjName: *objEntry.ObjectShortName(),
				IsDelete:     true,

				IsAppendable: objEntry.EntryState,
			}
			p.objectIndexByTS.Delete(old)
			objEntry.DeleteTime = deleteTime
			p.dataObjects.Set(objEntry)

			new := ObjectIndexByTSEntry{
				Time:         objEntry.DeleteTime,
				ShortObjName: *objEntry.ObjectShortName(),
				IsDelete:     true,

				IsAppendable: objEntry.EntryState,
			}
			p.objectIndexByTS.Set(new)
		} else if objEntry.DeleteTime.Equal(&deleteTime) {
			//FIXME:: should we do something here?
			e := ObjectIndexByTSEntry{
				Time:         objEntry.DeleteTime,
				ShortObjName: *objEntry.ObjectShortName(),
				IsDelete:     true,

				IsAppendable: objEntry.EntryState,
			}
			p.objectIndexByTS.Set(e)
		}
	}
}

// HandleMetadataDelete only records perf counters; block-meta deletes
// carry no state change for this partition state.
func (p *PartitionState) HandleMetadataDelete(
	ctx context.Context,
	tableID uint64,
	input *api.Batch) {
	ctx, task := trace.NewTask(ctx, "PartitionState.HandleMetadataDelete")
	defer task.End()

	perfcounter.Update(ctx, func(c *perfcounter.CounterSet) {
		c.DistTAE.Logtail.Entries.Add(1)
		c.DistTAE.Logtail.MetadataDeleteEntries.Add(1)
	})
}

// CacheCkpDuration records the [start, end] duration covered by cached
// checkpoints; panics if the checkpoints were already consumed.
func (p *PartitionState) CacheCkpDuration(
	start types.TS,
	end types.TS,
	partition *Partition) {
	if partition.checkpointConsumed.Load() {
		panic("checkpoints already consumed")
	}
	p.start = start
	p.end = end
}

// AppendCheckpoint queues a checkpoint location string for lazy
// consumption; panics if the checkpoints were already consumed.
func (p *PartitionState) AppendCheckpoint(
	checkpoint string,
	partiton *Partition) {
	if partiton.checkpointConsumed.Load() {
		panic("checkpoints already consumed")
	}
	p.checkpoints = append(p.checkpoints, checkpoint)
}

// consumeCheckpoints applies fn to each queued checkpoint in order and
// clears the queue on success; stops at the first error.
func (p *PartitionState) consumeCheckpoints(
	fn func(checkpoint string, state *PartitionState) error,
) error {
	for _, checkpoint := range p.checkpoints {
		if err := fn(checkpoint, p); err != nil {
			return err
		}
	}
	p.checkpoints = p.checkpoints[:0]
	return nil
}

// truncate garbage-collects state at or before ts: removes insert/delete
// event pairs for objects whose delete event is <= ts from
// objectIndexByTS, hard-deletes those objects from dataObjects, and
// advances minTS (snapshots older than minTS can no longer be served).
func (p *PartitionState) truncate(ids [2]uint64, ts types.TS) {
	if p.minTS.Greater(&ts) {
		logutil.Errorf("logic error: current minTS %v, incoming ts %v", p.minTS.ToString(), ts.ToString())
		return
	}
	p.minTS = ts
	gced := false
	// seek just past ts, then walk backwards over events at or before ts
	pivot := ObjectIndexByTSEntry{
		Time:         ts.Next(),
		ShortObjName: objectio.ObjectNameShort{},
		IsDelete:     true,
	}
	iter := p.objectIndexByTS.Copy().Iter()
	ok := iter.Seek(pivot)
	if !ok {
		ok = iter.Last()
	}
	// first pass: collect objects with a delete event at or before ts
	objIDsToDelete := make(map[objectio.ObjectNameShort]struct{}, 0)
	objectsToDelete := ""
	for ; ok; ok = iter.Prev() {
		entry := iter.Item()
		if entry.Time.Greater(&ts) {
			continue
		}
		if entry.IsDelete {
			objIDsToDelete[entry.ShortObjName] = struct{}{}
			if gced {
				objectsToDelete = fmt.Sprintf("%s, %v", objectsToDelete, entry.ShortObjName)
			} else {
				objectsToDelete = fmt.Sprintf("%s%v", objectsToDelete, entry.ShortObjName)
			}
			gced = true
		}
	}
	// second pass: drop all (insert and delete) events of collected objects
	iter = p.objectIndexByTS.Copy().Iter()
	ok = iter.Seek(pivot)
	if !ok {
		ok = iter.Last()
	}
	for ; ok; ok = iter.Prev() {
		entry := iter.Item()
		if entry.Time.Greater(&ts) {
			continue
		}
		if _, ok := objIDsToDelete[entry.ShortObjName]; ok {
			p.objectIndexByTS.Delete(entry)
		}
	}
	if gced {
		logutil.Infof("GC partition_state at %v for table %d:%s", ts.ToString(), ids[1], objectsToDelete)
	}

	// hard-delete objects whose delete time is at or before ts
	objsToDelete := ""
	objIter := p.dataObjects.Copy().Iter()
	objGced := false
	firstCalled := false
	for {
		if !firstCalled {
			if !objIter.First() {
				break
			}
			firstCalled = true
		} else {
			if !objIter.Next() {
				break
			}
		}

		objEntry := objIter.Item()

		if !objEntry.DeleteTime.IsEmpty() && objEntry.DeleteTime.LessEq(&ts) {
			p.dataObjects.Delete(objEntry)
			//p.dataObjectsByCreateTS.Delete(ObjectIndexByCreateTSEntry{
			//	//CreateTime:   objEntry.CreateTime,
			//	//ShortObjName: objEntry.ShortObjName,
			//	ObjectInfo: objEntry.ObjectInfo,
			//})
			if objGced {
				objsToDelete = fmt.Sprintf("%s, %s", objsToDelete, objEntry.Location().Name().String())
			} else {
				objsToDelete = fmt.Sprintf("%s%s", objsToDelete, objEntry.Location().Name().String())
			}
			objGced = true
		}
	}
	if objGced {
		logutil.Infof("GC partition_state at %v for table %d:%s", ts.ToString(), ids[1], objsToDelete)
	}
}
1099 1100 func (p *PartitionState) LastFlushTimestamp() types.TS { 1101 p.shared.Lock() 1102 defer p.shared.Unlock() 1103 return p.shared.lastFlushTimestamp 1104 }