github.com/matrixorigin/matrixone@v1.2.0/pkg/vm/engine/disttae/logtailreplay/partition_state.go (about)

     1  // Copyright 2023 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package logtailreplay
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"fmt"
    21  	"runtime/trace"
    22  	"sync"
    23  	"sync/atomic"
    24  	"unsafe"
    25  
    26  	"github.com/matrixorigin/matrixone/pkg/container/batch"
    27  	"github.com/matrixorigin/matrixone/pkg/container/types"
    28  	"github.com/matrixorigin/matrixone/pkg/container/vector"
    29  	"github.com/matrixorigin/matrixone/pkg/fileservice"
    30  	"github.com/matrixorigin/matrixone/pkg/logutil"
    31  	"github.com/matrixorigin/matrixone/pkg/objectio"
    32  	"github.com/matrixorigin/matrixone/pkg/pb/api"
    33  	"github.com/matrixorigin/matrixone/pkg/perfcounter"
    34  	txnTrace "github.com/matrixorigin/matrixone/pkg/txn/trace"
    35  	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/blockio"
    36  	"github.com/tidwall/btree"
    37  )
    38  
// PartitionState is the in-memory state built by replaying logtail entries.
// It keeps row versions, object metadata and block delta locations in
// B-trees, plus secondary indexes over them.
type PartitionState struct {
	// also modify the Copy method if adding fields

	// data

	// rows holds every known row version, ordered by RowEntry.Less.
	rows *btree.BTreeG[RowEntry] // use value type to avoid locking on elements
	// dataObjects holds the table's data objects, ordered by short object name.
	dataObjects *btree.BTreeG[ObjectEntry]
	// blockDeltas holds the latest delta location per block.
	//TODO:: It's transient, should be removed in future PR.
	blockDeltas *btree.BTreeG[BlockDeltaEntry]
	checkpoints []string
	start       types.TS
	end         types.TS

	// index

	// primaryIndex maps encoded primary keys to row entry ids.
	primaryIndex *btree.BTreeG[*PrimaryIndexEntry]
	// dirtyBlocks records blocks that have in-memory deletes, so that dirty
	// non-appendable blocks can be found quickly.
	//TODO::remove it
	dirtyBlocks *btree.BTreeG[types.Blockid]
	// objectIndexByTS indexes object create/delete events by timestamp.
	objectIndexByTS *btree.BTreeG[ObjectIndexByTSEntry]

	// noData indicates whether to retain data batch
	// for primary key dedup, reading data is not required
	noData bool

	// some data need to be shared between all states
	// should have been in the Partition structure, but doing that requires much more codes changes
	// so just put it here.
	shared *sharedStates

	// blocks deleted before minTS is hard deleted.
	// partition state can't serve txn with snapshotTS less than minTS
	minTS types.TS
}
    74  
// sharedStates is shared among all PartitionStates: Copy hands the same
// pointer to the new state, so access is guarded by the embedded mutex.
type sharedStates struct {
	sync.Mutex
	// last block flush timestamp for table; only ever advanced to a larger
	// commit timestamp by the metadata/object insert handlers.
	lastFlushTimestamp types.TS
}
    81  
// RowEntry represents a version of a row. Entries are keyed by
// (BlockID, RowID, Time); the remaining fields are payload.
type RowEntry struct {
	BlockID types.Blockid // we need to iter by block id, so put it first to allow faster iteration
	RowID   types.Rowid
	Time    types.TS

	ID                int64 // a unique version id, for primary index building and validating
	Deleted           bool  // set when this version was produced by a delete
	Batch             *batch.Batch // source batch; left nil when the state was built with noData
	Offset            int64        // row offset inside Batch
	PrimaryIndexBytes []byte       // encoded primary key, if known
}
    94  
    95  func (r RowEntry) Less(than RowEntry) bool {
    96  	// asc
    97  	cmp := r.BlockID.Compare(than.BlockID)
    98  	if cmp < 0 {
    99  		return true
   100  	}
   101  	if cmp > 0 {
   102  		return false
   103  	}
   104  	// asc
   105  	if r.RowID.Less(than.RowID) {
   106  		return true
   107  	}
   108  	if than.RowID.Less(r.RowID) {
   109  		return false
   110  	}
   111  	// desc
   112  	if than.Time.Less(&r.Time) {
   113  		return true
   114  	}
   115  	if r.Time.Less(&than.Time) {
   116  		return false
   117  	}
   118  	return false
   119  }
   120  
// BlockEntry is a block's info together with its create and delete
// timestamps.
type BlockEntry struct {
	objectio.BlockInfo

	CreateTime types.TS
	DeleteTime types.TS
}
   127  
   128  func (b BlockEntry) Less(than BlockEntry) bool {
   129  	return b.BlockID.Compare(than.BlockID) < 0
   130  }
   131  
// BlockDeltaEntry records the delta location of a block and the commit
// timestamp at which that location was recorded. Entries are keyed by
// BlockID only.
type BlockDeltaEntry struct {
	BlockID types.Blockid

	CommitTs types.TS
	DeltaLoc objectio.ObjectLocation
}
   138  
   139  func (b BlockDeltaEntry) Less(than BlockDeltaEntry) bool {
   140  	return b.BlockID.Compare(than.BlockID) < 0
   141  }
   142  
   143  func (b BlockDeltaEntry) DeltaLocation() objectio.Location {
   144  	return b.DeltaLoc[:]
   145  }
   146  
// ObjectInfo is an object's stats plus the state flags and timestamps the
// partition state tracks for it.
type ObjectInfo struct {
	objectio.ObjectStats

	// EntryState is copied into ObjectIndexByTSEntry.IsAppendable by the
	// handlers in this file, i.e. true means the object is appendable.
	EntryState  bool
	Sorted      bool
	HasDeltaLoc bool
	CommitTS    types.TS
	CreateTime  types.TS
	DeleteTime  types.TS
}
   157  
   158  func (o ObjectInfo) String() string {
   159  	return fmt.Sprintf(
   160  		"%s; entryState: %v; sorted: %v; hasDeltaLoc: %v; commitTS: %s; createTS: %s; deleteTS: %s",
   161  		o.ObjectStats.String(), o.EntryState,
   162  		o.Sorted, o.HasDeltaLoc, o.CommitTS.ToString(),
   163  		o.CreateTime.ToString(), o.DeleteTime.ToString())
   164  }
   165  
   166  func (o ObjectInfo) Location() objectio.Location {
   167  	return o.ObjectLocation()
   168  }
   169  
// ObjectEntry is the element type of PartitionState.dataObjects; it wraps
// ObjectInfo and is ordered by short object name (see Less).
type ObjectEntry struct {
	ObjectInfo
}
   173  
   174  func (o ObjectEntry) Less(than ObjectEntry) bool {
   175  	return bytes.Compare((*o.ObjectShortName())[:], (*than.ObjectShortName())[:]) < 0
   176  }
   177  
   178  func (o ObjectEntry) IsEmpty() bool {
   179  	return o.Size() == 0
   180  }
   181  
   182  func (o *ObjectEntry) Visible(ts types.TS) bool {
   183  	return o.CreateTime.LessEq(&ts) &&
   184  		(o.DeleteTime.IsEmpty() || ts.Less(&o.DeleteTime))
   185  }
   186  
   187  func (o ObjectEntry) Location() objectio.Location {
   188  	return o.ObjectLocation()
   189  }
   190  
   191  func (o ObjectInfo) StatsValid() bool {
   192  	return o.ObjectStats.Rows() != 0
   193  }
   194  
// ObjectIndexByCreateTSEntry wraps ObjectInfo with an ordering by create
// time and then short object name (see Less).
type ObjectIndexByCreateTSEntry struct {
	ObjectInfo
}
   198  
   199  func (o ObjectIndexByCreateTSEntry) Less(than ObjectIndexByCreateTSEntry) bool {
   200  	//asc
   201  	if o.CreateTime.Less(&than.CreateTime) {
   202  
   203  		return true
   204  	}
   205  	if than.CreateTime.Less(&o.CreateTime) {
   206  		return false
   207  	}
   208  
   209  	cmp := bytes.Compare(o.ObjectShortName()[:], than.ObjectShortName()[:])
   210  	if cmp < 0 {
   211  		return true
   212  	}
   213  	if cmp > 0 {
   214  		return false
   215  	}
   216  	return false
   217  }
   218  
   219  func (o *ObjectIndexByCreateTSEntry) Visible(ts types.TS) bool {
   220  	return o.CreateTime.LessEq(&ts) &&
   221  		(o.DeleteTime.IsEmpty() || ts.Less(&o.DeleteTime))
   222  }
   223  
// PrimaryIndexEntry is one element of the primary-key index. It is keyed by
// (Bytes, RowEntryID); the other fields identify the row version it refers to.
type PrimaryIndexEntry struct {
	Bytes      []byte // encoded primary key
	RowEntryID int64  // RowEntry.ID of the row version this entry points at

	// fields for validating
	BlockID types.Blockid
	RowID   types.Rowid
	Time    types.TS
}
   233  
   234  func (p *PrimaryIndexEntry) Less(than *PrimaryIndexEntry) bool {
   235  	if res := bytes.Compare(p.Bytes, than.Bytes); res < 0 {
   236  		return true
   237  	} else if res > 0 {
   238  		return false
   239  	}
   240  	return p.RowEntryID < than.RowEntryID
   241  }
   242  
// ObjectIndexByTSEntry is one event in the by-timestamp object index. It is
// keyed by (Time, ShortObjName); the two flags are payload only.
type ObjectIndexByTSEntry struct {
	Time         types.TS // insert or delete time
	ShortObjName objectio.ObjectNameShort

	IsDelete     bool // true when Time is the object's delete time
	IsAppendable bool // copied from the object's EntryState
}
   250  
   251  func (b ObjectIndexByTSEntry) Less(than ObjectIndexByTSEntry) bool {
   252  	// asc
   253  	if b.Time.Less(&than.Time) {
   254  		return true
   255  	}
   256  	if than.Time.Less(&b.Time) {
   257  		return false
   258  	}
   259  
   260  	cmp := bytes.Compare(b.ShortObjName[:], than.ShortObjName[:])
   261  	if cmp < 0 {
   262  		return true
   263  	}
   264  	if cmp > 0 {
   265  		return false
   266  	}
   267  
   268  	//if b.IsDelete && !than.IsDelete {
   269  	//	return true
   270  	//}
   271  	//if !b.IsDelete && than.IsDelete {
   272  	//	return false
   273  	//}
   274  
   275  	return false
   276  }
   277  
   278  func NewPartitionState(noData bool) *PartitionState {
   279  	opts := btree.Options{
   280  		Degree: 64,
   281  	}
   282  	return &PartitionState{
   283  		noData:          noData,
   284  		rows:            btree.NewBTreeGOptions((RowEntry).Less, opts),
   285  		dataObjects:     btree.NewBTreeGOptions((ObjectEntry).Less, opts),
   286  		blockDeltas:     btree.NewBTreeGOptions((BlockDeltaEntry).Less, opts),
   287  		primaryIndex:    btree.NewBTreeGOptions((*PrimaryIndexEntry).Less, opts),
   288  		dirtyBlocks:     btree.NewBTreeGOptions((types.Blockid).Less, opts),
   289  		objectIndexByTS: btree.NewBTreeGOptions((ObjectIndexByTSEntry).Less, opts),
   290  		shared:          new(sharedStates),
   291  	}
   292  }
   293  
   294  func (p *PartitionState) Copy() *PartitionState {
   295  	state := PartitionState{
   296  		rows:            p.rows.Copy(),
   297  		dataObjects:     p.dataObjects.Copy(),
   298  		blockDeltas:     p.blockDeltas.Copy(),
   299  		primaryIndex:    p.primaryIndex.Copy(),
   300  		noData:          p.noData,
   301  		dirtyBlocks:     p.dirtyBlocks.Copy(),
   302  		objectIndexByTS: p.objectIndexByTS.Copy(),
   303  		shared:          p.shared,
   304  		start:           p.start,
   305  		end:             p.end,
   306  	}
   307  	if len(p.checkpoints) > 0 {
   308  		state.checkpoints = make([]string, len(p.checkpoints))
   309  		copy(state.checkpoints, p.checkpoints)
   310  	}
   311  	return &state
   312  }
   313  
   314  func (p *PartitionState) RowExists(rowID types.Rowid, ts types.TS) bool {
   315  	iter := p.rows.Iter()
   316  	defer iter.Release()
   317  
   318  	blockID := rowID.CloneBlockID()
   319  	for ok := iter.Seek(RowEntry{
   320  		BlockID: blockID,
   321  		RowID:   rowID,
   322  		Time:    ts,
   323  	}); ok; ok = iter.Next() {
   324  		entry := iter.Item()
   325  		if entry.BlockID != blockID {
   326  			break
   327  		}
   328  		if entry.RowID != rowID {
   329  			break
   330  		}
   331  		if entry.Time.Greater(&ts) {
   332  			// not visible
   333  			continue
   334  		}
   335  		if entry.Deleted {
   336  			// deleted
   337  			return false
   338  		}
   339  		return true
   340  	}
   341  
   342  	return false
   343  }
   344  
   345  func (p *PartitionState) HandleLogtailEntry(
   346  	ctx context.Context,
   347  	fs fileservice.FileService,
   348  	entry *api.Entry,
   349  	primarySeqnum int,
   350  	packer *types.Packer,
   351  ) {
   352  	txnTrace.GetService().ApplyLogtail(entry, 1)
   353  	switch entry.EntryType {
   354  	case api.Entry_Insert:
   355  		if IsBlkTable(entry.TableName) {
   356  			p.HandleMetadataInsert(ctx, fs, entry.Bat)
   357  		} else if IsObjTable(entry.TableName) {
   358  			p.HandleObjectInsert(ctx, entry.Bat, fs)
   359  		} else {
   360  			p.HandleRowsInsert(ctx, entry.Bat, primarySeqnum, packer)
   361  		}
   362  	case api.Entry_Delete:
   363  		if IsBlkTable(entry.TableName) {
   364  			p.HandleMetadataDelete(ctx, entry.TableId, entry.Bat)
   365  		} else if IsObjTable(entry.TableName) {
   366  			p.HandleObjectDelete(entry.TableId, entry.Bat)
   367  		} else {
   368  			p.HandleRowsDelete(ctx, entry.Bat, packer)
   369  		}
   370  	default:
   371  		panic("unknown entry type")
   372  	}
   373  }
   374  
   375  func (p *PartitionState) HandleObjectDelete(
   376  	tableID uint64,
   377  	bat *api.Batch) {
   378  	statsVec := mustVectorFromProto(bat.Vecs[2])
   379  	stateCol := vector.MustFixedCol[bool](mustVectorFromProto(bat.Vecs[3]))
   380  	sortedCol := vector.MustFixedCol[bool](mustVectorFromProto(bat.Vecs[4]))
   381  	createTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[7]))
   382  	deleteTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[8]))
   383  	commitTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[11]))
   384  
   385  	for idx := 0; idx < len(stateCol); idx++ {
   386  		var objEntry ObjectEntry
   387  
   388  		objEntry.ObjectStats = objectio.ObjectStats(statsVec.GetBytesAt(idx))
   389  
   390  		if objEntry.ObjectStats.BlkCnt() == 0 || objEntry.ObjectStats.Rows() == 0 {
   391  			continue
   392  		}
   393  
   394  		objEntry.EntryState = stateCol[idx]
   395  		objEntry.CreateTime = createTSCol[idx]
   396  		objEntry.DeleteTime = deleteTSCol[idx]
   397  		objEntry.CommitTS = commitTSCol[idx]
   398  		objEntry.Sorted = sortedCol[idx]
   399  		p.objectDeleteHelper(tableID, objEntry, deleteTSCol[idx])
   400  	}
   401  }
   402  
   403  func (p *PartitionState) HandleObjectInsert(ctx context.Context, bat *api.Batch, fs fileservice.FileService) {
   404  
   405  	var numDeleted, blockDeleted, scanCnt int64
   406  	statsVec := mustVectorFromProto(bat.Vecs[2])
   407  	stateCol := vector.MustFixedCol[bool](mustVectorFromProto(bat.Vecs[3]))
   408  	sortedCol := vector.MustFixedCol[bool](mustVectorFromProto(bat.Vecs[4]))
   409  	createTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[7]))
   410  	deleteTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[8]))
   411  	startTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[9]))
   412  	commitTSCol := vector.MustFixedCol[types.TS](mustVectorFromProto(bat.Vecs[11]))
   413  
   414  	for idx := 0; idx < len(stateCol); idx++ {
   415  		p.shared.Lock()
   416  		if t := commitTSCol[idx]; t.Greater(&p.shared.lastFlushTimestamp) {
   417  			p.shared.lastFlushTimestamp = t
   418  		}
   419  		p.shared.Unlock()
   420  		var objEntry ObjectEntry
   421  
   422  		objEntry.ObjectStats = objectio.ObjectStats(statsVec.GetBytesAt(idx))
   423  		if objEntry.ObjectStats.BlkCnt() == 0 || objEntry.ObjectStats.Rows() == 0 {
   424  			logutil.Errorf("skip empty object stats when HandleObjectInsert, %s\n", objEntry.String())
   425  			continue
   426  		}
   427  
   428  		objEntry.EntryState = stateCol[idx]
   429  		objEntry.CreateTime = createTSCol[idx]
   430  		objEntry.DeleteTime = deleteTSCol[idx]
   431  		objEntry.CommitTS = commitTSCol[idx]
   432  		objEntry.Sorted = sortedCol[idx]
   433  
   434  		old, exist := p.dataObjects.Get(objEntry)
   435  		if exist {
   436  			objEntry.HasDeltaLoc = old.HasDeltaLoc
   437  		}
   438  		if exist && !old.IsEmpty() {
   439  			// why check the deleteTime here? consider this situation:
   440  			// 		1. insert on an object, then these insert operations recorded into a CKP.
   441  			// 		2. and delete this object, this operation recorded into WAL.
   442  			// 		3. restart
   443  			// 		4. replay CKP(lazily) into partition state --> replay WAL into partition state
   444  			// the delete record in WAL could be overwritten by insert record in CKP,
   445  			// causing logic err of the objects' visibility(dead object back to life!!).
   446  			//
   447  			// if this happened, just skip this object will be fine, why chose to
   448  			// update the object Stats and leave others unchanged?
   449  			//
   450  			// in single txn, the pushed log tail has orders: meta insert, object insert.
   451  			// as long as delta location generated, there will be meta insert followed by object insert pushed to cn.
   452  			// in the normal case, the handleMetaInsert will construct objects with empty stats(rows = 0)
   453  			// and will be updated by HandleObjectInsert later. if we skip this object in such case (non-above situation),
   454  			// the object stats will be remained empty, has potential impact on where the stats.rows be used.
   455  			//
   456  			// so the final logic is that only update the object stats
   457  			// when an object already exists in the partition state and has the deleteTime value.
   458  			if !old.DeleteTime.IsEmpty() {
   459  				// leave these field unchanged
   460  				objEntry.DeleteTime = old.DeleteTime
   461  				objEntry.CommitTS = old.CommitTS
   462  				objEntry.EntryState = old.EntryState
   463  				objEntry.CreateTime = old.CreateTime
   464  				objEntry.Sorted = old.Sorted
   465  
   466  				// only update object stats
   467  			}
   468  		} else {
   469  			e := ObjectIndexByTSEntry{
   470  				Time:         createTSCol[idx],
   471  				ShortObjName: *objEntry.ObjectShortName(),
   472  				IsDelete:     false,
   473  
   474  				IsAppendable: objEntry.EntryState,
   475  			}
   476  			p.objectIndexByTS.Set(e)
   477  		}
   478  		//prefetch the object meta
   479  		if err := blockio.PrefetchMeta(fs, objEntry.Location()); err != nil {
   480  			logutil.Errorf("prefetch object meta failed. %v", err)
   481  		}
   482  
   483  		p.dataObjects.Set(objEntry)
   484  		{
   485  			//Need to insert an entry in objectIndexByTS, when soft delete appendable object.
   486  			e := ObjectIndexByTSEntry{
   487  				ShortObjName: *objEntry.ObjectShortName(),
   488  
   489  				IsAppendable: objEntry.EntryState,
   490  			}
   491  			if !deleteTSCol[idx].IsEmpty() {
   492  				e.Time = deleteTSCol[idx]
   493  				e.IsDelete = true
   494  				p.objectIndexByTS.Set(e)
   495  			}
   496  		}
   497  
   498  		if objEntry.EntryState && objEntry.DeleteTime.IsEmpty() {
   499  			panic("logic error")
   500  		}
   501  		// for appendable object, gc rows when delete object
   502  		iter := p.rows.Copy().Iter()
   503  		objID := objEntry.ObjectStats.ObjectName().ObjectId()
   504  		trunctPoint := startTSCol[idx]
   505  		blkCnt := objEntry.ObjectStats.BlkCnt()
   506  		for i := uint32(0); i < blkCnt; i++ {
   507  
   508  			blkID := objectio.NewBlockidWithObjectID(objID, uint16(i))
   509  			pivot := RowEntry{
   510  				// aobj has only one blk
   511  				BlockID: *blkID,
   512  			}
   513  			for ok := iter.Seek(pivot); ok; ok = iter.Next() {
   514  				entry := iter.Item()
   515  				if entry.BlockID != *blkID {
   516  					break
   517  				}
   518  				scanCnt++
   519  
   520  				// if the inserting block is appendable, need to delete the rows for it;
   521  				// if the inserting block is non-appendable and has delta location, need to delete
   522  				// the deletes for it.
   523  				if objEntry.EntryState {
   524  					if entry.Time.LessEq(&trunctPoint) {
   525  						// delete the row
   526  						p.rows.Delete(entry)
   527  
   528  						// delete the row's primary index
   529  						if objEntry.EntryState && len(entry.PrimaryIndexBytes) > 0 {
   530  							p.primaryIndex.Delete(&PrimaryIndexEntry{
   531  								Bytes:      entry.PrimaryIndexBytes,
   532  								RowEntryID: entry.ID,
   533  							})
   534  						}
   535  						numDeleted++
   536  						blockDeleted++
   537  					}
   538  				}
   539  
   540  				//it's tricky here.
   541  				//Due to consuming lazily the checkpoint,
   542  				//we have to take the following scenario into account:
   543  				//1. CN receives deletes for a non-appendable block from the log tail,
   544  				//   then apply the deletes into PartitionState.rows.
   545  				//2. CN receives block meta of the above non-appendable block to be inserted
   546  				//   from the checkpoint, then apply the block meta into PartitionState.blocks.
   547  				// So , if the above scenario happens, we need to set the non-appendable block into
   548  				// PartitionState.dirtyBlocks.
   549  				if !objEntry.EntryState && !objEntry.HasDeltaLoc {
   550  					p.dirtyBlocks.Set(entry.BlockID)
   551  					break
   552  				}
   553  			}
   554  			iter.Release()
   555  
   556  			// if there are no rows for the block, delete the block from the dirty
   557  			if objEntry.EntryState && scanCnt == blockDeleted && p.dirtyBlocks.Len() > 0 {
   558  				p.dirtyBlocks.Delete(*blkID)
   559  			}
   560  		}
   561  	}
   562  	perfcounter.Update(ctx, func(c *perfcounter.CounterSet) {
   563  		c.DistTAE.Logtail.ActiveRows.Add(-numDeleted)
   564  	})
   565  }
   566  
// nextRowEntryID is the package-wide counter from which RowEntry.ID values
// are allocated. It is only advanced with atomic.AddInt64, which returns the
// incremented value, so the first ID handed out is 2.
var nextRowEntryID = int64(1)
   568  
   569  func (p *PartitionState) HandleRowsInsert(
   570  	ctx context.Context,
   571  	input *api.Batch,
   572  	primarySeqnum int,
   573  	packer *types.Packer,
   574  ) (
   575  	primaryKeys [][]byte,
   576  ) {
   577  	ctx, task := trace.NewTask(ctx, "PartitionState.HandleRowsInsert")
   578  	defer task.End()
   579  
   580  	rowIDVector := vector.MustFixedCol[types.Rowid](mustVectorFromProto(input.Vecs[0]))
   581  	timeVector := vector.MustFixedCol[types.TS](mustVectorFromProto(input.Vecs[1]))
   582  	batch, err := batch.ProtoBatchToBatch(input)
   583  	if err != nil {
   584  		panic(err)
   585  	}
   586  	primaryKeys = EncodePrimaryKeyVector(
   587  		batch.Vecs[2+primarySeqnum],
   588  		packer,
   589  	)
   590  
   591  	var numInserted int64
   592  	for i, rowID := range rowIDVector {
   593  
   594  		blockID := rowID.CloneBlockID()
   595  		pivot := RowEntry{
   596  			BlockID: blockID,
   597  			RowID:   rowID,
   598  			Time:    timeVector[i],
   599  		}
   600  		entry, ok := p.rows.Get(pivot)
   601  		if !ok {
   602  			entry = pivot
   603  			entry.ID = atomic.AddInt64(&nextRowEntryID, 1)
   604  			numInserted++
   605  		}
   606  
   607  		if !p.noData {
   608  			entry.Batch = batch
   609  			entry.Offset = int64(i)
   610  		}
   611  		entry.PrimaryIndexBytes = primaryKeys[i]
   612  		p.rows.Set(entry)
   613  
   614  		{
   615  			entry := &PrimaryIndexEntry{
   616  				Bytes:      primaryKeys[i],
   617  				RowEntryID: entry.ID,
   618  				BlockID:    blockID,
   619  				RowID:      rowID,
   620  				Time:       entry.Time,
   621  			}
   622  			p.primaryIndex.Set(entry)
   623  		}
   624  	}
   625  
   626  	perfcounter.Update(ctx, func(c *perfcounter.CounterSet) {
   627  		c.DistTAE.Logtail.Entries.Add(1)
   628  		c.DistTAE.Logtail.InsertEntries.Add(1)
   629  		c.DistTAE.Logtail.InsertRows.Add(numInserted)
   630  		c.DistTAE.Logtail.ActiveRows.Add(numInserted)
   631  	})
   632  
   633  	return
   634  }
   635  
   636  func (p *PartitionState) HandleRowsDelete(
   637  	ctx context.Context,
   638  	input *api.Batch,
   639  	packer *types.Packer,
   640  ) {
   641  	ctx, task := trace.NewTask(ctx, "PartitionState.HandleRowsDelete")
   642  	defer task.End()
   643  
   644  	rowIDVector := vector.MustFixedCol[types.Rowid](mustVectorFromProto(input.Vecs[0]))
   645  	timeVector := vector.MustFixedCol[types.TS](mustVectorFromProto(input.Vecs[1]))
   646  	batch, err := batch.ProtoBatchToBatch(input)
   647  	if err != nil {
   648  		panic(err)
   649  	}
   650  
   651  	var primaryKeys [][]byte
   652  	if len(input.Vecs) > 2 {
   653  		// has primary key
   654  		primaryKeys = EncodePrimaryKeyVector(
   655  			batch.Vecs[2],
   656  			packer,
   657  		)
   658  	}
   659  
   660  	numDeletes := int64(0)
   661  	for i, rowID := range rowIDVector {
   662  
   663  		blockID := rowID.CloneBlockID()
   664  		pivot := RowEntry{
   665  			BlockID: blockID,
   666  			RowID:   rowID,
   667  			Time:    timeVector[i],
   668  		}
   669  		entry, ok := p.rows.Get(pivot)
   670  		if !ok {
   671  			entry = pivot
   672  			entry.ID = atomic.AddInt64(&nextRowEntryID, 1)
   673  			numDeletes++
   674  		}
   675  
   676  		entry.Deleted = true
   677  		if i < len(primaryKeys) {
   678  			entry.PrimaryIndexBytes = primaryKeys[i]
   679  		}
   680  		if !p.noData {
   681  			entry.Batch = batch
   682  			entry.Offset = int64(i)
   683  		}
   684  		p.rows.Set(entry)
   685  
   686  		//handle memory deletes for non-appendable block.
   687  		p.dirtyBlocks.Set(blockID)
   688  
   689  		// primary key
   690  		if i < len(primaryKeys) && len(primaryKeys[i]) > 0 {
   691  			entry := &PrimaryIndexEntry{
   692  				Bytes:      primaryKeys[i],
   693  				RowEntryID: entry.ID,
   694  				BlockID:    blockID,
   695  				RowID:      rowID,
   696  				Time:       entry.Time,
   697  			}
   698  			p.primaryIndex.Set(entry)
   699  		}
   700  
   701  	}
   702  
   703  	perfcounter.Update(ctx, func(c *perfcounter.CounterSet) {
   704  		c.DistTAE.Logtail.Entries.Add(1)
   705  		c.DistTAE.Logtail.DeleteEntries.Add(1)
   706  		c.DistTAE.Logtail.DeleteRows.Add(numDeletes)
   707  	})
   708  }
   709  
// HandleMetadataInsert applies a batch of block-metadata inserts.
//
// For every block id in input it:
//   - advances the shared last-flush timestamp to the row's commit timestamp
//     if that is larger;
//   - skips the row when blockDeltas already holds an entry for the block
//     with an equal or newer commit timestamp;
//   - stores the block's delta location in blockDeltas when it is non-empty;
//   - walks the in-memory rows of the block, dropping those at or before the
//     memory-truncate timestamp and maintaining dirtyBlocks;
//   - creates or updates the owning object entry in dataObjects (HasDeltaLoc
//     and block count), adding a create event to objectIndexByTS for a newly
//     created entry.
//
// The fs parameter is not used by this function. Counters for entries,
// inserted blocks and dropped rows are updated at the end.
func (p *PartitionState) HandleMetadataInsert(
	ctx context.Context,
	fs fileservice.FileService,
	input *api.Batch) {
	ctx, task := trace.NewTask(ctx, "PartitionState.HandleMetadataInsert")
	defer task.End()

	createTimeVector := vector.MustFixedCol[types.TS](mustVectorFromProto(input.Vecs[1]))
	blockIDVector := vector.MustFixedCol[types.Blockid](mustVectorFromProto(input.Vecs[2]))
	entryStateVector := vector.MustFixedCol[bool](mustVectorFromProto(input.Vecs[3]))
	sortedStateVector := vector.MustFixedCol[bool](mustVectorFromProto(input.Vecs[4]))
	metaLocationVector := mustVectorFromProto(input.Vecs[5])
	deltaLocationVector := mustVectorFromProto(input.Vecs[6])
	commitTimeVector := vector.MustFixedCol[types.TS](mustVectorFromProto(input.Vecs[7]))
	//segmentIDVector := vector.MustFixedCol[types.Uuid](mustVectorFromProto(input.Vecs[8]))
	memTruncTSVector := vector.MustFixedCol[types.TS](mustVectorFromProto(input.Vecs[9]))

	var numInserted, numDeleted int64
	for i, blockID := range blockIDVector {
		p.shared.Lock()
		if t := commitTimeVector[i]; t.Greater(&p.shared.lastFlushTimestamp) {
			p.shared.lastFlushTimestamp = t
		}
		p.shared.Unlock()

		pivot := BlockDeltaEntry{
			BlockID: blockID,
		}
		blockEntry, ok := p.blockDeltas.Get(pivot)
		if !ok {
			blockEntry = pivot
			numInserted++
		} else if blockEntry.CommitTs.GreaterEq(&commitTimeVector[i]) {
			// it possible to get an older version blk from lazy loaded checkpoint
			continue
		}

		// the following codes handle block which be inserted or updated by a newer delta location.
		// Notice that only delta location can be updated by a newer delta location.
		if location := objectio.Location(deltaLocationVector.GetBytesAt(i)); !location.IsEmpty() {
			// reinterpret the location bytes as the fixed-size array and copy it
			blockEntry.DeltaLoc = *(*[objectio.LocationLen]byte)(unsafe.Pointer(&location[0]))
		}
		if t := commitTimeVector[i]; !t.IsEmpty() {
			blockEntry.CommitTs = t
		}

		isAppendable := entryStateVector[i]
		isEmptyDelta := blockEntry.DeltaLocation().IsEmpty()

		// only blocks that actually have a delta location are kept in blockDeltas
		if !isEmptyDelta {
			p.blockDeltas.Set(blockEntry)
		}

		{
			scanCnt := int64(0)
			blockDeleted := int64(0)
			trunctPoint := memTruncTSVector[i]
			// iterate a copy of rows so that p.rows can be mutated in the loop
			iter := p.rows.Copy().Iter()
			pivot := RowEntry{
				BlockID: blockID,
			}
			for ok := iter.Seek(pivot); ok; ok = iter.Next() {
				entry := iter.Item()
				if entry.BlockID != blockID {
					break
				}
				scanCnt++
				//it's tricky here.
				//Due to consuming lazily the checkpoint,
				//we have to take the following scenario into account:
				//1. CN receives deletes for a non-appendable block from the log tail,
				//   then apply the deletes into PartitionState.rows.
				//2. CN receives block meta of the above non-appendable block to be inserted
				//   from the checkpoint, then apply the block meta into PartitionState.blocks.
				// So , if the above scenario happens, we need to set the non-appendable block into
				// PartitionState.dirtyBlocks.
				if !isAppendable && isEmptyDelta {
					p.dirtyBlocks.Set(blockID)
					break
				}

				// if the inserting block is appendable, need to delete the rows for it;
				// if the inserting block is non-appendable and has delta location, need to delete
				// the deletes for it.
				if isAppendable || (!isAppendable && !isEmptyDelta) {
					if entry.Time.LessEq(&trunctPoint) {
						// delete the row
						p.rows.Delete(entry)

						// delete the row's primary index
						if isAppendable && len(entry.PrimaryIndexBytes) > 0 {
							p.primaryIndex.Delete(&PrimaryIndexEntry{
								Bytes:      entry.PrimaryIndexBytes,
								RowEntryID: entry.ID,
							})
						}
						numDeleted++
						blockDeleted++
					}
				}
			}
			iter.Release()

			// if there are no rows for the block, delete the block from the dirty
			if scanCnt == blockDeleted && p.dirtyBlocks.Len() > 0 {
				p.dirtyBlocks.Delete(blockID)
			}
		}

		//create object by block insert to set objEntry.HasDeltaLoc
		//when lazy load, maybe deltalocation is consumed before object is created
		{
			objPivot := ObjectEntry{}
			if metaLoc := objectio.Location(metaLocationVector.GetBytesAt(i)); !metaLoc.IsEmpty() {
				objectio.SetObjectStatsLocation(&objPivot.ObjectStats, metaLoc)
			} else {
				// After block is removed,
				// HandleMetadataInsert only handle deltaloc.
				// Meta location is empty.
				objID := blockID.Object()
				objName := objectio.BuildObjectNameWithObjectID(objID)
				objectio.SetObjectStatsObjectName(&objPivot.ObjectStats, objName)
			}
			objEntry, ok := p.dataObjects.Get(objPivot)
			if ok {
				// don't need to update objEntry, except for HasDeltaLoc and blkCnt
				if !isEmptyDelta {
					objEntry.HasDeltaLoc = true
				}

				// the block count only ever grows, to cover this block's sequence number
				blkCnt := blockID.Sequence() + 1
				if uint32(blkCnt) > objEntry.BlkCnt() {
					objectio.SetObjectStatsBlkCnt(&objEntry.ObjectStats, uint32(blkCnt))
				}
				p.dataObjects.Set(objEntry)
				// For deltaloc batch after block is removed,
				// objEntry.CreateTime is empty.
				// and it's temporary.
				// Related dataObjectsByCreateTS will be set in HandleObjectInsert.
				//
				// the created ts index have been removed now
				//if !objEntry.CreateTime.IsEmpty() {
				//	p.dataObjectsByCreateTS.Set(ObjectIndexByCreateTSEntry(objEntry))
				//}
			} else {

				objEntry = objPivot
				objEntry.EntryState = entryStateVector[i]
				objEntry.Sorted = sortedStateVector[i]
				if !isEmptyDelta {
					objEntry.HasDeltaLoc = true
				}
				objEntry.CommitTS = commitTimeVector[i]
				createTS := createTimeVector[i]
				// after blk is removed, create ts is empty
				if !createTS.IsEmpty() {
					objEntry.CreateTime = createTS
				}

				blkCnt := blockID.Sequence() + 1
				if uint32(blkCnt) > objEntry.BlkCnt() {
					objectio.SetObjectStatsBlkCnt(&objEntry.ObjectStats, uint32(blkCnt))
				}

				p.dataObjects.Set(objEntry)

				{
					// record the create event for the newly created object entry
					e := ObjectIndexByTSEntry{
						Time:         createTimeVector[i],
						ShortObjName: *objEntry.ObjectShortName(),
						IsDelete:     false,

						IsAppendable: objEntry.EntryState,
					}
					p.objectIndexByTS.Set(e)
				}
			}
		}

	}

	perfcounter.Update(ctx, func(c *perfcounter.CounterSet) {
		c.DistTAE.Logtail.Entries.Add(1)
		c.DistTAE.Logtail.MetadataInsertEntries.Add(1)
		c.DistTAE.Logtail.ActiveRows.Add(-numDeleted)
		c.DistTAE.Logtail.InsertBlocks.Add(numInserted)
	})
}
   898  
   899  func (p *PartitionState) objectDeleteHelper(
   900  	tableID uint64,
   901  	pivot ObjectEntry,
   902  	deleteTime types.TS) {
   903  	objEntry, ok := p.dataObjects.Get(pivot)
   904  	//TODO non-appendable block' delete maybe arrive before its insert?
   905  	if !ok {
   906  		panic(fmt.Sprintf("invalid block id. %v", pivot.String()))
   907  	}
   908  
   909  	if objEntry.DeleteTime.IsEmpty() {
   910  		// apply first delete
   911  		objEntry.DeleteTime = deleteTime
   912  		p.dataObjects.Set(objEntry)
   913  
   914  		{
   915  			e := ObjectIndexByTSEntry{
   916  				Time:         objEntry.DeleteTime,
   917  				ShortObjName: *objEntry.ObjectShortName(),
   918  				IsDelete:     true,
   919  
   920  				IsAppendable: objEntry.EntryState,
   921  			}
   922  			txnTrace.GetService().ApplyDeleteObject(
   923  				tableID,
   924  				objEntry.DeleteTime.ToTimestamp(),
   925  				"",
   926  				"delete-object")
   927  			p.objectIndexByTS.Set(e)
   928  		}
   929  	} else {
   930  		// update deletetime, if incoming delete ts is less
   931  		if objEntry.DeleteTime.Greater(&deleteTime) {
   932  			old := ObjectIndexByTSEntry{
   933  				Time:         objEntry.DeleteTime,
   934  				ShortObjName: *objEntry.ObjectShortName(),
   935  				IsDelete:     true,
   936  
   937  				IsAppendable: objEntry.EntryState,
   938  			}
   939  			p.objectIndexByTS.Delete(old)
   940  			objEntry.DeleteTime = deleteTime
   941  			p.dataObjects.Set(objEntry)
   942  
   943  			new := ObjectIndexByTSEntry{
   944  				Time:         objEntry.DeleteTime,
   945  				ShortObjName: *objEntry.ObjectShortName(),
   946  				IsDelete:     true,
   947  
   948  				IsAppendable: objEntry.EntryState,
   949  			}
   950  			p.objectIndexByTS.Set(new)
   951  		} else if objEntry.DeleteTime.Equal(&deleteTime) {
   952  			//FIXME:: should we do something here?
   953  			e := ObjectIndexByTSEntry{
   954  				Time:         objEntry.DeleteTime,
   955  				ShortObjName: *objEntry.ObjectShortName(),
   956  				IsDelete:     true,
   957  
   958  				IsAppendable: objEntry.EntryState,
   959  			}
   960  			p.objectIndexByTS.Set(e)
   961  		}
   962  	}
   963  }
   964  
   965  func (p *PartitionState) HandleMetadataDelete(
   966  	ctx context.Context,
   967  	tableID uint64,
   968  	input *api.Batch) {
   969  	ctx, task := trace.NewTask(ctx, "PartitionState.HandleMetadataDelete")
   970  	defer task.End()
   971  
   972  	perfcounter.Update(ctx, func(c *perfcounter.CounterSet) {
   973  		c.DistTAE.Logtail.Entries.Add(1)
   974  		c.DistTAE.Logtail.MetadataDeleteEntries.Add(1)
   975  	})
   976  }
   977  
   978  func (p *PartitionState) CacheCkpDuration(
   979  	start types.TS,
   980  	end types.TS,
   981  	partition *Partition) {
   982  	if partition.checkpointConsumed.Load() {
   983  		panic("checkpoints already consumed")
   984  	}
   985  	p.start = start
   986  	p.end = end
   987  }
   988  
   989  func (p *PartitionState) AppendCheckpoint(
   990  	checkpoint string,
   991  	partiton *Partition) {
   992  	if partiton.checkpointConsumed.Load() {
   993  		panic("checkpoints already consumed")
   994  	}
   995  	p.checkpoints = append(p.checkpoints, checkpoint)
   996  }
   997  
   998  func (p *PartitionState) consumeCheckpoints(
   999  	fn func(checkpoint string, state *PartitionState) error,
  1000  ) error {
  1001  	for _, checkpoint := range p.checkpoints {
  1002  		if err := fn(checkpoint, p); err != nil {
  1003  			return err
  1004  		}
  1005  	}
  1006  	p.checkpoints = p.checkpoints[:0]
  1007  	return nil
  1008  }
  1009  
  1010  func (p *PartitionState) truncate(ids [2]uint64, ts types.TS) {
  1011  	if p.minTS.Greater(&ts) {
  1012  		logutil.Errorf("logic error: current minTS %v, incoming ts %v", p.minTS.ToString(), ts.ToString())
  1013  		return
  1014  	}
  1015  	p.minTS = ts
  1016  	gced := false
  1017  	pivot := ObjectIndexByTSEntry{
  1018  		Time:         ts.Next(),
  1019  		ShortObjName: objectio.ObjectNameShort{},
  1020  		IsDelete:     true,
  1021  	}
  1022  	iter := p.objectIndexByTS.Copy().Iter()
  1023  	ok := iter.Seek(pivot)
  1024  	if !ok {
  1025  		ok = iter.Last()
  1026  	}
  1027  	objIDsToDelete := make(map[objectio.ObjectNameShort]struct{}, 0)
  1028  	objectsToDelete := ""
  1029  	for ; ok; ok = iter.Prev() {
  1030  		entry := iter.Item()
  1031  		if entry.Time.Greater(&ts) {
  1032  			continue
  1033  		}
  1034  		if entry.IsDelete {
  1035  			objIDsToDelete[entry.ShortObjName] = struct{}{}
  1036  			if gced {
  1037  				objectsToDelete = fmt.Sprintf("%s, %v", objectsToDelete, entry.ShortObjName)
  1038  			} else {
  1039  				objectsToDelete = fmt.Sprintf("%s%v", objectsToDelete, entry.ShortObjName)
  1040  			}
  1041  			gced = true
  1042  		}
  1043  	}
  1044  	iter = p.objectIndexByTS.Copy().Iter()
  1045  	ok = iter.Seek(pivot)
  1046  	if !ok {
  1047  		ok = iter.Last()
  1048  	}
  1049  	for ; ok; ok = iter.Prev() {
  1050  		entry := iter.Item()
  1051  		if entry.Time.Greater(&ts) {
  1052  			continue
  1053  		}
  1054  		if _, ok := objIDsToDelete[entry.ShortObjName]; ok {
  1055  			p.objectIndexByTS.Delete(entry)
  1056  		}
  1057  	}
  1058  	if gced {
  1059  		logutil.Infof("GC partition_state at %v for table %d:%s", ts.ToString(), ids[1], objectsToDelete)
  1060  	}
  1061  
  1062  	objsToDelete := ""
  1063  	objIter := p.dataObjects.Copy().Iter()
  1064  	objGced := false
  1065  	firstCalled := false
  1066  	for {
  1067  		if !firstCalled {
  1068  			if !objIter.First() {
  1069  				break
  1070  			}
  1071  			firstCalled = true
  1072  		} else {
  1073  			if !objIter.Next() {
  1074  				break
  1075  			}
  1076  		}
  1077  
  1078  		objEntry := objIter.Item()
  1079  
  1080  		if !objEntry.DeleteTime.IsEmpty() && objEntry.DeleteTime.LessEq(&ts) {
  1081  			p.dataObjects.Delete(objEntry)
  1082  			//p.dataObjectsByCreateTS.Delete(ObjectIndexByCreateTSEntry{
  1083  			//	//CreateTime:   objEntry.CreateTime,
  1084  			//	//ShortObjName: objEntry.ShortObjName,
  1085  			//	ObjectInfo: objEntry.ObjectInfo,
  1086  			//})
  1087  			if objGced {
  1088  				objsToDelete = fmt.Sprintf("%s, %s", objsToDelete, objEntry.Location().Name().String())
  1089  			} else {
  1090  				objsToDelete = fmt.Sprintf("%s%s", objsToDelete, objEntry.Location().Name().String())
  1091  			}
  1092  			objGced = true
  1093  		}
  1094  	}
  1095  	if objGced {
  1096  		logutil.Infof("GC partition_state at %v for table %d:%s", ts.ToString(), ids[1], objsToDelete)
  1097  	}
  1098  }
  1099  
  1100  func (p *PartitionState) LastFlushTimestamp() types.TS {
  1101  	p.shared.Lock()
  1102  	defer p.shared.Unlock()
  1103  	return p.shared.lastFlushTimestamp
  1104  }