github.com/matrixorigin/matrixone@v1.2.0/pkg/vm/engine/disttae/reader.go (about)

     1  // Copyright 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package disttae
    16  
    17  import (
    18  	"context"
    19  	"sort"
    20  	"time"
    21  
    22  	"github.com/matrixorigin/matrixone/pkg/catalog"
    23  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    24  	"github.com/matrixorigin/matrixone/pkg/common/mpool"
    25  	"github.com/matrixorigin/matrixone/pkg/container/batch"
    26  	"github.com/matrixorigin/matrixone/pkg/container/types"
    27  	"github.com/matrixorigin/matrixone/pkg/container/vector"
    28  	"github.com/matrixorigin/matrixone/pkg/fileservice"
    29  	"github.com/matrixorigin/matrixone/pkg/logutil"
    30  	"github.com/matrixorigin/matrixone/pkg/objectio"
    31  	"github.com/matrixorigin/matrixone/pkg/pb/plan"
    32  	"github.com/matrixorigin/matrixone/pkg/pb/timestamp"
    33  	"github.com/matrixorigin/matrixone/pkg/perfcounter"
    34  	plan2 "github.com/matrixorigin/matrixone/pkg/sql/plan"
    35  	"github.com/matrixorigin/matrixone/pkg/testutil"
    36  	"github.com/matrixorigin/matrixone/pkg/txn/trace"
    37  	v2 "github.com/matrixorigin/matrixone/pkg/util/metric/v2"
    38  	"github.com/matrixorigin/matrixone/pkg/vm/engine"
    39  	"github.com/matrixorigin/matrixone/pkg/vm/engine/disttae/logtailreplay"
    40  	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/blockio"
    41  	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/index"
    42  	"github.com/matrixorigin/matrixone/pkg/vm/process"
    43  	"go.uber.org/zap"
    44  )
    45  
    46  // -----------------------------------------------------------------
    47  // ------------------------ withFilterMixin ------------------------
    48  // -----------------------------------------------------------------
    49  
    50  func (mixin *withFilterMixin) reset() {
    51  	mixin.filterState.evaluated = false
    52  	mixin.filterState.filter = nil
    53  	mixin.columns.pkPos = -1
    54  	mixin.columns.indexOfFirstSortedColumn = -1
    55  	mixin.columns.seqnums = nil
    56  	mixin.columns.colTypes = nil
    57  	mixin.sels = nil
    58  }
    59  
    60  // when the reader.Read is called for a new block, it will always
    61  // call tryUpdate to update the seqnums
    62  // NOTE: here we assume the tryUpdate is always called with the same cols
    63  // for all blocks and it will only be updated once
    64  func (mixin *withFilterMixin) tryUpdateColumns(cols []string) {
    65  	if len(cols) == len(mixin.columns.seqnums) {
    66  		return
    67  	}
    68  	if len(mixin.columns.seqnums) != 0 {
    69  		panic(moerr.NewInternalErrorNoCtx("withFilterMixin tryUpdate called with different cols"))
    70  	}
    71  
    72  	// record the column selectivity
    73  	chit, ctotal := len(cols), len(mixin.tableDef.Cols)
    74  	v2.TaskSelColumnTotal.Add(float64(ctotal))
    75  	v2.TaskSelColumnHit.Add(float64(ctotal - chit))
    76  	blockio.RecordColumnSelectivity(chit, ctotal)
    77  
    78  	mixin.columns.seqnums = make([]uint16, len(cols))
    79  	mixin.columns.colTypes = make([]types.Type, len(cols))
    80  	// mixin.columns.colNulls = make([]bool, len(cols))
    81  	mixin.columns.pkPos = -1
    82  	mixin.columns.indexOfFirstSortedColumn = -1
    83  	compPKName2Pos := make(map[string]struct{})
    84  	positions := make(map[string]int)
    85  	if mixin.tableDef.Pkey != nil && mixin.tableDef.Pkey.CompPkeyCol != nil {
    86  		pk := mixin.tableDef.Pkey
    87  		for _, name := range pk.Names {
    88  			compPKName2Pos[name] = struct{}{}
    89  		}
    90  	}
    91  	for i, column := range cols {
    92  		if column == catalog.Row_ID {
    93  			mixin.columns.seqnums[i] = objectio.SEQNUM_ROWID
    94  			mixin.columns.colTypes[i] = objectio.RowidType
    95  		} else {
    96  			if plan2.GetSortOrderByName(mixin.tableDef, column) == 0 {
    97  				mixin.columns.indexOfFirstSortedColumn = i
    98  			}
    99  			colIdx := mixin.tableDef.Name2ColIndex[column]
   100  			colDef := mixin.tableDef.Cols[colIdx]
   101  			mixin.columns.seqnums[i] = uint16(colDef.Seqnum)
   102  
   103  			if _, ok := compPKName2Pos[column]; ok {
   104  				positions[column] = i
   105  			}
   106  
   107  			if mixin.tableDef.Pkey != nil && mixin.tableDef.Pkey.PkeyColName == column {
   108  				// primary key is in the cols
   109  				mixin.columns.pkPos = i
   110  			}
   111  			mixin.columns.colTypes[i] = types.T(colDef.Typ.Id).ToType()
   112  			// if colDef.Default != nil {
   113  			// 	mixin.columns.colNulls[i] = colDef.Default.NullAbility
   114  			// }
   115  		}
   116  	}
   117  	if len(positions) != 0 {
   118  		for _, name := range mixin.tableDef.Pkey.Names {
   119  			if pos, ok := positions[name]; !ok {
   120  				break
   121  			} else {
   122  				mixin.columns.compPKPositions = append(mixin.columns.compPKPositions, uint16(pos))
   123  			}
   124  		}
   125  	}
   126  }
   127  
   128  func (mixin *withFilterMixin) getReadFilter(proc *process.Process, blkCnt int) (
   129  	filter blockio.ReadFilter,
   130  ) {
   131  	if mixin.filterState.evaluated {
   132  		filter = mixin.filterState.filter
   133  		return
   134  	}
   135  	pk := mixin.tableDef.Pkey
   136  	if pk == nil {
   137  		mixin.filterState.evaluated = true
   138  		mixin.filterState.filter = nil
   139  		return
   140  	}
   141  	if pk.CompPkeyCol == nil {
   142  		return mixin.getNonCompositPKFilter(proc, blkCnt)
   143  	}
   144  	return mixin.getCompositPKFilter(proc, blkCnt)
   145  }
   146  
// getCompositPKFilter builds the block read filter for a table with a
// composite primary key and caches the outcome in mixin.filterState.
//
// No filter (nil) is produced when none of the leading key columns were
// requested, when there is no filter expression, or when getCompositPKVals
// fails or yields no value for the first key column. In the last case
// filterState.hasNull is set to what getCompositPKVals reported.
//
// Otherwise the returned filter runs one per-column function for each of the
// first cnt key columns, feeding the selection produced by one column into
// the next, and returns the final selection. blkCnt is only used for the
// BlksByReaderStats record.
func (mixin *withFilterMixin) getCompositPKFilter(proc *process.Process, blkCnt int) (
	filter blockio.ReadFilter,
) {
	// if no primary key is included in the columns or no filter expr is given,
	// no filter is needed
	if len(mixin.columns.compPKPositions) == 0 || mixin.filterState.expr == nil {
		mixin.filterState.evaluated = true
		mixin.filterState.filter = nil
		return
	}

	// evaluate
	pkNames := mixin.tableDef.Pkey.Names
	pkVals := make([]*plan.Literal, len(pkNames))
	ok, hasNull := getCompositPKVals(mixin.filterState.expr, pkNames, pkVals, proc)

	// without a value for the first key column no filter can be built
	if !ok || pkVals[0] == nil {
		mixin.filterState.evaluated = true
		mixin.filterState.filter = nil
		mixin.filterState.hasNull = hasNull
		return
	}
	// keep only the first cnt values reported by getValidCompositePKCnt
	cnt := getValidCompositePKCnt(pkVals)
	pkVals = pkVals[:cnt]

	// one filter function per retained key value; the boolean tells the
	// helper whether this is the first column of the key
	filterFuncs := make([]func(*vector.Vector, []int32, *[]int32), len(pkVals))
	for i := range filterFuncs {
		filterFuncs[i] = getCompositeFilterFuncByExpr(pkVals[i], i == 0)
	}

	filter = func(vecs []*vector.Vector) []int32 {
		var (
			inputSels []int32
		)
		for i := range filterFuncs {
			// NOTE(review): assumes vecs holds at least len(filterFuncs)
			// vectors in primary key order; the seqnums recorded below come
			// from compPKPositions - confirm the two lengths always agree.
			vec := vecs[i]
			// NOTE(review): after the first round inputSels shares its
			// backing array with mixin.sels, which is truncated here and
			// refilled by the call below - presumably each function reads an
			// input entry before writing its slot; confirm.
			mixin.sels = mixin.sels[:0]
			filterFuncs[i](vec, inputSels, &mixin.sels)
			// nothing selected by this column: later columns cannot add rows
			if len(mixin.sels) == 0 {
				break
			}
			inputSels = mixin.sels
		}
		// logutil.Debugf("%s: %d/%d", mixin.tableDef.Name, len(res), vecs[0].Length())

		// the returned slice is mixin.sels itself and is overwritten by the
		// next invocation of the filter
		return mixin.sels
	}

	mixin.filterState.evaluated = true
	mixin.filterState.filter = filter
	// the filter columns are the requested key columns, in key order
	mixin.filterState.seqnums = make([]uint16, 0, len(mixin.columns.compPKPositions))
	mixin.filterState.colTypes = make([]types.Type, 0, len(mixin.columns.compPKPositions))
	for _, pos := range mixin.columns.compPKPositions {
		mixin.filterState.seqnums = append(mixin.filterState.seqnums, mixin.columns.seqnums[pos])
		mixin.filterState.colTypes = append(mixin.filterState.colTypes, mixin.columns.colTypes[pos])
	}
	// records how many blks one reader needs to read when having filter
	objectio.BlkReadStats.BlksByReaderStats.Record(1, blkCnt)
	return
}
   207  
   208  func (mixin *withFilterMixin) getNonCompositPKFilter(proc *process.Process, blkCnt int) blockio.ReadFilter {
   209  	// if no primary key is included in the columns or no filter expr is given,
   210  	// no filter is needed
   211  	if mixin.columns.pkPos == -1 || mixin.filterState.expr == nil {
   212  		mixin.filterState.evaluated = true
   213  		mixin.filterState.filter = nil
   214  		return nil
   215  	}
   216  
   217  	// evaluate the search function for the filter
   218  	// if the search function is not found, no filter is needed
   219  	// primary key must be used by the expr in one of the following patterns:
   220  	// A: $pk = const_value
   221  	// B: const_value = $pk
   222  	// C: {A|B} and {A|B}
   223  	// D: {A|B|C} [and {A|B|C}]*
   224  	// for other patterns, no filter is needed
   225  	ok, hasNull, searchFunc := getNonCompositePKSearchFuncByExpr(
   226  		mixin.filterState.expr,
   227  		mixin.tableDef.Pkey.PkeyColName,
   228  		proc,
   229  	)
   230  	if !ok || searchFunc == nil {
   231  		mixin.filterState.evaluated = true
   232  		mixin.filterState.filter = nil
   233  		mixin.filterState.hasNull = hasNull
   234  		return nil
   235  	}
   236  
   237  	// here we will select the primary key column from the vectors, and
   238  	// use the search function to find the offset of the primary key.
   239  	// it returns the offset of the primary key in the pk vector.
   240  	// if the primary key is not found, it returns empty slice
   241  	mixin.filterState.evaluated = true
   242  	mixin.filterState.filter = searchFunc
   243  	mixin.filterState.seqnums = []uint16{mixin.columns.seqnums[mixin.columns.pkPos]}
   244  	mixin.filterState.colTypes = mixin.columns.colTypes[mixin.columns.pkPos : mixin.columns.pkPos+1]
   245  
   246  	// records how many blks one reader needs to read when having filter
   247  	objectio.BlkReadStats.BlksByReaderStats.Record(1, blkCnt)
   248  	return searchFunc
   249  }
   250  
   251  // -----------------------------------------------------------------
   252  // ------------------------ emptyReader ----------------------------
   253  // -----------------------------------------------------------------
   254  
   255  func (r *emptyReader) SetFilterZM(objectio.ZoneMap) {
   256  }
   257  
// GetOrderBy always returns nil: an emptyReader keeps no order-by specs.
func (r *emptyReader) GetOrderBy() []*plan.OrderBySpec {
	return nil
}
   261  
   262  func (r *emptyReader) SetOrderBy([]*plan.OrderBySpec) {
   263  }
   264  
// Close does nothing and always returns nil.
func (r *emptyReader) Close() error {
	return nil
}
   268  
   269  func (r *emptyReader) Read(_ context.Context, _ []string,
   270  	_ *plan.Expr, _ *mpool.MPool, _ engine.VectorPool) (*batch.Batch, error) {
   271  	return nil, nil
   272  }
   273  
   274  // -----------------------------------------------------------------
   275  // ------------------------ blockReader ----------------------------
   276  // -----------------------------------------------------------------
   277  
   278  func newBlockReader(
   279  	ctx context.Context,
   280  	tableDef *plan.TableDef,
   281  	ts timestamp.Timestamp,
   282  	blks []*objectio.BlockInfo,
   283  	filterExpr *plan.Expr,
   284  	fs fileservice.FileService,
   285  	proc *process.Process,
   286  ) *blockReader {
   287  	for _, blk := range blks {
   288  		trace.GetService().TxnReadBlock(
   289  			proc.TxnOperator,
   290  			tableDef.TblId,
   291  			blk.BlockID[:])
   292  	}
   293  	r := &blockReader{
   294  		withFilterMixin: withFilterMixin{
   295  			ctx:      ctx,
   296  			fs:       fs,
   297  			ts:       ts,
   298  			proc:     proc,
   299  			tableDef: tableDef,
   300  		},
   301  		blks: blks,
   302  	}
   303  	r.filterState.expr = filterExpr
   304  	return r
   305  }
   306  
   307  func (r *blockReader) Close() error {
   308  	r.withFilterMixin.reset()
   309  	r.blks = nil
   310  	r.buffer = nil
   311  	return nil
   312  }
   313  
   314  func (r *blockReader) SetFilterZM(zm objectio.ZoneMap) {
   315  	if !r.filterZM.IsInited() {
   316  		r.filterZM = zm.Clone()
   317  		return
   318  	}
   319  	if r.desc && r.filterZM.CompareMax(zm) < 0 {
   320  		r.filterZM = zm.Clone()
   321  		return
   322  	}
   323  	if !r.desc && r.filterZM.CompareMin(zm) > 0 {
   324  		r.filterZM = zm.Clone()
   325  		return
   326  	}
   327  }
   328  
// GetOrderBy returns the order-by specs previously stored with SetOrderBy.
func (r *blockReader) GetOrderBy() []*plan.OrderBySpec {
	return r.OrderBy
}
   332  
// SetOrderBy stores the order-by specs; a non-empty list makes Read sort the
// block list by zone map and skip blocks using the filter zone map.
func (r *blockReader) SetOrderBy(orderby []*plan.OrderBySpec) {
	r.OrderBy = orderby
}
   336  
   337  func (r *blockReader) needReadBlkByZM(i int) bool {
   338  	zm := r.blockZMS[i]
   339  	if !r.filterZM.IsInited() || !zm.IsInited() {
   340  		return true
   341  	}
   342  	if r.desc {
   343  		return r.filterZM.CompareMax(zm) <= 0
   344  	} else {
   345  		return r.filterZM.CompareMin(zm) >= 0
   346  	}
   347  }
   348  
   349  func (r *blockReader) getBlockZMs() {
   350  	orderByCol, _ := r.OrderBy[0].Expr.Expr.(*plan.Expr_Col)
   351  	orderByColIDX := int(r.tableDef.Cols[int(orderByCol.Col.ColPos)].Seqnum)
   352  
   353  	r.blockZMS = make([]index.ZM, len(r.blks))
   354  	var objDataMeta objectio.ObjectDataMeta
   355  	var location objectio.Location
   356  	for i := range r.blks {
   357  		location = r.blks[i].MetaLocation()
   358  		if !objectio.IsSameObjectLocVsMeta(location, objDataMeta) {
   359  			objMeta, err := objectio.FastLoadObjectMeta(r.ctx, &location, false, r.fs)
   360  			if err != nil {
   361  				panic("load object meta error when ordered scan!")
   362  			}
   363  			objDataMeta = objMeta.MustDataMeta()
   364  		}
   365  		blkMeta := objDataMeta.GetBlockMeta(uint32(location.ID()))
   366  		r.blockZMS[i] = blkMeta.ColumnMeta(uint16(orderByColIDX)).ZoneMap()
   367  	}
   368  }
   369  
   370  func (r *blockReader) sortBlockList() {
   371  	helper := make([]*blockSortHelper, len(r.blks))
   372  	for i := range r.blks {
   373  		helper[i] = &blockSortHelper{}
   374  		helper[i].blk = r.blks[i]
   375  		helper[i].zm = r.blockZMS[i]
   376  	}
   377  	if r.desc {
   378  		sort.Slice(helper, func(i, j int) bool {
   379  			zm1 := helper[i].zm
   380  			if !zm1.IsInited() {
   381  				return true
   382  			}
   383  			zm2 := helper[j].zm
   384  			if !zm2.IsInited() {
   385  				return false
   386  			}
   387  			return zm1.CompareMax(zm2) > 0
   388  		})
   389  	} else {
   390  		sort.Slice(helper, func(i, j int) bool {
   391  			zm1 := helper[i].zm
   392  			if !zm1.IsInited() {
   393  				return true
   394  			}
   395  			zm2 := helper[j].zm
   396  			if !zm2.IsInited() {
   397  				return false
   398  			}
   399  			return zm1.CompareMin(zm2) < 0
   400  		})
   401  	}
   402  
   403  	for i := range helper {
   404  		r.blks[i] = helper[i].blk
   405  		r.blockZMS[i] = helper[i].zm
   406  	}
   407  }
   408  
// deleteFirstNBlocks drops the first n entries of r.blks. When an order-by
// is set, the same number of entries is dropped from r.blockZMS so that the
// zone maps stay aligned with the blocks.
func (r *blockReader) deleteFirstNBlocks(n int) {
	r.blks = r.blks[n:]
	if len(r.OrderBy) > 0 {
		r.blockZMS = r.blockZMS[n:]
	}
}
   415  
// Read reads the next block of r.blks and returns it as a batch whose
// attributes are cols. It returns (nil, nil) when no block is left to read.
//
// With an order-by set, the first call sorts the block list by zone map, and
// every call first drops the leading blocks that needReadBlkByZM rejects.
//
// Once a block has been picked, it is removed from the list when Read
// returns, whatever the outcome: this includes error returns and the early
// (nil, nil) return taken when the primary key filter reports a null.
//
// The ctx and *plan.Expr arguments are not used by this method: the context
// stored in the reader (r.ctx) and the filter expression given at
// construction time are used instead.
func (r *blockReader) Read(
	ctx context.Context,
	cols []string,
	_ *plan.Expr,
	mp *mpool.MPool,
	vp engine.VectorPool,
) (bat *batch.Batch, err error) {
	// observe the duration of the whole call
	start := time.Now()
	defer func() {
		v2.TxnBlockReaderDurationHistogram.Observe(time.Since(start).Seconds())
	}()

	// for ordered scan, sort blocklist by zonemap info, and then filter by zonemap
	if len(r.OrderBy) > 0 {
		if !r.sorted {
			r.desc = r.OrderBy[0].Flag&plan.OrderBySpec_DESC != 0
			r.getBlockZMs()
			r.sortBlockList()
			r.sorted = true
		}
		// skip the leading blocks that the filter zone map rules out
		i := 0
		for i < len(r.blks) {
			if r.needReadBlkByZM(i) {
				break
			}
			i++
		}
		r.deleteFirstNBlocks(i)
	}
	// if the block list is empty, return nil
	if len(r.blks) == 0 {
		return nil, nil
	}

	// move to the next block at the end of this call
	// (runs on every return below, including error and null-filter returns)
	defer func() {
		r.deleteFirstNBlocks(1)
		r.buffer = r.buffer[:0]
		r.currentStep++
	}()

	// get the current block to be read
	blockInfo := r.blks[0]

	// try to update the columns
	// the columns is only updated once for all blocks
	r.tryUpdateColumns(cols)

	// get the block read filter
	filter := r.getReadFilter(r.proc, len(r.blks))

	// if any null expr is found in the primary key (composite primary keys), quick return
	if r.filterState.hasNull {
		return nil, nil
	}

	if !r.dontPrefetch {
		//prefetch some objects
		// one entry of r.infos is prefetched for every entry of r.steps that
		// equals the current step
		for len(r.steps) > 0 && r.steps[0] == r.currentStep {
			// always true for now, will optimize this in the future
			prefetchFile := r.scanType == SMALL || r.scanType == LARGE || r.scanType == NORMAL
			// with a filter on a sorted block only the filter columns are
			// prefetched, otherwise all requested columns
			if filter != nil && blockInfo.Sorted {
				err = blockio.BlockPrefetch(r.filterState.seqnums, r.fs, [][]*objectio.BlockInfo{r.infos[0]}, prefetchFile)
			} else {
				err = blockio.BlockPrefetch(r.columns.seqnums, r.fs, [][]*objectio.BlockInfo{r.infos[0]}, prefetchFile)
			}
			if err != nil {
				return nil, err
			}
			r.infos = r.infos[1:]
			r.steps = r.steps[1:]
		}
	}

	statsCtx, numRead, numHit := r.ctx, int64(0), int64(0)
	if filter != nil {
		// try to store the blkReadStats CounterSet into ctx, so that
		// it can record the mem cache hit stats when call MemCache.Read() later soon.
		statsCtx, numRead, numHit = r.prepareGatherStats()
	}

	// read the block
	// LARGE and NORMAL scans use the SkipMemoryCacheWrites policy; other scan
	// types keep the zero-value policy
	var policy fileservice.Policy
	if r.scanType == LARGE || r.scanType == NORMAL {
		policy = fileservice.SkipMemoryCacheWrites
	}
	bat, err = blockio.BlockRead(
		statsCtx, blockInfo, r.buffer, r.columns.seqnums, r.columns.colTypes, r.ts,
		r.filterState.seqnums,
		r.filterState.colTypes,
		filter,
		r.fs, mp, vp, policy,
	)
	if err != nil {
		return nil, err
	}

	if filter != nil {
		// we collect mem cache hit related statistics info for blk read here
		r.gatherStats(numRead, numHit)
	}

	bat.SetAttributes(cols)

	// flag the vector of the sort column as sorted when the block itself is sorted
	if blockInfo.Sorted && r.columns.indexOfFirstSortedColumn != -1 {
		bat.GetVector(int32(r.columns.indexOfFirstSortedColumn)).SetSorted(true)
	}

	if logutil.GetSkip1Logger().Core().Enabled(zap.DebugLevel) {
		logutil.Debug(testutil.OperatorCatchBatch("block reader", bat))
	}
	return bat, nil
}
   529  
   530  func (r *blockReader) prepareGatherStats() (context.Context, int64, int64) {
   531  	ctx := perfcounter.WithCounterSet(r.ctx, objectio.BlkReadStats.CounterSet)
   532  	return ctx, objectio.BlkReadStats.CounterSet.FileService.Cache.Read.Load(),
   533  		objectio.BlkReadStats.CounterSet.FileService.Cache.Hit.Load()
   534  }
   535  
   536  func (r *blockReader) gatherStats(lastNumRead, lastNumHit int64) {
   537  	numRead := objectio.BlkReadStats.CounterSet.FileService.Cache.Read.Load()
   538  	numHit := objectio.BlkReadStats.CounterSet.FileService.Cache.Hit.Load()
   539  
   540  	curNumRead := numRead - lastNumRead
   541  	curNumHit := numHit - lastNumHit
   542  
   543  	if curNumRead > curNumHit {
   544  		objectio.BlkReadStats.BlkCacheHitStats.Record(0, 1)
   545  	} else {
   546  		objectio.BlkReadStats.BlkCacheHitStats.Record(1, 1)
   547  	}
   548  
   549  	objectio.BlkReadStats.EntryCacheHitStats.Record(int(curNumHit), int(curNumRead))
   550  }
   551  
   552  // -----------------------------------------------------------------
   553  // ---------------------- blockMergeReader -------------------------
   554  // -----------------------------------------------------------------
   555  
   556  func newBlockMergeReader(
   557  	ctx context.Context,
   558  	txnTable *txnTable,
   559  	pkVal []byte,
   560  	ts timestamp.Timestamp,
   561  	dirtyBlks []*objectio.BlockInfo,
   562  	filterExpr *plan.Expr,
   563  	fs fileservice.FileService,
   564  	proc *process.Process,
   565  ) *blockMergeReader {
   566  	r := &blockMergeReader{
   567  		table: txnTable,
   568  		blockReader: newBlockReader(
   569  			ctx,
   570  			txnTable.GetTableDef(ctx),
   571  			ts,
   572  			dirtyBlks,
   573  			filterExpr,
   574  			fs,
   575  			proc,
   576  		),
   577  		pkVal:      pkVal,
   578  		deletaLocs: make(map[string][]objectio.Location),
   579  	}
   580  	return r
   581  }
   582  
// Close drops the reference to the table and closes the embedded blockReader.
func (r *blockMergeReader) Close() error {
	r.table = nil
	return r.blockReader.Close()
}
   587  
// prefetchDeletes issues a prefetch for delete data referenced by the
// transaction's blockId_tn_delete_metaLoc_batch entries of r.blks.
//
// While r.loaded is false, it decodes the delta locations recorded for each
// block and groups them in r.deletaLocs by location.Name(), then looks up the
// index of the primary key column in r.tableDef.Cols. Every call then
// prefetches the locations of at most one r.deletaLocs entry and removes that
// entry from the map.
func (r *blockMergeReader) prefetchDeletes() error {
	//load delta locations for r.blocks.
	// the read lock is held until the method returns, prefetch call included
	r.table.getTxn().blockId_tn_delete_metaLoc_batch.RLock()
	defer r.table.getTxn().blockId_tn_delete_metaLoc_batch.RUnlock()

	if !r.loaded {
		for _, info := range r.blks {
			bats, ok := r.table.getTxn().blockId_tn_delete_metaLoc_batch.data[info.BlockID]

			// NOTE(review): this returns as soon as one block has no entry -
			// the remaining blocks are not inspected, r.loaded stays false
			// and nothing is prefetched by this call. Confirm this is
			// intended rather than a `continue`.
			if !ok {
				return nil
			}
			for _, bat := range bats {
				// the first vector of each batch holds the delta locations as strings
				vs := vector.MustStrCol(bat.GetVector(0))
				for _, deltaLoc := range vs {
					location, err := blockio.EncodeLocationFromString(deltaLoc)
					if err != nil {
						return err
					}
					r.deletaLocs[location.Name().String()] =
						append(r.deletaLocs[location.Name().String()], location)
				}
			}
		}

		// Get Single Col pk index
		// NOTE(review): assumes r.tableDef.Pkey is non-nil - TODO confirm
		for idx, colDef := range r.tableDef.Cols {
			if colDef.Name == r.tableDef.Pkey.PkeyColName {
				r.pkidx = idx
				break
			}
		}
		r.loaded = true
	}

	//prefetch the deletes
	// the loop body always returns, so a single map entry (in map iteration
	// order) is handled per call
	for name, locs := range r.deletaLocs {
		pref, err := blockio.BuildPrefetchParams(r.fs, locs[0])
		if err != nil {
			return err
		}
		for _, loc := range locs {
			//rowid + pk
			pref.AddBlockWithType([]uint16{0, uint16(r.pkidx)}, []uint16{loc.ID()}, uint16(objectio.SchemaTombstone))

		}
		delete(r.deletaLocs, name)
		return blockio.PrefetchWithMerged(pref)
	}
	return nil
}
   639  
// loadDeletes collects the row offsets deleted in the next block to be read
// (r.blks[0]) and appends them to r.buffer. It does nothing when no block is
// left.
//
// Offsets are gathered, in this order, from:
//  1. r.table.LoadDeletesForBlock,
//  2. the partition state of the table,
//  3. the delete entries among the transaction's writes for this table,
//  4. the transaction's deletedBlocks.
func (r *blockMergeReader) loadDeletes(ctx context.Context, cols []string) error {
	if len(r.blks) == 0 {
		return nil
	}
	info := r.blks[0]

	// the columns must be resolved before the read filter can be computed below
	r.tryUpdateColumns(cols)
	// load deletes from txn.blockId_dn_delete_metaLoc_batch
	err := r.table.LoadDeletesForBlock(info.BlockID, &r.buffer)
	if err != nil {
		return err
	}

	// load deletes from partition state for the specified block
	filter := r.getReadFilter(r.proc, len(r.blks))

	state, err := r.table.getPartitionState(ctx)
	if err != nil {
		return err
	}
	ts := types.TimestampToTS(r.ts)

	if filter != nil && info.Sorted && len(r.pkVal) > 0 {
		// a primary key value is available: only visit the entries of this
		// block whose key starts with r.pkVal, keeping the deleted ones
		iter := state.NewPrimaryKeyDelIter(
			ts,
			logtailreplay.Prefix(r.pkVal),
			info.BlockID,
		)
		for iter.Next() {
			entry := iter.Entry()
			if !entry.Deleted {
				continue
			}
			_, offset := entry.RowID.Decode()
			r.buffer = append(r.buffer, int64(offset))
		}
		iter.Close()
	} else {
		// otherwise take every row returned by the rows iterator of this block
		// NOTE(review): the `true` argument presumably restricts the iterator
		// to deleted rows - confirm against NewRowsIter
		iter := state.NewRowsIter(ts, &info.BlockID, true)
		currlen := len(r.buffer)
		for iter.Next() {
			entry := iter.Entry()
			_, offset := entry.RowID.Decode()
			r.buffer = append(r.buffer, int64(offset))
		}
		// number of offsets added by this branch
		v2.TaskLoadMemDeletesPerBlockHistogram.Observe(float64(len(r.buffer) - currlen))
		iter.Close()
	}

	//TODO:: if r.table.writes is a map , the time complexity could be O(1)
	//load deletes from txn.writes for the specified block
	r.table.getTxn().forEachTableWrites(
		r.table.db.databaseId,
		r.table.tableId,
		r.table.getTxn().GetSnapshotWriteOffset(), func(entry Entry) {
			if entry.isGeneratedByTruncate() {
				return
			}
			// only delete entries without a file name carry row ids in their batch
			if (entry.typ == DELETE || entry.typ == DELETE_TXN) && entry.fileName == "" {
				vs := vector.MustFixedCol[types.Rowid](entry.bat.GetVector(0))
				for _, v := range vs {
					id, offset := v.Decode()
					if id == info.BlockID {
						r.buffer = append(r.buffer, int64(offset))
					}
				}
			}
		})
	//load deletes from txn.deletedBlocks.
	txn := r.table.getTxn()
	txn.deletedBlocks.getDeletedOffsetsByBlock(&info.BlockID, &r.buffer)
	return nil
}
   713  
   714  func (r *blockMergeReader) Read(
   715  	ctx context.Context,
   716  	cols []string,
   717  	expr *plan.Expr,
   718  	mp *mpool.MPool,
   719  	vp engine.VectorPool,
   720  ) (*batch.Batch, error) {
   721  	start := time.Now()
   722  	defer func() {
   723  		v2.TxnBlockMergeReaderDurationHistogram.Observe(time.Since(start).Seconds())
   724  	}()
   725  
   726  	//prefetch deletes for r.blks
   727  	if err := r.prefetchDeletes(); err != nil {
   728  		return nil, err
   729  	}
   730  	//load deletes for the specified block
   731  	if err := r.loadDeletes(ctx, cols); err != nil {
   732  		return nil, err
   733  	}
   734  	return r.blockReader.Read(ctx, cols, expr, mp, vp)
   735  }
   736  
   737  // -----------------------------------------------------------------
   738  // ------------------------ mergeReader ----------------------------
   739  // -----------------------------------------------------------------
   740  
   741  func NewMergeReader(readers []engine.Reader) *mergeReader {
   742  	return &mergeReader{
   743  		rds: readers,
   744  	}
   745  }
   746  
   747  func (r *mergeReader) SetFilterZM(zm objectio.ZoneMap) {
   748  	for i := range r.rds {
   749  		r.rds[i].SetFilterZM(zm)
   750  	}
   751  }
   752  
   753  func (r *mergeReader) GetOrderBy() []*plan.OrderBySpec {
   754  	for i := range r.rds {
   755  		if r.rds[i].GetOrderBy() != nil {
   756  			return r.rds[i].GetOrderBy()
   757  		}
   758  	}
   759  	return nil
   760  }
   761  
   762  func (r *mergeReader) SetOrderBy(orderby []*plan.OrderBySpec) {
   763  	for i := range r.rds {
   764  		r.rds[i].SetOrderBy(orderby)
   765  	}
   766  }
   767  
// Close does nothing and always returns nil.
// NOTE(review): the wrapped readers are not closed here; Read closes them
// only on its error path - confirm who owns closing them otherwise.
func (r *mergeReader) Close() error {
	return nil
}
   771  
   772  func (r *mergeReader) Read(
   773  	ctx context.Context,
   774  	cols []string,
   775  	expr *plan.Expr,
   776  	mp *mpool.MPool,
   777  	vp engine.VectorPool,
   778  ) (*batch.Batch, error) {
   779  	start := time.Now()
   780  	defer func() {
   781  		v2.TxnMergeReaderDurationHistogram.Observe(time.Since(start).Seconds())
   782  	}()
   783  
   784  	if len(r.rds) == 0 {
   785  		return nil, nil
   786  	}
   787  	for len(r.rds) > 0 {
   788  		bat, err := r.rds[0].Read(ctx, cols, expr, mp, vp)
   789  		if err != nil {
   790  			for _, rd := range r.rds {
   791  				rd.Close()
   792  			}
   793  			return nil, err
   794  		}
   795  		if bat == nil {
   796  			r.rds = r.rds[1:]
   797  		}
   798  		if bat != nil {
   799  			if logutil.GetSkip1Logger().Core().Enabled(zap.DebugLevel) {
   800  				logutil.Debug(testutil.OperatorCatchBatch("merge reader", bat))
   801  			}
   802  			return bat, nil
   803  		}
   804  	}
   805  	return nil, nil
   806  }