github.com/matrixorigin/matrixone@v0.7.0/pkg/vm/engine/disttae/partition.go (about)

     1  // Copyright 2022 Matrix Origin
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package disttae
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"unsafe"
    21  
    22  	"github.com/matrixorigin/matrixone/pkg/catalog"
    23  	"github.com/matrixorigin/matrixone/pkg/common/moerr"
    24  	"github.com/matrixorigin/matrixone/pkg/common/moprobe"
    25  	"github.com/matrixorigin/matrixone/pkg/container/batch"
    26  	"github.com/matrixorigin/matrixone/pkg/container/types"
    27  	"github.com/matrixorigin/matrixone/pkg/container/vector"
    28  	"github.com/matrixorigin/matrixone/pkg/fileservice"
    29  	"github.com/matrixorigin/matrixone/pkg/pb/api"
    30  	"github.com/matrixorigin/matrixone/pkg/pb/plan"
    31  	"github.com/matrixorigin/matrixone/pkg/pb/timestamp"
    32  	"github.com/matrixorigin/matrixone/pkg/txn/storage/memorystorage/memorytable"
    33  	"github.com/matrixorigin/matrixone/pkg/txn/storage/memorystorage/memtable"
    34  	"github.com/matrixorigin/matrixone/pkg/vm/engine"
    35  )
    36  
    37  func NewPartition(
    38  	columnsIndexDefs []ColumnsIndexDef,
    39  ) *Partition {
    40  	lock := make(chan struct{}, 1)
    41  	lock <- struct{}{}
    42  	return &Partition{
    43  		lock:             lock,
    44  		data:             memtable.NewTable[RowID, DataValue, *DataRow](),
    45  		columnsIndexDefs: columnsIndexDefs,
    46  	}
    47  }
    48  
// RowID is the key type of a Partition's table. It has the same underlying
// representation as types.Rowid and is ordered by its Less method.
type RowID types.Rowid
    50  
    51  func (r RowID) Less(than RowID) bool {
    52  	return bytes.Compare(r[:], than[:]) < 0
    53  }
    54  
// DataValue is the value stored for a row in a Partition's table.
//
// op records which operation produced the value (opInsert or opDelete).
// value holds the row's column values keyed by batch attribute name; it is
// filled in by Insert and left nil by Delete.
type DataValue struct {
	op    uint8
	value map[string]memtable.Nullable
}
    59  
// DataRow is the row type stored in a Partition's table: a row ID, the value
// stored under it, and the index tuples under which the row can be found.
// Its Key, Value, Indexes and UniqueIndexes methods expose these fields.
type DataRow struct {
	rowID         RowID
	value         DataValue
	indexes       []memtable.Tuple
	uniqueIndexes []memtable.Tuple
}
    66  
// Operation codes stored in DataValue.op and as the last element of the
// block-id/time/op index tuple. They start at 1 (opInsert == 1,
// opDelete == 2), so the zero value is not a valid operation.
const (
	opInsert = iota + 1
	opDelete
)
    71  
// Key returns the row's ID, which is the key it is stored under.
func (d *DataRow) Key() RowID {
	return d.rowID
}
    75  
// Value returns the value stored for the row.
func (d *DataRow) Value() DataValue {
	return d.value
}
    79  
// Indexes returns the index tuples recorded for the row.
func (d *DataRow) Indexes() []memtable.Tuple {
	return d.indexes
}
    83  
// UniqueIndexes returns the unique-index tuples recorded for the row.
// NOTE(review): nothing visible in this file populates uniqueIndexes, so this
// appears to always return nil here — confirm against other callers.
func (d *DataRow) UniqueIndexes() []memtable.Tuple {
	return d.uniqueIndexes
}
    87  
    88  var _ MVCC = new(Partition)
    89  
    90  func (p *Partition) BlockList(ctx context.Context, ts timestamp.Timestamp,
    91  	blocks []BlockMeta, entries []Entry) ([]BlockMeta, map[uint64][]int) {
    92  	blks := make([]BlockMeta, 0, len(blocks))
    93  	deletes := make(map[uint64][]int)
    94  	if len(blocks) == 0 {
    95  		return blks, deletes
    96  	}
    97  	ids := make([]uint64, len(blocks))
    98  	for i := range blocks {
    99  		// if cn can see a appendable block, this block must contain all updates
   100  		// in cache, no need to do merge read, BlockRead will filter out
   101  		// invisible and deleted rows with respect to the timestamp
   102  		if !blocks[i].Info.EntryState {
   103  			ids[i] = blocks[i].Info.BlockID
   104  		}
   105  	}
   106  	p.IterDeletedRowIDs(ctx, ids, ts, func(rowID RowID) bool {
   107  		id, offset := catalog.DecodeRowid(types.Rowid(rowID))
   108  		deletes[id] = append(deletes[id], int(offset))
   109  		return true
   110  	})
   111  	for _, entry := range entries {
   112  		if entry.typ == DELETE {
   113  			vs := vector.MustTCols[types.Rowid](entry.bat.GetVector(0))
   114  			for _, v := range vs {
   115  				id, offset := catalog.DecodeRowid(v)
   116  				deletes[id] = append(deletes[id], int(offset))
   117  			}
   118  		}
   119  	}
   120  	for i := range blocks {
   121  		if _, ok := deletes[blocks[i].Info.BlockID]; !ok {
   122  			blks = append(blks, blocks[i])
   123  		}
   124  	}
   125  	return blks, deletes
   126  }
   127  
// CheckPoint is not implemented; calling it always panics.
func (*Partition) CheckPoint(ctx context.Context, ts timestamp.Timestamp) error {
	panic("unimplemented")
}
   131  
   132  func (p *Partition) Get(key types.Rowid, ts timestamp.Timestamp) bool {
   133  	t := memtable.Time{
   134  		Timestamp: ts,
   135  	}
   136  	tx := memtable.NewTransaction(
   137  		newMemTableTransactionID(),
   138  		t,
   139  		memtable.SnapshotIsolation,
   140  	)
   141  	if _, err := p.data.Get(tx, RowID(key)); err != nil {
   142  		return false
   143  	}
   144  	return true
   145  }
   146  
   147  func (p *Partition) Delete(ctx context.Context, b *api.Batch) error {
   148  	bat, err := batch.ProtoBatchToBatch(b)
   149  	if err != nil {
   150  		return err
   151  	}
   152  
   153  	txID := newMemTableTransactionID()
   154  
   155  	iter := memorytable.NewBatchIter(bat)
   156  	for {
   157  		tuple := iter()
   158  		if len(tuple) == 0 {
   159  			break
   160  		}
   161  
   162  		rowID := RowID(tuple[0].Value.(types.Rowid))
   163  		ts := tuple[1].Value.(types.TS)
   164  		t := memtable.Time{
   165  			Timestamp: timestamp.Timestamp{
   166  				PhysicalTime: ts.Physical(),
   167  				LogicalTime:  ts.Logical(),
   168  			},
   169  		}
   170  		tx := memtable.NewTransaction(txID, t, memtable.SnapshotIsolation)
   171  
   172  		// indexes
   173  		var indexes []memtable.Tuple
   174  		// block id, time, op
   175  		indexes = append(indexes, memtable.Tuple{
   176  			index_BlockID_Time_OP,
   177  			memtable.ToOrdered(rowIDToBlockID(rowID)),
   178  			ts,
   179  			memtable.Uint(opDelete),
   180  		})
   181  
   182  		err := p.data.Upsert(tx, &DataRow{
   183  			rowID: rowID,
   184  			value: DataValue{
   185  				op: opDelete,
   186  			},
   187  			indexes: indexes,
   188  		})
   189  		// the reason to ignore, see comments in Insert method
   190  		if moerr.IsMoErrCode(err, moerr.ErrTxnWriteConflict) {
   191  			continue
   192  		}
   193  		if err != nil {
   194  			return err
   195  		}
   196  
   197  		if err := tx.Commit(t); err != nil {
   198  			return err
   199  		}
   200  	}
   201  
   202  	return nil
   203  }
   204  
   205  func (p *Partition) Insert(ctx context.Context, primaryKeyIndex int,
   206  	b *api.Batch, needCheck bool) error {
   207  
   208  	// As an example, lets probe this function.  First we want to find a tag so that
   209  	// if several go routine call this function at the same time, we will not mix them.
   210  	// the pointer b works.
   211  	tag := int64(uintptr(unsafe.Pointer(b)))
   212  
   213  	// enter probe, only need tag.  Adding an extra arg just for demo purpose.
   214  	moprobe.DisttaePartitionInsert(tag, 1)
   215  
   216  	// defer, this is the return probe.  Use same tag value
   217  	defer moprobe.DisttaePartitionInsertRet(tag, 0x1020304050607080)
   218  
   219  	bat, err := batch.ProtoBatchToBatch(b)
   220  	if err != nil {
   221  		return err
   222  	}
   223  
   224  	txID := newMemTableTransactionID()
   225  
   226  	iter := memorytable.NewBatchIter(bat)
   227  	for {
   228  		tuple := iter()
   229  		if len(tuple) == 0 {
   230  			break
   231  		}
   232  
   233  		rowID := RowID(tuple[0].Value.(types.Rowid))
   234  		ts := tuple[1].Value.(types.TS)
   235  		t := memtable.Time{
   236  			Timestamp: timestamp.Timestamp{
   237  				PhysicalTime: ts.Physical(),
   238  				LogicalTime:  ts.Logical(),
   239  			},
   240  		}
   241  		tx := memtable.NewTransaction(txID, t, memtable.SnapshotIsolation)
   242  
   243  		// check primary key
   244  		var primaryKey any
   245  		if primaryKeyIndex >= 0 {
   246  			primaryKey = memtable.ToOrdered(tuple[primaryKeyIndex].Value)
   247  			entries, err := p.data.Index(tx, memtable.Tuple{
   248  				index_PrimaryKey,
   249  				primaryKey,
   250  			})
   251  			if err != nil {
   252  				return err
   253  			}
   254  			if len(entries) > 0 && needCheck {
   255  				return moerr.NewDuplicate(ctx)
   256  			}
   257  		}
   258  
   259  		dataValue := DataValue{
   260  			op:    opInsert,
   261  			value: make(map[string]memtable.Nullable),
   262  		}
   263  		for i := 2; i < len(tuple); i++ {
   264  			dataValue.value[bat.Attrs[i]] = tuple[i]
   265  		}
   266  
   267  		// indexes
   268  		var indexes []memtable.Tuple
   269  		// primary key
   270  		if primaryKey != nil {
   271  			indexes = append(indexes, memtable.Tuple{
   272  				index_PrimaryKey,
   273  				primaryKey,
   274  			})
   275  		}
   276  		// block id, time, op
   277  		indexes = append(indexes, memtable.Tuple{
   278  			index_BlockID_Time_OP,
   279  			memtable.ToOrdered(rowIDToBlockID(rowID)),
   280  			ts,
   281  			memtable.Uint(opInsert),
   282  		})
   283  		// columns indexes
   284  		for _, def := range p.columnsIndexDefs {
   285  			index := memtable.Tuple{
   286  				def.Name,
   287  			}
   288  			for _, col := range def.Columns {
   289  				index = append(index, memtable.ToOrdered(tuple[col].Value))
   290  			}
   291  			indexes = append(indexes, index)
   292  		}
   293  
   294  		err = p.data.Upsert(tx, &DataRow{
   295  			rowID:   rowID,
   296  			value:   dataValue,
   297  			indexes: indexes,
   298  		})
   299  		// if conflict comes up here,  probably the checkpoint from dn
   300  		// has duplicated history versions. As txn write conflict has been
   301  		// checked in dn, so it is safe to ignore this error
   302  		if moerr.IsMoErrCode(err, moerr.ErrTxnWriteConflict) {
   303  			continue
   304  		}
   305  		if err != nil {
   306  			return err
   307  		}
   308  		if err := tx.Commit(t); err != nil {
   309  			return err
   310  		}
   311  	}
   312  
   313  	return nil
   314  }
   315  
   316  func (p *Partition) GC(ts timestamp.Timestamp) error {
   317  	// remove versions only visible before ts
   318  	// assuming no transaction is reading or writing
   319  	t := memtable.Time{
   320  		Timestamp: ts,
   321  	}
   322  	err := p.data.FilterVersions(func(k RowID, versions []memtable.Version[DataValue]) (filtered []memtable.Version[DataValue], err error) {
   323  		for _, version := range versions {
   324  			if version.LockTime.IsZero() {
   325  				// not deleted
   326  				filtered = append(filtered, version)
   327  				continue
   328  			}
   329  			if version.LockTime.Equal(t) ||
   330  				version.LockTime.After(t) {
   331  				// still visible after ts
   332  				filtered = append(filtered, version)
   333  				continue
   334  			}
   335  		}
   336  		return
   337  	})
   338  	if err != nil {
   339  		return err
   340  	}
   341  	return nil
   342  }
   343  
   344  func (p *Partition) GetRowsByIndex(ts timestamp.Timestamp, index memtable.Tuple,
   345  	columns []string, deletes map[types.Rowid]uint8) (rows [][]any, err error) {
   346  	t := memtable.Time{
   347  		Timestamp: ts,
   348  	}
   349  	tx := memtable.NewTransaction(
   350  		newMemTableTransactionID(),
   351  		t,
   352  		memtable.SnapshotIsolation,
   353  	)
   354  	iter := p.data.NewIndexIter(tx, index, index)
   355  	for ok := iter.First(); ok; ok = iter.Next() {
   356  		entry := iter.Item()
   357  		if _, ok := deletes[types.Rowid(entry.Key)]; ok {
   358  			continue
   359  		}
   360  		data, err := p.data.Get(tx, entry.Key)
   361  		if err != nil {
   362  			return nil, err
   363  		}
   364  		rows = append(rows, genRow(&data, columns))
   365  	}
   366  	return
   367  }
   368  
   369  func (p *Partition) GetRowsByIndexPrefix(ts timestamp.Timestamp, prefix memtable.Tuple) (rows []DataValue, err error) {
   370  	t := memtable.Time{
   371  		Timestamp: ts,
   372  	}
   373  	tx := memtable.NewTransaction(
   374  		newMemTableTransactionID(),
   375  		t,
   376  		memtable.SnapshotIsolation,
   377  	)
   378  	iter := p.data.NewIndexIter(
   379  		tx,
   380  		append(append(prefix[:0:0], prefix...), memtable.Min),
   381  		append(append(prefix[:0:0], prefix...), memtable.Max),
   382  	)
   383  	for ok := iter.First(); ok; ok = iter.Next() {
   384  		entry := iter.Item()
   385  		data, err := p.data.Get(tx, entry.Key)
   386  		if err != nil {
   387  			return nil, err
   388  		}
   389  		rows = append(rows, data)
   390  	}
   391  	return
   392  }
   393  
   394  func rowIDToBlockID(rowID RowID) uint64 {
   395  	id, _ := catalog.DecodeRowid(types.Rowid(rowID))
   396  	return id
   397  }
   398  
   399  func (p *Partition) DeleteByBlockID(ctx context.Context, ts timestamp.Timestamp, blockID uint64) error {
   400  	tx := memtable.NewTransaction(newMemTableTransactionID(), memtable.Time{
   401  		Timestamp: ts,
   402  	}, memtable.SnapshotIsolation)
   403  	min := memtable.Tuple{
   404  		index_BlockID_Time_OP,
   405  		memtable.ToOrdered(blockID),
   406  		memtable.Min,
   407  		memtable.Uint(opInsert),
   408  	}
   409  	max := memtable.Tuple{
   410  		index_BlockID_Time_OP,
   411  		memtable.ToOrdered(blockID),
   412  		memtable.Max,
   413  		memtable.Uint(opInsert),
   414  	}
   415  	iter := p.data.NewIndexIter(tx, min, max)
   416  	defer iter.Close()
   417  	for ok := iter.First(); ok; ok = iter.Next() {
   418  		entry := iter.Item()
   419  		if err := p.data.Delete(tx, entry.Key); err != nil {
   420  			return err
   421  		}
   422  	}
   423  	return tx.Commit(tx.Time)
   424  }
   425  
   426  func (p *Partition) IterDeletedRowIDs(ctx context.Context, blockIDs []uint64, ts timestamp.Timestamp, fn func(rowID RowID) bool) {
   427  	tx := memtable.NewTransaction(newMemTableTransactionID(), memtable.Time{
   428  		Timestamp: ts,
   429  	}, memtable.SnapshotIsolation)
   430  
   431  	for _, blockID := range blockIDs {
   432  		min := memtable.Tuple{
   433  			index_BlockID_Time_OP,
   434  			memtable.ToOrdered(blockID),
   435  			memtable.Min,
   436  			memtable.Min,
   437  		}
   438  		max := memtable.Tuple{
   439  			index_BlockID_Time_OP,
   440  			memtable.ToOrdered(blockID),
   441  			types.TimestampToTS(ts),
   442  			memtable.Max,
   443  		}
   444  		iter := p.data.NewIndexIter(tx, min, max)
   445  		defer iter.Close()
   446  		deleted := make(map[RowID]bool)
   447  		inserted := make(map[RowID]bool)
   448  		for ok := iter.First(); ok; ok = iter.Next() {
   449  			entry := iter.Item()
   450  			rowID := entry.Key
   451  			switch entry.Index[3].(memtable.Uint) {
   452  			case opInsert:
   453  				inserted[rowID] = true
   454  			case opDelete:
   455  				deleted[rowID] = true
   456  			}
   457  		}
   458  		for rowID := range deleted {
   459  			if !inserted[rowID] {
   460  				if !fn(rowID) {
   461  					break
   462  				}
   463  			}
   464  		}
   465  	}
   466  }
   467  
   468  func (p *Partition) Rows(
   469  	tx *memtable.Transaction,
   470  	deletes map[types.Rowid]uint8,
   471  	skipBlocks map[uint64]uint8) (int64, error) {
   472  	var rows int64 = 0
   473  	iter := p.data.NewIter(tx)
   474  	defer iter.Close()
   475  	for ok := iter.First(); ok; ok = iter.Next() {
   476  		dataKey, dataValue, err := iter.Read()
   477  		if err != nil {
   478  			return 0, err
   479  		}
   480  
   481  		if _, ok := deletes[types.Rowid(dataKey)]; ok {
   482  			continue
   483  		}
   484  
   485  		if dataValue.op == opDelete {
   486  			continue
   487  		}
   488  
   489  		if skipBlocks != nil {
   490  			if _, ok := skipBlocks[rowIDToBlockID(dataKey)]; ok {
   491  				continue
   492  			}
   493  		}
   494  		rows++
   495  	}
   496  
   497  	return rows, nil
   498  }
   499  
   500  func (p *Partition) NewReader(
   501  	ctx context.Context,
   502  	readerNumber int,
   503  	index memtable.Tuple,
   504  	defs []engine.TableDef,
   505  	tableDef *plan.TableDef,
   506  	skipBlocks map[uint64]uint8,
   507  	blks []ModifyBlockMeta,
   508  	ts timestamp.Timestamp,
   509  	fs fileservice.FileService,
   510  	entries []Entry,
   511  ) ([]engine.Reader, error) {
   512  
   513  	t := memtable.Time{
   514  		Timestamp: ts,
   515  	}
   516  	tx := memtable.NewTransaction(
   517  		newMemTableTransactionID(),
   518  		t,
   519  		memtable.SnapshotIsolation,
   520  	)
   521  
   522  	inserts := make([]*batch.Batch, 0, len(entries))
   523  	deletes := make(map[types.Rowid]uint8)
   524  	for _, entry := range entries {
   525  		if entry.typ == INSERT {
   526  			inserts = append(inserts, entry.bat)
   527  		} else {
   528  			if entry.bat.GetVector(0).GetType().Oid == types.T_Rowid {
   529  				vs := vector.MustTCols[types.Rowid](entry.bat.GetVector(0))
   530  				for _, v := range vs {
   531  					deletes[v] = 0
   532  				}
   533  			}
   534  		}
   535  	}
   536  
   537  	readers := make([]engine.Reader, readerNumber)
   538  
   539  	mp := make(map[string]types.Type)
   540  	colIdxMp := make(map[string]int)
   541  	if tableDef != nil {
   542  		for i := range tableDef.Cols {
   543  			colIdxMp[tableDef.Cols[i].Name] = i
   544  		}
   545  	}
   546  
   547  	mp[catalog.Row_ID] = types.New(types.T_Rowid, 0, 0, 0)
   548  	for _, def := range defs {
   549  		attr, ok := def.(*engine.AttributeDef)
   550  		if !ok {
   551  			continue
   552  		}
   553  		mp[attr.Attr.Name] = attr.Attr.Type
   554  	}
   555  
   556  	partReader := &PartitionReader{
   557  		typsMap:         mp,
   558  		readTime:        t,
   559  		tx:              tx,
   560  		index:           index,
   561  		inserts:         inserts,
   562  		deletes:         deletes,
   563  		skipBlocks:      skipBlocks,
   564  		data:            p.data,
   565  		iter:            p.data.NewIter(tx),
   566  		colIdxMp:        colIdxMp,
   567  		extendId2s3File: make(map[string]int),
   568  		s3FileService:   fs,
   569  	}
   570  	if p.txn != nil {
   571  		partReader.proc = p.txn.proc
   572  	}
   573  	readers[0] = partReader
   574  	if readerNumber == 1 {
   575  		for i := range blks {
   576  			readers = append(readers, &blockMergeReader{
   577  				fs:       fs,
   578  				ts:       ts,
   579  				ctx:      ctx,
   580  				tableDef: tableDef,
   581  				sels:     make([]int64, 0, 1024),
   582  				blks:     []ModifyBlockMeta{blks[i]},
   583  			})
   584  		}
   585  		return []engine.Reader{&mergeReader{readers}}, nil
   586  	}
   587  	if len(blks) < readerNumber-1 {
   588  		for i := range blks {
   589  			readers[i+1] = &blockMergeReader{
   590  				fs:       fs,
   591  				ts:       ts,
   592  				ctx:      ctx,
   593  				tableDef: tableDef,
   594  				sels:     make([]int64, 0, 1024),
   595  				blks:     []ModifyBlockMeta{blks[i]},
   596  			}
   597  		}
   598  		for j := len(blks) + 1; j < readerNumber; j++ {
   599  			readers[j] = &emptyReader{}
   600  		}
   601  		return readers, nil
   602  	}
   603  	step := len(blks) / (readerNumber - 1)
   604  	if step < 1 {
   605  		step = 1
   606  	}
   607  	for i := 1; i < readerNumber; i++ {
   608  		if i == readerNumber-1 {
   609  			readers[i] = &blockMergeReader{
   610  				fs:       fs,
   611  				ts:       ts,
   612  				ctx:      ctx,
   613  				tableDef: tableDef,
   614  				blks:     blks[(i-1)*step:],
   615  				sels:     make([]int64, 0, 1024),
   616  			}
   617  		} else {
   618  			readers[i] = &blockMergeReader{
   619  				fs:       fs,
   620  				ts:       ts,
   621  				ctx:      ctx,
   622  				tableDef: tableDef,
   623  				blks:     blks[(i-1)*step : i*step],
   624  				sels:     make([]int64, 0, 1024),
   625  			}
   626  		}
   627  	}
   628  	return readers, nil
   629  }