// Copyright 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package colexec

import (
	"github.com/matrixorigin/matrixone/pkg/catalog"
	"github.com/matrixorigin/matrixone/pkg/common/moerr"
	"github.com/matrixorigin/matrixone/pkg/common/mpool"
	"github.com/matrixorigin/matrixone/pkg/container/batch"
	"github.com/matrixorigin/matrixone/pkg/container/nulls"
	"github.com/matrixorigin/matrixone/pkg/container/types"
	"github.com/matrixorigin/matrixone/pkg/container/vector"
	"github.com/matrixorigin/matrixone/pkg/defines"
	"github.com/matrixorigin/matrixone/pkg/fileservice"
	"github.com/matrixorigin/matrixone/pkg/logutil"
	"github.com/matrixorigin/matrixone/pkg/objectio"
	"github.com/matrixorigin/matrixone/pkg/pb/plan"
	"github.com/matrixorigin/matrixone/pkg/sort"
	"github.com/matrixorigin/matrixone/pkg/sql/util"
	db_holder "github.com/matrixorigin/matrixone/pkg/util/export/etl/db"
	"github.com/matrixorigin/matrixone/pkg/vm"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/blockio"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/options"
	"github.com/matrixorigin/matrixone/pkg/vm/process"
	"go.uber.org/zap"
)
// S3Writer is used to write table data to S3, packaging a series of
// `BlockWriter` write operations.
// Currently there are two scenarios in which CN writes to S3 directly:
// scenario 1: the insert operator goes straight to S3, triggered when a
// one-time insert/load carries a relatively large data volume;
// scenario 2: txn.workspace exceeds its threshold, in which case the
// txn.dumpBatch function triggers a write to S3.
type S3Writer struct {
	sortIndex      int // When writing table data, if the table has a sort key, the data must be sorted before it is written to S3
	pk             int
	partitionIndex int16 // This value is aligned with the partition number
	isClusterBy    bool

	schemaVersion uint32
	seqnums       []uint16
	tablename     string
	attrs         []string

	writer  *blockio.BlockWriter
	lengths []uint64

	// the third vector only has a few rows and does not align with the other two vectors.
	blockInfoBat *batch.Batch

	// an intermediate cache for the merge-sorted data of all `Bats`
	buffer *batch.Batch

	// for memory reuse.
	tableBatchBuffers []*batch.Batch

	// Bats stores the cached batches of the table.
	// Each batch in Bats is sorted internally, and all batches belong to a single table;
	// when their total size exceeds WriteS3Threshold, we merge-sort them and write a segment to S3.
	Bats []*batch.Batch

	// batSize records the total size of the batches in Bats
	batSize uint64

	typs []types.Type
	ufs  []func(*vector.Vector, *vector.Vector, int64) error // function pointers for type conversion
}

const (
	// WriteS3Threshold: when the batches' size for a table reaches this, we
	// trigger a write to S3
	WriteS3Threshold uint64 = 128 * mpool.MB

	TagS3Size            uint64 = 10 * mpool.MB
	TagS3SizeForMOLogger uint64 = 1 * mpool.MB
)
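
// s3FlushStage is an illustrative helper, not part of the original file: it
// summarizes how the constants above gate the write path. Put reports that a
// flush is due once the cached batches exceed WriteS3Threshold, while
// WriteS3CacheBatch uses the much smaller TagS3SizeForMOLogger bound to decide
// between flushing early and forwarding the still-in-memory batches.
func s3FlushStage(cachedSize uint64) string {
	switch {
	case cachedSize > WriteS3Threshold:
		return "merge sort and write a segment to s3"
	case cachedSize >= TagS3SizeForMOLogger:
		return "flush early in WriteS3CacheBatch"
	default:
		return "keep caching in memory"
	}
}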

func (w *S3Writer) Free(proc *process.Process) {
	if w.blockInfoBat != nil {
		w.blockInfoBat.Clean(proc.Mp())
		w.blockInfoBat = nil
	}
	if w.buffer != nil {
		w.buffer.Clean(proc.Mp())
		w.buffer = nil
	}
	for _, bat := range w.tableBatchBuffers {
		bat.Clean(proc.Mp())
	}
	w.tableBatchBuffers = nil
	for _, bat := range w.Bats {
		bat.Clean(proc.Mp())
	}
	w.Bats = nil
}

func (w *S3Writer) GetBlockInfoBat() *batch.Batch {
	return w.blockInfoBat
}

func (w *S3Writer) SetSortIdx(sortIdx int) {
	w.sortIndex = sortIdx
}

func (w *S3Writer) SetSchemaVer(ver uint32) {
	w.schemaVersion = ver
}

func (w *S3Writer) SetTableName(name string) {
	w.tablename = name
}

func (w *S3Writer) SetSeqnums(seqnums []uint16) {
	w.seqnums = seqnums
	logutil.Debugf("s3 table set directly %q seqnums: %+v", w.tablename, w.seqnums)
}

func AllocS3Writer(proc *process.Process, tableDef *plan.TableDef) (*S3Writer, error) {
	writer := &S3Writer{
		tablename:      tableDef.GetName(),
		seqnums:        make([]uint16, 0, len(tableDef.Cols)),
		schemaVersion:  tableDef.Version,
		sortIndex:      -1,
		pk:             -1,
		partitionIndex: 0,
	}

	writer.ResetBlockInfoBat(proc)
	for i, colDef := range tableDef.Cols {
		if colDef.Name != catalog.Row_ID {
			writer.seqnums = append(writer.seqnums, uint16(colDef.Seqnum))
		} else {
			// check that rowid is the last column
			if i != len(tableDef.Cols)-1 {
				logutil.Errorf("bad rowid position for %q, %+v", writer.tablename, colDef)
			}
		}
	}
	logutil.Debugf("s3 table set from AllocS3Writer %q seqnums: %+v", writer.tablename, writer.seqnums)

	// get the single-column pk index
	for idx, colDef := range tableDef.Cols {
		if colDef.Name == tableDef.Pkey.PkeyColName && colDef.Name != catalog.FakePrimaryKeyColName {
			writer.sortIndex = idx
			writer.pk = idx
			break
		}
	}

	if tableDef.ClusterBy != nil {
		writer.isClusterBy = true

		// the `rowId` column has been excluded from the target table's `TableDef` for insert statements (insert, load),
		// link: `/pkg/sql/plan/build_constraint_util.go` -> func setTableExprToDmlTableInfo
		// so the `sortIndex` position can be obtained directly by matching the name of the sort key
		for idx, colDef := range tableDef.Cols {
			if colDef.Name == tableDef.ClusterBy.Name {
				writer.sortIndex = idx
			}
		}
	}

	return writer, nil
}

// AllocPartitionS3Writer allocates one S3 writer per partition of a partitioned table.
func AllocPartitionS3Writer(proc *process.Process, tableDef *plan.TableDef) ([]*S3Writer, error) {
	partitionNum := len(tableDef.Partition.PartitionTableNames)
	writers := make([]*S3Writer, partitionNum)
	for i := range writers {
		writers[i] = &S3Writer{
			tablename:      tableDef.GetName(),
			seqnums:        make([]uint16, 0, len(tableDef.Cols)),
			schemaVersion:  tableDef.Version,
			sortIndex:      -1,
			pk:             -1,
			partitionIndex: int16(i), // This value is aligned with the partition number
		}

		writers[i].ResetBlockInfoBat(proc)
		for j, colDef := range tableDef.Cols {
			if colDef.Name != catalog.Row_ID {
				writers[i].seqnums = append(writers[i].seqnums, uint16(colDef.Seqnum))
			} else {
				// check that rowid is the last column
				if j != len(tableDef.Cols)-1 {
					logutil.Errorf("bad rowid position for %q, %+v", writers[i].tablename, colDef)
				}
			}
		}
		logutil.Debugf("s3 table set from AllocS3WriterP%d %q seqnums: %+v", i, writers[i].tablename, writers[i].seqnums)

		// get the single-column pk index
		for idx, colDef := range tableDef.Cols {
			if colDef.Name == tableDef.Pkey.PkeyColName {
				if colDef.Name != catalog.FakePrimaryKeyColName {
					writers[i].sortIndex = idx
					writers[i].pk = idx
				}
				break
			}
		}

		if tableDef.ClusterBy != nil {
			writers[i].isClusterBy = true
			if util.JudgeIsCompositeClusterByColumn(tableDef.ClusterBy.Name) {
				// the serialized cluster-by column is located at the end of bat.Vecs
				writers[i].sortIndex = len(tableDef.Cols) - 1
			} else {
				for idx, colDef := range tableDef.Cols {
					if colDef.Name == tableDef.ClusterBy.Name {
						writers[i].sortIndex = idx
					}
				}
			}
		}
	}
	return writers, nil
}

func (w *S3Writer) ResetBlockInfoBat(proc *process.Process) {
	// A simple explanation of the vectors held by blockInfoBat:
	// vecs[0] marks which table this metaLoc belongs to: [0] means the insert table itself, [1] means the first uniqueIndex table, [2] means the second uniqueIndex table, and so on
	// vecs[1] stores the relative block metadata
	// vecs[2] stores the marshaled object stats
	if w.blockInfoBat != nil {
		proc.PutBatch(w.blockInfoBat)
	}
	attrs := []string{catalog.BlockMeta_TableIdx_Insert, catalog.BlockMeta_BlockInfo, catalog.ObjectMeta_ObjectStats}
	blockInfoBat := batch.NewWithSize(len(attrs))
	blockInfoBat.Attrs = attrs
	blockInfoBat.Vecs[0] = proc.GetVector(types.T_int16.ToType())
	blockInfoBat.Vecs[1] = proc.GetVector(types.T_text.ToType())
	blockInfoBat.Vecs[2] = proc.GetVector(types.T_binary.ToType())

	w.blockInfoBat = blockInfoBat
}
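
// readBlockInfoRow is an illustrative sketch, not part of the original file:
// it shows how a consumer could read back one row of the batch built above,
// assuming the vector's GetBytesAt accessor for the varlena read. Vecs[0]
// holds the table/partition marker and Vecs[1] the payload, either an encoded
// objectio.BlockInfo (marker >= 0) or a marshaled in-memory batch (marker < 0,
// see WriteS3CacheBatch below).
func readBlockInfoRow(blockInfoBat *batch.Batch, row int) (int16, []byte) {
	idx := vector.MustFixedCol[int16](blockInfoBat.Vecs[0])[row]
	payload := blockInfoBat.Vecs[1].GetBytesAt(row)
	return idx, payload
}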
func (w *S3Writer) Output(proc *process.Process, result *vm.CallResult) error {
	bat := batch.NewWithSize(len(w.blockInfoBat.Attrs))
	bat.SetAttributes(w.blockInfoBat.Attrs)

	for i := range w.blockInfoBat.Attrs {
		vec := proc.GetVector(*w.blockInfoBat.Vecs[i].GetType())
		if err := vec.UnionBatch(w.blockInfoBat.Vecs[i], 0, w.blockInfoBat.Vecs[i].Length(), nil, proc.GetMPool()); err != nil {
			vec.Free(proc.Mp())
			return err
		}
		bat.SetVector(int32(i), vec)
	}
	bat.SetRowCount(w.blockInfoBat.RowCount())
	w.ResetBlockInfoBat(proc)
	result.Batch = bat
	return nil
}

func (w *S3Writer) WriteS3CacheBatch(proc *process.Process) error {
	s3SizeThreshold := TagS3SizeForMOLogger

	if proc != nil && proc.Ctx != nil {
		isMoLogger, ok := proc.Ctx.Value(defines.IsMoLogger{}).(bool)
		if ok && isMoLogger {
			logutil.Debug("WriteS3CacheBatch proc", zap.Bool("isMoLogger", isMoLogger))
			s3SizeThreshold = TagS3SizeForMOLogger
		}
	}

	if proc != nil && proc.GetSessionInfo() != nil && proc.GetSessionInfo().GetUser() == db_holder.MOLoggerUser {
		logutil.Debug("WriteS3CacheBatch", zap.String("user", proc.GetSessionInfo().GetUser()))
		s3SizeThreshold = TagS3SizeForMOLogger
	}
	if w.batSize >= s3SizeThreshold {
		if err := w.SortAndFlush(proc); err != nil {
			return err
		}
		w.blockInfoBat.SetRowCount(w.blockInfoBat.Vecs[0].Length())
		return nil
	}
	for _, bat := range w.Bats {
		if err := vector.AppendFixed(
			w.blockInfoBat.Vecs[0], -w.partitionIndex-1,
			false, proc.GetMPool()); err != nil {
			return err
		}
		bytes, err := bat.MarshalBinary()
		if err != nil {
			return err
		}
		if err = vector.AppendBytes(
			w.blockInfoBat.Vecs[1], bytes,
			false, proc.GetMPool()); err != nil {
			return err
		}
	}
	w.blockInfoBat.SetRowCount(w.blockInfoBat.Vecs[0].Length())
	return nil
}
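
// cachePartitionMarker and decodeCachePartitionMarker are illustrative
// sketches, not part of the original file: WriteS3CacheBatch stores
// -(partitionIndex+1) in Vecs[0], so a negative marker identifies a marshaled
// in-memory batch, while writeEndBlocks stores the plain partitionIndex for
// block metadata that has already been flushed to s3.
func cachePartitionMarker(partitionIndex int16) int16 {
	return -partitionIndex - 1
}

func decodeCachePartitionMarker(marker int16) (partitionIndex int16, inMemory bool) {
	if marker < 0 {
		return -marker - 1, true
	}
	return marker, false
}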

func (w *S3Writer) InitBuffers(proc *process.Process, bat *batch.Batch) {
	if w.buffer == nil {
		w.buffer = getNewBatch(proc, bat)
	}
}

// Put puts the batch into w.Bats and makes sure that each batch in w.Bats
// contains options.DefaultBlockMaxRows rows, except for the last one.
// It returns:
//	true: the cached batches' size is over WriteS3Threshold
//	false: the cached batches' size is less than or equal to WriteS3Threshold
func (w *S3Writer) Put(bat *batch.Batch, proc *process.Process) bool {
	var rbat *batch.Batch

	if len(w.typs) == 0 {
		for i := 0; i < bat.VectorCount(); i++ {
			typ := *bat.GetVector(int32(i)).GetType()
			w.typs = append(w.typs, typ)
			w.ufs = append(w.ufs, vector.GetUnionOneFunction(typ, proc.Mp()))
		}
	}
	res := false
	start, end := 0, bat.RowCount()
	for start < end {
		n := len(w.Bats)
		if n == 0 || w.Bats[n-1].RowCount() >= int(options.DefaultBlockMaxRows) {
			if len(w.tableBatchBuffers) > 0 {
				rbat = w.tableBatchBuffers[0]
				w.tableBatchBuffers = w.tableBatchBuffers[1:]
				rbat.CleanOnlyData()
			} else {
				rbat = batch.NewWithSize(bat.VectorCount())
				rbat.SetAttributes(bat.Attrs)
				for i := range w.typs {
					rbat.Vecs[i] = proc.GetVector(w.typs[i])
				}
			}
			w.Bats = append(w.Bats, rbat)
		} else {
			rbat = w.Bats[n-1]
		}
		rows := end - start
		if left := int(options.DefaultBlockMaxRows) - rbat.RowCount(); rows > left {
			rows = left
		}

		var err error
		for i := 0; i < bat.VectorCount(); i++ {
			vec := rbat.GetVector(int32(i))
			srcVec := bat.GetVector(int32(i))
			for j := 0; j < rows; j++ {
				if err = w.ufs[i](vec, srcVec, int64(j+start)); err != nil {
					panic(err)
				}
			}
		}
		rbat.AddRowCount(rows)
		start += rows
		if w.batSize = w.batSize + uint64(rbat.Size()); w.batSize > WriteS3Threshold {
			res = true
		}
	}
	return res
}
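
// putSpan is an illustrative sketch, not part of the original file: it
// mirrors the chunking arithmetic of Put above, returning how many cached
// batches a put of n rows touches when the current tail batch already holds
// tailRows rows (0 <= tailRows <= options.DefaultBlockMaxRows).
func putSpan(n, tailRows int) int {
	blockMax := int(options.DefaultBlockMaxRows)
	if n <= 0 {
		return 0
	}
	free := blockMax - tailRows
	if free <= 0 {
		// the tail batch is full; every row goes into fresh batches
		return (n + blockMax - 1) / blockMax
	}
	if n <= free {
		return 1
	}
	// the tail batch plus however many fresh batches the remainder needs
	return 1 + (n-free+blockMax-1)/blockMax
}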

func getFixedCols[T types.FixedSizeT](bats []*batch.Batch, idx int) (cols [][]T) {
	cols = make([][]T, 0, len(bats))
	for i := range bats {
		cols = append(cols, vector.MustFixedCol[T](bats[i].Vecs[idx]))
	}
	return
}

func getStrCols(bats []*batch.Batch, idx int) (cols [][]string) {
	cols = make([][]string, 0, len(bats))
	for i := range bats {
		cols = append(cols, vector.MustStrCol(bats[i].Vecs[idx]))
	}
	return
}

func (w *S3Writer) SortAndFlush(proc *process.Process) error {
	sortIdx := -1
	for i := range w.Bats {
		// sort the batches first,
		// for the main/origin table and unique index tables.
		if w.sortIndex != -1 {
			err := sortByKey(proc, w.Bats[i], w.sortIndex, w.isClusterBy, proc.GetMPool())
			if err != nil {
				return err
			}
			sortIdx = w.sortIndex
		}
	}
	// no sort key: just write ahead, no need to merge
	if sortIdx == -1 {
		if _, err := w.generateWriter(proc); err != nil {
			return err
		}

		for i := range w.Bats {
			if err := w.WriteBlock(w.Bats[i]); err != nil {
				return err
			}
		}
		if err := w.writeEndBlocks(proc); err != nil {
			return err
		}
	} else {
		var merge MergeInterface
		var batchNulls []*nulls.Nulls
		for i := 0; i < len(w.Bats); i++ {
			batchNulls = append(batchNulls, w.Bats[i].Vecs[w.sortIndex].GetNulls())
		}
		pos := w.sortIndex
		switch w.Bats[0].Vecs[sortIdx].GetType().Oid {
		case types.T_bool:
			merge = newMerge(len(w.Bats), sort.BoolLess, getFixedCols[bool](w.Bats, pos), batchNulls)
		case types.T_bit:
			merge = newMerge(len(w.Bats), sort.GenericLess[uint64], getFixedCols[uint64](w.Bats, pos), batchNulls)
		case types.T_int8:
			merge = newMerge(len(w.Bats), sort.GenericLess[int8], getFixedCols[int8](w.Bats, pos), batchNulls)
		case types.T_int16:
			merge = newMerge(len(w.Bats), sort.GenericLess[int16], getFixedCols[int16](w.Bats, pos), batchNulls)
		case types.T_int32:
			merge = newMerge(len(w.Bats), sort.GenericLess[int32], getFixedCols[int32](w.Bats, pos), batchNulls)
		case types.T_int64:
			merge = newMerge(len(w.Bats), sort.GenericLess[int64], getFixedCols[int64](w.Bats, pos), batchNulls)
		case types.T_uint8:
			merge = newMerge(len(w.Bats), sort.GenericLess[uint8], getFixedCols[uint8](w.Bats, pos), batchNulls)
		case types.T_uint16:
			merge = newMerge(len(w.Bats), sort.GenericLess[uint16], getFixedCols[uint16](w.Bats, pos), batchNulls)
		case types.T_uint32:
			merge = newMerge(len(w.Bats), sort.GenericLess[uint32], getFixedCols[uint32](w.Bats, pos), batchNulls)
		case types.T_uint64:
			merge = newMerge(len(w.Bats), sort.GenericLess[uint64], getFixedCols[uint64](w.Bats, pos), batchNulls)
		case types.T_float32:
			merge = newMerge(len(w.Bats), sort.GenericLess[float32], getFixedCols[float32](w.Bats, pos), batchNulls)
		case types.T_float64:
			merge = newMerge(len(w.Bats), sort.GenericLess[float64], getFixedCols[float64](w.Bats, pos), batchNulls)
		case types.T_date:
			merge = newMerge(len(w.Bats), sort.GenericLess[types.Date], getFixedCols[types.Date](w.Bats, pos), batchNulls)
		case types.T_datetime:
			merge = newMerge(len(w.Bats), sort.GenericLess[types.Datetime], getFixedCols[types.Datetime](w.Bats, pos), batchNulls)
		case types.T_time:
			merge = newMerge(len(w.Bats), sort.GenericLess[types.Time], getFixedCols[types.Time](w.Bats, pos), batchNulls)
		case types.T_timestamp:
			merge = newMerge(len(w.Bats), sort.GenericLess[types.Timestamp], getFixedCols[types.Timestamp](w.Bats, pos), batchNulls)
		case types.T_enum:
			merge = newMerge(len(w.Bats), sort.GenericLess[types.Enum], getFixedCols[types.Enum](w.Bats, pos), batchNulls)
		case types.T_decimal64:
			merge = newMerge(len(w.Bats), sort.Decimal64Less, getFixedCols[types.Decimal64](w.Bats, pos), batchNulls)
		case types.T_decimal128:
			merge = newMerge(len(w.Bats), sort.Decimal128Less, getFixedCols[types.Decimal128](w.Bats, pos), batchNulls)
		case types.T_uuid:
			merge = newMerge(len(w.Bats), sort.UuidLess, getFixedCols[types.Uuid](w.Bats, pos), batchNulls)
		case types.T_char, types.T_varchar, types.T_blob, types.T_text:
			merge = newMerge(len(w.Bats), sort.GenericLess[string], getStrCols(w.Bats, pos), batchNulls)
			// TODO: check if we need T_array here? T_json is missing here.
			// Update Oct 20 2023: I don't think it is necessary to add T_array here. Keeping this comment,
			// in case anything fails in vector S3 flush in future.
		}
		if _, err := w.generateWriter(proc); err != nil {
			return err
		}
		lens := 0
		size := len(w.Bats)
		w.buffer.CleanOnlyData()
		var batchIndex int
		var rowIndex int
		for size > 0 {
			batchIndex, rowIndex, size = merge.getNextPos()
			for i := range w.buffer.Vecs {
				err := w.buffer.Vecs[i].UnionOne(w.Bats[batchIndex].Vecs[i], int64(rowIndex), proc.GetMPool())
				if err != nil {
					return err
				}
			}
			lens++
			if lens == int(options.DefaultBlockMaxRows) {
				lens = 0
				if err := w.WriteBlock(w.buffer); err != nil {
					return err
				}
				// force clean
				w.buffer.CleanOnlyData()
			}
		}
		if lens > 0 {
			if err := w.WriteBlock(w.buffer); err != nil {
				return err
			}
			w.buffer.CleanOnlyData()
		}
		if err := w.writeEndBlocks(proc); err != nil {
			return err
		}
		// force clean
		w.buffer.CleanOnlyData()
	}
	for i := 0; i < len(w.Bats); i++ {
		// recycle the batch
		w.putBatch(w.Bats[i])
		w.batSize -= uint64(w.Bats[i].Size())
	}
	w.Bats = w.Bats[:0]
	return nil
}
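
// mergeTwoSortedInt64 is a minimal illustration, not the MergeInterface
// implementation used above: SortAndFlush relies on a k-way version of
// exactly this loop, repeatedly emitting the smallest head element among the
// sorted inputs.
func mergeTwoSortedInt64(a, b []int64) []int64 {
	out := make([]int64, 0, len(a)+len(b))
	i, j := 0, 0
	for i < len(a) && j < len(b) {
		if a[i] <= b[j] {
			out = append(out, a[i])
			i++
		} else {
			out = append(out, b[j])
			j++
		}
	}
	out = append(out, a[i:]...)
	return append(out, b[j:]...)
}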

// WriteS3Batch logic:
// S3Writer caches the batches in memory, and once their size passes the tag
// threshold (TagS3Size) we mark that this data will need to be written to
// s3, but not immediately. We continue to wait until no more data arrives or
// the size reaches WriteS3Threshold; at that point we trigger the s3 write.
func (w *S3Writer) WriteS3Batch(proc *process.Process, bat *batch.Batch) error {
	w.InitBuffers(proc, bat)
	if w.Put(bat, proc) {
		return w.SortAndFlush(proc)
	}
	return nil
}
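
// exampleWriteFlow is an illustrative sketch, not part of the original file:
// it shows how an operator could drive the writer, caching incoming batches
// until the size threshold triggers a sorted flush, then draining whatever is
// still cached at the end.
func exampleWriteFlow(proc *process.Process, tableDef *plan.TableDef, bats []*batch.Batch) error {
	writer, err := AllocS3Writer(proc, tableDef)
	if err != nil {
		return err
	}
	defer writer.Free(proc)
	for _, bat := range bats {
		if err := writer.WriteS3Batch(proc, bat); err != nil {
			return err
		}
	}
	// flush the remainder that never crossed WriteS3Threshold
	if len(writer.Bats) > 0 {
		return writer.SortAndFlush(proc)
	}
	return nil
}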

func (w *S3Writer) putBatch(bat *batch.Batch) {
	w.tableBatchBuffers = append(w.tableBatchBuffers, bat)
}

func getNewBatch(proc *process.Process, bat *batch.Batch) *batch.Batch {
	newBat := batch.NewWithSize(bat.VectorCount())
	newBat.SetAttributes(bat.Attrs)
	for i := range bat.Vecs {
		newBat.Vecs[i] = proc.GetVector(*bat.Vecs[i].GetType())
	}
	return newBat
}

func (w *S3Writer) GenerateWriter(proc *process.Process) (objectio.ObjectName, error) {
	return w.generateWriter(proc)
}

func (w *S3Writer) generateWriter(proc *process.Process) (objectio.ObjectName, error) {
	// use a uuid as the segment id
	// TODO: multiple 64MB files in one segment
	obj := Get().GenerateObject()
	s3, err := fileservice.Get[fileservice.FileService](proc.FileService, defines.SharedFileServiceName)
	if err != nil {
		return nil, err
	}
	w.writer, err = blockio.NewBlockWriterNew(s3, obj, w.schemaVersion, w.seqnums)
	if err != nil {
		return nil, err
	}
	w.lengths = w.lengths[:0]
	return obj, err
}

// refer to the logic in pkg/sql/colexec/order/order.go
func sortByKey(proc *process.Process, bat *batch.Batch, sortIndex int, allowNull bool, m *mpool.MPool) error {
	hasNull := false
	// not-null check; note that cluster by supports null values
	if nulls.Any(bat.Vecs[sortIndex].GetNulls()) {
		hasNull = true
		if !allowNull {
			return moerr.NewConstraintViolation(proc.Ctx,
				"sort key can not be null, sortIndex = %d, sortCol = %s",
				sortIndex, bat.Attrs[sortIndex])
		}
	}
	var strCol []string
	rowCount := bat.RowCount()
	sels := make([]int64, rowCount)
	for i := 0; i < rowCount; i++ {
		sels[i] = int64(i)
	}
	ovec := bat.GetVector(int32(sortIndex))
	if ovec.GetType().IsVarlen() {
		strCol = vector.MustStrCol(ovec)
	} else {
		strCol = nil
	}
	if allowNull {
		// nulls last
		sort.Sort(false, true, hasNull, sels, ovec, strCol)
	} else {
		sort.Sort(false, false, hasNull, sels, ovec, strCol)
	}
	return bat.Shuffle(sels, m)
}
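
// exampleSelsSort is an illustrative sketch, not part of the original file:
// sortByKey orders a selection vector (sels) by the key column and then
// permutes the whole batch once via bat.Shuffle; the same pattern on a plain
// slice looks like this (an insertion sort keeps the sketch dependency-free).
func exampleSelsSort(vals []int64) []int64 {
	sels := make([]int64, len(vals))
	for i := range sels {
		sels[i] = int64(i)
	}
	// order the selection vector by the key
	for i := 1; i < len(sels); i++ {
		for j := i; j > 0 && vals[sels[j]] < vals[sels[j-1]]; j-- {
			sels[j], sels[j-1] = sels[j-1], sels[j]
		}
	}
	// apply the permutation, the analogue of bat.Shuffle(sels, m)
	out := make([]int64, len(vals))
	for i, s := range sels {
		out[i] = vals[s]
	}
	return out
}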

func (w *S3Writer) WriteBlock(bat *batch.Batch, dataType ...objectio.DataMetaType) error {
	if w.pk > -1 {
		pkIdx := uint16(w.pk)
		w.writer.SetPrimaryKey(pkIdx)
	}
	if w.sortIndex > -1 {
		w.writer.SetSortKey(uint16(w.sortIndex))
	}
	if w.attrs == nil {
		w.attrs = bat.Attrs
	}
	if len(w.seqnums) != len(bat.Vecs) {
		// just a warning, because writing a delete s3 file does not need seqnums;
		// print the attrs to tell whether it is a delete batch
		logutil.Warnf("CN write s3 table %q: seqnums length does not match, seqnums: %v, attrs: %v",
			w.tablename, w.seqnums, bat.Attrs)
	}
	if len(dataType) > 0 && dataType[0] == objectio.SchemaTombstone {
		_, err := w.writer.WriteTombstoneBatch(bat)
		if err != nil {
			return err
		}
	} else {
		_, err := w.writer.WriteBatch(bat)
		if err != nil {
			return err
		}
	}
	w.lengths = append(w.lengths, uint64(bat.Vecs[0].Length()))
	return nil
}

func (w *S3Writer) writeEndBlocks(proc *process.Process) error {
	blkInfos, stats, err := w.WriteEndBlocks(proc)
	if err != nil {
		return err
	}
	for _, blkInfo := range blkInfos {
		if err := vector.AppendFixed(
			w.blockInfoBat.Vecs[0],
			w.partitionIndex,
			false,
			proc.GetMPool()); err != nil {
			return err
		}
		if err := vector.AppendBytes(
			w.blockInfoBat.Vecs[1],
			objectio.EncodeBlockInfo(blkInfo),
			false,
			proc.GetMPool()); err != nil {
			return err
		}
	}

	// append the object stats to the bat;
	// at most one will be appended
	for idx := 0; idx < len(stats); idx++ {
		if stats[idx].IsZero() {
			continue
		}

		if err = vector.AppendBytes(w.blockInfoBat.Vecs[2],
			stats[idx].Marshal(), false, proc.GetMPool()); err != nil {
			return err
		}
	}

	w.blockInfoBat.SetRowCount(w.blockInfoBat.Vecs[0].Length())
	return nil
}

// WriteEndBlocks writes the batches in the buffer to the fileservice (i.e. s3 in this feature),
// gets the metadata of the blocks on the fileservice, and puts it into blockInfoBat.
// For more information, please refer to the comment on func WriteEnd in the Writer interface.
func (w *S3Writer) WriteEndBlocks(proc *process.Process) ([]objectio.BlockInfo, []objectio.ObjectStats, error) {
	blocks, _, err := w.writer.Sync(proc.Ctx)
	logutil.Debugf("write s3 table %q: %v, %v", w.tablename, w.seqnums, w.attrs)
	if err != nil {
		return nil, nil, err
	}
	blkInfos := make([]objectio.BlockInfo, 0, len(blocks))
	// TODO: block id, segment id and location should be obtained from BlockObject.
	for j := range blocks {
		location := blockio.EncodeLocation(
			w.writer.GetName(),
			blocks[j].GetExtent(),
			uint32(w.lengths[j]),
			blocks[j].GetID(),
		)

		sid := location.Name().SegmentId()
		blkInfo := objectio.BlockInfo{
			BlockID: *objectio.NewBlockid(
				&sid,
				location.Name().Num(),
				location.ID()),
			SegmentID: sid,
			// non-appendable block
			EntryState: false,
		}
		blkInfo.SetMetaLocation(location)
		if w.sortIndex != -1 {
			blkInfo.Sorted = true
		}
		blkInfos = append(blkInfos, blkInfo)
	}
	return blkInfos, w.writer.GetObjectStats(), err
}
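
// exampleCollectLocations is an illustrative sketch, not part of the original
// file: it shows what a caller gets back from WriteEndBlocks, one
// objectio.BlockInfo per written block, each carrying the meta location set
// above (assuming BlockInfo's MetaLocation getter mirrors the SetMetaLocation
// call used in WriteEndBlocks).
func exampleCollectLocations(blkInfos []objectio.BlockInfo) []objectio.Location {
	locs := make([]objectio.Location, 0, len(blkInfos))
	for i := range blkInfos {
		locs = append(locs, blkInfos[i].MetaLocation())
	}
	return locs
}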