github.com/matrixorigin/matrixone@v0.7.0/pkg/sql/colexec/s3util.go

// Copyright 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package colexec

import (
	"github.com/matrixorigin/matrixone/pkg/catalog"
	"github.com/matrixorigin/matrixone/pkg/common/moerr"
	"github.com/matrixorigin/matrixone/pkg/common/mpool"
	"github.com/matrixorigin/matrixone/pkg/container/batch"
	"github.com/matrixorigin/matrixone/pkg/container/nulls"
	"github.com/matrixorigin/matrixone/pkg/container/types"
	"github.com/matrixorigin/matrixone/pkg/container/vector"
	"github.com/matrixorigin/matrixone/pkg/defines"
	"github.com/matrixorigin/matrixone/pkg/fileservice"
	"github.com/matrixorigin/matrixone/pkg/objectio"
	"github.com/matrixorigin/matrixone/pkg/partition"
	"github.com/matrixorigin/matrixone/pkg/pb/plan"
	"github.com/matrixorigin/matrixone/pkg/sort"
	"github.com/matrixorigin/matrixone/pkg/sql/util"
	"github.com/matrixorigin/matrixone/pkg/vm/engine"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/common"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/containers"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/dataio/blockio"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/options"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/tables/indexwrapper"
	"github.com/matrixorigin/matrixone/pkg/vm/process"
)

type WriteS3Container struct {
	// sortIndex holds the column indexes of the sort key: the primary-key
	// column(s), the serialized composite key, or the cluster-by column.
	sortIndex         []int
	nameToNullability map[string]bool
	pk                map[string]bool

	writer  objectio.Writer
	lengths []uint64
	// cacheBat buffers, per table, the tail of rows (fewer than
	// DefaultBlockMaxRows) until enough rows arrive or the input ends.
	cacheBat []*batch.Batch

	UniqueRels []engine.Relation

	metaLocBat *batch.Batch
}
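
// A sketch of the intended call sequence, inferred from the methods below
// (illustrative, not a verbatim excerpt from the calling operator):
//
//	container := NewWriteS3Container(tableDef)
//	// for every input batch: split it into full blocks and write them to s3
//	container.WriteS3Batch(bat, proc, 0)
//	// at end of input: flush the cached remainder of every table
//	container.WriteS3CacheBatch(proc) // calls WriteEnd to emit metaLocBat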

func NewWriteS3Container(tableDef *plan.TableDef) *WriteS3Container {
	container := &WriteS3Container{
		sortIndex:         make([]int, 0, 1),
		pk:                make(map[string]bool),
		nameToNullability: make(map[string]bool),
	}

	// get pk indexes: the component columns of a composite primary key,
	// or the single primary-key column
	if tableDef.CompositePkey != nil {
		names := util.SplitCompositePrimaryKeyColumnName(tableDef.CompositePkey.Name)
		for num, colDef := range tableDef.Cols {
			for _, name := range names {
				if colDef.Name == name {
					container.sortIndex = append(container.sortIndex, num)
				}
			}
		}
	} else {
		// get the single-column pk index
		for num, colDef := range tableDef.Cols {
			if colDef.Primary {
				container.sortIndex = append(container.sortIndex, num)
				break
			}
		}
	}

	// get the index of the serialized sort key
	if tableDef.CompositePkey != nil {
		// the serialized cpk col is appended as the last of bat.Vecs
		container.sortIndex = append(container.sortIndex, len(tableDef.Cols))
	} else if tableDef.ClusterBy != nil {
		// the single-column pk index was already collected above, so only the
		// cluster-by key still needs to be resolved here
		if util.JudgeIsCompositeClusterByColumn(tableDef.ClusterBy.Name) {
			// the serialized cluster-by col is appended as the last of bat.Vecs
			container.sortIndex = append(container.sortIndex, len(tableDef.Cols))
		} else {
			for num, colDef := range tableDef.Cols {
				if colDef.Name == tableDef.ClusterBy.Name {
					container.sortIndex = append(container.sortIndex, num)
				}
			}
		}
	}

	// record the nullability and primary-key flag of every column by name
	for _, def := range tableDef.Cols {
		container.nameToNullability[def.Name] = def.Default.NullAbility
		if def.Primary {
			container.pk[def.Name] = true
		}
	}
	if tableDef.CompositePkey != nil {
		def := tableDef.CompositePkey
		container.nameToNullability[def.Name] = def.Default.NullAbility
		container.pk[def.Name] = true
	}

	//if tableDef.Indexes != nil {
	//	for _, indexdef := range tableDef.Indexes {
	//		if indexdef.Unique {
	//			for j := range indexdef.Field.Cols {
	//				coldef := indexdef.Field.Cols[j]
	//				container.nameToNullability[coldef.Name] = coldef.Default.NullAbility
	//			}
	//		} else {
	//			continue
	//		}
	//	}
	//}

	if tableDef.ClusterBy != nil {
		container.nameToNullability[tableDef.ClusterBy.Name] = true
	}
	container.resetMetaLocBat()

	return container
}
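
// For example (illustrative): given CREATE TABLE t (a int, b int, c int,
// primary key (a, b)), the serialized composite-pk vector travels as the last
// vector of each batch, so sortIndex resolves to [0, 1, 3]; for a
// single-column primary key on a it resolves to [0].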

func (container *WriteS3Container) resetMetaLocBat() {
	// A simple explanation of the two vectors held by metaLocBat:
	// vecs[0] marks which table each metaLoc belongs to: 0 means the insert
	// table itself, 1 means the first unique-index table, 2 means the second
	// unique-index table, and so on.
	// vecs[1] stores the corresponding block metadata (metaLoc).
	attrs := []string{catalog.BlockMeta_TableIdx_Insert, catalog.BlockMeta_MetaLoc}
	metaLocBat := batch.New(true, attrs)
	metaLocBat.Vecs[0] = vector.New(types.Type{Oid: types.T_uint16})
	metaLocBat.Vecs[1] = vector.New(types.New(types.T_varchar,
		types.MaxVarcharLen, 0, 0))

	container.metaLocBat = metaLocBat
}
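
// Example (illustrative): after flushing two blocks of the insert table and
// one block of the first unique-index table, metaLocBat conceptually holds
//
//	vecs[0] (uint16):  0, 0, 1
//	vecs[1] (varchar): <metaLoc of block 0>, <metaLoc of block 1>, <metaLoc of index block 0>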

func (container *WriteS3Container) WriteEnd(proc *process.Process) {
	if container.metaLocBat.Vecs[0].Length() > 0 {
		container.metaLocBat.SetZs(container.metaLocBat.Vecs[0].Length(), proc.GetMPool())
		proc.SetInputBatch(container.metaLocBat)
		container.resetMetaLocBat()
	}
}
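
// Note: SetZs gives metaLocBat a row count so that downstream operators treat
// it as an ordinary batch, and SetInputBatch hands it to the next operator in
// the pipeline; a fresh, empty metaLocBat then replaces the delivered one.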

func (container *WriteS3Container) WriteS3CacheBatch(proc *process.Process) error {
	if len(container.cacheBat) > 0 {
		for i, bat := range container.cacheBat {
			if bat != nil {
				err := GetBlockMeta([]*batch.Batch{bat}, container, proc, i)
				if err != nil {
					return err
				}
			}
		}
		container.WriteEnd(proc)
	}
	return nil
}
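
// cacheBat is indexed by table: slot 0 is the insert table itself and slot i
// is the i-th unique-index table, mirroring the convention of
// metaLocBat.Vecs[0], so the loop above flushes each table's remainder under
// its own table index.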

func (container *WriteS3Container) WriteS3Batch(bat *batch.Batch, proc *process.Process, idx int) error {
	bats := reSizeBatch(container, bat, proc, idx)
	if len(bats) == 0 {
		proc.SetInputBatch(&batch.Batch{})
		return nil
	}
	return GetBlockMeta(bats, container, proc, idx)
}

// After cn writes the data to s3, it gets metadata about the block (aka metaLoc)
// by calling func WriteEndBlocks, and cn needs to pass it to dn for conflict detection.
// Except for the case of writing to s3 directly, cn doesn't need to be aware of how dn
// labels the blocks on s3.
func GetBlockMeta(bats []*batch.Batch, container *WriteS3Container, proc *process.Process, idx int) error {
	for i := range bats {
		if err := GenerateWriter(container, proc); err != nil {
			return err
		}
		if idx == 0 && len(container.sortIndex) != 0 {
			if err := SortByPrimaryKey(proc, bats[i], container.sortIndex, proc.GetMPool()); err != nil {
				return err
			}
		}
		if bats[i].Length() == 0 {
			continue
		}
		if err := WriteBlock(container, bats[i], proc); err != nil {
			return err
		}
		if err := WriteEndBlocks(container, proc, idx); err != nil {
			return err
		}
	}

	// send it to the connector operator:
	// virtually, it is first received by the output operator, then transferred to the connector by rpc
	// metaLocBat.SetZs(metaLocBat.Vecs[0].Length(), proc.GetMPool())
	return nil
}

// reSizeBatch regroups the incoming rows into batches of DefaultBlockMaxRows rows,
// treating DefaultBlockMaxRows as the unit:
// case 1: if the combined length of bat and cacheBat reaches DefaultBlockMaxRows,
// the rows are split into unit batches, which are returned; the remainder is stored in cacheBat.
// case 2: if the combined length of bat and cacheBat is less than DefaultBlockMaxRows,
// bat is merged into cacheBat and nothing is returned.
// The expected result is: unitBatch1, unitBatch2, ..., unitBatchN, plus one final
// batch whose size is less than DefaultBlockMaxRows.
//
// limit: one segment has only one block. This limit exists because currently tae
// caches blocks in memory (instead of on disk) before writing them to s3, which
// means that removing the limit could cause memory problems.
func reSizeBatch(container *WriteS3Container, bat *batch.Batch, proc *process.Process, batIdx int) (bats []*batch.Batch) {
	var newBat *batch.Batch
	var cacheLen uint32
	if len(container.cacheBat) <= batIdx {
		container.cacheBat = append(container.cacheBat, nil)
	}
	if container.cacheBat[batIdx] != nil {
		cacheLen = uint32(container.cacheBat[batIdx].Length())
	}
	idx := int(cacheLen)
	cnt := cacheLen + uint32(bat.Length())

	if cnt >= options.DefaultBlockMaxRows { // case 1
		if container.cacheBat[batIdx] != nil {
			newBat = container.cacheBat[batIdx]
			container.cacheBat[batIdx] = nil
		} else {
			newBat = getNewBatch(bat)
		}

		for cnt >= options.DefaultBlockMaxRows {
			for i := range newBat.Vecs {
				vector.UnionOne(newBat.Vecs[i], bat.Vecs[i], int64(idx)-int64(cacheLen), proc.GetMPool())
			}
			idx++
			if idx%int(options.DefaultBlockMaxRows) == 0 {
				newBat.SetZs(int(options.DefaultBlockMaxRows), proc.GetMPool())
				bats = append(bats, newBat)
				newBat = getNewBatch(bat)
				cnt -= options.DefaultBlockMaxRows
			}
		}
	}

	if len(bats) == 0 { // case 2: everything fits in the cache as a final batch smaller than DefaultBlockMaxRows
		if container.cacheBat[batIdx] == nil {
			container.cacheBat[batIdx] = getNewBatch(bat)
		}
		for i := 0; i < bat.Length(); i++ {
			for j := range container.cacheBat[batIdx].Vecs {
				vector.UnionOne(container.cacheBat[batIdx].Vecs[j], bat.Vecs[j], int64(i), proc.GetMPool())
			}
		}
		container.cacheBat[batIdx].SetZs(container.cacheBat[batIdx].Vecs[0].Length(), proc.GetMPool())
	} else {
		if cnt > 0 { // the part smaller than DefaultBlockMaxRows is stored in cacheBat
			if newBat == nil {
				newBat = getNewBatch(bat)
			}
			for cnt > 0 {
				for i := range newBat.Vecs {
					vector.UnionOne(newBat.Vecs[i], bat.Vecs[i], int64(idx)-int64(cacheLen), proc.GetMPool())
				}
				idx++
				cnt--
			}
			container.cacheBat[batIdx] = newBat
			container.cacheBat[batIdx].SetZs(container.cacheBat[batIdx].Vecs[0].Length(), proc.GetMPool())
		}
	}
	return
}
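
// Worked example (illustrative; assumes options.DefaultBlockMaxRows is 8192):
// with an empty cache, an incoming batch of 20000 rows yields two unit batches
// of 8192 rows each via case 1, and the remaining 3616 rows wait in
// cacheBat[batIdx] until the next WriteS3Batch call or the final
// WriteS3CacheBatch.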

// getNewBatch clones the schema of bat (attribute names and vector types)
// without copying any rows.
func getNewBatch(bat *batch.Batch) *batch.Batch {
	attrs := make([]string, len(bat.Attrs))
	copy(attrs, bat.Attrs)
	newBat := batch.New(true, attrs)
	for i := range bat.Vecs {
		newBat.Vecs[i] = vector.New(bat.Vecs[i].GetType())
	}
	return newBat
}

func GenerateWriter(container *WriteS3Container, proc *process.Process) error {
	segId, err := Srv.GenerateSegment()
	if err != nil {
		return err
	}
	s3, err := fileservice.Get[fileservice.FileService](proc.FileService, defines.SharedFileServiceName)
	if err != nil {
		return err
	}
	container.writer, err = objectio.NewObjectWriter(segId, s3)
	if err != nil {
		return err
	}
	container.lengths = container.lengths[:0]
	return nil
}
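
// GenerateWriter is invoked once per unit batch in GetBlockMeta, so every
// flushed block lands in a freshly allocated segment and object; this is the
// "one segment has only one block" limit described above reSizeBatch.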

// refer to the logic of pkg/sql/colexec/order/order.go
func SortByPrimaryKey(proc *process.Process, bat *batch.Batch, pkIdx []int, m *mpool.MPool) error {
	// Not-Null check on the sort-key columns
	for i := 0; i < len(pkIdx); i++ {
		if nulls.Any(bat.Vecs[pkIdx[i]].Nsp) {
			// return moerr.NewConstraintViolation(proc.Ctx, fmt.Sprintf("Column '%s' cannot be null", n.InsertCtx.TableDef.Cols[i].GetName()))
			return moerr.NewConstraintViolation(proc.Ctx, "Primary key cannot be null")
		}
	}

	var strCol []string
	sels := make([]int64, len(bat.Zs))
	for i := 0; i < len(bat.Zs); i++ {
		sels[i] = int64(i)
	}
	ovec := bat.GetVector(int32(pkIdx[0]))
	if ovec.Typ.IsString() {
		strCol = vector.GetStrVectorValues(ovec)
	} else {
		strCol = nil
	}
	sort.Sort(false, false, false, sels, ovec, strCol)
	if len(pkIdx) == 1 {
		return bat.Shuffle(sels, m)
	}
	ps := make([]int64, 0, 16)
	ds := make([]bool, len(sels))
	for i, j := 1, len(pkIdx); i < j; i++ {
		ps = partition.Partition(sels, ds, ps, ovec)
		vec := bat.Vecs[pkIdx[i]]
		if vec.Typ.IsString() {
			strCol = vector.GetStrVectorValues(vec)
		} else {
			strCol = nil
		}
		for i, j := 0, len(ps); i < j; i++ {
			if i == j-1 {
				sort.Sort(false, false, false, sels[ps[i]:], vec, strCol)
			} else {
				sort.Sort(false, false, false, sels[ps[i]:ps[i+1]], vec, strCol)
			}
		}
		ovec = vec
	}
	return bat.Shuffle(sels, m)
}
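
// Example (illustrative): sorting rows (2,1), (1,3), (1,2) by a two-column
// key (a, b): the first pass orders sels by a, so both a=1 rows precede
// (2,1); partition.Partition then marks the run of equal a values, and the
// second pass re-sorts only that run by b, yielding (1,2), (1,3), (2,1).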

// WriteBlock writes one batch to a buffer and generates the related indexes for this batch.
// For more information, please refer to the comment about func Write in the Writer interface.
func WriteBlock(container *WriteS3Container, bat *batch.Batch, proc *process.Process) error {
	fd, err := container.writer.Write(bat)
	if err != nil {
		return err
	}
	// atomic.AddUint64(&n.Affected, uint64(bat.Vecs[0].Length()))
	container.lengths = append(container.lengths, uint64(bat.Vecs[0].Length()))
	if err := GenerateIndex(container, fd, container.writer, bat); err != nil {
		return err
	}

	return nil
}

// GenerateIndex generates the relevant indexes for a batch written directly to s3 from cn.
// For more information, please refer to the comment about func WriteIndex in the Writer interface.
func GenerateIndex(container *WriteS3Container, fd objectio.BlockObject, objectWriter objectio.Writer, bat *batch.Batch) error {
	for i, mvec := range bat.Vecs {
		err := getIndexDataFromVec(fd, objectWriter, uint16(i), mvec, container.nameToNullability[bat.Attrs[i]], container.pk[bat.Attrs[i]])
		if err != nil {
			return err
		}
	}
	return nil
}

func getIndexDataFromVec(block objectio.BlockObject, writer objectio.Writer,
	idx uint16,
	vec *vector.Vector, nullability bool, isPk bool) error {
	var err error
	columnData := containers.NewVectorWithSharedMemory(vec, nullability)
	zmPos := 0
	zoneMapWriter := indexwrapper.NewZMWriter()
	if err = zoneMapWriter.Init(writer, block, common.Plain, idx, uint16(zmPos)); err != nil {
		return err
	}
	err = zoneMapWriter.AddValues(columnData)
	if err != nil {
		return err
	}
	_, err = zoneMapWriter.Finalize()
	if err != nil {
		return err
	}
	if !isPk {
		return nil
	}
	bfPos := 1
	bfWriter := indexwrapper.NewBFWriter()
	if err = bfWriter.Init(writer, block, common.Plain, idx, uint16(bfPos)); err != nil {
		return err
	}
	if err = bfWriter.AddValues(columnData); err != nil {
		return err
	}
	_, err = bfWriter.Finalize()
	if err != nil {
		return err
	}
	return nil
}
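
// Index layout per column, as implied by zmPos/bfPos above: position 0 holds
// a zone map (min/max) for every column, and position 1 holds a bloom filter
// that is written only for primary-key columns.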

// WriteEndBlocks writes the batches in the buffer to the fileservice (aka s3 in this feature),
// gets the metadata of the blocks on the fileservice, and puts it into metaLocBat.
// For more information, please refer to the comment about func WriteEnd in the Writer interface.
func WriteEndBlocks(container *WriteS3Container, proc *process.Process, idx int) error {
	blocks, err := container.writer.WriteEnd(proc.Ctx)
	if err != nil {
		return err
	}
	for j := range blocks {
		metaLoc, err := blockio.EncodeMetaLocWithObject(
			blocks[0].GetExtent(),
			uint32(container.lengths[j]),
			blocks,
		)
		if err != nil {
			return err
		}
		container.metaLocBat.Vecs[0].Append(uint16(idx), false, proc.GetMPool())
		container.metaLocBat.Vecs[1].Append([]byte(metaLoc), false, proc.GetMPool())
	}
	// for i := range container.unique_writer {
	// 	if blocks, err = container.unique_writer[i].WriteEnd(proc.Ctx); err != nil {
	// 		return err
	// 	}
	// 	for j := range blocks {
	// 		metaLoc, err := blockio.EncodeMetaLocWithObject(
	// 			blocks[0].GetExtent(),
	// 			uint32(container.unique_lengths[i][j]),
	// 			blocks,
	// 		)
	// 		if err != nil {
	// 			return err
	// 		}
	// 		metaLocBat.Vecs[0].Append(uint16(i+1), false, proc.GetMPool())
	// 		metaLocBat.Vecs[1].Append([]byte(metaLoc), false, proc.GetMPool())
	// 	}
	// }
	return nil
}