github.com/matrixorigin/matrixone@v1.2.0/pkg/sql/colexec/s3util.go

// Copyright 2022 Matrix Origin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package colexec

import (
	"github.com/matrixorigin/matrixone/pkg/catalog"
	"github.com/matrixorigin/matrixone/pkg/common/moerr"
	"github.com/matrixorigin/matrixone/pkg/common/mpool"
	"github.com/matrixorigin/matrixone/pkg/container/batch"
	"github.com/matrixorigin/matrixone/pkg/container/nulls"
	"github.com/matrixorigin/matrixone/pkg/container/types"
	"github.com/matrixorigin/matrixone/pkg/container/vector"
	"github.com/matrixorigin/matrixone/pkg/defines"
	"github.com/matrixorigin/matrixone/pkg/fileservice"
	"github.com/matrixorigin/matrixone/pkg/logutil"
	"github.com/matrixorigin/matrixone/pkg/objectio"
	"github.com/matrixorigin/matrixone/pkg/pb/plan"
	"github.com/matrixorigin/matrixone/pkg/sort"
	"github.com/matrixorigin/matrixone/pkg/sql/util"
	db_holder "github.com/matrixorigin/matrixone/pkg/util/export/etl/db"
	"github.com/matrixorigin/matrixone/pkg/vm"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/blockio"
	"github.com/matrixorigin/matrixone/pkg/vm/engine/tae/options"
	"github.com/matrixorigin/matrixone/pkg/vm/process"
	"go.uber.org/zap"
)

// S3Writer is used to write table data to S3 and packages a series of `BlockWriter` write operations.
// Currently there are two scenarios in which CN writes to S3 directly:
//   - scenario 1: the insert operator goes to S3 directly; this is triggered when a single insert/load moves a relatively large volume of data;
//   - scenario 2: txn.workspace exceeds its threshold, and the txn.dumpBatch function triggers a write to S3.
type S3Writer struct {
	sortIndex      int // When writing table data, if the table has a sort key, the data must be sorted before it is written to S3
	pk             int
	partitionIndex int16 // This value is aligned with the partition number
	isClusterBy    bool

	schemaVersion uint32
	seqnums       []uint16
	tablename     string
	attrs         []string

	writer  *blockio.BlockWriter
	lengths []uint64

	// the third vector only has several rows and does not align with the other two vectors.
	blockInfoBat *batch.Batch

	// An intermediate cache used during the merge sort of all `Bats` data
	buffer *batch.Batch

	// for memory reuse.
	tableBatchBuffers []*batch.Batch

	// Bats[i] is used to store the batches of the table.
	// Each batch in Bats is sorted internally, and all batches correspond to only one table.
	// When the batches' total size exceeds WriteS3Threshold, we merge sort them and then write a segment to S3.
	Bats []*batch.Batch

	// batSize records the total size of the batches
	// currently cached in Bats.
	batSize uint64

	typs []types.Type
	ufs  []func(*vector.Vector, *vector.Vector, int64) error // function pointers for type conversion
}

const (
	// WriteS3Threshold: when a table's cached batch size reaches this value,
	// we trigger a write to S3
	WriteS3Threshold uint64 = 128 * mpool.MB

	TagS3Size            uint64 = 10 * mpool.MB
	TagS3SizeForMOLogger uint64 = 1 * mpool.MB
)

func (w *S3Writer) Free(proc *process.Process) {
	if w.blockInfoBat != nil {
		w.blockInfoBat.Clean(proc.Mp())
		w.blockInfoBat = nil
	}
	if w.buffer != nil {
		w.buffer.Clean(proc.Mp())
		w.buffer = nil
	}
	for _, bat := range w.tableBatchBuffers {
		bat.Clean(proc.Mp())
	}
	w.tableBatchBuffers = nil
	for _, bat := range w.Bats {
		bat.Clean(proc.Mp())
	}
	w.Bats = nil
}

func (w *S3Writer) GetBlockInfoBat() *batch.Batch {
	return w.blockInfoBat
}

func (w *S3Writer) SetSortIdx(sortIdx int) {
	w.sortIndex = sortIdx
}

func (w *S3Writer) SetSchemaVer(ver uint32) {
	w.schemaVersion = ver
}

func (w *S3Writer) SetTableName(name string) {
	w.tablename = name
}

func (w *S3Writer) SetSeqnums(seqnums []uint16) {
	w.seqnums = seqnums
	logutil.Debugf("s3 table set directly %q seqnums: %+v", w.tablename, w.seqnums)
}

func AllocS3Writer(proc *process.Process, tableDef *plan.TableDef) (*S3Writer, error) {
	writer := &S3Writer{
		tablename:      tableDef.GetName(),
		seqnums:        make([]uint16, 0, len(tableDef.Cols)),
		schemaVersion:  tableDef.Version,
		sortIndex:      -1,
		pk:             -1,
		partitionIndex: 0,
	}

	writer.ResetBlockInfoBat(proc)
	for i, colDef := range tableDef.Cols {
		if colDef.Name != catalog.Row_ID {
			writer.seqnums = append(writer.seqnums, uint16(colDef.Seqnum))
		} else {
			// check that rowid is the last column
			if i != len(tableDef.Cols)-1 {
				logutil.Errorf("bad rowid position for %q, %+v", writer.tablename, colDef)
			}
		}
	}
	logutil.Debugf("s3 table set from AllocS3Writer %q seqnums: %+v", writer.tablename, writer.seqnums)

	// Get the single-column pk index
	for idx, colDef := range tableDef.Cols {
		if colDef.Name == tableDef.Pkey.PkeyColName && colDef.Name != catalog.FakePrimaryKeyColName {
			writer.sortIndex = idx
			writer.pk = idx
			break
		}
	}

	if tableDef.ClusterBy != nil {
		writer.isClusterBy = true

		// the `rowId` column has been excluded from the target table's `TableDef` for insert statements (insert, load),
		// link: `/pkg/sql/plan/build_constraint_util.go` -> func setTableExprToDmlTableInfo
		// so the `sortIndex` position can be obtained directly by matching the sort key's name
		for idx, colDef := range tableDef.Cols {
			if colDef.Name == tableDef.ClusterBy.Name {
				writer.sortIndex = idx
			}
		}
	}

	return writer, nil
}

// AllocPartitionS3Writer allocates S3 writers for a partitioned table.
func AllocPartitionS3Writer(proc *process.Process, tableDef *plan.TableDef) ([]*S3Writer, error) {
	partitionNum := len(tableDef.Partition.PartitionTableNames)
	writers := make([]*S3Writer, partitionNum)
	for i := range writers {
		writers[i] = &S3Writer{
			tablename:      tableDef.GetName(),
			seqnums:        make([]uint16, 0, len(tableDef.Cols)),
			schemaVersion:  tableDef.Version,
			sortIndex:      -1,
			pk:             -1,
			partitionIndex: int16(i), // This value is aligned with the partition number
		}

		writers[i].ResetBlockInfoBat(proc)
		for j, colDef := range tableDef.Cols {
			if colDef.Name != catalog.Row_ID {
				writers[i].seqnums = append(writers[i].seqnums, uint16(colDef.Seqnum))
			} else {
				// check that rowid is the last column
				if j != len(tableDef.Cols)-1 {
					logutil.Errorf("bad rowid position for %q, %+v", writers[j].tablename, colDef)
				}
			}
		}
		logutil.Debugf("s3 table set from AllocS3WriterP%d %q seqnums: %+v", i, writers[i].tablename, writers[i].seqnums)

		// Get the single-column pk index
		for idx, colDef := range tableDef.Cols {
			if colDef.Name == tableDef.Pkey.PkeyColName {
				if colDef.Name != catalog.FakePrimaryKeyColName {
					writers[i].sortIndex = idx
					writers[i].pk = idx
				}
				break
			}
		}

		if tableDef.ClusterBy != nil {
			writers[i].isClusterBy = true
			if util.JudgeIsCompositeClusterByColumn(tableDef.ClusterBy.Name) {
				// the serialized cluster-by column is located at the end of bat.Vecs
				writers[i].sortIndex = len(tableDef.Cols) - 1
			} else {
				for idx, colDef := range tableDef.Cols {
					if colDef.Name == tableDef.ClusterBy.Name {
						writers[i].sortIndex = idx
					}
				}
			}
		}

	}
	return writers, nil
}

func (w *S3Writer) ResetBlockInfoBat(proc *process.Process) {
	// A simple explanation of the vectors held by blockInfoBat:
	// vecs[0] marks which table this metaLoc belongs to: [0] means the insert table itself, [1] means the first uniqueIndex table, [2] means the second uniqueIndex table, and so on
	// vecs[1] stores the relative block metadata
	// vecs[2] stores the serialized object stats (it does not align with the first two vectors)
	if w.blockInfoBat != nil {
		proc.PutBatch(w.blockInfoBat)
	}
	attrs := []string{catalog.BlockMeta_TableIdx_Insert, catalog.BlockMeta_BlockInfo, catalog.ObjectMeta_ObjectStats}
	blockInfoBat := batch.NewWithSize(len(attrs))
	blockInfoBat.Attrs = attrs
	blockInfoBat.Vecs[0] = proc.GetVector(types.T_int16.ToType())
	blockInfoBat.Vecs[1] = proc.GetVector(types.T_text.ToType())
	blockInfoBat.Vecs[2] = proc.GetVector(types.T_binary.ToType())

	w.blockInfoBat = blockInfoBat
}

//func (w *S3Writer) WriteEnd(proc *process.Process) {
//	if w.metaLocBat.vecs[0].Length() > 0 {
//		w.metaLocBat.SetZs(w.metaLocBat.vecs[0].Length(), proc.GetMPool())
//		proc.SetInputBatch(w.metaLocBat)
//	}
//}

func (w *S3Writer) Output(proc *process.Process, result *vm.CallResult) error {
	bat := batch.NewWithSize(len(w.blockInfoBat.Attrs))
	bat.SetAttributes(w.blockInfoBat.Attrs)

	for i := range w.blockInfoBat.Attrs {
		vec := proc.GetVector(*w.blockInfoBat.Vecs[i].GetType())
		if err := vec.UnionBatch(w.blockInfoBat.Vecs[i], 0, w.blockInfoBat.Vecs[i].Length(), nil, proc.GetMPool()); err != nil {
			vec.Free(proc.Mp())
			return err
		}
		bat.SetVector(int32(i), vec)
	}
	bat.SetRowCount(w.blockInfoBat.RowCount())
	w.ResetBlockInfoBat(proc)
	result.Batch = bat
	return nil
}
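// The sketch below is illustrative and not part of the original file: it shows how a
// downstream consumer might read the batch produced by Output, based on the layout set
// up in ResetBlockInfoBat (vecs[0] = table/partition index, vecs[1] = block info or a
// marshaled batch). The function name inspectBlockInfoBatExample is hypothetical.
func inspectBlockInfoBatExample(bat *batch.Batch) {
	idxs := vector.MustFixedCol[int16](bat.Vecs[0])
	payloads := vector.MustStrCol(bat.Vecs[1])
	for i, idx := range idxs {
		if idx < 0 {
			// negative indexes are appended by WriteS3CacheBatch: the payload is a
			// marshaled batch kept in memory instead of a block written to S3
			logutil.Debugf("row %d: in-memory batch for partition %d, %d bytes", i, -idx-1, len(payloads[i]))
		} else {
			// non-negative indexes come from writeEndBlocks: the payload is encoded block info
			logutil.Debugf("row %d: block info for table/partition %d, %d bytes", i, idx, len(payloads[i]))
		}
	}
}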
func (w *S3Writer) WriteS3CacheBatch(proc *process.Process) error {
	var S3SizeThreshold = TagS3SizeForMOLogger

	if proc != nil && proc.Ctx != nil {
		isMoLogger, ok := proc.Ctx.Value(defines.IsMoLogger{}).(bool)
		if ok && isMoLogger {
			logutil.Debug("WriteS3CacheBatch proc", zap.Bool("isMoLogger", isMoLogger))
			S3SizeThreshold = TagS3SizeForMOLogger
		}
	}

	if proc.GetSessionInfo() != nil && proc.GetSessionInfo().GetUser() == db_holder.MOLoggerUser {
		logutil.Debug("WriteS3CacheBatch", zap.String("user", proc.GetSessionInfo().GetUser()))
		S3SizeThreshold = TagS3SizeForMOLogger
	}
	if w.batSize >= S3SizeThreshold {
		if err := w.SortAndFlush(proc); err != nil {
			return err
		}
		w.blockInfoBat.SetRowCount(w.blockInfoBat.Vecs[0].Length())
		return nil
	}
	for _, bat := range w.Bats {
		if err := vector.AppendFixed(
			w.blockInfoBat.Vecs[0], -w.partitionIndex-1,
			false, proc.GetMPool()); err != nil {
			return err
		}
		bytes, err := bat.MarshalBinary()
		if err != nil {
			return err
		}
		if err = vector.AppendBytes(
			w.blockInfoBat.Vecs[1], bytes,
			false, proc.GetMPool()); err != nil {
			return err
		}
	}
	w.blockInfoBat.SetRowCount(w.blockInfoBat.Vecs[0].Length())
	return nil
}

func (w *S3Writer) InitBuffers(proc *process.Process, bat *batch.Batch) {
	if w.buffer == nil {
		w.buffer = getNewBatch(proc, bat)
	}
}

// Put puts a batch into w.Bats and makes sure that each batch in w.Bats
// contains options.DefaultBlockMaxRows rows, except for the last one.
//
// It returns:
//   - true: the cached batches are over the threshold
//   - false: the cached batches are less than or equal to the threshold
func (w *S3Writer) Put(bat *batch.Batch, proc *process.Process) bool {
	var rbat *batch.Batch

	if len(w.typs) == 0 {
		for i := 0; i < bat.VectorCount(); i++ {
			typ := *bat.GetVector(int32(i)).GetType()
			w.typs = append(w.typs, typ)
			w.ufs = append(w.ufs, vector.GetUnionOneFunction(typ, proc.Mp()))
		}
	}
	res := false
	start, end := 0, bat.RowCount()
	for start < end {
		n := len(w.Bats)
		if n == 0 || w.Bats[n-1].RowCount() >=
			int(options.DefaultBlockMaxRows) {
			if len(w.tableBatchBuffers) > 0 {
				rbat = w.tableBatchBuffers[0]
				w.tableBatchBuffers = w.tableBatchBuffers[1:]
				rbat.CleanOnlyData()
			} else {
				rbat = batch.NewWithSize(bat.VectorCount())
				rbat.SetAttributes(bat.Attrs)
				for i := range w.typs {
					rbat.Vecs[i] = proc.GetVector(w.typs[i])
				}
			}
			w.Bats = append(w.Bats, rbat)
		} else {
			rbat = w.Bats[n-1]
		}
		rows := end - start
		if left := int(options.DefaultBlockMaxRows) - rbat.RowCount(); rows > left {
			rows = left
		}

		var err error
		for i := 0; i < bat.VectorCount(); i++ {
			vec := rbat.GetVector(int32(i))
			srcVec := bat.GetVector(int32(i))
			for j := 0; j < rows; j++ {
				if err = w.ufs[i](vec, srcVec, int64(j+start)); err != nil {
					panic(err)
				}
			}
		}
		rbat.AddRowCount(rows)
		start += rows
		if w.batSize = w.batSize + uint64(rbat.Size()); w.batSize > WriteS3Threshold {
			res = true
		}
	}
	return res
}

func getFixedCols[T types.FixedSizeT](bats []*batch.Batch, idx int) (cols [][]T) {
	cols = make([][]T, 0, len(bats))
	for i := range bats {
		cols = append(cols, vector.MustFixedCol[T](bats[i].Vecs[idx]))
	}
	return
}

func getStrCols(bats []*batch.Batch, idx int) (cols [][]string) {
	cols = make([][]string, 0, len(bats))
	for i := range bats {
		cols = append(cols, vector.MustStrCol(bats[i].Vecs[idx]))
	}
	return
}

func (w *S3Writer) SortAndFlush(proc *process.Process) error {
	//bats := w.Bats[:length]
	sortIdx := -1
	for i := range w.Bats {
		// sort the batches first,
		// for the main/origin table and unique index tables.
		if w.sortIndex != -1 {
			err := sortByKey(proc, w.Bats[i], w.sortIndex, w.isClusterBy, proc.GetMPool())
			if err != nil {
				return err
			}
			sortIdx = w.sortIndex
		}
	}
	// just write ahead, no need to sort
	if sortIdx == -1 {
		if _, err := w.generateWriter(proc); err != nil {
			return err
		}

		for i := range w.Bats {
			if err := w.WriteBlock(w.Bats[i]); err != nil {
				return err
			}
		}
		if err := w.writeEndBlocks(proc); err != nil {
			return err
		}
	} else {
		var merge MergeInterface
		var nulls []*nulls.Nulls
		for i := 0; i < len(w.Bats); i++ {
			nulls = append(nulls, w.Bats[i].Vecs[w.sortIndex].GetNulls())
		}
		pos := w.sortIndex
		switch w.Bats[0].Vecs[sortIdx].GetType().Oid {
		case types.T_bool:
			merge = newMerge(len(w.Bats), sort.BoolLess, getFixedCols[bool](w.Bats, pos), nulls)
		case types.T_bit:
			merge = newMerge(len(w.Bats), sort.GenericLess[uint64], getFixedCols[uint64](w.Bats, pos), nulls)
		case types.T_int8:
			merge = newMerge(len(w.Bats), sort.GenericLess[int8], getFixedCols[int8](w.Bats, pos), nulls)
		case types.T_int16:
			merge = newMerge(len(w.Bats), sort.GenericLess[int16], getFixedCols[int16](w.Bats, pos), nulls)
		case types.T_int32:
			merge = newMerge(len(w.Bats), sort.GenericLess[int32], getFixedCols[int32](w.Bats, pos), nulls)
		case types.T_int64:
			merge = newMerge(len(w.Bats), sort.GenericLess[int64], getFixedCols[int64](w.Bats, pos), nulls)
		case types.T_uint8:
			merge = newMerge(len(w.Bats), sort.GenericLess[uint8], getFixedCols[uint8](w.Bats, pos), nulls)
		case types.T_uint16:
			merge = newMerge(len(w.Bats), sort.GenericLess[uint16], getFixedCols[uint16](w.Bats, pos), nulls)
		case types.T_uint32:
			merge = newMerge(len(w.Bats), sort.GenericLess[uint32], getFixedCols[uint32](w.Bats, pos), nulls)
		case types.T_uint64:
			merge = newMerge(len(w.Bats), sort.GenericLess[uint64], getFixedCols[uint64](w.Bats, pos), nulls)
		case types.T_float32:
			merge = newMerge(len(w.Bats), sort.GenericLess[float32], getFixedCols[float32](w.Bats, pos), nulls)
		case types.T_float64:
			merge = newMerge(len(w.Bats), sort.GenericLess[float64], getFixedCols[float64](w.Bats, pos), nulls)
		case types.T_date:
			merge = newMerge(len(w.Bats), sort.GenericLess[types.Date], getFixedCols[types.Date](w.Bats, pos), nulls)
		case types.T_datetime:
			merge = newMerge(len(w.Bats), sort.GenericLess[types.Datetime], getFixedCols[types.Datetime](w.Bats, pos), nulls)
		case types.T_time:
			merge = newMerge(len(w.Bats), sort.GenericLess[types.Time], getFixedCols[types.Time](w.Bats, pos), nulls)
		case types.T_timestamp:
			merge = newMerge(len(w.Bats), sort.GenericLess[types.Timestamp], getFixedCols[types.Timestamp](w.Bats, pos), nulls)
		case types.T_enum:
			merge = newMerge(len(w.Bats), sort.GenericLess[types.Enum], getFixedCols[types.Enum](w.Bats, pos), nulls)
		case types.T_decimal64:
			merge = newMerge(len(w.Bats), sort.Decimal64Less, getFixedCols[types.Decimal64](w.Bats, pos), nulls)
		case types.T_decimal128:
			merge = newMerge(len(w.Bats), sort.Decimal128Less, getFixedCols[types.Decimal128](w.Bats, pos), nulls)
		case types.T_uuid:
			merge = newMerge(len(w.Bats), sort.UuidLess, getFixedCols[types.Uuid](w.Bats, pos), nulls)
		case types.T_char, types.T_varchar, types.T_blob, types.T_text:
			merge = newMerge(len(w.Bats), sort.GenericLess[string], getStrCols(w.Bats, pos), nulls)
			//TODO: check if we need T_array here? T_json is missing here.
			// Update Oct 20 2023: I don't think it is necessary to add T_array here. Keeping this comment,
			// in case anything fails in vector S3 flush in future.
		}
		if _, err := w.generateWriter(proc); err != nil {
			return err
		}
		lens := 0
		size := len(w.Bats)
		w.buffer.CleanOnlyData()
		var batchIndex int
		var rowIndex int
		for size > 0 {
			batchIndex, rowIndex, size = merge.getNextPos()
			for i := range w.buffer.Vecs {
				err := w.buffer.Vecs[i].UnionOne(w.Bats[batchIndex].Vecs[i], int64(rowIndex), proc.GetMPool())
				if err != nil {
					return err
				}
			}
			lens++
			if lens == int(options.DefaultBlockMaxRows) {
				lens = 0
				if err := w.WriteBlock(w.buffer); err != nil {
					return err
				}
				// force clean
				w.buffer.CleanOnlyData()
			}
		}
		if lens > 0 {
			if err := w.WriteBlock(w.buffer); err != nil {
				return err
			}
			w.buffer.CleanOnlyData()
		}
		if err := w.writeEndBlocks(proc); err != nil {
			return err
		}
		// force clean
		w.buffer.CleanOnlyData()
	}
	for i := 0; i < len(w.Bats); i++ {
		// recycle the batch
		w.putBatch(w.Bats[i])
		w.batSize -= uint64(w.Bats[i].Size())
	}
	w.Bats = w.Bats[:0]
	return nil
}

// WriteS3Batch logic:
// S3Writer caches the incoming batches in memory; the data is not written to S3
// immediately. We keep accumulating until there is no more input, or the cached
// size exceeds WriteS3Threshold, at which point the cached batches are sorted
// and flushed to S3.
func (w *S3Writer) WriteS3Batch(proc *process.Process, bat *batch.Batch) error {
	w.InitBuffers(proc, bat)
	if w.Put(bat, proc) {
		w.SortAndFlush(proc)
	}
	return nil
}

func (w *S3Writer) putBatch(bat *batch.Batch) {
	w.tableBatchBuffers = append(w.tableBatchBuffers, bat)
}

func getNewBatch(proc *process.Process, bat *batch.Batch) *batch.Batch {
	newBat := batch.NewWithSize(bat.VectorCount())
	newBat.SetAttributes(bat.Attrs)
	for i := range bat.Vecs {
		newBat.Vecs[i] = proc.GetVector(*bat.Vecs[i].GetType())
	}
	return newBat
}

func (w *S3Writer) GenerateWriter(proc *process.Process) (objectio.ObjectName, error) {
	return w.generateWriter(proc)
}

func (w *S3Writer) generateWriter(proc *process.Process) (objectio.ObjectName, error) {
	// Use uuid as the segment id
	// TODO: multiple 64M files in one segment
	obj := Get().GenerateObject()
	s3, err := fileservice.Get[fileservice.FileService](proc.FileService, defines.SharedFileServiceName)
	if err != nil {
		return nil, err
	}
	w.writer, err = blockio.NewBlockWriterNew(s3, obj, w.schemaVersion, w.seqnums)
	if err != nil {
		return nil, err
	}
	w.lengths = w.lengths[:0]
	return obj, err
}

// refer to the logic in pkg/sql/colexec/order/order.go
func sortByKey(proc *process.Process, bat *batch.Batch, sortIndex int, allow_null bool, m *mpool.MPool) error {
	hasNull := false
	// Not-null check; note that cluster by supports null values
	if nulls.Any(bat.Vecs[sortIndex].GetNulls()) {
		hasNull = true
		if !allow_null {
			return moerr.NewConstraintViolation(proc.Ctx,
				"sort key can not be null, sortIndex = %d, sortCol = %s",
				sortIndex, bat.Attrs[sortIndex])
		}
	}
	var strCol []string
	rowCount := bat.RowCount()
	sels := make([]int64, rowCount)
	for i := 0; i < rowCount; i++ {
		sels[i] = int64(i)
	}
	ovec := bat.GetVector(int32(sortIndex))
	if ovec.GetType().IsVarlen() {
		strCol = vector.MustStrCol(ovec)
	} else {
		strCol = nil
	}
	if allow_null {
		// nulls last
		sort.Sort(false, true, hasNull, sels, ovec, strCol)
	} else {
		sort.Sort(false, false, hasNull, sels, ovec, strCol)
	}
	return bat.Shuffle(sels, m)
}

func (w *S3Writer) WriteBlock(bat *batch.Batch, dataType ...objectio.DataMetaType) error {
	if w.pk > -1 {
		pkIdx := uint16(w.pk)
		w.writer.SetPrimaryKey(pkIdx)
	}
	if w.sortIndex > -1 {
		w.writer.SetSortKey(uint16(w.sortIndex))
	}
	if w.attrs == nil {
		w.attrs = bat.Attrs
	}
	if len(w.seqnums) != len(bat.Vecs) {
		// just warn, because writing a delete s3 file does not need seqnums;
		// print the attrs to tell whether it is a delete batch
		logutil.Warnf("CN write s3 table %q: seqnums length not match seqnums: %v, attrs: %v",
			w.tablename, w.seqnums, bat.Attrs)
	}
	// logutil.Infof("write s3 batch(%d) %q: %v, %v", bat.vecs[0].Length(), w.tablename, w.seqnums, w.attrs)
	if len(dataType) > 0 && dataType[0] == objectio.SchemaTombstone {
		_, err := w.writer.WriteTombstoneBatch(bat)
		if err != nil {
			return err
		}
	} else {
		_, err := w.writer.WriteBatch(bat)
		if err != nil {
			return err
		}
	}
	w.lengths = append(w.lengths, uint64(bat.Vecs[0].Length()))
	return nil
}

func (w *S3Writer) writeEndBlocks(proc *process.Process) error {
	blkInfos, stats, err := w.WriteEndBlocks(proc)
	if err != nil {
		return err
	}
	for _, blkInfo := range blkInfos {
		if err := vector.AppendFixed(
			w.blockInfoBat.Vecs[0],
			w.partitionIndex,
			false,
			proc.GetMPool()); err != nil {
			return err
		}
		if err := vector.AppendBytes(
			w.blockInfoBat.Vecs[1],
			//[]byte(metaLoc),
			objectio.EncodeBlockInfo(blkInfo),
			false,
			proc.GetMPool()); err != nil {
			return err
		}
	}

	// append the object stats to the batch;
	// at most one will be appended
	for idx := 0; idx < len(stats); idx++ {
		if stats[idx].IsZero() {
			continue
		}

		if err = vector.AppendBytes(w.blockInfoBat.Vecs[2],
			stats[idx].Marshal(), false, proc.GetMPool()); err != nil {
			return err
		}
	}

	w.blockInfoBat.SetRowCount(w.blockInfoBat.Vecs[0].Length())
	return nil
}

// WriteEndBlocks writes the batches in the buffer to the fileservice (i.e. S3 in this feature),
// gets the metadata of the blocks on the fileservice, and puts it into blockInfoBat.
// For more information, please refer to the comment on func WriteEnd in the Writer interface.
func (w *S3Writer) WriteEndBlocks(proc *process.Process) ([]objectio.BlockInfo, []objectio.ObjectStats, error) {
	blocks, _, err := w.writer.Sync(proc.Ctx)
	logutil.Debugf("write s3 table %q: %v, %v", w.tablename, w.seqnums, w.attrs)
	if err != nil {
		return nil, nil, err
	}
	blkInfos := make([]objectio.BlockInfo, 0, len(blocks))
	// TODO: block id, segment id and location should be obtained from BlockObject.
	for j := range blocks {
		location := blockio.EncodeLocation(
			w.writer.GetName(),
			blocks[j].GetExtent(),
			uint32(w.lengths[j]),
			blocks[j].GetID(),
		)

		sid := location.Name().SegmentId()
		blkInfo := objectio.BlockInfo{
			BlockID: *objectio.NewBlockid(
				&sid,
				location.Name().Num(),
				location.ID()),
			SegmentID: sid,
			// non-appendable block
			EntryState: false,
		}
		blkInfo.SetMetaLocation(location)
		if w.sortIndex != -1 {
			blkInfo.Sorted = true
		}
		blkInfos = append(blkInfos, blkInfo)
	}
	return blkInfos, w.writer.GetObjectStats(), err
}
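
// The sketch below is illustrative and not part of the original file: a minimal example
// of how an operator might drive S3Writer end to end, assuming `proc`, `tableDef` and
// the `inputs` slice come from the surrounding pipeline. The helper name
// flushTableDataExample is hypothetical.
func flushTableDataExample(proc *process.Process, tableDef *plan.TableDef, inputs []*batch.Batch) (*batch.Batch, error) {
	w, err := AllocS3Writer(proc, tableDef)
	if err != nil {
		return nil, err
	}
	defer w.Free(proc)

	// Cache every input batch; once the cached size exceeds WriteS3Threshold,
	// WriteS3Batch sorts the cached batches and flushes them as blocks to S3.
	for _, bat := range inputs {
		if err := w.WriteS3Batch(proc, bat); err != nil {
			return nil, err
		}
	}
	// Flush the tail: large remainders are sorted and written to S3, small ones
	// are kept as marshaled batches inside the block-info batch.
	if err := w.WriteS3CacheBatch(proc); err != nil {
		return nil, err
	}

	// Hand the collected block metadata (and any cached batches) to the next operator.
	var result vm.CallResult
	if err := w.Output(proc, &result); err != nil {
		return nil, err
	}
	return result.Batch, nil
}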