github.com/matrixorigin/matrixone@v0.7.0/pkg/sql/colexec/s3util.go (about) 1 // Copyright 2022 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package colexec 16 17 import ( 18 "github.com/matrixorigin/matrixone/pkg/catalog" 19 "github.com/matrixorigin/matrixone/pkg/common/moerr" 20 "github.com/matrixorigin/matrixone/pkg/common/mpool" 21 "github.com/matrixorigin/matrixone/pkg/container/batch" 22 "github.com/matrixorigin/matrixone/pkg/container/nulls" 23 "github.com/matrixorigin/matrixone/pkg/container/types" 24 "github.com/matrixorigin/matrixone/pkg/container/vector" 25 "github.com/matrixorigin/matrixone/pkg/defines" 26 "github.com/matrixorigin/matrixone/pkg/fileservice" 27 "github.com/matrixorigin/matrixone/pkg/objectio" 28 "github.com/matrixorigin/matrixone/pkg/partition" 29 "github.com/matrixorigin/matrixone/pkg/pb/plan" 30 "github.com/matrixorigin/matrixone/pkg/sort" 31 "github.com/matrixorigin/matrixone/pkg/sql/util" 32 "github.com/matrixorigin/matrixone/pkg/vm/engine" 33 "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/common" 34 "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/containers" 35 "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/dataio/blockio" 36 "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/options" 37 "github.com/matrixorigin/matrixone/pkg/vm/engine/tae/tables/indexwrapper" 38 "github.com/matrixorigin/matrixone/pkg/vm/process" 39 ) 40 41 type WriteS3Container struct { 42 sortIndex []int 43 nameToNullablity map[string]bool 44 pk map[string]bool 45 46 writer objectio.Writer 47 lengths []uint64 48 cacheBat []*batch.Batch 49 50 UniqueRels []engine.Relation 51 52 metaLocBat *batch.Batch 53 } 54 55 func NewWriteS3Container(tableDef *plan.TableDef) *WriteS3Container { 56 container := &WriteS3Container{ 57 sortIndex: make([]int, 0, 1), 58 pk: make(map[string]bool), 59 nameToNullablity: make(map[string]bool), 60 } 61 62 // get pk indexes 63 if tableDef.CompositePkey != nil { 64 names := util.SplitCompositePrimaryKeyColumnName(tableDef.CompositePkey.Name) 65 for num, colDef := range tableDef.Cols { 66 for _, name := range names { 67 if colDef.Name == name { 68 container.sortIndex = append(container.sortIndex, num) 69 } 70 } 71 } 72 } else { 73 // Get Single Col pk index 74 for num, colDef := range tableDef.Cols { 75 if colDef.Primary { 76 container.sortIndex = append(container.sortIndex, num) 77 break 78 } 79 } 80 } 81 82 // Get CPkey index 83 if tableDef.CompositePkey != nil { 84 // the serialized cpk col is located in the last of the bat.vecs 85 container.sortIndex = append(container.sortIndex, len(tableDef.Cols)) 86 } else { 87 // Get Single Col pk index 88 for num, colDef := range tableDef.Cols { 89 if colDef.Primary { 90 container.sortIndex = append(container.sortIndex, num) 91 break 92 } 93 } 94 if tableDef.ClusterBy != nil { 95 if util.JudgeIsCompositeClusterByColumn(tableDef.ClusterBy.Name) { 96 // the serialized clusterby col is located in the last of the bat.vecs 97 container.sortIndex = append(container.sortIndex, len(tableDef.Cols)) 98 } else { 99 for num, colDef := range tableDef.Cols { 100 if colDef.Name == tableDef.ClusterBy.Name { 101 container.sortIndex = append(container.sortIndex, num) 102 } 103 } 104 } 105 } 106 } 107 108 // get NameNullAbility 109 for _, def := range tableDef.Cols { 110 container.nameToNullablity[def.Name] = def.Default.NullAbility 111 if def.Primary { 112 container.pk[def.Name] = true 113 } 114 } 115 if tableDef.CompositePkey != nil { 116 def := tableDef.CompositePkey 117 container.nameToNullablity[def.Name] = def.Default.NullAbility 118 container.pk[def.Name] = true 119 } 120 121 //if tableDef.Indexes != nil { 122 // for _, indexdef := range tableDef.Indexes { 123 // if indexdef.Unique { 124 // for j := range indexdef.Field.Cols { 125 // coldef := indexdef.Field.Cols[j] 126 // container.nameToNullablity[coldef.Name] = coldef.Default.NullAbility 127 // } 128 // } else { 129 // continue 130 // } 131 // } 132 //} 133 134 if tableDef.ClusterBy != nil { 135 container.nameToNullablity[tableDef.ClusterBy.Name] = true 136 } 137 container.resetMetaLocBat() 138 139 return container 140 } 141 142 func (container *WriteS3Container) resetMetaLocBat() { 143 // A simple explanation of the two vectors held by metaLocBat 144 // vecs[0] to mark which table this metaLoc belongs to: [0] means insertTable itself, [1] means the first uniqueIndex table, [2] means the second uniqueIndex table and so on 145 // vecs[1] store relative block metadata 146 attrs := []string{catalog.BlockMeta_TableIdx_Insert, catalog.BlockMeta_MetaLoc} 147 metaLocBat := batch.New(true, attrs) 148 metaLocBat.Vecs[0] = vector.New(types.Type{Oid: types.T(types.T_uint16)}) 149 metaLocBat.Vecs[1] = vector.New(types.New(types.T_varchar, 150 types.MaxVarcharLen, 0, 0)) 151 152 container.metaLocBat = metaLocBat 153 } 154 155 func (container *WriteS3Container) WriteEnd(proc *process.Process) { 156 if container.metaLocBat.Vecs[0].Length() > 0 { 157 container.metaLocBat.SetZs(container.metaLocBat.Vecs[0].Length(), proc.GetMPool()) 158 proc.SetInputBatch(container.metaLocBat) 159 container.resetMetaLocBat() 160 } 161 } 162 163 func (container *WriteS3Container) WriteS3CacheBatch(proc *process.Process) error { 164 if len(container.cacheBat) > 0 { 165 for i, bat := range container.cacheBat { 166 if bat != nil { 167 err := GetBlockMeta([]*batch.Batch{bat}, container, proc, i) 168 if err != nil { 169 return err 170 } 171 } 172 } 173 container.WriteEnd(proc) 174 } 175 return nil 176 } 177 178 func (container *WriteS3Container) WriteS3Batch(bat *batch.Batch, proc *process.Process, idx int) error { 179 bats := reSizeBatch(container, bat, proc, idx) 180 if len(bats) == 0 { 181 proc.SetInputBatch(&batch.Batch{}) 182 return nil 183 } 184 return GetBlockMeta(bats, container, proc, idx) 185 } 186 187 // After cn writes the data to s3, it will get meta data about the block (aka metaloc) by calling func WriteEndBlocks 188 // and cn needs to pass it to dn for conflict detection 189 // Except for the case of writing s3 directly, cn doesn't need to sense how dn is labeling the blocks on s3 190 func GetBlockMeta(bats []*batch.Batch, container *WriteS3Container, proc *process.Process, idx int) error { 191 for i := range bats { 192 if err := GenerateWriter(container, proc); err != nil { 193 return err 194 } 195 if idx == 0 && len(container.sortIndex) != 0 { 196 SortByPrimaryKey(proc, bats[i], container.sortIndex, proc.GetMPool()) 197 } 198 if bats[i].Length() == 0 { 199 continue 200 } 201 if err := WriteBlock(container, bats[i], proc); err != nil { 202 return err 203 } 204 if err := WriteEndBlocks(container, proc, idx); err != nil { 205 return err 206 } 207 } 208 209 // send it to connector operator. 210 // vitually, first it will be recieved by output, then transfer it to connector by rpc 211 // metaLocBat.SetZs(metaLocBat.Vecs[0].Length(), proc.GetMPool()) 212 return nil 213 } 214 215 // reSizeBatch will try to set the batch with the length of DefaultBlockMaxRows 216 // consider DefaultBlockMaxRows as unit 217 // case 1. If the length of bat and cacheBat together is larger than DefaultBlockMaxRows, then split the batch into unit batchs and return, the smaller part store in cacheBat 218 // case 2. If the length of bat and cacheBat together is less than DefaultBlockMaxRows, then bat is merged into cacheBat 219 // The expected result is : unitBatch1, unitBatch2, ... unitBatchx, the last Batch that batchSize less than DefaultBlockMaxRows 220 // 221 // limit : one segment has only one block, this limit exists because currently, tae caches blocks in memory (instead of disk) before writing them to s3, which means that if limit 1 is removed, it may cause memory problems 222 func reSizeBatch(container *WriteS3Container, bat *batch.Batch, proc *process.Process, batIdx int) (bats []*batch.Batch) { 223 var newBat *batch.Batch 224 var cacheLen uint32 225 if len(container.cacheBat) <= batIdx { 226 container.cacheBat = append(container.cacheBat, nil) 227 } 228 if container.cacheBat[batIdx] != nil { 229 cacheLen = uint32(container.cacheBat[batIdx].Length()) 230 } 231 idx := int(cacheLen) 232 cnt := cacheLen + uint32(bat.Length()) 233 234 if cnt >= options.DefaultBlockMaxRows { // case 1 235 if container.cacheBat[batIdx] != nil { 236 newBat = container.cacheBat[batIdx] 237 container.cacheBat[batIdx] = nil 238 } else { 239 newBat = getNewBatch(bat) 240 } 241 242 for cnt >= options.DefaultBlockMaxRows { 243 for i := range newBat.Vecs { 244 vector.UnionOne(newBat.Vecs[i], bat.Vecs[i], int64(idx)-int64(cacheLen), proc.GetMPool()) 245 } 246 idx++ 247 if idx%int(options.DefaultBlockMaxRows) == 0 { 248 newBat.SetZs(int(options.DefaultBlockMaxRows), proc.GetMPool()) 249 bats = append(bats, newBat) 250 newBat = getNewBatch(bat) 251 cnt -= options.DefaultBlockMaxRows 252 } 253 } 254 } 255 256 if len(bats) == 0 { // implying the end of this operator, the last Batch that batchSize less than DefaultBlockMaxRows 257 if container.cacheBat[batIdx] == nil { 258 container.cacheBat[batIdx] = getNewBatch(bat) 259 } 260 for i := 0; i < bat.Length(); i++ { 261 for j := range container.cacheBat[batIdx].Vecs { 262 vector.UnionOne(container.cacheBat[batIdx].Vecs[j], bat.Vecs[j], int64(i), proc.GetMPool()) 263 } 264 } 265 container.cacheBat[batIdx].SetZs(container.cacheBat[batIdx].Vecs[0].Length(), proc.GetMPool()) 266 } else { 267 if cnt > 0 { // the part less than DefaultBlockMaxRows stored in cacheBat 268 if newBat == nil { 269 newBat = getNewBatch(bat) 270 } 271 for cnt > 0 { 272 for i := range newBat.Vecs { 273 vector.UnionOne(newBat.Vecs[i], bat.Vecs[i], int64(idx)-int64(cacheLen), proc.GetMPool()) 274 } 275 idx++ 276 cnt-- 277 } 278 container.cacheBat[batIdx] = newBat 279 container.cacheBat[batIdx].SetZs(container.cacheBat[batIdx].Vecs[0].Length(), proc.GetMPool()) 280 } 281 } 282 return 283 } 284 285 func getNewBatch(bat *batch.Batch) *batch.Batch { 286 attrs := make([]string, len(bat.Attrs)) 287 copy(attrs, bat.Attrs) 288 newBat := batch.New(true, attrs) 289 for i := range bat.Vecs { 290 newBat.Vecs[i] = vector.New(bat.Vecs[i].GetType()) 291 } 292 return newBat 293 } 294 295 func GenerateWriter(container *WriteS3Container, proc *process.Process) error { 296 segId, err := Srv.GenerateSegment() 297 298 if err != nil { 299 return err 300 } 301 s3, err := fileservice.Get[fileservice.FileService](proc.FileService, defines.SharedFileServiceName) 302 if err != nil { 303 return err 304 } 305 container.writer, err = objectio.NewObjectWriter(segId, s3) 306 if err != nil { 307 return err 308 } 309 container.lengths = container.lengths[:0] 310 return nil 311 } 312 313 // referece to pkg/sql/colexec/order/order.go logic 314 func SortByPrimaryKey(proc *process.Process, bat *batch.Batch, pkIdx []int, m *mpool.MPool) error { 315 // Not-Null Check 316 for i := 0; i < len(pkIdx); i++ { 317 if nulls.Any(bat.Vecs[i].Nsp) { 318 // return moerr.NewConstraintViolation(proc.Ctx, fmt.Sprintf("Column '%s' cannot be null", n.InsertCtx.TableDef.Cols[i].GetName())) 319 return moerr.NewConstraintViolation(proc.Ctx, "Primary key can not be null") 320 } 321 } 322 323 var strCol []string 324 sels := make([]int64, len(bat.Zs)) 325 for i := 0; i < len(bat.Zs); i++ { 326 sels[i] = int64(i) 327 } 328 ovec := bat.GetVector(int32(pkIdx[0])) 329 if ovec.Typ.IsString() { 330 strCol = vector.GetStrVectorValues(ovec) 331 } else { 332 strCol = nil 333 } 334 sort.Sort(false, false, false, sels, ovec, strCol) 335 if len(pkIdx) == 1 { 336 return bat.Shuffle(sels, m) 337 } 338 ps := make([]int64, 0, 16) 339 ds := make([]bool, len(sels)) 340 for i, j := 1, len(pkIdx); i < j; i++ { 341 ps = partition.Partition(sels, ds, ps, ovec) 342 vec := bat.Vecs[pkIdx[i]] 343 if vec.Typ.IsString() { 344 strCol = vector.GetStrVectorValues(vec) 345 } else { 346 strCol = nil 347 } 348 for i, j := 0, len(ps); i < j; i++ { 349 if i == j-1 { 350 sort.Sort(false, false, false, sels[ps[i]:], vec, strCol) 351 } else { 352 sort.Sort(false, false, false, sels[ps[i]:ps[i+1]], vec, strCol) 353 } 354 } 355 ovec = vec 356 } 357 return bat.Shuffle(sels, m) 358 } 359 360 // WriteBlock WriteBlock writes one batch to a buffer and generate related indexes for this batch 361 // For more information, please refer to the comment about func Write in Writer interface 362 func WriteBlock(container *WriteS3Container, bat *batch.Batch, proc *process.Process) error { 363 fd, err := container.writer.Write(bat) 364 365 if err != nil { 366 return err 367 } 368 // atomic.AddUint64(&n.Affected, uint64(bat.Vecs[0].Length())) 369 container.lengths = append(container.lengths, uint64(bat.Vecs[0].Length())) 370 if err := GenerateIndex(container, fd, container.writer, bat); err != nil { 371 return err 372 } 373 374 return nil 375 } 376 377 // GenerateIndex generates relative indexes for the batch writed directly to s3 from cn 378 // For more information, please refer to the comment about func WriteIndex in Writer interface 379 func GenerateIndex(container *WriteS3Container, fd objectio.BlockObject, objectWriter objectio.Writer, bat *batch.Batch) error { 380 for i, mvec := range bat.Vecs { 381 err := getIndexDataFromVec(fd, objectWriter, uint16(i), mvec, container.nameToNullablity[bat.Attrs[i]], container.pk[bat.Attrs[i]]) 382 if err != nil { 383 return err 384 } 385 } 386 return nil 387 } 388 389 func getIndexDataFromVec(block objectio.BlockObject, writer objectio.Writer, 390 idx uint16, 391 vec *vector.Vector, nullAbliaty bool, isPk bool) error { 392 var err error 393 columnData := containers.NewVectorWithSharedMemory(vec, nullAbliaty) 394 zmPos := 0 395 zoneMapWriter := indexwrapper.NewZMWriter() 396 if err = zoneMapWriter.Init(writer, block, common.Plain, idx, uint16(zmPos)); err != nil { 397 return err 398 } 399 err = zoneMapWriter.AddValues(columnData) 400 if err != nil { 401 return err 402 } 403 _, err = zoneMapWriter.Finalize() 404 if err != nil { 405 return err 406 } 407 if !isPk { 408 return nil 409 } 410 bfPos := 1 411 bfWriter := indexwrapper.NewBFWriter() 412 if err = bfWriter.Init(writer, block, common.Plain, idx, uint16(bfPos)); err != nil { 413 return err 414 } 415 if err = bfWriter.AddValues(columnData); err != nil { 416 return err 417 } 418 _, err = bfWriter.Finalize() 419 if err != nil { 420 return err 421 } 422 return nil 423 } 424 425 // WriteEndBlocks WriteEndBlocks write batches in buffer to fileservice(aka s3 in this feature) and get meta data about block on fileservice and put it into metaLocBat 426 // For more information, please refer to the comment about func WriteEnd in Writer interface 427 func WriteEndBlocks(container *WriteS3Container, proc *process.Process, idx int) error { 428 blocks, err := container.writer.WriteEnd(proc.Ctx) 429 if err != nil { 430 return err 431 } 432 for j := range blocks { 433 metaLoc, err := blockio.EncodeMetaLocWithObject( 434 blocks[0].GetExtent(), 435 uint32(container.lengths[j]), 436 blocks, 437 ) 438 if err != nil { 439 return err 440 } 441 container.metaLocBat.Vecs[0].Append(uint16(idx), false, proc.GetMPool()) 442 container.metaLocBat.Vecs[1].Append([]byte(metaLoc), false, proc.GetMPool()) 443 } 444 // for i := range container.unique_writer { 445 // if blocks, err = container.unique_writer[i].WriteEnd(proc.Ctx); err != nil { 446 // return err 447 // } 448 // for j := range blocks { 449 // metaLoc, err := blockio.EncodeMetaLocWithObject( 450 // blocks[0].GetExtent(), 451 // uint32(container.unique_lengths[i][j]), 452 // blocks, 453 // ) 454 // if err != nil { 455 // return err 456 // } 457 // metaLocBat.Vecs[0].Append(uint16(i+1), false, proc.GetMPool()) 458 // metaLocBat.Vecs[1].Append([]byte(metaLoc), false, proc.GetMPool()) 459 // } 460 // } 461 return nil 462 }