github.com/matrixorigin/matrixone@v1.2.0/pkg/vm/engine/tae/blockio/read.go (about) 1 // Copyright 2021 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package blockio 16 17 import ( 18 "context" 19 "math" 20 "time" 21 22 "github.com/matrixorigin/matrixone/pkg/common/mpool" 23 "github.com/matrixorigin/matrixone/pkg/container/batch" 24 "github.com/matrixorigin/matrixone/pkg/container/nulls" 25 "github.com/matrixorigin/matrixone/pkg/container/types" 26 "github.com/matrixorigin/matrixone/pkg/container/vector" 27 "github.com/matrixorigin/matrixone/pkg/fileservice" 28 "github.com/matrixorigin/matrixone/pkg/logutil" 29 "github.com/matrixorigin/matrixone/pkg/objectio" 30 "github.com/matrixorigin/matrixone/pkg/pb/timestamp" 31 v2 "github.com/matrixorigin/matrixone/pkg/util/metric/v2" 32 "github.com/matrixorigin/matrixone/pkg/vm/engine" 33 "go.uber.org/zap" 34 ) 35 36 type ReadFilter = func([]*vector.Vector) []int32 37 38 func ReadByFilter( 39 ctx context.Context, 40 info *objectio.BlockInfo, 41 inputDeletes []int64, 42 columns []uint16, 43 colTypes []types.Type, 44 ts types.TS, 45 filter ReadFilter, 46 fs fileservice.FileService, 47 mp *mpool.MPool, 48 ) (sels []int32, err error) { 49 bat, release, err := LoadColumns(ctx, columns, colTypes, fs, info.MetaLocation(), mp, fileservice.Policy(0)) 50 if err != nil { 51 return 52 } 53 defer release() 54 var deleteMask *nulls.Nulls 55 56 // merge persisted deletes 57 if !info.DeltaLocation().IsEmpty() { 58 now := time.Now() 59 var persistedDeletes *batch.Batch 60 var persistedByCN bool 61 var release func() 62 // load from storage 63 if persistedDeletes, persistedByCN, release, err = ReadBlockDelete(ctx, info.DeltaLocation(), fs); err != nil { 64 return 65 } 66 defer release() 67 readcost := time.Since(now) 68 var rows *nulls.Nulls 69 var bisect time.Duration 70 if persistedByCN { 71 rows = EvalDeleteRowsByTimestampForDeletesPersistedByCN(persistedDeletes, ts, info.CommitTs) 72 } else { 73 nowx := time.Now() 74 rows = EvalDeleteRowsByTimestamp(persistedDeletes, ts, &info.BlockID) 75 bisect = time.Since(nowx) 76 } 77 if rows != nil { 78 deleteMask = rows 79 } 80 readtotal := time.Since(now) 81 RecordReadDel(readtotal, readcost, bisect) 82 } 83 84 if deleteMask == nil { 85 deleteMask = nulls.NewWithSize(len(inputDeletes)) 86 } 87 88 // merge input deletes 89 for _, row := range inputDeletes { 90 deleteMask.Add(uint64(row)) 91 } 92 93 sels = filter(bat.Vecs) 94 95 // deslect deleted rows from sels 96 if !deleteMask.IsEmpty() { 97 var rows []int32 98 for _, row := range sels { 99 if !deleteMask.Contains(uint64(row)) { 100 rows = append(rows, row) 101 } 102 } 103 sels = rows 104 } 105 return 106 } 107 108 // BlockRead read block data from storage and apply deletes according given timestamp. Caller make sure metaloc is not empty 109 func BlockRead( 110 ctx context.Context, 111 info *objectio.BlockInfo, 112 inputDeletes []int64, 113 columns []uint16, 114 colTypes []types.Type, 115 ts timestamp.Timestamp, 116 filterSeqnums []uint16, 117 filterColTypes []types.Type, 118 filter ReadFilter, 119 fs fileservice.FileService, 120 mp *mpool.MPool, 121 vp engine.VectorPool, 122 policy fileservice.Policy, 123 ) (*batch.Batch, error) { 124 if logutil.GetSkip1Logger().Core().Enabled(zap.DebugLevel) { 125 logutil.Debugf("read block %s, columns %v, types %v", info.BlockID.String(), columns, colTypes) 126 } 127 128 var ( 129 sels []int32 130 err error 131 ) 132 133 if filter != nil && info.Sorted { 134 if sels, err = ReadByFilter( 135 ctx, info, inputDeletes, filterSeqnums, filterColTypes, 136 types.TimestampToTS(ts), filter, fs, mp, 137 ); err != nil { 138 return nil, err 139 } 140 v2.TaskSelReadFilterTotal.Inc() 141 if len(sels) == 0 { 142 RecordReadFilterSelectivity(1, 1) 143 v2.TaskSelReadFilterHit.Inc() 144 } else { 145 RecordReadFilterSelectivity(0, 1) 146 } 147 148 if len(sels) == 0 { 149 result := batch.NewWithSize(len(colTypes)) 150 for i, typ := range colTypes { 151 if vp == nil { 152 result.Vecs[i] = vector.NewVec(typ) 153 } else { 154 result.Vecs[i] = vp.GetVector(typ) 155 } 156 } 157 return result, nil 158 } 159 } 160 161 columnBatch, err := BlockReadInner( 162 ctx, info, inputDeletes, columns, colTypes, 163 types.TimestampToTS(ts), sels, fs, mp, vp, policy, 164 ) 165 if err != nil { 166 return nil, err 167 } 168 169 columnBatch.SetRowCount(columnBatch.Vecs[0].Length()) 170 return columnBatch, nil 171 } 172 173 func BlockCompactionRead( 174 ctx context.Context, 175 location objectio.Location, 176 deletes []int64, 177 seqnums []uint16, 178 colTypes []types.Type, 179 fs fileservice.FileService, 180 mp *mpool.MPool, 181 ) (*batch.Batch, error) { 182 183 loaded, release, err := LoadColumns(ctx, seqnums, colTypes, fs, location, mp, fileservice.Policy(0)) 184 if err != nil { 185 return nil, err 186 } 187 defer release() 188 if len(deletes) == 0 { 189 return loaded, nil 190 } 191 result := batch.NewWithSize(len(loaded.Vecs)) 192 for i, col := range loaded.Vecs { 193 typ := *col.GetType() 194 result.Vecs[i] = vector.NewVec(typ) 195 if err = vector.GetUnionAllFunction(typ, mp)(result.Vecs[i], col); err != nil { 196 break 197 } 198 result.Vecs[i].Shrink(deletes, true) 199 } 200 201 if err != nil { 202 for _, col := range result.Vecs { 203 if col != nil { 204 col.Free(mp) 205 } 206 } 207 return nil, err 208 } 209 result.SetRowCount(result.Vecs[0].Length()) 210 return result, nil 211 } 212 213 func BlockReadInner( 214 ctx context.Context, 215 info *objectio.BlockInfo, 216 inputDeleteRows []int64, 217 columns []uint16, 218 colTypes []types.Type, 219 ts types.TS, 220 selectRows []int32, // if selectRows is not empty, it was already filtered by filter 221 fs fileservice.FileService, 222 mp *mpool.MPool, 223 vp engine.VectorPool, 224 policy fileservice.Policy, 225 ) (result *batch.Batch, err error) { 226 var ( 227 rowidPos int 228 deletedRows []int64 229 deleteMask nulls.Bitmap 230 loaded *batch.Batch 231 release func() 232 ) 233 234 // read block data from storage specified by meta location 235 if loaded, rowidPos, deleteMask, release, err = readBlockData( 236 ctx, columns, colTypes, info, ts, fs, mp, vp, policy, 237 ); err != nil { 238 return 239 } 240 defer release() 241 242 // assemble result batch for return 243 result = batch.NewWithSize(len(loaded.Vecs)) 244 245 if len(selectRows) > 0 { 246 // NOTE: it always goes here if there is a filter and the block is sorted 247 // and there are selected rows after applying the filter and delete mask 248 249 // build rowid column if needed 250 if rowidPos >= 0 { 251 if loaded.Vecs[rowidPos], err = buildRowidColumn( 252 info, selectRows, mp, vp, 253 ); err != nil { 254 return 255 } 256 } 257 258 // assemble result batch only with selected rows 259 for i, col := range loaded.Vecs { 260 typ := *col.GetType() 261 if typ.Oid == types.T_Rowid { 262 result.Vecs[i] = col 263 continue 264 } 265 if vp == nil { 266 result.Vecs[i] = vector.NewVec(typ) 267 } else { 268 result.Vecs[i] = vp.GetVector(typ) 269 } 270 if err = result.Vecs[i].Union(col, selectRows, mp); err != nil { 271 break 272 } 273 } 274 if err != nil { 275 for _, col := range result.Vecs { 276 if col != nil { 277 col.Free(mp) 278 } 279 } 280 } 281 return 282 } 283 284 // read deletes from storage specified by delta location 285 if !info.DeltaLocation().IsEmpty() { 286 var deletes *batch.Batch 287 var persistedByCN bool 288 var release func() 289 now := time.Now() 290 // load from storage 291 if deletes, persistedByCN, release, err = ReadBlockDelete(ctx, info.DeltaLocation(), fs); err != nil { 292 return 293 } 294 defer release() 295 readcost := time.Since(now) 296 297 // eval delete rows by timestamp 298 var rows *nulls.Nulls 299 var bisect time.Duration 300 if persistedByCN { 301 rows = EvalDeleteRowsByTimestampForDeletesPersistedByCN(deletes, ts, info.CommitTs) 302 } else { 303 nowx := time.Now() 304 rows = EvalDeleteRowsByTimestamp(deletes, ts, &info.BlockID) 305 bisect = time.Since(nowx) 306 } 307 308 // merge delete rows 309 deleteMask.Merge(rows) 310 311 readtotal := time.Since(now) 312 RecordReadDel(readtotal, readcost, bisect) 313 314 if logutil.GetSkip1Logger().Core().Enabled(zap.DebugLevel) { 315 logutil.Debugf( 316 "blockread %s read delete %d: base %s filter out %v\n", 317 info.BlockID.String(), deletes.RowCount(), ts.ToString(), deleteMask.Count()) 318 } 319 } 320 321 // merge deletes from input 322 // deletes from storage + deletes from input 323 for _, row := range inputDeleteRows { 324 deleteMask.Add(uint64(row)) 325 } 326 327 // Note: it always goes here if no filter or the block is not sorted 328 329 // transform delete mask to deleted rows 330 // TODO: avoid this transformation 331 if !deleteMask.IsEmpty() { 332 deletedRows = deleteMask.ToI64Arrary() 333 // logutil.Debugf("deleted/length: %d/%d=%f", 334 // len(deletedRows), 335 // loaded.Vecs[0].Length(), 336 // float64(len(deletedRows))/float64(loaded.Vecs[0].Length())) 337 } 338 339 // build rowid column if needed 340 if rowidPos >= 0 { 341 if loaded.Vecs[rowidPos], err = buildRowidColumn( 342 info, nil, mp, vp, 343 ); err != nil { 344 return 345 } 346 } 347 348 // assemble result batch 349 for i, col := range loaded.Vecs { 350 typ := *col.GetType() 351 352 if typ.Oid == types.T_Rowid { 353 // rowid is already allocted by the mpool, no need to create a new vector 354 result.Vecs[i] = col 355 } else { 356 // for other types, we need to create a new vector 357 if vp == nil { 358 result.Vecs[i] = vector.NewVec(typ) 359 } else { 360 result.Vecs[i] = vp.GetVector(typ) 361 } 362 // copy the data from loaded vector to result vector 363 // TODO: avoid this allocation and copy 364 if err = vector.GetUnionAllFunction(typ, mp)(result.Vecs[i], col); err != nil { 365 break 366 } 367 } 368 369 // shrink the vector by deleted rows 370 if len(deletedRows) > 0 { 371 result.Vecs[i].Shrink(deletedRows, true) 372 } 373 } 374 375 // if any error happens, free the result batch allocated 376 if err != nil { 377 for _, col := range result.Vecs { 378 if col != nil { 379 col.Free(mp) 380 } 381 } 382 } 383 return 384 } 385 386 func getRowsIdIndex(colIndexes []uint16, colTypes []types.Type) (int, []uint16, []types.Type) { 387 idx := -1 388 for i, typ := range colTypes { 389 if typ.Oid == types.T_Rowid { 390 idx = i 391 break 392 } 393 } 394 if idx < 0 { 395 return idx, colIndexes, colTypes 396 } 397 idxes := make([]uint16, 0, len(colTypes)-1) 398 typs := make([]types.Type, 0, len(colTypes)-1) 399 idxes = append(idxes, colIndexes[:idx]...) 400 idxes = append(idxes, colIndexes[idx+1:]...) 401 typs = append(typs, colTypes[:idx]...) 402 typs = append(typs, colTypes[idx+1:]...) 403 return idx, idxes, typs 404 } 405 406 func buildRowidColumn( 407 info *objectio.BlockInfo, 408 sels []int32, 409 m *mpool.MPool, 410 vp engine.VectorPool, 411 ) (col *vector.Vector, err error) { 412 if vp == nil { 413 col = vector.NewVec(objectio.RowidType) 414 } else { 415 col = vp.GetVector(objectio.RowidType) 416 } 417 if len(sels) == 0 { 418 err = objectio.ConstructRowidColumnTo( 419 col, 420 &info.BlockID, 421 0, 422 info.MetaLocation().Rows(), 423 m, 424 ) 425 } else { 426 err = objectio.ConstructRowidColumnToWithSels( 427 col, 428 &info.BlockID, 429 sels, 430 m, 431 ) 432 } 433 if err != nil { 434 col.Free(m) 435 col = nil 436 } 437 return 438 } 439 440 func readBlockData( 441 ctx context.Context, 442 colIndexes []uint16, 443 colTypes []types.Type, 444 info *objectio.BlockInfo, 445 ts types.TS, 446 fs fileservice.FileService, 447 m *mpool.MPool, 448 vp engine.VectorPool, 449 policy fileservice.Policy, 450 ) (bat *batch.Batch, rowidPos int, deleteMask nulls.Bitmap, release func(), err error) { 451 rowidPos, idxes, typs := getRowsIdIndex(colIndexes, colTypes) 452 453 readColumns := func(cols []uint16) (result *batch.Batch, loaded *batch.Batch, err error) { 454 if len(cols) == 0 && rowidPos >= 0 { 455 // only read rowid column on non appendable block, return early 456 result = batch.NewWithSize(1) 457 // result.Vecs[0] = rowid 458 release = func() {} 459 return 460 } 461 462 if loaded, release, err = LoadColumns(ctx, cols, typs, fs, info.MetaLocation(), m, policy); err != nil { 463 return 464 } 465 466 colPos := 0 467 result = batch.NewWithSize(len(colTypes)) 468 for i, typ := range colTypes { 469 if typ.Oid != types.T_Rowid { 470 result.Vecs[i] = loaded.Vecs[colPos] 471 colPos++ 472 } 473 } 474 return 475 } 476 477 readABlkColumns := func(cols []uint16) (result *batch.Batch, deletes nulls.Bitmap, err error) { 478 var loaded *batch.Batch 479 // appendable block should be filtered by committs 480 cols = append(cols, objectio.SEQNUM_COMMITTS, objectio.SEQNUM_ABORT) // committs, aborted 481 482 // no need to add typs, the two columns won't be generated 483 if result, loaded, err = readColumns(cols); err != nil { 484 return 485 } 486 487 t0 := time.Now() 488 aborts := vector.MustFixedCol[bool](loaded.Vecs[len(loaded.Vecs)-1]) 489 commits := vector.MustFixedCol[types.TS](loaded.Vecs[len(loaded.Vecs)-2]) 490 for i := 0; i < len(commits); i++ { 491 if aborts[i] || commits[i].Greater(&ts) { 492 deletes.Add(uint64(i)) 493 } 494 } 495 logutil.Debugf( 496 "blockread %s scan filter cost %v: base %s filter out %v\n ", 497 info.BlockID.String(), time.Since(t0), ts.ToString(), deletes.Count()) 498 return 499 } 500 501 if info.EntryState { 502 bat, deleteMask, err = readABlkColumns(idxes) 503 } else { 504 bat, _, err = readColumns(idxes) 505 } 506 507 return 508 } 509 510 func ReadBlockDelete(ctx context.Context, deltaloc objectio.Location, fs fileservice.FileService) (bat *batch.Batch, isPersistedByCN bool, release func(), err error) { 511 isPersistedByCN, err = IsPersistedByCN(ctx, deltaloc, fs) 512 if err != nil { 513 return 514 } 515 bat, release, err = ReadBlockDeleteBySchema(ctx, deltaloc, fs, isPersistedByCN) 516 return 517 } 518 519 func ReadBlockDeleteBySchema(ctx context.Context, deltaloc objectio.Location, fs fileservice.FileService, isPersistedByCN bool) (bat *batch.Batch, release func(), err error) { 520 var cols []uint16 521 if isPersistedByCN { 522 cols = []uint16{0, 1} 523 } else { 524 cols = []uint16{0, 1, 2, 3} 525 } 526 bat, release, err = LoadTombstoneColumns(ctx, cols, nil, fs, deltaloc, nil) 527 return 528 } 529 530 func IsPersistedByCN(ctx context.Context, deltaloc objectio.Location, fs fileservice.FileService) (bool, error) { 531 objectMeta, err := objectio.FastLoadObjectMeta(ctx, &deltaloc, false, fs) 532 if err != nil { 533 return false, err 534 } 535 meta, ok := objectMeta.TombstoneMeta() 536 if !ok { 537 meta = objectMeta.MustDataMeta() 538 } 539 blkmeta := meta.GetBlockMeta(uint32(deltaloc.ID())) 540 columnCount := blkmeta.GetColumnCount() 541 return columnCount == 2, nil 542 } 543 544 func EvalDeleteRowsByTimestamp(deletes *batch.Batch, ts types.TS, blockid *types.Blockid) (rows *nulls.Bitmap) { 545 if deletes == nil { 546 return 547 } 548 // record visible delete rows 549 rows = nulls.NewWithSize(64) 550 551 rowids := vector.MustFixedCol[types.Rowid](deletes.Vecs[0]) 552 tss := vector.MustFixedCol[types.TS](deletes.Vecs[1]) 553 aborts := deletes.Vecs[3] 554 555 start, end := FindIntervalForBlock(rowids, blockid) 556 557 for i := start; i < end; i++ { 558 abort := vector.GetFixedAt[bool](aborts, i) 559 if abort || tss[i].Greater(&ts) { 560 continue 561 } 562 row := rowids[i].GetRowOffset() 563 rows.Add(uint64(row)) 564 } 565 return 566 } 567 568 func EvalDeleteRowsByTimestampForDeletesPersistedByCN(deletes *batch.Batch, ts types.TS, committs types.TS) (rows *nulls.Bitmap) { 569 if deletes == nil || ts.Less(&committs) { 570 return 571 } 572 // record visible delete rows 573 rows = nulls.NewWithSize(0) 574 rowids := vector.MustFixedCol[types.Rowid](deletes.Vecs[0]) 575 576 for _, rowid := range rowids { 577 row := rowid.GetRowOffset() 578 rows.Add(uint64(row)) 579 } 580 return 581 } 582 583 // BlockPrefetch is the interface for cn to call read ahead 584 // columns Which columns should be taken for columns 585 // service fileservice 586 // infos [s3object name][block] 587 func BlockPrefetch(idxes []uint16, service fileservice.FileService, infos [][]*objectio.BlockInfo, prefetchFile bool) error { 588 // Generate prefetch task 589 for i := range infos { 590 // build reader 591 pref, err := BuildPrefetchParams(service, infos[i][0].MetaLocation()) 592 if err != nil { 593 return err 594 } 595 for _, info := range infos[i] { 596 pref.AddBlock(idxes, []uint16{info.MetaLocation().ID()}) 597 if !info.DeltaLocation().IsEmpty() { 598 // Need to read all delete 599 err = PrefetchTombstone([]uint16{0, 1, 2}, []uint16{info.DeltaLocation().ID()}, service, info.DeltaLocation()) 600 if err != nil { 601 return err 602 } 603 } 604 } 605 pref.prefetchFile = prefetchFile 606 err = pipeline.Prefetch(pref) 607 if err != nil { 608 return err 609 } 610 } 611 return nil 612 } 613 614 func RecordReadDel(total, read, bisect time.Duration) { 615 pipeline.stats.selectivityStats.RecordReadDel(total, read, bisect) 616 } 617 618 func RecordReadFilterSelectivity(hit, total int) { 619 pipeline.stats.selectivityStats.RecordReadFilterSelectivity(hit, total) 620 } 621 622 func RecordBlockSelectivity(hit, total int) { 623 pipeline.stats.selectivityStats.RecordBlockSelectivity(hit, total) 624 } 625 626 func RecordColumnSelectivity(hit, total int) { 627 pipeline.stats.selectivityStats.RecordColumnSelectivity(hit, total) 628 } 629 630 func ExportSelectivityString() string { 631 return pipeline.stats.selectivityStats.ExportString() 632 } 633 634 func FindIntervalForBlock(rowids []types.Rowid, id *types.Blockid) (start int, end int) { 635 lowRowid := objectio.NewRowid(id, 0) 636 highRowid := objectio.NewRowid(id, math.MaxUint32) 637 i, j := 0, len(rowids) 638 for i < j { 639 m := (i + j) / 2 640 // first value >= lowRowid 641 if !rowids[m].Less(*lowRowid) { 642 j = m 643 } else { 644 i = m + 1 645 } 646 } 647 start = i 648 649 i, j = 0, len(rowids) 650 for i < j { 651 m := (i + j) / 2 652 // first value > highRowid 653 if highRowid.Less(rowids[m]) { 654 j = m 655 } else { 656 i = m + 1 657 } 658 } 659 end = i 660 return 661 }