github.com/matrixorigin/matrixone@v0.7.0/pkg/vm/engine/disttae/partition.go (about) 1 // Copyright 2022 Matrix Origin 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package disttae 16 17 import ( 18 "bytes" 19 "context" 20 "unsafe" 21 22 "github.com/matrixorigin/matrixone/pkg/catalog" 23 "github.com/matrixorigin/matrixone/pkg/common/moerr" 24 "github.com/matrixorigin/matrixone/pkg/common/moprobe" 25 "github.com/matrixorigin/matrixone/pkg/container/batch" 26 "github.com/matrixorigin/matrixone/pkg/container/types" 27 "github.com/matrixorigin/matrixone/pkg/container/vector" 28 "github.com/matrixorigin/matrixone/pkg/fileservice" 29 "github.com/matrixorigin/matrixone/pkg/pb/api" 30 "github.com/matrixorigin/matrixone/pkg/pb/plan" 31 "github.com/matrixorigin/matrixone/pkg/pb/timestamp" 32 "github.com/matrixorigin/matrixone/pkg/txn/storage/memorystorage/memorytable" 33 "github.com/matrixorigin/matrixone/pkg/txn/storage/memorystorage/memtable" 34 "github.com/matrixorigin/matrixone/pkg/vm/engine" 35 ) 36 37 func NewPartition( 38 columnsIndexDefs []ColumnsIndexDef, 39 ) *Partition { 40 lock := make(chan struct{}, 1) 41 lock <- struct{}{} 42 return &Partition{ 43 lock: lock, 44 data: memtable.NewTable[RowID, DataValue, *DataRow](), 45 columnsIndexDefs: columnsIndexDefs, 46 } 47 } 48 49 type RowID types.Rowid 50 51 func (r RowID) Less(than RowID) bool { 52 return bytes.Compare(r[:], than[:]) < 0 53 } 54 55 type DataValue struct { 56 op uint8 57 value map[string]memtable.Nullable 58 } 59 60 type DataRow struct { 61 rowID RowID 62 value DataValue 63 indexes []memtable.Tuple 64 uniqueIndexes []memtable.Tuple 65 } 66 67 const ( 68 opInsert = iota + 1 69 opDelete 70 ) 71 72 func (d *DataRow) Key() RowID { 73 return d.rowID 74 } 75 76 func (d *DataRow) Value() DataValue { 77 return d.value 78 } 79 80 func (d *DataRow) Indexes() []memtable.Tuple { 81 return d.indexes 82 } 83 84 func (d *DataRow) UniqueIndexes() []memtable.Tuple { 85 return d.uniqueIndexes 86 } 87 88 var _ MVCC = new(Partition) 89 90 func (p *Partition) BlockList(ctx context.Context, ts timestamp.Timestamp, 91 blocks []BlockMeta, entries []Entry) ([]BlockMeta, map[uint64][]int) { 92 blks := make([]BlockMeta, 0, len(blocks)) 93 deletes := make(map[uint64][]int) 94 if len(blocks) == 0 { 95 return blks, deletes 96 } 97 ids := make([]uint64, len(blocks)) 98 for i := range blocks { 99 // if cn can see a appendable block, this block must contain all updates 100 // in cache, no need to do merge read, BlockRead will filter out 101 // invisible and deleted rows with respect to the timestamp 102 if !blocks[i].Info.EntryState { 103 ids[i] = blocks[i].Info.BlockID 104 } 105 } 106 p.IterDeletedRowIDs(ctx, ids, ts, func(rowID RowID) bool { 107 id, offset := catalog.DecodeRowid(types.Rowid(rowID)) 108 deletes[id] = append(deletes[id], int(offset)) 109 return true 110 }) 111 for _, entry := range entries { 112 if entry.typ == DELETE { 113 vs := vector.MustTCols[types.Rowid](entry.bat.GetVector(0)) 114 for _, v := range vs { 115 id, offset := catalog.DecodeRowid(v) 116 deletes[id] = append(deletes[id], int(offset)) 117 } 118 } 119 } 120 for i := range blocks { 121 if _, ok := deletes[blocks[i].Info.BlockID]; !ok { 122 blks = append(blks, blocks[i]) 123 } 124 } 125 return blks, deletes 126 } 127 128 func (*Partition) CheckPoint(ctx context.Context, ts timestamp.Timestamp) error { 129 panic("unimplemented") 130 } 131 132 func (p *Partition) Get(key types.Rowid, ts timestamp.Timestamp) bool { 133 t := memtable.Time{ 134 Timestamp: ts, 135 } 136 tx := memtable.NewTransaction( 137 newMemTableTransactionID(), 138 t, 139 memtable.SnapshotIsolation, 140 ) 141 if _, err := p.data.Get(tx, RowID(key)); err != nil { 142 return false 143 } 144 return true 145 } 146 147 func (p *Partition) Delete(ctx context.Context, b *api.Batch) error { 148 bat, err := batch.ProtoBatchToBatch(b) 149 if err != nil { 150 return err 151 } 152 153 txID := newMemTableTransactionID() 154 155 iter := memorytable.NewBatchIter(bat) 156 for { 157 tuple := iter() 158 if len(tuple) == 0 { 159 break 160 } 161 162 rowID := RowID(tuple[0].Value.(types.Rowid)) 163 ts := tuple[1].Value.(types.TS) 164 t := memtable.Time{ 165 Timestamp: timestamp.Timestamp{ 166 PhysicalTime: ts.Physical(), 167 LogicalTime: ts.Logical(), 168 }, 169 } 170 tx := memtable.NewTransaction(txID, t, memtable.SnapshotIsolation) 171 172 // indexes 173 var indexes []memtable.Tuple 174 // block id, time, op 175 indexes = append(indexes, memtable.Tuple{ 176 index_BlockID_Time_OP, 177 memtable.ToOrdered(rowIDToBlockID(rowID)), 178 ts, 179 memtable.Uint(opDelete), 180 }) 181 182 err := p.data.Upsert(tx, &DataRow{ 183 rowID: rowID, 184 value: DataValue{ 185 op: opDelete, 186 }, 187 indexes: indexes, 188 }) 189 // the reason to ignore, see comments in Insert method 190 if moerr.IsMoErrCode(err, moerr.ErrTxnWriteConflict) { 191 continue 192 } 193 if err != nil { 194 return err 195 } 196 197 if err := tx.Commit(t); err != nil { 198 return err 199 } 200 } 201 202 return nil 203 } 204 205 func (p *Partition) Insert(ctx context.Context, primaryKeyIndex int, 206 b *api.Batch, needCheck bool) error { 207 208 // As an example, lets probe this function. First we want to find a tag so that 209 // if several go routine call this function at the same time, we will not mix them. 210 // the pointer b works. 211 tag := int64(uintptr(unsafe.Pointer(b))) 212 213 // enter probe, only need tag. Adding an extra arg just for demo purpose. 214 moprobe.DisttaePartitionInsert(tag, 1) 215 216 // defer, this is the return probe. Use same tag value 217 defer moprobe.DisttaePartitionInsertRet(tag, 0x1020304050607080) 218 219 bat, err := batch.ProtoBatchToBatch(b) 220 if err != nil { 221 return err 222 } 223 224 txID := newMemTableTransactionID() 225 226 iter := memorytable.NewBatchIter(bat) 227 for { 228 tuple := iter() 229 if len(tuple) == 0 { 230 break 231 } 232 233 rowID := RowID(tuple[0].Value.(types.Rowid)) 234 ts := tuple[1].Value.(types.TS) 235 t := memtable.Time{ 236 Timestamp: timestamp.Timestamp{ 237 PhysicalTime: ts.Physical(), 238 LogicalTime: ts.Logical(), 239 }, 240 } 241 tx := memtable.NewTransaction(txID, t, memtable.SnapshotIsolation) 242 243 // check primary key 244 var primaryKey any 245 if primaryKeyIndex >= 0 { 246 primaryKey = memtable.ToOrdered(tuple[primaryKeyIndex].Value) 247 entries, err := p.data.Index(tx, memtable.Tuple{ 248 index_PrimaryKey, 249 primaryKey, 250 }) 251 if err != nil { 252 return err 253 } 254 if len(entries) > 0 && needCheck { 255 return moerr.NewDuplicate(ctx) 256 } 257 } 258 259 dataValue := DataValue{ 260 op: opInsert, 261 value: make(map[string]memtable.Nullable), 262 } 263 for i := 2; i < len(tuple); i++ { 264 dataValue.value[bat.Attrs[i]] = tuple[i] 265 } 266 267 // indexes 268 var indexes []memtable.Tuple 269 // primary key 270 if primaryKey != nil { 271 indexes = append(indexes, memtable.Tuple{ 272 index_PrimaryKey, 273 primaryKey, 274 }) 275 } 276 // block id, time, op 277 indexes = append(indexes, memtable.Tuple{ 278 index_BlockID_Time_OP, 279 memtable.ToOrdered(rowIDToBlockID(rowID)), 280 ts, 281 memtable.Uint(opInsert), 282 }) 283 // columns indexes 284 for _, def := range p.columnsIndexDefs { 285 index := memtable.Tuple{ 286 def.Name, 287 } 288 for _, col := range def.Columns { 289 index = append(index, memtable.ToOrdered(tuple[col].Value)) 290 } 291 indexes = append(indexes, index) 292 } 293 294 err = p.data.Upsert(tx, &DataRow{ 295 rowID: rowID, 296 value: dataValue, 297 indexes: indexes, 298 }) 299 // if conflict comes up here, probably the checkpoint from dn 300 // has duplicated history versions. As txn write conflict has been 301 // checked in dn, so it is safe to ignore this error 302 if moerr.IsMoErrCode(err, moerr.ErrTxnWriteConflict) { 303 continue 304 } 305 if err != nil { 306 return err 307 } 308 if err := tx.Commit(t); err != nil { 309 return err 310 } 311 } 312 313 return nil 314 } 315 316 func (p *Partition) GC(ts timestamp.Timestamp) error { 317 // remove versions only visible before ts 318 // assuming no transaction is reading or writing 319 t := memtable.Time{ 320 Timestamp: ts, 321 } 322 err := p.data.FilterVersions(func(k RowID, versions []memtable.Version[DataValue]) (filtered []memtable.Version[DataValue], err error) { 323 for _, version := range versions { 324 if version.LockTime.IsZero() { 325 // not deleted 326 filtered = append(filtered, version) 327 continue 328 } 329 if version.LockTime.Equal(t) || 330 version.LockTime.After(t) { 331 // still visible after ts 332 filtered = append(filtered, version) 333 continue 334 } 335 } 336 return 337 }) 338 if err != nil { 339 return err 340 } 341 return nil 342 } 343 344 func (p *Partition) GetRowsByIndex(ts timestamp.Timestamp, index memtable.Tuple, 345 columns []string, deletes map[types.Rowid]uint8) (rows [][]any, err error) { 346 t := memtable.Time{ 347 Timestamp: ts, 348 } 349 tx := memtable.NewTransaction( 350 newMemTableTransactionID(), 351 t, 352 memtable.SnapshotIsolation, 353 ) 354 iter := p.data.NewIndexIter(tx, index, index) 355 for ok := iter.First(); ok; ok = iter.Next() { 356 entry := iter.Item() 357 if _, ok := deletes[types.Rowid(entry.Key)]; ok { 358 continue 359 } 360 data, err := p.data.Get(tx, entry.Key) 361 if err != nil { 362 return nil, err 363 } 364 rows = append(rows, genRow(&data, columns)) 365 } 366 return 367 } 368 369 func (p *Partition) GetRowsByIndexPrefix(ts timestamp.Timestamp, prefix memtable.Tuple) (rows []DataValue, err error) { 370 t := memtable.Time{ 371 Timestamp: ts, 372 } 373 tx := memtable.NewTransaction( 374 newMemTableTransactionID(), 375 t, 376 memtable.SnapshotIsolation, 377 ) 378 iter := p.data.NewIndexIter( 379 tx, 380 append(append(prefix[:0:0], prefix...), memtable.Min), 381 append(append(prefix[:0:0], prefix...), memtable.Max), 382 ) 383 for ok := iter.First(); ok; ok = iter.Next() { 384 entry := iter.Item() 385 data, err := p.data.Get(tx, entry.Key) 386 if err != nil { 387 return nil, err 388 } 389 rows = append(rows, data) 390 } 391 return 392 } 393 394 func rowIDToBlockID(rowID RowID) uint64 { 395 id, _ := catalog.DecodeRowid(types.Rowid(rowID)) 396 return id 397 } 398 399 func (p *Partition) DeleteByBlockID(ctx context.Context, ts timestamp.Timestamp, blockID uint64) error { 400 tx := memtable.NewTransaction(newMemTableTransactionID(), memtable.Time{ 401 Timestamp: ts, 402 }, memtable.SnapshotIsolation) 403 min := memtable.Tuple{ 404 index_BlockID_Time_OP, 405 memtable.ToOrdered(blockID), 406 memtable.Min, 407 memtable.Uint(opInsert), 408 } 409 max := memtable.Tuple{ 410 index_BlockID_Time_OP, 411 memtable.ToOrdered(blockID), 412 memtable.Max, 413 memtable.Uint(opInsert), 414 } 415 iter := p.data.NewIndexIter(tx, min, max) 416 defer iter.Close() 417 for ok := iter.First(); ok; ok = iter.Next() { 418 entry := iter.Item() 419 if err := p.data.Delete(tx, entry.Key); err != nil { 420 return err 421 } 422 } 423 return tx.Commit(tx.Time) 424 } 425 426 func (p *Partition) IterDeletedRowIDs(ctx context.Context, blockIDs []uint64, ts timestamp.Timestamp, fn func(rowID RowID) bool) { 427 tx := memtable.NewTransaction(newMemTableTransactionID(), memtable.Time{ 428 Timestamp: ts, 429 }, memtable.SnapshotIsolation) 430 431 for _, blockID := range blockIDs { 432 min := memtable.Tuple{ 433 index_BlockID_Time_OP, 434 memtable.ToOrdered(blockID), 435 memtable.Min, 436 memtable.Min, 437 } 438 max := memtable.Tuple{ 439 index_BlockID_Time_OP, 440 memtable.ToOrdered(blockID), 441 types.TimestampToTS(ts), 442 memtable.Max, 443 } 444 iter := p.data.NewIndexIter(tx, min, max) 445 defer iter.Close() 446 deleted := make(map[RowID]bool) 447 inserted := make(map[RowID]bool) 448 for ok := iter.First(); ok; ok = iter.Next() { 449 entry := iter.Item() 450 rowID := entry.Key 451 switch entry.Index[3].(memtable.Uint) { 452 case opInsert: 453 inserted[rowID] = true 454 case opDelete: 455 deleted[rowID] = true 456 } 457 } 458 for rowID := range deleted { 459 if !inserted[rowID] { 460 if !fn(rowID) { 461 break 462 } 463 } 464 } 465 } 466 } 467 468 func (p *Partition) Rows( 469 tx *memtable.Transaction, 470 deletes map[types.Rowid]uint8, 471 skipBlocks map[uint64]uint8) (int64, error) { 472 var rows int64 = 0 473 iter := p.data.NewIter(tx) 474 defer iter.Close() 475 for ok := iter.First(); ok; ok = iter.Next() { 476 dataKey, dataValue, err := iter.Read() 477 if err != nil { 478 return 0, err 479 } 480 481 if _, ok := deletes[types.Rowid(dataKey)]; ok { 482 continue 483 } 484 485 if dataValue.op == opDelete { 486 continue 487 } 488 489 if skipBlocks != nil { 490 if _, ok := skipBlocks[rowIDToBlockID(dataKey)]; ok { 491 continue 492 } 493 } 494 rows++ 495 } 496 497 return rows, nil 498 } 499 500 func (p *Partition) NewReader( 501 ctx context.Context, 502 readerNumber int, 503 index memtable.Tuple, 504 defs []engine.TableDef, 505 tableDef *plan.TableDef, 506 skipBlocks map[uint64]uint8, 507 blks []ModifyBlockMeta, 508 ts timestamp.Timestamp, 509 fs fileservice.FileService, 510 entries []Entry, 511 ) ([]engine.Reader, error) { 512 513 t := memtable.Time{ 514 Timestamp: ts, 515 } 516 tx := memtable.NewTransaction( 517 newMemTableTransactionID(), 518 t, 519 memtable.SnapshotIsolation, 520 ) 521 522 inserts := make([]*batch.Batch, 0, len(entries)) 523 deletes := make(map[types.Rowid]uint8) 524 for _, entry := range entries { 525 if entry.typ == INSERT { 526 inserts = append(inserts, entry.bat) 527 } else { 528 if entry.bat.GetVector(0).GetType().Oid == types.T_Rowid { 529 vs := vector.MustTCols[types.Rowid](entry.bat.GetVector(0)) 530 for _, v := range vs { 531 deletes[v] = 0 532 } 533 } 534 } 535 } 536 537 readers := make([]engine.Reader, readerNumber) 538 539 mp := make(map[string]types.Type) 540 colIdxMp := make(map[string]int) 541 if tableDef != nil { 542 for i := range tableDef.Cols { 543 colIdxMp[tableDef.Cols[i].Name] = i 544 } 545 } 546 547 mp[catalog.Row_ID] = types.New(types.T_Rowid, 0, 0, 0) 548 for _, def := range defs { 549 attr, ok := def.(*engine.AttributeDef) 550 if !ok { 551 continue 552 } 553 mp[attr.Attr.Name] = attr.Attr.Type 554 } 555 556 partReader := &PartitionReader{ 557 typsMap: mp, 558 readTime: t, 559 tx: tx, 560 index: index, 561 inserts: inserts, 562 deletes: deletes, 563 skipBlocks: skipBlocks, 564 data: p.data, 565 iter: p.data.NewIter(tx), 566 colIdxMp: colIdxMp, 567 extendId2s3File: make(map[string]int), 568 s3FileService: fs, 569 } 570 if p.txn != nil { 571 partReader.proc = p.txn.proc 572 } 573 readers[0] = partReader 574 if readerNumber == 1 { 575 for i := range blks { 576 readers = append(readers, &blockMergeReader{ 577 fs: fs, 578 ts: ts, 579 ctx: ctx, 580 tableDef: tableDef, 581 sels: make([]int64, 0, 1024), 582 blks: []ModifyBlockMeta{blks[i]}, 583 }) 584 } 585 return []engine.Reader{&mergeReader{readers}}, nil 586 } 587 if len(blks) < readerNumber-1 { 588 for i := range blks { 589 readers[i+1] = &blockMergeReader{ 590 fs: fs, 591 ts: ts, 592 ctx: ctx, 593 tableDef: tableDef, 594 sels: make([]int64, 0, 1024), 595 blks: []ModifyBlockMeta{blks[i]}, 596 } 597 } 598 for j := len(blks) + 1; j < readerNumber; j++ { 599 readers[j] = &emptyReader{} 600 } 601 return readers, nil 602 } 603 step := len(blks) / (readerNumber - 1) 604 if step < 1 { 605 step = 1 606 } 607 for i := 1; i < readerNumber; i++ { 608 if i == readerNumber-1 { 609 readers[i] = &blockMergeReader{ 610 fs: fs, 611 ts: ts, 612 ctx: ctx, 613 tableDef: tableDef, 614 blks: blks[(i-1)*step:], 615 sels: make([]int64, 0, 1024), 616 } 617 } else { 618 readers[i] = &blockMergeReader{ 619 fs: fs, 620 ts: ts, 621 ctx: ctx, 622 tableDef: tableDef, 623 blks: blks[(i-1)*step : i*step], 624 sels: make([]int64, 0, 1024), 625 } 626 } 627 } 628 return readers, nil 629 }