github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowcontainer/numbered_row_container.go (about) 1 // Copyright 2020 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package rowcontainer 12 13 import ( 14 "container/heap" 15 "context" 16 "fmt" 17 "math" 18 "sync" 19 20 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/diskmap" 21 "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" 22 "github.com/cockroachdb/cockroach/pkg/sql/sqlbase" 23 "github.com/cockroachdb/cockroach/pkg/sql/types" 24 "github.com/cockroachdb/cockroach/pkg/util/encoding" 25 "github.com/cockroachdb/cockroach/pkg/util/log" 26 "github.com/cockroachdb/cockroach/pkg/util/mon" 27 "github.com/cockroachdb/errors" 28 ) 29 30 // DiskBackedNumberedRowContainer that stores a map from idx => row, where idx is a 31 // 0-based dense numbering. Optionally, if deDup is true, it can de-duplicate the 32 // rows before assigning a number. It spills to disk if needed. 33 type DiskBackedNumberedRowContainer struct { 34 deDup bool 35 rc *DiskBackedRowContainer 36 37 deduper DeDupingRowContainer 38 39 storedTypes []*types.T 40 idx int // the index of the next row to be added into the container 41 42 rowIter *numberedDiskRowIterator 43 // cacheMap is a map used in the implementation of rowIter that is kept 44 // in the container to avoid repeated allocation. 45 cacheMap map[int]*cacheElement 46 rowIterMemAcc mon.BoundAccount 47 DisableCache bool 48 } 49 50 // NewDiskBackedNumberedRowContainer creates a DiskBackedNumberedRowContainer. 51 // 52 // Arguments: 53 // - deDup is true if it should de-duplicate. 54 // - types is the schema of rows that will be added to this container. 55 // - evalCtx defines the context. 56 // - engine is the underlying store that rows are stored on when the container 57 // spills to disk. 58 // - memoryMonitor is used to monitor this container's memory usage. 59 // - diskMonitor is used to monitor this container's disk usage. 60 // - rowCapacity (if not 0) specifies the number of rows in-memory container 61 // should be preallocated for. 62 func NewDiskBackedNumberedRowContainer( 63 deDup bool, 64 types []*types.T, 65 evalCtx *tree.EvalContext, 66 engine diskmap.Factory, 67 memoryMonitor *mon.BytesMonitor, 68 diskMonitor *mon.BytesMonitor, 69 rowCapacity int, 70 ) *DiskBackedNumberedRowContainer { 71 d := &DiskBackedNumberedRowContainer{ 72 deDup: deDup, 73 storedTypes: types, 74 rowIterMemAcc: memoryMonitor.MakeBoundAccount(), 75 } 76 d.rc = &DiskBackedRowContainer{} 77 d.rc.Init(nil /*ordering*/, types, evalCtx, engine, memoryMonitor, diskMonitor, rowCapacity) 78 if deDup { 79 ordering := make(sqlbase.ColumnOrdering, len(types)) 80 for i := range types { 81 ordering[i].ColIdx = i 82 ordering[i].Direction = encoding.Ascending 83 } 84 deduper := &DiskBackedRowContainer{} 85 deduper.Init(ordering, types, evalCtx, engine, memoryMonitor, diskMonitor, rowCapacity) 86 deduper.DoDeDuplicate() 87 d.deduper = deduper 88 } 89 return d 90 } 91 92 // Len returns the number of rows in this container. 93 func (d *DiskBackedNumberedRowContainer) Len() int { 94 return d.idx 95 } 96 97 // UsingDisk returns whether the primary container is using disk. 98 func (d *DiskBackedNumberedRowContainer) UsingDisk() bool { 99 return d.rc.UsingDisk() 100 } 101 102 // Spilled returns whether or not the primary container spilled to disk in its 103 // lifetime. 104 func (d *DiskBackedNumberedRowContainer) Spilled() bool { 105 return d.rc.Spilled() 106 } 107 108 // testingSpillToDisk is for tests to spill the container(s) 109 // to disk. 110 func (d *DiskBackedNumberedRowContainer) testingSpillToDisk(ctx context.Context) error { 111 if !d.rc.UsingDisk() { 112 if err := d.rc.SpillToDisk(ctx); err != nil { 113 return err 114 } 115 } 116 if d.deDup && !d.deduper.(*DiskBackedRowContainer).UsingDisk() { 117 if err := d.deduper.(*DiskBackedRowContainer).SpillToDisk(ctx); err != nil { 118 return err 119 } 120 } 121 return nil 122 } 123 124 // AddRow tries to add a row. It returns the position of the 125 // row in the container. 126 func (d *DiskBackedNumberedRowContainer) AddRow( 127 ctx context.Context, row sqlbase.EncDatumRow, 128 ) (int, error) { 129 if d.deDup { 130 assignedIdx, err := d.deduper.AddRowWithDeDup(ctx, row) 131 if err != nil { 132 return 0, err 133 } 134 if assignedIdx < d.idx { 135 // Existing row. 136 return assignedIdx, nil 137 } else if assignedIdx != d.idx { 138 panic(fmt.Sprintf("DiskBackedNumberedRowContainer bug: assignedIdx %d != d.idx %d", 139 assignedIdx, d.idx)) 140 } 141 // Else assignedIdx == d.idx, so a new row. 142 } 143 idx := d.idx 144 // An error in AddRow() will cause the two row containers 145 // to no longer be in-step with each other wrt the numbering 146 // but that is not a concern since the caller will not 147 // continue using d after an error. 148 d.idx++ 149 return idx, d.rc.AddRow(ctx, row) 150 } 151 152 // SetupForRead must be called before calling GetRow(). No more AddRow() calls 153 // are permitted (before UnsafeReset()). See the comment for 154 // NumberedDiskRowIterator for how we use the future accesses. 155 func (d *DiskBackedNumberedRowContainer) SetupForRead(ctx context.Context, accesses [][]int) { 156 if !d.rc.UsingDisk() { 157 return 158 } 159 rowIter := d.rc.drc.newNumberedIterator(ctx) 160 meanBytesPerRow := d.rc.drc.MeanEncodedRowBytes() 161 if meanBytesPerRow == 0 { 162 meanBytesPerRow = 100 // arbitrary 163 } 164 // TODO(sumeer): make bytesPerSSBlock a parameter to 165 // NewDiskBackedNumberedRowContainer. 166 const bytesPerSSBlock = 32 * 1024 167 meanRowsPerSSBlock := bytesPerSSBlock / meanBytesPerRow 168 const maxCacheSize = 4096 169 cacheSize := maxCacheSize 170 if d.DisableCache { 171 // This is not an efficient way to disable the cache, but ok for tests. 172 cacheSize = 0 173 } 174 if d.cacheMap == nil { 175 d.cacheMap = make(map[int]*cacheElement) 176 } 177 d.rowIter = newNumberedDiskRowIterator( 178 ctx, rowIter, accesses, meanRowsPerSSBlock, cacheSize, d.cacheMap, &d.rowIterMemAcc) 179 } 180 181 // GetRow returns a row with the given index. If skip is true the row is not 182 // actually read and just indicates a read that is being skipped. It is used 183 // to maintain synchronization with the future, since the caller can skip 184 // accesses for semi-joins and anti-joins. 185 func (d *DiskBackedNumberedRowContainer) GetRow( 186 ctx context.Context, idx int, skip bool, 187 ) (sqlbase.EncDatumRow, error) { 188 if !d.rc.UsingDisk() { 189 if skip { 190 return nil, nil 191 } 192 return d.rc.mrc.EncRow(idx), nil 193 } 194 return d.rowIter.getRow(ctx, idx, skip) 195 } 196 197 // UnsafeReset resets this container to be reused. 198 func (d *DiskBackedNumberedRowContainer) UnsafeReset(ctx context.Context) error { 199 if d.rowIter != nil { 200 d.rowIter.close() 201 d.rowIterMemAcc.Clear(ctx) 202 d.rowIter = nil 203 } 204 d.idx = 0 205 if err := d.rc.UnsafeReset(ctx); err != nil { 206 return err 207 } 208 if d.deduper != nil { 209 if err := d.deduper.UnsafeReset(ctx); err != nil { 210 return err 211 } 212 } 213 return nil 214 } 215 216 // Close closes the container. 217 func (d *DiskBackedNumberedRowContainer) Close(ctx context.Context) { 218 if d.rowIter != nil { 219 d.rowIter.close() 220 } 221 d.rowIterMemAcc.Close(ctx) 222 d.rc.Close(ctx) 223 if d.deduper != nil { 224 d.deduper.Close(ctx) 225 } 226 } 227 228 // numberedDiskRowIterator wraps a numberedRowIterator and adds two pieces 229 // of functionality: 230 // - decides between seek and next when positioning the iterator. 231 // - maintains a cache. 232 // 233 // Cache: 234 // 235 // The callers of GetRow() know the full pattern of future accesses, 236 // represented as a [][]int where the int is the row number. Within a slice 237 // they access in increasing order of index (since forward iteration is 238 // cheaper), but across different slices there is no such constraint. Caching 239 // policies like LRU work without knowing the future, and use "recency" of row 240 // R (number of other distinct rows accessed since last access to row R) as a 241 // proxy for "reuse distance" of row R (number of distinct rows that will be 242 // accessed from one access of R to the next access of R). More sophisticated 243 // policies try to use the recently observed reuse distance as a predictor for 244 // the future reuse distance. In our case we know the exact reuse distance 245 // from each access of row R to its next access, so one can construct an 246 // optimal cache policy for a certain cache size: keep track of the current 247 // reuse distance for each element of the cache and evict the one with the 248 // highest reuse distance. A cache miss causes the retrieved entry to be added 249 // to a full cache only if its reuse distance to the next access is less than 250 // the highest reuse distance currently in the cache. This optimality requires 251 // some book-keeping overhead: 252 // 253 // - A map with O(R) entries where R is the number of unique rows that will be 254 // accessed and an overall size proportional to the total number of accesses. 255 // Overall this is within a constant factor of [][]int, but the constant could 256 // be high. Note that we need this map because when doing Next() on the iterator 257 // we encounter entries different from the ones that caused this cache miss 258 // and we need to decide whether to cache them -- if we had a random access 259 // iterator such that sequential access was the same cost as random 260 // access, then a single []int with the next reuse position for each access 261 // would have sufficed. 262 // - A heap containing the rows in the cache that is updated on each cache hit, 263 // and whenever a row is evicted or added to the cache. This is O(log N) where 264 // N is the number of entries in the cache. 265 // 266 // Overall, this may be too much memory and cpu overhead for not enough 267 // benefit, but it will put an upper bound on what we can achieve with a 268 // cache. And for inverted index queries involving intersection it is possible 269 // that the row container contains far more rows than the number of unique 270 // rows that will be accessed, so a small cache which knows the future could 271 // be very beneficial. One motivation for this approach was that #48118 272 // mentioned low observed cache hit rates with a simpler approach. And since 273 // we turn off ssblock caching for these underlying storage engine, the 274 // cost of a cache miss is high. 275 // 276 // TODO(sumeer): 277 // - Use some realistic inverted index workloads (including geospatial) to 278 // measure the effect of this cache. 279 type numberedDiskRowIterator struct { 280 rowIter *numberedRowIterator 281 // After creation, the rowIter is not positioned. isPositioned transitions 282 // once from false => true. 283 isPositioned bool 284 // The current index the rowIter is positioned at, when isPositioned == true. 285 idxRowIter int 286 // The mean number of rows per ssblock. 287 meanRowsPerSSBlock int 288 // The maximum number of rows in the cache. This can be shrunk under memory 289 // pressure. 290 maxCacheSize int 291 memAcc *mon.BoundAccount 292 293 // The cache. It contains an entry for all the rows that will be accessed, 294 // and not just the ones for which we currently have a cached EncDatumRow. 295 cache map[int]*cacheElement 296 // The current access index in the sequence of all the accesses. This is 297 // used to know where we are in the known future. 298 accessIdx int 299 // A max heap containing only the rows for which we have a cached 300 // EncDatumRow. The top element has the highest nextAccess and is the 301 // best candidate to evict. 302 cacheHeap cacheMaxNextAccessHeap 303 datumAlloc sqlbase.DatumAlloc 304 rowAlloc sqlbase.EncDatumRowAlloc 305 306 hitCount int 307 missCount int 308 } 309 310 type cacheElement struct { 311 // The future accesses for this row, expressed as the accessIdx when it will 312 // happen. We update this slice to remove the first entry whenever an access 313 // happens, so when non-empty, accesses[0] represents the next access, and 314 // when empty there are no more accesses left. 315 accesses []int 316 // row is non-nil for a cached row. 317 row sqlbase.EncDatumRow 318 // When row is non-nil, this is the element in the heap. 319 heapElement cacheRowHeapElement 320 // Used only when initializing accesses, so that we can allocate a single 321 // shared slice for accesses across all cacheElements. 322 numAccesses int 323 } 324 325 var cacheElementSyncPool = sync.Pool{ 326 New: func() interface{} { 327 return &cacheElement{} 328 }, 329 } 330 331 func freeCacheElement(elem *cacheElement) { 332 elem.accesses = nil 333 elem.row = nil 334 elem.numAccesses = 0 335 cacheElementSyncPool.Put(elem) 336 } 337 338 func newCacheElement() *cacheElement { 339 return cacheElementSyncPool.Get().(*cacheElement) 340 } 341 342 type cacheRowHeapElement struct { 343 // The index of this cached row. 344 rowIdx int 345 // The next access of this cached row. 346 nextAccess int 347 // The index in the heap. 348 heapIdx int 349 } 350 351 type cacheMaxNextAccessHeap []*cacheRowHeapElement 352 353 func (h cacheMaxNextAccessHeap) Len() int { return len(h) } 354 func (h cacheMaxNextAccessHeap) Less(i, j int) bool { 355 return h[i].nextAccess > h[j].nextAccess 356 } 357 func (h cacheMaxNextAccessHeap) Swap(i, j int) { 358 h[i], h[j] = h[j], h[i] 359 h[i].heapIdx = i 360 h[j].heapIdx = j 361 } 362 func (h *cacheMaxNextAccessHeap) Push(x interface{}) { 363 n := len(*h) 364 elem := x.(*cacheRowHeapElement) 365 elem.heapIdx = n 366 *h = append(*h, elem) 367 } 368 func (h *cacheMaxNextAccessHeap) Pop() interface{} { 369 old := *h 370 n := len(old) 371 elem := old[n-1] 372 elem.heapIdx = -1 373 *h = old[0 : n-1] 374 return elem 375 } 376 377 // TODO(sumeer): memory accounting for map and heap. 378 func newNumberedDiskRowIterator( 379 _ context.Context, 380 rowIter *numberedRowIterator, 381 accesses [][]int, 382 meanRowsPerSSBlock int, 383 maxCacheSize int, 384 cache map[int]*cacheElement, 385 memAcc *mon.BoundAccount, 386 ) *numberedDiskRowIterator { 387 n := &numberedDiskRowIterator{ 388 rowIter: rowIter, 389 meanRowsPerSSBlock: meanRowsPerSSBlock, 390 maxCacheSize: maxCacheSize, 391 memAcc: memAcc, 392 cache: cache, 393 } 394 var numAccesses int 395 for _, accSlice := range accesses { 396 for _, rowIdx := range accSlice { 397 elem := n.cache[rowIdx] 398 if elem == nil { 399 elem = newCacheElement() 400 elem.heapElement.rowIdx = rowIdx 401 n.cache[rowIdx] = elem 402 } 403 elem.numAccesses++ 404 numAccesses++ 405 } 406 } 407 allAccesses := make([]int, numAccesses) 408 accessIdx := 0 409 for _, accSlice := range accesses { 410 for _, rowIdx := range accSlice { 411 elem := n.cache[rowIdx] 412 if elem.accesses == nil { 413 // Sub-slice that can grow up to elem.numAccesses 414 elem.accesses = allAccesses[0:0:elem.numAccesses] 415 allAccesses = allAccesses[elem.numAccesses:] 416 } 417 elem.accesses = append(elem.accesses, accessIdx) 418 accessIdx++ 419 } 420 } 421 return n 422 } 423 424 func (n *numberedDiskRowIterator) close() { 425 n.rowIter.Close() 426 for k, v := range n.cache { 427 freeCacheElement(v) 428 delete(n.cache, k) 429 } 430 } 431 432 func (n *numberedDiskRowIterator) getRow( 433 ctx context.Context, idx int, skip bool, 434 ) (sqlbase.EncDatumRow, error) { 435 thisAccessIdx := n.accessIdx 436 n.accessIdx++ 437 elem, ok := n.cache[idx] 438 if !ok { 439 return nil, errors.Errorf("caller is accessing a row that was not specified up front") 440 } 441 if len(elem.accesses) == 0 || elem.accesses[0] != thisAccessIdx { 442 return nil, errors.Errorf("caller is no longer synchronized with future accesses") 443 } 444 elem.accesses = elem.accesses[1:] 445 var nextAccess int 446 if len(elem.accesses) > 0 { 447 nextAccess = elem.accesses[0] 448 } else { 449 nextAccess = math.MaxInt32 450 } 451 452 // Check for cache hit. This also updates the heap position, 453 // which we need to do even for skip == true. 454 if elem.row != nil { 455 n.hitCount++ 456 elem.heapElement.nextAccess = nextAccess 457 heap.Fix(&n.cacheHeap, elem.heapElement.heapIdx) 458 if skip { 459 return nil, nil 460 } 461 return elem.row, nil 462 } 463 464 // Cache miss. 465 n.missCount++ 466 // If skip, we can just return. 467 if skip { 468 return nil, nil 469 } 470 471 // Need to position the rowIter. We could add Prev(), since the engine supports 472 // it, if benchmarks indicate it would help. For now we just Seek() for that 473 // case. 474 if n.isPositioned && idx >= n.idxRowIter && (idx-n.idxRowIter <= n.meanRowsPerSSBlock) { 475 // Need to move forward, possibly within the same ssblock, so use Next(). 476 // It is possible we are already positioned at the right place. 477 for i := idx - n.idxRowIter; i > 0; { 478 n.rowIter.Next() 479 if valid, err := n.rowIter.Valid(); err != nil || !valid { 480 if err != nil { 481 return nil, err 482 } 483 return nil, errors.Errorf("caller is asking for index higher than any added index") 484 } 485 n.idxRowIter++ 486 i-- 487 if i == 0 { 488 break 489 } 490 // i > 0. This is before the row we want to return, but it may 491 // be worthwhile to cache it. 492 preElem, ok := n.cache[n.idxRowIter] 493 if !ok { 494 // This is a row that is never accessed. 495 continue 496 } 497 if preElem.row != nil { 498 // Already in cache. 499 continue 500 } 501 if len(preElem.accesses) == 0 { 502 // No accesses left. 503 continue 504 } 505 if err := n.tryAddCache(ctx, preElem); err != nil { 506 return nil, err 507 } 508 } 509 // Try adding to cache 510 return n.tryAddCacheAndReturnRow(ctx, elem) 511 } 512 n.rowIter.seekToIndex(idx) 513 n.isPositioned = true 514 n.idxRowIter = idx 515 if valid, err := n.rowIter.Valid(); err != nil || !valid { 516 if err != nil { 517 return nil, err 518 } 519 return nil, errors.Errorf("caller is asking for index higher than any added index") 520 } 521 // Try adding to cache 522 return n.tryAddCacheAndReturnRow(ctx, elem) 523 } 524 525 func (n *numberedDiskRowIterator) ensureDecoded(row sqlbase.EncDatumRow) error { 526 for i := range row { 527 if err := row[i].EnsureDecoded(n.rowIter.rowContainer.types[i], &n.datumAlloc); err != nil { 528 return err 529 } 530 } 531 return nil 532 } 533 534 func (n *numberedDiskRowIterator) tryAddCacheAndReturnRow( 535 ctx context.Context, elem *cacheElement, 536 ) (sqlbase.EncDatumRow, error) { 537 r, err := n.rowIter.Row() 538 if err != nil { 539 return nil, err 540 } 541 if err = n.ensureDecoded(r); err != nil { 542 return nil, err 543 } 544 if len(elem.accesses) == 0 { 545 return r, nil 546 } 547 return r, n.tryAddCacheHelper(ctx, elem, r, true) 548 } 549 550 func (n *numberedDiskRowIterator) tryAddCache(ctx context.Context, elem *cacheElement) error { 551 // We don't want to pay the cost of rowIter.Row() if the row will not be 552 // added to the cache. But to do correct memory accounting, which is needed 553 // for the precise caching decision, we do need the EncDatumRow. So we do a 554 // cheap check that is a good predictor of whether the row will be cached, 555 // and then call rowIter.Row(). 556 cacheSize := len(n.cacheHeap) 557 if cacheSize == n.maxCacheSize && (cacheSize == 0 || n.cacheHeap[0].nextAccess <= elem.accesses[0]) { 558 return nil 559 } 560 row, err := n.rowIter.Row() 561 if err != nil { 562 return err 563 } 564 return n.tryAddCacheHelper(ctx, elem, row, false) 565 } 566 567 func (n *numberedDiskRowIterator) tryAddCacheHelper( 568 ctx context.Context, elem *cacheElement, row sqlbase.EncDatumRow, alreadyDecoded bool, 569 ) error { 570 if elem.row != nil { 571 log.Fatalf(ctx, "adding row to cache when it is already in cache") 572 } 573 nextAccess := elem.accesses[0] 574 evict := func() (sqlbase.EncDatumRow, error) { 575 heapElem := heap.Pop(&n.cacheHeap).(*cacheRowHeapElement) 576 evictElem, ok := n.cache[heapElem.rowIdx] 577 if !ok { 578 return nil, errors.Errorf("bug: element not in cache map") 579 } 580 bytes := evictElem.row.Size() 581 n.memAcc.Shrink(ctx, int64(bytes)) 582 evictedRow := evictElem.row 583 evictElem.row = nil 584 return evictedRow, nil 585 } 586 rowBytesUsage := -1 587 var rowToReuse sqlbase.EncDatumRow 588 for { 589 if n.maxCacheSize == 0 { 590 return nil 591 } 592 if len(n.cacheHeap) == n.maxCacheSize && n.cacheHeap[0].nextAccess <= nextAccess { 593 return nil 594 } 595 var err error 596 if len(n.cacheHeap) >= n.maxCacheSize { 597 if rowToReuse, err = evict(); err != nil { 598 return err 599 } 600 continue 601 } 602 603 // We shrink maxCacheSize such that it is a good current indicator of how 604 // many rows memAcc will allow us to place in the cache. So it is likely 605 // that this row can be added. Decode the row to get the correct 606 // rowBytesUsage. 607 if !alreadyDecoded { 608 err = n.ensureDecoded(row) 609 if err != nil { 610 return err 611 } 612 alreadyDecoded = true 613 } 614 if rowBytesUsage == -1 { 615 rowBytesUsage = int(row.Size()) 616 } 617 if err := n.memAcc.Grow(ctx, int64(rowBytesUsage)); err != nil { 618 if sqlbase.IsOutOfMemoryError(err) { 619 // Could not grow the memory to handle this row, so reduce the 620 // maxCacheSize (max count of entries), to the current number of 621 // entries in the cache. The assumption here is that rows in the cache 622 // are of similar size. Using maxCacheSize to make eviction decisions 623 // is cheaper than calling Grow(). 624 n.maxCacheSize = len(n.cacheHeap) 625 continue 626 } else { 627 return err 628 } 629 } else { 630 // There is room in the cache. 631 break 632 } 633 } 634 // Add to cache. 635 elem.heapElement.nextAccess = nextAccess 636 // Need to copy row, since its lifetime is less than the cached row. 637 if rowToReuse == nil { 638 elem.row = n.rowAlloc.CopyRow(row) 639 } else { 640 copy(rowToReuse, row) 641 elem.row = rowToReuse 642 } 643 heap.Push(&n.cacheHeap, &elem.heapElement) 644 return nil 645 }