github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowcontainer/numbered_row_container.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/rowcontainer/numbered_row_container.go (about)

     1  // Copyright 2020 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package rowcontainer
    12  
    13  import (
    14  	"container/heap"
    15  	"context"
    16  	"fmt"
    17  	"math"
    18  	"sync"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/diskmap"
    21  	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
    22  	"github.com/cockroachdb/cockroach/pkg/sql/sqlbase"
    23  	"github.com/cockroachdb/cockroach/pkg/sql/types"
    24  	"github.com/cockroachdb/cockroach/pkg/util/encoding"
    25  	"github.com/cockroachdb/cockroach/pkg/util/log"
    26  	"github.com/cockroachdb/cockroach/pkg/util/mon"
    27  	"github.com/cockroachdb/errors"
    28  )
    29  
    30  // DiskBackedNumberedRowContainer that stores a map from idx => row, where idx is a
    31  // 0-based dense numbering. Optionally, if deDup is true, it can de-duplicate the
    32  // rows before assigning a number. It spills to disk if needed.
    33  type DiskBackedNumberedRowContainer struct {
    34  	deDup bool
    35  	rc    *DiskBackedRowContainer
    36  
    37  	deduper DeDupingRowContainer
    38  
    39  	storedTypes []*types.T
    40  	idx         int // the index of the next row to be added into the container
    41  
    42  	rowIter *numberedDiskRowIterator
    43  	// cacheMap is a map used in the implementation of rowIter that is kept
    44  	// in the container to avoid repeated allocation.
    45  	cacheMap      map[int]*cacheElement
    46  	rowIterMemAcc mon.BoundAccount
    47  	DisableCache  bool
    48  }
    49  
    50  // NewDiskBackedNumberedRowContainer creates a DiskBackedNumberedRowContainer.
    51  //
    52  // Arguments:
    53  //  - deDup is true if it should de-duplicate.
    54  //  - types is the schema of rows that will be added to this container.
    55  //  - evalCtx defines the context.
    56  //  - engine is the underlying store that rows are stored on when the container
    57  //    spills to disk.
    58  //  - memoryMonitor is used to monitor this container's memory usage.
    59  //  - diskMonitor is used to monitor this container's disk usage.
    60  //  - rowCapacity (if not 0) specifies the number of rows in-memory container
    61  //    should be preallocated for.
    62  func NewDiskBackedNumberedRowContainer(
    63  	deDup bool,
    64  	types []*types.T,
    65  	evalCtx *tree.EvalContext,
    66  	engine diskmap.Factory,
    67  	memoryMonitor *mon.BytesMonitor,
    68  	diskMonitor *mon.BytesMonitor,
    69  	rowCapacity int,
    70  ) *DiskBackedNumberedRowContainer {
    71  	d := &DiskBackedNumberedRowContainer{
    72  		deDup:         deDup,
    73  		storedTypes:   types,
    74  		rowIterMemAcc: memoryMonitor.MakeBoundAccount(),
    75  	}
    76  	d.rc = &DiskBackedRowContainer{}
    77  	d.rc.Init(nil /*ordering*/, types, evalCtx, engine, memoryMonitor, diskMonitor, rowCapacity)
    78  	if deDup {
    79  		ordering := make(sqlbase.ColumnOrdering, len(types))
    80  		for i := range types {
    81  			ordering[i].ColIdx = i
    82  			ordering[i].Direction = encoding.Ascending
    83  		}
    84  		deduper := &DiskBackedRowContainer{}
    85  		deduper.Init(ordering, types, evalCtx, engine, memoryMonitor, diskMonitor, rowCapacity)
    86  		deduper.DoDeDuplicate()
    87  		d.deduper = deduper
    88  	}
    89  	return d
    90  }
    91  
    92  // Len returns the number of rows in this container.
    93  func (d *DiskBackedNumberedRowContainer) Len() int {
    94  	return d.idx
    95  }
    96  
    97  // UsingDisk returns whether the primary container is using disk.
    98  func (d *DiskBackedNumberedRowContainer) UsingDisk() bool {
    99  	return d.rc.UsingDisk()
   100  }
   101  
   102  // Spilled returns whether or not the primary container spilled to disk in its
   103  // lifetime.
   104  func (d *DiskBackedNumberedRowContainer) Spilled() bool {
   105  	return d.rc.Spilled()
   106  }
   107  
   108  // testingSpillToDisk is for tests to spill the container(s)
   109  // to disk.
   110  func (d *DiskBackedNumberedRowContainer) testingSpillToDisk(ctx context.Context) error {
   111  	if !d.rc.UsingDisk() {
   112  		if err := d.rc.SpillToDisk(ctx); err != nil {
   113  			return err
   114  		}
   115  	}
   116  	if d.deDup && !d.deduper.(*DiskBackedRowContainer).UsingDisk() {
   117  		if err := d.deduper.(*DiskBackedRowContainer).SpillToDisk(ctx); err != nil {
   118  			return err
   119  		}
   120  	}
   121  	return nil
   122  }
   123  
   124  // AddRow tries to add a row. It returns the position of the
   125  // row in the container.
   126  func (d *DiskBackedNumberedRowContainer) AddRow(
   127  	ctx context.Context, row sqlbase.EncDatumRow,
   128  ) (int, error) {
   129  	if d.deDup {
   130  		assignedIdx, err := d.deduper.AddRowWithDeDup(ctx, row)
   131  		if err != nil {
   132  			return 0, err
   133  		}
   134  		if assignedIdx < d.idx {
   135  			// Existing row.
   136  			return assignedIdx, nil
   137  		} else if assignedIdx != d.idx {
   138  			panic(fmt.Sprintf("DiskBackedNumberedRowContainer bug: assignedIdx %d != d.idx %d",
   139  				assignedIdx, d.idx))
   140  		}
   141  		// Else assignedIdx == d.idx, so a new row.
   142  	}
   143  	idx := d.idx
   144  	// An error in AddRow() will cause the two row containers
   145  	// to no longer be in-step with each other wrt the numbering
   146  	// but that is not a concern since the caller will not
   147  	// continue using d after an error.
   148  	d.idx++
   149  	return idx, d.rc.AddRow(ctx, row)
   150  }
   151  
   152  // SetupForRead must be called before calling GetRow(). No more AddRow() calls
   153  // are permitted (before UnsafeReset()). See the comment for
   154  // NumberedDiskRowIterator for how we use the future accesses.
   155  func (d *DiskBackedNumberedRowContainer) SetupForRead(ctx context.Context, accesses [][]int) {
   156  	if !d.rc.UsingDisk() {
   157  		return
   158  	}
   159  	rowIter := d.rc.drc.newNumberedIterator(ctx)
   160  	meanBytesPerRow := d.rc.drc.MeanEncodedRowBytes()
   161  	if meanBytesPerRow == 0 {
   162  		meanBytesPerRow = 100 // arbitrary
   163  	}
   164  	// TODO(sumeer): make bytesPerSSBlock a parameter to
   165  	// NewDiskBackedNumberedRowContainer.
   166  	const bytesPerSSBlock = 32 * 1024
   167  	meanRowsPerSSBlock := bytesPerSSBlock / meanBytesPerRow
   168  	const maxCacheSize = 4096
   169  	cacheSize := maxCacheSize
   170  	if d.DisableCache {
   171  		// This is not an efficient way to disable the cache, but ok for tests.
   172  		cacheSize = 0
   173  	}
   174  	if d.cacheMap == nil {
   175  		d.cacheMap = make(map[int]*cacheElement)
   176  	}
   177  	d.rowIter = newNumberedDiskRowIterator(
   178  		ctx, rowIter, accesses, meanRowsPerSSBlock, cacheSize, d.cacheMap, &d.rowIterMemAcc)
   179  }
   180  
   181  // GetRow returns a row with the given index. If skip is true the row is not
   182  // actually read and just indicates a read that is being skipped. It is used
   183  // to maintain synchronization with the future, since the caller can skip
   184  // accesses for semi-joins and anti-joins.
   185  func (d *DiskBackedNumberedRowContainer) GetRow(
   186  	ctx context.Context, idx int, skip bool,
   187  ) (sqlbase.EncDatumRow, error) {
   188  	if !d.rc.UsingDisk() {
   189  		if skip {
   190  			return nil, nil
   191  		}
   192  		return d.rc.mrc.EncRow(idx), nil
   193  	}
   194  	return d.rowIter.getRow(ctx, idx, skip)
   195  }
   196  
   197  // UnsafeReset resets this container to be reused.
   198  func (d *DiskBackedNumberedRowContainer) UnsafeReset(ctx context.Context) error {
   199  	if d.rowIter != nil {
   200  		d.rowIter.close()
   201  		d.rowIterMemAcc.Clear(ctx)
   202  		d.rowIter = nil
   203  	}
   204  	d.idx = 0
   205  	if err := d.rc.UnsafeReset(ctx); err != nil {
   206  		return err
   207  	}
   208  	if d.deduper != nil {
   209  		if err := d.deduper.UnsafeReset(ctx); err != nil {
   210  			return err
   211  		}
   212  	}
   213  	return nil
   214  }
   215  
   216  // Close closes the container.
   217  func (d *DiskBackedNumberedRowContainer) Close(ctx context.Context) {
   218  	if d.rowIter != nil {
   219  		d.rowIter.close()
   220  	}
   221  	d.rowIterMemAcc.Close(ctx)
   222  	d.rc.Close(ctx)
   223  	if d.deduper != nil {
   224  		d.deduper.Close(ctx)
   225  	}
   226  }
   227  
   228  // numberedDiskRowIterator wraps a numberedRowIterator and adds two pieces
   229  // of functionality:
   230  // - decides between seek and next when positioning the iterator.
   231  // - maintains a cache.
   232  //
   233  // Cache:
   234  //
   235  // The callers of GetRow() know the full pattern of future accesses,
   236  // represented as a [][]int where the int is the row number. Within a slice
   237  // they access in increasing order of index (since forward iteration is
   238  // cheaper), but across different slices there is no such constraint. Caching
   239  // policies like LRU work without knowing the future, and use "recency" of row
   240  // R (number of other distinct rows accessed since last access to row R) as a
   241  // proxy for "reuse distance" of row R (number of distinct rows that will be
   242  // accessed from one access of R to the next access of R). More sophisticated
   243  // policies try to use the recently observed reuse distance as a predictor for
   244  // the future reuse distance. In our case we know the exact reuse distance
   245  // from each access of row R to its next access, so one can construct an
   246  // optimal cache policy for a certain cache size: keep track of the current
   247  // reuse distance for each element of the cache and evict the one with the
   248  // highest reuse distance. A cache miss causes the retrieved entry to be added
   249  // to a full cache only if its reuse distance to the next access is less than
   250  // the highest reuse distance currently in the cache. This optimality requires
   251  // some book-keeping overhead:
   252  //
   253  // - A map with O(R) entries where R is the number of unique rows that will be
   254  //   accessed and an overall size proportional to the total number of accesses.
   255  //   Overall this is within a constant factor of [][]int, but the constant could
   256  //   be high. Note that we need this map because when doing Next() on the iterator
   257  //   we encounter entries different from the ones that caused this cache miss
   258  //   and we need to decide whether to cache them -- if we had a random access
   259  //   iterator such that sequential access was the same cost as random
   260  //   access, then a single []int with the next reuse position for each access
   261  //   would have sufficed.
   262  // - A heap containing the rows in the cache that is updated on each cache hit,
   263  //   and whenever a row is evicted or added to the cache. This is O(log N) where
   264  //   N is the number of entries in the cache.
   265  //
   266  // Overall, this may be too much memory and cpu overhead for not enough
   267  // benefit, but it will put an upper bound on what we can achieve with a
   268  // cache. And for inverted index queries involving intersection it is possible
   269  // that the row container contains far more rows than the number of unique
   270  // rows that will be accessed, so a small cache which knows the future could
   271  // be very beneficial. One motivation for this approach was that #48118
   272  // mentioned low observed cache hit rates with a simpler approach. And since
   273  // we turn off ssblock caching for these underlying storage engine, the
   274  // cost of a cache miss is high.
   275  //
   276  // TODO(sumeer):
   277  // - Use some realistic inverted index workloads (including geospatial) to
   278  //   measure the effect of this cache.
   279  type numberedDiskRowIterator struct {
   280  	rowIter *numberedRowIterator
   281  	// After creation, the rowIter is not positioned. isPositioned transitions
   282  	// once from false => true.
   283  	isPositioned bool
   284  	// The current index the rowIter is positioned at, when isPositioned == true.
   285  	idxRowIter int
   286  	// The mean number of rows per ssblock.
   287  	meanRowsPerSSBlock int
   288  	// The maximum number of rows in the cache. This can be shrunk under memory
   289  	// pressure.
   290  	maxCacheSize int
   291  	memAcc       *mon.BoundAccount
   292  
   293  	// The cache. It contains an entry for all the rows that will be accessed,
   294  	// and not just the ones for which we currently have a cached EncDatumRow.
   295  	cache map[int]*cacheElement
   296  	// The current access index in the sequence of all the accesses. This is
   297  	// used to know where we are in the known future.
   298  	accessIdx int
   299  	// A max heap containing only the rows for which we have a cached
   300  	// EncDatumRow. The top element has the highest nextAccess and is the
   301  	// best candidate to evict.
   302  	cacheHeap  cacheMaxNextAccessHeap
   303  	datumAlloc sqlbase.DatumAlloc
   304  	rowAlloc   sqlbase.EncDatumRowAlloc
   305  
   306  	hitCount  int
   307  	missCount int
   308  }
   309  
   310  type cacheElement struct {
   311  	// The future accesses for this row, expressed as the accessIdx when it will
   312  	// happen. We update this slice to remove the first entry whenever an access
   313  	// happens, so when non-empty, accesses[0] represents the next access, and
   314  	// when empty there are no more accesses left.
   315  	accesses []int
   316  	// row is non-nil for a cached row.
   317  	row sqlbase.EncDatumRow
   318  	// When row is non-nil, this is the element in the heap.
   319  	heapElement cacheRowHeapElement
   320  	// Used only when initializing accesses, so that we can allocate a single
   321  	// shared slice for accesses across all cacheElements.
   322  	numAccesses int
   323  }
   324  
   325  var cacheElementSyncPool = sync.Pool{
   326  	New: func() interface{} {
   327  		return &cacheElement{}
   328  	},
   329  }
   330  
   331  func freeCacheElement(elem *cacheElement) {
   332  	elem.accesses = nil
   333  	elem.row = nil
   334  	elem.numAccesses = 0
   335  	cacheElementSyncPool.Put(elem)
   336  }
   337  
   338  func newCacheElement() *cacheElement {
   339  	return cacheElementSyncPool.Get().(*cacheElement)
   340  }
   341  
   342  type cacheRowHeapElement struct {
   343  	// The index of this cached row.
   344  	rowIdx int
   345  	// The next access of this cached row.
   346  	nextAccess int
   347  	// The index in the heap.
   348  	heapIdx int
   349  }
   350  
   351  type cacheMaxNextAccessHeap []*cacheRowHeapElement
   352  
   353  func (h cacheMaxNextAccessHeap) Len() int { return len(h) }
   354  func (h cacheMaxNextAccessHeap) Less(i, j int) bool {
   355  	return h[i].nextAccess > h[j].nextAccess
   356  }
   357  func (h cacheMaxNextAccessHeap) Swap(i, j int) {
   358  	h[i], h[j] = h[j], h[i]
   359  	h[i].heapIdx = i
   360  	h[j].heapIdx = j
   361  }
   362  func (h *cacheMaxNextAccessHeap) Push(x interface{}) {
   363  	n := len(*h)
   364  	elem := x.(*cacheRowHeapElement)
   365  	elem.heapIdx = n
   366  	*h = append(*h, elem)
   367  }
   368  func (h *cacheMaxNextAccessHeap) Pop() interface{} {
   369  	old := *h
   370  	n := len(old)
   371  	elem := old[n-1]
   372  	elem.heapIdx = -1
   373  	*h = old[0 : n-1]
   374  	return elem
   375  }
   376  
   377  // TODO(sumeer): memory accounting for map and heap.
   378  func newNumberedDiskRowIterator(
   379  	_ context.Context,
   380  	rowIter *numberedRowIterator,
   381  	accesses [][]int,
   382  	meanRowsPerSSBlock int,
   383  	maxCacheSize int,
   384  	cache map[int]*cacheElement,
   385  	memAcc *mon.BoundAccount,
   386  ) *numberedDiskRowIterator {
   387  	n := &numberedDiskRowIterator{
   388  		rowIter:            rowIter,
   389  		meanRowsPerSSBlock: meanRowsPerSSBlock,
   390  		maxCacheSize:       maxCacheSize,
   391  		memAcc:             memAcc,
   392  		cache:              cache,
   393  	}
   394  	var numAccesses int
   395  	for _, accSlice := range accesses {
   396  		for _, rowIdx := range accSlice {
   397  			elem := n.cache[rowIdx]
   398  			if elem == nil {
   399  				elem = newCacheElement()
   400  				elem.heapElement.rowIdx = rowIdx
   401  				n.cache[rowIdx] = elem
   402  			}
   403  			elem.numAccesses++
   404  			numAccesses++
   405  		}
   406  	}
   407  	allAccesses := make([]int, numAccesses)
   408  	accessIdx := 0
   409  	for _, accSlice := range accesses {
   410  		for _, rowIdx := range accSlice {
   411  			elem := n.cache[rowIdx]
   412  			if elem.accesses == nil {
   413  				// Sub-slice that can grow up to elem.numAccesses
   414  				elem.accesses = allAccesses[0:0:elem.numAccesses]
   415  				allAccesses = allAccesses[elem.numAccesses:]
   416  			}
   417  			elem.accesses = append(elem.accesses, accessIdx)
   418  			accessIdx++
   419  		}
   420  	}
   421  	return n
   422  }
   423  
   424  func (n *numberedDiskRowIterator) close() {
   425  	n.rowIter.Close()
   426  	for k, v := range n.cache {
   427  		freeCacheElement(v)
   428  		delete(n.cache, k)
   429  	}
   430  }
   431  
   432  func (n *numberedDiskRowIterator) getRow(
   433  	ctx context.Context, idx int, skip bool,
   434  ) (sqlbase.EncDatumRow, error) {
   435  	thisAccessIdx := n.accessIdx
   436  	n.accessIdx++
   437  	elem, ok := n.cache[idx]
   438  	if !ok {
   439  		return nil, errors.Errorf("caller is accessing a row that was not specified up front")
   440  	}
   441  	if len(elem.accesses) == 0 || elem.accesses[0] != thisAccessIdx {
   442  		return nil, errors.Errorf("caller is no longer synchronized with future accesses")
   443  	}
   444  	elem.accesses = elem.accesses[1:]
   445  	var nextAccess int
   446  	if len(elem.accesses) > 0 {
   447  		nextAccess = elem.accesses[0]
   448  	} else {
   449  		nextAccess = math.MaxInt32
   450  	}
   451  
   452  	// Check for cache hit. This also updates the heap position,
   453  	// which we need to do even for skip == true.
   454  	if elem.row != nil {
   455  		n.hitCount++
   456  		elem.heapElement.nextAccess = nextAccess
   457  		heap.Fix(&n.cacheHeap, elem.heapElement.heapIdx)
   458  		if skip {
   459  			return nil, nil
   460  		}
   461  		return elem.row, nil
   462  	}
   463  
   464  	// Cache miss.
   465  	n.missCount++
   466  	// If skip, we can just return.
   467  	if skip {
   468  		return nil, nil
   469  	}
   470  
   471  	// Need to position the rowIter. We could add Prev(), since the engine supports
   472  	// it, if benchmarks indicate it would help. For now we just Seek() for that
   473  	// case.
   474  	if n.isPositioned && idx >= n.idxRowIter && (idx-n.idxRowIter <= n.meanRowsPerSSBlock) {
   475  		// Need to move forward, possibly within the same ssblock, so use Next().
   476  		// It is possible we are already positioned at the right place.
   477  		for i := idx - n.idxRowIter; i > 0; {
   478  			n.rowIter.Next()
   479  			if valid, err := n.rowIter.Valid(); err != nil || !valid {
   480  				if err != nil {
   481  					return nil, err
   482  				}
   483  				return nil, errors.Errorf("caller is asking for index higher than any added index")
   484  			}
   485  			n.idxRowIter++
   486  			i--
   487  			if i == 0 {
   488  				break
   489  			}
   490  			// i > 0. This is before the row we want to return, but it may
   491  			// be worthwhile to cache it.
   492  			preElem, ok := n.cache[n.idxRowIter]
   493  			if !ok {
   494  				// This is a row that is never accessed.
   495  				continue
   496  			}
   497  			if preElem.row != nil {
   498  				// Already in cache.
   499  				continue
   500  			}
   501  			if len(preElem.accesses) == 0 {
   502  				// No accesses left.
   503  				continue
   504  			}
   505  			if err := n.tryAddCache(ctx, preElem); err != nil {
   506  				return nil, err
   507  			}
   508  		}
   509  		// Try adding to cache
   510  		return n.tryAddCacheAndReturnRow(ctx, elem)
   511  	}
   512  	n.rowIter.seekToIndex(idx)
   513  	n.isPositioned = true
   514  	n.idxRowIter = idx
   515  	if valid, err := n.rowIter.Valid(); err != nil || !valid {
   516  		if err != nil {
   517  			return nil, err
   518  		}
   519  		return nil, errors.Errorf("caller is asking for index higher than any added index")
   520  	}
   521  	// Try adding to cache
   522  	return n.tryAddCacheAndReturnRow(ctx, elem)
   523  }
   524  
   525  func (n *numberedDiskRowIterator) ensureDecoded(row sqlbase.EncDatumRow) error {
   526  	for i := range row {
   527  		if err := row[i].EnsureDecoded(n.rowIter.rowContainer.types[i], &n.datumAlloc); err != nil {
   528  			return err
   529  		}
   530  	}
   531  	return nil
   532  }
   533  
   534  func (n *numberedDiskRowIterator) tryAddCacheAndReturnRow(
   535  	ctx context.Context, elem *cacheElement,
   536  ) (sqlbase.EncDatumRow, error) {
   537  	r, err := n.rowIter.Row()
   538  	if err != nil {
   539  		return nil, err
   540  	}
   541  	if err = n.ensureDecoded(r); err != nil {
   542  		return nil, err
   543  	}
   544  	if len(elem.accesses) == 0 {
   545  		return r, nil
   546  	}
   547  	return r, n.tryAddCacheHelper(ctx, elem, r, true)
   548  }
   549  
   550  func (n *numberedDiskRowIterator) tryAddCache(ctx context.Context, elem *cacheElement) error {
   551  	// We don't want to pay the cost of rowIter.Row() if the row will not be
   552  	// added to the cache. But to do correct memory accounting, which is needed
   553  	// for the precise caching decision, we do need the EncDatumRow. So we do a
   554  	// cheap check that is a good predictor of whether the row will be cached,
   555  	// and then call rowIter.Row().
   556  	cacheSize := len(n.cacheHeap)
   557  	if cacheSize == n.maxCacheSize && (cacheSize == 0 || n.cacheHeap[0].nextAccess <= elem.accesses[0]) {
   558  		return nil
   559  	}
   560  	row, err := n.rowIter.Row()
   561  	if err != nil {
   562  		return err
   563  	}
   564  	return n.tryAddCacheHelper(ctx, elem, row, false)
   565  }
   566  
   567  func (n *numberedDiskRowIterator) tryAddCacheHelper(
   568  	ctx context.Context, elem *cacheElement, row sqlbase.EncDatumRow, alreadyDecoded bool,
   569  ) error {
   570  	if elem.row != nil {
   571  		log.Fatalf(ctx, "adding row to cache when it is already in cache")
   572  	}
   573  	nextAccess := elem.accesses[0]
   574  	evict := func() (sqlbase.EncDatumRow, error) {
   575  		heapElem := heap.Pop(&n.cacheHeap).(*cacheRowHeapElement)
   576  		evictElem, ok := n.cache[heapElem.rowIdx]
   577  		if !ok {
   578  			return nil, errors.Errorf("bug: element not in cache map")
   579  		}
   580  		bytes := evictElem.row.Size()
   581  		n.memAcc.Shrink(ctx, int64(bytes))
   582  		evictedRow := evictElem.row
   583  		evictElem.row = nil
   584  		return evictedRow, nil
   585  	}
   586  	rowBytesUsage := -1
   587  	var rowToReuse sqlbase.EncDatumRow
   588  	for {
   589  		if n.maxCacheSize == 0 {
   590  			return nil
   591  		}
   592  		if len(n.cacheHeap) == n.maxCacheSize && n.cacheHeap[0].nextAccess <= nextAccess {
   593  			return nil
   594  		}
   595  		var err error
   596  		if len(n.cacheHeap) >= n.maxCacheSize {
   597  			if rowToReuse, err = evict(); err != nil {
   598  				return err
   599  			}
   600  			continue
   601  		}
   602  
   603  		// We shrink maxCacheSize such that it is a good current indicator of how
   604  		// many rows memAcc will allow us to place in the cache. So it is likely
   605  		// that this row can be added. Decode the row to get the correct
   606  		// rowBytesUsage.
   607  		if !alreadyDecoded {
   608  			err = n.ensureDecoded(row)
   609  			if err != nil {
   610  				return err
   611  			}
   612  			alreadyDecoded = true
   613  		}
   614  		if rowBytesUsage == -1 {
   615  			rowBytesUsage = int(row.Size())
   616  		}
   617  		if err := n.memAcc.Grow(ctx, int64(rowBytesUsage)); err != nil {
   618  			if sqlbase.IsOutOfMemoryError(err) {
   619  				// Could not grow the memory to handle this row, so reduce the
   620  				// maxCacheSize (max count of entries), to the current number of
   621  				// entries in the cache. The assumption here is that rows in the cache
   622  				// are of similar size. Using maxCacheSize to make eviction decisions
   623  				// is cheaper than calling Grow().
   624  				n.maxCacheSize = len(n.cacheHeap)
   625  				continue
   626  			} else {
   627  				return err
   628  			}
   629  		} else {
   630  			// There is room in the cache.
   631  			break
   632  		}
   633  	}
   634  	// Add to cache.
   635  	elem.heapElement.nextAccess = nextAccess
   636  	// Need to copy row, since its lifetime is less than the cached row.
   637  	if rowToReuse == nil {
   638  		elem.row = n.rowAlloc.CopyRow(row)
   639  	} else {
   640  		copy(rowToReuse, row)
   641  		elem.row = rowToReuse
   642  	}
   643  	heap.Push(&n.cacheHeap, &elem.heapElement)
   644  	return nil
   645  }