github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colexec/spilling_queue.go (about) 1 // Copyright 2020 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package colexec 12 13 import ( 14 "context" 15 "fmt" 16 17 "github.com/cockroachdb/cockroach/pkg/col/coldata" 18 "github.com/cockroachdb/cockroach/pkg/sql/colcontainer" 19 "github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror" 20 "github.com/cockroachdb/cockroach/pkg/sql/colmem" 21 "github.com/cockroachdb/cockroach/pkg/sql/types" 22 "github.com/cockroachdb/cockroach/pkg/util/log" 23 "github.com/cockroachdb/cockroach/pkg/util/mon" 24 "github.com/cockroachdb/errors" 25 "github.com/marusama/semaphore" 26 ) 27 28 // spillingQueue is a Queue that uses a fixed-size in-memory circular buffer 29 // and spills to disk if spillingQueue.items has no more slots available to hold 30 // a reference to an enqueued batch or the allocator reports that more memory 31 // than the caller-provided maxMemoryLimit is in use. 32 // When spilling to disk, a DiskQueue will be created. When spilling batches to 33 // disk, their memory will first be released using the allocator. When batches 34 // are read from disk back into memory, that memory will be reclaimed. 35 // NOTE: When a batch is returned, that batch's memory will still be tracked 36 // using the allocator. Since the memory in use is fixed, a previously returned 37 // batch may be overwritten by a batch read from disk. This new batch's memory 38 // footprint will replace the footprint of the previously returned batch. Since 39 // batches are unsafe for reuse, it is assumed that the previously returned 40 // batch is not kept around and thus its referenced memory will be GCed as soon 41 // as the batch is updated. 42 type spillingQueue struct { 43 unlimitedAllocator *colmem.Allocator 44 maxMemoryLimit int64 45 46 typs []*types.T 47 items []coldata.Batch 48 curHeadIdx int 49 curTailIdx int 50 numInMemoryItems int 51 numOnDiskItems int 52 closed bool 53 54 diskQueueCfg colcontainer.DiskQueueCfg 55 diskQueue colcontainer.Queue 56 fdSemaphore semaphore.Semaphore 57 dequeueScratch coldata.Batch 58 59 rewindable bool 60 rewindableState struct { 61 numItemsDequeued int 62 } 63 64 diskAcc *mon.BoundAccount 65 } 66 67 // newSpillingQueue creates a new spillingQueue. An unlimited allocator must be 68 // passed in. The spillingQueue will use this allocator to check whether memory 69 // usage exceeds the given memory limit and use disk if so. 70 // If fdSemaphore is nil, no Acquire or Release calls will happen. The caller 71 // may want to do this if requesting FDs up front. 72 func newSpillingQueue( 73 unlimitedAllocator *colmem.Allocator, 74 typs []*types.T, 75 memoryLimit int64, 76 cfg colcontainer.DiskQueueCfg, 77 fdSemaphore semaphore.Semaphore, 78 batchSize int, 79 diskAcc *mon.BoundAccount, 80 ) *spillingQueue { 81 // Reduce the memory limit by what the DiskQueue may need to buffer 82 // writes/reads. 83 memoryLimit -= int64(cfg.BufferSizeBytes) 84 if memoryLimit < 0 { 85 memoryLimit = 0 86 } 87 itemsLen := memoryLimit / int64(colmem.EstimateBatchSizeBytes(typs, batchSize)) 88 if itemsLen == 0 { 89 // Make items at least of length 1. Even though batches will spill to disk 90 // directly (this can only happen with a very low memory limit), it's nice 91 // to have at least one item in order to be able to deserialize from disk 92 // into this slice. 93 itemsLen = 1 94 } 95 return &spillingQueue{ 96 unlimitedAllocator: unlimitedAllocator, 97 maxMemoryLimit: memoryLimit, 98 typs: typs, 99 items: make([]coldata.Batch, itemsLen), 100 diskQueueCfg: cfg, 101 fdSemaphore: fdSemaphore, 102 dequeueScratch: unlimitedAllocator.NewMemBatchWithSize(typs, coldata.BatchSize()), 103 diskAcc: diskAcc, 104 } 105 } 106 107 // newRewindableSpillingQueue creates a new spillingQueue that can be rewinded 108 // in order to dequeue all enqueued batches all over again. An unlimited 109 // allocator must be passed in. The queue will use this allocator to check 110 // whether memory usage exceeds the given memory limit and use disk if so. 111 func newRewindableSpillingQueue( 112 unlimitedAllocator *colmem.Allocator, 113 typs []*types.T, 114 memoryLimit int64, 115 cfg colcontainer.DiskQueueCfg, 116 fdSemaphore semaphore.Semaphore, 117 batchSize int, 118 diskAcc *mon.BoundAccount, 119 ) *spillingQueue { 120 q := newSpillingQueue(unlimitedAllocator, typs, memoryLimit, cfg, fdSemaphore, batchSize, diskAcc) 121 q.rewindable = true 122 return q 123 } 124 125 func (q *spillingQueue) enqueue(ctx context.Context, batch coldata.Batch) error { 126 if batch.Length() == 0 { 127 if q.diskQueue != nil { 128 if err := q.diskQueue.Enqueue(ctx, batch); err != nil { 129 return err 130 } 131 } 132 return nil 133 } 134 135 if q.numOnDiskItems > 0 || q.unlimitedAllocator.Used() > q.maxMemoryLimit || q.numInMemoryItems == len(q.items) { 136 // In this case, there is not enough memory available to keep this batch in 137 // memory, or the in-memory circular buffer has no slots available (we do 138 // an initial estimate of how many batches would fit into the buffer, which 139 // might be wrong). The tail of the queue might also already be on disk, in 140 // which case that is where the batch must be enqueued to maintain order. 141 if err := q.maybeSpillToDisk(ctx); err != nil { 142 return err 143 } 144 q.unlimitedAllocator.ReleaseBatch(batch) 145 if err := q.diskQueue.Enqueue(ctx, batch); err != nil { 146 return err 147 } 148 q.numOnDiskItems++ 149 return nil 150 } 151 152 q.items[q.curTailIdx] = batch 153 q.curTailIdx++ 154 if q.curTailIdx == len(q.items) { 155 q.curTailIdx = 0 156 } 157 q.numInMemoryItems++ 158 return nil 159 } 160 161 func (q *spillingQueue) dequeue(ctx context.Context) (coldata.Batch, error) { 162 if q.empty() { 163 return coldata.ZeroBatch, nil 164 } 165 166 if (q.rewindable && q.numInMemoryItems <= q.rewindableState.numItemsDequeued) || 167 (!q.rewindable && q.numInMemoryItems == 0) { 168 // No more in-memory items. Fill the circular buffer as much as possible. 169 // Note that there must be at least one element on disk. 170 if !q.rewindable && q.curHeadIdx != q.curTailIdx { 171 colexecerror.InternalError(fmt.Sprintf("assertion failed in spillingQueue: curHeadIdx != curTailIdx, %d != %d", q.curHeadIdx, q.curTailIdx)) 172 } 173 // NOTE: Only one item is dequeued from disk since a deserialized batch is 174 // only valid until the next call to Dequeue. In practice we could Dequeue 175 // up until a new file region is loaded (which will overwrite the memory of 176 // the previous batches), but Dequeue calls are already amortized, so this 177 // is acceptable. 178 // Release a batch to make space for a new batch from disk. 179 q.unlimitedAllocator.ReleaseBatch(q.dequeueScratch) 180 ok, err := q.diskQueue.Dequeue(ctx, q.dequeueScratch) 181 if err != nil { 182 return nil, err 183 } 184 if !ok { 185 // There was no batch to dequeue from disk. This should not really 186 // happen, as it should have been caught by the q.empty() check above. 187 colexecerror.InternalError("disk queue was not empty but failed to dequeue element in spillingQueue") 188 } 189 // Account for this batch's memory. 190 q.unlimitedAllocator.RetainBatch(q.dequeueScratch) 191 if q.rewindable { 192 q.rewindableState.numItemsDequeued++ 193 return q.dequeueScratch, nil 194 } 195 q.numOnDiskItems-- 196 q.numInMemoryItems++ 197 q.items[q.curTailIdx] = q.dequeueScratch 198 q.curTailIdx++ 199 if q.curTailIdx == len(q.items) { 200 q.curTailIdx = 0 201 } 202 } 203 204 res := q.items[q.curHeadIdx] 205 q.curHeadIdx++ 206 if q.curHeadIdx == len(q.items) { 207 q.curHeadIdx = 0 208 } 209 if q.rewindable { 210 q.rewindableState.numItemsDequeued++ 211 } else { 212 q.numInMemoryItems-- 213 } 214 return res, nil 215 } 216 217 func (q *spillingQueue) numFDsOpenAtAnyGivenTime() int { 218 if q.diskQueueCfg.CacheMode != colcontainer.DiskQueueCacheModeDefault { 219 // The access pattern must be write-everything then read-everything so 220 // either a read FD or a write FD are open at any one point. 221 return 1 222 } 223 // Otherwise, both will be open. 224 return 2 225 } 226 227 func (q *spillingQueue) maybeSpillToDisk(ctx context.Context) error { 228 if q.diskQueue != nil { 229 return nil 230 } 231 var err error 232 // Acquire two file descriptors for the DiskQueue: one for the write file and 233 // one for the read file. 234 if q.fdSemaphore != nil { 235 if err = q.fdSemaphore.Acquire(ctx, q.numFDsOpenAtAnyGivenTime()); err != nil { 236 return err 237 } 238 } 239 log.VEvent(ctx, 1, "spilled to disk") 240 var diskQueue colcontainer.Queue 241 if q.rewindable { 242 diskQueue, err = colcontainer.NewRewindableDiskQueue(ctx, q.typs, q.diskQueueCfg, q.diskAcc) 243 } else { 244 diskQueue, err = colcontainer.NewDiskQueue(ctx, q.typs, q.diskQueueCfg, q.diskAcc) 245 } 246 if err != nil { 247 return err 248 } 249 // Only assign q.diskQueue if there was no error, otherwise the returned value 250 // may be non-nil but invalid. 251 q.diskQueue = diskQueue 252 return nil 253 } 254 255 // empty returns whether there are currently no items to be dequeued. 256 func (q *spillingQueue) empty() bool { 257 if q.rewindable { 258 return q.numInMemoryItems+q.numOnDiskItems == q.rewindableState.numItemsDequeued 259 } 260 return q.numInMemoryItems == 0 && q.numOnDiskItems == 0 261 } 262 263 func (q *spillingQueue) spilled() bool { 264 return q.diskQueue != nil 265 } 266 267 func (q *spillingQueue) close(ctx context.Context) error { 268 if q.closed { 269 return nil 270 } 271 if q.diskQueue != nil { 272 if err := q.diskQueue.Close(ctx); err != nil { 273 return err 274 } 275 if q.fdSemaphore != nil { 276 q.fdSemaphore.Release(q.numFDsOpenAtAnyGivenTime()) 277 } 278 q.closed = true 279 return nil 280 } 281 return nil 282 } 283 284 func (q *spillingQueue) rewind() error { 285 if !q.rewindable { 286 return errors.Newf("unexpectedly rewind() called when spilling queue is not rewindable") 287 } 288 if q.diskQueue != nil { 289 if err := q.diskQueue.(colcontainer.RewindableQueue).Rewind(); err != nil { 290 return err 291 } 292 } 293 q.curHeadIdx = 0 294 q.rewindableState.numItemsDequeued = 0 295 return nil 296 } 297 298 func (q *spillingQueue) reset(ctx context.Context) { 299 if err := q.close(ctx); err != nil { 300 colexecerror.InternalError(err) 301 } 302 q.diskQueue = nil 303 q.closed = false 304 q.numInMemoryItems = 0 305 q.numOnDiskItems = 0 306 q.curHeadIdx = 0 307 q.curTailIdx = 0 308 q.rewindableState.numItemsDequeued = 0 309 }