github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colcontainer/partitionedqueue.go (about) 1 // Copyright 2020 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package colcontainer 12 13 import ( 14 "context" 15 "fmt" 16 17 "github.com/cockroachdb/cockroach/pkg/col/coldata" 18 "github.com/cockroachdb/cockroach/pkg/sql/colexecbase/colexecerror" 19 "github.com/cockroachdb/cockroach/pkg/sql/types" 20 "github.com/cockroachdb/cockroach/pkg/util/mon" 21 "github.com/cockroachdb/errors" 22 "github.com/marusama/semaphore" 23 ) 24 25 // PartitionedQueue is the abstraction for on-disk storage. 26 type PartitionedQueue interface { 27 // Enqueue adds the batch to the end of the partitionIdx'th partition. If a 28 // partition at that index does not exist, a new one is created. Existing 29 // partitions may not be Enqueued to after calling 30 // CloseAllOpenWriteFileDescriptors. 31 Enqueue(ctx context.Context, partitionIdx int, batch coldata.Batch) error 32 // Dequeue removes and returns the batch from the front of the 33 // partitionIdx'th partition. If the partition is empty, or no partition at 34 // that index was Enqueued to, a zero-length batch is returned. Note that 35 // it is illegal to call Enqueue on a partition after Dequeue. 36 Dequeue(ctx context.Context, partitionIdx int, batch coldata.Batch) error 37 // CloseAllOpenWriteFileDescriptors notifies the PartitionedQueue that it can 38 // close all open write file descriptors. After this point, only new 39 // partitions may be Enqueued to. 40 CloseAllOpenWriteFileDescriptors(ctx context.Context) error 41 // CloseAllOpenReadFileDescriptors closes the open read file descriptors 42 // belonging to partitions. These partitions may still be Dequeued from, 43 // although this will trigger files to be reopened. 44 CloseAllOpenReadFileDescriptors() error 45 // CloseInactiveReadPartitions closes all partitions that have been Dequeued 46 // from and have either been temporarily closed through 47 // CloseAllOpenReadFileDescriptors or have returned a coldata.ZeroBatch from 48 // Dequeue. This close removes the underlying files. 49 CloseInactiveReadPartitions(ctx context.Context) error 50 // Close closes all partitions created. 51 Close(ctx context.Context) error 52 } 53 54 // partitionState is the state a partition is in. 55 type partitionState int 56 57 const ( 58 // partitionStateWriting is the initial state of a partition. A partition will 59 // always transition to partitionStateClosedForWriting next. 60 partitionStateWriting partitionState = iota 61 // partitionStateClosedForWriting is the state a partition is in when it is 62 // closed for writing. The next possible state is for reads to happen, and 63 // thus a transition to partitionStateReading. Note that if a partition is 64 // in this state when entering a method of PartitionedDiskQueue, it is always 65 // true that the file descriptor for this partition has been released. 66 partitionStateClosedForWriting 67 // partitionStateReading is the state a partition is in when Dequeue has been 68 // called. It is always the case that this partition has been closed for 69 // writing and may not transition back to partitionStateWriting or 70 // partitionStateClosedForWriting. The read file descriptor for this partition 71 // may be closed in this state, although it will be reacquired on the next 72 // read. 73 partitionStateReading 74 // partitionStateClosedForReading is the state a partition is in when a 75 // partition was in partitionStateReading and CloseAllOpenReadFileDescriptors 76 // was called. If Dequeued, this partition will reacquire a file descriptor 77 // and transition back to partitionStateReading. 78 partitionStateClosedForReading 79 // partitionStatePermanentlyClosed is the state a partition is in when its 80 // underlying DiskQueue has been Closed. 81 partitionStatePermanentlyClosed 82 ) 83 84 // partition is a simple wrapper over a Queue used by the PartitionedDiskQueue. 85 type partition struct { 86 Queue 87 state partitionState 88 } 89 90 // PartitionerStrategy describes a strategy used by the PartitionedQueue during 91 // its operation. 92 type PartitionerStrategy int 93 94 const ( 95 // PartitionerStrategyDefault is a partitioner strategy in which the 96 // PartitionedQueue will keep all partitions open for writing. 97 // Note that this uses up as many file descriptors as partitions. 98 PartitionerStrategyDefault PartitionerStrategy = iota 99 // PartitionerStrategyCloseOnNewPartition is a partitioner strategy that 100 // closes an open partition for writing if a new partition is created. This 101 // ensures that the total number of file descriptors remains at 1. However, 102 // note that closed partitions may never be written to again, only read. 103 PartitionerStrategyCloseOnNewPartition 104 ) 105 106 // PartitionedDiskQueue is a PartitionedQueue whose partitions are on-disk. 107 type PartitionedDiskQueue struct { 108 typs []*types.T 109 strategy PartitionerStrategy 110 cfg DiskQueueCfg 111 112 partitionIdxToIndex map[int]int 113 partitions []partition 114 115 // lastEnqueuedPartitionIdx is the index of the last enqueued partition. Set 116 // to -1 during initialization. 117 lastEnqueuedPartitionIdx int 118 119 numOpenFDs int 120 fdSemaphore semaphore.Semaphore 121 diskAcc *mon.BoundAccount 122 } 123 124 var _ PartitionedQueue = &PartitionedDiskQueue{} 125 126 // NewPartitionedDiskQueue creates a PartitionedDiskQueue whose partitions are 127 // all on-disk queues. Note that diskQueues will be lazily created when 128 // enqueueing to a new partition. Each new partition will use 129 // cfg.BufferSizeBytes, so memory usage may increase in an unbounded fashion if 130 // used unmethodically. The file descriptors are acquired through fdSemaphore. 131 // If fdSemaphore is nil, the partitioned disk queue will not Acquire or Release 132 // file descriptors. Do this if the caller knows that it will use a constant 133 // maximum number of file descriptors and wishes to acquire these up front. 134 // Note that actual file descriptors open may be less than, but never more than 135 // the number acquired through the semaphore. 136 func NewPartitionedDiskQueue( 137 typs []*types.T, 138 cfg DiskQueueCfg, 139 fdSemaphore semaphore.Semaphore, 140 partitionerStrategy PartitionerStrategy, 141 diskAcc *mon.BoundAccount, 142 ) *PartitionedDiskQueue { 143 if len(typs) == 0 { 144 // DiskQueues cannot serialize zero length schemas, so catch this error 145 // early. 146 // TODO(asubiotto): We could support this, but not sure we need to. 147 colexecerror.InternalError("zero length schema unsupported") 148 } 149 return &PartitionedDiskQueue{ 150 typs: typs, 151 strategy: partitionerStrategy, 152 cfg: cfg, 153 partitionIdxToIndex: make(map[int]int), 154 partitions: make([]partition, 0), 155 lastEnqueuedPartitionIdx: -1, 156 fdSemaphore: fdSemaphore, 157 diskAcc: diskAcc, 158 } 159 } 160 161 type closeWritePartitionArgument int 162 163 const ( 164 retainFD closeWritePartitionArgument = iota 165 releaseFD 166 ) 167 168 // closeWritePartition enqueues a coldata.ZeroBatch to a partition at index 169 // idx, resulting in it closing its write file descriptor. This partition may 170 // still be read from, but never written to again. Note that releaseFDOption 171 // should always be retainFD if a new file is opened in the same scope, to avoid 172 // having to re-enter the semaphore. The argument should only be releaseFD if 173 // reopening a different file in a different scope. 174 func (p *PartitionedDiskQueue) closeWritePartition( 175 ctx context.Context, idx int, releaseFDOption closeWritePartitionArgument, 176 ) error { 177 if p.partitions[idx].state != partitionStateWriting { 178 colexecerror.InternalError(fmt.Sprintf("illegal state change from %d to partitionStateClosedForWriting, only partitionStateWriting allowed", p.partitions[idx].state)) 179 } 180 if err := p.partitions[idx].Enqueue(ctx, coldata.ZeroBatch); err != nil { 181 return err 182 } 183 if releaseFDOption == releaseFD && p.fdSemaphore != nil { 184 p.fdSemaphore.Release(1) 185 p.numOpenFDs-- 186 } 187 p.partitions[idx].state = partitionStateClosedForWriting 188 return nil 189 } 190 191 func (p *PartitionedDiskQueue) closeReadPartition(idx int) error { 192 if p.partitions[idx].state != partitionStateReading { 193 colexecerror.InternalError(fmt.Sprintf("illegal state change from %d to partitionStateClosedForReading, only partitionStateReading allowed", p.partitions[idx].state)) 194 } 195 if err := p.partitions[idx].CloseRead(); err != nil { 196 return err 197 } 198 if p.fdSemaphore != nil { 199 p.fdSemaphore.Release(1) 200 p.numOpenFDs-- 201 } 202 p.partitions[idx].state = partitionStateClosedForReading 203 return nil 204 } 205 206 func (p *PartitionedDiskQueue) acquireNewFD(ctx context.Context) error { 207 if p.fdSemaphore == nil { 208 return nil 209 } 210 if err := p.fdSemaphore.Acquire(ctx, 1); err != nil { 211 return err 212 } 213 p.numOpenFDs++ 214 return nil 215 } 216 217 // Enqueue enqueues a batch at partition partitionIdx. 218 func (p *PartitionedDiskQueue) Enqueue( 219 ctx context.Context, partitionIdx int, batch coldata.Batch, 220 ) error { 221 idx, ok := p.partitionIdxToIndex[partitionIdx] 222 if !ok { 223 needToAcquireFD := true 224 if p.strategy == PartitionerStrategyCloseOnNewPartition && p.lastEnqueuedPartitionIdx != -1 { 225 idxToClose, found := p.partitionIdxToIndex[p.lastEnqueuedPartitionIdx] 226 if !found { 227 // This would be unexpected. 228 return errors.New("PartitionerStrategyCloseOnNewPartition unable to find last Enqueued partition") 229 } 230 if p.partitions[idxToClose].state == partitionStateWriting { 231 // Close the last enqueued partition. No need to release or acquire a new 232 // file descriptor, since the acquired FD will represent the new 233 // partition's FD opened in Enqueue below. 234 if err := p.closeWritePartition(ctx, idxToClose, retainFD); err != nil { 235 return err 236 } 237 needToAcquireFD = false 238 } else { 239 // The partition that was last enqueued to is not open for writing, so 240 // we need to acquire a new FD for this new partition. 241 needToAcquireFD = true 242 } 243 } 244 245 if needToAcquireFD { 246 // Acquire only one file descriptor. This will represent the write file 247 // descriptor. When we start Dequeueing from this partition, this will 248 // represent the read file descriptor. 249 if err := p.acquireNewFD(ctx); err != nil { 250 return err 251 } 252 } 253 // Partition has not been created yet. 254 q, err := NewDiskQueue(ctx, p.typs, p.cfg, p.diskAcc) 255 if err != nil { 256 return err 257 } 258 idx = len(p.partitions) 259 p.partitions = append(p.partitions, partition{Queue: q}) 260 p.partitionIdxToIndex[partitionIdx] = idx 261 } 262 if state := p.partitions[idx].state; state != partitionStateWriting { 263 if state == partitionStatePermanentlyClosed { 264 return errors.Errorf("partition at index %d permanently closed, cannot Enqueue", partitionIdx) 265 } 266 return errors.New("Enqueue illegally called after Dequeue or CloseAllOpenWriteFileDescriptors") 267 } 268 p.lastEnqueuedPartitionIdx = partitionIdx 269 return p.partitions[idx].Enqueue(ctx, batch) 270 } 271 272 // Dequeue dequeues a batch from partition partitionIdx, returns a 273 // coldata.ZeroBatch if that partition does not exist. If the partition exists 274 // and a coldata.ZeroBatch is returned, that partition is closed. 275 func (p *PartitionedDiskQueue) Dequeue( 276 ctx context.Context, partitionIdx int, batch coldata.Batch, 277 ) error { 278 idx, ok := p.partitionIdxToIndex[partitionIdx] 279 if !ok { 280 batch.SetLength(0) 281 return nil 282 } 283 switch state := p.partitions[idx].state; state { 284 case partitionStateWriting: 285 // Close this partition for writing. However, we keep a file descriptor 286 // acquired for the read file descriptor opened for Dequeue. 287 if err := p.closeWritePartition(ctx, idx, retainFD); err != nil { 288 return err 289 } 290 p.partitions[idx].state = partitionStateReading 291 case partitionStateClosedForWriting, partitionStateClosedForReading: 292 // There will never be a file descriptor acquired for a partition in this 293 // state, so acquire it. 294 if err := p.acquireNewFD(ctx); err != nil { 295 return err 296 } 297 p.partitions[idx].state = partitionStateReading 298 case partitionStateReading: 299 // Do nothing. 300 case partitionStatePermanentlyClosed: 301 return errors.Errorf("partition at index %d permanently closed, cannot Dequeue", partitionIdx) 302 default: 303 colexecerror.InternalError(fmt.Sprintf("unhandled state %d", state)) 304 } 305 notEmpty, err := p.partitions[idx].Dequeue(ctx, batch) 306 if err != nil { 307 return err 308 } 309 if batch.Length() == 0 { 310 // Queue is finished, release the acquired file descriptor. 311 if err := p.closeReadPartition(idx); err != nil { 312 return err 313 } 314 } 315 if !notEmpty { 316 // This should never happen. It simply means that there was no batch to 317 // Dequeue but more batches will be added in the future (i.e. a zero batch 318 // was never enqueued). Since we require partitions to be closed for writing 319 // before reading, this state is unexpected. 320 colexecerror.InternalError("DiskQueue unexpectedly returned that more data will be added") 321 } 322 return nil 323 } 324 325 // CloseAllOpenWriteFileDescriptors closes all open write file descriptors 326 // belonging to partitions that are being Enqueued to. Once this method is 327 // called, existing partitions may not be enqueued to again. 328 func (p *PartitionedDiskQueue) CloseAllOpenWriteFileDescriptors(ctx context.Context) error { 329 for i, q := range p.partitions { 330 if q.state != partitionStateWriting { 331 continue 332 } 333 // closeWritePartition will Release the file descriptor. 334 if err := p.closeWritePartition(ctx, i, releaseFD); err != nil { 335 return err 336 } 337 } 338 return nil 339 } 340 341 // CloseAllOpenReadFileDescriptors closes all open read file descriptors 342 // belonging to partitions that are being Dequeued from. If Dequeue is called 343 // on a closed partition, it will be reopened. 344 func (p *PartitionedDiskQueue) CloseAllOpenReadFileDescriptors() error { 345 for i, q := range p.partitions { 346 if q.state != partitionStateReading { 347 continue 348 } 349 // closeReadPartition will Release the file descriptor. 350 if err := p.closeReadPartition(i); err != nil { 351 return err 352 } 353 } 354 return nil 355 } 356 357 // CloseInactiveReadPartitions closes all partitions that were Dequeued from 358 // and either Dequeued a coldata.ZeroBatch or were closed through 359 // CloseAllOpenReadFileDescriptors. This method call Closes the underlying 360 // DiskQueue to remove its files, so a partition may never be used again. 361 func (p *PartitionedDiskQueue) CloseInactiveReadPartitions(ctx context.Context) error { 362 var lastErr error 363 for i, q := range p.partitions { 364 if q.state != partitionStateClosedForReading { 365 continue 366 } 367 lastErr = q.Close(ctx) 368 p.partitions[i].state = partitionStatePermanentlyClosed 369 } 370 return lastErr 371 } 372 373 // Close closes all the PartitionedDiskQueue's partitions. If an error is 374 // encountered, the PartitionedDiskQueue will attempt to close all partitions 375 // anyway and return the last error encountered. 376 func (p *PartitionedDiskQueue) Close(ctx context.Context) error { 377 var lastErr error 378 for i, q := range p.partitions { 379 if q.state == partitionStatePermanentlyClosed { 380 // Already closed. 381 continue 382 } 383 lastErr = q.Close(ctx) 384 p.partitions[i].state = partitionStatePermanentlyClosed 385 } 386 if p.numOpenFDs != 0 { 387 // Note that if p.numOpenFDs is non-zero, it must be the case that 388 // fdSemaphore is non-nil. 389 p.fdSemaphore.Release(p.numOpenFDs) 390 p.numOpenFDs = 0 391 } 392 return lastErr 393 }