github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/sql/colcontainer/diskqueue.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package colcontainer 12 13 import ( 14 "bytes" 15 "context" 16 "io" 17 "path/filepath" 18 "strconv" 19 20 "github.com/cockroachdb/cockroach/pkg/col/coldata" 21 "github.com/cockroachdb/cockroach/pkg/col/colserde" 22 "github.com/cockroachdb/cockroach/pkg/sql/types" 23 "github.com/cockroachdb/cockroach/pkg/storage/fs" 24 "github.com/cockroachdb/cockroach/pkg/util/mon" 25 "github.com/cockroachdb/cockroach/pkg/util/uuid" 26 "github.com/cockroachdb/errors" 27 "github.com/golang/snappy" 28 ) 29 30 const ( 31 // compressionSizeReductionThreshold is the factor used to determine whether 32 // to write compressed bytes or not. If the compressed bytes are larger than 33 // 1-1/compressionSizeReductionThreshold of the original size, compression is 34 // not used. This is to avoid paying the cost of decompression if the space 35 // savings are not sufficient. 36 compressionSizeReductionThreshold = 8 37 // bytesPerSync is the amount of bytes written to a file before Sync is 38 // called (implemented by using a vfs.SyncingFile). 39 bytesPerSync = 512 << 10 /* 512 KiB */ 40 ) 41 42 // file represents in-memory state used by a diskQueue to keep track of the 43 // state of a file. 44 type file struct { 45 name string 46 // offsets represent the start and ends of logical regions of a file to be 47 // read at once. This allows a region of coldata.Batches to be deserialized 48 // without reading a whole file into memory. 49 offsets []int 50 // curOffsetIdx is an index into offsets. 51 curOffsetIdx int 52 totalSize int 53 // finishedWriting specifies whether this file will be written to in the 54 // future or not. If finishedWriting is true and the reader reaches the end 55 // of the file, the file represented by this struct should be closed and 56 // (if the disk queue is not rewindable) removed. 57 finishedWriting bool 58 } 59 60 // diskQueueWriter is an object that encapsulates the writing logic of a 61 // diskQueue. As bytes are written to it, they are buffered until 62 // compressAndFlush is called, which compresses all bytes and writes them to the 63 // wrapped io.Writer. 64 type diskQueueWriter struct { 65 // testingKnobAlwaysCompress specifies whether the writer should always 66 // compress writes (i.e. don't bother measuring whether compression passes 67 // a certain threshold of size improvement before writing compressed bytes). 68 testingKnobAlwaysCompress bool 69 buffer bytes.Buffer 70 wrapped io.Writer 71 scratch struct { 72 // blockType is a single byte that specifies whether the following block on 73 // disk (i.e. compressedBuf in memory) is compressed or not. It is an array 74 // due to having to pass this byte in as a slice to Write. 75 blockType [1]byte 76 compressedBuf []byte 77 } 78 } 79 80 const ( 81 snappyUncompressedBlock byte = 0 82 snappyCompressedBlock byte = 1 83 ) 84 85 func (w *diskQueueWriter) Write(p []byte) (int, error) { 86 return w.buffer.Write(p) 87 } 88 89 // reset resets the diskQueueWriter's wrapped writer and discards any buffered 90 // bytes. 91 func (w *diskQueueWriter) reset(wrapped io.Writer) { 92 w.wrapped = wrapped 93 w.buffer.Reset() 94 } 95 96 // compressAndFlush compresses all buffered bytes and writes them to the wrapped 97 // io.Writer. The number of total bytes written to the wrapped writer is 98 // returned if no error occurred, otherwise 0, err is returned. 99 func (w *diskQueueWriter) compressAndFlush() (int, error) { 100 b := w.buffer.Bytes() 101 compressed := snappy.Encode(w.scratch.compressedBuf, b) 102 w.scratch.compressedBuf = compressed[:cap(compressed)] 103 104 blockType := snappyUncompressedBlock 105 // Discard result if < 12.5% size reduction. All code that uses snappy 106 // compression (including pebble and the higher-level snappy implementation) 107 // has this threshold in place. 108 if w.testingKnobAlwaysCompress || len(compressed) < len(b)-len(b)/compressionSizeReductionThreshold { 109 blockType = snappyCompressedBlock 110 b = compressed 111 } 112 113 // Write whether this data is compressed or not. 114 w.scratch.blockType[0] = blockType 115 nType, err := w.wrapped.Write(w.scratch.blockType[:]) 116 if err != nil { 117 return 0, err 118 } 119 120 nBody, err := w.wrapped.Write(b) 121 if err != nil { 122 return 0, err 123 } 124 w.buffer.Reset() 125 return nType + nBody, err 126 } 127 128 func (w *diskQueueWriter) numBytesBuffered() int { 129 return w.buffer.Len() 130 } 131 132 // diskQueueState describes the current state of the disk queue. Used to assert 133 // that an invalid state transition doesn't happen when a DiskQueue is in 134 // DiskQueueCacheMode{ClearAnd}ReuseCache. 135 type diskQueueState int 136 137 const ( 138 diskQueueStateEnqueueing diskQueueState = iota 139 diskQueueStateDequeueing 140 ) 141 142 // diskQueue is an on-disk queue of coldata.Batches that implements the Queue 143 // interface. coldata.Batches are serialized and buffered up, after which they 144 // are compressed and flushed to a file. A directory with a random UUID name 145 // will be created in cfg.Path, and files will be created in that directory 146 // using sequence numbers. 147 // When a file reaches DiskQueueCfg.MaxFileSizeBytes, a new file is created with 148 // the next sequential file number to store the next batches in the queue. 149 // Note that, if diskQueue is not rewindable, files will be cleaned up as 150 // coldata.Batches are dequeued from the diskQueue. DiskQueueCfg.Dir will also 151 // be removed on Close, deleting all files. 152 // A diskQueue will never use more memory than cfg.BufferSizeBytes, but not all 153 // the available memory will be used to buffer only writes. Refer to the 154 // DiskQueueCacheMode comment as to how cfg.BufferSizeBytes is divided in each 155 // mode. 156 type diskQueue struct { 157 // dirName is the directory in cfg.Path that holds this queue's files. 158 dirName string 159 160 typs []*types.T 161 cfg DiskQueueCfg 162 files []file 163 seqNo int 164 165 state diskQueueState 166 rewindable bool 167 168 // done is set when a coldata.ZeroBatch has been Enqueued. 169 done bool 170 171 serializer *colserde.FileSerializer 172 // numBufferedBatches is the number of batches buffered that haven't been 173 // flushed to disk. This is useful for a reader to determine whether to flush 174 // or not, since the number of buffered bytes will always be > 0 even though 175 // no batches have been enqueued (due to metadata). 176 numBufferedBatches int 177 writer *diskQueueWriter 178 // writeBufferLimit is the limit on the number of uncompressed write bytes 179 // written before a compress and flush. 180 writeBufferLimit int 181 writeFileIdx int 182 writeFile fs.File 183 deserializerState struct { 184 *colserde.FileDeserializer 185 curBatch int 186 } 187 // readFileIdx is an index into the current file in files the deserializer is 188 // reading from. 189 readFileIdx int 190 readFile fs.File 191 scratchDecompressedReadBytes []byte 192 193 diskAcc *mon.BoundAccount 194 } 195 196 var _ RewindableQueue = &diskQueue{} 197 198 // Queue describes a simple queue interface to which coldata.Batches can be 199 // Enqueued and Dequeued. 200 type Queue interface { 201 // Enqueue enqueues a coldata.Batch to this queue. A zero-length batch should 202 // be enqueued when no more elements will be enqueued. 203 // WARNING: Selection vectors are ignored. 204 Enqueue(context.Context, coldata.Batch) error 205 // Dequeue dequeues a coldata.Batch from the queue into the batch that is 206 // passed in. The boolean returned specifies whether the queue was not empty 207 // (i.e. whether there was a batch to Dequeue). If true is returned and the 208 // batch has a length of zero, the Queue is finished and will not be Enqueued 209 // to. If an error is returned, the batch and boolean returned are 210 // meaningless. 211 Dequeue(context.Context, coldata.Batch) (bool, error) 212 // CloseRead closes the read file descriptor. If Dequeued, the file may be 213 // reopened. 214 CloseRead() error 215 // Close closes any resources associated with the Queue. 216 Close(context.Context) error 217 } 218 219 // RewindableQueue is a Queue that can be read from multiple times. Note that 220 // in order for this Queue to return the same data after rewinding, all 221 // Enqueueing *must* occur before any Dequeueing. 222 type RewindableQueue interface { 223 Queue 224 // Rewind resets the Queue so that it Dequeues all Enqueued batches from the 225 // start. 226 Rewind() error 227 } 228 229 const ( 230 // defaultBufferSizeBytesDefaultCacheMode is the default buffer size used when 231 // the DiskQueue is in DiskQueueCacheModeDefault. 232 // This value was chosen by running BenchmarkQueue. 233 defaultBufferSizeBytesDefaultCacheMode = 128 << 10 /* 128 KiB */ 234 // defaultBufferSizeBytesReuseCacheMode is the default buffer size used when 235 // the DiskQueue is in DiskQueueCacheMode{ClearAnd}ReuseCache. 236 defaultBufferSizeBytesReuseCacheMode = 64 << 10 /* 64 KiB */ 237 // defaultMaxFileSizeBytes is the default maximum file size after which the 238 // DiskQueue rolls over to a new file. This value was chosen by running 239 // BenchmarkQueue. 240 defaultMaxFileSizeBytes = 32 << 20 /* 32 MiB */ 241 ) 242 243 // DiskQueueCacheMode specifies a pattern that a DiskQueue should use regarding 244 // its cache. 245 type DiskQueueCacheMode int 246 247 const ( 248 // DiskQueueCacheModeDefault is the default mode for DiskQueue cache behavior. 249 // The cache (DiskQueueCfg.BufferSizeBytes) will be divided as follows: 250 // - 1/3 for buffered writes (before compression) 251 // - 1/3 for compressed writes, this is distinct from the previous 1/3 because 252 // it is a requirement of the snappy library that the compressed memory may 253 // not overlap with the uncompressed memory. This memory is reused to read 254 // compressed bytes from disk. 255 // - 1/3 for buffered reads after decompression. Kept separate from the write 256 // memory to allow for Enqueues to come in while unread batches are held in 257 // memory. 258 // In this mode, Enqueues and Dequeues may happen in any order. 259 DiskQueueCacheModeDefault DiskQueueCacheMode = iota 260 // DiskQueueCacheModeReuseCache imposes a limitation that all Enqueues happen 261 // before all Dequeues to be able to reuse more memory. In this mode the cache 262 // will be divided as follows: 263 // - 1/2 for buffered writes and buffered reads. 264 // - 1/2 for compressed write and reads (given the limitation that this memory 265 // has to be non-overlapping. 266 DiskQueueCacheModeReuseCache 267 // DiskQueueCacheModeClearAndReuseCache is the same as 268 // DiskQueueCacheModeReuseCache with the additional behavior that when a 269 // coldata.ZeroBatch is Enqueued, the cache will be released to the GC. 270 DiskQueueCacheModeClearAndReuseCache 271 ) 272 273 // DiskQueueCfg is a struct holding the configuration options for a DiskQueue. 274 type DiskQueueCfg struct { 275 // FS is the filesystem interface to use. 276 FS fs.FS 277 // Path is where the temporary directory that will contain this DiskQueue's 278 // files should be created. The directory name will be a UUID. 279 Path string 280 // CacheMode defines the way a DiskQueue should use its cache. Refer to the 281 // comment of DiskQueueCacheModes for more information. 282 CacheMode DiskQueueCacheMode 283 // BufferSizeBytes is the number of bytes to buffer before compressing and 284 // writing to disk. 285 BufferSizeBytes int 286 // MaxFileSizeBytes is the maximum size an on-disk file should reach before 287 // rolling over to a new one. 288 MaxFileSizeBytes int 289 290 // OnNewDiskQueueCb is an optional callback function that will be called when 291 // NewDiskQueue is called. 292 OnNewDiskQueueCb func() 293 294 // TestingKnobs are used to test the queue implementation. 295 TestingKnobs struct { 296 // AlwaysCompress, if true, will skip a check that determines whether 297 // compression is used for a given write or not given the percentage size 298 // improvement. This allows us to test compression. 299 AlwaysCompress bool 300 } 301 } 302 303 // EnsureDefaults ensures that optional fields are set to reasonable defaults. 304 // If any necessary options have been elided, an error is returned. 305 func (cfg *DiskQueueCfg) EnsureDefaults() error { 306 if cfg.FS == nil { 307 return errors.New("FS unset on DiskQueueCfg") 308 } 309 if cfg.BufferSizeBytes == 0 { 310 cfg.SetDefaultBufferSizeBytesForCacheMode() 311 } 312 if cfg.MaxFileSizeBytes == 0 { 313 cfg.MaxFileSizeBytes = defaultMaxFileSizeBytes 314 } 315 return nil 316 } 317 318 // SetDefaultBufferSizeBytesForCacheMode sets the default BufferSizeBytes 319 // according to the set CacheMode. 320 func (cfg *DiskQueueCfg) SetDefaultBufferSizeBytesForCacheMode() { 321 if cfg.CacheMode == DiskQueueCacheModeDefault { 322 cfg.BufferSizeBytes = defaultBufferSizeBytesDefaultCacheMode 323 } else { 324 cfg.BufferSizeBytes = defaultBufferSizeBytesReuseCacheMode 325 } 326 } 327 328 // NewDiskQueue creates a Queue that spills to disk. 329 func NewDiskQueue( 330 ctx context.Context, typs []*types.T, cfg DiskQueueCfg, diskAcc *mon.BoundAccount, 331 ) (Queue, error) { 332 return newDiskQueue(ctx, typs, cfg, diskAcc) 333 } 334 335 // NewRewindableDiskQueue creates a RewindableQueue that spills to disk. 336 func NewRewindableDiskQueue( 337 ctx context.Context, typs []*types.T, cfg DiskQueueCfg, diskAcc *mon.BoundAccount, 338 ) (RewindableQueue, error) { 339 d, err := newDiskQueue(ctx, typs, cfg, diskAcc) 340 if err != nil { 341 return nil, err 342 } 343 d.rewindable = true 344 return d, nil 345 } 346 347 func newDiskQueue( 348 ctx context.Context, typs []*types.T, cfg DiskQueueCfg, diskAcc *mon.BoundAccount, 349 ) (*diskQueue, error) { 350 if err := cfg.EnsureDefaults(); err != nil { 351 return nil, err 352 } 353 if cfg.OnNewDiskQueueCb != nil { 354 cfg.OnNewDiskQueueCb() 355 } 356 d := &diskQueue{ 357 dirName: uuid.FastMakeV4().String(), 358 typs: typs, 359 cfg: cfg, 360 files: make([]file, 0, 4), 361 writeBufferLimit: cfg.BufferSizeBytes / 3, 362 diskAcc: diskAcc, 363 } 364 // Refer to the DiskQueueCacheMode comment for why this division of 365 // BufferSizeBytes. 366 if d.cfg.CacheMode != DiskQueueCacheModeDefault { 367 d.writeBufferLimit = d.cfg.BufferSizeBytes / 2 368 } 369 if err := cfg.FS.MkdirAll(filepath.Join(cfg.Path, d.dirName)); err != nil { 370 return nil, err 371 } 372 // rotateFile will create a new file to write to. 373 return d, d.rotateFile(ctx) 374 } 375 376 func (d *diskQueue) CloseRead() error { 377 if d.readFile != nil { 378 if err := d.readFile.Close(); err != nil { 379 return err 380 } 381 d.readFile = nil 382 } 383 return nil 384 } 385 386 func (d *diskQueue) closeFileDeserializer() error { 387 if d.deserializerState.FileDeserializer != nil { 388 if err := d.deserializerState.Close(); err != nil { 389 return err 390 } 391 } 392 d.deserializerState.FileDeserializer = nil 393 return nil 394 } 395 396 func (d *diskQueue) Close(ctx context.Context) error { 397 if d.serializer != nil { 398 if err := d.writeFooterAndFlush(ctx); err != nil { 399 return err 400 } 401 d.serializer = nil 402 } 403 if err := d.closeFileDeserializer(); err != nil { 404 return err 405 } 406 if d.writeFile != nil { 407 if err := d.writeFile.Close(); err != nil { 408 return err 409 } 410 d.writeFile = nil 411 } 412 // The readFile will be removed below in DeleteDirAndFiles. 413 if err := d.CloseRead(); err != nil { 414 return err 415 } 416 if err := d.cfg.FS.RemoveAll(filepath.Join(d.cfg.Path, d.dirName)); err != nil { 417 return err 418 } 419 totalSize := int64(0) 420 leftOverFileIdx := 0 421 if !d.rewindable { 422 leftOverFileIdx = d.readFileIdx 423 } 424 for _, file := range d.files[leftOverFileIdx : d.writeFileIdx+1] { 425 totalSize += int64(file.totalSize) 426 } 427 if totalSize > d.diskAcc.Used() { 428 totalSize = d.diskAcc.Used() 429 } 430 d.diskAcc.Shrink(ctx, totalSize) 431 return nil 432 } 433 434 // rotateFile performs file rotation for the diskQueue. i.e. it creates a new 435 // file to write to and sets the diskQueue state up to write to that file when 436 // Enqueue is called. 437 // It is valid to call rotateFile when the diskQueue is not currently writing to 438 // any file (i.e. during initialization). This will simply create the first file 439 // to write to. 440 func (d *diskQueue) rotateFile(ctx context.Context) error { 441 fName := filepath.Join(d.cfg.Path, d.dirName, strconv.Itoa(d.seqNo)) 442 f, err := d.cfg.FS.CreateWithSync(fName, bytesPerSync) 443 if err != nil { 444 return err 445 } 446 d.seqNo++ 447 448 if d.serializer == nil { 449 writer := &diskQueueWriter{testingKnobAlwaysCompress: d.cfg.TestingKnobs.AlwaysCompress, wrapped: f} 450 d.serializer, err = colserde.NewFileSerializer(writer, d.typs) 451 if err != nil { 452 return err 453 } 454 d.writer = writer 455 } else { 456 if err := d.writeFooterAndFlush(ctx); err != nil { 457 return err 458 } 459 if err := d.resetWriters(f); err != nil { 460 return err 461 } 462 } 463 464 if d.writeFile != nil { 465 d.files[d.writeFileIdx].finishedWriting = true 466 if err := d.writeFile.Close(); err != nil { 467 return err 468 } 469 } 470 471 d.writeFileIdx = len(d.files) 472 d.files = append(d.files, file{name: fName, offsets: make([]int, 1, 16)}) 473 d.writeFile = f 474 return nil 475 } 476 477 func (d *diskQueue) resetWriters(f fs.File) error { 478 d.writer.reset(f) 479 return d.serializer.Reset(d.writer) 480 } 481 482 func (d *diskQueue) writeFooterAndFlush(ctx context.Context) error { 483 err := d.serializer.Finish() 484 if err != nil { 485 return err 486 } 487 written, err := d.writer.compressAndFlush() 488 if err != nil { 489 return err 490 } 491 d.numBufferedBatches = 0 492 d.files[d.writeFileIdx].totalSize += written 493 if err := d.diskAcc.Grow(ctx, int64(written)); err != nil { 494 return err 495 } 496 // Append offset for the readers. 497 d.files[d.writeFileIdx].offsets = append(d.files[d.writeFileIdx].offsets, d.files[d.writeFileIdx].totalSize) 498 return nil 499 } 500 501 func (d *diskQueue) Enqueue(ctx context.Context, b coldata.Batch) error { 502 if d.state == diskQueueStateDequeueing { 503 if d.cfg.CacheMode != DiskQueueCacheModeDefault { 504 return errors.Errorf("attempted to Enqueue to DiskQueue in mode that disallows it: %d", d.cfg.CacheMode) 505 } 506 if d.rewindable { 507 return errors.Errorf("attempted to Enqueue to RewindableDiskQueue after Dequeue has been called") 508 } 509 } 510 d.state = diskQueueStateEnqueueing 511 if b.Length() == 0 { 512 if d.done { 513 // Already done. 514 return nil 515 } 516 if err := d.writeFooterAndFlush(ctx); err != nil { 517 return err 518 } 519 if err := d.writeFile.Close(); err != nil { 520 return err 521 } 522 d.files[d.writeFileIdx].finishedWriting = true 523 d.writeFile = nil 524 // Done with the serializer. Not setting this will cause us to attempt to 525 // flush the serializer on Close. 526 d.serializer = nil 527 // The write file will be closed in Close. 528 d.done = true 529 if d.cfg.CacheMode == DiskQueueCacheModeClearAndReuseCache { 530 // Clear the cache. d.scratchDecompressedReadBytes should already be nil 531 // since we don't allow writes once reads happen in this mode. 532 d.scratchDecompressedReadBytes = nil 533 // Clear the write side of the cache. 534 d.writer.buffer = bytes.Buffer{} 535 d.writer.scratch.compressedBuf = nil 536 } 537 return nil 538 } 539 if err := d.serializer.AppendBatch(b); err != nil { 540 return err 541 } 542 d.numBufferedBatches++ 543 544 bufferSizeLimitReached := d.writer.numBytesBuffered() > d.writeBufferLimit 545 fileSizeLimitReached := d.files[d.writeFileIdx].totalSize+d.writer.numBytesBuffered() > d.cfg.MaxFileSizeBytes 546 if bufferSizeLimitReached || fileSizeLimitReached { 547 if fileSizeLimitReached { 548 // rotateFile will flush and reset writers. 549 return d.rotateFile(ctx) 550 } 551 if err := d.writeFooterAndFlush(ctx); err != nil { 552 return err 553 } 554 return d.resetWriters(d.writeFile) 555 } 556 return nil 557 } 558 559 func (d *diskQueue) maybeInitDeserializer(ctx context.Context) (bool, error) { 560 if d.deserializerState.FileDeserializer != nil { 561 return true, nil 562 } 563 if d.readFileIdx >= len(d.files) { 564 // There is no valid file to read from. Either more data will be enqueued or 565 // not, but the behavior there depends on the caller. 566 return false, nil 567 } 568 fileToRead := d.files[d.readFileIdx] 569 if fileToRead.curOffsetIdx == len(fileToRead.offsets)-1 { 570 // The current offset index is the last element in offsets. This means that 571 // either the region to read from next is currently being written to or the 572 // writer has rotated to a new file. 573 if fileToRead.finishedWriting { 574 // Close current file. 575 if err := d.CloseRead(); err != nil { 576 return false, err 577 } 578 if !d.rewindable { 579 // Remove current file. 580 if err := d.cfg.FS.Remove(d.files[d.readFileIdx].name); err != nil { 581 return false, err 582 } 583 fileSize := int64(d.files[d.readFileIdx].totalSize) 584 if fileSize > d.diskAcc.Used() { 585 fileSize = d.diskAcc.Used() 586 } 587 d.diskAcc.Shrink(ctx, fileSize) 588 } 589 d.readFile = nil 590 // Read next file. 591 d.readFileIdx++ 592 return d.maybeInitDeserializer(ctx) 593 } 594 // Not finished writing. there is currently no data to read. 595 return false, nil 596 } 597 if d.readFile == nil { 598 // File is not open. 599 f, err := d.cfg.FS.Open(fileToRead.name) 600 if err != nil { 601 return false, err 602 } 603 d.readFile = f 604 } 605 readRegionStart := fileToRead.offsets[fileToRead.curOffsetIdx] 606 readRegionLength := fileToRead.offsets[fileToRead.curOffsetIdx+1] - readRegionStart 607 if cap(d.writer.scratch.compressedBuf) < readRegionLength { 608 // Not enough capacity, we have to allocate a new compressedBuf. 609 d.writer.scratch.compressedBuf = make([]byte, readRegionLength) 610 } 611 // Slice the compressedBuf to be of the desired length, encoded in 612 // readRegionLength. 613 d.writer.scratch.compressedBuf = d.writer.scratch.compressedBuf[0:readRegionLength] 614 // Read the desired length starting at readRegionStart. 615 n, err := d.readFile.ReadAt(d.writer.scratch.compressedBuf, int64(readRegionStart)) 616 if err != nil && err != io.EOF { 617 return false, err 618 } 619 if n != len(d.writer.scratch.compressedBuf) { 620 return false, errors.Errorf("expected to read %d bytes but read %d", len(d.writer.scratch.compressedBuf), n) 621 } 622 623 blockType := d.writer.scratch.compressedBuf[0] 624 compressedBytes := d.writer.scratch.compressedBuf[1:] 625 var decompressedBytes []byte 626 if blockType == snappyCompressedBlock { 627 decompressedBytes, err = snappy.Decode(d.scratchDecompressedReadBytes, compressedBytes) 628 if err != nil { 629 return false, err 630 } 631 d.scratchDecompressedReadBytes = decompressedBytes[:cap(decompressedBytes)] 632 } else { 633 // Copy the result for safety since we're reusing the diskQueueWriter's 634 // compressed write buffer. If an Enqueue were to arrive between Dequeue 635 // calls of the same buffered coldata.Batches to return, the memory would 636 // be corrupted. The following code ensures that 637 // scratchDecompressedReadBytes is of the required capacity. 638 if cap(d.scratchDecompressedReadBytes) < len(compressedBytes) { 639 d.scratchDecompressedReadBytes = make([]byte, len(compressedBytes)) 640 } 641 // Slice up to the length of compressedBytes so that the copy below will 642 // copy all desired bytes. 643 d.scratchDecompressedReadBytes = d.scratchDecompressedReadBytes[:len(compressedBytes)] 644 copy(d.scratchDecompressedReadBytes, compressedBytes) 645 decompressedBytes = d.scratchDecompressedReadBytes 646 } 647 648 deserializer, err := colserde.NewFileDeserializerFromBytes(d.typs, decompressedBytes) 649 if err != nil { 650 return false, err 651 } 652 d.deserializerState.FileDeserializer = deserializer 653 d.deserializerState.curBatch = 0 654 if d.deserializerState.NumBatches() == 0 { 655 // Zero batches to deserialize in this region. This shouldn't happen but we 656 // might as well handle it. 657 if err := d.closeFileDeserializer(); err != nil { 658 return false, err 659 } 660 d.files[d.readFileIdx].curOffsetIdx++ 661 return d.maybeInitDeserializer(ctx) 662 } 663 return true, nil 664 } 665 666 // Dequeue dequeues a batch from disk and deserializes it into b. Note that the 667 // deserialized batch is only valid until the next call to Dequeue. 668 func (d *diskQueue) Dequeue(ctx context.Context, b coldata.Batch) (bool, error) { 669 if d.serializer != nil && d.numBufferedBatches > 0 { 670 if err := d.writeFooterAndFlush(ctx); err != nil { 671 return false, err 672 } 673 if err := d.resetWriters(d.writeFile); err != nil { 674 return false, err 675 } 676 } 677 if d.state == diskQueueStateEnqueueing && d.cfg.CacheMode != DiskQueueCacheModeDefault { 678 // This is the first Dequeue after Enqueues, so reuse the write cache for 679 // reads. Note that the buffer for compressed reads is reused in 680 // maybeInitDeserializer in either case, so there is nothing to do here for 681 // that. 682 d.writer.buffer.Reset() 683 d.scratchDecompressedReadBytes = d.writer.buffer.Bytes() 684 } 685 d.state = diskQueueStateDequeueing 686 687 if d.deserializerState.FileDeserializer != nil && d.deserializerState.curBatch >= d.deserializerState.NumBatches() { 688 // Finished all the batches, set the deserializer to nil to initialize a new 689 // one to read the next region. 690 if err := d.closeFileDeserializer(); err != nil { 691 return false, err 692 } 693 d.files[d.readFileIdx].curOffsetIdx++ 694 } 695 696 if dataToRead, err := d.maybeInitDeserializer(ctx); err != nil { 697 return false, err 698 } else if !dataToRead { 699 // No data to read. 700 if !d.done { 701 // Data might still be added. 702 return false, nil 703 } 704 // No data will be added. 705 b.SetLength(0) 706 } else { 707 if d.deserializerState.curBatch == 0 { 708 // It is possible that the caller has appended more columns to the 709 // batch than it provided types during diskQueue's creation. We 710 // will only be touching the prefix of the batch that we have been 711 // told about. 712 vecs := b.ColVecs()[:len(d.typs)] 713 for i := range vecs { 714 // When we deserialize a new memory region, we allocate a new null 715 // bitmap for the batch which deserializer will write to. If we naively 716 // allow the arrow batch converter to directly overwrite null bitmap of 717 // each column, it could lead to memory corruption. Doing this avoids 718 // reallocating a new scratchDecompressedReadBytes every time we perform 719 // a read from the file and constrains the downside to allocating a new 720 // null bitmap every couple of batches. 721 nulls := coldata.NewNulls(coldata.BatchSize()) 722 vecs[i].SetNulls(&nulls) 723 } 724 } 725 if err := d.deserializerState.GetBatch(d.deserializerState.curBatch, b); err != nil { 726 return false, err 727 } 728 d.deserializerState.curBatch++ 729 } 730 731 return true, nil 732 } 733 734 // Rewind is part of the RewindableQueue interface. 735 func (d *diskQueue) Rewind() error { 736 if err := d.closeFileDeserializer(); err != nil { 737 return err 738 } 739 if err := d.CloseRead(); err != nil { 740 return err 741 } 742 d.deserializerState.curBatch = 0 743 d.readFile = nil 744 d.readFileIdx = 0 745 for i := range d.files { 746 d.files[i].curOffsetIdx = 0 747 } 748 return nil 749 }