github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/sstable/writer.go (about) 1 // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package sstable 6 7 import ( 8 "bufio" 9 "bytes" 10 "encoding/binary" 11 "fmt" 12 "io" 13 "math" 14 "runtime" 15 "sync" 16 17 "github.com/cespare/xxhash/v2" 18 "github.com/cockroachdb/errors" 19 "github.com/zuoyebang/bitalostable/internal/base" 20 "github.com/zuoyebang/bitalostable/internal/cache" 21 "github.com/zuoyebang/bitalostable/internal/crc" 22 "github.com/zuoyebang/bitalostable/internal/invariants" 23 "github.com/zuoyebang/bitalostable/internal/keyspan" 24 "github.com/zuoyebang/bitalostable/internal/private" 25 "github.com/zuoyebang/bitalostable/internal/rangekey" 26 ) 27 28 // encodedBHPEstimatedSize estimates the size of the encoded BlockHandleWithProperties. 29 // It would also be nice to account for the length of the data block properties here, 30 // but isn't necessary since this is an estimate. 31 const encodedBHPEstimatedSize = binary.MaxVarintLen64 * 2 32 33 var errWriterClosed = errors.New("bitalostable: writer is closed") 34 35 // WriterMetadata holds info about a finished sstable. 36 type WriterMetadata struct { 37 Size uint64 38 SmallestPoint InternalKey 39 // LargestPoint, LargestRangeKey, LargestRangeDel should not be accessed 40 // before Writer.Close is called, because they may only be set on 41 // Writer.Close. 42 LargestPoint InternalKey 43 SmallestRangeDel InternalKey 44 LargestRangeDel InternalKey 45 SmallestRangeKey InternalKey 46 LargestRangeKey InternalKey 47 HasPointKeys bool 48 HasRangeDelKeys bool 49 HasRangeKeys bool 50 SmallestSeqNum uint64 51 LargestSeqNum uint64 52 Properties Properties 53 } 54 55 // SetSmallestPointKey sets the smallest point key to the given key. 56 // NB: this method set the "absolute" smallest point key. Any existing key is 57 // overridden. 58 func (m *WriterMetadata) SetSmallestPointKey(k InternalKey) { 59 m.SmallestPoint = k 60 m.HasPointKeys = true 61 } 62 63 // SetSmallestRangeDelKey sets the smallest rangedel key to the given key. 64 // NB: this method set the "absolute" smallest rangedel key. Any existing key is 65 // overridden. 66 func (m *WriterMetadata) SetSmallestRangeDelKey(k InternalKey) { 67 m.SmallestRangeDel = k 68 m.HasRangeDelKeys = true 69 } 70 71 // SetSmallestRangeKey sets the smallest range key to the given key. 72 // NB: this method set the "absolute" smallest range key. Any existing key is 73 // overridden. 74 func (m *WriterMetadata) SetSmallestRangeKey(k InternalKey) { 75 m.SmallestRangeKey = k 76 m.HasRangeKeys = true 77 } 78 79 // SetLargestPointKey sets the largest point key to the given key. 80 // NB: this method set the "absolute" largest point key. Any existing key is 81 // overridden. 82 func (m *WriterMetadata) SetLargestPointKey(k InternalKey) { 83 m.LargestPoint = k 84 m.HasPointKeys = true 85 } 86 87 // SetLargestRangeDelKey sets the largest rangedel key to the given key. 88 // NB: this method set the "absolute" largest rangedel key. Any existing key is 89 // overridden. 90 func (m *WriterMetadata) SetLargestRangeDelKey(k InternalKey) { 91 m.LargestRangeDel = k 92 m.HasRangeDelKeys = true 93 } 94 95 // SetLargestRangeKey sets the largest range key to the given key. 96 // NB: this method set the "absolute" largest range key. Any existing key is 97 // overridden. 98 func (m *WriterMetadata) SetLargestRangeKey(k InternalKey) { 99 m.LargestRangeKey = k 100 m.HasRangeKeys = true 101 } 102 103 func (m *WriterMetadata) updateSeqNum(seqNum uint64) { 104 if m.SmallestSeqNum > seqNum { 105 m.SmallestSeqNum = seqNum 106 } 107 if m.LargestSeqNum < seqNum { 108 m.LargestSeqNum = seqNum 109 } 110 } 111 112 type flusher interface { 113 Flush() error 114 } 115 116 type writeCloseSyncer interface { 117 io.WriteCloser 118 Sync() error 119 } 120 121 // Writer is a table writer. 122 type Writer struct { 123 writer io.Writer 124 bufWriter *bufio.Writer 125 syncer writeCloseSyncer 126 meta WriterMetadata 127 err error 128 // cacheID and fileNum are used to remove blocks written to the sstable from 129 // the cache, providing a defense in depth against bugs which cause cache 130 // collisions. 131 cacheID uint64 132 fileNum base.FileNum 133 // The following fields are copied from Options. 134 blockSize int 135 blockSizeThreshold int 136 indexBlockSize int 137 indexBlockSizeThreshold int 138 compare Compare 139 split Split 140 formatKey base.FormatKey 141 compression Compression 142 separator Separator 143 successor Successor 144 tableFormat TableFormat 145 cache *cache.Cache 146 restartInterval int 147 checksumType ChecksumType 148 // disableKeyOrderChecks disables the checks that keys are added to an 149 // sstable in order. It is intended for internal use only in the construction 150 // of invalid sstables for testing. See tool/make_test_sstables.go. 151 disableKeyOrderChecks bool 152 // With two level indexes, the index/filter of a SST file is partitioned into 153 // smaller blocks with an additional top-level index on them. When reading an 154 // index/filter, only the top-level index is loaded into memory. The two level 155 // index/filter then uses the top-level index to load on demand into the block 156 // cache the partitions that are required to perform the index/filter query. 157 // 158 // Two level indexes are enabled automatically when there is more than one 159 // index block. 160 // 161 // This is useful when there are very large index blocks, which generally occurs 162 // with the usage of large keys. With large index blocks, the index blocks fight 163 // the data blocks for block cache space and the index blocks are likely to be 164 // re-read many times from the disk. The top level index, which has a much 165 // smaller memory footprint, can be used to prevent the entire index block from 166 // being loaded into the block cache. 167 twoLevelIndex bool 168 // Internal flag to allow creation of range-del-v1 format blocks. Only used 169 // for testing. Note that v2 format blocks are backwards compatible with v1 170 // format blocks. 171 rangeDelV1Format bool 172 indexBlock *indexBlockBuf 173 rangeDelBlock blockWriter 174 rangeKeyBlock blockWriter 175 topLevelIndexBlock blockWriter 176 props Properties 177 propCollectors []TablePropertyCollector 178 blockPropCollectors []BlockPropertyCollector 179 blockPropsEncoder blockPropertiesEncoder 180 // filter accumulates the filter block. If populated, the filter ingests 181 // either the output of w.split (i.e. a prefix extractor) if w.split is not 182 // nil, or the full keys otherwise. 183 filter filterWriter 184 indexPartitions []indexBlockAndBlockProperties 185 186 // indexBlockAlloc is used to bulk-allocate byte slices used to store index 187 // blocks in indexPartitions. These live until the index finishes. 188 indexBlockAlloc []byte 189 // indexSepAlloc is used to bulk-allocate index block seperator slices stored 190 // in indexPartitions. These live until the index finishes. 191 indexSepAlloc []byte 192 193 // To allow potentially overlapping (i.e. un-fragmented) range keys spans to 194 // be added to the Writer, a keyspan.Fragmenter is used to retain the keys 195 // and values, emitting fragmented, coalesced spans as appropriate. Range 196 // keys must be added in order of their start user-key. 197 fragmenter keyspan.Fragmenter 198 rangeKeyEncoder rangekey.Encoder 199 rangeKeyCoalesced keyspan.Span 200 rkBuf []byte 201 // dataBlockBuf consists of the state which is currently owned by and used by 202 // the Writer client goroutine. This state can be handed off to other goroutines. 203 dataBlockBuf *dataBlockBuf 204 // blockBuf consists of the state which is owned by and used by the Writer client 205 // goroutine. 206 blockBuf blockBuf 207 208 coordination coordinationState 209 } 210 211 type coordinationState struct { 212 parallelismEnabled bool 213 214 // writeQueue is used to write data blocks to disk. The writeQueue is primarily 215 // used to maintain the order in which data blocks must be written to disk. For 216 // this reason, every single data block write must be done through the writeQueue. 217 writeQueue *writeQueue 218 219 sizeEstimate dataBlockEstimates 220 } 221 222 func (c *coordinationState) init(parallelismEnabled bool, writer *Writer) { 223 c.parallelismEnabled = parallelismEnabled 224 c.sizeEstimate.useMutex = parallelismEnabled 225 226 // writeQueueSize determines the size of the write queue, or the number 227 // of items which can be added to the queue without blocking. By default, we 228 // use a writeQueue size of 0, since we won't be doing any block writes in 229 // parallel. 230 writeQueueSize := 0 231 if parallelismEnabled { 232 writeQueueSize = runtime.GOMAXPROCS(0) 233 } 234 c.writeQueue = newWriteQueue(writeQueueSize, writer) 235 } 236 237 type sizeEstimate struct { 238 // emptySize is the size when there is no inflight data, and numEntries is 0. 239 // emptySize is constant once set. 240 emptySize uint64 241 242 // inflightSize is the estimated size of some inflight data which hasn't 243 // been written yet. 244 inflightSize uint64 245 246 // totalSize is the total size of the data which has already been written. 247 totalSize uint64 248 249 // numWrittenEntries is the total number of entries which have already been 250 // written. 251 numWrittenEntries uint64 252 // numInflightEntries is the total number of entries which are inflight, and 253 // haven't been written. 254 numInflightEntries uint64 255 256 // maxEstimatedSize stores the maximum result returned from sizeEstimate.size. 257 // It ensures that values returned from subsequent calls to Writer.EstimatedSize 258 // never decrease. 259 maxEstimatedSize uint64 260 261 // We assume that the entries added to the sizeEstimate can be compressed. 262 // For this reason, we keep track of a compressedSize and an uncompressedSize 263 // to compute a compression ratio for the inflight entries. If the entries 264 // aren't being compressed, then compressedSize and uncompressedSize must be 265 // equal. 266 compressedSize uint64 267 uncompressedSize uint64 268 } 269 270 func (s *sizeEstimate) init(emptySize uint64) { 271 s.emptySize = emptySize 272 } 273 274 func (s *sizeEstimate) size() uint64 { 275 ratio := float64(1) 276 if s.uncompressedSize > 0 { 277 ratio = float64(s.compressedSize) / float64(s.uncompressedSize) 278 } 279 estimatedInflightSize := uint64(float64(s.inflightSize) * ratio) 280 total := s.totalSize + estimatedInflightSize 281 if total > s.maxEstimatedSize { 282 s.maxEstimatedSize = total 283 } else { 284 total = s.maxEstimatedSize 285 } 286 287 if total == 0 { 288 return s.emptySize 289 } 290 291 return total 292 } 293 294 func (s *sizeEstimate) numTotalEntries() uint64 { 295 return s.numWrittenEntries + s.numInflightEntries 296 } 297 298 func (s *sizeEstimate) addInflight(size int) { 299 s.numInflightEntries++ 300 s.inflightSize += uint64(size) 301 } 302 303 func (s *sizeEstimate) written(newTotalSize uint64, inflightSize int, finalEntrySize int) { 304 s.inflightSize -= uint64(inflightSize) 305 if inflightSize > 0 { 306 // This entry was previously inflight, so we should decrement inflight 307 // entries. 308 s.numInflightEntries-- 309 } 310 s.numWrittenEntries++ 311 s.totalSize = newTotalSize 312 313 s.uncompressedSize += uint64(inflightSize) 314 s.compressedSize += uint64(finalEntrySize) 315 } 316 317 func (s *sizeEstimate) clear() { 318 *s = sizeEstimate{emptySize: s.emptySize} 319 } 320 321 type indexBlockBuf struct { 322 // block will only be accessed from the writeQueue. 323 block blockWriter 324 325 size struct { 326 useMutex bool 327 mu sync.Mutex 328 estimate sizeEstimate 329 } 330 331 // restartInterval matches indexBlockBuf.block.restartInterval. We store it twice, because the `block` 332 // must only be accessed from the writeQueue goroutine. 333 restartInterval int 334 } 335 336 func (i *indexBlockBuf) clear() { 337 i.block.clear() 338 if i.size.useMutex { 339 i.size.mu.Lock() 340 defer i.size.mu.Unlock() 341 } 342 i.size.estimate.clear() 343 i.restartInterval = 0 344 } 345 346 var indexBlockBufPool = sync.Pool{ 347 New: func() interface{} { 348 return &indexBlockBuf{} 349 }, 350 } 351 352 const indexBlockRestartInterval = 1 353 354 func newIndexBlockBuf(useMutex bool) *indexBlockBuf { 355 i := indexBlockBufPool.Get().(*indexBlockBuf) 356 i.size.useMutex = useMutex 357 i.restartInterval = indexBlockRestartInterval 358 i.block.restartInterval = indexBlockRestartInterval 359 i.size.estimate.init(emptyBlockSize) 360 return i 361 } 362 363 func (i *indexBlockBuf) shouldFlush( 364 sep InternalKey, valueLen, targetBlockSize, sizeThreshold int, 365 ) bool { 366 if i.size.useMutex { 367 i.size.mu.Lock() 368 defer i.size.mu.Unlock() 369 } 370 371 // nEntries := i.size.estimate.numWrittenEntries + i.size.estimate.numInflightEntries 372 nEntries := i.size.estimate.numTotalEntries() 373 return shouldFlush( 374 sep, valueLen, i.restartInterval, int(i.size.estimate.size()), 375 int(nEntries), targetBlockSize, sizeThreshold) 376 } 377 378 func (i *indexBlockBuf) add(key InternalKey, value []byte, inflightSize int) { 379 i.block.add(key, value) 380 size := i.block.estimatedSize() 381 if i.size.useMutex { 382 i.size.mu.Lock() 383 defer i.size.mu.Unlock() 384 } 385 // Since, we're not compressing index entries when adding them to index blocks, 386 // we assume that the size of entry written to the index block is equal to the 387 // size of the inflight entry, giving us a compression ratio of 1. 388 i.size.estimate.written(uint64(size), inflightSize, inflightSize) 389 } 390 391 func (i *indexBlockBuf) finish() []byte { 392 b := i.block.finish() 393 return b 394 } 395 396 func (i *indexBlockBuf) addInflight(inflightSize int) { 397 if i.size.useMutex { 398 i.size.mu.Lock() 399 defer i.size.mu.Unlock() 400 } 401 i.size.estimate.addInflight(inflightSize) 402 } 403 404 func (i *indexBlockBuf) estimatedSize() uint64 { 405 if i.size.useMutex { 406 i.size.mu.Lock() 407 defer i.size.mu.Unlock() 408 } 409 410 // Make sure that the size estimation works as expected when parallelism 411 // is disabled. 412 if invariants.Enabled && !i.size.useMutex { 413 if i.size.estimate.inflightSize != 0 { 414 panic("unexpected inflight entry in index block size estimation") 415 } 416 417 // NB: The i.block should only be accessed from the writeQueue goroutine, 418 // when parallelism is enabled. We break that invariant here, but that's 419 // okay since parallelism is disabled. 420 if i.size.estimate.size() != uint64(i.block.estimatedSize()) { 421 panic("index block size estimation sans parallelism is incorrect") 422 } 423 } 424 return i.size.estimate.size() 425 } 426 427 // sizeEstimate is used for sstable size estimation. sizeEstimate can be accessed by 428 // the Writer client, writeQueue, compressionQueue goroutines. Fields should only be 429 // read/updated through the functions defined on the *sizeEstimate type. 430 type dataBlockEstimates struct { 431 // If we don't do block compression, block writes in parallel, then we don't need to take 432 // the performance hit of synchronizing using this mutex. 433 useMutex bool 434 mu sync.Mutex 435 436 estimate sizeEstimate 437 } 438 439 // newTotalSize is the new w.meta.Size. inflightSize is the uncompressed block size estimate which 440 // was previously added to sizeEstimate.inflightSize. writtenSize is the compressed size of the block 441 // which was written to disk. 442 func (d *dataBlockEstimates) dataBlockWritten( 443 newTotalSize uint64, inflightSize int, writtenSize int, 444 ) { 445 if d.useMutex { 446 d.mu.Lock() 447 defer d.mu.Unlock() 448 } 449 450 d.estimate.written(newTotalSize, inflightSize, writtenSize) 451 } 452 453 // size is an estimated size of datablock data which has been written to disk. 454 func (d *dataBlockEstimates) size() uint64 { 455 if d.useMutex { 456 d.mu.Lock() 457 defer d.mu.Unlock() 458 } 459 460 // Use invariants to make sure that the size estimation works as expected 461 // when parallelism is disabled. 462 if invariants.Enabled && !d.useMutex { 463 if d.estimate.inflightSize != 0 { 464 panic("unexpected inflight entry in data block size estimation") 465 } 466 } 467 468 return d.estimate.size() 469 } 470 471 func (d *dataBlockEstimates) addInflightDataBlock(size int) { 472 if d.useMutex { 473 d.mu.Lock() 474 defer d.mu.Unlock() 475 } 476 477 d.estimate.addInflight(size) 478 } 479 480 var writeTaskPool = sync.Pool{ 481 New: func() interface{} { 482 t := &writeTask{} 483 t.compressionDone = make(chan bool, 1) 484 return t 485 }, 486 } 487 488 type checksummer struct { 489 checksumType ChecksumType 490 xxHasher *xxhash.Digest 491 } 492 493 func (c *checksummer) checksum(block []byte, blockType []byte) (checksum uint32) { 494 // Calculate the checksum. 495 switch c.checksumType { 496 case ChecksumTypeCRC32c: 497 checksum = crc.New(block).Update(blockType).Value() 498 case ChecksumTypeXXHash64: 499 if c.xxHasher == nil { 500 c.xxHasher = xxhash.New() 501 } else { 502 c.xxHasher.Reset() 503 } 504 c.xxHasher.Write(block) 505 c.xxHasher.Write(blockType) 506 checksum = uint32(c.xxHasher.Sum64()) 507 default: 508 panic(errors.Newf("unsupported checksum type: %d", c.checksumType)) 509 } 510 return checksum 511 } 512 513 type blockBuf struct { 514 // tmp is a scratch buffer, large enough to hold either footerLen bytes, 515 // blockTrailerLen bytes, (5 * binary.MaxVarintLen64) bytes, and most 516 // likely large enough for a block handle with properties. 517 tmp [blockHandleLikelyMaxLen]byte 518 // compressedBuf is the destination buffer for compression. It is re-used over the 519 // lifetime of the blockBuf, avoiding the allocation of a temporary buffer for each block. 520 compressedBuf []byte 521 checksummer checksummer 522 } 523 524 func (b *blockBuf) clear() { 525 // We can't assign b.compressedBuf[:0] to compressedBuf because snappy relies 526 // on the length of the buffer, and not the capacity to determine if it needs 527 // to make an allocation. 528 *b = blockBuf{ 529 compressedBuf: b.compressedBuf, checksummer: b.checksummer, 530 } 531 } 532 533 // A dataBlockBuf holds all the state required to compress and write a data block to disk. 534 // A dataBlockBuf begins its lifecycle owned by the Writer client goroutine. The Writer 535 // client goroutine adds keys to the sstable, writing directly into a dataBlockBuf's blockWriter 536 // until the block is full. Once a dataBlockBuf's block is full, the dataBlockBuf may be passed 537 // to other goroutines for compression and file I/O. 538 type dataBlockBuf struct { 539 blockBuf 540 dataBlock blockWriter 541 542 // uncompressed is a reference to a byte slice which is owned by the dataBlockBuf. It is the 543 // next byte slice to be compressed. The uncompressed byte slice will be backed by the 544 // dataBlock.buf. 545 uncompressed []byte 546 // compressed is a reference to a byte slice which is owned by the dataBlockBuf. It is the 547 // compressed byte slice which must be written to disk. The compressed byte slice may be 548 // backed by the dataBlock.buf, or the dataBlockBuf.compressedBuf, depending on whether 549 // we use the result of the compression. 550 compressed []byte 551 552 // We're making calls to BlockPropertyCollectors from the Writer client goroutine. We need to 553 // pass the encoded block properties over to the write queue. To prevent copies, and allocations, 554 // we give each dataBlockBuf, a blockPropertiesEncoder. 555 blockPropsEncoder blockPropertiesEncoder 556 // dataBlockProps is set when Writer.finishDataBlockProps is called. The dataBlockProps slice is 557 // a shallow copy of the internal buffer of the dataBlockBuf.blockPropsEncoder. 558 dataBlockProps []byte 559 560 // sepScratch is reusable scratch space for computing separator keys. 561 sepScratch []byte 562 } 563 564 func (d *dataBlockBuf) clear() { 565 d.blockBuf.clear() 566 d.dataBlock.clear() 567 568 d.uncompressed = nil 569 d.compressed = nil 570 d.dataBlockProps = nil 571 d.sepScratch = d.sepScratch[:0] 572 } 573 574 var dataBlockBufPool = sync.Pool{ 575 New: func() interface{} { 576 return &dataBlockBuf{} 577 }, 578 } 579 580 func newDataBlockBuf(restartInterval int, checksumType ChecksumType) *dataBlockBuf { 581 d := dataBlockBufPool.Get().(*dataBlockBuf) 582 d.dataBlock.restartInterval = restartInterval 583 d.checksummer.checksumType = checksumType 584 return d 585 } 586 587 func (d *dataBlockBuf) finish() { 588 d.uncompressed = d.dataBlock.finish() 589 } 590 591 func (d *dataBlockBuf) compressAndChecksum(c Compression) { 592 d.compressed = compressAndChecksum(d.uncompressed, c, &d.blockBuf) 593 } 594 595 func (d *dataBlockBuf) shouldFlush( 596 key InternalKey, valueLen, targetBlockSize, sizeThreshold int, 597 ) bool { 598 return shouldFlush( 599 key, valueLen, d.dataBlock.restartInterval, d.dataBlock.estimatedSize(), 600 d.dataBlock.nEntries, targetBlockSize, sizeThreshold) 601 } 602 603 type indexBlockAndBlockProperties struct { 604 nEntries int 605 // sep is the last key added to this block, for computing a separator later. 606 sep InternalKey 607 properties []byte 608 // block is the encoded block produced by blockWriter.finish. 609 block []byte 610 } 611 612 // Set sets the value for the given key. The sequence number is set to 0. 613 // Intended for use to externally construct an sstable before ingestion into a 614 // DB. For a given Writer, the keys passed to Set must be in strictly increasing 615 // order. 616 // 617 // TODO(peter): untested 618 func (w *Writer) Set(key, value []byte) error { 619 if w.err != nil { 620 return w.err 621 } 622 return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindSet), value) 623 } 624 625 // Delete deletes the value for the given key. The sequence number is set to 626 // 0. Intended for use to externally construct an sstable before ingestion into 627 // a DB. 628 // 629 // TODO(peter): untested 630 func (w *Writer) Delete(key []byte) error { 631 if w.err != nil { 632 return w.err 633 } 634 return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindDelete), nil) 635 } 636 637 // DeleteRange deletes all of the keys (and values) in the range [start,end) 638 // (inclusive on start, exclusive on end). The sequence number is set to 639 // 0. Intended for use to externally construct an sstable before ingestion into 640 // a DB. 641 // 642 // TODO(peter): untested 643 func (w *Writer) DeleteRange(start, end []byte) error { 644 if w.err != nil { 645 return w.err 646 } 647 return w.addTombstone(base.MakeInternalKey(start, 0, InternalKeyKindRangeDelete), end) 648 } 649 650 // Merge adds an action to the DB that merges the value at key with the new 651 // value. The details of the merge are dependent upon the configured merge 652 // operator. The sequence number is set to 0. Intended for use to externally 653 // construct an sstable before ingestion into a DB. 654 // 655 // TODO(peter): untested 656 func (w *Writer) Merge(key, value []byte) error { 657 if w.err != nil { 658 return w.err 659 } 660 return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindMerge), value) 661 } 662 663 // Add adds a key/value pair to the table being written. For a given Writer, 664 // the keys passed to Add must be in increasing order. The exception to this 665 // rule is range deletion tombstones. Range deletion tombstones need to be 666 // added ordered by their start key, but they can be added out of order from 667 // point entries. Additionally, range deletion tombstones must be fragmented 668 // (i.e. by keyspan.Fragmenter). 669 func (w *Writer) Add(key InternalKey, value []byte) error { 670 if w.err != nil { 671 return w.err 672 } 673 674 switch key.Kind() { 675 case InternalKeyKindRangeDelete: 676 return w.addTombstone(key, value) 677 case base.InternalKeyKindRangeKeyDelete, 678 base.InternalKeyKindRangeKeySet, 679 base.InternalKeyKindRangeKeyUnset: 680 w.err = errors.Errorf( 681 "bitalostable: range keys must be added via one of the RangeKey* functions") 682 return w.err 683 } 684 return w.addPoint(key, value) 685 } 686 687 func (w *Writer) addPoint(key InternalKey, value []byte) error { 688 if !w.disableKeyOrderChecks && w.dataBlockBuf.dataBlock.nEntries >= 1 { 689 // curKey is guaranteed to be the last point key which was added to the Writer. 690 // Inlining base.DecodeInternalKey has a 2-3% improve in the BenchmarkWriter 691 // benchmark. 692 encodedKey := w.dataBlockBuf.dataBlock.curKey 693 n := len(encodedKey) - base.InternalTrailerLen 694 var trailer uint64 695 if n >= 0 { 696 trailer = binary.LittleEndian.Uint64(encodedKey[n:]) 697 encodedKey = encodedKey[:n:n] 698 } else { 699 trailer = uint64(InternalKeyKindInvalid) 700 encodedKey = nil 701 } 702 largestPointKey := InternalKey{ 703 UserKey: encodedKey, 704 Trailer: trailer, 705 } 706 707 if largestPointKey.UserKey != nil { 708 // TODO(peter): Manually inlined version of base.InternalCompare(). This is 709 // 3.5% faster on BenchmarkWriter on go1.13. Remove if go1.14 or future 710 // versions show this to not be a performance win. 711 x := w.compare(largestPointKey.UserKey, key.UserKey) 712 if x > 0 || (x == 0 && largestPointKey.Trailer <= key.Trailer) { 713 w.err = errors.Errorf("bitalostable: keys must be added in strictly increasing order: %s, %s", 714 largestPointKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) 715 return w.err 716 } 717 } 718 } 719 720 if err := w.maybeFlush(key, value); err != nil { 721 return err 722 } 723 724 for i := range w.propCollectors { 725 if err := w.propCollectors[i].Add(key, value); err != nil { 726 w.err = err 727 return err 728 } 729 } 730 for i := range w.blockPropCollectors { 731 if err := w.blockPropCollectors[i].Add(key, value); err != nil { 732 w.err = err 733 return err 734 } 735 } 736 737 w.maybeAddToFilter(key.UserKey) 738 w.dataBlockBuf.dataBlock.add(key, value) 739 740 w.meta.updateSeqNum(key.SeqNum()) 741 742 if !w.meta.HasPointKeys { 743 k := base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey) 744 // NB: We need to ensure that SmallestPoint.UserKey is set, so we create 745 // an InternalKey which is semantically identical to the key, but won't 746 // have a nil UserKey. We do this, because key.UserKey could be nil, and 747 // we don't want SmallestPoint.UserKey to be nil. 748 // 749 // todo(bananabrick): Determine if it's okay to have a nil SmallestPoint 750 // .UserKey now that we don't rely on a nil UserKey to determine if the 751 // key has been set or not. 752 w.meta.SetSmallestPointKey(k.Clone()) 753 } 754 755 w.props.NumEntries++ 756 switch key.Kind() { 757 case InternalKeyKindDelete: 758 w.props.NumDeletions++ 759 case InternalKeyKindMerge: 760 w.props.NumMergeOperands++ 761 } 762 w.props.RawKeySize += uint64(key.Size()) 763 w.props.RawValueSize += uint64(len(value)) 764 return nil 765 } 766 767 func (w *Writer) prettyTombstone(k InternalKey, value []byte) fmt.Formatter { 768 return keyspan.Span{ 769 Start: k.UserKey, 770 End: value, 771 Keys: []keyspan.Key{{Trailer: k.Trailer}}, 772 }.Pretty(w.formatKey) 773 } 774 775 func (w *Writer) addTombstone(key InternalKey, value []byte) error { 776 if !w.disableKeyOrderChecks && !w.rangeDelV1Format && w.rangeDelBlock.nEntries > 0 { 777 // Check that tombstones are being added in fragmented order. If the two 778 // tombstones overlap, their start and end keys must be identical. 779 prevKey := base.DecodeInternalKey(w.rangeDelBlock.curKey) 780 switch c := w.compare(prevKey.UserKey, key.UserKey); { 781 case c > 0: 782 w.err = errors.Errorf("bitalostable: keys must be added in order: %s, %s", 783 prevKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) 784 return w.err 785 case c == 0: 786 prevValue := w.rangeDelBlock.curValue 787 if w.compare(prevValue, value) != 0 { 788 w.err = errors.Errorf("bitalostable: overlapping tombstones must be fragmented: %s vs %s", 789 w.prettyTombstone(prevKey, prevValue), 790 w.prettyTombstone(key, value)) 791 return w.err 792 } 793 if prevKey.SeqNum() <= key.SeqNum() { 794 w.err = errors.Errorf("bitalostable: keys must be added in strictly increasing order: %s, %s", 795 prevKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) 796 return w.err 797 } 798 default: 799 prevValue := w.rangeDelBlock.curValue 800 if w.compare(prevValue, key.UserKey) > 0 { 801 w.err = errors.Errorf("bitalostable: overlapping tombstones must be fragmented: %s vs %s", 802 w.prettyTombstone(prevKey, prevValue), 803 w.prettyTombstone(key, value)) 804 return w.err 805 } 806 } 807 } 808 809 if key.Trailer == InternalKeyRangeDeleteSentinel { 810 w.err = errors.Errorf("bitalostable: cannot add range delete sentinel: %s", key.Pretty(w.formatKey)) 811 return w.err 812 } 813 814 for i := range w.propCollectors { 815 if err := w.propCollectors[i].Add(key, value); err != nil { 816 w.err = err 817 return err 818 } 819 } 820 821 w.meta.updateSeqNum(key.SeqNum()) 822 823 switch { 824 case w.rangeDelV1Format: 825 // Range tombstones are not fragmented in the v1 (i.e. RocksDB) range 826 // deletion block format, so we need to track the largest range tombstone 827 // end key as every range tombstone is added. 828 // 829 // Note that writing the v1 format is only supported for tests. 830 if w.props.NumRangeDeletions == 0 { 831 w.meta.SetSmallestRangeDelKey(key.Clone()) 832 w.meta.SetLargestRangeDelKey(base.MakeRangeDeleteSentinelKey(value).Clone()) 833 } else { 834 if base.InternalCompare(w.compare, w.meta.SmallestRangeDel, key) > 0 { 835 w.meta.SetSmallestRangeDelKey(key.Clone()) 836 } 837 end := base.MakeRangeDeleteSentinelKey(value) 838 if base.InternalCompare(w.compare, w.meta.LargestRangeDel, end) < 0 { 839 w.meta.SetLargestRangeDelKey(end.Clone()) 840 } 841 } 842 843 default: 844 // Range tombstones are fragmented in the v2 range deletion block format, 845 // so the start key of the first range tombstone added will be the smallest 846 // range tombstone key. The largest range tombstone key will be determined 847 // in Writer.Close() as the end key of the last range tombstone added. 848 if w.props.NumRangeDeletions == 0 { 849 w.meta.SetSmallestRangeDelKey(key.Clone()) 850 } 851 } 852 853 w.props.NumEntries++ 854 w.props.NumDeletions++ 855 w.props.NumRangeDeletions++ 856 w.props.RawKeySize += uint64(key.Size()) 857 w.props.RawValueSize += uint64(len(value)) 858 w.rangeDelBlock.add(key, value) 859 return nil 860 } 861 862 // RangeKeySet sets a range between start (inclusive) and end (exclusive) with 863 // the given suffix to the given value. 864 // 865 // Keys must be added to the table in increasing order of start key. Spans are 866 // not required to be fragmented. 867 func (w *Writer) RangeKeySet(start, end, suffix, value []byte) error { 868 return w.addRangeKeySpan(keyspan.Span{ 869 Start: w.tempRangeKeyCopy(start), 870 End: w.tempRangeKeyCopy(end), 871 Keys: []keyspan.Key{ 872 { 873 Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeySet), 874 Suffix: w.tempRangeKeyCopy(suffix), 875 Value: w.tempRangeKeyCopy(value), 876 }, 877 }, 878 }) 879 } 880 881 // RangeKeyUnset un-sets a range between start (inclusive) and end (exclusive) 882 // with the given suffix. 883 // 884 // Keys must be added to the table in increasing order of start key. Spans are 885 // not required to be fragmented. 886 func (w *Writer) RangeKeyUnset(start, end, suffix []byte) error { 887 return w.addRangeKeySpan(keyspan.Span{ 888 Start: w.tempRangeKeyCopy(start), 889 End: w.tempRangeKeyCopy(end), 890 Keys: []keyspan.Key{ 891 { 892 Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeyUnset), 893 Suffix: w.tempRangeKeyCopy(suffix), 894 }, 895 }, 896 }) 897 } 898 899 // RangeKeyDelete deletes a range between start (inclusive) and end (exclusive). 900 // 901 // Keys must be added to the table in increasing order of start key. Spans are 902 // not required to be fragmented. 903 func (w *Writer) RangeKeyDelete(start, end []byte) error { 904 return w.addRangeKeySpan(keyspan.Span{ 905 Start: w.tempRangeKeyCopy(start), 906 End: w.tempRangeKeyCopy(end), 907 Keys: []keyspan.Key{ 908 {Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeyDelete)}, 909 }, 910 }) 911 } 912 913 // AddRangeKey adds a range key set, unset, or delete key/value pair to the 914 // table being written. 915 // 916 // Range keys must be supplied in strictly ascending order of start key (i.e. 917 // user key ascending, sequence number descending, and key type descending). 918 // Ranges added must also be supplied in fragmented span order - i.e. other than 919 // spans that are perfectly aligned (same start and end keys), spans may not 920 // overlap. Range keys may be added out of order relative to point keys and 921 // range deletions. 922 func (w *Writer) AddRangeKey(key InternalKey, value []byte) error { 923 if w.err != nil { 924 return w.err 925 } 926 return w.addRangeKey(key, value) 927 } 928 929 func (w *Writer) addRangeKeySpan(span keyspan.Span) error { 930 if w.fragmenter.Start() != nil && w.compare(w.fragmenter.Start(), span.Start) > 0 { 931 return errors.Errorf("bitalostable: spans must be added in order: %s > %s", 932 w.formatKey(w.fragmenter.Start()), w.formatKey(span.Start)) 933 } 934 // Add this span to the fragmenter. 935 w.fragmenter.Add(span) 936 return w.err 937 } 938 939 func (w *Writer) coalesceSpans(span keyspan.Span) { 940 // This method is the emit function of the Fragmenter, so span.Keys is only 941 // owned by this span and it's safe to mutate. 942 w.rangeKeyCoalesced.Start = span.Start 943 w.rangeKeyCoalesced.End = span.End 944 err := rangekey.Coalesce(w.compare, span.Keys, &w.rangeKeyCoalesced.Keys) 945 if err != nil { 946 w.err = errors.Newf("sstable: could not coalesce span: %s", err) 947 return 948 } 949 950 // NB: The span only contains range keys and is internally consistent (eg, 951 // no duplicate suffixes, no additional keys after a RANGEKEYDEL). 952 w.err = firstError(w.err, w.rangeKeyEncoder.Encode(&w.rangeKeyCoalesced)) 953 } 954 955 func (w *Writer) addRangeKey(key InternalKey, value []byte) error { 956 if !w.disableKeyOrderChecks && w.rangeKeyBlock.nEntries > 0 { 957 prevStartKey := base.DecodeInternalKey(w.rangeKeyBlock.curKey) 958 prevEndKey, _, ok := rangekey.DecodeEndKey(prevStartKey.Kind(), w.rangeKeyBlock.curValue) 959 if !ok { 960 // We panic here as we should have previously decoded and validated this 961 // key and value when it was first added to the range key block. 962 panic(errors.Errorf("bitalostable: invalid end key for span: %s", 963 prevStartKey.Pretty(w.formatKey))) 964 } 965 966 curStartKey := key 967 curEndKey, _, ok := rangekey.DecodeEndKey(curStartKey.Kind(), value) 968 if !ok { 969 w.err = errors.Errorf("bitalostable: invalid end key for span: %s", 970 curStartKey.Pretty(w.formatKey)) 971 return w.err 972 } 973 974 // Start keys must be strictly increasing. 975 if base.InternalCompare(w.compare, prevStartKey, curStartKey) >= 0 { 976 w.err = errors.Errorf( 977 "bitalostable: range keys starts must be added in increasing order: %s, %s", 978 prevStartKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) 979 return w.err 980 } 981 982 // Start keys are increasing. If the start user keys are equal, the 983 // end keys must be equal (i.e. aligned spans). 984 if w.compare(prevStartKey.UserKey, curStartKey.UserKey) == 0 { 985 if w.compare(prevEndKey, curEndKey) != 0 { 986 w.err = errors.Errorf("bitalostable: overlapping range keys must be fragmented: %s, %s", 987 prevStartKey.Pretty(w.formatKey), 988 curStartKey.Pretty(w.formatKey)) 989 return w.err 990 } 991 } else if w.compare(prevEndKey, curStartKey.UserKey) > 0 { 992 // If the start user keys are NOT equal, the spans must be disjoint (i.e. 993 // no overlap). 994 // NOTE: the inequality excludes zero, as we allow the end key of the 995 // lower span be the same as the start key of the upper span, because 996 // the range end key is considered an exclusive bound. 997 w.err = errors.Errorf("bitalostable: overlapping range keys must be fragmented: %s, %s", 998 prevStartKey.Pretty(w.formatKey), 999 curStartKey.Pretty(w.formatKey)) 1000 return w.err 1001 } 1002 } 1003 1004 // TODO(travers): Add an invariant-gated check to ensure that suffix-values 1005 // are sorted within coalesced spans. 1006 1007 // Range-keys and point-keys are intended to live in "parallel" keyspaces. 1008 // However, we track a single seqnum in the table metadata that spans both of 1009 // these keyspaces. 1010 // TODO(travers): Consider tracking range key seqnums separately. 1011 w.meta.updateSeqNum(key.SeqNum()) 1012 1013 // Range tombstones are fragmented, so the start key of the first range key 1014 // added will be the smallest. The largest range key is determined in 1015 // Writer.Close() as the end key of the last range key added to the block. 1016 if w.props.NumRangeKeys() == 0 { 1017 w.meta.SetSmallestRangeKey(key.Clone()) 1018 } 1019 1020 // Update block properties. 1021 w.props.RawRangeKeyKeySize += uint64(key.Size()) 1022 w.props.RawRangeKeyValueSize += uint64(len(value)) 1023 switch key.Kind() { 1024 case base.InternalKeyKindRangeKeyDelete: 1025 w.props.NumRangeKeyDels++ 1026 case base.InternalKeyKindRangeKeySet: 1027 w.props.NumRangeKeySets++ 1028 case base.InternalKeyKindRangeKeyUnset: 1029 w.props.NumRangeKeyUnsets++ 1030 default: 1031 panic(errors.Errorf("bitalostable: invalid range key type: %s", key.Kind())) 1032 } 1033 1034 for i := range w.blockPropCollectors { 1035 if err := w.blockPropCollectors[i].Add(key, value); err != nil { 1036 return err 1037 } 1038 } 1039 1040 // Add the key to the block. 1041 w.rangeKeyBlock.add(key, value) 1042 return nil 1043 } 1044 1045 // tempRangeKeyBuf returns a slice of length n from the Writer's rkBuf byte 1046 // slice. Any byte written to the returned slice is retained for the lifetime of 1047 // the Writer. 1048 func (w *Writer) tempRangeKeyBuf(n int) []byte { 1049 if cap(w.rkBuf)-len(w.rkBuf) < n { 1050 size := len(w.rkBuf) + 2*n 1051 if size < 2*cap(w.rkBuf) { 1052 size = 2 * cap(w.rkBuf) 1053 } 1054 buf := make([]byte, len(w.rkBuf), size) 1055 copy(buf, w.rkBuf) 1056 w.rkBuf = buf 1057 } 1058 b := w.rkBuf[len(w.rkBuf) : len(w.rkBuf)+n] 1059 w.rkBuf = w.rkBuf[:len(w.rkBuf)+n] 1060 return b 1061 } 1062 1063 // tempRangeKeyCopy returns a copy of the provided slice, stored in the Writer's 1064 // range key buffer. 1065 func (w *Writer) tempRangeKeyCopy(k []byte) []byte { 1066 if len(k) == 0 { 1067 return nil 1068 } 1069 buf := w.tempRangeKeyBuf(len(k)) 1070 copy(buf, k) 1071 return buf 1072 } 1073 1074 func (w *Writer) maybeAddToFilter(key []byte) { 1075 if w.filter != nil { 1076 if w.split != nil { 1077 prefix := key[:w.split(key)] 1078 w.filter.addKey(prefix) 1079 } else { 1080 w.filter.addKey(key) 1081 } 1082 } 1083 } 1084 1085 func (w *Writer) flush(key InternalKey) error { 1086 estimatedUncompressedSize := w.dataBlockBuf.dataBlock.estimatedSize() 1087 w.coordination.sizeEstimate.addInflightDataBlock(estimatedUncompressedSize) 1088 1089 var err error 1090 1091 // We're finishing a data block. 1092 err = w.finishDataBlockProps(w.dataBlockBuf) 1093 if err != nil { 1094 return err 1095 } 1096 1097 w.dataBlockBuf.finish() 1098 w.dataBlockBuf.compressAndChecksum(w.compression) 1099 1100 // Determine if the index block should be flushed. Since we're accessing the 1101 // dataBlockBuf.dataBlock.curKey here, we have to make sure that once we start 1102 // to pool the dataBlockBufs, the curKey isn't used by the Writer once the 1103 // dataBlockBuf is added back to a sync.Pool. In this particular case, the 1104 // byte slice which supports "sep" will eventually be copied when "sep" is 1105 // added to the index block. 1106 prevKey := base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey) 1107 sep := w.indexEntrySep(prevKey, key, w.dataBlockBuf) 1108 // We determine that we should flush an index block from the Writer client 1109 // goroutine, but we actually finish the index block from the writeQueue. 1110 // When we determine that an index block should be flushed, we need to call 1111 // BlockPropertyCollector.FinishIndexBlock. But block property collector 1112 // calls must happen sequentially from the Writer client. Therefore, we need 1113 // to determine that we are going to flush the index block from the Writer 1114 // client. 1115 shouldFlushIndexBlock := supportsTwoLevelIndex(w.tableFormat) && w.indexBlock.shouldFlush( 1116 sep, encodedBHPEstimatedSize, w.indexBlockSize, w.indexBlockSizeThreshold, 1117 ) 1118 1119 var indexProps []byte 1120 var flushableIndexBlock *indexBlockBuf 1121 if shouldFlushIndexBlock { 1122 flushableIndexBlock = w.indexBlock 1123 w.indexBlock = newIndexBlockBuf(w.coordination.parallelismEnabled) 1124 // Call BlockPropertyCollector.FinishIndexBlock, since we've decided to 1125 // flush the index block. 1126 indexProps, err = w.finishIndexBlockProps() 1127 if err != nil { 1128 return err 1129 } 1130 } 1131 1132 // We've called BlockPropertyCollector.FinishDataBlock, and, if necessary, 1133 // BlockPropertyCollector.FinishIndexBlock. Since we've decided to finish 1134 // the data block, we can call 1135 // BlockPropertyCollector.AddPrevDataBlockToIndexBlock. 1136 w.addPrevDataBlockToIndexBlockProps() 1137 1138 // Schedule a write. 1139 writeTask := writeTaskPool.Get().(*writeTask) 1140 // We're setting compressionDone to indicate that compression of this block 1141 // has already been completed. 1142 writeTask.compressionDone <- true 1143 writeTask.buf = w.dataBlockBuf 1144 writeTask.indexEntrySep = sep 1145 writeTask.inflightSize = estimatedUncompressedSize 1146 writeTask.currIndexBlock = w.indexBlock 1147 writeTask.indexInflightSize = sep.Size() + encodedBHPEstimatedSize 1148 writeTask.finishedIndexProps = indexProps 1149 writeTask.flushableIndexBlock = flushableIndexBlock 1150 1151 // The writeTask corresponds to an unwritten index entry. 1152 w.indexBlock.addInflight(writeTask.indexInflightSize) 1153 1154 w.dataBlockBuf = nil 1155 if w.coordination.parallelismEnabled { 1156 w.coordination.writeQueue.add(writeTask) 1157 } else { 1158 err = w.coordination.writeQueue.addSync(writeTask) 1159 } 1160 w.dataBlockBuf = newDataBlockBuf(w.restartInterval, w.checksumType) 1161 1162 return err 1163 } 1164 1165 func (w *Writer) maybeFlush(key InternalKey, value []byte) error { 1166 if !w.dataBlockBuf.shouldFlush(key, len(value), w.blockSize, w.blockSizeThreshold) { 1167 return nil 1168 } 1169 1170 err := w.flush(key) 1171 1172 if err != nil { 1173 w.err = err 1174 return err 1175 } 1176 1177 return nil 1178 } 1179 1180 // dataBlockBuf.dataBlockProps set by this method must be encoded before any future use of the 1181 // dataBlockBuf.blockPropsEncoder, since the properties slice will get reused by the 1182 // blockPropsEncoder. 1183 func (w *Writer) finishDataBlockProps(buf *dataBlockBuf) error { 1184 if len(w.blockPropCollectors) == 0 { 1185 return nil 1186 } 1187 var err error 1188 buf.blockPropsEncoder.resetProps() 1189 for i := range w.blockPropCollectors { 1190 scratch := buf.blockPropsEncoder.getScratchForProp() 1191 if scratch, err = w.blockPropCollectors[i].FinishDataBlock(scratch); err != nil { 1192 return err 1193 } 1194 if len(scratch) > 0 { 1195 buf.blockPropsEncoder.addProp(shortID(i), scratch) 1196 } 1197 } 1198 1199 buf.dataBlockProps = buf.blockPropsEncoder.unsafeProps() 1200 return nil 1201 } 1202 1203 // The BlockHandleWithProperties returned by this method must be encoded before any future use of 1204 // the Writer.blockPropsEncoder, since the properties slice will get reused by the blockPropsEncoder. 1205 // maybeAddBlockPropertiesToBlockHandle should only be called if block is being written synchronously 1206 // with the Writer client. 1207 func (w *Writer) maybeAddBlockPropertiesToBlockHandle( 1208 bh BlockHandle, 1209 ) (BlockHandleWithProperties, error) { 1210 err := w.finishDataBlockProps(w.dataBlockBuf) 1211 if err != nil { 1212 return BlockHandleWithProperties{}, err 1213 } 1214 return BlockHandleWithProperties{BlockHandle: bh, Props: w.dataBlockBuf.dataBlockProps}, nil 1215 } 1216 1217 func (w *Writer) indexEntrySep(prevKey, key InternalKey, dataBlockBuf *dataBlockBuf) InternalKey { 1218 // Make a rough guess that we want key-sized scratch to compute the separator. 1219 if cap(dataBlockBuf.sepScratch) < key.Size() { 1220 dataBlockBuf.sepScratch = make([]byte, 0, key.Size()*2) 1221 } 1222 1223 var sep InternalKey 1224 if key.UserKey == nil && key.Trailer == 0 { 1225 sep = prevKey.Successor(w.compare, w.successor, dataBlockBuf.sepScratch[:0]) 1226 } else { 1227 sep = prevKey.Separator(w.compare, w.separator, dataBlockBuf.sepScratch[:0], key) 1228 } 1229 return sep 1230 } 1231 1232 // addIndexEntry adds an index entry for the specified key and block handle. 1233 // addIndexEntry can be called from both the Writer client goroutine, and the 1234 // writeQueue goroutine. If the flushIndexBuf != nil, then the indexProps, as 1235 // they're used when the index block is finished. 1236 // 1237 // Invariant: 1238 // 1. addIndexEntry must not store references to the sep InternalKey, the tmp 1239 // byte slice, bhp.Props. That is, these must be either deep copied or 1240 // encoded. 1241 // 2. addIndexEntry must not hold references to the flushIndexBuf, and the writeTo 1242 // indexBlockBufs. 1243 func (w *Writer) addIndexEntry( 1244 sep InternalKey, 1245 bhp BlockHandleWithProperties, 1246 tmp []byte, 1247 flushIndexBuf *indexBlockBuf, 1248 writeTo *indexBlockBuf, 1249 inflightSize int, 1250 indexProps []byte, 1251 ) error { 1252 if bhp.Length == 0 { 1253 // A valid blockHandle must be non-zero. 1254 // In particular, it must have a non-zero length. 1255 return nil 1256 } 1257 1258 encoded := encodeBlockHandleWithProperties(tmp, bhp) 1259 1260 if flushIndexBuf != nil { 1261 if cap(w.indexPartitions) == 0 { 1262 w.indexPartitions = make([]indexBlockAndBlockProperties, 0, 32) 1263 } 1264 // Enable two level indexes if there is more than one index block. 1265 w.twoLevelIndex = true 1266 if err := w.finishIndexBlock(flushIndexBuf, indexProps); err != nil { 1267 return err 1268 } 1269 } 1270 1271 writeTo.add(sep, encoded, inflightSize) 1272 return nil 1273 } 1274 1275 func (w *Writer) addPrevDataBlockToIndexBlockProps() { 1276 for i := range w.blockPropCollectors { 1277 w.blockPropCollectors[i].AddPrevDataBlockToIndexBlock() 1278 } 1279 } 1280 1281 // addIndexEntrySync adds an index entry for the specified key and block handle. 1282 // Writer.addIndexEntry is only called synchronously once Writer.Close is called. 1283 // addIndexEntrySync should only be called if we're sure that index entries 1284 // aren't being written asynchronously. 1285 // 1286 // Invariant: 1287 // 1. addIndexEntrySync must not store references to the prevKey, key InternalKey's, 1288 // the tmp byte slice. That is, these must be either deep copied or encoded. 1289 func (w *Writer) addIndexEntrySync( 1290 prevKey, key InternalKey, bhp BlockHandleWithProperties, tmp []byte, 1291 ) error { 1292 sep := w.indexEntrySep(prevKey, key, w.dataBlockBuf) 1293 shouldFlush := supportsTwoLevelIndex( 1294 w.tableFormat) && w.indexBlock.shouldFlush( 1295 sep, encodedBHPEstimatedSize, w.indexBlockSize, w.indexBlockSizeThreshold, 1296 ) 1297 var flushableIndexBlock *indexBlockBuf 1298 var props []byte 1299 var err error 1300 if shouldFlush { 1301 flushableIndexBlock = w.indexBlock 1302 w.indexBlock = newIndexBlockBuf(w.coordination.parallelismEnabled) 1303 1304 // Call BlockPropertyCollector.FinishIndexBlock, since we've decided to 1305 // flush the index block. 1306 props, err = w.finishIndexBlockProps() 1307 if err != nil { 1308 return err 1309 } 1310 } 1311 1312 err = w.addIndexEntry(sep, bhp, tmp, flushableIndexBlock, w.indexBlock, 0, props) 1313 if flushableIndexBlock != nil { 1314 flushableIndexBlock.clear() 1315 indexBlockBufPool.Put(flushableIndexBlock) 1316 } 1317 w.addPrevDataBlockToIndexBlockProps() 1318 return err 1319 } 1320 1321 func shouldFlush( 1322 key InternalKey, 1323 valueLen int, 1324 restartInterval, estimatedBlockSize, numEntries, targetBlockSize, sizeThreshold int, 1325 ) bool { 1326 if numEntries == 0 { 1327 return false 1328 } 1329 1330 if estimatedBlockSize >= targetBlockSize { 1331 return true 1332 } 1333 1334 // The block is currently smaller than the target size. 1335 if estimatedBlockSize <= sizeThreshold { 1336 // The block is smaller than the threshold size at which we'll consider 1337 // flushing it. 1338 return false 1339 } 1340 1341 newSize := estimatedBlockSize + key.Size() + valueLen 1342 if numEntries%restartInterval == 0 { 1343 newSize += 4 1344 } 1345 newSize += 4 // varint for shared prefix length 1346 newSize += uvarintLen(uint32(key.Size())) // varint for unshared key bytes 1347 newSize += uvarintLen(uint32(valueLen)) // varint for value size 1348 // Flush if the block plus the new entry is larger than the target size. 1349 return newSize > targetBlockSize 1350 } 1351 1352 const keyAllocSize = 256 << 10 1353 1354 func cloneKeyWithBuf(k InternalKey, buf []byte) ([]byte, InternalKey) { 1355 if len(k.UserKey) == 0 { 1356 return buf, k 1357 } 1358 if len(buf) < len(k.UserKey) { 1359 buf = make([]byte, len(k.UserKey)+keyAllocSize) 1360 } 1361 n := copy(buf, k.UserKey) 1362 return buf[n:], InternalKey{UserKey: buf[:n:n], Trailer: k.Trailer} 1363 } 1364 1365 // Invariants: The byte slice returned by finishIndexBlockProps is heap-allocated 1366 // 1367 // and has its own lifetime, independent of the Writer and the blockPropsEncoder, 1368 // 1369 // and it is safe to: 1370 // 1. Reuse w.blockPropsEncoder without first encoding the byte slice returned. 1371 // 2. Store the byte slice in the Writer since it is a copy and not supported by 1372 // an underlying buffer. 1373 func (w *Writer) finishIndexBlockProps() ([]byte, error) { 1374 w.blockPropsEncoder.resetProps() 1375 for i := range w.blockPropCollectors { 1376 scratch := w.blockPropsEncoder.getScratchForProp() 1377 var err error 1378 if scratch, err = w.blockPropCollectors[i].FinishIndexBlock(scratch); err != nil { 1379 return nil, err 1380 } 1381 if len(scratch) > 0 { 1382 w.blockPropsEncoder.addProp(shortID(i), scratch) 1383 } 1384 } 1385 return w.blockPropsEncoder.props(), nil 1386 } 1387 1388 // finishIndexBlock finishes the current index block and adds it to the top 1389 // level index block. This is only used when two level indexes are enabled. 1390 // 1391 // Invariants: 1392 // 1. The props slice passed into finishedIndexBlock must not be a 1393 // owned by any other struct, since it will be stored in the Writer.indexPartitions 1394 // slice. 1395 // 2. None of the buffers owned by indexBuf will be shallow copied and stored elsewhere. 1396 // That is, it must be safe to reuse indexBuf after finishIndexBlock has been called. 1397 func (w *Writer) finishIndexBlock(indexBuf *indexBlockBuf, props []byte) error { 1398 part := indexBlockAndBlockProperties{ 1399 nEntries: indexBuf.block.nEntries, properties: props, 1400 } 1401 w.indexSepAlloc, part.sep = cloneKeyWithBuf( 1402 base.DecodeInternalKey(indexBuf.block.curKey), w.indexSepAlloc, 1403 ) 1404 bk := indexBuf.finish() 1405 if len(w.indexBlockAlloc) < len(bk) { 1406 // Allocate enough bytes for approximately 16 index blocks. 1407 w.indexBlockAlloc = make([]byte, len(bk)*16) 1408 } 1409 n := copy(w.indexBlockAlloc, bk) 1410 part.block = w.indexBlockAlloc[:n:n] 1411 w.indexBlockAlloc = w.indexBlockAlloc[n:] 1412 w.indexPartitions = append(w.indexPartitions, part) 1413 return nil 1414 } 1415 1416 func (w *Writer) writeTwoLevelIndex() (BlockHandle, error) { 1417 props, err := w.finishIndexBlockProps() 1418 if err != nil { 1419 return BlockHandle{}, err 1420 } 1421 // Add the final unfinished index. 1422 if err = w.finishIndexBlock(w.indexBlock, props); err != nil { 1423 return BlockHandle{}, err 1424 } 1425 1426 for i := range w.indexPartitions { 1427 b := &w.indexPartitions[i] 1428 w.props.NumDataBlocks += uint64(b.nEntries) 1429 1430 data := b.block 1431 w.props.IndexSize += uint64(len(data)) 1432 bh, err := w.writeBlock(data, w.compression, &w.blockBuf) 1433 if err != nil { 1434 return BlockHandle{}, err 1435 } 1436 bhp := BlockHandleWithProperties{ 1437 BlockHandle: bh, 1438 Props: b.properties, 1439 } 1440 encoded := encodeBlockHandleWithProperties(w.blockBuf.tmp[:], bhp) 1441 w.topLevelIndexBlock.add(b.sep, encoded) 1442 } 1443 1444 // NB: RocksDB includes the block trailer length in the index size 1445 // property, though it doesn't include the trailer in the top level 1446 // index size property. 1447 w.props.IndexPartitions = uint64(len(w.indexPartitions)) 1448 w.props.TopLevelIndexSize = uint64(w.topLevelIndexBlock.estimatedSize()) 1449 w.props.IndexSize += w.props.TopLevelIndexSize + blockTrailerLen 1450 1451 return w.writeBlock(w.topLevelIndexBlock.finish(), w.compression, &w.blockBuf) 1452 } 1453 1454 func compressAndChecksum(b []byte, compression Compression, blockBuf *blockBuf) []byte { 1455 // Compress the buffer, discarding the result if the improvement isn't at 1456 // least 12.5%. 1457 blockType, compressed := compressBlock(compression, b, blockBuf.compressedBuf) 1458 if blockType != noCompressionBlockType && cap(compressed) > cap(blockBuf.compressedBuf) { 1459 blockBuf.compressedBuf = compressed[:cap(compressed)] 1460 } 1461 if len(compressed) < len(b)-len(b)/8 { 1462 b = compressed 1463 } else { 1464 blockType = noCompressionBlockType 1465 } 1466 1467 blockBuf.tmp[0] = byte(blockType) 1468 1469 // Calculate the checksum. 1470 checksum := blockBuf.checksummer.checksum(b, blockBuf.tmp[:1]) 1471 binary.LittleEndian.PutUint32(blockBuf.tmp[1:5], checksum) 1472 return b 1473 } 1474 1475 func (w *Writer) writeCompressedBlock(block []byte, blockTrailerBuf []byte) (BlockHandle, error) { 1476 bh := BlockHandle{Offset: w.meta.Size, Length: uint64(len(block))} 1477 1478 if w.cacheID != 0 && w.fileNum != 0 { 1479 // Remove the block being written from the cache. This provides defense in 1480 // depth against bugs which cause cache collisions. 1481 // 1482 // TODO(peter): Alternatively, we could add the uncompressed value to the 1483 // cache. 1484 w.cache.Delete(w.cacheID, w.fileNum, bh.Offset) 1485 } 1486 1487 // Write the bytes to the file. 1488 n, err := w.writer.Write(block) 1489 if err != nil { 1490 return BlockHandle{}, err 1491 } 1492 w.meta.Size += uint64(n) 1493 n, err = w.writer.Write(blockTrailerBuf[:blockTrailerLen]) 1494 if err != nil { 1495 return BlockHandle{}, err 1496 } 1497 w.meta.Size += uint64(n) 1498 1499 return bh, nil 1500 } 1501 1502 func (w *Writer) writeBlock( 1503 b []byte, compression Compression, blockBuf *blockBuf, 1504 ) (BlockHandle, error) { 1505 b = compressAndChecksum(b, compression, blockBuf) 1506 return w.writeCompressedBlock(b, blockBuf.tmp[:]) 1507 } 1508 1509 // assertFormatCompatibility ensures that the features present on the table are 1510 // compatible with the table format version. 1511 func (w *Writer) assertFormatCompatibility() error { 1512 // PebbleDBv1: block properties. 1513 if len(w.blockPropCollectors) > 0 && w.tableFormat < TableFormatPebblev1 { 1514 return errors.Newf( 1515 "table format version %s is less than the minimum required version %s for block properties", 1516 w.tableFormat, TableFormatPebblev1, 1517 ) 1518 } 1519 1520 // PebbleDBv2: range keys. 1521 if w.props.NumRangeKeys() > 0 && w.tableFormat < TableFormatPebblev2 { 1522 return errors.Newf( 1523 "table format version %s is less than the minimum required version %s for range keys", 1524 w.tableFormat, TableFormatPebblev2, 1525 ) 1526 } 1527 1528 return nil 1529 } 1530 1531 // Close finishes writing the table and closes the underlying file that the 1532 // table was written to. 1533 func (w *Writer) Close() (err error) { 1534 defer func() { 1535 if w.syncer == nil { 1536 return 1537 } 1538 err1 := w.syncer.Close() 1539 if err == nil { 1540 err = err1 1541 } 1542 w.syncer = nil 1543 }() 1544 1545 // finish must be called before we check for an error, because finish will 1546 // block until every single task added to the writeQueue has been processed, 1547 // and an error could be encountered while any of those tasks are processed. 1548 if err = w.coordination.writeQueue.finish(); err != nil { 1549 w.err = err 1550 } 1551 1552 if w.err != nil { 1553 return w.err 1554 } 1555 1556 // The w.meta.LargestPointKey is only used once the Writer is closed, so it is safe to set it 1557 // when the Writer is closed. 1558 // 1559 // The following invariants ensure that setting the largest key at this point of a Writer close 1560 // is correct: 1561 // 1. Keys must only be added to the Writer in an increasing order. 1562 // 2. The current w.dataBlockBuf is guaranteed to have the latest key added to the Writer. This 1563 // must be true, because a w.dataBlockBuf is only switched out when a dataBlock is flushed, 1564 // however, if a dataBlock is flushed, then we add a key to the new w.dataBlockBuf in the 1565 // addPoint function after the flush occurs. 1566 if w.dataBlockBuf.dataBlock.nEntries >= 1 { 1567 w.meta.SetLargestPointKey(base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey).Clone()) 1568 } 1569 1570 // Finish the last data block, or force an empty data block if there 1571 // aren't any data blocks at all. 1572 if w.dataBlockBuf.dataBlock.nEntries > 0 || w.indexBlock.block.nEntries == 0 { 1573 bh, err := w.writeBlock(w.dataBlockBuf.dataBlock.finish(), w.compression, &w.dataBlockBuf.blockBuf) 1574 if err != nil { 1575 w.err = err 1576 return w.err 1577 } 1578 var bhp BlockHandleWithProperties 1579 if bhp, err = w.maybeAddBlockPropertiesToBlockHandle(bh); err != nil { 1580 w.err = err 1581 return err 1582 } 1583 prevKey := base.DecodeInternalKey(w.dataBlockBuf.dataBlock.curKey) 1584 if err = w.addIndexEntrySync(prevKey, InternalKey{}, bhp, w.dataBlockBuf.tmp[:]); err != nil { 1585 w.err = err 1586 return err 1587 } 1588 } 1589 w.props.DataSize = w.meta.Size 1590 1591 // Write the filter block. 1592 var metaindex rawBlockWriter 1593 metaindex.restartInterval = 1 1594 if w.filter != nil { 1595 b, err := w.filter.finish() 1596 if err != nil { 1597 w.err = err 1598 return w.err 1599 } 1600 bh, err := w.writeBlock(b, NoCompression, &w.blockBuf) 1601 if err != nil { 1602 w.err = err 1603 return w.err 1604 } 1605 n := encodeBlockHandle(w.blockBuf.tmp[:], bh) 1606 metaindex.add(InternalKey{UserKey: []byte(w.filter.metaName())}, w.blockBuf.tmp[:n]) 1607 w.props.FilterPolicyName = w.filter.policyName() 1608 w.props.FilterSize = bh.Length 1609 } 1610 1611 var indexBH BlockHandle 1612 if w.twoLevelIndex { 1613 w.props.IndexType = twoLevelIndex 1614 // Write the two level index block. 1615 indexBH, err = w.writeTwoLevelIndex() 1616 if err != nil { 1617 w.err = err 1618 return w.err 1619 } 1620 } else { 1621 w.props.IndexType = binarySearchIndex 1622 // NB: RocksDB includes the block trailer length in the index size 1623 // property, though it doesn't include the trailer in the filter size 1624 // property. 1625 w.props.IndexSize = uint64(w.indexBlock.estimatedSize()) + blockTrailerLen 1626 w.props.NumDataBlocks = uint64(w.indexBlock.block.nEntries) 1627 1628 // Write the single level index block. 1629 indexBH, err = w.writeBlock(w.indexBlock.finish(), w.compression, &w.blockBuf) 1630 if err != nil { 1631 w.err = err 1632 return w.err 1633 } 1634 } 1635 1636 // Write the range-del block. The block handle must added to the meta index block 1637 // after the properties block has been written. This is because the entries in the 1638 // metaindex block must be sorted by key. 1639 var rangeDelBH BlockHandle 1640 if w.props.NumRangeDeletions > 0 { 1641 if !w.rangeDelV1Format { 1642 // Because the range tombstones are fragmented in the v2 format, the end 1643 // key of the last added range tombstone will be the largest range 1644 // tombstone key. Note that we need to make this into a range deletion 1645 // sentinel because sstable boundaries are inclusive while the end key of 1646 // a range deletion tombstone is exclusive. A Clone() is necessary as 1647 // rangeDelBlock.curValue is the same slice that will get passed 1648 // into w.writer, and some implementations of vfs.File mutate the 1649 // slice passed into Write(). Also, w.meta will often outlive the 1650 // blockWriter, and so cloning curValue allows the rangeDelBlock's 1651 // internal buffer to get gc'd. 1652 k := base.MakeRangeDeleteSentinelKey(w.rangeDelBlock.curValue).Clone() 1653 w.meta.SetLargestRangeDelKey(k) 1654 } 1655 rangeDelBH, err = w.writeBlock(w.rangeDelBlock.finish(), NoCompression, &w.blockBuf) 1656 if err != nil { 1657 w.err = err 1658 return w.err 1659 } 1660 } 1661 1662 // Write the range-key block, flushing any remaining spans from the 1663 // fragmenter first. 1664 w.fragmenter.Finish() 1665 1666 var rangeKeyBH BlockHandle 1667 if w.props.NumRangeKeys() > 0 { 1668 key := base.DecodeInternalKey(w.rangeKeyBlock.curKey) 1669 kind := key.Kind() 1670 endKey, _, ok := rangekey.DecodeEndKey(kind, w.rangeKeyBlock.curValue) 1671 if !ok { 1672 w.err = errors.Newf("invalid end key: %s", w.rangeKeyBlock.curValue) 1673 return w.err 1674 } 1675 k := base.MakeExclusiveSentinelKey(kind, endKey).Clone() 1676 w.meta.SetLargestRangeKey(k) 1677 // TODO(travers): The lack of compression on the range key block matches the 1678 // lack of compression on the range-del block. Revisit whether we want to 1679 // enable compression on this block. 1680 rangeKeyBH, err = w.writeBlock(w.rangeKeyBlock.finish(), NoCompression, &w.blockBuf) 1681 if err != nil { 1682 w.err = err 1683 return w.err 1684 } 1685 } 1686 1687 // Add the range key block handle to the metaindex block. Note that we add the 1688 // block handle to the metaindex block before the other meta blocks as the 1689 // metaindex block entries must be sorted, and the range key block name sorts 1690 // before the other block names. 1691 if w.props.NumRangeKeys() > 0 { 1692 n := encodeBlockHandle(w.blockBuf.tmp[:], rangeKeyBH) 1693 metaindex.add(InternalKey{UserKey: []byte(metaRangeKeyName)}, w.blockBuf.tmp[:n]) 1694 } 1695 1696 { 1697 userProps := make(map[string]string) 1698 for i := range w.propCollectors { 1699 if err := w.propCollectors[i].Finish(userProps); err != nil { 1700 w.err = err 1701 return err 1702 } 1703 } 1704 for i := range w.blockPropCollectors { 1705 scratch := w.blockPropsEncoder.getScratchForProp() 1706 // Place the shortID in the first byte. 1707 scratch = append(scratch, byte(i)) 1708 buf, err := 1709 w.blockPropCollectors[i].FinishTable(scratch) 1710 if err != nil { 1711 w.err = err 1712 return err 1713 } 1714 var prop string 1715 if len(buf) > 0 { 1716 prop = string(buf) 1717 } 1718 // NB: The property is populated in the map even if it is the 1719 // empty string, since the presence in the map is what indicates 1720 // that the block property collector was used when writing. 1721 userProps[w.blockPropCollectors[i].Name()] = prop 1722 } 1723 if len(userProps) > 0 { 1724 w.props.UserProperties = userProps 1725 } 1726 1727 // Write the properties block. 1728 var raw rawBlockWriter 1729 // The restart interval is set to infinity because the properties block 1730 // is always read sequentially and cached in a heap located object. This 1731 // reduces table size without a significant impact on performance. 1732 raw.restartInterval = propertiesBlockRestartInterval 1733 w.props.CompressionOptions = rocksDBCompressionOptions 1734 w.props.save(&raw) 1735 bh, err := w.writeBlock(raw.finish(), NoCompression, &w.blockBuf) 1736 if err != nil { 1737 w.err = err 1738 return w.err 1739 } 1740 n := encodeBlockHandle(w.blockBuf.tmp[:], bh) 1741 metaindex.add(InternalKey{UserKey: []byte(metaPropertiesName)}, w.blockBuf.tmp[:n]) 1742 } 1743 1744 // Add the range deletion block handle to the metaindex block. 1745 if w.props.NumRangeDeletions > 0 { 1746 n := encodeBlockHandle(w.blockBuf.tmp[:], rangeDelBH) 1747 // The v2 range-del block encoding is backwards compatible with the v1 1748 // encoding. We add meta-index entries for both the old name and the new 1749 // name so that old code can continue to find the range-del block and new 1750 // code knows that the range tombstones in the block are fragmented and 1751 // sorted. 1752 metaindex.add(InternalKey{UserKey: []byte(metaRangeDelName)}, w.blockBuf.tmp[:n]) 1753 if !w.rangeDelV1Format { 1754 metaindex.add(InternalKey{UserKey: []byte(metaRangeDelV2Name)}, w.blockBuf.tmp[:n]) 1755 } 1756 } 1757 1758 // Write the metaindex block. It might be an empty block, if the filter 1759 // policy is nil. NoCompression is specified because a) RocksDB never 1760 // compresses the meta-index block and b) RocksDB has some code paths which 1761 // expect the meta-index block to not be compressed. 1762 metaindexBH, err := w.writeBlock(metaindex.blockWriter.finish(), NoCompression, &w.blockBuf) 1763 if err != nil { 1764 w.err = err 1765 return w.err 1766 } 1767 1768 // Write the table footer. 1769 footer := footer{ 1770 format: w.tableFormat, 1771 checksum: w.blockBuf.checksummer.checksumType, 1772 metaindexBH: metaindexBH, 1773 indexBH: indexBH, 1774 } 1775 var n int 1776 if n, err = w.writer.Write(footer.encode(w.blockBuf.tmp[:])); err != nil { 1777 w.err = err 1778 return w.err 1779 } 1780 w.meta.Size += uint64(n) 1781 w.meta.Properties = w.props 1782 1783 // Flush the buffer. 1784 if w.bufWriter != nil { 1785 if err := w.bufWriter.Flush(); err != nil { 1786 w.err = err 1787 return err 1788 } 1789 } 1790 1791 // Check that the features present in the table are compatible with the format 1792 // configured for the table. 1793 if err = w.assertFormatCompatibility(); err != nil { 1794 w.err = err 1795 return w.err 1796 } 1797 1798 if err := w.syncer.Sync(); err != nil { 1799 w.err = err 1800 return err 1801 } 1802 1803 w.dataBlockBuf.clear() 1804 dataBlockBufPool.Put(w.dataBlockBuf) 1805 w.dataBlockBuf = nil 1806 w.indexBlock.clear() 1807 indexBlockBufPool.Put(w.indexBlock) 1808 w.indexBlock = nil 1809 1810 // Make any future calls to Set or Close return an error. 1811 if w.err != nil { 1812 return w.err 1813 } 1814 w.err = errWriterClosed 1815 return nil 1816 } 1817 1818 // EstimatedSize returns the estimated size of the sstable being written if a 1819 // call to Finish() was made without adding additional keys. 1820 func (w *Writer) EstimatedSize() uint64 { 1821 if invariants.Enabled && !w.coordination.parallelismEnabled { 1822 // The w.meta.Size should only be accessed from the writeQueue goroutine 1823 // if parallelism is enabled, but since it isn't we break that invariant 1824 // here. 1825 if w.coordination.sizeEstimate.size() != w.meta.Size { 1826 panic("sstable size estimation sans parallelism is incorrect") 1827 } 1828 } 1829 return w.coordination.sizeEstimate.size() + 1830 uint64(w.dataBlockBuf.dataBlock.estimatedSize()) + 1831 w.indexBlock.estimatedSize() 1832 } 1833 1834 // Metadata returns the metadata for the finished sstable. Only valid to call 1835 // after the sstable has been finished. 1836 func (w *Writer) Metadata() (*WriterMetadata, error) { 1837 if w.syncer != nil { 1838 return nil, errors.New("bitalostable: writer is not closed") 1839 } 1840 return &w.meta, nil 1841 } 1842 1843 // WriterOption provide an interface to do work on Writer while it is being 1844 // opened. 1845 type WriterOption interface { 1846 // writerApply is called on the writer during opening in order to set 1847 // internal parameters. 1848 writerApply(*Writer) 1849 } 1850 1851 // PreviousPointKeyOpt is a WriterOption that provides access to the last 1852 // point key written to the writer while building a sstable. 1853 type PreviousPointKeyOpt struct { 1854 w *Writer 1855 } 1856 1857 // UnsafeKey returns the last point key written to the writer to which this 1858 // option was passed during creation. The returned key points directly into 1859 // a buffer belonging to the Writer. The value's lifetime ends the next time a 1860 // point key is added to the Writer. 1861 // Invariant: UnsafeKey isn't and shouldn't be called after the Writer is closed. 1862 func (o PreviousPointKeyOpt) UnsafeKey() base.InternalKey { 1863 if o.w == nil { 1864 return base.InvalidInternalKey 1865 } 1866 1867 if o.w.dataBlockBuf.dataBlock.nEntries >= 1 { 1868 // o.w.dataBlockBuf.dataBlock.curKey is guaranteed to point to the last point key 1869 // which was added to the Writer. 1870 return base.DecodeInternalKey(o.w.dataBlockBuf.dataBlock.curKey) 1871 } 1872 return base.InternalKey{} 1873 } 1874 1875 func (o *PreviousPointKeyOpt) writerApply(w *Writer) { 1876 o.w = w 1877 } 1878 1879 // internalTableOpt is a WriterOption that sets properties for sstables being 1880 // created by the db itself (i.e. through flushes and compactions), as opposed 1881 // to those meant for ingestion. 1882 type internalTableOpt struct{} 1883 1884 func (i internalTableOpt) writerApply(w *Writer) { 1885 // Set the external sst version to 0. This is what RocksDB expects for 1886 // db-internal sstables; otherwise, it could apply a global sequence number. 1887 w.props.ExternalFormatVersion = 0 1888 } 1889 1890 // NewWriter returns a new table writer for the file. Closing the writer will 1891 // close the file. 1892 func NewWriter(f writeCloseSyncer, o WriterOptions, extraOpts ...WriterOption) *Writer { 1893 o = o.ensureDefaults() 1894 w := &Writer{ 1895 syncer: f, 1896 meta: WriterMetadata{ 1897 SmallestSeqNum: math.MaxUint64, 1898 }, 1899 blockSize: o.BlockSize, 1900 blockSizeThreshold: (o.BlockSize*o.BlockSizeThreshold + 99) / 100, 1901 indexBlockSize: o.IndexBlockSize, 1902 indexBlockSizeThreshold: (o.IndexBlockSize*o.BlockSizeThreshold + 99) / 100, 1903 compare: o.Comparer.Compare, 1904 split: o.Comparer.Split, 1905 formatKey: o.Comparer.FormatKey, 1906 compression: o.Compression, 1907 separator: o.Comparer.Separator, 1908 successor: o.Comparer.Successor, 1909 tableFormat: o.TableFormat, 1910 cache: o.Cache, 1911 restartInterval: o.BlockRestartInterval, 1912 checksumType: o.Checksum, 1913 indexBlock: newIndexBlockBuf(o.Parallelism), 1914 rangeDelBlock: blockWriter{ 1915 restartInterval: 1, 1916 }, 1917 rangeKeyBlock: blockWriter{ 1918 restartInterval: 1, 1919 }, 1920 topLevelIndexBlock: blockWriter{ 1921 restartInterval: 1, 1922 }, 1923 fragmenter: keyspan.Fragmenter{ 1924 Cmp: o.Comparer.Compare, 1925 Format: o.Comparer.FormatKey, 1926 }, 1927 } 1928 1929 w.dataBlockBuf = newDataBlockBuf(w.restartInterval, w.checksumType) 1930 1931 w.blockBuf = blockBuf{ 1932 checksummer: checksummer{checksumType: o.Checksum}, 1933 } 1934 1935 w.coordination.init(o.Parallelism, w) 1936 1937 if f == nil { 1938 w.err = errors.New("bitalostable: nil file") 1939 return w 1940 } 1941 1942 // Note that WriterOptions are applied in two places; the ones with a 1943 // preApply() method are applied here, and the rest are applied after 1944 // default properties are set. 1945 type preApply interface{ preApply() } 1946 for _, opt := range extraOpts { 1947 if _, ok := opt.(preApply); ok { 1948 opt.writerApply(w) 1949 } 1950 } 1951 1952 w.props.PrefixExtractorName = "nullptr" 1953 if o.FilterPolicy != nil { 1954 switch o.FilterType { 1955 case TableFilter: 1956 w.filter = newTableFilterWriter(o.FilterPolicy) 1957 if w.split != nil { 1958 w.props.PrefixExtractorName = o.Comparer.Name 1959 w.props.PrefixFiltering = true 1960 } else { 1961 w.props.WholeKeyFiltering = true 1962 } 1963 default: 1964 panic(fmt.Sprintf("unknown filter type: %v", o.FilterType)) 1965 } 1966 } 1967 1968 w.props.ColumnFamilyID = math.MaxInt32 1969 w.props.ComparerName = o.Comparer.Name 1970 w.props.CompressionName = o.Compression.String() 1971 w.props.MergerName = o.MergerName 1972 w.props.PropertyCollectorNames = "[]" 1973 w.props.ExternalFormatVersion = rocksDBExternalFormatVersion 1974 1975 if len(o.TablePropertyCollectors) > 0 || len(o.BlockPropertyCollectors) > 0 { 1976 var buf bytes.Buffer 1977 buf.WriteString("[") 1978 if len(o.TablePropertyCollectors) > 0 { 1979 w.propCollectors = make([]TablePropertyCollector, len(o.TablePropertyCollectors)) 1980 for i := range o.TablePropertyCollectors { 1981 w.propCollectors[i] = o.TablePropertyCollectors[i]() 1982 if i > 0 { 1983 buf.WriteString(",") 1984 } 1985 buf.WriteString(w.propCollectors[i].Name()) 1986 } 1987 } 1988 if len(o.BlockPropertyCollectors) > 0 { 1989 // shortID is a uint8, so we cannot exceed that number of block 1990 // property collectors. 1991 if len(o.BlockPropertyCollectors) > math.MaxUint8 { 1992 w.err = errors.New("bitalostable: too many block property collectors") 1993 return w 1994 } 1995 // The shortID assigned to a collector is the same as its index in 1996 // this slice. 1997 w.blockPropCollectors = make([]BlockPropertyCollector, len(o.BlockPropertyCollectors)) 1998 for i := range o.BlockPropertyCollectors { 1999 w.blockPropCollectors[i] = o.BlockPropertyCollectors[i]() 2000 if i > 0 || len(o.TablePropertyCollectors) > 0 { 2001 buf.WriteString(",") 2002 } 2003 buf.WriteString(w.blockPropCollectors[i].Name()) 2004 } 2005 } 2006 buf.WriteString("]") 2007 w.props.PropertyCollectorNames = buf.String() 2008 } 2009 2010 // Apply the remaining WriterOptions that do not have a preApply() method. 2011 for _, opt := range extraOpts { 2012 if _, ok := opt.(preApply); !ok { 2013 opt.writerApply(w) 2014 } 2015 } 2016 2017 // Initialize the range key fragmenter and encoder. 2018 w.fragmenter.Emit = w.coalesceSpans 2019 w.rangeKeyEncoder.Emit = w.addRangeKey 2020 2021 // If f does not have a Flush method, do our own buffering. 2022 if _, ok := f.(flusher); ok { 2023 w.writer = f 2024 } else { 2025 w.bufWriter = bufio.NewWriter(f) 2026 w.writer = w.bufWriter 2027 } 2028 return w 2029 } 2030 2031 func init() { 2032 private.SSTableWriterDisableKeyOrderChecks = func(i interface{}) { 2033 w := i.(*Writer) 2034 w.disableKeyOrderChecks = true 2035 } 2036 private.SSTableInternalTableOpt = internalTableOpt{} 2037 }