github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/writer.go 1 // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package sstable 6 7 import ( 8 "bytes" 9 "encoding/binary" 10 "fmt" 11 "math" 12 "runtime" 13 "sort" 14 "sync" 15 16 "github.com/cespare/xxhash/v2" 17 "github.com/cockroachdb/errors" 18 "github.com/cockroachdb/pebble/internal/base" 19 "github.com/cockroachdb/pebble/internal/bytealloc" 20 "github.com/cockroachdb/pebble/internal/cache" 21 "github.com/cockroachdb/pebble/internal/crc" 22 "github.com/cockroachdb/pebble/internal/invariants" 23 "github.com/cockroachdb/pebble/internal/keyspan" 24 "github.com/cockroachdb/pebble/internal/private" 25 "github.com/cockroachdb/pebble/internal/rangekey" 26 "github.com/cockroachdb/pebble/objstorage" 27 ) 28 29 // encodedBHPEstimatedSize estimates the size of the encoded BlockHandleWithProperties. 30 // It would also be nice to account for the length of the data block properties here, 31 // but it isn't necessary since this is an estimate. 32 const encodedBHPEstimatedSize = binary.MaxVarintLen64 * 2 33 34 var errWriterClosed = errors.New("pebble: writer is closed") 35 36 // WriterMetadata holds info about a finished sstable. 37 type WriterMetadata struct { 38 Size uint64 39 SmallestPoint InternalKey 40 // LargestPoint, LargestRangeKey, LargestRangeDel should not be accessed 41 // before Writer.Close is called, because they may only be set on 42 // Writer.Close. 43 LargestPoint InternalKey 44 SmallestRangeDel InternalKey 45 LargestRangeDel InternalKey 46 SmallestRangeKey InternalKey 47 LargestRangeKey InternalKey 48 HasPointKeys bool 49 HasRangeDelKeys bool 50 HasRangeKeys bool 51 SmallestSeqNum uint64 52 LargestSeqNum uint64 53 Properties Properties 54 } 55 56 // SetSmallestPointKey sets the smallest point key to the given key. 57 // NB: this method sets the "absolute" smallest point key. Any existing key is 58 // overridden. 59 func (m *WriterMetadata) SetSmallestPointKey(k InternalKey) { 60 m.SmallestPoint = k 61 m.HasPointKeys = true 62 } 63 64 // SetSmallestRangeDelKey sets the smallest rangedel key to the given key. 65 // NB: this method sets the "absolute" smallest rangedel key. Any existing key is 66 // overridden. 67 func (m *WriterMetadata) SetSmallestRangeDelKey(k InternalKey) { 68 m.SmallestRangeDel = k 69 m.HasRangeDelKeys = true 70 } 71 72 // SetSmallestRangeKey sets the smallest range key to the given key. 73 // NB: this method sets the "absolute" smallest range key. Any existing key is 74 // overridden. 75 func (m *WriterMetadata) SetSmallestRangeKey(k InternalKey) { 76 m.SmallestRangeKey = k 77 m.HasRangeKeys = true 78 } 79 80 // SetLargestPointKey sets the largest point key to the given key. 81 // NB: this method sets the "absolute" largest point key. Any existing key is 82 // overridden. 83 func (m *WriterMetadata) SetLargestPointKey(k InternalKey) { 84 m.LargestPoint = k 85 m.HasPointKeys = true 86 } 87 88 // SetLargestRangeDelKey sets the largest rangedel key to the given key. 89 // NB: this method sets the "absolute" largest rangedel key. Any existing key is 90 // overridden. 91 func (m *WriterMetadata) SetLargestRangeDelKey(k InternalKey) { 92 m.LargestRangeDel = k 93 m.HasRangeDelKeys = true 94 } 95 96 // SetLargestRangeKey sets the largest range key to the given key. 97 // NB: this method sets the "absolute" largest range key. Any existing key is 98 // overridden.
99 func (m *WriterMetadata) SetLargestRangeKey(k InternalKey) { 100 m.LargestRangeKey = k 101 m.HasRangeKeys = true 102 } 103 104 func (m *WriterMetadata) updateSeqNum(seqNum uint64) { 105 if m.SmallestSeqNum > seqNum { 106 m.SmallestSeqNum = seqNum 107 } 108 if m.LargestSeqNum < seqNum { 109 m.LargestSeqNum = seqNum 110 } 111 } 112 113 // Writer is a table writer. 114 type Writer struct { 115 writable objstorage.Writable 116 meta WriterMetadata 117 err error 118 // cacheID and fileNum are used to remove blocks written to the sstable from 119 // the cache, providing a defense in depth against bugs which cause cache 120 // collisions. 121 cacheID uint64 122 fileNum base.DiskFileNum 123 // The following fields are copied from Options. 124 blockSize int 125 blockSizeThreshold int 126 indexBlockSize int 127 indexBlockSizeThreshold int 128 compare Compare 129 split Split 130 formatKey base.FormatKey 131 compression Compression 132 separator Separator 133 successor Successor 134 tableFormat TableFormat 135 isStrictObsolete bool 136 writingToLowestLevel bool 137 cache *cache.Cache 138 restartInterval int 139 checksumType ChecksumType 140 // disableKeyOrderChecks disables the checks that keys are added to an 141 // sstable in order. It is intended for internal use only in the construction 142 // of invalid sstables for testing. See tool/make_test_sstables.go. 143 disableKeyOrderChecks bool 144 // With two level indexes, the index/filter of a SST file is partitioned into 145 // smaller blocks with an additional top-level index on them. When reading an 146 // index/filter, only the top-level index is loaded into memory. The two level 147 // index/filter then uses the top-level index to load on demand into the block 148 // cache the partitions that are required to perform the index/filter query. 149 // 150 // Two level indexes are enabled automatically when there is more than one 151 // index block. 152 // 153 // This is useful when there are very large index blocks, which generally occurs 154 // with the usage of large keys. With large index blocks, the index blocks fight 155 // the data blocks for block cache space and the index blocks are likely to be 156 // re-read many times from the disk. The top level index, which has a much 157 // smaller memory footprint, can be used to prevent the entire index block from 158 // being loaded into the block cache. 159 twoLevelIndex bool 160 // Internal flag to allow creation of range-del-v1 format blocks. Only used 161 // for testing. Note that v2 format blocks are backwards compatible with v1 162 // format blocks. 163 rangeDelV1Format bool 164 indexBlock *indexBlockBuf 165 rangeDelBlock blockWriter 166 rangeKeyBlock blockWriter 167 topLevelIndexBlock blockWriter 168 props Properties 169 propCollectors []TablePropertyCollector 170 blockPropCollectors []BlockPropertyCollector 171 obsoleteCollector obsoleteKeyBlockPropertyCollector 172 blockPropsEncoder blockPropertiesEncoder 173 // filter accumulates the filter block. If populated, the filter ingests 174 // either the output of w.split (i.e. a prefix extractor) if w.split is not 175 // nil, or the full keys otherwise. 176 filter filterWriter 177 indexPartitions []indexBlockAndBlockProperties 178 179 // indexBlockAlloc is used to bulk-allocate byte slices used to store index 180 // blocks in indexPartitions. These live until the index finishes. 181 indexBlockAlloc []byte 182 // indexSepAlloc is used to bulk-allocate index block separator slices stored 183 // in indexPartitions. 
These live until the index finishes. 184 indexSepAlloc bytealloc.A 185 186 // To allow potentially overlapping (i.e. un-fragmented) range keys spans to 187 // be added to the Writer, a keyspan.Fragmenter is used to retain the keys 188 // and values, emitting fragmented, coalesced spans as appropriate. Range 189 // keys must be added in order of their start user-key. 190 fragmenter keyspan.Fragmenter 191 rangeKeyEncoder rangekey.Encoder 192 rangeKeysBySuffix keyspan.KeysBySuffix 193 rangeKeySpan keyspan.Span 194 rkBuf []byte 195 // dataBlockBuf consists of the state which is currently owned by and used by 196 // the Writer client goroutine. This state can be handed off to other goroutines. 197 dataBlockBuf *dataBlockBuf 198 // blockBuf consists of the state which is owned by and used by the Writer client 199 // goroutine. 200 blockBuf blockBuf 201 202 coordination coordinationState 203 204 // Information (other than the byte slice) about the last point key, to 205 // avoid extracting it again. 206 lastPointKeyInfo pointKeyInfo 207 208 // For value blocks. 209 shortAttributeExtractor base.ShortAttributeExtractor 210 requiredInPlaceValueBound UserKeyPrefixBound 211 valueBlockWriter *valueBlockWriter 212 } 213 214 type pointKeyInfo struct { 215 trailer uint64 216 // Only computed when w.valueBlockWriter is not nil. 217 userKeyLen int 218 // prefixLen uses w.split, if not nil. Only computed when w.valueBlockWriter 219 // is not nil. 220 prefixLen int 221 // True iff the point was marked obsolete. 222 isObsolete bool 223 } 224 225 type coordinationState struct { 226 parallelismEnabled bool 227 228 // writeQueue is used to write data blocks to disk. The writeQueue is primarily 229 // used to maintain the order in which data blocks must be written to disk. For 230 // this reason, every single data block write must be done through the writeQueue. 231 writeQueue *writeQueue 232 233 sizeEstimate dataBlockEstimates 234 } 235 236 func (c *coordinationState) init(parallelismEnabled bool, writer *Writer) { 237 c.parallelismEnabled = parallelismEnabled 238 // useMutex is false regardless of parallelismEnabled, because we do not do 239 // parallel compression yet. 240 c.sizeEstimate.useMutex = false 241 242 // writeQueueSize determines the size of the write queue, or the number 243 // of items which can be added to the queue without blocking. By default, we 244 // use a writeQueue size of 0, since we won't be doing any block writes in 245 // parallel. 246 writeQueueSize := 0 247 if parallelismEnabled { 248 writeQueueSize = runtime.GOMAXPROCS(0) 249 } 250 c.writeQueue = newWriteQueue(writeQueueSize, writer) 251 } 252 253 // sizeEstimate is a general purpose helper for estimating two kinds of sizes: 254 // A. The compressed sstable size, which is useful for deciding when to start 255 // 256 // a new sstable during flushes or compactions. In practice, we use this in 257 // estimating the data size (excluding the index). 258 // 259 // B. The size of index blocks to decide when to start a new index block. 260 // 261 // There are some terminology peculiarities which are due to the origin of 262 // sizeEstimate for use case A with parallel compression enabled (for which 263 // the code has not been merged). Specifically this relates to the terms 264 // "written" and "compressed". 265 // - The notion of "written" for case A is sufficiently defined by saying that 266 // the data block is compressed. 
Waiting for the actual data block write to 267 // happen can result in unnecessary estimation, when we already know how big 268 // it will be in compressed form. Additionally, with the forthcoming value 269 // blocks containing older MVCC values, these compressed blocks will be held 270 // in-memory until late in the sstable writing, and we do want to accurately 271 // account for them without waiting for the actual write. 272 // For case B, "written" means that the index entry has been fully 273 // generated, and has been added to the uncompressed block buffer for that 274 // index block. It does not include actually writing a potentially 275 // compressed index block. 276 // - The notion of "compressed" is to differentiate between an "inflight" size 277 // and the actual size, and is handled via computing a compression ratio 278 // observed so far (defaults to 1). 279 // For case A, this is actual data block compression, so the "inflight" size 280 // is uncompressed blocks (that are no longer being written to) and the 281 // "compressed" size is after they have been compressed. 282 // For case B the inflight size is for a key-value pair in the index for 283 // which the value size (the encoded size of the BlockHandleWithProperties) 284 // is not accurately known, while the compressed size is the size of that 285 // entry when it has been added to the (in-progress) index block. 286 // 287 // Usage: To update state, one can optionally provide an inflight write value 288 // using addInflight (used for case B). When something is "written" the state 289 // can be updated using either writtenWithDelta or writtenWithTotal, which 290 // provide the actual delta size or the total size (the latter must be 291 // monotonically non-decreasing). If there are no calls to addInflight, there 292 // isn't any real estimation happening here. So case A does not do any real 293 // estimation. However, when we introduce parallel compression, there will be 294 // estimation in that the client goroutine will call addInflight and the 295 // compression goroutines will call writtenWithDelta. 296 type sizeEstimate struct { 297 // emptySize is the size when there is no inflight data, and numEntries is 0. 298 // emptySize is constant once set. 299 emptySize uint64 300 301 // inflightSize is the estimated size of some inflight data which hasn't 302 // been written yet. 303 inflightSize uint64 304 305 // totalSize is the total size of the data which has already been written. 306 totalSize uint64 307 308 // numWrittenEntries is the total number of entries which have already been 309 // written. 310 numWrittenEntries uint64 311 // numInflightEntries is the total number of entries which are inflight, and 312 // haven't been written. 313 numInflightEntries uint64 314 315 // maxEstimatedSize stores the maximum result returned from sizeEstimate.size. 316 // It ensures that values returned from subsequent calls to Writer.EstimatedSize 317 // never decrease. 318 maxEstimatedSize uint64 319 320 // We assume that the entries added to the sizeEstimate can be compressed. 321 // For this reason, we keep track of a compressedSize and an uncompressedSize 322 // to compute a compression ratio for the inflight entries. If the entries 323 // aren't being compressed, then compressedSize and uncompressedSize must be 324 // equal.
325 compressedSize uint64 326 uncompressedSize uint64 327 } 328 329 func (s *sizeEstimate) init(emptySize uint64) { 330 s.emptySize = emptySize 331 } 332 333 func (s *sizeEstimate) size() uint64 { 334 ratio := float64(1) 335 if s.uncompressedSize > 0 { 336 ratio = float64(s.compressedSize) / float64(s.uncompressedSize) 337 } 338 estimatedInflightSize := uint64(float64(s.inflightSize) * ratio) 339 total := s.totalSize + estimatedInflightSize 340 if total > s.maxEstimatedSize { 341 s.maxEstimatedSize = total 342 } else { 343 total = s.maxEstimatedSize 344 } 345 346 if total == 0 { 347 return s.emptySize 348 } 349 350 return total 351 } 352 353 func (s *sizeEstimate) numTotalEntries() uint64 { 354 return s.numWrittenEntries + s.numInflightEntries 355 } 356 357 func (s *sizeEstimate) addInflight(size int) { 358 s.numInflightEntries++ 359 s.inflightSize += uint64(size) 360 } 361 362 func (s *sizeEstimate) writtenWithTotal(newTotalSize uint64, inflightSize int) { 363 finalEntrySize := int(newTotalSize - s.totalSize) 364 s.writtenWithDelta(finalEntrySize, inflightSize) 365 } 366 367 func (s *sizeEstimate) writtenWithDelta(finalEntrySize int, inflightSize int) { 368 if inflightSize > 0 { 369 // This entry was previously inflight, so we should decrement inflight 370 // entries and update the "compression" stats for future estimation. 371 s.numInflightEntries-- 372 s.inflightSize -= uint64(inflightSize) 373 s.uncompressedSize += uint64(inflightSize) 374 s.compressedSize += uint64(finalEntrySize) 375 } 376 s.numWrittenEntries++ 377 s.totalSize += uint64(finalEntrySize) 378 } 379 380 func (s *sizeEstimate) clear() { 381 *s = sizeEstimate{emptySize: s.emptySize} 382 } 383 384 type indexBlockBuf struct { 385 // block will only be accessed from the writeQueue. 386 block blockWriter 387 388 size struct { 389 useMutex bool 390 mu sync.Mutex 391 estimate sizeEstimate 392 } 393 394 // restartInterval matches indexBlockBuf.block.restartInterval. We store it twice, because the `block` 395 // must only be accessed from the writeQueue goroutine. 
396 restartInterval int 397 } 398 399 func (i *indexBlockBuf) clear() { 400 i.block.clear() 401 if i.size.useMutex { 402 i.size.mu.Lock() 403 defer i.size.mu.Unlock() 404 } 405 i.size.estimate.clear() 406 i.restartInterval = 0 407 } 408 409 var indexBlockBufPool = sync.Pool{ 410 New: func() interface{} { 411 return &indexBlockBuf{} 412 }, 413 } 414 415 const indexBlockRestartInterval = 1 416 417 func newIndexBlockBuf(useMutex bool) *indexBlockBuf { 418 i := indexBlockBufPool.Get().(*indexBlockBuf) 419 i.size.useMutex = useMutex 420 i.restartInterval = indexBlockRestartInterval 421 i.block.restartInterval = indexBlockRestartInterval 422 i.size.estimate.init(emptyBlockSize) 423 return i 424 } 425 426 func (i *indexBlockBuf) shouldFlush( 427 sep InternalKey, valueLen, targetBlockSize, sizeThreshold int, 428 ) bool { 429 if i.size.useMutex { 430 i.size.mu.Lock() 431 defer i.size.mu.Unlock() 432 } 433 434 nEntries := i.size.estimate.numTotalEntries() 435 return shouldFlush( 436 sep, valueLen, i.restartInterval, int(i.size.estimate.size()), 437 int(nEntries), targetBlockSize, sizeThreshold) 438 } 439 440 func (i *indexBlockBuf) add(key InternalKey, value []byte, inflightSize int) { 441 i.block.add(key, value) 442 size := i.block.estimatedSize() 443 if i.size.useMutex { 444 i.size.mu.Lock() 445 defer i.size.mu.Unlock() 446 } 447 i.size.estimate.writtenWithTotal(uint64(size), inflightSize) 448 } 449 450 func (i *indexBlockBuf) finish() []byte { 451 b := i.block.finish() 452 return b 453 } 454 455 func (i *indexBlockBuf) addInflight(inflightSize int) { 456 if i.size.useMutex { 457 i.size.mu.Lock() 458 defer i.size.mu.Unlock() 459 } 460 i.size.estimate.addInflight(inflightSize) 461 } 462 463 func (i *indexBlockBuf) estimatedSize() uint64 { 464 if i.size.useMutex { 465 i.size.mu.Lock() 466 defer i.size.mu.Unlock() 467 } 468 469 // Make sure that the size estimation works as expected when parallelism 470 // is disabled. 471 if invariants.Enabled && !i.size.useMutex { 472 if i.size.estimate.inflightSize != 0 { 473 panic("unexpected inflight entry in index block size estimation") 474 } 475 476 // NB: The i.block should only be accessed from the writeQueue goroutine, 477 // when parallelism is enabled. We break that invariant here, but that's 478 // okay since parallelism is disabled. 479 if i.size.estimate.size() != uint64(i.block.estimatedSize()) { 480 panic("index block size estimation sans parallelism is incorrect") 481 } 482 } 483 return i.size.estimate.size() 484 } 485 486 // sizeEstimate is used for sstable size estimation. sizeEstimate can be 487 // accessed by the Writer client and compressionQueue goroutines. Fields 488 // should only be read/updated through the functions defined on the 489 // *sizeEstimate type. 490 type dataBlockEstimates struct { 491 // If we don't do block compression in parallel, then we don't need to take 492 // the performance hit of synchronizing using this mutex. 493 useMutex bool 494 mu sync.Mutex 495 496 estimate sizeEstimate 497 } 498 499 // inflightSize is the uncompressed block size estimate which has been 500 // previously provided to addInflightDataBlock(). If addInflightDataBlock() 501 // has not been called, this must be set to 0. compressedSize is the 502 // compressed size of the block. 
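// The following walkthrough is an illustrative sketch of how the
// inflight/compression-ratio estimation described above behaves. It assumes
// only the sizeEstimate type and the package-level emptyBlockSize constant
// referenced in this file; the function name and the numbers are invented.
func exampleSizeEstimateWalkthrough() uint64 {
	var s sizeEstimate
	s.init(emptyBlockSize) // size() reports emptyBlockSize while nothing has been added

	// A 1000-byte uncompressed block is handed off for compression.
	s.addInflight(1000)

	// It compresses to 400 bytes: the 1000 inflight bytes become 400 written
	// bytes, and the observed compression ratio becomes 400/1000 = 0.4.
	s.writtenWithDelta(400, 1000)

	// A second 1000-byte block is now inflight. Its contribution is scaled by
	// the 0.4 ratio, so the estimate is 400 (written) + 400 (inflight) = 800.
	s.addInflight(1000)
	return s.size() // 800
}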
503 func (d *dataBlockEstimates) dataBlockCompressed(compressedSize int, inflightSize int) { 504 if d.useMutex { 505 d.mu.Lock() 506 defer d.mu.Unlock() 507 } 508 d.estimate.writtenWithDelta(compressedSize+blockTrailerLen, inflightSize) 509 } 510 511 // size is an estimated size of datablock data which has been written to disk. 512 func (d *dataBlockEstimates) size() uint64 { 513 if d.useMutex { 514 d.mu.Lock() 515 defer d.mu.Unlock() 516 } 517 // If there is no parallel compression, there should not be any inflight bytes. 518 if invariants.Enabled && !d.useMutex { 519 if d.estimate.inflightSize != 0 { 520 panic("unexpected inflight entry in data block size estimation") 521 } 522 } 523 return d.estimate.size() 524 } 525 526 // Avoid linter unused error. 527 var _ = (&dataBlockEstimates{}).addInflightDataBlock 528 529 // NB: unused since no parallel compression. 530 func (d *dataBlockEstimates) addInflightDataBlock(size int) { 531 if d.useMutex { 532 d.mu.Lock() 533 defer d.mu.Unlock() 534 } 535 536 d.estimate.addInflight(size) 537 } 538 539 var writeTaskPool = sync.Pool{ 540 New: func() interface{} { 541 t := &writeTask{} 542 t.compressionDone = make(chan bool, 1) 543 return t 544 }, 545 } 546 547 type checksummer struct { 548 checksumType ChecksumType 549 xxHasher *xxhash.Digest 550 } 551 552 func (c *checksummer) checksum(block []byte, blockType []byte) (checksum uint32) { 553 // Calculate the checksum. 554 switch c.checksumType { 555 case ChecksumTypeCRC32c: 556 checksum = crc.New(block).Update(blockType).Value() 557 case ChecksumTypeXXHash64: 558 if c.xxHasher == nil { 559 c.xxHasher = xxhash.New() 560 } else { 561 c.xxHasher.Reset() 562 } 563 c.xxHasher.Write(block) 564 c.xxHasher.Write(blockType) 565 checksum = uint32(c.xxHasher.Sum64()) 566 default: 567 panic(errors.Newf("unsupported checksum type: %d", c.checksumType)) 568 } 569 return checksum 570 } 571 572 type blockBuf struct { 573 // tmp is a scratch buffer, large enough to hold either footerLen bytes, 574 // blockTrailerLen bytes, (5 * binary.MaxVarintLen64) bytes, and most 575 // likely large enough for a block handle with properties. 576 tmp [blockHandleLikelyMaxLen]byte 577 // compressedBuf is the destination buffer for compression. It is re-used over the 578 // lifetime of the blockBuf, avoiding the allocation of a temporary buffer for each block. 579 compressedBuf []byte 580 checksummer checksummer 581 } 582 583 func (b *blockBuf) clear() { 584 // We can't assign b.compressedBuf[:0] to compressedBuf because snappy relies 585 // on the length of the buffer, and not the capacity to determine if it needs 586 // to make an allocation. 587 *b = blockBuf{ 588 compressedBuf: b.compressedBuf, checksummer: b.checksummer, 589 } 590 } 591 592 // A dataBlockBuf holds all the state required to compress and write a data block to disk. 593 // A dataBlockBuf begins its lifecycle owned by the Writer client goroutine. The Writer 594 // client goroutine adds keys to the sstable, writing directly into a dataBlockBuf's blockWriter 595 // until the block is full. Once a dataBlockBuf's block is full, the dataBlockBuf may be passed 596 // to other goroutines for compression and file I/O. 597 type dataBlockBuf struct { 598 blockBuf 599 dataBlock blockWriter 600 601 // uncompressed is a reference to a byte slice which is owned by the dataBlockBuf. It is the 602 // next byte slice to be compressed. The uncompressed byte slice will be backed by the 603 // dataBlock.buf. 
604 uncompressed []byte 605 // compressed is a reference to a byte slice which is owned by the dataBlockBuf. It is the 606 // compressed byte slice which must be written to disk. The compressed byte slice may be 607 // backed by the dataBlock.buf, or the dataBlockBuf.compressedBuf, depending on whether 608 // we use the result of the compression. 609 compressed []byte 610 611 // We're making calls to BlockPropertyCollectors from the Writer client goroutine. We need to 612 // pass the encoded block properties over to the write queue. To prevent copies, and allocations, 613 // we give each dataBlockBuf, a blockPropertiesEncoder. 614 blockPropsEncoder blockPropertiesEncoder 615 // dataBlockProps is set when Writer.finishDataBlockProps is called. The dataBlockProps slice is 616 // a shallow copy of the internal buffer of the dataBlockBuf.blockPropsEncoder. 617 dataBlockProps []byte 618 619 // sepScratch is reusable scratch space for computing separator keys. 620 sepScratch []byte 621 } 622 623 func (d *dataBlockBuf) clear() { 624 d.blockBuf.clear() 625 d.dataBlock.clear() 626 627 d.uncompressed = nil 628 d.compressed = nil 629 d.dataBlockProps = nil 630 d.sepScratch = d.sepScratch[:0] 631 } 632 633 var dataBlockBufPool = sync.Pool{ 634 New: func() interface{} { 635 return &dataBlockBuf{} 636 }, 637 } 638 639 func newDataBlockBuf(restartInterval int, checksumType ChecksumType) *dataBlockBuf { 640 d := dataBlockBufPool.Get().(*dataBlockBuf) 641 d.dataBlock.restartInterval = restartInterval 642 d.checksummer.checksumType = checksumType 643 return d 644 } 645 646 func (d *dataBlockBuf) finish() { 647 d.uncompressed = d.dataBlock.finish() 648 } 649 650 func (d *dataBlockBuf) compressAndChecksum(c Compression) { 651 d.compressed = compressAndChecksum(d.uncompressed, c, &d.blockBuf) 652 } 653 654 func (d *dataBlockBuf) shouldFlush( 655 key InternalKey, valueLen, targetBlockSize, sizeThreshold int, 656 ) bool { 657 return shouldFlush( 658 key, valueLen, d.dataBlock.restartInterval, d.dataBlock.estimatedSize(), 659 d.dataBlock.nEntries, targetBlockSize, sizeThreshold) 660 } 661 662 type indexBlockAndBlockProperties struct { 663 nEntries int 664 // sep is the last key added to this block, for computing a separator later. 665 sep InternalKey 666 properties []byte 667 // block is the encoded block produced by blockWriter.finish. 668 block []byte 669 } 670 671 // Set sets the value for the given key. The sequence number is set to 0. 672 // Intended for use to externally construct an sstable before ingestion into a 673 // DB. For a given Writer, the keys passed to Set must be in strictly increasing 674 // order. 675 // 676 // TODO(peter): untested 677 func (w *Writer) Set(key, value []byte) error { 678 if w.err != nil { 679 return w.err 680 } 681 if w.isStrictObsolete { 682 return errors.Errorf("use AddWithForceObsolete") 683 } 684 // forceObsolete is false based on the assumption that no RANGEDELs in the 685 // sstable delete the added points. 686 return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindSet), value, false) 687 } 688 689 // Delete deletes the value for the given key. The sequence number is set to 690 // 0. Intended for use to externally construct an sstable before ingestion into 691 // a DB. 
692 // 693 // TODO(peter): untested 694 func (w *Writer) Delete(key []byte) error { 695 if w.err != nil { 696 return w.err 697 } 698 if w.isStrictObsolete { 699 return errors.Errorf("use AddWithForceObsolete") 700 } 701 // forceObsolete is false based on the assumption that no RANGEDELs in the 702 // sstable delete the added points. 703 return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindDelete), nil, false) 704 } 705 706 // DeleteRange deletes all of the keys (and values) in the range [start,end) 707 // (inclusive on start, exclusive on end). The sequence number is set to 708 // 0. Intended for use to externally construct an sstable before ingestion into 709 // a DB. 710 // 711 // TODO(peter): untested 712 func (w *Writer) DeleteRange(start, end []byte) error { 713 if w.err != nil { 714 return w.err 715 } 716 return w.addTombstone(base.MakeInternalKey(start, 0, InternalKeyKindRangeDelete), end) 717 } 718 719 // Merge adds an action to the DB that merges the value at key with the new 720 // value. The details of the merge are dependent upon the configured merge 721 // operator. The sequence number is set to 0. Intended for use to externally 722 // construct an sstable before ingestion into a DB. 723 // 724 // TODO(peter): untested 725 func (w *Writer) Merge(key, value []byte) error { 726 if w.err != nil { 727 return w.err 728 } 729 if w.isStrictObsolete { 730 return errors.Errorf("use AddWithForceObsolete") 731 } 732 // forceObsolete is false based on the assumption that no RANGEDELs in the 733 // sstable that delete the added points. If the user configured this writer 734 // to be strict-obsolete, addPoint will reject the addition of this MERGE. 735 return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindMerge), value, false) 736 } 737 738 // Add adds a key/value pair to the table being written. For a given Writer, 739 // the keys passed to Add must be in increasing order. The exception to this 740 // rule is range deletion tombstones. Range deletion tombstones need to be 741 // added ordered by their start key, but they can be added out of order from 742 // point entries. Additionally, range deletion tombstones must be fragmented 743 // (i.e. by keyspan.Fragmenter). 744 func (w *Writer) Add(key InternalKey, value []byte) error { 745 if w.isStrictObsolete { 746 return errors.Errorf("use AddWithForceObsolete") 747 } 748 return w.AddWithForceObsolete(key, value, false) 749 } 750 751 // AddWithForceObsolete must be used when writing a strict-obsolete sstable. 752 // 753 // forceObsolete indicates whether the caller has determined that this key is 754 // obsolete even though it may be the latest point key for this userkey. This 755 // should be set to true for keys obsoleted by RANGEDELs, and is required for 756 // strict-obsolete sstables. 757 // 758 // Note that there are two properties, S1 and S2 (see comment in format.go) 759 // that strict-obsolete ssts must satisfy. S2, due to RANGEDELs, is solely the 760 // responsibility of the caller. S1 is solely the responsibility of the 761 // callee. 
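// A minimal, hypothetical sketch of the external-construction workflow the
// Set/Merge/Delete/DeleteRange methods above are intended for. It assumes a
// *Writer obtained from this package's exported constructor (not shown in
// this excerpt); keys are passed in strictly increasing order and every entry
// receives sequence number zero, matching the ingestion contract described
// above.
func exampleBuildIngestSST(w *Writer) error {
	// Point keys must be added in strictly increasing order.
	if err := w.Set([]byte("a"), []byte("apple")); err != nil {
		return err
	}
	if err := w.Set([]byte("b"), []byte("banana")); err != nil {
		return err
	}
	// Range deletions are ordered by their start key, independently of point
	// keys, and must already be fragmented.
	if err := w.DeleteRange([]byte("c"), []byte("d")); err != nil {
		return err
	}
	return w.Close()
}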
762 func (w *Writer) AddWithForceObsolete(key InternalKey, value []byte, forceObsolete bool) error { 763 if w.err != nil { 764 return w.err 765 } 766 767 switch key.Kind() { 768 case InternalKeyKindRangeDelete: 769 return w.addTombstone(key, value) 770 case base.InternalKeyKindRangeKeyDelete, 771 base.InternalKeyKindRangeKeySet, 772 base.InternalKeyKindRangeKeyUnset: 773 w.err = errors.Errorf( 774 "pebble: range keys must be added via one of the RangeKey* functions") 775 return w.err 776 } 777 return w.addPoint(key, value, forceObsolete) 778 } 779 780 func (w *Writer) makeAddPointDecisionV2(key InternalKey) error { 781 prevTrailer := w.lastPointKeyInfo.trailer 782 w.lastPointKeyInfo.trailer = key.Trailer 783 if w.dataBlockBuf.dataBlock.nEntries == 0 { 784 return nil 785 } 786 if !w.disableKeyOrderChecks { 787 prevPointUserKey := w.dataBlockBuf.dataBlock.getCurUserKey() 788 cmpUser := w.compare(prevPointUserKey, key.UserKey) 789 if cmpUser > 0 || (cmpUser == 0 && prevTrailer <= key.Trailer) { 790 return errors.Errorf( 791 "pebble: keys must be added in strictly increasing order: %s, %s", 792 InternalKey{UserKey: prevPointUserKey, Trailer: prevTrailer}.Pretty(w.formatKey), 793 key.Pretty(w.formatKey)) 794 } 795 } 796 return nil 797 } 798 799 // REQUIRES: at least one point has been written to the Writer. 800 func (w *Writer) getLastPointUserKey() []byte { 801 if w.dataBlockBuf.dataBlock.nEntries == 0 { 802 panic(errors.AssertionFailedf("no point keys added to writer")) 803 } 804 return w.dataBlockBuf.dataBlock.getCurUserKey() 805 } 806 807 func (w *Writer) makeAddPointDecisionV3( 808 key InternalKey, valueLen int, 809 ) (setHasSamePrefix bool, writeToValueBlock bool, isObsolete bool, err error) { 810 prevPointKeyInfo := w.lastPointKeyInfo 811 w.lastPointKeyInfo.userKeyLen = len(key.UserKey) 812 w.lastPointKeyInfo.prefixLen = w.lastPointKeyInfo.userKeyLen 813 if w.split != nil { 814 w.lastPointKeyInfo.prefixLen = w.split(key.UserKey) 815 } 816 w.lastPointKeyInfo.trailer = key.Trailer 817 w.lastPointKeyInfo.isObsolete = false 818 if !w.meta.HasPointKeys { 819 return false, false, false, nil 820 } 821 keyKind := base.TrailerKind(key.Trailer) 822 prevPointUserKey := w.getLastPointUserKey() 823 prevPointKey := InternalKey{UserKey: prevPointUserKey, Trailer: prevPointKeyInfo.trailer} 824 prevKeyKind := base.TrailerKind(prevPointKeyInfo.trailer) 825 considerWriteToValueBlock := prevKeyKind == InternalKeyKindSet && 826 keyKind == InternalKeyKindSet 827 if considerWriteToValueBlock && !w.requiredInPlaceValueBound.IsEmpty() { 828 keyPrefix := key.UserKey[:w.lastPointKeyInfo.prefixLen] 829 cmpUpper := w.compare( 830 w.requiredInPlaceValueBound.Upper, keyPrefix) 831 if cmpUpper <= 0 { 832 // Common case for CockroachDB. Make it empty since all future keys in 833 // this sstable will also have cmpUpper <= 0. 834 w.requiredInPlaceValueBound = UserKeyPrefixBound{} 835 } else if w.compare(keyPrefix, w.requiredInPlaceValueBound.Lower) >= 0 { 836 considerWriteToValueBlock = false 837 } 838 } 839 // cmpPrefix is initialized iff considerWriteToValueBlock. 840 var cmpPrefix int 841 var cmpUser int 842 if considerWriteToValueBlock { 843 // Compare the prefixes. 844 cmpPrefix = w.compare(prevPointUserKey[:prevPointKeyInfo.prefixLen], 845 key.UserKey[:w.lastPointKeyInfo.prefixLen]) 846 cmpUser = cmpPrefix 847 if cmpPrefix == 0 { 848 // Need to compare suffixes to compute cmpUser. 
849 cmpUser = w.compare(prevPointUserKey[prevPointKeyInfo.prefixLen:], 850 key.UserKey[w.lastPointKeyInfo.prefixLen:]) 851 } 852 } else { 853 cmpUser = w.compare(prevPointUserKey, key.UserKey) 854 } 855 // Ensure that no one adds a point key kind without considering the obsolete 856 // handling for that kind. 857 switch keyKind { 858 case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge, 859 InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: 860 default: 861 panic(errors.AssertionFailedf("unexpected key kind %s", keyKind.String())) 862 } 863 // If same user key, then the current key is obsolete if any of the 864 // following is true: 865 // C1 The prev key was obsolete. 866 // C2 The prev key was not a MERGE. When the previous key is a MERGE we must 867 // preserve SET* and MERGE since their values will be merged into the 868 // previous key. We also must preserve DEL* since there may be an older 869 // SET*/MERGE in a lower level that must not be merged with the MERGE -- 870 // if we omit the DEL* that lower SET*/MERGE will become visible. 871 // 872 // Regardless of whether it is the same user key or not 873 // C3 The current key is some kind of point delete, and we are writing to 874 // the lowest level, then it is also obsolete. The correctness of this 875 // relies on the same user key not spanning multiple sstables in a level. 876 // 877 // C1 ensures that for a user key there is at most one transition from 878 // !obsolete to obsolete. Consider a user key k, for which the first n keys 879 // are not obsolete. We consider the various value of n: 880 // 881 // n = 0: This happens due to forceObsolete being set by the caller, or due 882 // to C3. forceObsolete must only be set due a RANGEDEL, and that RANGEDEL 883 // must also delete all the lower seqnums for the same user key. C3 triggers 884 // due to a point delete and that deletes all the lower seqnums for the same 885 // user key. 886 // 887 // n = 1: This is the common case. It happens when the first key is not a 888 // MERGE, or the current key is some kind of point delete. 889 // 890 // n > 1: This is due to a sequence of MERGE keys, potentially followed by a 891 // single non-MERGE key. 892 isObsoleteC1AndC2 := cmpUser == 0 && 893 (prevPointKeyInfo.isObsolete || prevKeyKind != InternalKeyKindMerge) 894 isObsoleteC3 := w.writingToLowestLevel && 895 (keyKind == InternalKeyKindDelete || keyKind == InternalKeyKindSingleDelete || 896 keyKind == InternalKeyKindDeleteSized) 897 isObsolete = isObsoleteC1AndC2 || isObsoleteC3 898 // TODO(sumeer): storing isObsolete SET and SETWITHDEL in value blocks is 899 // possible, but requires some care in documenting and checking invariants. 900 // There is code that assumes nothing in value blocks because of single MVCC 901 // version (those should be ok). We have to ensure setHasSamePrefix is 902 // correctly initialized here etc. 903 904 if !w.disableKeyOrderChecks && 905 (cmpUser > 0 || (cmpUser == 0 && prevPointKeyInfo.trailer <= key.Trailer)) { 906 return false, false, false, errors.Errorf( 907 "pebble: keys must be added in strictly increasing order: %s, %s", 908 prevPointKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) 909 } 910 if !considerWriteToValueBlock { 911 return false, false, isObsolete, nil 912 } 913 // NB: it is possible that cmpUser == 0, i.e., these two SETs have identical 914 // user keys (because of an open snapshot). This should be the rare case. 
915 setHasSamePrefix = cmpPrefix == 0 916 considerWriteToValueBlock = setHasSamePrefix 917 // Use of 0 here is somewhat arbitrary. Given the minimum 3 byte encoding of 918 // valueHandle, this should be > 3. But tiny values are common in test and 919 // unlikely in production, so we use 0 here for better test coverage. 920 const tinyValueThreshold = 0 921 if considerWriteToValueBlock && valueLen <= tinyValueThreshold { 922 considerWriteToValueBlock = false 923 } 924 return setHasSamePrefix, considerWriteToValueBlock, isObsolete, nil 925 } 926 927 func (w *Writer) addPoint(key InternalKey, value []byte, forceObsolete bool) error { 928 if w.isStrictObsolete && key.Kind() == InternalKeyKindMerge { 929 return errors.Errorf("MERGE not supported in a strict-obsolete sstable") 930 } 931 var err error 932 var setHasSameKeyPrefix, writeToValueBlock, addPrefixToValueStoredWithKey bool 933 var isObsolete bool 934 maxSharedKeyLen := len(key.UserKey) 935 if w.valueBlockWriter != nil { 936 // maxSharedKeyLen is limited to the prefix of the preceding key. If the 937 // preceding key was in a different block, then the blockWriter will 938 // ignore this maxSharedKeyLen. 939 maxSharedKeyLen = w.lastPointKeyInfo.prefixLen 940 setHasSameKeyPrefix, writeToValueBlock, isObsolete, err = 941 w.makeAddPointDecisionV3(key, len(value)) 942 addPrefixToValueStoredWithKey = base.TrailerKind(key.Trailer) == InternalKeyKindSet 943 } else { 944 err = w.makeAddPointDecisionV2(key) 945 } 946 if err != nil { 947 return err 948 } 949 isObsolete = w.tableFormat >= TableFormatPebblev4 && (isObsolete || forceObsolete) 950 w.lastPointKeyInfo.isObsolete = isObsolete 951 var valueStoredWithKey []byte 952 var prefix valuePrefix 953 var valueStoredWithKeyLen int 954 if writeToValueBlock { 955 vh, err := w.valueBlockWriter.addValue(value) 956 if err != nil { 957 return err 958 } 959 n := encodeValueHandle(w.blockBuf.tmp[:], vh) 960 valueStoredWithKey = w.blockBuf.tmp[:n] 961 valueStoredWithKeyLen = len(valueStoredWithKey) + 1 962 var attribute base.ShortAttribute 963 if w.shortAttributeExtractor != nil { 964 // TODO(sumeer): for compactions, it is possible that the input sstable 965 // already has this value in the value section and so we have already 966 // extracted the ShortAttribute. Avoid extracting it again. This will 967 // require changing the Writer.Add interface. 968 if attribute, err = w.shortAttributeExtractor( 969 key.UserKey, w.lastPointKeyInfo.prefixLen, value); err != nil { 970 return err 971 } 972 } 973 prefix = makePrefixForValueHandle(setHasSameKeyPrefix, attribute) 974 } else { 975 valueStoredWithKey = value 976 valueStoredWithKeyLen = len(value) 977 if addPrefixToValueStoredWithKey { 978 valueStoredWithKeyLen++ 979 } 980 prefix = makePrefixForInPlaceValue(setHasSameKeyPrefix) 981 } 982 983 if err := w.maybeFlush(key, valueStoredWithKeyLen); err != nil { 984 return err 985 } 986 987 for i := range w.propCollectors { 988 if err := w.propCollectors[i].Add(key, value); err != nil { 989 w.err = err 990 return err 991 } 992 } 993 for i := range w.blockPropCollectors { 994 v := value 995 if addPrefixToValueStoredWithKey { 996 // Values for SET are not required to be in-place, and in the future may 997 // not even be read by the compaction, so pass nil values. Block 998 // property collectors in such Pebble DB's must not look at the value. 
999 v = nil 1000 } 1001 if err := w.blockPropCollectors[i].Add(key, v); err != nil { 1002 w.err = err 1003 return err 1004 } 1005 } 1006 if w.tableFormat >= TableFormatPebblev4 { 1007 w.obsoleteCollector.AddPoint(isObsolete) 1008 } 1009 1010 w.maybeAddToFilter(key.UserKey) 1011 w.dataBlockBuf.dataBlock.addWithOptionalValuePrefix( 1012 key, isObsolete, valueStoredWithKey, maxSharedKeyLen, addPrefixToValueStoredWithKey, prefix, 1013 setHasSameKeyPrefix) 1014 1015 w.meta.updateSeqNum(key.SeqNum()) 1016 1017 if !w.meta.HasPointKeys { 1018 k := w.dataBlockBuf.dataBlock.getCurKey() 1019 // NB: We need to ensure that SmallestPoint.UserKey is set, so we create 1020 // an InternalKey which is semantically identical to the key, but won't 1021 // have a nil UserKey. We do this, because key.UserKey could be nil, and 1022 // we don't want SmallestPoint.UserKey to be nil. 1023 // 1024 // todo(bananabrick): Determine if it's okay to have a nil SmallestPoint 1025 // .UserKey now that we don't rely on a nil UserKey to determine if the 1026 // key has been set or not. 1027 w.meta.SetSmallestPointKey(k.Clone()) 1028 } 1029 1030 w.props.NumEntries++ 1031 switch key.Kind() { 1032 case InternalKeyKindDelete, InternalKeyKindSingleDelete: 1033 w.props.NumDeletions++ 1034 w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey)) 1035 case InternalKeyKindDeleteSized: 1036 var size uint64 1037 if len(value) > 0 { 1038 var n int 1039 size, n = binary.Uvarint(value) 1040 if n <= 0 { 1041 w.err = errors.Newf("%s key's value (%x) does not parse as uvarint", 1042 errors.Safe(key.Kind().String()), value) 1043 return w.err 1044 } 1045 } 1046 w.props.NumDeletions++ 1047 w.props.NumSizedDeletions++ 1048 w.props.RawPointTombstoneKeySize += uint64(len(key.UserKey)) 1049 w.props.RawPointTombstoneValueSize += size 1050 case InternalKeyKindMerge: 1051 w.props.NumMergeOperands++ 1052 } 1053 w.props.RawKeySize += uint64(key.Size()) 1054 w.props.RawValueSize += uint64(len(value)) 1055 return nil 1056 } 1057 1058 func (w *Writer) prettyTombstone(k InternalKey, value []byte) fmt.Formatter { 1059 return keyspan.Span{ 1060 Start: k.UserKey, 1061 End: value, 1062 Keys: []keyspan.Key{{Trailer: k.Trailer}}, 1063 }.Pretty(w.formatKey) 1064 } 1065 1066 func (w *Writer) addTombstone(key InternalKey, value []byte) error { 1067 if !w.disableKeyOrderChecks && !w.rangeDelV1Format && w.rangeDelBlock.nEntries > 0 { 1068 // Check that tombstones are being added in fragmented order. If the two 1069 // tombstones overlap, their start and end keys must be identical. 
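// For example: the overlapping tombstones [a, e)#5 and [c, g)#3 cannot be
// added directly. A keyspan.Fragmenter first splits them at the overlap
// boundaries into [a, c)#5, [c, e)#5, [c, e)#3 and [e, g)#3, which are then
// added in that order: start keys non-decreasing, and seqnums strictly
// decreasing among fragments that share a start key.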
1070 prevKey := w.rangeDelBlock.getCurKey() 1071 switch c := w.compare(prevKey.UserKey, key.UserKey); { 1072 case c > 0: 1073 w.err = errors.Errorf("pebble: keys must be added in order: %s, %s", 1074 prevKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) 1075 return w.err 1076 case c == 0: 1077 prevValue := w.rangeDelBlock.curValue 1078 if w.compare(prevValue, value) != 0 { 1079 w.err = errors.Errorf("pebble: overlapping tombstones must be fragmented: %s vs %s", 1080 w.prettyTombstone(prevKey, prevValue), 1081 w.prettyTombstone(key, value)) 1082 return w.err 1083 } 1084 if prevKey.SeqNum() <= key.SeqNum() { 1085 w.err = errors.Errorf("pebble: keys must be added in strictly increasing order: %s, %s", 1086 prevKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) 1087 return w.err 1088 } 1089 default: 1090 prevValue := w.rangeDelBlock.curValue 1091 if w.compare(prevValue, key.UserKey) > 0 { 1092 w.err = errors.Errorf("pebble: overlapping tombstones must be fragmented: %s vs %s", 1093 w.prettyTombstone(prevKey, prevValue), 1094 w.prettyTombstone(key, value)) 1095 return w.err 1096 } 1097 } 1098 } 1099 1100 if key.Trailer == InternalKeyRangeDeleteSentinel { 1101 w.err = errors.Errorf("pebble: cannot add range delete sentinel: %s", key.Pretty(w.formatKey)) 1102 return w.err 1103 } 1104 1105 for i := range w.propCollectors { 1106 if err := w.propCollectors[i].Add(key, value); err != nil { 1107 w.err = err 1108 return err 1109 } 1110 } 1111 1112 w.meta.updateSeqNum(key.SeqNum()) 1113 1114 switch { 1115 case w.rangeDelV1Format: 1116 // Range tombstones are not fragmented in the v1 (i.e. RocksDB) range 1117 // deletion block format, so we need to track the largest range tombstone 1118 // end key as every range tombstone is added. 1119 // 1120 // Note that writing the v1 format is only supported for tests. 1121 if w.props.NumRangeDeletions == 0 { 1122 w.meta.SetSmallestRangeDelKey(key.Clone()) 1123 w.meta.SetLargestRangeDelKey(base.MakeRangeDeleteSentinelKey(value).Clone()) 1124 } else { 1125 if base.InternalCompare(w.compare, w.meta.SmallestRangeDel, key) > 0 { 1126 w.meta.SetSmallestRangeDelKey(key.Clone()) 1127 } 1128 end := base.MakeRangeDeleteSentinelKey(value) 1129 if base.InternalCompare(w.compare, w.meta.LargestRangeDel, end) < 0 { 1130 w.meta.SetLargestRangeDelKey(end.Clone()) 1131 } 1132 } 1133 1134 default: 1135 // Range tombstones are fragmented in the v2 range deletion block format, 1136 // so the start key of the first range tombstone added will be the smallest 1137 // range tombstone key. The largest range tombstone key will be determined 1138 // in Writer.Close() as the end key of the last range tombstone added. 1139 if w.props.NumRangeDeletions == 0 { 1140 w.meta.SetSmallestRangeDelKey(key.Clone()) 1141 } 1142 } 1143 1144 w.props.NumEntries++ 1145 w.props.NumDeletions++ 1146 w.props.NumRangeDeletions++ 1147 w.props.RawKeySize += uint64(key.Size()) 1148 w.props.RawValueSize += uint64(len(value)) 1149 w.rangeDelBlock.add(key, value) 1150 return nil 1151 } 1152 1153 // RangeKeySet sets a range between start (inclusive) and end (exclusive) with 1154 // the given suffix to the given value. The resulting range key is given the 1155 // sequence number zero, with the expectation that the resulting sstable will be 1156 // ingested. 1157 // 1158 // Keys must be added to the table in increasing order of start key. Spans are 1159 // not required to be fragmented. The same suffix may not be set or unset twice 1160 // over the same keyspan, because it would result in inconsistent state. 
Both 1161 // the Set and Unset would share the zero sequence number, and a key cannot be 1162 // both simultaneously set and unset. 1163 func (w *Writer) RangeKeySet(start, end, suffix, value []byte) error { 1164 return w.addRangeKeySpan(keyspan.Span{ 1165 Start: w.tempRangeKeyCopy(start), 1166 End: w.tempRangeKeyCopy(end), 1167 Keys: []keyspan.Key{ 1168 { 1169 Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeySet), 1170 Suffix: w.tempRangeKeyCopy(suffix), 1171 Value: w.tempRangeKeyCopy(value), 1172 }, 1173 }, 1174 }) 1175 } 1176 1177 // RangeKeyUnset un-sets a range between start (inclusive) and end (exclusive) 1178 // with the given suffix. The resulting range key is given the 1179 // sequence number zero, with the expectation that the resulting sstable will be 1180 // ingested. 1181 // 1182 // Keys must be added to the table in increasing order of start key. Spans are 1183 // not required to be fragmented. The same suffix may not be set or unset twice 1184 // over the same keyspan, because it would result in inconsistent state. Both 1185 // the Set and Unset would share the zero sequence number, and a key cannot be 1186 // both simultaneously set and unset. 1187 func (w *Writer) RangeKeyUnset(start, end, suffix []byte) error { 1188 return w.addRangeKeySpan(keyspan.Span{ 1189 Start: w.tempRangeKeyCopy(start), 1190 End: w.tempRangeKeyCopy(end), 1191 Keys: []keyspan.Key{ 1192 { 1193 Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeyUnset), 1194 Suffix: w.tempRangeKeyCopy(suffix), 1195 }, 1196 }, 1197 }) 1198 } 1199 1200 // RangeKeyDelete deletes a range between start (inclusive) and end (exclusive). 1201 // 1202 // Keys must be added to the table in increasing order of start key. Spans are 1203 // not required to be fragmented. 1204 func (w *Writer) RangeKeyDelete(start, end []byte) error { 1205 return w.addRangeKeySpan(keyspan.Span{ 1206 Start: w.tempRangeKeyCopy(start), 1207 End: w.tempRangeKeyCopy(end), 1208 Keys: []keyspan.Key{ 1209 {Trailer: base.MakeTrailer(0, base.InternalKeyKindRangeKeyDelete)}, 1210 }, 1211 }) 1212 } 1213 1214 // AddRangeKey adds a range key set, unset, or delete key/value pair to the 1215 // table being written. 1216 // 1217 // Range keys must be supplied in strictly ascending order of start key (i.e. 1218 // user key ascending, sequence number descending, and key type descending). 1219 // Ranges added must also be supplied in fragmented span order - i.e. other than 1220 // spans that are perfectly aligned (same start and end keys), spans may not 1221 // overlap. Range keys may be added out of order relative to point keys and 1222 // range deletions. 1223 func (w *Writer) AddRangeKey(key InternalKey, value []byte) error { 1224 if w.err != nil { 1225 return w.err 1226 } 1227 return w.addRangeKey(key, value) 1228 } 1229 1230 func (w *Writer) addRangeKeySpan(span keyspan.Span) error { 1231 if w.compare(span.Start, span.End) >= 0 { 1232 return errors.Errorf( 1233 "pebble: start key must be strictly less than end key", 1234 ) 1235 } 1236 if w.fragmenter.Start() != nil && w.compare(w.fragmenter.Start(), span.Start) > 0 { 1237 return errors.Errorf("pebble: spans must be added in order: %s > %s", 1238 w.formatKey(w.fragmenter.Start()), w.formatKey(span.Start)) 1239 } 1240 // Add this span to the fragmenter. 1241 w.fragmenter.Add(span) 1242 return w.err 1243 } 1244 1245 func (w *Writer) encodeRangeKeySpan(span keyspan.Span) { 1246 // This method is the emit function of the Fragmenter. 
1247 // 1248 // NB: The span should only contain range keys and be internally consistent 1249 // (eg, no duplicate suffixes, no additional keys after a RANGEKEYDEL). 1250 // 1251 // We use w.rangeKeysBySuffix and w.rangeKeySpan to avoid allocations. 1252 1253 // Sort the keys by suffix. Iteration doesn't *currently* depend on it, but 1254 // we may want to in the future. 1255 w.rangeKeysBySuffix.Cmp = w.compare 1256 w.rangeKeysBySuffix.Keys = span.Keys 1257 sort.Sort(&w.rangeKeysBySuffix) 1258 1259 w.rangeKeySpan = span 1260 w.rangeKeySpan.Keys = w.rangeKeysBySuffix.Keys 1261 w.err = firstError(w.err, w.rangeKeyEncoder.Encode(&w.rangeKeySpan)) 1262 } 1263 1264 func (w *Writer) addRangeKey(key InternalKey, value []byte) error { 1265 if !w.disableKeyOrderChecks && w.rangeKeyBlock.nEntries > 0 { 1266 prevStartKey := w.rangeKeyBlock.getCurKey() 1267 prevEndKey, _, ok := rangekey.DecodeEndKey(prevStartKey.Kind(), w.rangeKeyBlock.curValue) 1268 if !ok { 1269 // We panic here as we should have previously decoded and validated this 1270 // key and value when it was first added to the range key block. 1271 panic(errors.Errorf("pebble: invalid end key for span: %s", 1272 prevStartKey.Pretty(w.formatKey))) 1273 } 1274 1275 curStartKey := key 1276 curEndKey, _, ok := rangekey.DecodeEndKey(curStartKey.Kind(), value) 1277 if !ok { 1278 w.err = errors.Errorf("pebble: invalid end key for span: %s", 1279 curStartKey.Pretty(w.formatKey)) 1280 return w.err 1281 } 1282 1283 // Start keys must be strictly increasing. 1284 if base.InternalCompare(w.compare, prevStartKey, curStartKey) >= 0 { 1285 w.err = errors.Errorf( 1286 "pebble: range keys starts must be added in increasing order: %s, %s", 1287 prevStartKey.Pretty(w.formatKey), key.Pretty(w.formatKey)) 1288 return w.err 1289 } 1290 1291 // Start keys are increasing. If the start user keys are equal, the 1292 // end keys must be equal (i.e. aligned spans). 1293 if w.compare(prevStartKey.UserKey, curStartKey.UserKey) == 0 { 1294 if w.compare(prevEndKey, curEndKey) != 0 { 1295 w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s", 1296 prevStartKey.Pretty(w.formatKey), 1297 curStartKey.Pretty(w.formatKey)) 1298 return w.err 1299 } 1300 } else if w.compare(prevEndKey, curStartKey.UserKey) > 0 { 1301 // If the start user keys are NOT equal, the spans must be disjoint (i.e. 1302 // no overlap). 1303 // NOTE: the inequality excludes zero, as we allow the end key of the 1304 // lower span be the same as the start key of the upper span, because 1305 // the range end key is considered an exclusive bound. 1306 w.err = errors.Errorf("pebble: overlapping range keys must be fragmented: %s, %s", 1307 prevStartKey.Pretty(w.formatKey), 1308 curStartKey.Pretty(w.formatKey)) 1309 return w.err 1310 } 1311 } 1312 1313 // TODO(travers): Add an invariant-gated check to ensure that suffix-values 1314 // are sorted within coalesced spans. 1315 1316 // Range-keys and point-keys are intended to live in "parallel" keyspaces. 1317 // However, we track a single seqnum in the table metadata that spans both of 1318 // these keyspaces. 1319 // TODO(travers): Consider tracking range key seqnums separately. 1320 w.meta.updateSeqNum(key.SeqNum()) 1321 1322 // Range tombstones are fragmented, so the start key of the first range key 1323 // added will be the smallest. The largest range key is determined in 1324 // Writer.Close() as the end key of the last range key added to the block. 
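// For example, if the spans [a, c) and [c, f) are added in that order, the
// smallest range key is taken here from the first span's start key a, and
// Writer.Close() will derive the largest range key from the end key f of the
// last span added.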
1325 if w.props.NumRangeKeys() == 0 { 1326 w.meta.SetSmallestRangeKey(key.Clone()) 1327 } 1328 1329 // Update block properties. 1330 w.props.RawRangeKeyKeySize += uint64(key.Size()) 1331 w.props.RawRangeKeyValueSize += uint64(len(value)) 1332 switch key.Kind() { 1333 case base.InternalKeyKindRangeKeyDelete: 1334 w.props.NumRangeKeyDels++ 1335 case base.InternalKeyKindRangeKeySet: 1336 w.props.NumRangeKeySets++ 1337 case base.InternalKeyKindRangeKeyUnset: 1338 w.props.NumRangeKeyUnsets++ 1339 default: 1340 panic(errors.Errorf("pebble: invalid range key type: %s", key.Kind())) 1341 } 1342 1343 for i := range w.blockPropCollectors { 1344 if err := w.blockPropCollectors[i].Add(key, value); err != nil { 1345 return err 1346 } 1347 } 1348 1349 // Add the key to the block. 1350 w.rangeKeyBlock.add(key, value) 1351 return nil 1352 } 1353 1354 // tempRangeKeyBuf returns a slice of length n from the Writer's rkBuf byte 1355 // slice. Any byte written to the returned slice is retained for the lifetime of 1356 // the Writer. 1357 func (w *Writer) tempRangeKeyBuf(n int) []byte { 1358 if cap(w.rkBuf)-len(w.rkBuf) < n { 1359 size := len(w.rkBuf) + 2*n 1360 if size < 2*cap(w.rkBuf) { 1361 size = 2 * cap(w.rkBuf) 1362 } 1363 buf := make([]byte, len(w.rkBuf), size) 1364 copy(buf, w.rkBuf) 1365 w.rkBuf = buf 1366 } 1367 b := w.rkBuf[len(w.rkBuf) : len(w.rkBuf)+n] 1368 w.rkBuf = w.rkBuf[:len(w.rkBuf)+n] 1369 return b 1370 } 1371 1372 // tempRangeKeyCopy returns a copy of the provided slice, stored in the Writer's 1373 // range key buffer. 1374 func (w *Writer) tempRangeKeyCopy(k []byte) []byte { 1375 if len(k) == 0 { 1376 return nil 1377 } 1378 buf := w.tempRangeKeyBuf(len(k)) 1379 copy(buf, k) 1380 return buf 1381 } 1382 1383 func (w *Writer) maybeAddToFilter(key []byte) { 1384 if w.filter != nil { 1385 if w.split != nil { 1386 prefix := key[:w.split(key)] 1387 w.filter.addKey(prefix) 1388 } else { 1389 w.filter.addKey(key) 1390 } 1391 } 1392 } 1393 1394 func (w *Writer) flush(key InternalKey) error { 1395 // We're finishing a data block. 1396 err := w.finishDataBlockProps(w.dataBlockBuf) 1397 if err != nil { 1398 return err 1399 } 1400 w.dataBlockBuf.finish() 1401 w.dataBlockBuf.compressAndChecksum(w.compression) 1402 // Since dataBlockEstimates.addInflightDataBlock was never called, the 1403 // inflightSize is set to 0. 1404 w.coordination.sizeEstimate.dataBlockCompressed(len(w.dataBlockBuf.compressed), 0) 1405 1406 // Determine if the index block should be flushed. Since we're accessing the 1407 // dataBlockBuf.dataBlock.curKey here, we have to make sure that once we start 1408 // to pool the dataBlockBufs, the curKey isn't used by the Writer once the 1409 // dataBlockBuf is added back to a sync.Pool. In this particular case, the 1410 // byte slice which supports "sep" will eventually be copied when "sep" is 1411 // added to the index block. 1412 prevKey := w.dataBlockBuf.dataBlock.getCurKey() 1413 sep := w.indexEntrySep(prevKey, key, w.dataBlockBuf) 1414 // We determine that we should flush an index block from the Writer client 1415 // goroutine, but we actually finish the index block from the writeQueue. 1416 // When we determine that an index block should be flushed, we need to call 1417 // BlockPropertyCollector.FinishIndexBlock. But block property collector 1418 // calls must happen sequentially from the Writer client. Therefore, we need 1419 // to determine that we are going to flush the index block from the Writer 1420 // client. 
1421 shouldFlushIndexBlock := supportsTwoLevelIndex(w.tableFormat) && w.indexBlock.shouldFlush( 1422 sep, encodedBHPEstimatedSize, w.indexBlockSize, w.indexBlockSizeThreshold, 1423 ) 1424 1425 var indexProps []byte 1426 var flushableIndexBlock *indexBlockBuf 1427 if shouldFlushIndexBlock { 1428 flushableIndexBlock = w.indexBlock 1429 w.indexBlock = newIndexBlockBuf(w.coordination.parallelismEnabled) 1430 // Call BlockPropertyCollector.FinishIndexBlock, since we've decided to 1431 // flush the index block. 1432 indexProps, err = w.finishIndexBlockProps() 1433 if err != nil { 1434 return err 1435 } 1436 } 1437 1438 // We've called BlockPropertyCollector.FinishDataBlock, and, if necessary, 1439 // BlockPropertyCollector.FinishIndexBlock. Since we've decided to finish 1440 // the data block, we can call 1441 // BlockPropertyCollector.AddPrevDataBlockToIndexBlock. 1442 w.addPrevDataBlockToIndexBlockProps() 1443 1444 // Schedule a write. 1445 writeTask := writeTaskPool.Get().(*writeTask) 1446 // We're setting compressionDone to indicate that compression of this block 1447 // has already been completed. 1448 writeTask.compressionDone <- true 1449 writeTask.buf = w.dataBlockBuf 1450 writeTask.indexEntrySep = sep 1451 writeTask.currIndexBlock = w.indexBlock 1452 writeTask.indexInflightSize = sep.Size() + encodedBHPEstimatedSize 1453 writeTask.finishedIndexProps = indexProps 1454 writeTask.flushableIndexBlock = flushableIndexBlock 1455 1456 // The writeTask corresponds to an unwritten index entry. 1457 w.indexBlock.addInflight(writeTask.indexInflightSize) 1458 1459 w.dataBlockBuf = nil 1460 if w.coordination.parallelismEnabled { 1461 w.coordination.writeQueue.add(writeTask) 1462 } else { 1463 err = w.coordination.writeQueue.addSync(writeTask) 1464 } 1465 w.dataBlockBuf = newDataBlockBuf(w.restartInterval, w.checksumType) 1466 1467 return err 1468 } 1469 1470 func (w *Writer) maybeFlush(key InternalKey, valueLen int) error { 1471 if !w.dataBlockBuf.shouldFlush(key, valueLen, w.blockSize, w.blockSizeThreshold) { 1472 return nil 1473 } 1474 1475 err := w.flush(key) 1476 1477 if err != nil { 1478 w.err = err 1479 return err 1480 } 1481 1482 return nil 1483 } 1484 1485 // dataBlockBuf.dataBlockProps set by this method must be encoded before any future use of the 1486 // dataBlockBuf.blockPropsEncoder, since the properties slice will get reused by the 1487 // blockPropsEncoder. 1488 func (w *Writer) finishDataBlockProps(buf *dataBlockBuf) error { 1489 if len(w.blockPropCollectors) == 0 { 1490 return nil 1491 } 1492 var err error 1493 buf.blockPropsEncoder.resetProps() 1494 for i := range w.blockPropCollectors { 1495 scratch := buf.blockPropsEncoder.getScratchForProp() 1496 if scratch, err = w.blockPropCollectors[i].FinishDataBlock(scratch); err != nil { 1497 return err 1498 } 1499 if len(scratch) > 0 { 1500 buf.blockPropsEncoder.addProp(shortID(i), scratch) 1501 } 1502 } 1503 1504 buf.dataBlockProps = buf.blockPropsEncoder.unsafeProps() 1505 return nil 1506 } 1507 1508 // The BlockHandleWithProperties returned by this method must be encoded before any future use of 1509 // the Writer.blockPropsEncoder, since the properties slice will get reused by the blockPropsEncoder. 1510 // maybeAddBlockPropertiesToBlockHandle should only be called if block is being written synchronously 1511 // with the Writer client. 
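// A minimal sketch (using invented names, not this package's actual types) of
// the handoff pattern used by flush above: each task carries a buffered
// done-channel of capacity 1, so a producer that has already compressed the
// block can mark it done without blocking, and the consumer waits on the
// channel whether or not the work was performed inline.
type exampleTask struct {
	payload []byte
	done    chan bool
}

func exampleHandoff(queue chan *exampleTask, payload []byte) {
	t := &exampleTask{payload: payload, done: make(chan bool, 1)}
	t.done <- true // compression already performed by the producer
	queue <- t     // the consumer's <-t.done returns immediately
}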
1512 func (w *Writer) maybeAddBlockPropertiesToBlockHandle(
1513 bh BlockHandle,
1514 ) (BlockHandleWithProperties, error) {
1515 err := w.finishDataBlockProps(w.dataBlockBuf)
1516 if err != nil {
1517 return BlockHandleWithProperties{}, err
1518 }
1519 return BlockHandleWithProperties{BlockHandle: bh, Props: w.dataBlockBuf.dataBlockProps}, nil
1520 }
1521
1522 func (w *Writer) indexEntrySep(prevKey, key InternalKey, dataBlockBuf *dataBlockBuf) InternalKey {
1523 // Make a rough guess that we want key-sized scratch to compute the separator.
1524 if cap(dataBlockBuf.sepScratch) < key.Size() {
1525 dataBlockBuf.sepScratch = make([]byte, 0, key.Size()*2)
1526 }
1527
1528 var sep InternalKey
1529 if key.UserKey == nil && key.Trailer == 0 {
1530 sep = prevKey.Successor(w.compare, w.successor, dataBlockBuf.sepScratch[:0])
1531 } else {
1532 sep = prevKey.Separator(w.compare, w.separator, dataBlockBuf.sepScratch[:0], key)
1533 }
1534 return sep
1535 }
1536
1537 // addIndexEntry adds an index entry for the specified key and block handle.
1538 // addIndexEntry can be called from both the Writer client goroutine, and the
1539 // writeQueue goroutine. If flushIndexBuf != nil, then indexProps must also be provided, as
1540 // they're used when the index block is finished.
1541 //
1542 // Invariant:
1543 // 1. addIndexEntry must not store references to the sep InternalKey, the tmp
1544 // byte slice, bhp.Props. That is, these must be either deep copied or
1545 // encoded.
1546 // 2. addIndexEntry must not hold references to the flushIndexBuf, and the writeTo
1547 // indexBlockBufs.
1548 func (w *Writer) addIndexEntry(
1549 sep InternalKey,
1550 bhp BlockHandleWithProperties,
1551 tmp []byte,
1552 flushIndexBuf *indexBlockBuf,
1553 writeTo *indexBlockBuf,
1554 inflightSize int,
1555 indexProps []byte,
1556 ) error {
1557 if bhp.Length == 0 {
1558 // A valid blockHandle must be non-zero.
1559 // In particular, it must have a non-zero length.
1560 return nil
1561 }
1562
1563 encoded := encodeBlockHandleWithProperties(tmp, bhp)
1564
1565 if flushIndexBuf != nil {
1566 if cap(w.indexPartitions) == 0 {
1567 w.indexPartitions = make([]indexBlockAndBlockProperties, 0, 32)
1568 }
1569 // Enable two level indexes if there is more than one index block.
1570 w.twoLevelIndex = true
1571 if err := w.finishIndexBlock(flushIndexBuf, indexProps); err != nil {
1572 return err
1573 }
1574 }
1575
1576 writeTo.add(sep, encoded, inflightSize)
1577 return nil
1578 }
1579
1580 func (w *Writer) addPrevDataBlockToIndexBlockProps() {
1581 for i := range w.blockPropCollectors {
1582 w.blockPropCollectors[i].AddPrevDataBlockToIndexBlock()
1583 }
1584 }
1585
1586 // addIndexEntrySync adds an index entry for the specified key and block handle.
1587 // Writer.addIndexEntrySync is only called synchronously once Writer.Close is called.
1588 // addIndexEntrySync should only be called if we're sure that index entries
1589 // aren't being written asynchronously.
1590 //
1591 // Invariant:
1592 // 1. addIndexEntrySync must not store references to the prevKey, key InternalKey's,
1593 // the tmp byte slice. That is, these must be either deep copied or encoded.
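//
// A minimal sketch of the call made from Writer.Close for the final data block
// (illustrative only; bhp is the BlockHandleWithProperties for the block just
// written, and passing the zero InternalKey{} makes indexEntrySep fall back to
// prevKey.Successor instead of computing a separator):
//
//	prevKey := w.dataBlockBuf.dataBlock.getCurKey()
//	if err := w.addIndexEntrySync(prevKey, InternalKey{}, bhp, w.dataBlockBuf.tmp[:]); err != nil {
//		return err
//	}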
1594 func (w *Writer) addIndexEntrySync( 1595 prevKey, key InternalKey, bhp BlockHandleWithProperties, tmp []byte, 1596 ) error { 1597 sep := w.indexEntrySep(prevKey, key, w.dataBlockBuf) 1598 shouldFlush := supportsTwoLevelIndex( 1599 w.tableFormat) && w.indexBlock.shouldFlush( 1600 sep, encodedBHPEstimatedSize, w.indexBlockSize, w.indexBlockSizeThreshold, 1601 ) 1602 var flushableIndexBlock *indexBlockBuf 1603 var props []byte 1604 var err error 1605 if shouldFlush { 1606 flushableIndexBlock = w.indexBlock 1607 w.indexBlock = newIndexBlockBuf(w.coordination.parallelismEnabled) 1608 1609 // Call BlockPropertyCollector.FinishIndexBlock, since we've decided to 1610 // flush the index block. 1611 props, err = w.finishIndexBlockProps() 1612 if err != nil { 1613 return err 1614 } 1615 } 1616 1617 err = w.addIndexEntry(sep, bhp, tmp, flushableIndexBlock, w.indexBlock, 0, props) 1618 if flushableIndexBlock != nil { 1619 flushableIndexBlock.clear() 1620 indexBlockBufPool.Put(flushableIndexBlock) 1621 } 1622 w.addPrevDataBlockToIndexBlockProps() 1623 return err 1624 } 1625 1626 func shouldFlush( 1627 key InternalKey, 1628 valueLen int, 1629 restartInterval, estimatedBlockSize, numEntries, targetBlockSize, sizeThreshold int, 1630 ) bool { 1631 if numEntries == 0 { 1632 return false 1633 } 1634 1635 if estimatedBlockSize >= targetBlockSize { 1636 return true 1637 } 1638 1639 // The block is currently smaller than the target size. 1640 if estimatedBlockSize <= sizeThreshold { 1641 // The block is smaller than the threshold size at which we'll consider 1642 // flushing it. 1643 return false 1644 } 1645 1646 newSize := estimatedBlockSize + key.Size() + valueLen 1647 if numEntries%restartInterval == 0 { 1648 newSize += 4 1649 } 1650 newSize += 4 // varint for shared prefix length 1651 newSize += uvarintLen(uint32(key.Size())) // varint for unshared key bytes 1652 newSize += uvarintLen(uint32(valueLen)) // varint for value size 1653 // Flush if the block plus the new entry is larger than the target size. 1654 return newSize > targetBlockSize 1655 } 1656 1657 func cloneKeyWithBuf(k InternalKey, a bytealloc.A) (bytealloc.A, InternalKey) { 1658 if len(k.UserKey) == 0 { 1659 return a, k 1660 } 1661 a, keyCopy := a.Copy(k.UserKey) 1662 return a, InternalKey{UserKey: keyCopy, Trailer: k.Trailer} 1663 } 1664 1665 // Invariants: The byte slice returned by finishIndexBlockProps is heap-allocated 1666 // 1667 // and has its own lifetime, independent of the Writer and the blockPropsEncoder, 1668 // 1669 // and it is safe to: 1670 // 1. Reuse w.blockPropsEncoder without first encoding the byte slice returned. 1671 // 2. Store the byte slice in the Writer since it is a copy and not supported by 1672 // an underlying buffer. 1673 func (w *Writer) finishIndexBlockProps() ([]byte, error) { 1674 w.blockPropsEncoder.resetProps() 1675 for i := range w.blockPropCollectors { 1676 scratch := w.blockPropsEncoder.getScratchForProp() 1677 var err error 1678 if scratch, err = w.blockPropCollectors[i].FinishIndexBlock(scratch); err != nil { 1679 return nil, err 1680 } 1681 if len(scratch) > 0 { 1682 w.blockPropsEncoder.addProp(shortID(i), scratch) 1683 } 1684 } 1685 return w.blockPropsEncoder.props(), nil 1686 } 1687 1688 // finishIndexBlock finishes the current index block and adds it to the top 1689 // level index block. This is only used when two level indexes are enabled. 1690 // 1691 // Invariants: 1692 // 1. 
The props slice passed into finishIndexBlock must not be
1693 // owned by any other struct, since it will be stored in the Writer.indexPartitions
1694 // slice.
1695 // 2. None of the buffers owned by indexBuf will be shallow copied and stored elsewhere.
1696 // That is, it must be safe to reuse indexBuf after finishIndexBlock has been called.
1697 func (w *Writer) finishIndexBlock(indexBuf *indexBlockBuf, props []byte) error {
1698 part := indexBlockAndBlockProperties{
1699 nEntries: indexBuf.block.nEntries, properties: props,
1700 }
1701 w.indexSepAlloc, part.sep = cloneKeyWithBuf(
1702 indexBuf.block.getCurKey(), w.indexSepAlloc,
1703 )
1704 bk := indexBuf.finish()
1705 if len(w.indexBlockAlloc) < len(bk) {
1706 // Allocate enough bytes for approximately 16 index blocks.
1707 w.indexBlockAlloc = make([]byte, len(bk)*16)
1708 }
1709 n := copy(w.indexBlockAlloc, bk)
1710 part.block = w.indexBlockAlloc[:n:n]
1711 w.indexBlockAlloc = w.indexBlockAlloc[n:]
1712 w.indexPartitions = append(w.indexPartitions, part)
1713 return nil
1714 }
1715
1716 func (w *Writer) writeTwoLevelIndex() (BlockHandle, error) {
1717 props, err := w.finishIndexBlockProps()
1718 if err != nil {
1719 return BlockHandle{}, err
1720 }
1721 // Add the final unfinished index.
1722 if err = w.finishIndexBlock(w.indexBlock, props); err != nil {
1723 return BlockHandle{}, err
1724 }
1725
1726 for i := range w.indexPartitions {
1727 b := &w.indexPartitions[i]
1728 w.props.NumDataBlocks += uint64(b.nEntries)
1729
1730 data := b.block
1731 w.props.IndexSize += uint64(len(data))
1732 bh, err := w.writeBlock(data, w.compression, &w.blockBuf)
1733 if err != nil {
1734 return BlockHandle{}, err
1735 }
1736 bhp := BlockHandleWithProperties{
1737 BlockHandle: bh,
1738 Props: b.properties,
1739 }
1740 encoded := encodeBlockHandleWithProperties(w.blockBuf.tmp[:], bhp)
1741 w.topLevelIndexBlock.add(b.sep, encoded)
1742 }
1743
1744 // NB: RocksDB includes the block trailer length in the index size
1745 // property, though it doesn't include the trailer in the top level
1746 // index size property.
1747 w.props.IndexPartitions = uint64(len(w.indexPartitions))
1748 w.props.TopLevelIndexSize = uint64(w.topLevelIndexBlock.estimatedSize())
1749 w.props.IndexSize += w.props.TopLevelIndexSize + blockTrailerLen
1750
1751 return w.writeBlock(w.topLevelIndexBlock.finish(), w.compression, &w.blockBuf)
1752 }
1753
1754 func compressAndChecksum(b []byte, compression Compression, blockBuf *blockBuf) []byte {
1755 // Compress the buffer, discarding the result if the improvement isn't at
1756 // least 12.5%.
1757 blockType, compressed := compressBlock(compression, b, blockBuf.compressedBuf)
1758 if blockType != noCompressionBlockType && cap(compressed) > cap(blockBuf.compressedBuf) {
1759 blockBuf.compressedBuf = compressed[:cap(compressed)]
1760 }
1761 if len(compressed) < len(b)-len(b)/8 {
1762 b = compressed
1763 } else {
1764 blockType = noCompressionBlockType
1765 }
1766
1767 blockBuf.tmp[0] = byte(blockType)
1768
1769 // Calculate the checksum.
1770 checksum := blockBuf.checksummer.checksum(b, blockBuf.tmp[:1])
1771 binary.LittleEndian.PutUint32(blockBuf.tmp[1:5], checksum)
1772 return b
1773 }
1774
1775 func (w *Writer) writeCompressedBlock(block []byte, blockTrailerBuf []byte) (BlockHandle, error) {
1776 bh := BlockHandle{Offset: w.meta.Size, Length: uint64(len(block))}
1777
1778 if w.cacheID != 0 && w.fileNum.FileNum() != 0 {
1779 // Remove the block being written from the cache.
This provides defense in 1780 // depth against bugs which cause cache collisions. 1781 // 1782 // TODO(peter): Alternatively, we could add the uncompressed value to the 1783 // cache. 1784 w.cache.Delete(w.cacheID, w.fileNum, bh.Offset) 1785 } 1786 1787 // Write the bytes to the file. 1788 if err := w.writable.Write(block); err != nil { 1789 return BlockHandle{}, err 1790 } 1791 w.meta.Size += uint64(len(block)) 1792 if err := w.writable.Write(blockTrailerBuf[:blockTrailerLen]); err != nil { 1793 return BlockHandle{}, err 1794 } 1795 w.meta.Size += blockTrailerLen 1796 1797 return bh, nil 1798 } 1799 1800 // Write implements io.Writer. This is analogous to writeCompressedBlock for 1801 // blocks that already incorporate the trailer, and don't need the callee to 1802 // return a BlockHandle. 1803 func (w *Writer) Write(blockWithTrailer []byte) (n int, err error) { 1804 offset := w.meta.Size 1805 if w.cacheID != 0 && w.fileNum.FileNum() != 0 { 1806 // Remove the block being written from the cache. This provides defense in 1807 // depth against bugs which cause cache collisions. 1808 // 1809 // TODO(peter): Alternatively, we could add the uncompressed value to the 1810 // cache. 1811 w.cache.Delete(w.cacheID, w.fileNum, offset) 1812 } 1813 w.meta.Size += uint64(len(blockWithTrailer)) 1814 if err := w.writable.Write(blockWithTrailer); err != nil { 1815 return 0, err 1816 } 1817 return len(blockWithTrailer), nil 1818 } 1819 1820 func (w *Writer) writeBlock( 1821 b []byte, compression Compression, blockBuf *blockBuf, 1822 ) (BlockHandle, error) { 1823 b = compressAndChecksum(b, compression, blockBuf) 1824 return w.writeCompressedBlock(b, blockBuf.tmp[:]) 1825 } 1826 1827 // assertFormatCompatibility ensures that the features present on the table are 1828 // compatible with the table format version. 1829 func (w *Writer) assertFormatCompatibility() error { 1830 // PebbleDBv1: block properties. 1831 if len(w.blockPropCollectors) > 0 && w.tableFormat < TableFormatPebblev1 { 1832 return errors.Newf( 1833 "table format version %s is less than the minimum required version %s for block properties", 1834 w.tableFormat, TableFormatPebblev1, 1835 ) 1836 } 1837 1838 // PebbleDBv2: range keys. 1839 if w.props.NumRangeKeys() > 0 && w.tableFormat < TableFormatPebblev2 { 1840 return errors.Newf( 1841 "table format version %s is less than the minimum required version %s for range keys", 1842 w.tableFormat, TableFormatPebblev2, 1843 ) 1844 } 1845 1846 // PebbleDBv3: value blocks. 1847 if (w.props.NumValueBlocks > 0 || w.props.NumValuesInValueBlocks > 0 || 1848 w.props.ValueBlocksSize > 0) && w.tableFormat < TableFormatPebblev3 { 1849 return errors.Newf( 1850 "table format version %s is less than the minimum required version %s for value blocks", 1851 w.tableFormat, TableFormatPebblev3) 1852 } 1853 1854 // PebbleDBv4: DELSIZED tombstones. 1855 if w.props.NumSizedDeletions > 0 && w.tableFormat < TableFormatPebblev4 { 1856 return errors.Newf( 1857 "table format version %s is less than the minimum required version %s for sized deletion tombstones", 1858 w.tableFormat, TableFormatPebblev4) 1859 } 1860 return nil 1861 } 1862 1863 // Close finishes writing the table and closes the underlying file that the 1864 // table was written to. 1865 func (w *Writer) Close() (err error) { 1866 defer func() { 1867 if w.valueBlockWriter != nil { 1868 releaseValueBlockWriter(w.valueBlockWriter) 1869 // Defensive code in case Close gets called again. We don't want to put 1870 // the same object to a sync.Pool. 
1871 w.valueBlockWriter = nil 1872 } 1873 if w.writable != nil { 1874 w.writable.Abort() 1875 w.writable = nil 1876 } 1877 // Record any error in the writer (so we can exit early if Close is called 1878 // again). 1879 if err != nil { 1880 w.err = err 1881 } 1882 }() 1883 1884 // finish must be called before we check for an error, because finish will 1885 // block until every single task added to the writeQueue has been processed, 1886 // and an error could be encountered while any of those tasks are processed. 1887 if err := w.coordination.writeQueue.finish(); err != nil { 1888 return err 1889 } 1890 1891 if w.err != nil { 1892 return w.err 1893 } 1894 1895 // The w.meta.LargestPointKey is only used once the Writer is closed, so it is safe to set it 1896 // when the Writer is closed. 1897 // 1898 // The following invariants ensure that setting the largest key at this point of a Writer close 1899 // is correct: 1900 // 1. Keys must only be added to the Writer in an increasing order. 1901 // 2. The current w.dataBlockBuf is guaranteed to have the latest key added to the Writer. This 1902 // must be true, because a w.dataBlockBuf is only switched out when a dataBlock is flushed, 1903 // however, if a dataBlock is flushed, then we add a key to the new w.dataBlockBuf in the 1904 // addPoint function after the flush occurs. 1905 if w.dataBlockBuf.dataBlock.nEntries >= 1 { 1906 w.meta.SetLargestPointKey(w.dataBlockBuf.dataBlock.getCurKey().Clone()) 1907 } 1908 1909 // Finish the last data block, or force an empty data block if there 1910 // aren't any data blocks at all. 1911 if w.dataBlockBuf.dataBlock.nEntries > 0 || w.indexBlock.block.nEntries == 0 { 1912 bh, err := w.writeBlock(w.dataBlockBuf.dataBlock.finish(), w.compression, &w.dataBlockBuf.blockBuf) 1913 if err != nil { 1914 return err 1915 } 1916 bhp, err := w.maybeAddBlockPropertiesToBlockHandle(bh) 1917 if err != nil { 1918 return err 1919 } 1920 prevKey := w.dataBlockBuf.dataBlock.getCurKey() 1921 if err := w.addIndexEntrySync(prevKey, InternalKey{}, bhp, w.dataBlockBuf.tmp[:]); err != nil { 1922 return err 1923 } 1924 } 1925 w.props.DataSize = w.meta.Size 1926 1927 // Write the filter block. 1928 var metaindex rawBlockWriter 1929 metaindex.restartInterval = 1 1930 if w.filter != nil { 1931 b, err := w.filter.finish() 1932 if err != nil { 1933 return err 1934 } 1935 bh, err := w.writeBlock(b, NoCompression, &w.blockBuf) 1936 if err != nil { 1937 return err 1938 } 1939 n := encodeBlockHandle(w.blockBuf.tmp[:], bh) 1940 metaindex.add(InternalKey{UserKey: []byte(w.filter.metaName())}, w.blockBuf.tmp[:n]) 1941 w.props.FilterPolicyName = w.filter.policyName() 1942 w.props.FilterSize = bh.Length 1943 } 1944 1945 var indexBH BlockHandle 1946 if w.twoLevelIndex { 1947 w.props.IndexType = twoLevelIndex 1948 // Write the two level index block. 1949 indexBH, err = w.writeTwoLevelIndex() 1950 if err != nil { 1951 return err 1952 } 1953 } else { 1954 w.props.IndexType = binarySearchIndex 1955 // NB: RocksDB includes the block trailer length in the index size 1956 // property, though it doesn't include the trailer in the filter size 1957 // property. 1958 w.props.IndexSize = uint64(w.indexBlock.estimatedSize()) + blockTrailerLen 1959 w.props.NumDataBlocks = uint64(w.indexBlock.block.nEntries) 1960 1961 // Write the single level index block. 1962 indexBH, err = w.writeBlock(w.indexBlock.finish(), w.compression, &w.blockBuf) 1963 if err != nil { 1964 return err 1965 } 1966 } 1967 1968 // Write the range-del block. 
The block handle must be added to the meta index block
1969 // after the properties block has been written. This is because the entries in the
1970 // metaindex block must be sorted by key.
1971 var rangeDelBH BlockHandle
1972 if w.props.NumRangeDeletions > 0 {
1973 if !w.rangeDelV1Format {
1974 // Because the range tombstones are fragmented in the v2 format, the end
1975 // key of the last added range tombstone will be the largest range
1976 // tombstone key. Note that we need to make this into a range deletion
1977 // sentinel because sstable boundaries are inclusive while the end key of
1978 // a range deletion tombstone is exclusive. A Clone() is necessary as
1979 // rangeDelBlock.curValue is the same slice that will get passed
1980 // into w.writer, and some implementations of vfs.File mutate the
1981 // slice passed into Write(). Also, w.meta will often outlive the
1982 // blockWriter, and so cloning curValue allows the rangeDelBlock's
1983 // internal buffer to get gc'd.
1984 k := base.MakeRangeDeleteSentinelKey(w.rangeDelBlock.curValue).Clone()
1985 w.meta.SetLargestRangeDelKey(k)
1986 }
1987 rangeDelBH, err = w.writeBlock(w.rangeDelBlock.finish(), NoCompression, &w.blockBuf)
1988 if err != nil {
1989 return err
1990 }
1991 }
1992
1993 // Write the range-key block, flushing any remaining spans from the
1994 // fragmenter first.
1995 w.fragmenter.Finish()
1996
1997 var rangeKeyBH BlockHandle
1998 if w.props.NumRangeKeys() > 0 {
1999 key := w.rangeKeyBlock.getCurKey()
2000 kind := key.Kind()
2001 endKey, _, ok := rangekey.DecodeEndKey(kind, w.rangeKeyBlock.curValue)
2002 if !ok {
2003 return errors.Newf("invalid end key: %s", w.rangeKeyBlock.curValue)
2004 }
2005 k := base.MakeExclusiveSentinelKey(kind, endKey).Clone()
2006 w.meta.SetLargestRangeKey(k)
2007 // TODO(travers): The lack of compression on the range key block matches the
2008 // lack of compression on the range-del block. Revisit whether we want to
2009 // enable compression on this block.
2010 rangeKeyBH, err = w.writeBlock(w.rangeKeyBlock.finish(), NoCompression, &w.blockBuf)
2011 if err != nil {
2012 return err
2013 }
2014 }
2015
2016 if w.valueBlockWriter != nil {
2017 vbiHandle, vbStats, err := w.valueBlockWriter.finish(w, w.meta.Size)
2018 if err != nil {
2019 return err
2020 }
2021 w.props.NumValueBlocks = vbStats.numValueBlocks
2022 w.props.NumValuesInValueBlocks = vbStats.numValuesInValueBlocks
2023 w.props.ValueBlocksSize = vbStats.valueBlocksAndIndexSize
2024 if vbStats.numValueBlocks > 0 {
2025 n := encodeValueBlocksIndexHandle(w.blockBuf.tmp[:], vbiHandle)
2026 metaindex.add(InternalKey{UserKey: []byte(metaValueIndexName)}, w.blockBuf.tmp[:n])
2027 }
2028 }
2029
2030 // Add the range key block handle to the metaindex block. Note that we add the
2031 // block handle to the metaindex block before the other meta blocks as the
2032 // metaindex block entries must be sorted, and the range key block name sorts
2033 // before the other block names.
2034 if w.props.NumRangeKeys() > 0 {
2035 n := encodeBlockHandle(w.blockBuf.tmp[:], rangeKeyBH)
2036 metaindex.add(InternalKey{UserKey: []byte(metaRangeKeyName)}, w.blockBuf.tmp[:n])
2037 }
2038
2039 {
2040 userProps := make(map[string]string)
2041 for i := range w.propCollectors {
2042 if err := w.propCollectors[i].Finish(userProps); err != nil {
2043 return err
2044 }
2045 }
2046 for i := range w.blockPropCollectors {
2047 scratch := w.blockPropsEncoder.getScratchForProp()
2048 // Place the shortID in the first byte.
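// The table-level property value for each collector therefore begins with the
// collector's shortID byte, followed by whatever FinishTable appends to
// scratch; the value is stored under the collector's Name in UserProperties
// below.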
2049 scratch = append(scratch, byte(i)) 2050 buf, err := w.blockPropCollectors[i].FinishTable(scratch) 2051 if err != nil { 2052 return err 2053 } 2054 var prop string 2055 if len(buf) > 0 { 2056 prop = string(buf) 2057 } 2058 // NB: The property is populated in the map even if it is the 2059 // empty string, since the presence in the map is what indicates 2060 // that the block property collector was used when writing. 2061 userProps[w.blockPropCollectors[i].Name()] = prop 2062 } 2063 if len(userProps) > 0 { 2064 w.props.UserProperties = userProps 2065 } 2066 2067 // Write the properties block. 2068 var raw rawBlockWriter 2069 // The restart interval is set to infinity because the properties block 2070 // is always read sequentially and cached in a heap located object. This 2071 // reduces table size without a significant impact on performance. 2072 raw.restartInterval = propertiesBlockRestartInterval 2073 w.props.CompressionOptions = rocksDBCompressionOptions 2074 w.props.save(w.tableFormat, &raw) 2075 bh, err := w.writeBlock(raw.finish(), NoCompression, &w.blockBuf) 2076 if err != nil { 2077 return err 2078 } 2079 n := encodeBlockHandle(w.blockBuf.tmp[:], bh) 2080 metaindex.add(InternalKey{UserKey: []byte(metaPropertiesName)}, w.blockBuf.tmp[:n]) 2081 } 2082 2083 // Add the range deletion block handle to the metaindex block. 2084 if w.props.NumRangeDeletions > 0 { 2085 n := encodeBlockHandle(w.blockBuf.tmp[:], rangeDelBH) 2086 // The v2 range-del block encoding is backwards compatible with the v1 2087 // encoding. We add meta-index entries for both the old name and the new 2088 // name so that old code can continue to find the range-del block and new 2089 // code knows that the range tombstones in the block are fragmented and 2090 // sorted. 2091 metaindex.add(InternalKey{UserKey: []byte(metaRangeDelName)}, w.blockBuf.tmp[:n]) 2092 if !w.rangeDelV1Format { 2093 metaindex.add(InternalKey{UserKey: []byte(metaRangeDelV2Name)}, w.blockBuf.tmp[:n]) 2094 } 2095 } 2096 2097 // Write the metaindex block. It might be an empty block, if the filter 2098 // policy is nil. NoCompression is specified because a) RocksDB never 2099 // compresses the meta-index block and b) RocksDB has some code paths which 2100 // expect the meta-index block to not be compressed. 2101 metaindexBH, err := w.writeBlock(metaindex.blockWriter.finish(), NoCompression, &w.blockBuf) 2102 if err != nil { 2103 return err 2104 } 2105 2106 // Write the table footer. 2107 footer := footer{ 2108 format: w.tableFormat, 2109 checksum: w.blockBuf.checksummer.checksumType, 2110 metaindexBH: metaindexBH, 2111 indexBH: indexBH, 2112 } 2113 encoded := footer.encode(w.blockBuf.tmp[:]) 2114 if err := w.writable.Write(footer.encode(w.blockBuf.tmp[:])); err != nil { 2115 return err 2116 } 2117 w.meta.Size += uint64(len(encoded)) 2118 w.meta.Properties = w.props 2119 2120 // Check that the features present in the table are compatible with the format 2121 // configured for the table. 2122 if err = w.assertFormatCompatibility(); err != nil { 2123 return err 2124 } 2125 2126 if err := w.writable.Finish(); err != nil { 2127 w.writable = nil 2128 return err 2129 } 2130 w.writable = nil 2131 2132 w.dataBlockBuf.clear() 2133 dataBlockBufPool.Put(w.dataBlockBuf) 2134 w.dataBlockBuf = nil 2135 w.indexBlock.clear() 2136 indexBlockBufPool.Put(w.indexBlock) 2137 w.indexBlock = nil 2138 2139 // Make any future calls to Set or Close return an error. 
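// Note that the deferred error capture at the top of Close only overwrites
// w.err when a non-nil error is returned, so this sentinel survives a
// successful Close.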
2140 w.err = errWriterClosed
2141 return nil
2142 }
2143
2144 // EstimatedSize returns the estimated size of the sstable being written if a
2145 // call to Close() were made without adding additional keys.
2146 func (w *Writer) EstimatedSize() uint64 {
2147 return w.coordination.sizeEstimate.size() +
2148 uint64(w.dataBlockBuf.dataBlock.estimatedSize()) +
2149 w.indexBlock.estimatedSize()
2150 }
2151
2152 // Metadata returns the metadata for the finished sstable. Only valid to call
2153 // after the sstable has been finished.
2154 func (w *Writer) Metadata() (*WriterMetadata, error) {
2155 if w.writable != nil {
2156 return nil, errors.New("pebble: writer is not closed")
2157 }
2158 return &w.meta, nil
2159 }
2160
2161 // WriterOption provides an interface to do work on a Writer while it is being
2162 // opened.
2163 type WriterOption interface {
2164 // writerApply is called on the writer during opening in order to set
2165 // internal parameters.
2166 writerApply(*Writer)
2167 }
2168
2169 // PreviousPointKeyOpt is a WriterOption that provides access to the last
2170 // point key written to the writer while building an sstable.
2171 type PreviousPointKeyOpt struct {
2172 w *Writer
2173 }
2174
2175 // UnsafeKey returns the last point key written to the writer to which this
2176 // option was passed during creation. The returned key points directly into
2177 // a buffer belonging to the Writer. The returned key's lifetime ends the next time a
2178 // point key is added to the Writer.
2179 // Invariant: UnsafeKey is not, and must not be, called after the Writer is closed.
2180 func (o PreviousPointKeyOpt) UnsafeKey() base.InternalKey {
2181 if o.w == nil {
2182 return base.InvalidInternalKey
2183 }
2184
2185 if o.w.dataBlockBuf.dataBlock.nEntries >= 1 {
2186 // o.w.dataBlockBuf.dataBlock.curKey is guaranteed to point to the last point key
2187 // which was added to the Writer.
2188 return o.w.dataBlockBuf.dataBlock.getCurKey()
2189 }
2190 return base.InternalKey{}
2191 }
2192
2193 func (o *PreviousPointKeyOpt) writerApply(w *Writer) {
2194 o.w = w
2195 }
2196
2197 // NewWriter returns a new table writer for the file. Closing the writer will
2198 // close the file.
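//
// A rough usage sketch (illustrative only; f stands in for any
// objstorage.Writable, and the option values are arbitrary):
//
//	w := NewWriter(f, WriterOptions{
//		BlockSize:   32 << 10,
//		TableFormat: TableFormatPebblev4,
//	})
//	if err := w.Set([]byte("a"), []byte("1")); err != nil {
//		return err
//	}
//	if err := w.Close(); err != nil {
//		return err
//	}
//	meta, err := w.Metadata() // only valid after Close
//	_, _ = meta, err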
2199 func NewWriter(writable objstorage.Writable, o WriterOptions, extraOpts ...WriterOption) *Writer { 2200 o = o.ensureDefaults() 2201 w := &Writer{ 2202 writable: writable, 2203 meta: WriterMetadata{ 2204 SmallestSeqNum: math.MaxUint64, 2205 }, 2206 blockSize: o.BlockSize, 2207 blockSizeThreshold: (o.BlockSize*o.BlockSizeThreshold + 99) / 100, 2208 indexBlockSize: o.IndexBlockSize, 2209 indexBlockSizeThreshold: (o.IndexBlockSize*o.BlockSizeThreshold + 99) / 100, 2210 compare: o.Comparer.Compare, 2211 split: o.Comparer.Split, 2212 formatKey: o.Comparer.FormatKey, 2213 compression: o.Compression, 2214 separator: o.Comparer.Separator, 2215 successor: o.Comparer.Successor, 2216 tableFormat: o.TableFormat, 2217 isStrictObsolete: o.IsStrictObsolete, 2218 writingToLowestLevel: o.WritingToLowestLevel, 2219 cache: o.Cache, 2220 restartInterval: o.BlockRestartInterval, 2221 checksumType: o.Checksum, 2222 indexBlock: newIndexBlockBuf(o.Parallelism), 2223 rangeDelBlock: blockWriter{ 2224 restartInterval: 1, 2225 }, 2226 rangeKeyBlock: blockWriter{ 2227 restartInterval: 1, 2228 }, 2229 topLevelIndexBlock: blockWriter{ 2230 restartInterval: 1, 2231 }, 2232 fragmenter: keyspan.Fragmenter{ 2233 Cmp: o.Comparer.Compare, 2234 Format: o.Comparer.FormatKey, 2235 }, 2236 } 2237 if w.tableFormat >= TableFormatPebblev3 { 2238 w.shortAttributeExtractor = o.ShortAttributeExtractor 2239 w.requiredInPlaceValueBound = o.RequiredInPlaceValueBound 2240 w.valueBlockWriter = newValueBlockWriter( 2241 w.blockSize, w.blockSizeThreshold, w.compression, w.checksumType, func(compressedSize int) { 2242 w.coordination.sizeEstimate.dataBlockCompressed(compressedSize, 0) 2243 }) 2244 } 2245 2246 w.dataBlockBuf = newDataBlockBuf(w.restartInterval, w.checksumType) 2247 2248 w.blockBuf = blockBuf{ 2249 checksummer: checksummer{checksumType: o.Checksum}, 2250 } 2251 2252 w.coordination.init(o.Parallelism, w) 2253 2254 if writable == nil { 2255 w.err = errors.New("pebble: nil writable") 2256 return w 2257 } 2258 2259 // Note that WriterOptions are applied in two places; the ones with a 2260 // preApply() method are applied here. The rest are applied down below after 2261 // default properties are set. 
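// PreviousPointKeyOpt above is an example of an option without a preApply
// method: it is skipped by the loop below and applied in the second pass
// further down, after the default properties have been set.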
2262 type preApply interface{ preApply() } 2263 for _, opt := range extraOpts { 2264 if _, ok := opt.(preApply); ok { 2265 opt.writerApply(w) 2266 } 2267 } 2268 2269 w.props.PrefixExtractorName = "nullptr" 2270 if o.FilterPolicy != nil { 2271 switch o.FilterType { 2272 case TableFilter: 2273 w.filter = newTableFilterWriter(o.FilterPolicy) 2274 if w.split != nil { 2275 w.props.PrefixExtractorName = o.Comparer.Name 2276 w.props.PrefixFiltering = true 2277 } else { 2278 w.props.WholeKeyFiltering = true 2279 } 2280 default: 2281 panic(fmt.Sprintf("unknown filter type: %v", o.FilterType)) 2282 } 2283 } 2284 2285 w.props.ComparerName = o.Comparer.Name 2286 w.props.CompressionName = o.Compression.String() 2287 w.props.MergerName = o.MergerName 2288 w.props.PropertyCollectorNames = "[]" 2289 w.props.ExternalFormatVersion = rocksDBExternalFormatVersion 2290 2291 if len(o.TablePropertyCollectors) > 0 || len(o.BlockPropertyCollectors) > 0 || 2292 w.tableFormat >= TableFormatPebblev4 { 2293 var buf bytes.Buffer 2294 buf.WriteString("[") 2295 if len(o.TablePropertyCollectors) > 0 { 2296 w.propCollectors = make([]TablePropertyCollector, len(o.TablePropertyCollectors)) 2297 for i := range o.TablePropertyCollectors { 2298 w.propCollectors[i] = o.TablePropertyCollectors[i]() 2299 if i > 0 { 2300 buf.WriteString(",") 2301 } 2302 buf.WriteString(w.propCollectors[i].Name()) 2303 } 2304 } 2305 numBlockPropertyCollectors := len(o.BlockPropertyCollectors) 2306 if w.tableFormat >= TableFormatPebblev4 { 2307 numBlockPropertyCollectors++ 2308 } 2309 // shortID is a uint8, so we cannot exceed that number of block 2310 // property collectors. 2311 if numBlockPropertyCollectors > math.MaxUint8 { 2312 w.err = errors.New("pebble: too many block property collectors") 2313 return w 2314 } 2315 if numBlockPropertyCollectors > 0 { 2316 w.blockPropCollectors = make([]BlockPropertyCollector, numBlockPropertyCollectors) 2317 } 2318 if len(o.BlockPropertyCollectors) > 0 { 2319 // The shortID assigned to a collector is the same as its index in 2320 // this slice. 2321 for i := range o.BlockPropertyCollectors { 2322 w.blockPropCollectors[i] = o.BlockPropertyCollectors[i]() 2323 if i > 0 || len(o.TablePropertyCollectors) > 0 { 2324 buf.WriteString(",") 2325 } 2326 buf.WriteString(w.blockPropCollectors[i].Name()) 2327 } 2328 } 2329 if w.tableFormat >= TableFormatPebblev4 { 2330 if numBlockPropertyCollectors > 1 || len(o.TablePropertyCollectors) > 0 { 2331 buf.WriteString(",") 2332 } 2333 w.blockPropCollectors[numBlockPropertyCollectors-1] = &w.obsoleteCollector 2334 buf.WriteString(w.obsoleteCollector.Name()) 2335 } 2336 buf.WriteString("]") 2337 w.props.PropertyCollectorNames = buf.String() 2338 } 2339 2340 // Apply the remaining WriterOptions that do not have a preApply() method. 2341 for _, opt := range extraOpts { 2342 if _, ok := opt.(preApply); ok { 2343 continue 2344 } 2345 opt.writerApply(w) 2346 } 2347 2348 // Initialize the range key fragmenter and encoder. 2349 w.fragmenter.Emit = w.encodeRangeKeySpan 2350 w.rangeKeyEncoder.Emit = w.addRangeKey 2351 return w 2352 } 2353 2354 // internalGetProperties is a private, internal-use-only function that takes a 2355 // Writer and returns a pointer to its Properties, allowing direct mutation. 2356 // It's used by internal Pebble flushes and compactions to set internal 2357 // properties. It gets installed in private. 
2358 func internalGetProperties(w *Writer) *Properties { 2359 return &w.props 2360 } 2361 2362 func init() { 2363 private.SSTableWriterDisableKeyOrderChecks = func(i interface{}) { 2364 w := i.(*Writer) 2365 w.disableKeyOrderChecks = true 2366 } 2367 private.SSTableInternalProperties = internalGetProperties 2368 } 2369 2370 type obsoleteKeyBlockPropertyCollector struct { 2371 blockIsNonObsolete bool 2372 indexIsNonObsolete bool 2373 tableIsNonObsolete bool 2374 } 2375 2376 func encodeNonObsolete(isNonObsolete bool, buf []byte) []byte { 2377 if isNonObsolete { 2378 return buf 2379 } 2380 return append(buf, 't') 2381 } 2382 2383 func (o *obsoleteKeyBlockPropertyCollector) Name() string { 2384 return "obsolete-key" 2385 } 2386 2387 func (o *obsoleteKeyBlockPropertyCollector) Add(key InternalKey, value []byte) error { 2388 // Ignore. 2389 return nil 2390 } 2391 2392 func (o *obsoleteKeyBlockPropertyCollector) AddPoint(isObsolete bool) { 2393 o.blockIsNonObsolete = o.blockIsNonObsolete || !isObsolete 2394 } 2395 2396 func (o *obsoleteKeyBlockPropertyCollector) FinishDataBlock(buf []byte) ([]byte, error) { 2397 o.tableIsNonObsolete = o.tableIsNonObsolete || o.blockIsNonObsolete 2398 return encodeNonObsolete(o.blockIsNonObsolete, buf), nil 2399 } 2400 2401 func (o *obsoleteKeyBlockPropertyCollector) AddPrevDataBlockToIndexBlock() { 2402 o.indexIsNonObsolete = o.indexIsNonObsolete || o.blockIsNonObsolete 2403 o.blockIsNonObsolete = false 2404 } 2405 2406 func (o *obsoleteKeyBlockPropertyCollector) FinishIndexBlock(buf []byte) ([]byte, error) { 2407 indexIsNonObsolete := o.indexIsNonObsolete 2408 o.indexIsNonObsolete = false 2409 return encodeNonObsolete(indexIsNonObsolete, buf), nil 2410 } 2411 2412 func (o *obsoleteKeyBlockPropertyCollector) FinishTable(buf []byte) ([]byte, error) { 2413 return encodeNonObsolete(o.tableIsNonObsolete, buf), nil 2414 } 2415 2416 func (o *obsoleteKeyBlockPropertyCollector) UpdateKeySuffixes( 2417 oldProp []byte, oldSuffix, newSuffix []byte, 2418 ) error { 2419 _, err := propToIsObsolete(oldProp) 2420 if err != nil { 2421 return err 2422 } 2423 // Suffix rewriting currently loses the obsolete bit. 2424 o.blockIsNonObsolete = true 2425 return nil 2426 } 2427 2428 // NB: obsoleteKeyBlockPropertyFilter is stateless. This aspect of the filter 2429 // is used in table_cache.go for in-place modification of a filters slice. 2430 type obsoleteKeyBlockPropertyFilter struct{} 2431 2432 func (o obsoleteKeyBlockPropertyFilter) Name() string { 2433 return "obsolete-key" 2434 } 2435 2436 // Intersects returns true if the set represented by prop intersects with 2437 // the set in the filter. 2438 func (o obsoleteKeyBlockPropertyFilter) Intersects(prop []byte) (bool, error) { 2439 return propToIsObsolete(prop) 2440 } 2441 2442 func propToIsObsolete(prop []byte) (bool, error) { 2443 if len(prop) == 0 { 2444 return true, nil 2445 } 2446 if len(prop) > 1 || prop[0] != 't' { 2447 return false, errors.Errorf("unexpected property %x", prop) 2448 } 2449 return false, nil 2450 }
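// To summarize the encoding implemented above: an empty property means the
// block, index block, or table still contains at least one non-obsolete key,
// while the single byte 't' means every key in it is obsolete.
// obsoleteKeyBlockPropertyFilter.Intersects therefore reports true (the block
// must be read) only for the empty property:
//
//	prop  -> Intersects
//	""    -> true, nil
//	"t"   -> false, nil
//	other -> false, error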