github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/sstable/writer.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"math"

	"github.com/golang/snappy"
	"github.com/petermattis/pebble/internal/base"
	"github.com/petermattis/pebble/internal/crc"
	"github.com/petermattis/pebble/internal/rangedel"
)

// WriterMetadata holds info about a finished sstable.
type WriterMetadata struct {
	Size           uint64
	SmallestPoint  InternalKey
	SmallestRange  InternalKey
	LargestPoint   InternalKey
	LargestRange   InternalKey
	SmallestSeqNum uint64
	LargestSeqNum  uint64
}

func (m *WriterMetadata) updateSeqNum(seqNum uint64) {
	if m.SmallestSeqNum > seqNum {
		m.SmallestSeqNum = seqNum
	}
	if m.LargestSeqNum < seqNum {
		m.LargestSeqNum = seqNum
	}
}

func (m *WriterMetadata) updateLargestPoint(key InternalKey) {
	// Avoid the memory allocation in InternalKey.Clone() by reusing the buffer.
	m.LargestPoint.UserKey = append(m.LargestPoint.UserKey[:0], key.UserKey...)
	m.LargestPoint.Trailer = key.Trailer
}

// Smallest returns the smaller of SmallestPoint and SmallestRange.
func (m *WriterMetadata) Smallest(cmp Compare) InternalKey {
	if m.SmallestPoint.UserKey == nil {
		return m.SmallestRange
	}
	if m.SmallestRange.UserKey == nil {
		return m.SmallestPoint
	}
	if base.InternalCompare(cmp, m.SmallestPoint, m.SmallestRange) < 0 {
		return m.SmallestPoint
	}
	return m.SmallestRange
}

// Largest returns the larger of LargestPoint and LargestRange.
func (m *WriterMetadata) Largest(cmp Compare) InternalKey {
	if m.LargestPoint.UserKey == nil {
		return m.LargestRange
	}
	if m.LargestRange.UserKey == nil {
		return m.LargestPoint
	}
	if base.InternalCompare(cmp, m.LargestPoint, m.LargestRange) > 0 {
		return m.LargestPoint
	}
	return m.LargestRange
}

type flusher interface {
	Flush() error
}

type writeCloseSyncer interface {
	io.WriteCloser
	Sync() error
}

// Writer is a table writer.
type Writer struct {
	writer    io.Writer
	bufWriter *bufio.Writer
	syncer    writeCloseSyncer
	meta      WriterMetadata
	err       error
	// The following fields are copied from Options.
	blockSize               int
	blockSizeThreshold      int
	indexBlockSize          int
	indexBlockSizeThreshold int
	compare                 Compare
	split                   Split
	compression             Compression
	separator               Separator
	successor               Successor
	tableFormat             TableFormat
	// With two level indexes, the index/filter of an SST file is partitioned into
	// smaller blocks with an additional top-level index on them. When reading an
	// index/filter, only the top-level index is loaded into memory. The two level
	// index/filter then uses the top-level index to load on demand into the block
	// cache the partitions that are required to perform the index/filter query.
	//
	// Two level indexes are enabled automatically when there is more than one
	// index block.
	//
	// This is useful when there are very large index blocks, which generally occurs
	// with the usage of large keys. With large index blocks, the index blocks fight
	// the data blocks for block cache space and the index blocks are likely to be
	// re-read many times from the disk.
	// The top level index, which has a much smaller memory footprint, can be
	// used to prevent the entire index block from being loaded into the block
	// cache.
	twoLevelIndex bool
	// Internal flag to allow creation of range-del-v1 format blocks. Only used
	// for testing. Note that v2 format blocks are backwards compatible with v1
	// format blocks.
	rangeDelV1Format bool
	// A table is a series of blocks and a block's index entry contains a
	// separator key between one block and the next. Thus, a finished block
	// cannot be written until the first key in the next block is seen.
	// pendingBH is the blockHandle of a finished block that is waiting for
	// the next call to Set. If the writer is not in this state, pendingBH
	// is zero.
	pendingBH      BlockHandle
	block          blockWriter
	indexBlock     blockWriter
	rangeDelBlock  blockWriter
	props          Properties
	propCollectors []TablePropertyCollector
	// compressedBuf is the destination buffer for snappy compression. It is
	// re-used over the lifetime of the writer, avoiding the allocation of a
	// temporary buffer for each block.
	compressedBuf []byte
	// filter accumulates the filter block. If populated, the filter ingests
	// either the output of w.split (i.e. a prefix extractor) if w.split is not
	// nil, or the full keys otherwise.
	filter filterWriter
	// tmp is a scratch buffer, large enough to hold either footerLen bytes,
	// blockTrailerLen bytes, or (5 * binary.MaxVarintLen64) bytes.
	tmp [rocksDBFooterLen]byte

	topLevelIndexBlock blockWriter
	indexPartitions    []blockWriter
}

// Set sets the value for the given key. The sequence number is set to
// 0. Intended for use to externally construct an sstable before ingestion into
// a DB.
//
// TODO(peter): untested
func (w *Writer) Set(key, value []byte) error {
	if w.err != nil {
		return w.err
	}
	return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindSet), value)
}

// Delete deletes the value for the given key. The sequence number is set to
// 0. Intended for use to externally construct an sstable before ingestion into
// a DB.
//
// TODO(peter): untested
func (w *Writer) Delete(key []byte) error {
	if w.err != nil {
		return w.err
	}
	return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindDelete), nil)
}

// DeleteRange deletes all of the keys (and values) in the range [start,end)
// (inclusive on start, exclusive on end). The sequence number is set to
// 0. Intended for use to externally construct an sstable before ingestion into
// a DB.
//
// TODO(peter): untested
func (w *Writer) DeleteRange(start, end []byte) error {
	if w.err != nil {
		return w.err
	}
	return w.addTombstone(base.MakeInternalKey(start, 0, InternalKeyKindRangeDelete), end)
}

// Merge adds an action to the DB that merges the value at key with the new
// value. The details of the merge are dependent upon the configured merge
// operator. The sequence number is set to 0. Intended for use to externally
// construct an sstable before ingestion into a DB.
//
// TODO(peter): untested
func (w *Writer) Merge(key, value []byte) error {
	if w.err != nil {
		return w.err
	}
	return w.addPoint(base.MakeInternalKey(key, 0, InternalKeyKindMerge), value)
}

// Add adds a key/value pair to the table being written. For a given Writer,
// the keys passed to Add must be in increasing order. The exception to this
// rule is range deletion tombstones. Range deletion tombstones need to be
// added ordered by their start key, but they can be added out of order from
// point entries. Additionally, range deletion tombstones must be fragmented
// (i.e. by rangedel.Fragmenter).
func (w *Writer) Add(key InternalKey, value []byte) error {
	if w.err != nil {
		return w.err
	}

	if key.Kind() == InternalKeyKindRangeDelete {
		return w.addTombstone(key, value)
	}
	return w.addPoint(key, value)
}

func (w *Writer) addPoint(key InternalKey, value []byte) error {
	if base.InternalCompare(w.compare, w.meta.LargestPoint, key) >= 0 {
		w.err = fmt.Errorf("pebble: keys must be added in order: %s, %s", w.meta.LargestPoint, key)
		return w.err
	}

	if err := w.maybeFlush(key, value); err != nil {
		return err
	}

	for i := range w.propCollectors {
		if err := w.propCollectors[i].Add(key, value); err != nil {
			return err
		}
	}

	w.meta.updateSeqNum(key.SeqNum())
	w.meta.updateLargestPoint(key)

	w.maybeAddToFilter(key.UserKey)

	if w.props.NumEntries == 0 {
		w.meta.SmallestPoint = key.Clone()
	}
	w.props.NumEntries++
	switch key.Kind() {
	case InternalKeyKindDelete:
		w.props.NumDeletions++
	case InternalKeyKindMerge:
		w.props.NumMergeOperands++
	}
	w.props.RawKeySize += uint64(key.Size())
	w.props.RawValueSize += uint64(len(value))
	w.block.add(key, value)
	return nil
}

func (w *Writer) addTombstone(key InternalKey, value []byte) error {
	if !w.rangeDelV1Format && w.rangeDelBlock.nEntries > 0 {
		// Check that tombstones are being added in fragmented order. If the two
		// tombstones overlap, their start and end keys must be identical.
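		// For example (illustrative): the overlapping tombstones [a,e)#2 and
		// [c,g)#1 must be added as the fragments [a,c)#2, [c,e)#2, [c,e)#1,
		// [e,g)#1, sorted by start key and then by descending seqnum.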
		prevKey := base.DecodeInternalKey(w.rangeDelBlock.curKey)
		switch c := w.compare(prevKey.UserKey, key.UserKey); {
		case c > 0:
			w.err = fmt.Errorf("pebble: keys must be added in order: %s, %s", prevKey, key)
			return w.err
		case c == 0:
			prevValue := w.rangeDelBlock.curValue
			if w.compare(prevValue, value) != 0 {
				w.err = fmt.Errorf("pebble: overlapping tombstones must be fragmented: %s vs %s",
					rangedel.Tombstone{Start: prevKey, End: prevValue},
					rangedel.Tombstone{Start: key, End: value})
				return w.err
			}
			if prevKey.SeqNum() <= key.SeqNum() {
				w.err = fmt.Errorf("pebble: keys must be added in order: %s, %s", prevKey, key)
				return w.err
			}
		default:
			prevValue := w.rangeDelBlock.curValue
			if w.compare(prevValue, key.UserKey) > 0 {
				w.err = fmt.Errorf("pebble: overlapping tombstones must be fragmented: %s vs %s",
					rangedel.Tombstone{Start: prevKey, End: prevValue},
					rangedel.Tombstone{Start: key, End: value})
				return w.err
			}
		}
	}

	for i := range w.propCollectors {
		if err := w.propCollectors[i].Add(key, value); err != nil {
			return err
		}
	}

	w.meta.updateSeqNum(key.SeqNum())

	if w.props.NumRangeDeletions == 0 {
		w.meta.SmallestRange = key.Clone()
		w.meta.LargestRange = base.MakeRangeDeleteSentinelKey(value).Clone()
	} else if w.rangeDelV1Format {
		if base.InternalCompare(w.compare, w.meta.SmallestRange, key) > 0 {
			w.meta.SmallestRange = key.Clone()
		}
		end := base.MakeRangeDeleteSentinelKey(value)
		if base.InternalCompare(w.compare, w.meta.LargestRange, end) < 0 {
			w.meta.LargestRange = end.Clone()
		}
	}
	w.props.NumEntries++
	w.props.NumDeletions++
	w.props.NumRangeDeletions++
	w.props.RawKeySize += uint64(key.Size())
	w.props.RawValueSize += uint64(len(value))
	w.rangeDelBlock.add(key, value)
	return nil
}

func (w *Writer) maybeAddToFilter(key []byte) {
	if w.filter != nil {
		if w.split != nil {
			prefix := key[:w.split(key)]
			w.filter.addKey(prefix)
		} else {
			w.filter.addKey(key)
		}
	}
}

func (w *Writer) maybeFlush(key InternalKey, value []byte) error {
	if !shouldFlush(key, value, w.block, w.blockSize, w.blockSizeThreshold) {
		return nil
	}

	bh, err := w.finishBlock(&w.block)
	if err != nil {
		w.err = err
		return w.err
	}
	w.pendingBH = bh
	w.flushPendingBH(key)
	return nil
}

// flushPendingBH adds any pending block handle to the index entries.
func (w *Writer) flushPendingBH(key InternalKey) {
	if w.pendingBH.Length == 0 {
		// A valid blockHandle must be non-zero.
		// In particular, it must have a non-zero length.
		return
	}
	prevKey := base.DecodeInternalKey(w.block.curKey)
	var sep InternalKey
	if key.UserKey == nil && key.Trailer == 0 {
		sep = prevKey.Successor(w.compare, w.successor, nil)
	} else {
		sep = prevKey.Separator(w.compare, w.separator, nil, key)
	}
	n := encodeBlockHandle(w.tmp[:], w.pendingBH)

	if shouldFlush(sep, w.tmp[:n], w.indexBlock, w.indexBlockSize, w.indexBlockSizeThreshold) {
		// Enable two level indexes if there is more than one index block.
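		// Each finished index block becomes a partition; writeTwoLevelIndex
		// later writes the partitions and a top-level index over them.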
		w.twoLevelIndex = true
		w.finishIndexBlock()
	}

	w.indexBlock.add(sep, w.tmp[:n])

	w.pendingBH = BlockHandle{}
}

func shouldFlush(key InternalKey, value []byte, block blockWriter, blockSize, sizeThreshold int) bool {
	if size := block.estimatedSize(); size < blockSize {
		// The block is currently smaller than the target size.
		if size <= sizeThreshold {
			// The block is smaller than the threshold size at which we'll consider
			// flushing it.
			return false
		}
		newSize := size + key.Size() + len(value)
		if block.nEntries%block.restartInterval == 0 {
			newSize += 4
		}
		newSize += 4                              // varint for shared prefix length
		newSize += uvarintLen(uint32(key.Size())) // varint for unshared key bytes
		newSize += uvarintLen(uint32(len(value))) // varint for value size
		if newSize <= blockSize {
			// The block plus the new entry is smaller than the target size.
			return false
		}
	}

	return true
}

// finishBlock finishes the current block and returns its block handle, which is
// its offset and length in the table.
func (w *Writer) finishBlock(block *blockWriter) (BlockHandle, error) {
	bh, err := w.writeRawBlock(block.finish(), w.compression)

	// Calculate filters.
	if w.filter != nil {
		w.filter.finishBlock(w.meta.Size)
	}

	// Reset the per-block state.
	block.reset()
	return bh, err
}

// finishIndexBlock finishes the current index block and adds it to the top
// level index block. This is only used when two level indexes are enabled.
func (w *Writer) finishIndexBlock() {
	w.indexPartitions = append(w.indexPartitions, w.indexBlock)
	w.indexBlock = blockWriter{
		restartInterval: 1,
	}
}

func (w *Writer) writeTwoLevelIndex() (BlockHandle, error) {
	// Add the final unfinished index.
	w.finishIndexBlock()

	for _, b := range w.indexPartitions {
		sep := base.DecodeInternalKey(b.curKey)
		bh, _ := w.writeRawBlock(b.finish(), w.compression)

		if w.filter != nil {
			w.filter.finishBlock(w.meta.Size)
		}

		n := encodeBlockHandle(w.tmp[:], bh)
		w.topLevelIndexBlock.add(sep, w.tmp[:n])

		w.props.IndexSize += uint64(len(b.buf))
		w.props.NumDataBlocks += uint64(b.nEntries)
	}

	// NB: RocksDB includes the block trailer length in the index size
	// property, though it doesn't include the trailer in the top level
	// index size property.
	w.props.IndexPartitions = uint64(len(w.indexPartitions))
	w.props.TopLevelIndexSize = uint64(w.topLevelIndexBlock.estimatedSize())
	w.props.IndexSize += w.props.TopLevelIndexSize + blockTrailerLen

	return w.finishBlock(&w.topLevelIndexBlock)
}

func (w *Writer) writeRawBlock(b []byte, compression Compression) (BlockHandle, error) {
	blockType := noCompressionBlockType
	if compression == SnappyCompression {
		// Compress the buffer, discarding the result if the improvement isn't at
		// least 12.5%.
		compressed := snappy.Encode(w.compressedBuf, b)
		w.compressedBuf = compressed[:cap(compressed)]
		if len(compressed) < len(b)-len(b)/8 {
			blockType = snappyCompressionBlockType
			b = compressed
		}
	}
	w.tmp[0] = blockType

	// Calculate the checksum.
	checksum := crc.New(b).Update(w.tmp[:1]).Value()
	binary.LittleEndian.PutUint32(w.tmp[1:5], checksum)
	bh := BlockHandle{w.meta.Size, uint64(len(b))}

	// Write the bytes to the file.
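	// The block contents are followed by the trailer staged in w.tmp: one
	// block-type byte and a 4-byte little-endian checksum.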
	n, err := w.writer.Write(b)
	if err != nil {
		return BlockHandle{}, err
	}
	w.meta.Size += uint64(n)
	n, err = w.writer.Write(w.tmp[:blockTrailerLen])
	if err != nil {
		return BlockHandle{}, err
	}
	w.meta.Size += uint64(n)

	return bh, nil
}

// Close finishes writing the table and closes the underlying file that the
// table was written to.
func (w *Writer) Close() (err error) {
	defer func() {
		if w.syncer == nil {
			return
		}
		err1 := w.syncer.Close()
		if err == nil {
			err = err1
		}
		w.syncer = nil
	}()
	if w.err != nil {
		return w.err
	}

	// Finish the last data block, or force an empty data block if there
	// aren't any data blocks at all.
	w.flushPendingBH(InternalKey{})
	if w.block.nEntries > 0 || w.indexBlock.nEntries == 0 {
		bh, err := w.finishBlock(&w.block)
		if err != nil {
			w.err = err
			return w.err
		}
		w.pendingBH = bh
		w.flushPendingBH(InternalKey{})
	}
	w.props.DataSize = w.meta.Size

	// Write the filter block.
	var metaindex rawBlockWriter
	metaindex.restartInterval = 1
	if w.filter != nil {
		b, err := w.filter.finish()
		if err != nil {
			w.err = err
			return w.err
		}
		bh, err := w.writeRawBlock(b, NoCompression)
		if err != nil {
			w.err = err
			return w.err
		}
		n := encodeBlockHandle(w.tmp[:], bh)
		metaindex.add(InternalKey{UserKey: []byte(w.filter.metaName())}, w.tmp[:n])
		w.props.FilterPolicyName = w.filter.policyName()
		w.props.FilterSize = bh.Length
	}

	var indexBH BlockHandle
	if w.twoLevelIndex {
		w.props.IndexType = twoLevelIndex
		// Write the two level index block.
		indexBH, err = w.writeTwoLevelIndex()
		if err != nil {
			w.err = err
			return w.err
		}
	} else {
		w.props.IndexType = binarySearchIndex
		// NB: RocksDB includes the block trailer length in the index size
		// property, though it doesn't include the trailer in the filter size
		// property.
		w.props.IndexSize = uint64(w.indexBlock.estimatedSize()) + blockTrailerLen
		w.props.NumDataBlocks = uint64(w.indexBlock.nEntries)

		// Write the single level index block.
		indexBH, err = w.finishBlock(&w.indexBlock)
		if err != nil {
			w.err = err
			return w.err
		}
	}

	// Write the range-del block. The block handle must be added to the meta
	// index block after the properties block has been written. This is because
	// the entries in the metaindex block must be sorted by key.
	var rangeDelBH BlockHandle
	if w.props.NumRangeDeletions > 0 {
		if !w.rangeDelV1Format {
			// Because the range tombstones are fragmented, the end key of the last
			// added range tombstone will be the largest range tombstone key. Note
			// that we need to make this into a range deletion sentinel because
			// sstable boundaries are inclusive while the end key of a range deletion
			// tombstone is exclusive.
			w.meta.LargestRange = base.MakeRangeDeleteSentinelKey(w.rangeDelBlock.curValue)
		}
		b := w.rangeDelBlock.finish()
		rangeDelBH, err = w.writeRawBlock(b, NoCompression)
		if err != nil {
			w.err = err
			return w.err
		}
	}

	{
		userProps := make(map[string]string)
		for i := range w.propCollectors {
			if err := w.propCollectors[i].Finish(userProps); err != nil {
				return err
			}
		}
		if len(userProps) > 0 {
			w.props.UserProperties = userProps
		}

		// Write the properties block.
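		// Like the filter and range-del blocks above, the properties block is
		// written uncompressed.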
		var raw rawBlockWriter
		// The restart interval is set to infinity because the properties block
		// is always read sequentially and cached in a heap-located object. This
		// reduces table size without a significant impact on performance.
		raw.restartInterval = propertiesBlockRestartInterval
		w.props.CompressionOptions = rocksDBCompressionOptions
		w.props.save(&raw)
		bh, err := w.writeRawBlock(raw.finish(), NoCompression)
		if err != nil {
			w.err = err
			return w.err
		}
		n := encodeBlockHandle(w.tmp[:], bh)
		metaindex.add(InternalKey{UserKey: []byte(metaPropertiesName)}, w.tmp[:n])
	}

	// Add the range deletion block handle to the metaindex block.
	if w.props.NumRangeDeletions > 0 {
		n := encodeBlockHandle(w.tmp[:], rangeDelBH)
		// The v2 range-del block encoding is backwards compatible with the v1
		// encoding. We add meta-index entries for both the old name and the new
		// name so that old code can continue to find the range-del block and new
		// code knows that the range tombstones in the block are fragmented and
		// sorted.
		metaindex.add(InternalKey{UserKey: []byte(metaRangeDelName)}, w.tmp[:n])
		if !w.rangeDelV1Format {
			metaindex.add(InternalKey{UserKey: []byte(metaRangeDelV2Name)}, w.tmp[:n])
		}
	}

	// Write the metaindex block. It might be an empty block, if the filter
	// policy is nil.
	metaindexBH, err := w.finishBlock(&metaindex.blockWriter)
	if err != nil {
		w.err = err
		return w.err
	}

	// Write the table footer.
	footer := footer{
		format:      w.tableFormat,
		checksum:    checksumCRC32c,
		metaindexBH: metaindexBH,
		indexBH:     indexBH,
	}
	var n int
	if n, err = w.writer.Write(footer.encode(w.tmp[:])); err != nil {
		w.err = err
		return w.err
	}
	w.meta.Size += uint64(n)

	// Flush the buffer.
	if w.bufWriter != nil {
		if err := w.bufWriter.Flush(); err != nil {
			w.err = err
			return err
		}
	}

	if err := w.syncer.Sync(); err != nil {
		w.err = err
		return err
	}

	// Make any future calls to Set or Close return an error.
	w.err = errors.New("pebble: writer is closed")
	return nil
}

// EstimatedSize returns the estimated size of the sstable being written if a
// call to Close were made without adding additional keys.
func (w *Writer) EstimatedSize() uint64 {
	return w.meta.Size + uint64(w.block.estimatedSize()+w.indexBlock.estimatedSize())
}

// Metadata returns the metadata for the finished sstable. Only valid to call
// after the sstable has been finished.
func (w *Writer) Metadata() (*WriterMetadata, error) {
	if w.syncer != nil {
		return nil, errors.New("pebble: writer is not closed")
	}
	return &w.meta, nil
}

// NewWriter returns a new table writer for the file. Closing the writer will
// close the file.
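//
// A minimal usage sketch (illustrative; f is an open writeCloseSyncer and
// opts an assumed *Options; error handling elided):
//
//	w := NewWriter(f, opts, TableOptions{})
//	_ = w.Set([]byte("a"), []byte("1"))
//	_ = w.Set([]byte("b"), []byte("2"))
//	err := w.Close()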
func NewWriter(f writeCloseSyncer, o *Options, lo TableOptions) *Writer {
	o = o.EnsureDefaults()
	lo = *lo.EnsureDefaults()

	w := &Writer{
		syncer: f,
		meta: WriterMetadata{
			SmallestSeqNum: math.MaxUint64,
		},
		blockSize:               lo.BlockSize,
		blockSizeThreshold:      (lo.BlockSize*lo.BlockSizeThreshold + 99) / 100,
		indexBlockSize:          lo.IndexBlockSize,
		indexBlockSizeThreshold: (lo.IndexBlockSize*lo.BlockSizeThreshold + 99) / 100,
		compare:                 o.Comparer.Compare,
		split:                   o.Comparer.Split,
		compression:             lo.Compression,
		separator:               o.Comparer.Separator,
		successor:               o.Comparer.Successor,
		tableFormat:             o.TableFormat,
		block: blockWriter{
			restartInterval: lo.BlockRestartInterval,
		},
		indexBlock: blockWriter{
			restartInterval: 1,
		},
		rangeDelBlock: blockWriter{
			restartInterval: 1,
		},
		topLevelIndexBlock: blockWriter{
			restartInterval: 1,
		},
	}
	if f == nil {
		w.err = errors.New("pebble: nil file")
		return w
	}

	w.props.PrefixExtractorName = "nullptr"
	if lo.FilterPolicy != nil {
		switch lo.FilterType {
		case TableFilter:
			w.filter = newTableFilterWriter(lo.FilterPolicy)
			if w.split != nil {
				w.props.PrefixExtractorName = o.Comparer.Name
				w.props.PrefixFiltering = true
			} else {
				w.props.WholeKeyFiltering = true
			}
		default:
			panic(fmt.Sprintf("unknown filter type: %v", lo.FilterType))
		}
	}

	w.props.ColumnFamilyID = math.MaxInt32
	w.props.ComparerName = o.Comparer.Name
	w.props.CompressionName = lo.Compression.String()
	w.props.MergerName = o.Merger.Name
	w.props.PropertyCollectorNames = "[]"
	w.props.Version = 2 // TODO(peter): what is this?

	if len(o.TablePropertyCollectors) > 0 {
		w.propCollectors = make([]TablePropertyCollector, len(o.TablePropertyCollectors))
		var buf bytes.Buffer
		buf.WriteString("[")
		for i := range o.TablePropertyCollectors {
			w.propCollectors[i] = o.TablePropertyCollectors[i]()
			if i > 0 {
				buf.WriteString(",")
			}
			buf.WriteString(w.propCollectors[i].Name())
		}
		buf.WriteString("]")
		w.props.PropertyCollectorNames = buf.String()
	}

	// If f does not have a Flush method, do our own buffering.
	if _, ok := f.(flusher); ok {
		w.writer = f
	} else {
		w.bufWriter = bufio.NewWriter(f)
		w.writer = w.bufWriter
	}
	return w
}