github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/db.go (about) 1 // Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 // Package pebble provides an ordered key/value store. 6 package pebble // import "github.com/cockroachdb/pebble" 7 8 import ( 9 "context" 10 "fmt" 11 "io" 12 "os" 13 "strconv" 14 "sync" 15 "sync/atomic" 16 "time" 17 18 "github.com/cockroachdb/errors" 19 "github.com/cockroachdb/pebble/internal/arenaskl" 20 "github.com/cockroachdb/pebble/internal/base" 21 "github.com/cockroachdb/pebble/internal/invalidating" 22 "github.com/cockroachdb/pebble/internal/invariants" 23 "github.com/cockroachdb/pebble/internal/keyspan" 24 "github.com/cockroachdb/pebble/internal/manifest" 25 "github.com/cockroachdb/pebble/internal/manual" 26 "github.com/cockroachdb/pebble/objstorage" 27 "github.com/cockroachdb/pebble/objstorage/remote" 28 "github.com/cockroachdb/pebble/rangekey" 29 "github.com/cockroachdb/pebble/record" 30 "github.com/cockroachdb/pebble/sstable" 31 "github.com/cockroachdb/pebble/vfs" 32 "github.com/cockroachdb/pebble/vfs/atomicfs" 33 "github.com/cockroachdb/tokenbucket" 34 "github.com/prometheus/client_golang/prometheus" 35 ) 36 37 const ( 38 // minTableCacheSize is the minimum size of the table cache, for a single db. 39 minTableCacheSize = 64 40 41 // numNonTableCacheFiles is an approximation for the number of files 42 // that we don't use for table caches, for a given db. 43 numNonTableCacheFiles = 10 44 ) 45 46 var ( 47 // ErrNotFound is returned when a get operation does not find the requested 48 // key. 49 ErrNotFound = base.ErrNotFound 50 // ErrClosed is panicked when an operation is performed on a closed snapshot or 51 // DB. Use errors.Is(err, ErrClosed) to check for this error. 52 ErrClosed = errors.New("pebble: closed") 53 // ErrReadOnly is returned when a write operation is performed on a read-only 54 // database. 55 ErrReadOnly = errors.New("pebble: read-only") 56 // errNoSplit indicates that the user is trying to perform a range key 57 // operation but the configured Comparer does not provide a Split 58 // implementation. 59 errNoSplit = errors.New("pebble: Comparer.Split required for range key operations") 60 ) 61 62 // Reader is a readable key/value store. 63 // 64 // It is safe to call Get and NewIter from concurrent goroutines. 65 type Reader interface { 66 // Get gets the value for the given key. It returns ErrNotFound if the DB 67 // does not contain the key. 68 // 69 // The caller should not modify the contents of the returned slice, but it is 70 // safe to modify the contents of the argument after Get returns. The 71 // returned slice will remain valid until the returned Closer is closed. On 72 // success, the caller MUST call closer.Close() or a memory leak will occur. 73 Get(key []byte) (value []byte, closer io.Closer, err error) 74 75 // NewIter returns an iterator that is unpositioned (Iterator.Valid() will 76 // return false). The iterator can be positioned via a call to SeekGE, 77 // SeekLT, First or Last. 78 NewIter(o *IterOptions) (*Iterator, error) 79 80 // NewIterWithContext is like NewIter, and additionally accepts a context 81 // for tracing. 82 NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) 83 84 // Close closes the Reader. It may or may not close any underlying io.Reader 85 // or io.Writer, depending on how the DB was created. 
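 //
 // For illustration only (a hedged sketch, not a prescribed pattern; the
 // recover wrapper is an assumption of this example): since operations on a
 // closed DB or snapshot panic with ErrClosed (see the ErrClosed doc above),
 // a caller that must tolerate use-after-close can detect it with errors.Is:
 //
 //	defer func() {
 //		if r := recover(); r != nil {
 //			if err, ok := r.(error); ok && errors.Is(err, ErrClosed) {
 //				return // the DB or snapshot was already closed
 //			}
 //			panic(r) // unrelated panic; re-raise
 //		}
 //	}()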
86 // 87 // It is not safe to close a DB until all outstanding iterators are closed. 88 // It is valid to call Close multiple times. Other methods should not be 89 // called after the DB has been closed. 90 Close() error 91 } 92 93 // Writer is a writable key/value store. 94 // 95 // Goroutine safety is dependent on the specific implementation. 96 type Writer interface { 97 // Apply the operations contained in the batch to the DB. 98 // 99 // It is safe to modify the contents of the arguments after Apply returns. 100 Apply(batch *Batch, o *WriteOptions) error 101 102 // Delete deletes the value for the given key. Deletes are blind and will 103 // succeed even if the given key does not exist. 104 // 105 // It is safe to modify the contents of the arguments after Delete returns. 106 Delete(key []byte, o *WriteOptions) error 107 108 // DeleteSized behaves identically to Delete, but takes an additional 109 // argument indicating the size of the value being deleted. DeleteSized 110 // should be preferred when the caller has the expectation that there exists 111 // a single internal KV pair for the key (eg, the key has not been 112 // overwritten recently), and the caller knows the size of its value. 113 // 114 // DeleteSized will record the value size within the tombstone and use it to 115 // inform compaction-picking heuristics which strive to reduce space 116 // amplification in the LSM. This "calling your shot" mechanic allows the 117 // storage engine to more accurately estimate and reduce space 118 // amplification. 119 // 120 // It is safe to modify the contents of the arguments after DeleteSized 121 // returns. 122 DeleteSized(key []byte, valueSize uint32, _ *WriteOptions) error 123 124 // SingleDelete is similar to Delete in that it deletes the value for the given key. Like Delete, 125 // it is a blind operation that will succeed even if the given key does not exist. 126 // 127 // WARNING: Undefined (non-deterministic) behavior will result if a key is overwritten and 128 // then deleted using SingleDelete. The record may appear deleted immediately, but be 129 // resurrected at a later time after compactions have been performed. Or the record may 130 // be deleted permanently. A Delete operation lays down a "tombstone" which shadows all 131 // previous versions of a key. The SingleDelete operation is akin to "anti-matter" and will 132 // only delete the most recently written version for a key. These different semantics allow 133 // the DB to avoid propagating a SingleDelete operation during a compaction as soon as the 134 // corresponding Set operation is encountered. These semantics require extreme care to handle 135 // properly. Only use if you have a workload where the performance gain is critical and you 136 // can guarantee that a record is written once and then deleted once. 137 // 138 // SingleDelete is internally transformed into a Delete if the most recent record for a key is either 139 // a Merge or Delete record. 140 // 141 // It is safe to modify the contents of the arguments after SingleDelete returns. 142 SingleDelete(key []byte, o *WriteOptions) error 143 144 // DeleteRange deletes all of the point keys (and values) in the range 145 // [start,end) (inclusive on start, exclusive on end). DeleteRange does NOT 146 // delete overlapping range keys (eg, keys set via RangeKeySet). 147 // 148 // It is safe to modify the contents of the arguments after DeleteRange 149 // returns.
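 //
 // For illustration only (a hedged sketch; the "user/" prefix, its exclusive
 // upper bound, and the Writer value w are assumptions of this example):
 // deleting every point key carrying a given prefix might look like
 //
 //	start := []byte("user/")
 //	end := []byte("user0") // '0' is the byte after '/', so this bounds all "user/..." keys
 //	if err := w.DeleteRange(start, end, pebble.Sync); err != nil {
 //		return err
 //	}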
150 DeleteRange(start, end []byte, o *WriteOptions) error 151 152 // LogData adds the specified to the batch. The data will be written to the 153 // WAL, but not added to memtables or sstables. Log data is never indexed, 154 // which makes it useful for testing WAL performance. 155 // 156 // It is safe to modify the contents of the argument after LogData returns. 157 LogData(data []byte, opts *WriteOptions) error 158 159 // Merge merges the value for the given key. The details of the merge are 160 // dependent upon the configured merge operation. 161 // 162 // It is safe to modify the contents of the arguments after Merge returns. 163 Merge(key, value []byte, o *WriteOptions) error 164 165 // Set sets the value for the given key. It overwrites any previous value 166 // for that key; a DB is not a multi-map. 167 // 168 // It is safe to modify the contents of the arguments after Set returns. 169 Set(key, value []byte, o *WriteOptions) error 170 171 // RangeKeySet sets a range key mapping the key range [start, end) at the MVCC 172 // timestamp suffix to value. The suffix is optional. If any portion of the key 173 // range [start, end) is already set by a range key with the same suffix value, 174 // RangeKeySet overrides it. 175 // 176 // It is safe to modify the contents of the arguments after RangeKeySet returns. 177 RangeKeySet(start, end, suffix, value []byte, opts *WriteOptions) error 178 179 // RangeKeyUnset removes a range key mapping the key range [start, end) at the 180 // MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed 181 // range key. RangeKeyUnset only removes portions of range keys that fall within 182 // the [start, end) key span, and only range keys with suffixes that exactly 183 // match the unset suffix. 184 // 185 // It is safe to modify the contents of the arguments after RangeKeyUnset 186 // returns. 187 RangeKeyUnset(start, end, suffix []byte, opts *WriteOptions) error 188 189 // RangeKeyDelete deletes all of the range keys in the range [start,end) 190 // (inclusive on start, exclusive on end). It does not delete point keys (for 191 // that use DeleteRange). RangeKeyDelete removes all range keys within the 192 // bounds, including those with or without suffixes. 193 // 194 // It is safe to modify the contents of the arguments after RangeKeyDelete 195 // returns. 196 RangeKeyDelete(start, end []byte, opts *WriteOptions) error 197 } 198 199 // CPUWorkHandle represents a handle used by the CPUWorkPermissionGranter API. 200 type CPUWorkHandle interface { 201 // Permitted indicates whether Pebble can use additional CPU resources. 202 Permitted() bool 203 } 204 205 // CPUWorkPermissionGranter is used to request permission to opportunistically 206 // use additional CPUs to speed up internal background work. 207 type CPUWorkPermissionGranter interface { 208 // GetPermission returns a handle regardless of whether permission is granted 209 // or not. In the latter case, the handle is only useful for recording 210 // the CPU time actually spent on this calling goroutine. 211 GetPermission(time.Duration) CPUWorkHandle 212 // CPUWorkDone must be called regardless of whether CPUWorkHandle.Permitted 213 // returns true or false. 214 CPUWorkDone(CPUWorkHandle) 215 } 216 217 // Use a default implementation for the CPU work granter to avoid excessive nil 218 // checks in the code. 
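 //
 // For illustration only (a hedged sketch; the granter value and the requested
 // duration are assumptions of this example): the contract above pairs every
 // GetPermission call with a CPUWorkDone call, whether or not permission was
 // granted:
 //
 //	handle := granter.GetPermission(10 * time.Millisecond)
 //	if handle.Permitted() {
 //		// Opportunistically run the extra work on another CPU.
 //	}
 //	granter.CPUWorkDone(handle) // required in both cases
 //
 // The no-op defaultCPUWorkGranter below satisfies this contract trivially.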
219 type defaultCPUWorkHandle struct{} 220 221 func (d defaultCPUWorkHandle) Permitted() bool { 222 return false 223 } 224 225 type defaultCPUWorkGranter struct{} 226 227 func (d defaultCPUWorkGranter) GetPermission(_ time.Duration) CPUWorkHandle { 228 return defaultCPUWorkHandle{} 229 } 230 231 func (d defaultCPUWorkGranter) CPUWorkDone(_ CPUWorkHandle) {} 232 233 // DB provides a concurrent, persistent ordered key/value store. 234 // 235 // A DB's basic operations (Get, Set, Delete) should be self-explanatory. Get 236 // and Delete will return ErrNotFound if the requested key is not in the store. 237 // Callers are free to ignore this error. 238 // 239 // A DB also allows for iterating over the key/value pairs in key order. If d 240 // is a DB, the code below prints all key/value pairs whose keys are 'greater 241 // than or equal to' k: 242 // 243 // iter := d.NewIter(readOptions) 244 // for iter.SeekGE(k); iter.Valid(); iter.Next() { 245 // fmt.Printf("key=%q value=%q\n", iter.Key(), iter.Value()) 246 // } 247 // return iter.Close() 248 // 249 // The Options struct holds the optional parameters for the DB, including a 250 // Comparer to define a 'less than' relationship over keys. It is always valid 251 // to pass a nil *Options, which means to use the default parameter values. Any 252 // zero field of a non-nil *Options also means to use the default value for 253 // that parameter. Thus, the code below uses a custom Comparer, but the default 254 // values for every other parameter: 255 // 256 // db := pebble.Open(&Options{ 257 // Comparer: myComparer, 258 // }) 259 type DB struct { 260 // The count and size of referenced memtables. This includes memtables 261 // present in DB.mu.mem.queue, as well as memtables that have been flushed 262 // but are still referenced by an inuse readState, as well as up to one 263 // memTable waiting to be reused and stored in d.memTableRecycle. 264 memTableCount atomic.Int64 265 memTableReserved atomic.Int64 // number of bytes reserved in the cache for memtables 266 // memTableRecycle holds a pointer to an obsolete memtable. The next 267 // memtable allocation will reuse this memtable if it has not already been 268 // recycled. 269 memTableRecycle atomic.Pointer[memTable] 270 271 // The size of the current log file (i.e. db.mu.log.queue[len(queue)-1]. 272 logSize atomic.Uint64 273 274 // The number of bytes available on disk. 275 diskAvailBytes atomic.Uint64 276 277 cacheID uint64 278 dirname string 279 walDirname string 280 opts *Options 281 cmp Compare 282 equal Equal 283 merge Merge 284 split Split 285 abbreviatedKey AbbreviatedKey 286 // The threshold for determining when a batch is "large" and will skip being 287 // inserted into a memtable. 288 largeBatchThreshold uint64 289 // The current OPTIONS file number. 290 optionsFileNum base.DiskFileNum 291 // The on-disk size of the current OPTIONS file. 292 optionsFileSize uint64 293 294 // objProvider is used to access and manage SSTs. 295 objProvider objstorage.Provider 296 297 fileLock *Lock 298 dataDir vfs.File 299 walDir vfs.File 300 301 tableCache *tableCacheContainer 302 newIters tableNewIters 303 tableNewRangeKeyIter keyspan.TableNewSpanIter 304 305 commit *commitPipeline 306 307 // readState provides access to the state needed for reading without needing 308 // to acquire DB.mu. 309 readState struct { 310 sync.RWMutex 311 val *readState 312 } 313 // logRecycler holds a set of log file numbers that are available for 314 // reuse. 
Writing to a recycled log file is faster than to a new log file on 315 // some common filesystems (xfs, and ext3/4) due to avoiding metadata 316 // updates. 317 logRecycler logRecycler 318 319 closed *atomic.Value 320 closedCh chan struct{} 321 322 cleanupManager *cleanupManager 323 324 // During an iterator close, we may asynchronously schedule read compactions. 325 // We want to wait for those goroutines to finish, before closing the DB. 326 // compactionShedulers.Wait() should not be called while the DB.mu is held. 327 compactionSchedulers sync.WaitGroup 328 329 // The main mutex protecting internal DB state. This mutex encompasses many 330 // fields because those fields need to be accessed and updated atomically. In 331 // particular, the current version, log.*, mem.*, and snapshot list need to 332 // be accessed and updated atomically during compaction. 333 // 334 // Care is taken to avoid holding DB.mu during IO operations. Accomplishing 335 // this sometimes requires releasing DB.mu in a method that was called with 336 // it held. See versionSet.logAndApply() and DB.makeRoomForWrite() for 337 // examples. This is a common pattern, so be careful about expectations that 338 // DB.mu will be held continuously across a set of calls. 339 mu struct { 340 sync.Mutex 341 342 formatVers struct { 343 // vers is the database's current format major version. 344 // Backwards-incompatible features are gated behind new 345 // format major versions and not enabled until a database's 346 // version is ratcheted upwards. 347 // 348 // Although this is under the `mu` prefix, readers may read vers 349 // atomically without holding d.mu. Writers must only write to this 350 // value through finalizeFormatVersUpgrade which requires d.mu is 351 // held. 352 vers atomic.Uint64 353 // marker is the atomic marker for the format major version. 354 // When a database's version is ratcheted upwards, the 355 // marker is moved in order to atomically record the new 356 // version. 357 marker *atomicfs.Marker 358 // ratcheting when set to true indicates that the database is 359 // currently in the process of ratcheting the format major version 360 // to vers + 1. As a part of ratcheting the format major version, 361 // migrations may drop and re-acquire the mutex. 362 ratcheting bool 363 } 364 365 // The ID of the next job. Job IDs are passed to event listener 366 // notifications and act as a mechanism for tying together the events and 367 // log messages for a single job such as a flush, compaction, or file 368 // ingestion. Job IDs are not serialized to disk or used for correctness. 369 nextJobID int 370 371 // The collection of immutable versions and state about the log and visible 372 // sequence numbers. Use the pointer here to ensure the atomic fields in 373 // version set are aligned properly. 374 versions *versionSet 375 376 log struct { 377 // The queue of logs, containing both flushed and unflushed logs. The 378 // flushed logs will be a prefix, the unflushed logs a suffix. The 379 // delimeter between flushed and unflushed logs is 380 // versionSet.minUnflushedLogNum. 381 queue []fileInfo 382 // The number of input bytes to the log. This is the raw size of the 383 // batches written to the WAL, without the overhead of the record 384 // envelopes. 385 bytesIn uint64 386 // The LogWriter is protected by commitPipeline.mu. This allows log 387 // writes to be performed without holding DB.mu, but requires both 388 // commitPipeline.mu and DB.mu to be held when rotating the WAL/memtable 389 // (i.e. 
makeRoomForWrite). 390 *record.LogWriter 391 // Can be nil. 392 metrics struct { 393 fsyncLatency prometheus.Histogram 394 record.LogWriterMetrics 395 } 396 registerLogWriterForTesting func(w *record.LogWriter) 397 } 398 399 mem struct { 400 // The current mutable memTable. 401 mutable *memTable 402 // Queue of flushables (the mutable memtable is at end). Elements are 403 // added to the end of the slice and removed from the beginning. Once an 404 // index is set it is never modified making a fixed slice immutable and 405 // safe for concurrent reads. 406 queue flushableList 407 // nextSize is the size of the next memtable. The memtable size starts at 408 // min(256KB,Options.MemTableSize) and doubles each time a new memtable 409 // is allocated up to Options.MemTableSize. This reduces the memory 410 // footprint of memtables when lots of DB instances are used concurrently 411 // in test environments. 412 nextSize uint64 413 } 414 415 compact struct { 416 // Condition variable used to signal when a flush or compaction has 417 // completed. Used by the write-stall mechanism to wait for the stall 418 // condition to clear. See DB.makeRoomForWrite(). 419 cond sync.Cond 420 // True when a flush is in progress. 421 flushing bool 422 // The number of ongoing compactions. 423 compactingCount int 424 // The list of deletion hints, suggesting ranges for delete-only 425 // compactions. 426 deletionHints []deleteCompactionHint 427 // The list of manual compactions. The next manual compaction to perform 428 // is at the start of the list. New entries are added to the end. 429 manual []*manualCompaction 430 // downloads is the list of suggested download tasks. The next download to 431 // perform is at the start of the list. New entries are added to the end. 432 downloads []*downloadSpan 433 // inProgress is the set of in-progress flushes and compactions. 434 // It's used in the calculation of some metrics and to initialize L0 435 // sublevels' state. Some of the compactions contained within this 436 // map may have already committed an edit to the version but are 437 // lingering performing cleanup, like deleting obsolete files. 438 inProgress map[*compaction]struct{} 439 440 // rescheduleReadCompaction indicates to an iterator that a read compaction 441 // should be scheduled. 442 rescheduleReadCompaction bool 443 444 // readCompactions is a readCompactionQueue which keeps track of the 445 // compactions which we might have to perform. 446 readCompactions readCompactionQueue 447 448 // The cumulative duration of all completed compactions since Open. 449 // Does not include flushes. 450 duration time.Duration 451 // Flush throughput metric. 452 flushWriteThroughput ThroughputMetric 453 // The idle start time for the flush "loop", i.e., when the flushing 454 // bool above transitions to false. 455 noOngoingFlushStartTime time.Time 456 } 457 458 // Non-zero when file cleaning is disabled. The disabled count acts as a 459 // reference count to prohibit file cleaning. See 460 // DB.{disable,Enable}FileDeletions(). 461 disableFileDeletions int 462 463 snapshots struct { 464 // The list of active snapshots. 465 snapshotList 466 467 // The cumulative count and size of snapshot-pinned keys written to 468 // sstables. 469 cumulativePinnedCount uint64 470 cumulativePinnedSize uint64 471 } 472 473 tableStats struct { 474 // Condition variable used to signal the completion of a 475 // job to collect table stats. 476 cond sync.Cond 477 // True when a stat collection operation is in progress. 
478 loading bool 479 // True if stat collection has loaded statistics for all tables 480 // other than those listed explicitly in pending. This flag starts 481 // as false when a database is opened and flips to true once stat 482 // collection has caught up. 483 loadedInitial bool 484 // A slice of files for which stats have not been computed. 485 // Compactions, ingests, flushes append files to be processed. An 486 // active stat collection goroutine clears the list and processes 487 // them. 488 pending []manifest.NewFileEntry 489 } 490 491 tableValidation struct { 492 // cond is a condition variable used to signal the completion of a 493 // job to validate one or more sstables. 494 cond sync.Cond 495 // pending is a slice of metadata for sstables waiting to be 496 // validated. Only physical sstables should be added to the pending 497 // queue. 498 pending []newFileEntry 499 // validating is set to true when validation is running. 500 validating bool 501 } 502 } 503 504 // Normally equal to time.Now() but may be overridden in tests. 505 timeNow func() time.Time 506 // the time at database Open; may be used to compute metrics like effective 507 // compaction concurrency 508 openedAt time.Time 509 } 510 511 var _ Reader = (*DB)(nil) 512 var _ Writer = (*DB)(nil) 513 514 // TestOnlyWaitForCleaning MUST only be used in tests. 515 func (d *DB) TestOnlyWaitForCleaning() { 516 d.cleanupManager.Wait() 517 } 518 519 // Get gets the value for the given key. It returns ErrNotFound if the DB does 520 // not contain the key. 521 // 522 // The caller should not modify the contents of the returned slice, but it is 523 // safe to modify the contents of the argument after Get returns. The returned 524 // slice will remain valid until the returned Closer is closed. On success, the 525 // caller MUST call closer.Close() or a memory leak will occur. 526 func (d *DB) Get(key []byte) ([]byte, io.Closer, error) { 527 return d.getInternal(key, nil /* batch */, nil /* snapshot */) 528 } 529 530 type getIterAlloc struct { 531 dbi Iterator 532 keyBuf []byte 533 get getIter 534 } 535 536 var getIterAllocPool = sync.Pool{ 537 New: func() interface{} { 538 return &getIterAlloc{} 539 }, 540 } 541 542 func (d *DB) getInternal(key []byte, b *Batch, s *Snapshot) ([]byte, io.Closer, error) { 543 if err := d.closed.Load(); err != nil { 544 panic(err) 545 } 546 547 // Grab and reference the current readState. This prevents the underlying 548 // files in the associated version from being deleted if there is a current 549 // compaction. The readState is unref'd by Iterator.Close(). 550 readState := d.loadReadState() 551 552 // Determine the seqnum to read at after grabbing the read state (current and 553 // memtables) above. 554 var seqNum uint64 555 if s != nil { 556 seqNum = s.seqNum 557 } else { 558 seqNum = d.mu.versions.visibleSeqNum.Load() 559 } 560 561 buf := getIterAllocPool.Get().(*getIterAlloc) 562 563 get := &buf.get 564 *get = getIter{ 565 logger: d.opts.Logger, 566 comparer: d.opts.Comparer, 567 newIters: d.newIters, 568 snapshot: seqNum, 569 key: key, 570 batch: b, 571 mem: readState.memtables, 572 l0: readState.current.L0SublevelFiles, 573 version: readState.current, 574 } 575 576 // Strip off memtables which cannot possibly contain the seqNum being read 577 // at. 
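 // (Clarifying note: per the mem.queue documentation above, memtables are
 // ordered oldest to newest, and a memtable's logSeqNum is the earliest
 // sequence number that can appear in it. The loop below therefore trims from
 // the newest end, stopping at the first memtable whose logSeqNum is below the
 // read seqNum, since that memtable and all older ones may contain visible
 // keys.)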
578 for len(get.mem) > 0 { 579 n := len(get.mem) 580 if logSeqNum := get.mem[n-1].logSeqNum; logSeqNum < seqNum { 581 break 582 } 583 get.mem = get.mem[:n-1] 584 } 585 586 i := &buf.dbi 587 pointIter := get 588 *i = Iterator{ 589 ctx: context.Background(), 590 getIterAlloc: buf, 591 iter: pointIter, 592 pointIter: pointIter, 593 merge: d.merge, 594 comparer: *d.opts.Comparer, 595 readState: readState, 596 keyBuf: buf.keyBuf, 597 } 598 599 if !i.First() { 600 err := i.Close() 601 if err != nil { 602 return nil, nil, err 603 } 604 return nil, nil, ErrNotFound 605 } 606 return i.Value(), i, nil 607 } 608 609 // Set sets the value for the given key. It overwrites any previous value 610 // for that key; a DB is not a multi-map. 611 // 612 // It is safe to modify the contents of the arguments after Set returns. 613 func (d *DB) Set(key, value []byte, opts *WriteOptions) error { 614 b := newBatch(d) 615 _ = b.Set(key, value, opts) 616 if err := d.Apply(b, opts); err != nil { 617 return err 618 } 619 // Only release the batch on success. 620 b.release() 621 return nil 622 } 623 624 // Delete deletes the value for the given key. Deletes are blind all will 625 // succeed even if the given key does not exist. 626 // 627 // It is safe to modify the contents of the arguments after Delete returns. 628 func (d *DB) Delete(key []byte, opts *WriteOptions) error { 629 b := newBatch(d) 630 _ = b.Delete(key, opts) 631 if err := d.Apply(b, opts); err != nil { 632 return err 633 } 634 // Only release the batch on success. 635 b.release() 636 return nil 637 } 638 639 // DeleteSized behaves identically to Delete, but takes an additional 640 // argument indicating the size of the value being deleted. DeleteSized 641 // should be preferred when the caller has the expectation that there exists 642 // a single internal KV pair for the key (eg, the key has not been 643 // overwritten recently), and the caller knows the size of its value. 644 // 645 // DeleteSized will record the value size within the tombstone and use it to 646 // inform compaction-picking heuristics which strive to reduce space 647 // amplification in the LSM. This "calling your shot" mechanic allows the 648 // storage engine to more accurately estimate and reduce space amplification. 649 // 650 // It is safe to modify the contents of the arguments after DeleteSized 651 // returns. 652 func (d *DB) DeleteSized(key []byte, valueSize uint32, opts *WriteOptions) error { 653 b := newBatch(d) 654 _ = b.DeleteSized(key, valueSize, opts) 655 if err := d.Apply(b, opts); err != nil { 656 return err 657 } 658 // Only release the batch on success. 659 b.release() 660 return nil 661 } 662 663 // SingleDelete adds an action to the batch that single deletes the entry for key. 664 // See Writer.SingleDelete for more details on the semantics of SingleDelete. 665 // 666 // It is safe to modify the contents of the arguments after SingleDelete returns. 667 func (d *DB) SingleDelete(key []byte, opts *WriteOptions) error { 668 b := newBatch(d) 669 _ = b.SingleDelete(key, opts) 670 if err := d.Apply(b, opts); err != nil { 671 return err 672 } 673 // Only release the batch on success. 674 b.release() 675 return nil 676 } 677 678 // DeleteRange deletes all of the keys (and values) in the range [start,end) 679 // (inclusive on start, exclusive on end). 680 // 681 // It is safe to modify the contents of the arguments after DeleteRange 682 // returns. 
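 //
 // For illustration only (a hedged sketch of the same convenience-method
 // pattern used by Set, Delete and Merge above, not a separate API): the call
 // is roughly equivalent to staging the operation in a one-off batch:
 //
 //	b := d.NewBatch()
 //	_ = b.DeleteRange(start, end, nil)
 //	err := d.Apply(b, opts)
 //	_ = b.Close()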
683 func (d *DB) DeleteRange(start, end []byte, opts *WriteOptions) error { 684 b := newBatch(d) 685 _ = b.DeleteRange(start, end, opts) 686 if err := d.Apply(b, opts); err != nil { 687 return err 688 } 689 // Only release the batch on success. 690 b.release() 691 return nil 692 } 693 694 // Merge adds an action to the DB that merges the value at key with the new 695 // value. The details of the merge are dependent upon the configured merge 696 // operator. 697 // 698 // It is safe to modify the contents of the arguments after Merge returns. 699 func (d *DB) Merge(key, value []byte, opts *WriteOptions) error { 700 b := newBatch(d) 701 _ = b.Merge(key, value, opts) 702 if err := d.Apply(b, opts); err != nil { 703 return err 704 } 705 // Only release the batch on success. 706 b.release() 707 return nil 708 } 709 710 // LogData adds the specified to the batch. The data will be written to the 711 // WAL, but not added to memtables or sstables. Log data is never indexed, 712 // which makes it useful for testing WAL performance. 713 // 714 // It is safe to modify the contents of the argument after LogData returns. 715 func (d *DB) LogData(data []byte, opts *WriteOptions) error { 716 b := newBatch(d) 717 _ = b.LogData(data, opts) 718 if err := d.Apply(b, opts); err != nil { 719 return err 720 } 721 // Only release the batch on success. 722 b.release() 723 return nil 724 } 725 726 // RangeKeySet sets a range key mapping the key range [start, end) at the MVCC 727 // timestamp suffix to value. The suffix is optional. If any portion of the key 728 // range [start, end) is already set by a range key with the same suffix value, 729 // RangeKeySet overrides it. 730 // 731 // It is safe to modify the contents of the arguments after RangeKeySet returns. 732 func (d *DB) RangeKeySet(start, end, suffix, value []byte, opts *WriteOptions) error { 733 b := newBatch(d) 734 _ = b.RangeKeySet(start, end, suffix, value, opts) 735 if err := d.Apply(b, opts); err != nil { 736 return err 737 } 738 // Only release the batch on success. 739 b.release() 740 return nil 741 } 742 743 // RangeKeyUnset removes a range key mapping the key range [start, end) at the 744 // MVCC timestamp suffix. The suffix may be omitted to remove an unsuffixed 745 // range key. RangeKeyUnset only removes portions of range keys that fall within 746 // the [start, end) key span, and only range keys with suffixes that exactly 747 // match the unset suffix. 748 // 749 // It is safe to modify the contents of the arguments after RangeKeyUnset 750 // returns. 751 func (d *DB) RangeKeyUnset(start, end, suffix []byte, opts *WriteOptions) error { 752 b := newBatch(d) 753 _ = b.RangeKeyUnset(start, end, suffix, opts) 754 if err := d.Apply(b, opts); err != nil { 755 return err 756 } 757 // Only release the batch on success. 758 b.release() 759 return nil 760 } 761 762 // RangeKeyDelete deletes all of the range keys in the range [start,end) 763 // (inclusive on start, exclusive on end). It does not delete point keys (for 764 // that use DeleteRange). RangeKeyDelete removes all range keys within the 765 // bounds, including those with or without suffixes. 766 // 767 // It is safe to modify the contents of the arguments after RangeKeyDelete 768 // returns. 769 func (d *DB) RangeKeyDelete(start, end []byte, opts *WriteOptions) error { 770 b := newBatch(d) 771 _ = b.RangeKeyDelete(start, end, opts) 772 if err := d.Apply(b, opts); err != nil { 773 return err 774 } 775 // Only release the batch on success. 
776 b.release() 777 return nil 778 } 779 780 // Apply the operations contained in the batch to the DB. If the batch is large 781 // the contents of the batch may be retained by the database. If that occurs 782 // the batch contents will be cleared preventing the caller from attempting to 783 // reuse them. 784 // 785 // It is safe to modify the contents of the arguments after Apply returns. 786 func (d *DB) Apply(batch *Batch, opts *WriteOptions) error { 787 return d.applyInternal(batch, opts, false) 788 } 789 790 // ApplyNoSyncWait must only be used when opts.Sync is true and the caller 791 // does not want to wait for the WAL fsync to happen. The method will return 792 // once the mutation is applied to the memtable and is visible (note that a 793 // mutation is visible before the WAL sync even in the wait case, so we have 794 // not weakened the durability semantics). The caller must call Batch.SyncWait 795 // to wait for the WAL fsync. The caller must not Close the batch without 796 // first calling Batch.SyncWait. 797 // 798 // RECOMMENDATION: Prefer using Apply unless you really understand why you 799 // need ApplyNoSyncWait. 800 // EXPERIMENTAL: API/feature subject to change. Do not yet use outside 801 // CockroachDB. 802 func (d *DB) ApplyNoSyncWait(batch *Batch, opts *WriteOptions) error { 803 if !opts.Sync { 804 return errors.Errorf("cannot request asynchonous apply when WriteOptions.Sync is false") 805 } 806 return d.applyInternal(batch, opts, true) 807 } 808 809 // REQUIRES: noSyncWait => opts.Sync 810 func (d *DB) applyInternal(batch *Batch, opts *WriteOptions, noSyncWait bool) error { 811 if err := d.closed.Load(); err != nil { 812 panic(err) 813 } 814 if batch.committing { 815 panic("pebble: batch already committing") 816 } 817 if batch.applied.Load() { 818 panic("pebble: batch already applied") 819 } 820 if d.opts.ReadOnly { 821 return ErrReadOnly 822 } 823 if batch.db != nil && batch.db != d { 824 panic(fmt.Sprintf("pebble: batch db mismatch: %p != %p", batch.db, d)) 825 } 826 827 sync := opts.GetSync() 828 if sync && d.opts.DisableWAL { 829 return errors.New("pebble: WAL disabled") 830 } 831 832 if batch.minimumFormatMajorVersion != FormatMostCompatible { 833 if fmv := d.FormatMajorVersion(); fmv < batch.minimumFormatMajorVersion { 834 panic(fmt.Sprintf( 835 "pebble: batch requires at least format major version %d (current: %d)", 836 batch.minimumFormatMajorVersion, fmv, 837 )) 838 } 839 } 840 841 if batch.countRangeKeys > 0 { 842 if d.split == nil { 843 return errNoSplit 844 } 845 } 846 batch.committing = true 847 848 if batch.db == nil { 849 if err := batch.refreshMemTableSize(); err != nil { 850 return err 851 } 852 } 853 if batch.memTableSize >= d.largeBatchThreshold { 854 var err error 855 batch.flushable, err = newFlushableBatch(batch, d.opts.Comparer) 856 if err != nil { 857 return err 858 } 859 } 860 if err := d.commit.Commit(batch, sync, noSyncWait); err != nil { 861 // There isn't much we can do on an error here. The commit pipeline will be 862 // horked at this point. 863 d.opts.Logger.Fatalf("pebble: fatal commit error: %v", err) 864 } 865 // If this is a large batch, we need to clear the batch contents as the 866 // flushable batch may still be present in the flushables queue. 867 // 868 // TODO(peter): Currently large batches are written to the WAL. We could 869 // skip the WAL write and instead wait for the large batch to be flushed to 870 // an sstable. For a 100 MB batch, this might actually be faster. 
For a 1 871 // GB batch this is almost certainly faster. 872 if batch.flushable != nil { 873 batch.data = nil 874 } 875 return nil 876 } 877 878 func (d *DB) commitApply(b *Batch, mem *memTable) error { 879 if b.flushable != nil { 880 // This is a large batch which was already added to the immutable queue. 881 return nil 882 } 883 err := mem.apply(b, b.SeqNum()) 884 if err != nil { 885 return err 886 } 887 888 // If the batch contains range tombstones and the database is configured 889 // to flush range deletions, schedule a delayed flush so that disk space 890 // may be reclaimed without additional writes or an explicit flush. 891 if b.countRangeDels > 0 && d.opts.FlushDelayDeleteRange > 0 { 892 d.mu.Lock() 893 d.maybeScheduleDelayedFlush(mem, d.opts.FlushDelayDeleteRange) 894 d.mu.Unlock() 895 } 896 897 // If the batch contains range keys and the database is configured to flush 898 // range keys, schedule a delayed flush so that the range keys are cleared 899 // from the memtable. 900 if b.countRangeKeys > 0 && d.opts.FlushDelayRangeKey > 0 { 901 d.mu.Lock() 902 d.maybeScheduleDelayedFlush(mem, d.opts.FlushDelayRangeKey) 903 d.mu.Unlock() 904 } 905 906 if mem.writerUnref() { 907 d.mu.Lock() 908 d.maybeScheduleFlush() 909 d.mu.Unlock() 910 } 911 return nil 912 } 913 914 func (d *DB) commitWrite(b *Batch, syncWG *sync.WaitGroup, syncErr *error) (*memTable, error) { 915 var size int64 916 repr := b.Repr() 917 918 if b.flushable != nil { 919 // We have a large batch. Such batches are special in that they don't get 920 // added to the memtable, and are instead inserted into the queue of 921 // memtables. The call to makeRoomForWrite with this batch will force the 922 // current memtable to be flushed. We want the large batch to be part of 923 // the same log, so we add it to the WAL here, rather than after the call 924 // to makeRoomForWrite(). 925 // 926 // Set the sequence number since it was not set to the correct value earlier 927 // (see comment in newFlushableBatch()). 928 b.flushable.setSeqNum(b.SeqNum()) 929 if !d.opts.DisableWAL { 930 var err error 931 size, err = d.mu.log.SyncRecord(repr, syncWG, syncErr) 932 if err != nil { 933 panic(err) 934 } 935 } 936 } 937 938 d.mu.Lock() 939 940 var err error 941 if !b.ingestedSSTBatch { 942 // Batches which contain keys of kind InternalKeyKindIngestSST will 943 // never be applied to the memtable, so we don't need to make room for 944 // write. For the other cases, switch out the memtable if there was not 945 // enough room to store the batch. 946 err = d.makeRoomForWrite(b) 947 } 948 949 if err == nil && !d.opts.DisableWAL { 950 d.mu.log.bytesIn += uint64(len(repr)) 951 } 952 953 // Grab a reference to the memtable while holding DB.mu. Note that for 954 // non-flushable batches (b.flushable == nil) makeRoomForWrite() added a 955 // reference to the memtable which will prevent it from being flushed until 956 // we unreference it. This reference is dropped in DB.commitApply(). 
957 mem := d.mu.mem.mutable 958 959 d.mu.Unlock() 960 if err != nil { 961 return nil, err 962 } 963 964 if d.opts.DisableWAL { 965 return mem, nil 966 } 967 968 if b.flushable == nil { 969 size, err = d.mu.log.SyncRecord(repr, syncWG, syncErr) 970 if err != nil { 971 panic(err) 972 } 973 } 974 975 d.logSize.Store(uint64(size)) 976 return mem, err 977 } 978 979 type iterAlloc struct { 980 dbi Iterator 981 keyBuf []byte 982 boundsBuf [2][]byte 983 prefixOrFullSeekKey []byte 984 merging mergingIter 985 mlevels [3 + numLevels]mergingIterLevel 986 levels [3 + numLevels]levelIter 987 levelsPositioned [3 + numLevels]bool 988 } 989 990 var iterAllocPool = sync.Pool{ 991 New: func() interface{} { 992 return &iterAlloc{} 993 }, 994 } 995 996 // snapshotIterOpts denotes snapshot-related iterator options when calling 997 // newIter. These are the possible cases for a snapshotIterOpts: 998 // - No snapshot: All fields are zero values. 999 // - Classic snapshot: Only `seqNum` is set. The latest readState will be used 1000 // and the specified seqNum will be used as the snapshot seqNum. 1001 // - EventuallyFileOnlySnapshot (EFOS) behaving as a classic snapshot. Only 1002 // the `seqNum` is set. The latest readState will be used 1003 // and the specified seqNum will be used as the snapshot seqNum. 1004 // - EFOS in file-only state: Only `seqNum` and `vers` are set. All the 1005 // relevant SSTs are referenced by the *version. 1006 // - EFOS that has been excised but is in alwaysCreateIters mode (tests only). 1007 // Only `seqNum` and `readState` are set. 1008 type snapshotIterOpts struct { 1009 seqNum uint64 1010 vers *version 1011 readState *readState 1012 } 1013 1014 type batchIterOpts struct { 1015 batchOnly bool 1016 } 1017 type newIterOpts struct { 1018 snapshot snapshotIterOpts 1019 batch batchIterOpts 1020 } 1021 1022 // newIter constructs a new iterator, merging in batch iterators as an extra 1023 // level. 1024 func (d *DB) newIter( 1025 ctx context.Context, batch *Batch, internalOpts newIterOpts, o *IterOptions, 1026 ) *Iterator { 1027 if internalOpts.batch.batchOnly { 1028 if batch == nil { 1029 panic("batchOnly is true, but batch is nil") 1030 } 1031 if internalOpts.snapshot.vers != nil { 1032 panic("batchOnly is true, but snapshotIterOpts is initialized") 1033 } 1034 } 1035 if err := d.closed.Load(); err != nil { 1036 panic(err) 1037 } 1038 seqNum := internalOpts.snapshot.seqNum 1039 if o.rangeKeys() { 1040 if d.FormatMajorVersion() < FormatRangeKeys { 1041 panic(fmt.Sprintf( 1042 "pebble: range keys require at least format major version %d (current: %d)", 1043 FormatRangeKeys, d.FormatMajorVersion(), 1044 )) 1045 } 1046 } 1047 if o != nil && o.RangeKeyMasking.Suffix != nil && o.KeyTypes != IterKeyTypePointsAndRanges { 1048 panic("pebble: range key masking requires IterKeyTypePointsAndRanges") 1049 } 1050 if (batch != nil || seqNum != 0) && (o != nil && o.OnlyReadGuaranteedDurable) { 1051 // We could add support for OnlyReadGuaranteedDurable on snapshots if 1052 // there was a need: this would require checking that the sequence number 1053 // of the snapshot has been flushed, by comparing with 1054 // DB.mem.queue[0].logSeqNum. 1055 panic("OnlyReadGuaranteedDurable is not supported for batches or snapshots") 1056 } 1057 var readState *readState 1058 var newIters tableNewIters 1059 var newIterRangeKey keyspan.TableNewSpanIter 1060 if !internalOpts.batch.batchOnly { 1061 // Grab and reference the current readState. 
This prevents the underlying 1062 // files in the associated version from being deleted if there is a current 1063 // compaction. The readState is unref'd by Iterator.Close(). 1064 if internalOpts.snapshot.vers == nil { 1065 if internalOpts.snapshot.readState != nil { 1066 readState = internalOpts.snapshot.readState 1067 readState.ref() 1068 } else { 1069 // NB: loadReadState() calls readState.ref(). 1070 readState = d.loadReadState() 1071 } 1072 } else { 1073 // vers != nil 1074 internalOpts.snapshot.vers.Ref() 1075 } 1076 1077 // Determine the seqnum to read at after grabbing the read state (current and 1078 // memtables) above. 1079 if seqNum == 0 { 1080 seqNum = d.mu.versions.visibleSeqNum.Load() 1081 } 1082 newIters = d.newIters 1083 newIterRangeKey = d.tableNewRangeKeyIter 1084 } 1085 1086 // Bundle various structures under a single umbrella in order to allocate 1087 // them together. 1088 buf := iterAllocPool.Get().(*iterAlloc) 1089 dbi := &buf.dbi 1090 *dbi = Iterator{ 1091 ctx: ctx, 1092 alloc: buf, 1093 merge: d.merge, 1094 comparer: *d.opts.Comparer, 1095 readState: readState, 1096 version: internalOpts.snapshot.vers, 1097 keyBuf: buf.keyBuf, 1098 prefixOrFullSeekKey: buf.prefixOrFullSeekKey, 1099 boundsBuf: buf.boundsBuf, 1100 batch: batch, 1101 newIters: newIters, 1102 newIterRangeKey: newIterRangeKey, 1103 seqNum: seqNum, 1104 batchOnlyIter: internalOpts.batch.batchOnly, 1105 } 1106 if o != nil { 1107 dbi.opts = *o 1108 dbi.processBounds(o.LowerBound, o.UpperBound) 1109 } 1110 dbi.opts.logger = d.opts.Logger 1111 if d.opts.private.disableLazyCombinedIteration { 1112 dbi.opts.disableLazyCombinedIteration = true 1113 } 1114 if batch != nil { 1115 dbi.batchSeqNum = dbi.batch.nextSeqNum() 1116 } 1117 return finishInitializingIter(ctx, buf) 1118 } 1119 1120 // finishInitializingIter is a helper for doing the non-trivial initialization 1121 // of an Iterator. It's invoked to perform the initial initialization of an 1122 // Iterator during NewIter or Clone, and to perform reinitialization due to a 1123 // change in IterOptions by a call to Iterator.SetOptions. 1124 func finishInitializingIter(ctx context.Context, buf *iterAlloc) *Iterator { 1125 // Short-hand. 1126 dbi := &buf.dbi 1127 var memtables flushableList 1128 if dbi.readState != nil { 1129 memtables = dbi.readState.memtables 1130 } 1131 if dbi.opts.OnlyReadGuaranteedDurable { 1132 memtables = nil 1133 } else { 1134 // We only need to read from memtables which contain sequence numbers older 1135 // than seqNum. Trim off newer memtables. 1136 for i := len(memtables) - 1; i >= 0; i-- { 1137 if logSeqNum := memtables[i].logSeqNum; logSeqNum < dbi.seqNum { 1138 break 1139 } 1140 memtables = memtables[:i] 1141 } 1142 } 1143 1144 if dbi.opts.pointKeys() { 1145 // Construct the point iterator, initializing dbi.pointIter to point to 1146 // dbi.merging. If this is called during a SetOptions call and this 1147 // Iterator has already initialized dbi.merging, constructPointIter is a 1148 // noop and an initialized pointIter already exists in dbi.pointIter. 1149 dbi.constructPointIter(ctx, memtables, buf) 1150 dbi.iter = dbi.pointIter 1151 } else { 1152 dbi.iter = emptyIter 1153 } 1154 1155 if dbi.opts.rangeKeys() { 1156 dbi.rangeKeyMasking.init(dbi, dbi.comparer.Compare, dbi.comparer.Split) 1157 1158 // When iterating over both point and range keys, don't create the 1159 // range-key iterator stack immediately if we can avoid it. 
This 1160 // optimization takes advantage of the expected sparseness of range 1161 // keys, and configures the point-key iterator to dynamically switch to 1162 // combined iteration when it observes a file containing range keys. 1163 // 1164 // Lazy combined iteration is not possible if a batch or a memtable 1165 // contains any range keys. 1166 useLazyCombinedIteration := dbi.rangeKey == nil && 1167 dbi.opts.KeyTypes == IterKeyTypePointsAndRanges && 1168 (dbi.batch == nil || dbi.batch.countRangeKeys == 0) && 1169 !dbi.opts.disableLazyCombinedIteration 1170 if useLazyCombinedIteration { 1171 // The user requested combined iteration, and there's no indexed 1172 // batch currently containing range keys that would prevent lazy 1173 // combined iteration. Check the memtables to see if they contain 1174 // any range keys. 1175 for i := range memtables { 1176 if memtables[i].containsRangeKeys() { 1177 useLazyCombinedIteration = false 1178 break 1179 } 1180 } 1181 } 1182 1183 if useLazyCombinedIteration { 1184 dbi.lazyCombinedIter = lazyCombinedIter{ 1185 parent: dbi, 1186 pointIter: dbi.pointIter, 1187 combinedIterState: combinedIterState{ 1188 initialized: false, 1189 }, 1190 } 1191 dbi.iter = &dbi.lazyCombinedIter 1192 dbi.iter = invalidating.MaybeWrapIfInvariants(dbi.iter) 1193 } else { 1194 dbi.lazyCombinedIter.combinedIterState = combinedIterState{ 1195 initialized: true, 1196 } 1197 if dbi.rangeKey == nil { 1198 dbi.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState) 1199 dbi.rangeKey.init(dbi.comparer.Compare, dbi.comparer.Split, &dbi.opts) 1200 dbi.constructRangeKeyIter() 1201 } else { 1202 dbi.rangeKey.iterConfig.SetBounds(dbi.opts.LowerBound, dbi.opts.UpperBound) 1203 } 1204 1205 // Wrap the point iterator (currently dbi.iter) with an interleaving 1206 // iterator that interleaves range keys pulled from 1207 // dbi.rangeKey.rangeKeyIter. 1208 // 1209 // NB: The interleaving iterator is always reinitialized, even if 1210 // dbi already had an initialized range key iterator, in case the point 1211 // iterator changed or the range key masking suffix changed. 1212 dbi.rangeKey.iiter.Init(&dbi.comparer, dbi.iter, dbi.rangeKey.rangeKeyIter, 1213 keyspan.InterleavingIterOpts{ 1214 Mask: &dbi.rangeKeyMasking, 1215 LowerBound: dbi.opts.LowerBound, 1216 UpperBound: dbi.opts.UpperBound, 1217 }) 1218 dbi.iter = &dbi.rangeKey.iiter 1219 } 1220 } else { 1221 // !dbi.opts.rangeKeys() 1222 // 1223 // Reset the combined iterator state. The initialized=true ensures the 1224 // iterator doesn't unnecessarily try to switch to combined iteration. 1225 dbi.lazyCombinedIter.combinedIterState = combinedIterState{initialized: true} 1226 } 1227 return dbi 1228 } 1229 1230 // ScanInternal scans all internal keys within the specified bounds, truncating 1231 // any rangedels and rangekeys to those bounds if they span past them. For use 1232 // when an external user needs to be aware of all internal keys that make up a 1233 // key range. 1234 // 1235 // Keys deleted by range deletions must not be returned or exposed by this 1236 // method, while the range deletion deleting that key must be exposed using 1237 // visitRangeDel. Keys that would be masked by range key masking (if an 1238 // appropriate prefix were set) should be exposed, alongside the range key 1239 // that would have masked it. This method also collapses all point keys into 1240 // one InternalKey; so only one internal key at most per user key is returned 1241 // to visitPointKey. 
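 //
 // For illustration only (a hedged sketch; the no-op visitors and the bounds
 // are assumptions of this example): a caller that only wants to observe point
 // keys, range deletions, and range keys in [lower, upper) without skip-shared
 // iteration might invoke it as
 //
 //	err := d.ScanInternal(ctx, sstable.CategoryAndQoS{}, lower, upper,
 //		func(key *InternalKey, value LazyValue, info IteratorLevel) error { return nil },
 //		func(start, end []byte, seqNum uint64) error { return nil },
 //		func(start, end []byte, keys []rangekey.Key) error { return nil },
 //		nil, // visitSharedFile: nil disables skip-shared iteration (see below)
 //	)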
1242 // 1243 // If visitSharedFile is not nil, ScanInternal iterates in skip-shared iteration 1244 // mode. In this iteration mode, sstables in levels L5 and L6 are skipped, and 1245 // their metadatas truncated to [lower, upper) and passed into visitSharedFile. 1246 // ErrInvalidSkipSharedIteration is returned if visitSharedFile is not nil and an 1247 // sstable in L5 or L6 is found that is not in shared storage according to 1248 // provider.IsShared, or an sstable in those levels contains a newer key than the 1249 // snapshot sequence number (only applicable for snapshot.ScanInternal). Examples 1250 // of when this could happen could be if Pebble started writing sstables before a 1251 // creator ID was set (as creator IDs are necessary to enable shared storage) 1252 // resulting in some lower level SSTs being on non-shared storage. Skip-shared 1253 // iteration is invalid in those cases. 1254 func (d *DB) ScanInternal( 1255 ctx context.Context, 1256 categoryAndQoS sstable.CategoryAndQoS, 1257 lower, upper []byte, 1258 visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error, 1259 visitRangeDel func(start, end []byte, seqNum uint64) error, 1260 visitRangeKey func(start, end []byte, keys []rangekey.Key) error, 1261 visitSharedFile func(sst *SharedSSTMeta) error, 1262 ) error { 1263 scanInternalOpts := &scanInternalOptions{ 1264 CategoryAndQoS: categoryAndQoS, 1265 visitPointKey: visitPointKey, 1266 visitRangeDel: visitRangeDel, 1267 visitRangeKey: visitRangeKey, 1268 visitSharedFile: visitSharedFile, 1269 skipSharedLevels: visitSharedFile != nil, 1270 IterOptions: IterOptions{ 1271 KeyTypes: IterKeyTypePointsAndRanges, 1272 LowerBound: lower, 1273 UpperBound: upper, 1274 }, 1275 } 1276 iter, err := d.newInternalIter(ctx, snapshotIterOpts{} /* snapshot */, scanInternalOpts) 1277 if err != nil { 1278 return err 1279 } 1280 defer iter.close() 1281 return scanInternalImpl(ctx, lower, upper, iter, scanInternalOpts) 1282 } 1283 1284 // newInternalIter constructs and returns a new scanInternalIterator on this db. 1285 // If o.skipSharedLevels is true, levels below sharedLevelsStart are *not* added 1286 // to the internal iterator. 1287 // 1288 // TODO(bilal): This method has a lot of similarities with db.newIter as well as 1289 // finishInitializingIter. Both pairs of methods should be refactored to reduce 1290 // this duplication. 1291 func (d *DB) newInternalIter( 1292 ctx context.Context, sOpts snapshotIterOpts, o *scanInternalOptions, 1293 ) (*scanInternalIterator, error) { 1294 if err := d.closed.Load(); err != nil { 1295 panic(err) 1296 } 1297 // Grab and reference the current readState. This prevents the underlying 1298 // files in the associated version from being deleted if there is a current 1299 // compaction. The readState is unref'd by Iterator.Close(). 1300 var readState *readState 1301 if sOpts.vers == nil { 1302 if sOpts.readState != nil { 1303 readState = sOpts.readState 1304 readState.ref() 1305 } else { 1306 readState = d.loadReadState() 1307 } 1308 } 1309 if sOpts.vers != nil { 1310 sOpts.vers.Ref() 1311 } 1312 1313 // Determine the seqnum to read at after grabbing the read state (current and 1314 // memtables) above. 1315 seqNum := sOpts.seqNum 1316 if seqNum == 0 { 1317 seqNum = d.mu.versions.visibleSeqNum.Load() 1318 } 1319 1320 // Bundle various structures under a single umbrella in order to allocate 1321 // them together. 
1322 buf := iterAllocPool.Get().(*iterAlloc) 1323 dbi := &scanInternalIterator{ 1324 ctx: ctx, 1325 db: d, 1326 comparer: d.opts.Comparer, 1327 merge: d.opts.Merger.Merge, 1328 readState: readState, 1329 version: sOpts.vers, 1330 alloc: buf, 1331 newIters: d.newIters, 1332 newIterRangeKey: d.tableNewRangeKeyIter, 1333 seqNum: seqNum, 1334 mergingIter: &buf.merging, 1335 } 1336 dbi.opts = *o 1337 dbi.opts.logger = d.opts.Logger 1338 if d.opts.private.disableLazyCombinedIteration { 1339 dbi.opts.disableLazyCombinedIteration = true 1340 } 1341 return finishInitializingInternalIter(buf, dbi) 1342 } 1343 1344 func finishInitializingInternalIter( 1345 buf *iterAlloc, i *scanInternalIterator, 1346 ) (*scanInternalIterator, error) { 1347 // Short-hand. 1348 var memtables flushableList 1349 if i.readState != nil { 1350 memtables = i.readState.memtables 1351 } 1352 // We only need to read from memtables which contain sequence numbers older 1353 // than seqNum. Trim off newer memtables. 1354 for j := len(memtables) - 1; j >= 0; j-- { 1355 if logSeqNum := memtables[j].logSeqNum; logSeqNum < i.seqNum { 1356 break 1357 } 1358 memtables = memtables[:j] 1359 } 1360 i.initializeBoundBufs(i.opts.LowerBound, i.opts.UpperBound) 1361 1362 i.constructPointIter(i.opts.CategoryAndQoS, memtables, buf) 1363 1364 // For internal iterators, we skip the lazy combined iteration optimization 1365 // entirely, and create the range key iterator stack directly. 1366 i.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState) 1367 i.rangeKey.init(i.comparer.Compare, i.comparer.Split, &i.opts.IterOptions) 1368 if err := i.constructRangeKeyIter(); err != nil { 1369 return nil, err 1370 } 1371 1372 // Wrap the point iterator (currently i.iter) with an interleaving 1373 // iterator that interleaves range keys pulled from 1374 // i.rangeKey.rangeKeyIter. 1375 i.rangeKey.iiter.Init(i.comparer, i.iter, i.rangeKey.rangeKeyIter, 1376 keyspan.InterleavingIterOpts{ 1377 LowerBound: i.opts.LowerBound, 1378 UpperBound: i.opts.UpperBound, 1379 }) 1380 i.iter = &i.rangeKey.iiter 1381 1382 return i, nil 1383 } 1384 1385 func (i *Iterator) constructPointIter( 1386 ctx context.Context, memtables flushableList, buf *iterAlloc, 1387 ) { 1388 if i.pointIter != nil { 1389 // Already have one. 1390 return 1391 } 1392 internalOpts := internalIterOpts{stats: &i.stats.InternalStats} 1393 if i.opts.RangeKeyMasking.Filter != nil { 1394 internalOpts.boundLimitedFilter = &i.rangeKeyMasking 1395 } 1396 1397 // Merging levels and levels from iterAlloc. 1398 mlevels := buf.mlevels[:0] 1399 levels := buf.levels[:0] 1400 1401 // We compute the number of levels needed ahead of time and reallocate a slice if 1402 // the array from the iterAlloc isn't large enough. Doing this allocation once 1403 // should improve the performance. 
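 // (Clarifying note: the counts below cover one merging level for the indexed
 // batch if present, one per memtable, one per L0 sublevel, and one per
 // non-empty level below L0. Only the file-backed levels also need a
 // levelIter, hence the separate numLevelIters count.)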
1404 numMergingLevels := 0 1405 numLevelIters := 0 1406 if i.batch != nil { 1407 numMergingLevels++ 1408 } 1409 1410 var current *version 1411 if !i.batchOnlyIter { 1412 numMergingLevels += len(memtables) 1413 1414 current = i.version 1415 if current == nil { 1416 current = i.readState.current 1417 } 1418 numMergingLevels += len(current.L0SublevelFiles) 1419 numLevelIters += len(current.L0SublevelFiles) 1420 for level := 1; level < len(current.Levels); level++ { 1421 if current.Levels[level].Empty() { 1422 continue 1423 } 1424 numMergingLevels++ 1425 numLevelIters++ 1426 } 1427 } 1428 1429 if numMergingLevels > cap(mlevels) { 1430 mlevels = make([]mergingIterLevel, 0, numMergingLevels) 1431 } 1432 if numLevelIters > cap(levels) { 1433 levels = make([]levelIter, 0, numLevelIters) 1434 } 1435 1436 // Top-level is the batch, if any. 1437 if i.batch != nil { 1438 if i.batch.index == nil { 1439 // This isn't an indexed batch. We shouldn't have gotten this far. 1440 panic(errors.AssertionFailedf("creating an iterator over an unindexed batch")) 1441 } else { 1442 i.batch.initInternalIter(&i.opts, &i.batchPointIter) 1443 i.batch.initRangeDelIter(&i.opts, &i.batchRangeDelIter, i.batchSeqNum) 1444 // Only include the batch's rangedel iterator if it's non-empty. 1445 // This requires some subtle logic in the case a rangedel is later 1446 // written to the batch and the view of the batch is refreshed 1447 // during a call to SetOptions—in this case, we need to reconstruct 1448 // the point iterator to add the batch rangedel iterator. 1449 var rangeDelIter keyspan.FragmentIterator 1450 if i.batchRangeDelIter.Count() > 0 { 1451 rangeDelIter = &i.batchRangeDelIter 1452 } 1453 mlevels = append(mlevels, mergingIterLevel{ 1454 iter: &i.batchPointIter, 1455 rangeDelIter: rangeDelIter, 1456 }) 1457 } 1458 } 1459 1460 if !i.batchOnlyIter { 1461 // Next are the memtables. 1462 for j := len(memtables) - 1; j >= 0; j-- { 1463 mem := memtables[j] 1464 mlevels = append(mlevels, mergingIterLevel{ 1465 iter: mem.newIter(&i.opts), 1466 rangeDelIter: mem.newRangeDelIter(&i.opts), 1467 }) 1468 } 1469 1470 // Next are the file levels: L0 sub-levels followed by lower levels. 1471 mlevelsIndex := len(mlevels) 1472 levelsIndex := len(levels) 1473 mlevels = mlevels[:numMergingLevels] 1474 levels = levels[:numLevelIters] 1475 i.opts.snapshotForHideObsoletePoints = buf.dbi.seqNum 1476 addLevelIterForFiles := func(files manifest.LevelIterator, level manifest.Level) { 1477 li := &levels[levelsIndex] 1478 1479 li.init(ctx, i.opts, &i.comparer, i.newIters, files, level, internalOpts) 1480 li.initRangeDel(&mlevels[mlevelsIndex].rangeDelIter) 1481 li.initBoundaryContext(&mlevels[mlevelsIndex].levelIterBoundaryContext) 1482 li.initCombinedIterState(&i.lazyCombinedIter.combinedIterState) 1483 mlevels[mlevelsIndex].levelIter = li 1484 mlevels[mlevelsIndex].iter = invalidating.MaybeWrapIfInvariants(li) 1485 1486 levelsIndex++ 1487 mlevelsIndex++ 1488 } 1489 1490 // Add level iterators for the L0 sublevels, iterating from newest to 1491 // oldest. 1492 for i := len(current.L0SublevelFiles) - 1; i >= 0; i-- { 1493 addLevelIterForFiles(current.L0SublevelFiles[i].Iter(), manifest.L0Sublevel(i)) 1494 } 1495 1496 // Add level iterators for the non-empty non-L0 levels. 
1497 for level := 1; level < len(current.Levels); level++ { 1498 if current.Levels[level].Empty() { 1499 continue 1500 } 1501 addLevelIterForFiles(current.Levels[level].Iter(), manifest.Level(level)) 1502 } 1503 } 1504 buf.merging.init(&i.opts, &i.stats.InternalStats, i.comparer.Compare, i.comparer.Split, mlevels...) 1505 if len(mlevels) <= cap(buf.levelsPositioned) { 1506 buf.merging.levelsPositioned = buf.levelsPositioned[:len(mlevels)] 1507 } 1508 buf.merging.snapshot = i.seqNum 1509 buf.merging.batchSnapshot = i.batchSeqNum 1510 buf.merging.combinedIterState = &i.lazyCombinedIter.combinedIterState 1511 i.pointIter = invalidating.MaybeWrapIfInvariants(&buf.merging) 1512 i.merging = &buf.merging 1513 } 1514 1515 // NewBatch returns a new empty write-only batch. Any reads on the batch will 1516 // return an error. If the batch is committed it will be applied to the DB. 1517 func (d *DB) NewBatch() *Batch { 1518 return newBatch(d) 1519 } 1520 1521 // NewBatchWithSize is mostly identical to NewBatch, but it will allocate the 1522 // specified memory space for the internal slice in advance. 1523 func (d *DB) NewBatchWithSize(size int) *Batch { 1524 return newBatchWithSize(d, size) 1525 } 1526 1527 // NewIndexedBatch returns a new empty read-write batch. Any reads on the batch 1528 // will read from both the batch and the DB. If the batch is committed it will 1529 // be applied to the DB. An indexed batch is slower than a non-indexed batch 1530 // for insert operations. If you do not need to perform reads on the batch, use 1531 // NewBatch instead. 1532 func (d *DB) NewIndexedBatch() *Batch { 1533 return newIndexedBatch(d, d.opts.Comparer) 1534 } 1535 1536 // NewIndexedBatchWithSize is mostly identical to NewIndexedBatch, but it will 1537 // allocate the specified memory space for the internal slice in advance. 1538 func (d *DB) NewIndexedBatchWithSize(size int) *Batch { 1539 return newIndexedBatchWithSize(d, d.opts.Comparer, size) 1540 } 1541 1542 // NewIter returns an iterator that is unpositioned (Iterator.Valid() will 1543 // return false). The iterator can be positioned via a call to SeekGE, SeekLT, 1544 // First or Last. The iterator provides a point-in-time view of the current DB 1545 // state. This view is maintained by preventing file deletions and preventing 1546 // memtables referenced by the iterator from being deleted. Using an iterator 1547 // to maintain a long-lived point-in-time view of the DB state can lead to an 1548 // apparent memory and disk usage leak. Use snapshots (see NewSnapshot) for 1549 // point-in-time snapshots which avoid these problems. 1550 func (d *DB) NewIter(o *IterOptions) (*Iterator, error) { 1551 return d.NewIterWithContext(context.Background(), o) 1552 } 1553 1554 // NewIterWithContext is like NewIter, and additionally accepts a context for 1555 // tracing. 1556 func (d *DB) NewIterWithContext(ctx context.Context, o *IterOptions) (*Iterator, error) { 1557 return d.newIter(ctx, nil /* batch */, newIterOpts{}, o), nil 1558 } 1559 1560 // NewSnapshot returns a point-in-time view of the current DB state. Iterators 1561 // created with this handle will all observe a stable snapshot of the current 1562 // DB state. The caller must call Snapshot.Close() when the snapshot is no 1563 // longer needed. Snapshots are not persisted across DB restarts (close -> 1564 // open). Unlike the implicit snapshot maintained by an iterator, a snapshot 1565 // will not prevent memtables from being released or sstables from being 1566 // deleted.
Instead, a snapshot prevents deletion of sequence numbers 1567 // referenced by the snapshot. 1568 func (d *DB) NewSnapshot() *Snapshot { 1569 if err := d.closed.Load(); err != nil { 1570 panic(err) 1571 } 1572 1573 d.mu.Lock() 1574 s := &Snapshot{ 1575 db: d, 1576 seqNum: d.mu.versions.visibleSeqNum.Load(), 1577 } 1578 d.mu.snapshots.pushBack(s) 1579 d.mu.Unlock() 1580 return s 1581 } 1582 1583 // NewEventuallyFileOnlySnapshot returns a point-in-time view of the current DB 1584 // state, similar to NewSnapshot, but with consistency constrained to the 1585 // provided set of key ranges. See the comment at EventuallyFileOnlySnapshot for 1586 // its semantics. 1587 func (d *DB) NewEventuallyFileOnlySnapshot(keyRanges []KeyRange) *EventuallyFileOnlySnapshot { 1588 if err := d.closed.Load(); err != nil { 1589 panic(err) 1590 } 1591 1592 internalKeyRanges := make([]internalKeyRange, len(keyRanges)) 1593 for i := range keyRanges { 1594 if i > 0 && d.cmp(keyRanges[i-1].End, keyRanges[i].Start) > 0 { 1595 panic("pebble: key ranges for eventually-file-only-snapshot not in order") 1596 } 1597 internalKeyRanges[i] = internalKeyRange{ 1598 smallest: base.MakeInternalKey(keyRanges[i].Start, InternalKeySeqNumMax, InternalKeyKindMax), 1599 largest: base.MakeExclusiveSentinelKey(InternalKeyKindRangeDelete, keyRanges[i].End), 1600 } 1601 } 1602 1603 return d.makeEventuallyFileOnlySnapshot(keyRanges, internalKeyRanges) 1604 } 1605 1606 // Close closes the DB. 1607 // 1608 // It is not safe to close a DB until all outstanding iterators are closed 1609 // or to call Close concurrently with any other DB method. It is not valid 1610 // to call any of a DB's methods after the DB has been closed. 1611 func (d *DB) Close() error { 1612 // Lock the commit pipeline for the duration of Close. This prevents a race 1613 // with makeRoomForWrite. Rotating the WAL in makeRoomForWrite requires 1614 // dropping d.mu several times for I/O. If Close only holds d.mu, an 1615 // in-progress WAL rotation may re-acquire d.mu only once the database is 1616 // closed. 1617 // 1618 // Additionally, locking the commit pipeline makes it more likely that 1619 // (illegal) concurrent writes will observe d.closed.Load() != nil, creating 1620 // more understandable panics if the database is improperly used concurrently 1621 // during Close. 1622 d.commit.mu.Lock() 1623 defer d.commit.mu.Unlock() 1624 d.mu.Lock() 1625 defer d.mu.Unlock() 1626 if err := d.closed.Load(); err != nil { 1627 panic(err) 1628 } 1629 1630 // Clear the finalizer that is used to check that an unreferenced DB has been 1631 // closed. We're closing the DB here, so the check performed by that 1632 // finalizer isn't necessary. 1633 // 1634 // Note: this is a no-op if invariants are disabled or race is enabled.
1635 invariants.SetFinalizer(d.closed, nil) 1636 1637 d.closed.Store(errors.WithStack(ErrClosed)) 1638 close(d.closedCh) 1639 1640 defer d.opts.Cache.Unref() 1641 1642 for d.mu.compact.compactingCount > 0 || d.mu.compact.flushing { 1643 d.mu.compact.cond.Wait() 1644 } 1645 for d.mu.tableStats.loading { 1646 d.mu.tableStats.cond.Wait() 1647 } 1648 for d.mu.tableValidation.validating { 1649 d.mu.tableValidation.cond.Wait() 1650 } 1651 1652 var err error 1653 if n := len(d.mu.compact.inProgress); n > 0 { 1654 err = errors.Errorf("pebble: %d unexpected in-progress compactions", errors.Safe(n)) 1655 } 1656 err = firstError(err, d.mu.formatVers.marker.Close()) 1657 err = firstError(err, d.tableCache.close()) 1658 if !d.opts.ReadOnly { 1659 err = firstError(err, d.mu.log.Close()) 1660 } else if d.mu.log.LogWriter != nil { 1661 panic("pebble: log-writer should be nil in read-only mode") 1662 } 1663 err = firstError(err, d.fileLock.Close()) 1664 1665 // Note that versionSet.close() only closes the MANIFEST. The versions list 1666 // is still valid for the checks below. 1667 err = firstError(err, d.mu.versions.close()) 1668 1669 err = firstError(err, d.dataDir.Close()) 1670 if d.dataDir != d.walDir { 1671 err = firstError(err, d.walDir.Close()) 1672 } 1673 1674 d.readState.val.unrefLocked() 1675 1676 current := d.mu.versions.currentVersion() 1677 for v := d.mu.versions.versions.Front(); true; v = v.Next() { 1678 refs := v.Refs() 1679 if v == current { 1680 if refs != 1 { 1681 err = firstError(err, errors.Errorf("leaked iterators: current\n%s", v)) 1682 } 1683 break 1684 } 1685 if refs != 0 { 1686 err = firstError(err, errors.Errorf("leaked iterators:\n%s", v)) 1687 } 1688 } 1689 1690 for _, mem := range d.mu.mem.queue { 1691 // Usually, we'd want to delete the files returned by readerUnref. But 1692 // in this case, even if we're unreferencing the flushables, the 1693 // flushables aren't obsolete. They will be reconstructed during WAL 1694 // replay. 1695 mem.readerUnrefLocked(false) 1696 } 1697 // If there's an unused, recycled memtable, we need to release its memory. 1698 if obsoleteMemTable := d.memTableRecycle.Swap(nil); obsoleteMemTable != nil { 1699 d.freeMemTable(obsoleteMemTable) 1700 } 1701 if reserved := d.memTableReserved.Load(); reserved != 0 { 1702 err = firstError(err, errors.Errorf("leaked memtable reservation: %d", errors.Safe(reserved))) 1703 } 1704 1705 // Since we called d.readState.val.unrefLocked() above, we are expected to 1706 // manually schedule deletion of obsolete files. 1707 if len(d.mu.versions.obsoleteTables) > 0 { 1708 d.deleteObsoleteFiles(d.mu.nextJobID) 1709 } 1710 1711 d.mu.Unlock() 1712 d.compactionSchedulers.Wait() 1713 1714 // Wait for all cleaning jobs to finish. 1715 d.cleanupManager.Close() 1716 1717 // Sanity check metrics. 1718 if invariants.Enabled { 1719 m := d.Metrics() 1720 if m.Compact.NumInProgress > 0 || m.Compact.InProgressBytes > 0 { 1721 d.mu.Lock() 1722 panic(fmt.Sprintf("invalid metrics on close:\n%s", m)) 1723 } 1724 } 1725 1726 d.mu.Lock() 1727 1728 // As a sanity check, ensure that there are no zombie tables. A non-zero count 1729 // hints at a reference count leak. 1730 if ztbls := len(d.mu.versions.zombieTables); ztbls > 0 { 1731 err = firstError(err, errors.Errorf("non-zero zombie file count: %d", ztbls)) 1732 } 1733 1734 err = firstError(err, d.objProvider.Close()) 1735 1736 // If the options include a closer to 'close' the filesystem, close it. 
1737 if d.opts.private.fsCloser != nil { 1738 d.opts.private.fsCloser.Close() 1739 } 1740 1741 // Return an error if the user failed to close all open snapshots. 1742 if v := d.mu.snapshots.count(); v > 0 { 1743 err = firstError(err, errors.Errorf("leaked snapshots: %d open snapshots on DB %p", v, d)) 1744 } 1745 1746 return err 1747 } 1748 1749 // Compact the specified range of keys in the database. 1750 func (d *DB) Compact(start, end []byte, parallelize bool) error { 1751 if err := d.closed.Load(); err != nil { 1752 panic(err) 1753 } 1754 if d.opts.ReadOnly { 1755 return ErrReadOnly 1756 } 1757 if d.cmp(start, end) >= 0 { 1758 return errors.Errorf("Compact start %s is not less than end %s", 1759 d.opts.Comparer.FormatKey(start), d.opts.Comparer.FormatKey(end)) 1760 } 1761 iStart := base.MakeInternalKey(start, InternalKeySeqNumMax, InternalKeyKindMax) 1762 iEnd := base.MakeInternalKey(end, 0, 0) 1763 m := (&fileMetadata{}).ExtendPointKeyBounds(d.cmp, iStart, iEnd) 1764 meta := []*fileMetadata{m} 1765 1766 d.mu.Lock() 1767 maxLevelWithFiles := 1 1768 cur := d.mu.versions.currentVersion() 1769 for level := 0; level < numLevels; level++ { 1770 overlaps := cur.Overlaps(level, d.cmp, start, end, iEnd.IsExclusiveSentinel()) 1771 if !overlaps.Empty() { 1772 maxLevelWithFiles = level + 1 1773 } 1774 } 1775 1776 keyRanges := make([]internalKeyRange, len(meta)) 1777 for i := range meta { 1778 keyRanges[i] = internalKeyRange{smallest: m.Smallest, largest: m.Largest} 1779 } 1780 // Determine if any memtable overlaps with the compaction range. We wait for 1781 // any such overlap to flush (initiating a flush if necessary). 1782 mem, err := func() (*flushableEntry, error) { 1783 // Check to see if any files overlap with any of the memtables. The queue 1784 // is ordered from oldest to newest with the mutable memtable being the 1785 // last element in the slice. We want to wait for the newest table that 1786 // overlaps. 1787 for i := len(d.mu.mem.queue) - 1; i >= 0; i-- { 1788 mem := d.mu.mem.queue[i] 1789 if ingestMemtableOverlaps(d.cmp, mem, keyRanges) { 1790 var err error 1791 if mem.flushable == d.mu.mem.mutable { 1792 // We have to hold both commitPipeline.mu and DB.mu when calling 1793 // makeRoomForWrite(). Lock order requirements elsewhere force us to 1794 // unlock DB.mu in order to grab commitPipeline.mu first. 1795 d.mu.Unlock() 1796 d.commit.mu.Lock() 1797 d.mu.Lock() 1798 defer d.commit.mu.Unlock() 1799 if mem.flushable == d.mu.mem.mutable { 1800 // Only flush if the active memtable is unchanged. 1801 err = d.makeRoomForWrite(nil) 1802 } 1803 } 1804 mem.flushForced = true 1805 d.maybeScheduleFlush() 1806 return mem, err 1807 } 1808 } 1809 return nil, nil 1810 }() 1811 1812 d.mu.Unlock() 1813 1814 if err != nil { 1815 return err 1816 } 1817 if mem != nil { 1818 <-mem.flushed 1819 } 1820 1821 for level := 0; level < maxLevelWithFiles; { 1822 for { 1823 if err := d.manualCompact( 1824 iStart.UserKey, iEnd.UserKey, level, parallelize); err != nil { 1825 if errors.Is(err, ErrCancelledCompaction) { 1826 continue 1827 } 1828 return err 1829 } 1830 break 1831 } 1832 level++ 1833 if level == numLevels-1 { 1834 // A manual compaction of the bottommost level occurred. 1835 // There is no next level to try and compact. 
1836 break 1837 } 1838 } 1839 return nil 1840 } 1841 1842 func (d *DB) manualCompact(start, end []byte, level int, parallelize bool) error { 1843 d.mu.Lock() 1844 curr := d.mu.versions.currentVersion() 1845 files := curr.Overlaps(level, d.cmp, start, end, false) 1846 if files.Empty() { 1847 d.mu.Unlock() 1848 return nil 1849 } 1850 1851 var compactions []*manualCompaction 1852 if parallelize { 1853 compactions = append(compactions, d.splitManualCompaction(start, end, level)...) 1854 } else { 1855 compactions = append(compactions, &manualCompaction{ 1856 level: level, 1857 done: make(chan error, 1), 1858 start: start, 1859 end: end, 1860 }) 1861 } 1862 d.mu.compact.manual = append(d.mu.compact.manual, compactions...) 1863 d.maybeScheduleCompaction() 1864 d.mu.Unlock() 1865 1866 // Each of the channels is guaranteed to be eventually sent to once. After a 1867 // compaction is possibly picked in d.maybeScheduleCompaction(), either the 1868 // compaction is dropped, executed after being scheduled, or retried later. 1869 // Assuming eventual progress when a compaction is retried, all outcomes send 1870 // a value to the done channel. Since the channels are buffered, it is not 1871 // necessary to read from each channel, and so we can exit early in the event 1872 // of an error. 1873 for _, compaction := range compactions { 1874 if err := <-compaction.done; err != nil { 1875 return err 1876 } 1877 } 1878 return nil 1879 } 1880 1881 // splitManualCompaction splits a manual compaction over [start,end] on level 1882 // such that the resulting compactions have no key overlap. 1883 func (d *DB) splitManualCompaction( 1884 start, end []byte, level int, 1885 ) (splitCompactions []*manualCompaction) { 1886 curr := d.mu.versions.currentVersion() 1887 endLevel := level + 1 1888 baseLevel := d.mu.versions.picker.getBaseLevel() 1889 if level == 0 { 1890 endLevel = baseLevel 1891 } 1892 keyRanges := calculateInuseKeyRanges(curr, d.cmp, level, endLevel, start, end) 1893 for _, keyRange := range keyRanges { 1894 splitCompactions = append(splitCompactions, &manualCompaction{ 1895 level: level, 1896 done: make(chan error, 1), 1897 start: keyRange.Start, 1898 end: keyRange.End, 1899 split: true, 1900 }) 1901 } 1902 return splitCompactions 1903 } 1904 1905 // DownloadSpan is a key range passed to the Download method. 1906 type DownloadSpan struct { 1907 StartKey []byte 1908 // EndKey is exclusive. 1909 EndKey []byte 1910 } 1911 1912 func (d *DB) downloadSpan(ctx context.Context, span DownloadSpan) error { 1913 dSpan := &downloadSpan{ 1914 start: span.StartKey, 1915 end: span.EndKey, 1916 // Protected by d.mu. 1917 doneChans: make([]chan error, 1), 1918 } 1919 dSpan.doneChans[0] = make(chan error, 1) 1920 doneChan := dSpan.doneChans[0] 1921 compactionIdx := 0 1922 1923 func() { 1924 d.mu.Lock() 1925 defer d.mu.Unlock() 1926 1927 d.mu.compact.downloads = append(d.mu.compact.downloads, dSpan) 1928 d.maybeScheduleCompaction() 1929 }() 1930 1931 // Requires d.mu to be held. 
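// noExternalFilesInSpan (below) reports whether no sstable overlapping the
// span is backed by an external object, i.e. whether the download has nothing
// left to do for this span.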
1932 noExternalFilesInSpan := func() (noExternalFiles bool) { 1933 vers := d.mu.versions.currentVersion() 1934 1935 for i := 0; i < len(vers.Levels); i++ { 1936 if vers.Levels[i].Empty() { 1937 continue 1938 } 1939 overlap := vers.Overlaps(i, d.cmp, span.StartKey, span.EndKey, true /* exclusiveEnd */) 1940 foundExternalFile := false 1941 overlap.Each(func(metadata *manifest.FileMetadata) { 1942 objMeta, err := d.objProvider.Lookup(fileTypeTable, metadata.FileBacking.DiskFileNum) 1943 if err != nil { 1944 return 1945 } 1946 if objMeta.IsExternal() { 1947 foundExternalFile = true 1948 } 1949 }) 1950 if foundExternalFile { 1951 return false 1952 } 1953 } 1954 return true 1955 } 1956 1957 // Requires d.mu to be held. 1958 removeUsFromList := func() { 1959 // Check where we are in d.mu.compact.downloads. Remove us from the 1960 // list. 1961 for i := range d.mu.compact.downloads { 1962 if d.mu.compact.downloads[i] != dSpan { 1963 continue 1964 } 1965 copy(d.mu.compact.downloads[i:], d.mu.compact.downloads[i+1:]) 1966 d.mu.compact.downloads = d.mu.compact.downloads[:len(d.mu.compact.downloads)-1] 1967 break 1968 } 1969 } 1970 1971 for { 1972 select { 1973 case <-ctx.Done(): 1974 d.mu.Lock() 1975 defer d.mu.Unlock() 1976 removeUsFromList() 1977 return ctx.Err() 1978 case err := <-doneChan: 1979 if err != nil { 1980 d.mu.Lock() 1981 defer d.mu.Unlock() 1982 removeUsFromList() 1983 return err 1984 } 1985 compactionIdx++ 1986 // Grab the next doneCh to wait on. 1987 func() { 1988 d.mu.Lock() 1989 defer d.mu.Unlock() 1990 doneChan = dSpan.doneChans[compactionIdx] 1991 }() 1992 default: 1993 doneSpan := func() bool { 1994 d.mu.Lock() 1995 defer d.mu.Unlock() 1996 // It's possible to have downloaded all files without writing to any 1997 // doneChans. This is expected if there are a significant number 1998 // of overlapping writes that schedule regular, non-download compactions. 1999 if noExternalFilesInSpan() { 2000 removeUsFromList() 2001 return true 2002 } 2003 d.maybeScheduleCompaction() 2004 d.mu.compact.cond.Wait() 2005 return false 2006 }() 2007 if doneSpan { 2008 return nil 2009 } 2010 } 2011 } 2012 } 2013 2014 // Download ensures that the LSM does not use any external sstables for the 2015 // given key ranges. It does so by performing appropriate compactions so that 2016 // all external data becomes available locally. 2017 // 2018 // Note that calling this method does not imply that all other compactions stop; 2019 // it simply informs Pebble of a list of spans for which external data should be 2020 // downloaded with high priority. 2021 // 2022 // The method returns once no external sstables overlap the given spans, the 2023 // context is canceled, or an error is hit. 2024 // 2025 // TODO(radu): consider passing a priority/impact knob to express how important 2026 // the download is (versus live traffic performance, LSM health). 2027 func (d *DB) Download(ctx context.Context, spans []DownloadSpan) error { 2028 ctx, cancel := context.WithCancel(ctx) 2029 defer cancel() 2030 if err := d.closed.Load(); err != nil { 2031 panic(err) 2032 } 2033 if d.opts.ReadOnly { 2034 return ErrReadOnly 2035 } 2036 for i := range spans { 2037 if err := ctx.Err(); err != nil { 2038 return err 2039 } 2040 if err := d.downloadSpan(ctx, spans[i]); err != nil { 2041 return err 2042 } 2043 } 2044 return nil 2045 } 2046 2047 // Flush the memtable to stable storage.
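//
// A minimal usage sketch (hypothetical caller code, assuming db is an open
// *DB):
//
//	// Flush blocks until the mutable memtable has been persisted to an
//	// sstable (see AsyncFlush for the non-blocking variant).
//	if err := db.Flush(); err != nil {
//		// handle the error
//	}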
2048 func (d *DB) Flush() error { 2049 flushDone, err := d.AsyncFlush() 2050 if err != nil { 2051 return err 2052 } 2053 <-flushDone 2054 return nil 2055 } 2056 2057 // AsyncFlush asynchronously flushes the memtable to stable storage. 2058 // 2059 // If no error is returned, the caller can receive from the returned channel in 2060 // order to wait for the flush to complete. 2061 func (d *DB) AsyncFlush() (<-chan struct{}, error) { 2062 if err := d.closed.Load(); err != nil { 2063 panic(err) 2064 } 2065 if d.opts.ReadOnly { 2066 return nil, ErrReadOnly 2067 } 2068 2069 d.commit.mu.Lock() 2070 defer d.commit.mu.Unlock() 2071 d.mu.Lock() 2072 defer d.mu.Unlock() 2073 flushed := d.mu.mem.queue[len(d.mu.mem.queue)-1].flushed 2074 err := d.makeRoomForWrite(nil) 2075 if err != nil { 2076 return nil, err 2077 } 2078 return flushed, nil 2079 } 2080 2081 // Metrics returns metrics about the database. 2082 func (d *DB) Metrics() *Metrics { 2083 metrics := &Metrics{} 2084 recycledLogsCount, recycledLogSize := d.logRecycler.stats() 2085 2086 d.mu.Lock() 2087 vers := d.mu.versions.currentVersion() 2088 *metrics = d.mu.versions.metrics 2089 metrics.Compact.EstimatedDebt = d.mu.versions.picker.estimatedCompactionDebt(0) 2090 metrics.Compact.InProgressBytes = d.mu.versions.atomicInProgressBytes.Load() 2091 metrics.Compact.NumInProgress = int64(d.mu.compact.compactingCount) 2092 metrics.Compact.MarkedFiles = vers.Stats.MarkedForCompaction 2093 metrics.Compact.Duration = d.mu.compact.duration 2094 for c := range d.mu.compact.inProgress { 2095 if c.kind != compactionKindFlush { 2096 metrics.Compact.Duration += d.timeNow().Sub(c.beganAt) 2097 } 2098 } 2099 2100 for _, m := range d.mu.mem.queue { 2101 metrics.MemTable.Size += m.totalBytes() 2102 } 2103 metrics.Snapshots.Count = d.mu.snapshots.count() 2104 if metrics.Snapshots.Count > 0 { 2105 metrics.Snapshots.EarliestSeqNum = d.mu.snapshots.earliest() 2106 } 2107 metrics.Snapshots.PinnedKeys = d.mu.snapshots.cumulativePinnedCount 2108 metrics.Snapshots.PinnedSize = d.mu.snapshots.cumulativePinnedSize 2109 metrics.MemTable.Count = int64(len(d.mu.mem.queue)) 2110 metrics.MemTable.ZombieCount = d.memTableCount.Load() - metrics.MemTable.Count 2111 metrics.MemTable.ZombieSize = uint64(d.memTableReserved.Load()) - metrics.MemTable.Size 2112 metrics.WAL.ObsoleteFiles = int64(recycledLogsCount) 2113 metrics.WAL.ObsoletePhysicalSize = recycledLogSize 2114 metrics.WAL.Size = d.logSize.Load() 2115 // The current WAL size (d.atomic.logSize) is the current logical size, 2116 // which may be less than the WAL's physical size if it was recycled. 2117 // The file sizes in d.mu.log.queue are updated to the physical size 2118 // during WAL rotation. Use the larger of the two for the current WAL. All 2119 // the previous WALs's fileSizes in d.mu.log.queue are already updated. 
2120 metrics.WAL.PhysicalSize = metrics.WAL.Size 2121 if len(d.mu.log.queue) > 0 && metrics.WAL.PhysicalSize < d.mu.log.queue[len(d.mu.log.queue)-1].fileSize { 2122 metrics.WAL.PhysicalSize = d.mu.log.queue[len(d.mu.log.queue)-1].fileSize 2123 } 2124 for i, n := 0, len(d.mu.log.queue)-1; i < n; i++ { 2125 metrics.WAL.PhysicalSize += d.mu.log.queue[i].fileSize 2126 } 2127 2128 metrics.WAL.BytesIn = d.mu.log.bytesIn // protected by d.mu 2129 for i, n := 0, len(d.mu.mem.queue)-1; i < n; i++ { 2130 metrics.WAL.Size += d.mu.mem.queue[i].logSize 2131 } 2132 metrics.WAL.BytesWritten = metrics.Levels[0].BytesIn + metrics.WAL.Size 2133 if p := d.mu.versions.picker; p != nil { 2134 compactions := d.getInProgressCompactionInfoLocked(nil) 2135 for level, score := range p.getScores(compactions) { 2136 metrics.Levels[level].Score = score 2137 } 2138 } 2139 metrics.Table.ZombieCount = int64(len(d.mu.versions.zombieTables)) 2140 for _, size := range d.mu.versions.zombieTables { 2141 metrics.Table.ZombieSize += size 2142 } 2143 metrics.private.optionsFileSize = d.optionsFileSize 2144 2145 // TODO(jackson): Consider making these metrics optional. 2146 metrics.Keys.RangeKeySetsCount = countRangeKeySetFragments(vers) 2147 metrics.Keys.TombstoneCount = countTombstones(vers) 2148 2149 d.mu.versions.logLock() 2150 metrics.private.manifestFileSize = uint64(d.mu.versions.manifest.Size()) 2151 metrics.Table.BackingTableCount = uint64(len(d.mu.versions.backingState.fileBackingMap)) 2152 metrics.Table.BackingTableSize = d.mu.versions.backingState.fileBackingSize 2153 if invariants.Enabled { 2154 var totalSize uint64 2155 for _, backing := range d.mu.versions.backingState.fileBackingMap { 2156 totalSize += backing.Size 2157 } 2158 if totalSize != metrics.Table.BackingTableSize { 2159 panic("pebble: invalid backing table size accounting") 2160 } 2161 } 2162 d.mu.versions.logUnlock() 2163 2164 metrics.LogWriter.FsyncLatency = d.mu.log.metrics.fsyncLatency 2165 if err := metrics.LogWriter.Merge(&d.mu.log.metrics.LogWriterMetrics); err != nil { 2166 d.opts.Logger.Errorf("metrics error: %s", err) 2167 } 2168 metrics.Flush.WriteThroughput = d.mu.compact.flushWriteThroughput 2169 if d.mu.compact.flushing { 2170 metrics.Flush.NumInProgress = 1 2171 } 2172 for i := 0; i < numLevels; i++ { 2173 metrics.Levels[i].Additional.ValueBlocksSize = valueBlocksSizeForLevel(vers, i) 2174 } 2175 2176 d.mu.Unlock() 2177 2178 metrics.BlockCache = d.opts.Cache.Metrics() 2179 metrics.TableCache, metrics.Filter = d.tableCache.metrics() 2180 metrics.TableIters = int64(d.tableCache.iterCount()) 2181 metrics.CategoryStats = d.tableCache.dbOpts.sstStatsCollector.GetStats() 2182 2183 metrics.SecondaryCacheMetrics = d.objProvider.Metrics() 2184 2185 metrics.Uptime = d.timeNow().Sub(d.openedAt) 2186 2187 return metrics 2188 } 2189 2190 // sstablesOptions holds the optional parameters to retrieve TableInfo for all sstables. 2191 type sstablesOptions struct { 2192 // if set to true, return the sstable properties in TableInfo 2193 withProperties bool 2194 2195 // if set, return sstables that overlap the key range (end-exclusive) 2196 start []byte 2197 end []byte 2198 2199 withApproximateSpanBytes bool 2200 } 2201 2202 // SSTablesOption sets an optional parameter used by `DB.SSTables`. 2203 type SSTablesOption func(*sstablesOptions) 2204 2205 // WithProperties enables returning sstable properties in each TableInfo. 2206 // 2207 // NOTE: if most of the sstable properties need to be read from disk, 2208 // this option may make the `SSTables` method quite slow.
2209 func WithProperties() SSTablesOption { 2210 return func(opt *sstablesOptions) { 2211 opt.withProperties = true 2212 } 2213 } 2214 2215 // WithKeyRangeFilter ensures returned sstables overlap start and end (end-exclusive). 2216 // If start and end are both nil, this filter has no effect. 2217 func WithKeyRangeFilter(start, end []byte) SSTablesOption { 2218 return func(opt *sstablesOptions) { 2219 opt.end = end 2220 opt.start = start 2221 } 2222 } 2223 2224 // WithApproximateSpanBytes enables capturing the approximate number of bytes that 2225 // overlap the provided key span for each sstable. 2226 // NOTE: this option can only be used when WithKeyRangeFilter and WithProperties 2227 // are also provided. 2228 func WithApproximateSpanBytes() SSTablesOption { 2229 return func(opt *sstablesOptions) { 2230 opt.withApproximateSpanBytes = true 2231 } 2232 } 2233 2234 // BackingType denotes the type of storage backing a given sstable. 2235 type BackingType int 2236 2237 const ( 2238 // BackingTypeLocal denotes an sstable stored on local disk according to the 2239 // objprovider. This file is completely owned by us. 2240 BackingTypeLocal BackingType = iota 2241 // BackingTypeShared denotes an sstable stored on shared storage, created 2242 // by this Pebble instance and possibly shared by other Pebble instances. 2243 // These files have their lifecycle managed by Pebble. 2244 BackingTypeShared 2245 // BackingTypeSharedForeign denotes an sstable stored on shared storage, 2246 // created by a Pebble instance other than this one. These files have their 2247 // lifecycle managed by Pebble. 2248 BackingTypeSharedForeign 2249 // BackingTypeExternal denotes an sstable stored on external storage, 2250 // not owned by any Pebble instance and with no refcounting/cleanup methods 2251 // or lifecycle management. An example of an external file is a file restored 2252 // from a backup. 2253 BackingTypeExternal 2254 ) 2255 2256 // SSTableInfo exports manifest.TableInfo with sstable.Properties alongside 2257 // other file backing info. 2258 type SSTableInfo struct { 2259 manifest.TableInfo 2260 // Virtual indicates whether the sstable is virtual. 2261 Virtual bool 2262 // BackingSSTNum is the file number of the backing sstable which 2263 // backs the sstable associated with this SSTableInfo. If Virtual is false, 2264 // then BackingSSTNum == FileNum. 2265 BackingSSTNum base.FileNum 2266 // BackingType is the type of storage backing this sstable. 2267 BackingType BackingType 2268 // Locator is the remote.Locator backing this sstable, if the backing type is 2269 // not BackingTypeLocal. 2270 Locator remote.Locator 2271 2272 // Properties is the sstable properties of this table. If Virtual is true, 2273 // then the Properties are associated with the backing sst. 2274 Properties *sstable.Properties 2275 } 2276 2277 // SSTables retrieves the current sstables. The returned slice is indexed by 2278 // level and each level is indexed by the position of the sstable within the 2279 // level. Note that this information may be out of date due to concurrent 2280 // flushes and compactions.
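//
// A minimal usage sketch (hypothetical caller code, assuming db is an open
// *DB; the key bounds are illustrative):
//
//	// List the sstables overlapping [a, z), including their properties.
//	levels, err := db.SSTables(WithProperties(), WithKeyRangeFilter([]byte("a"), []byte("z")))
//	if err != nil {
//		// handle the error
//	}
//	for level, tables := range levels {
//		for _, t := range tables {
//			_, _ = level, t.Properties // Properties is set because WithProperties was passed.
//		}
//	}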
2281 func (d *DB) SSTables(opts ...SSTablesOption) ([][]SSTableInfo, error) { 2282 opt := &sstablesOptions{} 2283 for _, fn := range opts { 2284 fn(opt) 2285 } 2286 2287 if opt.withApproximateSpanBytes && !opt.withProperties { 2288 return nil, errors.Errorf("Cannot use WithApproximateSpanBytes without WithProperties option.") 2289 } 2290 if opt.withApproximateSpanBytes && (opt.start == nil || opt.end == nil) { 2291 return nil, errors.Errorf("Cannot use WithApproximateSpanBytes without WithKeyRangeFilter option.") 2292 } 2293 2294 // Grab and reference the current readState. 2295 readState := d.loadReadState() 2296 defer readState.unref() 2297 2298 // TODO(peter): This is somewhat expensive, especially on a large 2299 // database. It might be worthwhile to unify TableInfo and FileMetadata and 2300 // then we could simply return current.Files. Note that RocksDB is doing 2301 // something similar to the current code, so perhaps it isn't too bad. 2302 srcLevels := readState.current.Levels 2303 var totalTables int 2304 for i := range srcLevels { 2305 totalTables += srcLevels[i].Len() 2306 } 2307 2308 destTables := make([]SSTableInfo, totalTables) 2309 destLevels := make([][]SSTableInfo, len(srcLevels)) 2310 for i := range destLevels { 2311 iter := srcLevels[i].Iter() 2312 j := 0 2313 for m := iter.First(); m != nil; m = iter.Next() { 2314 if opt.start != nil && opt.end != nil && !m.Overlaps(d.opts.Comparer.Compare, opt.start, opt.end, true /* exclusive end */) { 2315 continue 2316 } 2317 destTables[j] = SSTableInfo{TableInfo: m.TableInfo()} 2318 if opt.withProperties { 2319 p, err := d.tableCache.getTableProperties( 2320 m, 2321 ) 2322 if err != nil { 2323 return nil, err 2324 } 2325 destTables[j].Properties = p 2326 } 2327 destTables[j].Virtual = m.Virtual 2328 destTables[j].BackingSSTNum = m.FileBacking.DiskFileNum.FileNum() 2329 objMeta, err := d.objProvider.Lookup(fileTypeTable, m.FileBacking.DiskFileNum) 2330 if err != nil { 2331 return nil, err 2332 } 2333 if objMeta.IsRemote() { 2334 if objMeta.IsShared() { 2335 if d.objProvider.IsSharedForeign(objMeta) { 2336 destTables[j].BackingType = BackingTypeSharedForeign 2337 } else { 2338 destTables[j].BackingType = BackingTypeShared 2339 } 2340 } else { 2341 destTables[j].BackingType = BackingTypeExternal 2342 } 2343 destTables[j].Locator = objMeta.Remote.Locator 2344 } else { 2345 destTables[j].BackingType = BackingTypeLocal 2346 } 2347 2348 if opt.withApproximateSpanBytes { 2349 var spanBytes uint64 2350 if m.ContainedWithinSpan(d.opts.Comparer.Compare, opt.start, opt.end) { 2351 spanBytes = m.Size 2352 } else { 2353 size, err := d.tableCache.estimateSize(m, opt.start, opt.end) 2354 if err != nil { 2355 return nil, err 2356 } 2357 spanBytes = size 2358 } 2359 propertiesCopy := *destTables[j].Properties 2360 2361 // Deep copy user properties so approximate span bytes can be added. 2362 propertiesCopy.UserProperties = make(map[string]string, len(destTables[j].Properties.UserProperties)+1) 2363 for k, v := range destTables[j].Properties.UserProperties { 2364 propertiesCopy.UserProperties[k] = v 2365 } 2366 propertiesCopy.UserProperties["approximate-span-bytes"] = strconv.FormatUint(spanBytes, 10) 2367 destTables[j].Properties = &propertiesCopy 2368 } 2369 j++ 2370 } 2371 destLevels[i] = destTables[:j] 2372 destTables = destTables[j:] 2373 } 2374 2375 return destLevels, nil 2376 } 2377 2378 // EstimateDiskUsage returns the estimated filesystem space used in bytes for 2379 // storing the range `[start, end]`. 
The estimation is computed as follows: 2380 // 2381 // - For sstables fully contained in the range, the whole file size is included. 2382 // - For sstables partially contained in the range, the overlapping data block sizes 2383 // are included. Even if a data block partially overlaps, or we cannot determine 2384 // overlap due to abbreviated index keys, the full data block size is included in 2385 // the estimation. Note that unlike fully contained sstables, none of the 2386 // meta-block space is counted for partially overlapped files. 2387 // - For virtual sstables, we use the overlap between start, end and the virtual 2388 // sstable bounds to determine disk usage. 2389 // - There may also exist WAL entries for unflushed keys in this range. This 2390 // estimation currently excludes space used for the range in the WAL. 2391 func (d *DB) EstimateDiskUsage(start, end []byte) (uint64, error) { 2392 bytes, _, _, err := d.EstimateDiskUsageByBackingType(start, end) 2393 return bytes, err 2394 } 2395 2396 // EstimateDiskUsageByBackingType is like EstimateDiskUsage but additionally 2397 // returns the subsets of that size in remote and external files. 2398 func (d *DB) EstimateDiskUsageByBackingType( 2399 start, end []byte, 2400 ) (totalSize, remoteSize, externalSize uint64, _ error) { 2401 if err := d.closed.Load(); err != nil { 2402 panic(err) 2403 } 2404 if d.opts.Comparer.Compare(start, end) > 0 { 2405 return 0, 0, 0, errors.New("invalid key-range specified (start > end)") 2406 } 2407 2408 // Grab and reference the current readState. This prevents the underlying 2409 // files in the associated version from being deleted if there is a concurrent 2410 // compaction. 2411 readState := d.loadReadState() 2412 defer readState.unref() 2413 2414 for level, files := range readState.current.Levels { 2415 iter := files.Iter() 2416 if level > 0 { 2417 // We can only use `Overlaps` to restrict `files` at L1+ since at L0 it 2418 // expands the range iteratively until it has found a set of files that 2419 // do not overlap any other L0 files outside that set. 2420 overlaps := readState.current.Overlaps(level, d.opts.Comparer.Compare, start, end, false /* exclusiveEnd */) 2421 iter = overlaps.Iter() 2422 } 2423 for file := iter.First(); file != nil; file = iter.Next() { 2424 if d.opts.Comparer.Compare(start, file.Smallest.UserKey) <= 0 && 2425 d.opts.Comparer.Compare(file.Largest.UserKey, end) <= 0 { 2426 // The range fully contains the file, so skip looking it up in the 2427 // table cache or examining its indexes, and add the full file size.
2428 meta, err := d.objProvider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum) 2429 if err != nil { 2430 return 0, 0, 0, err 2431 } 2432 if meta.IsRemote() { 2433 remoteSize += file.Size 2434 if meta.Remote.CleanupMethod == objstorage.SharedNoCleanup { 2435 externalSize += file.Size 2436 } 2437 } 2438 totalSize += file.Size 2439 } else if d.opts.Comparer.Compare(file.Smallest.UserKey, end) <= 0 && 2440 d.opts.Comparer.Compare(start, file.Largest.UserKey) <= 0 { 2441 var size uint64 2442 var err error 2443 if file.Virtual { 2444 err = d.tableCache.withVirtualReader( 2445 file.VirtualMeta(), 2446 func(r sstable.VirtualReader) (err error) { 2447 size, err = r.EstimateDiskUsage(start, end) 2448 return err 2449 }, 2450 ) 2451 } else { 2452 err = d.tableCache.withReader( 2453 file.PhysicalMeta(), 2454 func(r *sstable.Reader) (err error) { 2455 size, err = r.EstimateDiskUsage(start, end) 2456 return err 2457 }, 2458 ) 2459 } 2460 if err != nil { 2461 return 0, 0, 0, err 2462 } 2463 meta, err := d.objProvider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum) 2464 if err != nil { 2465 return 0, 0, 0, err 2466 } 2467 if meta.IsRemote() { 2468 remoteSize += size 2469 if meta.Remote.CleanupMethod == objstorage.SharedNoCleanup { 2470 externalSize += size 2471 } 2472 } 2473 totalSize += size 2474 } 2475 } 2476 } 2477 return totalSize, remoteSize, externalSize, nil 2478 } 2479 2480 func (d *DB) walPreallocateSize() int { 2481 // Set the WAL preallocate size to 110% of the memtable size. Note that there 2482 // is a bit of apples and oranges in units here as the memtable size 2483 // corresponds to the memory usage of the memtable while the WAL size is the 2484 // size of the batches (plus overhead) stored in the WAL. 2485 // 2486 // TODO(peter): 110% of the memtable size is quite hefty for a block 2487 // size. This logic is taken from GetWalPreallocateBlockSize in 2488 // RocksDB. Could a smaller preallocation block size be used? 2489 size := d.opts.MemTableSize 2490 size = (size / 10) + size 2491 return int(size) 2492 } 2493 2494 func (d *DB) newMemTable(logNum base.DiskFileNum, logSeqNum uint64) (*memTable, *flushableEntry) { 2495 size := d.mu.mem.nextSize 2496 if d.mu.mem.nextSize < d.opts.MemTableSize { 2497 d.mu.mem.nextSize *= 2 2498 if d.mu.mem.nextSize > d.opts.MemTableSize { 2499 d.mu.mem.nextSize = d.opts.MemTableSize 2500 } 2501 } 2502 2503 memtblOpts := memTableOptions{ 2504 Options: d.opts, 2505 logSeqNum: logSeqNum, 2506 } 2507 2508 // Before attempting to allocate a new memtable, check if there's one 2509 // available for recycling in memTableRecycle. Large contiguous allocations 2510 // can be costly as fragmentation makes it more difficult to find a large 2511 // contiguous free space. We've observed 64MB allocations taking 10ms+. 2512 // 2513 // To reduce these costly allocations, up to 1 obsolete memtable is stashed 2514 // in `d.memTableRecycle` to allow a future memtable rotation to reuse 2515 // existing memory. 2516 var mem *memTable 2517 mem = d.memTableRecycle.Swap(nil) 2518 if mem != nil && uint64(len(mem.arenaBuf)) != size { 2519 d.freeMemTable(mem) 2520 mem = nil 2521 } 2522 if mem != nil { 2523 // Carry through the existing buffer and memory reservation.
2524 memtblOpts.arenaBuf = mem.arenaBuf 2525 memtblOpts.releaseAccountingReservation = mem.releaseAccountingReservation 2526 } else { 2527 mem = new(memTable) 2528 memtblOpts.arenaBuf = manual.New(int(size)) 2529 memtblOpts.releaseAccountingReservation = d.opts.Cache.Reserve(int(size)) 2530 d.memTableCount.Add(1) 2531 d.memTableReserved.Add(int64(size)) 2532 2533 // Note: this is a no-op if invariants are disabled or race is enabled. 2534 invariants.SetFinalizer(mem, checkMemTable) 2535 } 2536 mem.init(memtblOpts) 2537 2538 entry := d.newFlushableEntry(mem, logNum, logSeqNum) 2539 entry.releaseMemAccounting = func() { 2540 // If the user leaks iterators, we may be releasing the memtable after 2541 // the DB is already closed. In this case, we want to just release the 2542 // memory because DB.Close won't come along to free it for us. 2543 if err := d.closed.Load(); err != nil { 2544 d.freeMemTable(mem) 2545 return 2546 } 2547 2548 // The next memtable allocation might be able to reuse this memtable. 2549 // Stash it on d.memTableRecycle. 2550 if unusedMem := d.memTableRecycle.Swap(mem); unusedMem != nil { 2551 // There was already a memtable waiting to be recycled. We're now 2552 // responsible for freeing it. 2553 d.freeMemTable(unusedMem) 2554 } 2555 } 2556 return mem, entry 2557 } 2558 2559 func (d *DB) freeMemTable(m *memTable) { 2560 d.memTableCount.Add(-1) 2561 d.memTableReserved.Add(-int64(len(m.arenaBuf))) 2562 m.free() 2563 } 2564 2565 func (d *DB) newFlushableEntry( 2566 f flushable, logNum base.DiskFileNum, logSeqNum uint64, 2567 ) *flushableEntry { 2568 fe := &flushableEntry{ 2569 flushable: f, 2570 flushed: make(chan struct{}), 2571 logNum: logNum, 2572 logSeqNum: logSeqNum, 2573 deleteFn: d.mu.versions.addObsolete, 2574 deleteFnLocked: d.mu.versions.addObsoleteLocked, 2575 } 2576 fe.readerRefs.Store(1) 2577 return fe 2578 } 2579 2580 // makeRoomForWrite ensures that the memtable has room to hold the contents of 2581 // Batch. It reserves the space in the memtable and adds a reference to the 2582 // memtable. The caller must later ensure that the memtable is unreferenced. If 2583 // the memtable is full, or a nil Batch is provided, the current memtable is 2584 // rotated (marked as immutable) and a new mutable memtable is allocated. This 2585 // memtable rotation also causes a log rotation. 2586 // 2587 // Both DB.mu and commitPipeline.mu must be held by the caller. Note that DB.mu 2588 // may be released and reacquired. 2589 func (d *DB) makeRoomForWrite(b *Batch) error { 2590 if b != nil && b.ingestedSSTBatch { 2591 panic("pebble: invalid function call") 2592 } 2593 2594 force := b == nil || b.flushable != nil 2595 stalled := false 2596 for { 2597 if b != nil && b.flushable == nil { 2598 err := d.mu.mem.mutable.prepare(b) 2599 if err != arenaskl.ErrArenaFull { 2600 if stalled { 2601 d.opts.EventListener.WriteStallEnd() 2602 } 2603 return err 2604 } 2605 } else if !force { 2606 if stalled { 2607 d.opts.EventListener.WriteStallEnd() 2608 } 2609 return nil 2610 } 2611 // force || err == ErrArenaFull, so we need to rotate the current memtable. 2612 { 2613 var size uint64 2614 for i := range d.mu.mem.queue { 2615 size += d.mu.mem.queue[i].totalBytes() 2616 } 2617 if size >= uint64(d.opts.MemTableStopWritesThreshold)*d.opts.MemTableSize { 2618 // We have filled up the current memtable, but already queued memtables 2619 // are still flushing, so we wait. 
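// Record the start of the write stall once, then block on the compaction
// condition variable until a flush frees up the memtable queue, accumulating
// the stall duration into the batch's commit stats before retrying.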
2620 if !stalled { 2621 stalled = true 2622 d.opts.EventListener.WriteStallBegin(WriteStallBeginInfo{ 2623 Reason: "memtable count limit reached", 2624 }) 2625 } 2626 now := time.Now() 2627 d.mu.compact.cond.Wait() 2628 if b != nil { 2629 b.commitStats.MemTableWriteStallDuration += time.Since(now) 2630 } 2631 continue 2632 } 2633 } 2634 l0ReadAmp := d.mu.versions.currentVersion().L0Sublevels.ReadAmplification() 2635 if l0ReadAmp >= d.opts.L0StopWritesThreshold { 2636 // There are too many level-0 files, so we wait. 2637 if !stalled { 2638 stalled = true 2639 d.opts.EventListener.WriteStallBegin(WriteStallBeginInfo{ 2640 Reason: "L0 file count limit exceeded", 2641 }) 2642 } 2643 now := time.Now() 2644 d.mu.compact.cond.Wait() 2645 if b != nil { 2646 b.commitStats.L0ReadAmpWriteStallDuration += time.Since(now) 2647 } 2648 continue 2649 } 2650 2651 var newLogNum base.DiskFileNum 2652 var prevLogSize uint64 2653 if !d.opts.DisableWAL { 2654 now := time.Now() 2655 newLogNum, prevLogSize = d.recycleWAL() 2656 if b != nil { 2657 b.commitStats.WALRotationDuration += time.Since(now) 2658 } 2659 } 2660 2661 immMem := d.mu.mem.mutable 2662 imm := d.mu.mem.queue[len(d.mu.mem.queue)-1] 2663 imm.logSize = prevLogSize 2664 imm.flushForced = imm.flushForced || (b == nil) 2665 2666 // If we are manually flushing and we used less than half of the bytes in 2667 // the memtable, don't increase the size for the next memtable. This 2668 // reduces memtable memory pressure when an application is frequently 2669 // manually flushing. 2670 if (b == nil) && uint64(immMem.availBytes()) > immMem.totalBytes()/2 { 2671 d.mu.mem.nextSize = immMem.totalBytes() 2672 } 2673 2674 if b != nil && b.flushable != nil { 2675 // The batch is too large to fit in the memtable so add it directly to 2676 // the immutable queue. The flushable batch is associated with the same 2677 // log as the immutable memtable, but logically occurs after it in 2678 // seqnum space. We ensure while flushing that the flushable batch 2679 // is flushed along with the previous memtable in the flushable 2680 // queue. See the top level comment in DB.flush1 to learn how this 2681 // is ensured. 2682 // 2683 // See DB.commitWrite for the special handling of log writes for large 2684 // batches. In particular, the large batch has already written to 2685 // imm.logNum. 2686 entry := d.newFlushableEntry(b.flushable, imm.logNum, b.SeqNum()) 2687 // The large batch is by definition large. Reserve space from the cache 2688 // for it until it is flushed. 2689 entry.releaseMemAccounting = d.opts.Cache.Reserve(int(b.flushable.totalBytes())) 2690 d.mu.mem.queue = append(d.mu.mem.queue, entry) 2691 } 2692 2693 var logSeqNum uint64 2694 if b != nil { 2695 logSeqNum = b.SeqNum() 2696 if b.flushable != nil { 2697 logSeqNum += uint64(b.Count()) 2698 } 2699 } else { 2700 logSeqNum = d.mu.versions.logSeqNum.Load() 2701 } 2702 d.rotateMemtable(newLogNum, logSeqNum, immMem) 2703 force = false 2704 } 2705 } 2706 2707 // Both DB.mu and commitPipeline.mu must be held by the caller. 2708 func (d *DB) rotateMemtable(newLogNum base.DiskFileNum, logSeqNum uint64, prev *memTable) { 2709 // Create a new memtable, scheduling the previous one for flushing. We do 2710 // this even if the previous memtable was empty because the DB.Flush 2711 // mechanism is dependent on being able to wait for the empty memtable to 2712 // flush. We can't just mark the empty memtable as flushed here because we 2713 // also have to wait for all previous immutable tables to 2714 // flush. 
Additionally, the memtable is tied to particular WAL file and we 2715 // want to go through the flush path in order to recycle that WAL file. 2716 // 2717 // NB: newLogNum corresponds to the WAL that contains mutations that are 2718 // present in the new memtable. When immutable memtables are flushed to 2719 // disk, a VersionEdit will be created telling the manifest the minimum 2720 // unflushed log number (which will be the next one in d.mu.mem.mutable 2721 // that was not flushed). 2722 // 2723 // NB: prev should be the current mutable memtable. 2724 var entry *flushableEntry 2725 d.mu.mem.mutable, entry = d.newMemTable(newLogNum, logSeqNum) 2726 d.mu.mem.queue = append(d.mu.mem.queue, entry) 2727 d.updateReadStateLocked(nil) 2728 if prev.writerUnref() { 2729 d.maybeScheduleFlush() 2730 } 2731 } 2732 2733 // Both DB.mu and commitPipeline.mu must be held by the caller. Note that DB.mu 2734 // may be released and reacquired. 2735 func (d *DB) recycleWAL() (newLogNum base.DiskFileNum, prevLogSize uint64) { 2736 if d.opts.DisableWAL { 2737 panic("pebble: invalid function call") 2738 } 2739 2740 jobID := d.mu.nextJobID 2741 d.mu.nextJobID++ 2742 newLogNum = d.mu.versions.getNextDiskFileNum() 2743 2744 prevLogSize = uint64(d.mu.log.Size()) 2745 2746 // The previous log may have grown past its original physical 2747 // size. Update its file size in the queue so we have a proper 2748 // accounting of its file size. 2749 if d.mu.log.queue[len(d.mu.log.queue)-1].fileSize < prevLogSize { 2750 d.mu.log.queue[len(d.mu.log.queue)-1].fileSize = prevLogSize 2751 } 2752 d.mu.Unlock() 2753 2754 var err error 2755 // Close the previous log first. This writes an EOF trailer 2756 // signifying the end of the file and syncs it to disk. We must 2757 // close the previous log before linking the new log file, 2758 // otherwise a crash could leave both logs with unclean tails, and 2759 // Open will treat the previous log as corrupt. 2760 err = d.mu.log.LogWriter.Close() 2761 metrics := d.mu.log.LogWriter.Metrics() 2762 d.mu.Lock() 2763 if err := d.mu.log.metrics.Merge(metrics); err != nil { 2764 d.opts.Logger.Errorf("metrics error: %s", err) 2765 } 2766 d.mu.Unlock() 2767 2768 newLogName := base.MakeFilepath(d.opts.FS, d.walDirname, fileTypeLog, newLogNum) 2769 2770 // Try to use a recycled log file. Recycling log files is an important 2771 // performance optimization as it is faster to sync a file that has 2772 // already been written, than one which is being written for the first 2773 // time. This is due to the need to sync file metadata when a file is 2774 // being written for the first time. Note this is true even if file 2775 // preallocation is performed (e.g. fallocate). 2776 var recycleLog fileInfo 2777 var recycleOK bool 2778 var newLogFile vfs.File 2779 if err == nil { 2780 recycleLog, recycleOK = d.logRecycler.peek() 2781 if recycleOK { 2782 recycleLogName := base.MakeFilepath(d.opts.FS, d.walDirname, fileTypeLog, recycleLog.fileNum) 2783 newLogFile, err = d.opts.FS.ReuseForWrite(recycleLogName, newLogName) 2784 base.MustExist(d.opts.FS, newLogName, d.opts.Logger, err) 2785 } else { 2786 newLogFile, err = d.opts.FS.Create(newLogName) 2787 base.MustExist(d.opts.FS, newLogName, d.opts.Logger, err) 2788 } 2789 } 2790 2791 var newLogSize uint64 2792 if err == nil && recycleOK { 2793 // Figure out the recycled WAL size. This Stat is necessary 2794 // because ReuseForWrite's contract allows for removing the 2795 // old file and creating a new one. 
We don't know whether the 2796 // WAL was actually recycled. 2797 // TODO(jackson): Adding a boolean to the ReuseForWrite return 2798 // value indicating whether or not the file was actually 2799 // reused would allow us to skip the stat and use 2800 // recycleLog.fileSize. 2801 var finfo os.FileInfo 2802 finfo, err = newLogFile.Stat() 2803 if err == nil { 2804 newLogSize = uint64(finfo.Size()) 2805 } 2806 } 2807 2808 if err == nil { 2809 // TODO(peter): RocksDB delays sync of the parent directory until the 2810 // first time the log is synced. Is that worthwhile? 2811 err = d.walDir.Sync() 2812 } 2813 2814 if err != nil && newLogFile != nil { 2815 newLogFile.Close() 2816 } else if err == nil { 2817 newLogFile = vfs.NewSyncingFile(newLogFile, vfs.SyncingFileOptions{ 2818 NoSyncOnClose: d.opts.NoSyncOnClose, 2819 BytesPerSync: d.opts.WALBytesPerSync, 2820 PreallocateSize: d.walPreallocateSize(), 2821 }) 2822 } 2823 2824 if recycleOK { 2825 err = firstError(err, d.logRecycler.pop(recycleLog.fileNum.FileNum())) 2826 } 2827 2828 d.opts.EventListener.WALCreated(WALCreateInfo{ 2829 JobID: jobID, 2830 Path: newLogName, 2831 FileNum: newLogNum, 2832 RecycledFileNum: recycleLog.fileNum.FileNum(), 2833 Err: err, 2834 }) 2835 2836 d.mu.Lock() 2837 2838 d.mu.versions.metrics.WAL.Files++ 2839 2840 if err != nil { 2841 // TODO(peter): avoid chewing through file numbers in a tight loop if there 2842 // is an error here. 2843 // 2844 // What to do here? Stumbling on doesn't seem worthwhile. If we failed to 2845 // close the previous log it is possible we lost a write. 2846 panic(err) 2847 } 2848 2849 d.mu.log.queue = append(d.mu.log.queue, fileInfo{fileNum: newLogNum, fileSize: newLogSize}) 2850 d.mu.log.LogWriter = record.NewLogWriter(newLogFile, newLogNum, record.LogWriterConfig{ 2851 WALFsyncLatency: d.mu.log.metrics.fsyncLatency, 2852 WALMinSyncInterval: d.opts.WALMinSyncInterval, 2853 QueueSemChan: d.commit.logSyncQSem, 2854 }) 2855 if d.mu.log.registerLogWriterForTesting != nil { 2856 d.mu.log.registerLogWriterForTesting(d.mu.log.LogWriter) 2857 } 2858 2859 return 2860 } 2861 2862 func (d *DB) getEarliestUnflushedSeqNumLocked() uint64 { 2863 seqNum := InternalKeySeqNumMax 2864 for i := range d.mu.mem.queue { 2865 logSeqNum := d.mu.mem.queue[i].logSeqNum 2866 if seqNum > logSeqNum { 2867 seqNum = logSeqNum 2868 } 2869 } 2870 return seqNum 2871 } 2872 2873 func (d *DB) getInProgressCompactionInfoLocked(finishing *compaction) (rv []compactionInfo) { 2874 for c := range d.mu.compact.inProgress { 2875 if len(c.flushing) == 0 && (finishing == nil || c != finishing) { 2876 info := compactionInfo{ 2877 versionEditApplied: c.versionEditApplied, 2878 inputs: c.inputs, 2879 smallest: c.smallest, 2880 largest: c.largest, 2881 outputLevel: -1, 2882 } 2883 if c.outputLevel != nil { 2884 info.outputLevel = c.outputLevel.level 2885 } 2886 rv = append(rv, info) 2887 } 2888 } 2889 return 2890 } 2891 2892 func inProgressL0Compactions(inProgress []compactionInfo) []manifest.L0Compaction { 2893 var compactions []manifest.L0Compaction 2894 for _, info := range inProgress { 2895 // Skip in-progress compactions that have already committed; the L0 2896 // sublevels initialization code requires the set of in-progress 2897 // compactions to be consistent with the current version. Compactions 2898 // with versionEditApplied=true are already applied to the current 2899 // version and but are performing cleanup without the database mutex. 
2900 if info.versionEditApplied { 2901 continue 2902 } 2903 l0 := false 2904 for _, cl := range info.inputs { 2905 l0 = l0 || cl.level == 0 2906 } 2907 if !l0 { 2908 continue 2909 } 2910 compactions = append(compactions, manifest.L0Compaction{ 2911 Smallest: info.smallest, 2912 Largest: info.largest, 2913 IsIntraL0: info.outputLevel == 0, 2914 }) 2915 } 2916 return compactions 2917 } 2918 2919 // firstError returns the first non-nil error of err0 and err1, or nil if both 2920 // are nil. 2921 func firstError(err0, err1 error) error { 2922 if err0 != nil { 2923 return err0 2924 } 2925 return err1 2926 } 2927 2928 // SetCreatorID sets the CreatorID which is needed in order to use shared objects. 2929 // Remote object usage is disabled until this method is called the first time. 2930 // Once set, the Creator ID is persisted and cannot change. 2931 // 2932 // Does nothing if SharedStorage was not set in the options when the DB was 2933 // opened or if the DB is in read-only mode. 2934 func (d *DB) SetCreatorID(creatorID uint64) error { 2935 if d.opts.Experimental.RemoteStorage == nil || d.opts.ReadOnly { 2936 return nil 2937 } 2938 return d.objProvider.SetCreatorID(objstorage.CreatorID(creatorID)) 2939 } 2940 2941 // KeyStatistics keeps track of the number of keys that have been pinned by a 2942 // snapshot as well as counts of the different key kinds in the lsm. 2943 // 2944 // One way of using the accumulated stats, when we only have sets and dels, 2945 // and say the counts are represented as del_count, set_count, 2946 // del_latest_count, set_latest_count, snapshot_pinned_count. 2947 // 2948 // - del_latest_count + set_latest_count is the set of unique user keys 2949 // (unique). 2950 // 2951 // - set_latest_count is the set of live unique user keys (live_unique). 2952 // 2953 // - Garbage is del_count + set_count - live_unique. 2954 // 2955 // - If everything were in the LSM, del_count+set_count-snapshot_pinned_count 2956 // would also be the set of unique user keys (note that 2957 // snapshot_pinned_count is counting something different -- see comment below). 2958 // But snapshot_pinned_count only counts keys in the LSM so the excess here 2959 // must be keys in memtables. 2960 type KeyStatistics struct { 2961 // TODO(sumeer): the SnapshotPinned* are incorrect in that these older 2962 // versions can be in a different level. Either fix the accounting or 2963 // rename these fields. 2964 2965 // SnapshotPinnedKeys represents obsolete keys that cannot be elided during 2966 // a compaction, because they are required by an open snapshot. 2967 SnapshotPinnedKeys int 2968 // SnapshotPinnedKeysBytes is the total number of bytes of all snapshot 2969 // pinned keys. 2970 SnapshotPinnedKeysBytes uint64 2971 // KindsCount is the count for each kind of key. It includes point keys, 2972 // range deletes and range keys. 2973 KindsCount [InternalKeyKindMax + 1]int 2974 // LatestKindsCount is the count for each kind of key when it is the latest 2975 // kind for a user key. It is only populated for point keys. 2976 LatestKindsCount [InternalKeyKindMax + 1]int 2977 } 2978 2979 // LSMKeyStatistics is used by DB.ScanStatistics. 2980 type LSMKeyStatistics struct { 2981 Accumulated KeyStatistics 2982 // Levels contains statistics only for point keys. Range deletions and range keys will 2983 // appear in Accumulated but not Levels. 
2984 Levels [numLevels]KeyStatistics 2985 // BytesRead represents the logical, pre-compression size of keys and values read 2986 BytesRead uint64 2987 } 2988 2989 // ScanStatisticsOptions is used by DB.ScanStatistics. 2990 type ScanStatisticsOptions struct { 2991 // LimitBytesPerSecond indicates the number of bytes that are able to be read 2992 // per second using ScanInternal. 2993 // A value of 0 indicates that there is no limit set. 2994 LimitBytesPerSecond int64 2995 } 2996 2997 // ScanStatistics returns the count of different key kinds within the lsm for a 2998 // key span [lower, upper) as well as the number of snapshot keys. 2999 func (d *DB) ScanStatistics( 3000 ctx context.Context, lower, upper []byte, opts ScanStatisticsOptions, 3001 ) (LSMKeyStatistics, error) { 3002 stats := LSMKeyStatistics{} 3003 var prevKey InternalKey 3004 var rateLimitFunc func(key *InternalKey, val LazyValue) error 3005 tb := tokenbucket.TokenBucket{} 3006 3007 if opts.LimitBytesPerSecond != 0 { 3008 // Each "token" roughly corresponds to a byte that was read. 3009 tb.Init(tokenbucket.TokensPerSecond(opts.LimitBytesPerSecond), tokenbucket.Tokens(1024)) 3010 rateLimitFunc = func(key *InternalKey, val LazyValue) error { 3011 return tb.WaitCtx(ctx, tokenbucket.Tokens(key.Size()+val.Len())) 3012 } 3013 } 3014 3015 scanInternalOpts := &scanInternalOptions{ 3016 visitPointKey: func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error { 3017 // If the previous key is equal to the current point key, the current key was 3018 // pinned by a snapshot. 3019 size := uint64(key.Size()) 3020 kind := key.Kind() 3021 sameKey := d.equal(prevKey.UserKey, key.UserKey) 3022 if iterInfo.Kind == IteratorLevelLSM && sameKey { 3023 stats.Levels[iterInfo.Level].SnapshotPinnedKeys++ 3024 stats.Levels[iterInfo.Level].SnapshotPinnedKeysBytes += size 3025 stats.Accumulated.SnapshotPinnedKeys++ 3026 stats.Accumulated.SnapshotPinnedKeysBytes += size 3027 } 3028 if iterInfo.Kind == IteratorLevelLSM { 3029 stats.Levels[iterInfo.Level].KindsCount[kind]++ 3030 } 3031 if !sameKey { 3032 if iterInfo.Kind == IteratorLevelLSM { 3033 stats.Levels[iterInfo.Level].LatestKindsCount[kind]++ 3034 } 3035 stats.Accumulated.LatestKindsCount[kind]++ 3036 } 3037 3038 stats.Accumulated.KindsCount[kind]++ 3039 prevKey.CopyFrom(*key) 3040 stats.BytesRead += uint64(key.Size() + value.Len()) 3041 return nil 3042 }, 3043 visitRangeDel: func(start, end []byte, seqNum uint64) error { 3044 stats.Accumulated.KindsCount[InternalKeyKindRangeDelete]++ 3045 stats.BytesRead += uint64(len(start) + len(end)) 3046 return nil 3047 }, 3048 visitRangeKey: func(start, end []byte, keys []rangekey.Key) error { 3049 stats.BytesRead += uint64(len(start) + len(end)) 3050 for _, key := range keys { 3051 stats.Accumulated.KindsCount[key.Kind()]++ 3052 stats.BytesRead += uint64(len(key.Value) + len(key.Suffix)) 3053 } 3054 return nil 3055 }, 3056 includeObsoleteKeys: true, 3057 IterOptions: IterOptions{ 3058 KeyTypes: IterKeyTypePointsAndRanges, 3059 LowerBound: lower, 3060 UpperBound: upper, 3061 }, 3062 rateLimitFunc: rateLimitFunc, 3063 } 3064 iter, err := d.newInternalIter(ctx, snapshotIterOpts{}, scanInternalOpts) 3065 if err != nil { 3066 return LSMKeyStatistics{}, err 3067 } 3068 defer iter.close() 3069 3070 err = scanInternalImpl(ctx, lower, upper, iter, scanInternalOpts) 3071 3072 if err != nil { 3073 return LSMKeyStatistics{}, err 3074 } 3075 3076 return stats, nil 3077 } 3078 3079 // ObjProvider returns the objstorage.Provider for this database. 
3079 // ObjProvider returns the objstorage.Provider for this database. Meant to be
3080 // used for internal purposes only.
3081 func (d *DB) ObjProvider() objstorage.Provider {
3082 	return d.objProvider
3083 }
3084 
3085 func (d *DB) checkVirtualBounds(m *fileMetadata) {
3086 	if !invariants.Enabled {
3087 		return
3088 	}
3089 
3090 	objMeta, err := d.objProvider.Lookup(fileTypeTable, m.FileBacking.DiskFileNum)
3091 	if err != nil {
3092 		panic(err)
3093 	}
3094 	if objMeta.IsExternal() {
3095 		// Nothing to do; bounds are expected to be loose.
3096 		return
3097 	}
3098 
3099 	if m.HasPointKeys {
3100 		pointIter, rangeDelIter, err := d.newIters(context.TODO(), m, nil, internalIterOpts{})
3101 		if err != nil {
3102 			panic(errors.Wrap(err, "pebble: error creating point iterator"))
3103 		}
3104 
3105 		defer pointIter.Close()
3106 		if rangeDelIter != nil {
3107 			defer rangeDelIter.Close()
3108 		}
3109 
3110 		pointKey, _ := pointIter.First()
3111 		var rangeDel *keyspan.Span
3112 		if rangeDelIter != nil {
3113 			rangeDel = rangeDelIter.First()
3114 		}
3115 
3116 		// Check that the lower bound is tight.
3117 		if (rangeDel == nil || d.cmp(rangeDel.SmallestKey().UserKey, m.SmallestPointKey.UserKey) != 0) &&
3118 			(pointKey == nil || d.cmp(pointKey.UserKey, m.SmallestPointKey.UserKey) != 0) {
3119 			panic(errors.Newf("pebble: virtual sstable %s lower point key bound is not tight", m.FileNum))
3120 		}
3121 
3122 		pointKey, _ = pointIter.Last()
3123 		rangeDel = nil
3124 		if rangeDelIter != nil {
3125 			rangeDel = rangeDelIter.Last()
3126 		}
3127 
3128 		// Check that the upper bound is tight.
3129 		if (rangeDel == nil || d.cmp(rangeDel.LargestKey().UserKey, m.LargestPointKey.UserKey) != 0) &&
3130 			(pointKey == nil || d.cmp(pointKey.UserKey, m.LargestPointKey.UserKey) != 0) {
3131 			panic(errors.Newf("pebble: virtual sstable %s upper point key bound is not tight", m.FileNum))
3132 		}
3133 
3134 		// Check that iterator keys are within bounds.
3135 		for key, _ := pointIter.First(); key != nil; key, _ = pointIter.Next() {
3136 			if d.cmp(key.UserKey, m.SmallestPointKey.UserKey) < 0 || d.cmp(key.UserKey, m.LargestPointKey.UserKey) > 0 {
3137 				panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, key.UserKey))
3138 			}
3139 		}
3140 
3141 		if rangeDelIter != nil {
3142 			for key := rangeDelIter.First(); key != nil; key = rangeDelIter.Next() {
3143 				if d.cmp(key.SmallestKey().UserKey, m.SmallestPointKey.UserKey) < 0 {
3144 					panic(errors.Newf("pebble: virtual sstable %s range deletion %s is not within bounds", m.FileNum, key.SmallestKey().UserKey))
3145 				}
3146 
3147 				if d.cmp(key.LargestKey().UserKey, m.LargestPointKey.UserKey) > 0 {
3148 					panic(errors.Newf("pebble: virtual sstable %s range deletion %s is not within bounds", m.FileNum, key.LargestKey().UserKey))
3149 				}
3150 			}
3151 		}
3152 	}
3153 
3154 	if !m.HasRangeKeys {
3155 		return
3156 	}
3157 
3158 	rangeKeyIter, err := d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{})
3159 	if err != nil {
3160 		panic(errors.Wrap(err, "pebble: error creating range key iterator"))
3161 	}
3162 	defer rangeKeyIter.Close()
3163 
3164 	// Check that the lower bound is tight.
3165 	if d.cmp(rangeKeyIter.First().SmallestKey().UserKey, m.SmallestRangeKey.UserKey) != 0 {
3166 		panic(errors.Newf("pebble: virtual sstable %s lower range key bound is not tight", m.FileNum))
3167 	}
3168 
3169 	// Check that the upper bound is tight.
3170 	if d.cmp(rangeKeyIter.Last().LargestKey().UserKey, m.LargestRangeKey.UserKey) != 0 {
3171 		panic(errors.Newf("pebble: virtual sstable %s upper range key bound is not tight", m.FileNum))
3172 	}
3173 
3174 	for key := rangeKeyIter.First(); key != nil; key = rangeKeyIter.Next() {
3175 		if d.cmp(key.SmallestKey().UserKey, m.SmallestRangeKey.UserKey) < 0 {
3176 			panic(errors.Newf("pebble: virtual sstable %s range key %s is not within bounds", m.FileNum, key.SmallestKey().UserKey))
3177 		}
3178 		if d.cmp(key.LargestKey().UserKey, m.LargestRangeKey.UserKey) > 0 {
3179 			panic(errors.Newf("pebble: virtual sstable %s range key %s is not within bounds", m.FileNum, key.LargestKey().UserKey))
3180 		}
3181 	}
3182 }