github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/engine.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package storage

import (
    "context"
    "fmt"
    "path/filepath"
    "time"

    "github.com/cockroachdb/cockroach/pkg/base"
    "github.com/cockroachdb/cockroach/pkg/roachpb"
    "github.com/cockroachdb/cockroach/pkg/settings"
    "github.com/cockroachdb/cockroach/pkg/settings/cluster"
    "github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    "github.com/cockroachdb/cockroach/pkg/storage/fs"
    "github.com/cockroachdb/cockroach/pkg/util/envutil"
    "github.com/cockroachdb/cockroach/pkg/util/hlc"
    "github.com/cockroachdb/cockroach/pkg/util/log"
    "github.com/cockroachdb/cockroach/pkg/util/protoutil"
    "github.com/cockroachdb/errors"
    "github.com/cockroachdb/pebble"
)

// DefaultStorageEngine represents the default storage engine to use.
var DefaultStorageEngine enginepb.EngineType

func init() {
    _ = DefaultStorageEngine.Set(envutil.EnvOrDefaultString("COCKROACH_STORAGE_ENGINE", "default"))
}

// SimpleIterator is an interface for iterating over key/value pairs in an
// engine. SimpleIterator implementations are thread safe unless otherwise
// noted. SimpleIterator is a subset of the functionality offered by Iterator.
type SimpleIterator interface {
    // Close frees up resources held by the iterator.
    Close()
    // SeekGE advances the iterator to the first key in the engine which
    // is >= the provided key.
    SeekGE(key MVCCKey)
    // Valid must be called after any call to Seek(), Next(), Prev(), or
    // similar methods. It returns (true, nil) if the iterator points to
    // a valid key (it is undefined to call Key(), Value(), or similar
    // methods unless Valid() has returned (true, nil)). It returns
    // (false, nil) if the iterator has moved past the end of the valid
    // range, or (false, err) if an error has occurred. Valid() will
    // never return true with a non-nil error.
    Valid() (bool, error)
    // Next advances the iterator to the next key/value in the
    // iteration. After this call, Valid() will be true if the
    // iterator was not positioned at the last key.
    Next()
    // NextKey advances the iterator to the next MVCC key. This operation is
    // distinct from Next which advances to the next version of the current key
    // or the next key if the iterator is currently located at the last version
    // for a key.
    NextKey()
    // UnsafeKey returns the same value as Key, but the memory is invalidated on
    // the next call to {Next,Prev,Seek,SeekReverse,Close}.
    UnsafeKey() MVCCKey
    // UnsafeValue returns the same value as Value, but the memory is
    // invalidated on the next call to {Next,Prev,Seek,SeekReverse,Close}.
    UnsafeValue() []byte
}

// IteratorStats is returned from (Iterator).Stats.
type IteratorStats struct {
    InternalDeleteSkippedCount int
    TimeBoundNumSSTs           int
}

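// Illustrative sketch (not part of the original file): the canonical pattern
// for draining a SimpleIterator, mirroring iterateOnReader below. Valid must
// be consulted after every positioning call, and UnsafeKey's memory must be
// copied if it is retained past the next call. The helper name and its use of
// a caller-supplied iterator are assumptions for the example.
func exampleCollectKeys(it SimpleIterator, start MVCCKey) ([]roachpb.Key, error) {
    var keys []roachpb.Key
    for it.SeekGE(start); ; it.Next() {
        ok, err := it.Valid()
        if err != nil {
            return nil, err
        } else if !ok {
            break
        }
        // Copy the unsafe key: its memory is invalidated by the next Next().
        keys = append(keys, append(roachpb.Key(nil), it.UnsafeKey().Key...))
    }
    return keys, nil
}
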
// Iterator is an interface for iterating over key/value pairs in an
// engine. Iterator implementations are thread safe unless otherwise
// noted.
type Iterator interface {
    SimpleIterator

    // SeekLT advances the iterator to the first key in the engine which
    // is < the provided key.
    SeekLT(key MVCCKey)
    // Prev moves the iterator backward to the previous key/value
    // in the iteration. After this call, Valid() will be true if the
    // iterator was not positioned at the first key.
    Prev()
    // Key returns the current key.
    Key() MVCCKey
    // Value returns the current value as a byte slice.
    Value() []byte
    // ValueProto unmarshals the value the iterator is currently
    // pointing to using a protobuf decoder.
    ValueProto(msg protoutil.Message) error
    // ComputeStats scans the underlying engine from start to end keys and
    // computes stats counters based on the values. This method is used after a
    // range is split to recompute stats for each subrange. The start key is
    // always adjusted to avoid counting local keys in the event stats are being
    // recomputed for the first range (i.e. the one with start key == KeyMin).
    // The nowNanos arg specifies the wall time in nanoseconds since the
    // epoch and is used to compute the total age of all intents.
    ComputeStats(start, end roachpb.Key, nowNanos int64) (enginepb.MVCCStats, error)
    // FindSplitKey finds a key from the given span such that the left side of
    // the split is roughly targetSize bytes. The returned key will never be
    // chosen from the key ranges listed in keys.NoSplitSpans and will always
    // sort equal to or after minSplitKey.
    //
    // DO NOT CALL directly (except in wrapper Iterator implementations). Use the
    // package-level MVCCFindSplitKey instead. For correct operation, the caller
    // must set the upper bound on the iterator before calling this method.
    FindSplitKey(start, end, minSplitKey roachpb.Key, targetSize int64) (MVCCKey, error)
    // CheckForKeyCollisions checks whether any keys collide between the iterator
    // and the encoded SST data specified, within the provided key range. Returns
    // stats on skipped KVs, or an error if a collision is found.
    CheckForKeyCollisions(sstData []byte, start, end roachpb.Key) (enginepb.MVCCStats, error)
    // SetUpperBound installs a new upper bound for this iterator.
    SetUpperBound(roachpb.Key)
    // Stats returns statistics about the iterator.
    Stats() IteratorStats
}

// MVCCIterator is an interface that extends Iterator and provides concrete
// implementations for MVCCGet and MVCCScan operations. It is used by instances
// of the interface backed by RocksDB iterators to avoid cgo hops.
type MVCCIterator interface {
    Iterator
    // MVCCOpsSpecialized returns whether the iterator has a specialized
    // implementation of MVCCGet and MVCCScan. This is exposed as a method
    // so that wrapper types can defer to their wrapped iterators.
    MVCCOpsSpecialized() bool
    // MVCCGet is the internal implementation of the family of package-level
    // MVCCGet functions.
    MVCCGet(
        key roachpb.Key, timestamp hlc.Timestamp, opts MVCCGetOptions,
    ) (*roachpb.Value, *roachpb.Intent, error)
    // MVCCScan is the internal implementation of the family of package-level
    // MVCCScan functions. The notable difference is that key/value pairs are
    // returned raw, as a series of buffers of length-prefixed slices,
    // alternating from key to value, where numKVs specifies the number of pairs
    // in the buffer.
    MVCCScan(
        start, end roachpb.Key, timestamp hlc.Timestamp, opts MVCCScanOptions,
    ) (MVCCScanResult, error)
}

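// Illustrative sketch (not part of the original file): SeekLT positions the
// iterator on the last key strictly below a bound, after which Prev would walk
// further backward. The helper name is an assumption for the example.
func exampleLastKeyBelow(it Iterator, bound MVCCKey) (MVCCKey, bool, error) {
    it.SeekLT(bound)
    ok, err := it.Valid()
    if err != nil || !ok {
        return MVCCKey{}, false, err
    }
    // Key (unlike UnsafeKey) returns memory that remains valid across moves.
    return it.Key(), true, nil
}
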
// IterOptions contains options used to create an Iterator.
//
// For performance, every Iterator must specify either Prefix or UpperBound.
type IterOptions struct {
    // If Prefix is true, Seek will use the user-key prefix of
    // the supplied MVCC key to restrict which sstables are searched,
    // but iteration (using Next) over keys without the same user-key
    // prefix will not work correctly (keys may be skipped).
    Prefix bool
    // LowerBound gives this iterator an inclusive lower bound. Attempts to
    // SeekReverse or Prev to a key that is strictly less than the bound will
    // invalidate the iterator.
    LowerBound roachpb.Key
    // UpperBound gives this iterator an exclusive upper bound. Attempts to Seek
    // or Next to a key that is greater than or equal to the bound will invalidate
    // the iterator. UpperBound must be provided unless Prefix is true, in which
    // case the end of the prefix will be used as the upper bound.
    UpperBound roachpb.Key
    // If WithStats is true, the iterator accumulates RocksDB performance
    // counters over its lifetime which can be queried via `Stats()`.
    WithStats bool
    // MinTimestampHint and MaxTimestampHint, if set, indicate that keys outside
    // of the time range formed by [MinTimestampHint, MaxTimestampHint] do not
    // need to be presented by the iterator. The underlying iterator may be able
    // to efficiently skip over keys outside of the hinted time range, e.g., when
    // an SST indicates that it contains no keys within the time range.
    //
    // Note that time bound hints are strictly a performance optimization, and
    // iterators with time bound hints will frequently return keys outside of the
    // [start, end] time range. If you must guarantee that you never see a key
    // outside of the time bounds, perform your own filtering.
    MinTimestampHint, MaxTimestampHint hlc.Timestamp
}

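// Illustrative sketch (not part of the original file): creating a bounded
// iterator over [start, end) from a Reader (declared below). Time bound hints
// are shown commented out because, per the comment above, they are an
// optimization only and callers must still filter by timestamp themselves.
// The helper name is an assumption for the example.
func exampleBoundedIter(r Reader, start, end roachpb.Key) Iterator {
    it := r.NewIterator(IterOptions{
        LowerBound: start,
        UpperBound: end, // required, since Prefix is not set
        // MinTimestampHint: hlc.Timestamp{WallTime: t0},
        // MaxTimestampHint: hlc.Timestamp{WallTime: t1},
    })
    it.SeekGE(MakeMVCCMetadataKey(start))
    return it // the caller must Close() the iterator
}
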
// Reader is the read interface to an engine's data.
type Reader interface {
    // Close closes the reader, freeing up any outstanding resources. Note that
    // various implementations have slightly different behaviors. In particular,
    // Distinct() batches release their parent batch for future use while
    // Engines, Snapshots and Batches free the associated C++ resources.
    Close()
    // Closed returns true if the reader has been closed or is not usable.
    // Objects backed by this reader (e.g. Iterators) can check this to ensure
    // that they are not using a closed engine. Intended for use within package
    // engine; exported to enable wrappers to exist in other packages.
    Closed() bool
    // ExportToSst exports changes to the keyrange [startKey, endKey) over the
    // interval (startTS, endTS]. Passing exportAllRevisions exports
    // every revision of a key for the interval, otherwise only the latest value
    // within the interval is exported. Deletions are included if all revisions are
    // requested or if the start.Timestamp is non-zero. Returns the bytes of an
    // SSTable containing the exported keys, the size of exported data, or an error.
    //
    // If targetSize is positive, it indicates that the export should produce SSTs
    // which are roughly target size. Specifically, it will return an SST such that
    // the last key is responsible for meeting or exceeding the targetSize. If the
    // resumeKey is non-nil then the data size of the returned sst will be greater
    // than or equal to the targetSize.
    //
    // If maxSize is positive, it is an absolute maximum on byte size for the
    // returned sst. If it is the case that the versions of the last key will lead
    // to an SST that exceeds maxSize, an error will be returned. This parameter
    // exists to prevent creating SSTs which are too large to be used.
    ExportToSst(
        startKey, endKey roachpb.Key, startTS, endTS hlc.Timestamp,
        exportAllRevisions bool, targetSize uint64, maxSize uint64,
        io IterOptions,
    ) (sst []byte, _ roachpb.BulkOpSummary, resumeKey roachpb.Key, _ error)
    // Get returns the value for the given key, or nil if the key is not present.
    //
    // Deprecated: use MVCCGet instead.
    Get(key MVCCKey) ([]byte, error)
    // GetProto fetches the value at the specified key and unmarshals it
    // using a protobuf decoder. Returns true on success or false if the
    // key was not found. On success, returns the length in bytes of the
    // key and the value.
    //
    // Deprecated: use Iterator.ValueProto instead.
    GetProto(key MVCCKey, msg protoutil.Message) (ok bool, keyBytes, valBytes int64, err error)
    // Iterate scans from the start key to the end key (exclusive), invoking the
    // function f on each key value pair. If f returns an error or if the scan
    // itself encounters an error, the iteration will stop and return the error.
    // If the first result of f is true, the iteration stops and returns a nil
    // error. Note that this method is not expected to take into account the
    // timestamp of the end key; all MVCCKeys at end.Key are considered excluded
    // in the iteration.
    Iterate(start, end roachpb.Key, f func(MVCCKeyValue) (stop bool, err error)) error
    // NewIterator returns a new instance of an Iterator over this
    // engine. The caller must invoke Iterator.Close() when finished
    // with the iterator to free resources.
    NewIterator(opts IterOptions) Iterator
}

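// Illustrative sketch (not part of the original file): Iterate's stop
// semantics. Returning (true, nil) from the closure halts the scan with a nil
// error, which makes "find the first key/value in a span" a short helper. The
// helper name is an assumption for the example.
func exampleFirstKV(r Reader, start, end roachpb.Key) (MVCCKeyValue, bool, error) {
    var kv MVCCKeyValue
    var found bool
    err := r.Iterate(start, end, func(cur MVCCKeyValue) (bool, error) {
        kv, found = cur, true
        return true, nil // stop after the first pair
    })
    return kv, found, err
}
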
// Writer is the write interface to an engine's data.
type Writer interface {
    // ApplyBatchRepr atomically applies a set of batched updates. Created by
    // calling Repr() on a batch. Using this method is equivalent to constructing
    // and committing a batch whose Repr() equals repr. If sync is true, the
    // batch is synchronously written to disk. It is an error to specify
    // sync=true if the Writer is a Batch.
    //
    // It is safe to modify the contents of the arguments after ApplyBatchRepr
    // returns.
    ApplyBatchRepr(repr []byte, sync bool) error
    // Clear removes the item from the db with the given key. Note that clear
    // actually removes entries from the storage engine, rather than inserting
    // tombstones.
    //
    // It is safe to modify the contents of the arguments after Clear returns.
    Clear(key MVCCKey) error
    // SingleClear removes the most recent write to the item from the db with
    // the given key. Whether older versions of the item will come back to life
    // if not also removed with SingleClear is undefined. See the following:
    // https://github.com/facebook/rocksdb/wiki/Single-Delete
    // for details on the SingleDelete operation that this method invokes. Note
    // that clear actually removes entries from the storage engine, rather than
    // inserting tombstones.
    //
    // It is safe to modify the contents of the arguments after SingleClear
    // returns.
    SingleClear(key MVCCKey) error
    // ClearRange removes a set of entries, from start (inclusive) to end
    // (exclusive). Similar to Clear, this method actually removes entries from
    // the storage engine.
    //
    // Note that when used on batches, subsequent reads may not reflect the result
    // of the ClearRange.
    //
    // It is safe to modify the contents of the arguments after ClearRange
    // returns.
    //
    // TODO(peter): Most callers want to pass roachpb.Key, except for
    // MVCCClearTimeRange. That function actually does want to clear records
    // between specific versions.
    ClearRange(start, end MVCCKey) error
    // ClearIterRange removes a set of entries, from start (inclusive) to end
    // (exclusive). Similar to Clear and ClearRange, this method actually removes
    // entries from the storage engine. Unlike ClearRange, the entries to remove
    // are determined by iterating over iter and per-key tombstones are
    // generated.
    //
    // It is safe to modify the contents of the arguments after ClearIterRange
    // returns.
    ClearIterRange(iter Iterator, start, end roachpb.Key) error
    // Merge is a high-performance write operation used for values which are
    // accumulated over several writes. Multiple values can be merged
    // sequentially into a single key; a subsequent read will return a "merged"
    // value which is computed from the original merged values.
    //
    // Merge currently provides specialized behavior for three data types:
    // integers, byte slices, and time series observations. Merged integers are
    // summed, acting as a high-performance accumulator. Byte slices are simply
    // concatenated in the order they are merged. Time series observations
    // (stored as byte slices with a special tag on the roachpb.Value) are
    // combined with specialized logic beyond that of simple byte slices.
    //
    // The logic for merges is written in db.cc in order to be compatible with
    // RocksDB.
    //
    // It is safe to modify the contents of the arguments after Merge returns.
    Merge(key MVCCKey, value []byte) error
    // Put sets the given key to the value provided.
    //
    // It is safe to modify the contents of the arguments after Put returns.
    Put(key MVCCKey, value []byte) error
    // LogData adds the specified data to the RocksDB WAL. The data is
    // uninterpreted by RocksDB (i.e. not added to the memtable or sstables).
    //
    // It is safe to modify the contents of the arguments after LogData returns.
    LogData(data []byte) error
    // LogLogicalOp logs the specified logical mvcc operation with the provided
    // details to the writer, if it has logical op logging enabled. For most
    // Writer implementations, this is a no-op.
    LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails)
}

// ReadWriter is the read/write interface to an engine's data.
type ReadWriter interface {
    Reader
    Writer
}

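// Illustrative sketch (not part of the original file): writing and removing a
// versioned key through the Writer interface. MVCC keys pair a roachpb.Key
// with an hlc.Timestamp; Clear physically removes the entry rather than
// writing a tombstone. The helper name and wall time are assumptions for the
// example.
func examplePutThenClear(w Writer, key roachpb.Key, wallTime int64, value []byte) error {
    k := MVCCKey{Key: key, Timestamp: hlc.Timestamp{WallTime: wallTime}}
    if err := w.Put(k, value); err != nil {
        return err
    }
    // Clear must be given the exact versioned key that was written.
    return w.Clear(k)
}
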
// Engine is the interface that wraps the core operations of a key/value store.
type Engine interface {
    ReadWriter
    // Attrs returns the engine/store attributes.
    Attrs() roachpb.Attributes
    // Capacity returns capacity details for the engine's available storage.
    Capacity() (roachpb.StoreCapacity, error)
    // Compact forces compaction over the entire database.
    Compact() error
    // Flush causes the engine to write all in-memory data to disk
    // immediately.
    Flush() error
    // GetSSTables retrieves metadata about this engine's live sstables.
    GetSSTables() SSTableInfos
    // GetCompactionStats returns the internal RocksDB compaction stats. See
    // https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide#rocksdb-statistics.
    GetCompactionStats() string
    // GetStats retrieves stats from the engine.
    GetStats() (*Stats, error)
    // GetEncryptionRegistries returns the file and key registries when encryption is enabled
    // on the store.
    GetEncryptionRegistries() (*EncryptionRegistries, error)
    // GetEnvStats retrieves stats about the engine's environment.
    // For RocksDB, this includes details of at-rest encryption.
    GetEnvStats() (*EnvStats, error)
    // GetAuxiliaryDir returns a path under which files can be stored
    // persistently, and from which data can be ingested by the engine.
    //
    // Not thread safe.
    GetAuxiliaryDir() string
    // NewBatch returns a new instance of a batched engine which wraps
    // this engine. Batched engines accumulate all mutations and apply
    // them atomically on a call to Commit().
    NewBatch() Batch
    // NewReadOnly returns a new instance of a ReadWriter that wraps this
    // engine. This wrapper panics when unexpected operations (e.g., write
    // operations) are executed on it and caches iterators to avoid the overhead
    // of creating multiple iterators for batched reads.
    //
    // All iterators created from a read-only engine with the same "Prefix"
    // option are guaranteed to provide a consistent snapshot of the underlying
    // engine. For instance, two prefix iterators created from a read-only
    // engine will provide a consistent snapshot. Similarly, two non-prefix
    // iterators created from a read-only engine will provide a consistent
    // snapshot. However, a prefix iterator and a non-prefix iterator created
    // from a read-only engine are not guaranteed to provide a consistent view
    // of the underlying engine.
    //
    // TODO(nvanbenschoten): remove this complexity when we're fully on Pebble
    // and can guarantee that all iterators created from a read-only engine are
    // consistent. To do this, we will want to add an Iterator.Clone method.
    NewReadOnly() ReadWriter
    // NewWriteOnlyBatch returns a new instance of a batched engine which wraps
    // this engine. A write-only batch accumulates all mutations and applies them
    // atomically on a call to Commit(). Read operations return an error.
    //
    // Note that a distinct write-only batch allows reads. Distinct batches are a
    // means of indicating that the user does not need to read its own writes.
    //
    // TODO(peter): This should return a WriteBatch interface, but there are mild
    // complications in both defining that interface and implementing it. In
    // particular, Batch.Close would no longer come from Reader and we'd need to
    // refactor a bunch of code in rocksDBBatch.
    NewWriteOnlyBatch() Batch
    // NewSnapshot returns a new instance of a read-only snapshot
    // engine. Snapshots are instantaneous and, as long as they're
    // released relatively quickly, inexpensive. Snapshots are released
    // by invoking Close(). Note that snapshots must not be used after the
    // original engine has been stopped.
    NewSnapshot() Reader
    // Type returns the engine type.
    Type() enginepb.EngineType
    // IngestExternalFiles atomically links a slice of files into the RocksDB
    // log-structured merge-tree.
    IngestExternalFiles(ctx context.Context, paths []string) error
    // PreIngestDelay offers an engine the chance to backpressure ingestions.
    // When called, it may choose to block if the engine determines that it is in
    // or approaching a state where further ingestions may risk its health.
    PreIngestDelay(ctx context.Context)
    // ApproximateDiskBytes returns an approximation of the on-disk size for the given key span.
    ApproximateDiskBytes(from, to roachpb.Key) (uint64, error)
    // CompactRange ensures that the specified range of key value pairs is
    // optimized for space efficiency. The forceBottommost parameter ensures
    // that the key range is compacted all the way to the bottommost level of
    // SSTables, which is necessary to pick up changes to bloom filters.
    CompactRange(start, end roachpb.Key, forceBottommost bool) error
    // InMem returns true if the receiver is an in-memory engine and false
    // otherwise.
    //
    // TODO(peter): This is a bit of a wart in the interface. It is used by
    // addSSTablePreApply to select alternate code paths, but really there should
    // be a unified code path there.
    InMem() bool

    // Filesystem functionality.
    fs.FS
    // ReadFile reads the content from the file with the given filename in this
    // RocksDB's env.
    ReadFile(filename string) ([]byte, error)
    // WriteFile writes data to a file in this RocksDB's env.
    WriteFile(filename string, data []byte) error
    // CreateCheckpoint creates a checkpoint of the engine in the given directory,
    // which must not exist. The directory should be on the same file system so
    // that hard links can be used.
    CreateCheckpoint(dir string) error
}

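// Illustrative sketch (not part of the original file): taking a point-in-time
// snapshot and reading from it while the engine continues to accept writes.
// The snapshot is a Reader and must be Closed promptly, per the NewSnapshot
// comment above. The helper name is an assumption for the example.
func exampleSnapshotScan(eng Engine, start, end roachpb.Key) ([]MVCCKeyValue, error) {
    snap := eng.NewSnapshot()
    defer snap.Close() // release the snapshot promptly
    var kvs []MVCCKeyValue
    err := snap.Iterate(start, end, func(kv MVCCKeyValue) (bool, error) {
        kvs = append(kvs, kv)
        return false, nil
    })
    return kvs, err
}
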
// Batch is the interface for batch specific operations.
type Batch interface {
    ReadWriter
    // Commit atomically applies any batched updates to the underlying
    // engine. This is a noop unless the batch was created via NewBatch(). If
    // sync is true, the batch is synchronously committed to disk.
    Commit(sync bool) error
    // Distinct returns a view of the existing batch which only sees writes that
    // were performed before the Distinct batch was created. That is, the
    // returned batch will not read its own writes, but it will read writes to
    // the parent batch performed before the call to Distinct(), except if the
    // parent batch is a WriteOnlyBatch, in which case the Distinct() batch will
    // read from the underlying engine.
    //
    // The returned batch needs to be closed before using the parent batch again.
    // This is used as an optimization to avoid flushing mutations buffered by
    // the batch in situations where we know all of the batched operations are
    // for distinct keys.
    //
    // TODO(tbg): it seems insane that you cannot read from a WriteOnlyBatch but
    // you can read from a Distinct on top of a WriteOnlyBatch but randomly don't
    // see the batch at all. I was personally just bitten by this.
    //
    // TODO(itsbilal): Improve comments around how/why distinct batches are an
    // optimization in the rocksdb write path.
    Distinct() ReadWriter
    // Empty returns whether the batch has been written to or not.
    Empty() bool
    // Len returns the size of the underlying representation of the batch.
    // Because of the batch header, the size of the batch is never 0 and should
    // not be used interchangeably with Empty. The method avoids the memory copy
    // that Repr imposes, but it still may require flushing the batch's mutations.
    Len() int
    // Repr returns the underlying representation of the batch and can be used to
    // reconstitute the batch on a remote node using Writer.ApplyBatchRepr().
    Repr() []byte
}

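// Illustrative sketch (not part of the original file): the basic batch
// lifecycle. Mutations accumulate in memory and become visible atomically on
// Commit; sync=true additionally waits for the write to be durable, as
// WriteSyncNoop below does. The helper name and arguments are assumptions for
// the example.
func exampleBatchedWrites(eng Engine, keys []MVCCKey, value []byte) error {
    b := eng.NewBatch()
    defer b.Close() // safe to call after Commit
    for _, k := range keys {
        if err := b.Put(k, value); err != nil {
            return err
        }
    }
    return b.Commit(true /* sync */)
}
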
// Stats is a set of Engine stats. Most are described in RocksDB.
// Some stats (e.g. `IngestedBytes`) are only exposed by Pebble.
//
// Currently, we collect stats from the following sources:
// 1. RocksDB's internal "tickers" (i.e. counters). They're defined in
//    rocksdb/statistics.h
// 2. DBEventListener, which implements RocksDB's EventListener interface.
// 3. rocksdb::DB::GetProperty().
//
// This is a good resource describing RocksDB's memory-related stats:
// https://github.com/facebook/rocksdb/wiki/Memory-usage-in-RocksDB
type Stats struct {
    BlockCacheHits                 int64
    BlockCacheMisses               int64
    BlockCacheUsage                int64
    BlockCachePinnedUsage          int64
    BloomFilterPrefixChecked       int64
    BloomFilterPrefixUseful        int64
    MemtableTotalSize              int64
    Flushes                        int64
    FlushedBytes                   int64
    Compactions                    int64
    IngestedBytes                  int64 // Pebble only
    CompactedBytesRead             int64
    CompactedBytesWritten          int64
    TableReadersMemEstimate        int64
    PendingCompactionBytesEstimate int64
    L0FileCount                    int64
}

// EnvStats is a set of RocksDB env stats, including encryption status.
type EnvStats struct {
    // TotalFiles is the total number of files reported by rocksdb.
    TotalFiles uint64
    // TotalBytes is the total size of files reported by rocksdb.
    TotalBytes uint64
    // ActiveKeyFiles is the number of files using the active data key.
    ActiveKeyFiles uint64
    // ActiveKeyBytes is the size of files using the active data key.
    ActiveKeyBytes uint64
    // EncryptionType is an enum describing the active encryption algorithm.
    // See: ccl/storageccl/engineccl/enginepbccl/key_registry.proto
    EncryptionType int32
    // EncryptionStatus is a serialized enginepbccl/stats.proto::EncryptionStatus protobuf.
    EncryptionStatus []byte
}

// EncryptionRegistries contains the encryption-related registries:
// Both are serialized protobufs.
type EncryptionRegistries struct {
    // FileRegistry is the list of files with encryption status.
    // serialized storage/engine/enginepb/file_registry.proto::FileRegistry
    FileRegistry []byte
    // KeyRegistry is the list of keys, scrubbed of actual key data.
    // serialized ccl/storageccl/engineccl/enginepbccl/key_registry.proto::DataKeysRegistry
    KeyRegistry []byte
}

// NewEngine creates a new storage engine.
func NewEngine(
    engine enginepb.EngineType, cacheSize int64, storageConfig base.StorageConfig,
) (Engine, error) {
    switch engine {
    case enginepb.EngineTypeTeePebbleRocksDB:
        pebbleConfig := PebbleConfig{
            StorageConfig: storageConfig,
            Opts:          DefaultPebbleOptions(),
        }
        pebbleConfig.Opts.Cache = pebble.NewCache(cacheSize)
        defer pebbleConfig.Opts.Cache.Unref()

        pebbleConfig.Dir = filepath.Join(pebbleConfig.Dir, "pebble")
        cache := NewRocksDBCache(cacheSize)
        defer cache.Release()

        ctx := context.Background()
        pebbleDB, err := NewPebble(ctx, pebbleConfig)
        if err != nil {
            return nil, err
        }

        rocksDBConfig := RocksDBConfig{StorageConfig: storageConfig}
        rocksDBConfig.Dir = filepath.Join(rocksDBConfig.Dir, "rocksdb")
        rocksDB, err := NewRocksDB(rocksDBConfig, cache)
        if err != nil {
            return nil, err
        }

        return NewTee(ctx, rocksDB, pebbleDB), nil
    case enginepb.EngineTypeDefault, enginepb.EngineTypePebble:
        pebbleConfig := PebbleConfig{
            StorageConfig: storageConfig,
            Opts:          DefaultPebbleOptions(),
        }
        pebbleConfig.Opts.Cache = pebble.NewCache(cacheSize)
        defer pebbleConfig.Opts.Cache.Unref()

        return NewPebble(context.Background(), pebbleConfig)
    case enginepb.EngineTypeRocksDB:
        cache := NewRocksDBCache(cacheSize)
        defer cache.Release()

        return NewRocksDB(
            RocksDBConfig{StorageConfig: storageConfig},
            cache)
    }
    panic(fmt.Sprintf("unknown engine type: %d", engine))
}

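// Illustrative sketch (not part of the original file): opening a Pebble store
// with NewEngine. The directory, the 512 MiB cache size, and the helper name
// are assumptions for the example.
func exampleOpenEngine() (Engine, error) {
    eng, err := NewEngine(
        enginepb.EngineTypePebble,
        512<<20, // block cache size in bytes
        base.StorageConfig{Dir: "/mnt/data1/cockroach"},
    )
    if err != nil {
        return nil, err
    }
    // The caller owns the engine and must eng.Close() when finished.
    return eng, nil
}
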
// NewDefaultEngine allocates and returns a new, opened engine with the default configuration.
// The caller must call the engine's Close method when the engine is no longer needed.
func NewDefaultEngine(cacheSize int64, storageConfig base.StorageConfig) (Engine, error) {
    return NewEngine(DefaultStorageEngine, cacheSize, storageConfig)
}

// PutProto sets the given key to the protobuf-serialized byte string
// of msg and the provided timestamp. Returns the length in bytes of the
// key and the value.
//
// Deprecated: use MVCCPutProto instead.
func PutProto(
    writer Writer, key MVCCKey, msg protoutil.Message,
) (keyBytes, valBytes int64, err error) {
    bytes, err := protoutil.Marshal(msg)
    if err != nil {
        return 0, 0, err
    }

    if err := writer.Put(key, bytes); err != nil {
        return 0, 0, err
    }

    return int64(key.EncodedSize()), int64(len(bytes)), nil
}

// Scan returns up to max key/value objects starting from start (inclusive)
// and ending at end (non-inclusive). Specify max=0 for unbounded scans.
func Scan(reader Reader, start, end roachpb.Key, max int64) ([]MVCCKeyValue, error) {
    var kvs []MVCCKeyValue
    err := reader.Iterate(start, end, func(kv MVCCKeyValue) (bool, error) {
        if max != 0 && int64(len(kvs)) >= max {
            return true, nil
        }
        kvs = append(kvs, kv)
        return false, nil
    })
    return kvs, err
}

// WriteSyncNoop carries out a synchronous no-op write to the engine.
func WriteSyncNoop(ctx context.Context, eng Engine) error {
    batch := eng.NewBatch()
    defer batch.Close()

    if err := batch.LogData(nil); err != nil {
        return err
    }

    if err := batch.Commit(true /* sync */); err != nil {
        return err
    }
    return nil
}

// ClearRangeWithHeuristic clears the keys from start (inclusive) to end
// (exclusive). Depending on the number of keys, it will either use ClearRange
// or ClearIterRange.
func ClearRangeWithHeuristic(reader Reader, writer Writer, start, end roachpb.Key) error {
    iter := reader.NewIterator(IterOptions{UpperBound: end})
    defer iter.Close()

    // It is expensive for there to be many range deletion tombstones in the same
    // sstable because all of the tombstones in an sstable are loaded whenever the
    // sstable is accessed. So we avoid using range deletion unless there is some
    // minimum number of keys. The value here was pulled out of thin air. It might
    // be better to make this dependent on the size of the data being deleted. Or
    // perhaps we should fix RocksDB to handle large numbers of tombstones in an
    // sstable better.
    const clearRangeMinKeys = 64
    // Peek into the range to see whether it's large enough to justify
    // ClearRange. Note that the work done here is bounded by
    // clearRangeMinKeys, so it will be fairly cheap even for large
    // ranges.
    //
    // TODO(bdarnell): Move this into ClearIterRange so we don't have
    // to do this scan twice.
    count := 0
    iter.SeekGE(MakeMVCCMetadataKey(start))
    for {
        valid, err := iter.Valid()
        if err != nil {
            return err
        }
        if !valid {
            break
        }
        count++
        if count > clearRangeMinKeys {
            break
        }
        iter.Next()
    }
    var err error
    if count > clearRangeMinKeys {
        err = writer.ClearRange(MakeMVCCMetadataKey(start), MakeMVCCMetadataKey(end))
    } else {
        err = writer.ClearIterRange(iter, start, end)
    }
    if err != nil {
        return err
    }
    return nil
}

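// Illustrative sketch (not part of the original file): paging through a span
// with Scan by restarting from the successor of the last returned key. Note
// that restarting at Key.Next() skips any remaining MVCC versions of the last
// key, so this pattern suits single-version (e.g. metadata) keyspaces. The
// page size and helper name are assumptions for the example.
func exampleScanPages(r Reader, start, end roachpb.Key, pageSize int64) (int, error) {
    total := 0
    for {
        kvs, err := Scan(r, start, end, pageSize)
        if err != nil {
            return total, err
        }
        total += len(kvs)
        if int64(len(kvs)) < pageSize {
            return total, nil // final, short page
        }
        start = kvs[len(kvs)-1].Key.Key.Next()
    }
}
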
var ingestDelayL0Threshold = settings.RegisterIntSetting(
    "rocksdb.ingest_backpressure.l0_file_count_threshold",
    "number of L0 files after which to backpressure SST ingestions",
    20,
)

var ingestDelayTime = settings.RegisterDurationSetting(
    "rocksdb.ingest_backpressure.max_delay",
    "maximum amount of time to backpressure a single SST ingestion",
    time.Second*5,
)

// preIngestDelay may choose to block for some duration if L0 has an excessive
// number of files in it or if PendingCompactionBytesEstimate is elevated. It
// is intended to be called before ingesting a new SST, since we'd rather
// backpressure the bulk operation adding SSTs than slow down the whole RocksDB
// instance and impact all foreground traffic by adding too many files to it.
// After the number of L0 files exceeds the configured limit, it gradually
// begins delaying more for each additional file in L0 over the limit until
// hitting its configured (via settings) maximum delay. If the pending
// compaction limit is exceeded, it waits for the maximum delay.
func preIngestDelay(ctx context.Context, eng Engine, settings *cluster.Settings) {
    if settings == nil {
        return
    }
    stats, err := eng.GetStats()
    if err != nil {
        log.Warningf(ctx, "failed to read stats: %+v", err)
        return
    }
    targetDelay := calculatePreIngestDelay(settings, stats)

    if targetDelay == 0 {
        return
    }
    log.VEventf(ctx, 2, "delaying SST ingestion %s. %d L0 files", targetDelay, stats.L0FileCount)

    select {
    case <-time.After(targetDelay):
    case <-ctx.Done():
    }
}

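// calculatePreIngestDelay converts engine stats into a backpressure delay.
// A worked example with the defaults above (this comment is an addition, not
// part of the original file): the threshold is 20 L0 files and the maximum
// delay is 5s, so with ramp = 10 each file over the limit adds 5s/10 = 500ms.
// At 26 L0 files the delay is (26-20) * 500ms = 3s; at 30 or more files the
// delay is capped at the 5s maximum.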
%d L0 files", targetDelay, stats.L0FileCount) 729 730 select { 731 case <-time.After(targetDelay): 732 case <-ctx.Done(): 733 } 734 } 735 736 func calculatePreIngestDelay(settings *cluster.Settings, stats *Stats) time.Duration { 737 maxDelay := ingestDelayTime.Get(&settings.SV) 738 l0Filelimit := ingestDelayL0Threshold.Get(&settings.SV) 739 740 const ramp = 10 741 if stats.L0FileCount > l0Filelimit { 742 delayPerFile := maxDelay / time.Duration(ramp) 743 targetDelay := time.Duration(stats.L0FileCount-l0Filelimit) * delayPerFile 744 if targetDelay > maxDelay { 745 return maxDelay 746 } 747 return targetDelay 748 } 749 return 0 750 } 751 752 // Helper function to implement Reader.Iterate(). 753 func iterateOnReader( 754 reader Reader, start, end roachpb.Key, f func(MVCCKeyValue) (stop bool, err error), 755 ) error { 756 if reader.Closed() { 757 return errors.New("cannot call Iterate on a closed batch") 758 } 759 if start.Compare(end) >= 0 { 760 return nil 761 } 762 763 it := reader.NewIterator(IterOptions{UpperBound: end}) 764 defer it.Close() 765 766 it.SeekGE(MakeMVCCMetadataKey(start)) 767 for ; ; it.Next() { 768 ok, err := it.Valid() 769 if err != nil { 770 return err 771 } else if !ok { 772 break 773 } 774 if done, err := f(MVCCKeyValue{Key: it.Key(), Value: it.Value()}); done || err != nil { 775 return err 776 } 777 } 778 return nil 779 }