github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/pebble.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package storage 12 13 import ( 14 "bytes" 15 "context" 16 "fmt" 17 "io" 18 "io/ioutil" 19 "os" 20 "sort" 21 "strconv" 22 "strings" 23 "time" 24 25 "github.com/cockroachdb/cockroach/pkg/base" 26 "github.com/cockroachdb/cockroach/pkg/roachpb" 27 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 28 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 29 "github.com/cockroachdb/cockroach/pkg/storage/fs" 30 "github.com/cockroachdb/cockroach/pkg/util/hlc" 31 "github.com/cockroachdb/cockroach/pkg/util/log" 32 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 33 "github.com/cockroachdb/errors" 34 "github.com/cockroachdb/logtags" 35 "github.com/cockroachdb/pebble" 36 "github.com/cockroachdb/pebble/bloom" 37 "github.com/cockroachdb/pebble/vfs" 38 ) 39 40 // MVCCKeyCompare compares cockroach keys, including the MVCC timestamps. 41 func MVCCKeyCompare(a, b []byte) int { 42 // NB: For performance, this routine manually splits the key into the 43 // user-key and timestamp components rather than using SplitMVCCKey. Don't 44 // try this at home kids: use SplitMVCCKey. 45 46 aEnd := len(a) - 1 47 bEnd := len(b) - 1 48 if aEnd < 0 || bEnd < 0 { 49 // This should never happen unless there is some sort of corruption of 50 // the keys. This is a little bizarre, but the behavior exactly matches 51 // engine/db.cc:DBComparator. 52 return bytes.Compare(a, b) 53 } 54 55 // Compute the index of the separator between the key and the timestamp. 56 aSep := aEnd - int(a[aEnd]) 57 bSep := bEnd - int(b[bEnd]) 58 if aSep < 0 || bSep < 0 { 59 // This should never happen unless there is some sort of corruption of 60 // the keys. This is a little bizarre, but the behavior exactly matches 61 // engine/db.cc:DBComparator. 62 return bytes.Compare(a, b) 63 } 64 65 // Compare the "user key" part of the key. 66 if c := bytes.Compare(a[:aSep], b[:bSep]); c != 0 { 67 return c 68 } 69 70 // Compare the timestamp part of the key. 71 aTS := a[aSep:aEnd] 72 bTS := b[bSep:bEnd] 73 if len(aTS) == 0 { 74 if len(bTS) == 0 { 75 return 0 76 } 77 return -1 78 } else if len(bTS) == 0 { 79 return 1 80 } 81 return bytes.Compare(bTS, aTS) 82 } 83 84 // MVCCComparer is a pebble.Comparer object that implements MVCC-specific 85 // comparator settings for use with Pebble. 86 var MVCCComparer = &pebble.Comparer{ 87 Compare: MVCCKeyCompare, 88 89 AbbreviatedKey: func(k []byte) uint64 { 90 key, _, ok := enginepb.SplitMVCCKey(k) 91 if !ok { 92 return 0 93 } 94 return pebble.DefaultComparer.AbbreviatedKey(key) 95 }, 96 97 FormatKey: func(k []byte) fmt.Formatter { 98 decoded, err := DecodeMVCCKey(k) 99 if err != nil { 100 return mvccKeyFormatter{err: err} 101 } 102 return mvccKeyFormatter{key: decoded} 103 }, 104 105 Separator: func(dst, a, b []byte) []byte { 106 aKey, _, ok := enginepb.SplitMVCCKey(a) 107 if !ok { 108 return append(dst, a...) 109 } 110 bKey, _, ok := enginepb.SplitMVCCKey(b) 111 if !ok { 112 return append(dst, a...) 113 } 114 // If the keys are the same just return a. 115 if bytes.Equal(aKey, bKey) { 116 return append(dst, a...) 117 } 118 n := len(dst) 119 // MVCC key comparison uses bytes.Compare on the roachpb.Key, which is the same semantics as 120 // pebble.DefaultComparer, so reuse the latter's Separator implementation. 121 dst = pebble.DefaultComparer.Separator(dst, aKey, bKey) 122 // Did it pick a separator different than aKey -- if it did not we can't do better than a. 123 buf := dst[n:] 124 if bytes.Equal(aKey, buf) { 125 return append(dst[:n], a...) 126 } 127 // The separator is > aKey, so we only need to add the timestamp sentinel. 128 return append(dst, 0) 129 }, 130 131 Successor: func(dst, a []byte) []byte { 132 aKey, _, ok := enginepb.SplitMVCCKey(a) 133 if !ok { 134 return append(dst, a...) 135 } 136 n := len(dst) 137 // MVCC key comparison uses bytes.Compare on the roachpb.Key, which is the same semantics as 138 // pebble.DefaultComparer, so reuse the latter's Successor implementation. 139 dst = pebble.DefaultComparer.Successor(dst, aKey) 140 // Did it pick a successor different than aKey -- if it did not we can't do better than a. 141 buf := dst[n:] 142 if bytes.Equal(aKey, buf) { 143 return append(dst[:n], a...) 144 } 145 // The successor is > aKey, so we only need to add the timestamp sentinel. 146 return append(dst, 0) 147 }, 148 149 Split: func(k []byte) int { 150 key, _, ok := enginepb.SplitMVCCKey(k) 151 if !ok { 152 return len(k) 153 } 154 // This matches the behavior of libroach/KeyPrefix. RocksDB requires that 155 // keys generated via a SliceTransform be comparable with normal encoded 156 // MVCC keys. Encoded MVCC keys have a suffix indicating the number of 157 // bytes of timestamp data. MVCC keys without a timestamp have a suffix of 158 // 0. We're careful in EncodeKey to make sure that the user-key always has 159 // a trailing 0. If there is no timestamp this falls out naturally. If 160 // there is a timestamp we prepend a 0 to the encoded timestamp data. 161 return len(key) + 1 162 }, 163 164 Name: "cockroach_comparator", 165 } 166 167 // MVCCMerger is a pebble.Merger object that implements the merge operator used 168 // by Cockroach. 169 var MVCCMerger = &pebble.Merger{ 170 Name: "cockroach_merge_operator", 171 Merge: func(_, value []byte) (pebble.ValueMerger, error) { 172 res := &MVCCValueMerger{} 173 err := res.MergeNewer(value) 174 if err != nil { 175 return nil, err 176 } 177 return res, nil 178 }, 179 } 180 181 // pebbleTimeBoundPropCollector implements a property collector for MVCC 182 // Timestamps. Its behavior matches TimeBoundTblPropCollector in 183 // table_props.cc. 184 // 185 // The handling of timestamps in intents is mildly complicated. Consider: 186 // 187 // a@<meta> -> <MVCCMetadata: Timestamp=t2> 188 // a@t2 -> <value> 189 // a@t1 -> <value> 190 // 191 // The metadata record (a.k.a. the intent) for a key always sorts first. The 192 // timestamp field always points to the next record. In this case, the meta 193 // record contains t2 and the next record is t2. Because of this duplication of 194 // the timestamp both in the intent and in the timestamped record that 195 // immediately follows it, we only need to unmarshal the MVCCMetadata if it is 196 // the last key in the sstable. 197 type pebbleTimeBoundPropCollector struct { 198 min, max []byte 199 lastValue []byte 200 } 201 202 func (t *pebbleTimeBoundPropCollector) Add(key pebble.InternalKey, value []byte) error { 203 _, ts, ok := enginepb.SplitMVCCKey(key.UserKey) 204 if !ok { 205 return errors.Errorf("failed to split MVCC key") 206 } 207 if len(ts) > 0 { 208 t.lastValue = t.lastValue[:0] 209 t.updateBounds(ts) 210 } else { 211 t.lastValue = append(t.lastValue[:0], value...) 212 } 213 return nil 214 } 215 216 func (t *pebbleTimeBoundPropCollector) Finish(userProps map[string]string) error { 217 if len(t.lastValue) > 0 { 218 // The last record in the sstable was an intent. Unmarshal the metadata and 219 // update the bounds with the timestamp it contains. 220 meta := &enginepb.MVCCMetadata{} 221 if err := protoutil.Unmarshal(t.lastValue, meta); err != nil { 222 // We're unable to parse the MVCCMetadata. Fail open by not setting the 223 // min/max timestamp properties. This mimics the behavior of 224 // TimeBoundTblPropCollector. 225 // TODO(petermattis): Return the error here and in C++, see #43422. 226 return nil //nolint:returnerrcheck 227 } 228 if meta.Txn != nil { 229 ts := encodeTimestamp(hlc.Timestamp(meta.Timestamp)) 230 t.updateBounds(ts) 231 } 232 } 233 234 userProps["crdb.ts.min"] = string(t.min) 235 userProps["crdb.ts.max"] = string(t.max) 236 return nil 237 } 238 239 func (t *pebbleTimeBoundPropCollector) updateBounds(ts []byte) { 240 if len(t.min) == 0 || bytes.Compare(ts, t.min) < 0 { 241 t.min = append(t.min[:0], ts...) 242 } 243 if len(t.max) == 0 || bytes.Compare(ts, t.max) > 0 { 244 t.max = append(t.max[:0], ts...) 245 } 246 } 247 248 func (t *pebbleTimeBoundPropCollector) Name() string { 249 // This constant needs to match the one used by the RocksDB version of this 250 // table property collector. DO NOT CHANGE. 251 return "TimeBoundTblPropCollectorFactory" 252 } 253 254 var _ pebble.NeedCompacter = &pebbleDeleteRangeCollector{} 255 256 // pebbleDeleteRangeCollector marks an sstable for compaction that contains a 257 // range tombstone. 258 type pebbleDeleteRangeCollector struct { 259 numRangeTombstones int 260 } 261 262 func (c *pebbleDeleteRangeCollector) Add(key pebble.InternalKey, value []byte) error { 263 if key.Kind() == pebble.InternalKeyKindRangeDelete { 264 c.numRangeTombstones++ 265 } 266 return nil 267 } 268 269 // NeedCompact implements the pebble.NeedCompacter interface. 270 func (c *pebbleDeleteRangeCollector) NeedCompact() bool { 271 // NB: Mark any file containing range deletions as requiring a 272 // compaction. This ensures that range deletions are quickly compacted out 273 // of existence. 274 return c.numRangeTombstones > 0 275 } 276 277 func (*pebbleDeleteRangeCollector) Finish(userProps map[string]string) error { 278 return nil 279 } 280 281 func (*pebbleDeleteRangeCollector) Name() string { 282 // This constant needs to match the one used by the RocksDB version of this 283 // table property collector. DO NOT CHANGE. 284 return "DeleteRangeTblPropCollectorFactory" 285 } 286 287 // PebbleTablePropertyCollectors is the list of Pebble TablePropertyCollectors. 288 var PebbleTablePropertyCollectors = []func() pebble.TablePropertyCollector{ 289 func() pebble.TablePropertyCollector { return &pebbleTimeBoundPropCollector{} }, 290 func() pebble.TablePropertyCollector { return &pebbleDeleteRangeCollector{} }, 291 } 292 293 // DefaultPebbleOptions returns the default pebble options. 294 func DefaultPebbleOptions() *pebble.Options { 295 // In RocksDB, the concurrency setting corresponds to both flushes and 296 // compactions. In Pebble, there is always a slot for a flush, and 297 // compactions are counted separately. 298 maxConcurrentCompactions := rocksdbConcurrency - 1 299 if maxConcurrentCompactions < 1 { 300 maxConcurrentCompactions = 1 301 } 302 303 opts := &pebble.Options{ 304 Comparer: MVCCComparer, 305 L0CompactionThreshold: 2, 306 L0StopWritesThreshold: 1000, 307 LBaseMaxBytes: 64 << 20, // 64 MB 308 Levels: make([]pebble.LevelOptions, 7), 309 MaxConcurrentCompactions: maxConcurrentCompactions, 310 MemTableSize: 64 << 20, // 64 MB 311 MemTableStopWritesThreshold: 4, 312 Merger: MVCCMerger, 313 MinFlushRate: 4 << 20, // 4 MB/sec 314 TablePropertyCollectors: PebbleTablePropertyCollectors, 315 } 316 317 for i := 0; i < len(opts.Levels); i++ { 318 l := &opts.Levels[i] 319 l.BlockSize = 32 << 10 // 32 KB 320 l.IndexBlockSize = 256 << 10 // 256 KB 321 l.FilterPolicy = bloom.FilterPolicy(10) 322 l.FilterType = pebble.TableFilter 323 if i > 0 { 324 l.TargetFileSize = opts.Levels[i-1].TargetFileSize * 2 325 } 326 l.EnsureDefaults() 327 } 328 329 // Do not create bloom filters for the last level (i.e. the largest level 330 // which contains data in the LSM store). This configuration reduces the size 331 // of the bloom filters by 10x. This is significant given that bloom filters 332 // require 1.25 bytes (10 bits) per key which can translate into gigabytes of 333 // memory given typical key and value sizes. The downside is that bloom 334 // filters will only be usable on the higher levels, but that seems 335 // acceptable. We typically see read amplification of 5-6x on clusters 336 // (i.e. there are 5-6 levels of sstables) which means we'll achieve 80-90% 337 // of the benefit of having bloom filters on every level for only 10% of the 338 // memory cost. 339 opts.Levels[6].FilterPolicy = nil 340 return opts 341 } 342 343 var pebbleLog *log.SecondaryLogger 344 345 // InitPebbleLogger initializes the logger to use for Pebble log messages. If 346 // not called, WARNING, ERROR, and FATAL logs will be output to the normal 347 // CockroachDB log. The caller is responsible for ensuring the 348 // Close() method is eventually called on the new logger. 349 func InitPebbleLogger(ctx context.Context) *log.SecondaryLogger { 350 pebbleLog = log.NewSecondaryLogger(ctx, nil, "pebble", 351 true /* enableGC */, false /* forceSyncWrites */, false /* enableMsgCount */) 352 return pebbleLog 353 } 354 355 type pebbleLogger struct { 356 ctx context.Context 357 depth int 358 } 359 360 func (l pebbleLogger) Infof(format string, args ...interface{}) { 361 if pebbleLog != nil { 362 pebbleLog.LogfDepth(l.ctx, l.depth, format, args...) 363 return 364 } 365 // Only log INFO logs to the normal CockroachDB log at --v=3 and above. 366 if log.V(3) { 367 log.InfofDepth(l.ctx, l.depth, format, args...) 368 } 369 } 370 371 func (l pebbleLogger) Fatalf(format string, args ...interface{}) { 372 log.FatalfDepth(l.ctx, l.depth, format, args...) 373 } 374 375 // PebbleConfig holds all configuration parameters and knobs used in setting up 376 // a new Pebble instance. 377 type PebbleConfig struct { 378 // StorageConfig contains storage configs for all storage engines. 379 base.StorageConfig 380 // Pebble specific options. 381 Opts *pebble.Options 382 } 383 384 // EncryptionStatsHandler provides encryption related stats. 385 type EncryptionStatsHandler interface { 386 // Returns a serialized enginepbccl.EncryptionStatus. 387 GetEncryptionStatus() ([]byte, error) 388 // Returns a serialized enginepbccl.DataKeysRegistry, scrubbed of key contents. 389 GetDataKeysRegistry() ([]byte, error) 390 // Returns the ID of the active data key, or "plain" if none. 391 GetActiveDataKeyID() (string, error) 392 // Returns the enum value of the encryption type. 393 GetActiveStoreKeyType() int32 394 // Returns the KeyID embedded in the serialized EncryptionSettings. 395 GetKeyIDFromSettings(settings []byte) (string, error) 396 } 397 398 // Pebble is a wrapper around a Pebble database instance. 399 type Pebble struct { 400 db *pebble.DB 401 402 closed bool 403 path string 404 auxDir string 405 maxSize int64 406 attrs roachpb.Attributes 407 settings *cluster.Settings 408 statsHandler EncryptionStatsHandler 409 fileRegistry *PebbleFileRegistry 410 411 // Relevant options copied over from pebble.Options. 412 fs vfs.FS 413 logger pebble.Logger 414 } 415 416 var _ Engine = &Pebble{} 417 418 // NewEncryptedEnvFunc creates an encrypted environment and returns the vfs.FS to use for reading 419 // and writing data. This should be initialized by calling engineccl.Init() before calling 420 // NewPebble(). The optionBytes is a binary serialized baseccl.EncryptionOptions, so that non-CCL 421 // code does not depend on CCL code. 422 var NewEncryptedEnvFunc func(fs vfs.FS, fr *PebbleFileRegistry, dbDir string, readOnly bool, optionBytes []byte) (vfs.FS, EncryptionStatsHandler, error) 423 424 // ResolveEncryptedEnvOptions fills in cfg.Opts.FS with an encrypted vfs if this 425 // store has encryption-at-rest enabled. Also returns the associated file 426 // registry and EncryptionStatsHandler. 427 func ResolveEncryptedEnvOptions( 428 cfg *PebbleConfig, 429 ) (*PebbleFileRegistry, EncryptionStatsHandler, error) { 430 fileRegistry := &PebbleFileRegistry{FS: cfg.Opts.FS, DBDir: cfg.Dir, ReadOnly: cfg.Opts.ReadOnly} 431 if cfg.UseFileRegistry { 432 if err := fileRegistry.Load(); err != nil { 433 return nil, nil, err 434 } 435 } else { 436 if err := fileRegistry.checkNoRegistryFile(); err != nil { 437 return nil, nil, fmt.Errorf("encryption was used on this store before, but no encryption flags " + 438 "specified. You need a CCL build and must fully specify the --enterprise-encryption flag") 439 } 440 fileRegistry = nil 441 } 442 443 var statsHandler EncryptionStatsHandler 444 if len(cfg.ExtraOptions) > 0 { 445 // Encryption is enabled. 446 if !cfg.UseFileRegistry { 447 return nil, nil, fmt.Errorf("file registry is needed to support encryption") 448 } 449 if NewEncryptedEnvFunc == nil { 450 return nil, nil, fmt.Errorf("encryption is enabled but no function to create the encrypted env") 451 } 452 var err error 453 cfg.Opts.FS, statsHandler, err = 454 NewEncryptedEnvFunc(cfg.Opts.FS, fileRegistry, cfg.Dir, cfg.Opts.ReadOnly, cfg.ExtraOptions) 455 if err != nil { 456 return nil, nil, err 457 } 458 } 459 return fileRegistry, statsHandler, nil 460 } 461 462 // NewPebble creates a new Pebble instance, at the specified path. 463 func NewPebble(ctx context.Context, cfg PebbleConfig) (*Pebble, error) { 464 // pebble.Open also calls EnsureDefaults, but only after doing a clone. Call 465 // EnsureDefaults beforehand so we have a matching cfg here for when we save 466 // cfg.FS and cfg.ReadOnly later on. 467 cfg.Opts.EnsureDefaults() 468 cfg.Opts.ErrorIfNotExists = cfg.MustExist 469 if settings := cfg.Settings; settings != nil { 470 cfg.Opts.WALMinSyncInterval = func() time.Duration { 471 return minWALSyncInterval.Get(&settings.SV) 472 } 473 } 474 475 var auxDir string 476 if cfg.Dir == "" { 477 // TODO(peter): This is horribly hacky but matches what RocksDB does. For 478 // in-memory instances, we create an on-disk auxiliary directory. This is 479 // necessary because various tests expect the auxiliary directory to 480 // actually exist on disk even though they don't actually write files to 481 // the directory. See SSTSnapshotStorage for one example of this bad 482 // behavior. 483 var err error 484 auxDir, err = ioutil.TempDir(os.TempDir(), "cockroach-auxiliary") 485 if err != nil { 486 return nil, err 487 } 488 } else { 489 auxDir = cfg.Opts.FS.PathJoin(cfg.Dir, base.AuxiliaryDir) 490 if err := cfg.Opts.FS.MkdirAll(auxDir, 0755); err != nil { 491 return nil, err 492 } 493 } 494 495 fileRegistry, statsHandler, err := ResolveEncryptedEnvOptions(&cfg) 496 if err != nil { 497 return nil, err 498 } 499 500 // The context dance here is done so that we have a clean context without 501 // timeouts that has a copy of the log tags. 502 logCtx := logtags.WithTags(context.Background(), logtags.FromContext(ctx)) 503 cfg.Opts.Logger = pebbleLogger{ 504 ctx: logCtx, 505 depth: 1, 506 } 507 cfg.Opts.EventListener = pebble.MakeLoggingEventListener(pebbleLogger{ 508 ctx: logCtx, 509 depth: 2, // skip over the EventListener stack frame 510 }) 511 512 db, err := pebble.Open(cfg.StorageConfig.Dir, cfg.Opts) 513 if err != nil { 514 return nil, err 515 } 516 517 return &Pebble{ 518 db: db, 519 path: cfg.Dir, 520 auxDir: auxDir, 521 maxSize: cfg.MaxSize, 522 attrs: cfg.Attrs, 523 settings: cfg.Settings, 524 statsHandler: statsHandler, 525 fileRegistry: fileRegistry, 526 fs: cfg.Opts.FS, 527 logger: cfg.Opts.Logger, 528 }, nil 529 } 530 531 func newTeeInMem(ctx context.Context, attrs roachpb.Attributes, cacheSize int64) *TeeEngine { 532 // Note that we use the same unmodified directories for both pebble and 533 // rocksdb. This is to make sure the file paths match up, and that we're 534 // able to write to both and ingest from both memory filesystems. 535 pebbleInMem := newPebbleInMem(ctx, attrs, cacheSize) 536 rocksDBInMem := newRocksDBInMem(attrs, cacheSize) 537 tee := NewTee(ctx, rocksDBInMem, pebbleInMem) 538 return tee 539 } 540 541 func newPebbleInMem(ctx context.Context, attrs roachpb.Attributes, cacheSize int64) *Pebble { 542 opts := DefaultPebbleOptions() 543 opts.Cache = pebble.NewCache(cacheSize) 544 defer opts.Cache.Unref() 545 546 opts.FS = vfs.NewMem() 547 db, err := NewPebble( 548 ctx, 549 PebbleConfig{ 550 StorageConfig: base.StorageConfig{ 551 Attrs: attrs, 552 // TODO(bdarnell): The hard-coded 512 MiB is wrong; see 553 // https://github.com/cockroachdb/cockroach/issues/16750 554 MaxSize: 512 << 20, /* 512 MiB */ 555 }, 556 Opts: opts, 557 }) 558 if err != nil { 559 panic(err) 560 } 561 return db 562 } 563 564 func (p *Pebble) String() string { 565 dir := p.path 566 if dir == "" { 567 dir = "<in-mem>" 568 } 569 attrs := p.attrs.String() 570 if attrs == "" { 571 attrs = "<no-attributes>" 572 } 573 return fmt.Sprintf("%s=%s", attrs, dir) 574 } 575 576 // Close implements the Engine interface. 577 func (p *Pebble) Close() { 578 if p.closed { 579 p.logger.Infof("closing unopened pebble instance") 580 return 581 } 582 p.closed = true 583 584 if p.path == "" { 585 // Remove the temporary directory when the engine is in-memory. This 586 // matches the RocksDB behavior. 587 // 588 // TODO(peter): The aux-dir shouldn't be on-disk for in-memory 589 // engines. This is just a wart that needs to be removed. 590 if err := os.RemoveAll(p.auxDir); err != nil { 591 p.logger.Infof("%v", err) 592 } 593 } 594 595 _ = p.db.Close() 596 } 597 598 // Closed implements the Engine interface. 599 func (p *Pebble) Closed() bool { 600 return p.closed 601 } 602 603 // ExportToSst is part of the engine.Reader interface. 604 func (p *Pebble) ExportToSst( 605 startKey, endKey roachpb.Key, 606 startTS, endTS hlc.Timestamp, 607 exportAllRevisions bool, 608 targetSize, maxSize uint64, 609 io IterOptions, 610 ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) { 611 return pebbleExportToSst(p, startKey, endKey, startTS, endTS, exportAllRevisions, targetSize, maxSize, io) 612 } 613 614 // Get implements the Engine interface. 615 func (p *Pebble) Get(key MVCCKey) ([]byte, error) { 616 if len(key.Key) == 0 { 617 return nil, emptyKeyError() 618 } 619 ret, closer, err := p.db.Get(EncodeKey(key)) 620 if closer != nil { 621 retCopy := make([]byte, len(ret)) 622 copy(retCopy, ret) 623 ret = retCopy 624 closer.Close() 625 } 626 if errors.Is(err, pebble.ErrNotFound) || len(ret) == 0 { 627 return nil, nil 628 } 629 return ret, err 630 } 631 632 // GetCompactionStats implements the Engine interface. 633 func (p *Pebble) GetCompactionStats() string { 634 // NB: The initial blank line matches the formatting used by RocksDB and 635 // ensures that compaction stats display will not contain the log prefix 636 // (this method is only used for logging purposes). 637 return "\n" + p.db.Metrics().String() 638 } 639 640 // GetProto implements the Engine interface. 641 func (p *Pebble) GetProto( 642 key MVCCKey, msg protoutil.Message, 643 ) (ok bool, keyBytes, valBytes int64, err error) { 644 if len(key.Key) == 0 { 645 return false, 0, 0, emptyKeyError() 646 } 647 encodedKey := EncodeKey(key) 648 val, closer, err := p.db.Get(encodedKey) 649 if closer != nil { 650 if msg != nil { 651 err = protoutil.Unmarshal(val, msg) 652 } 653 keyBytes = int64(len(encodedKey)) 654 valBytes = int64(len(val)) 655 closer.Close() 656 return true, keyBytes, valBytes, err 657 } 658 if errors.Is(err, pebble.ErrNotFound) { 659 return false, 0, 0, nil 660 } 661 return false, 0, 0, err 662 } 663 664 // Iterate implements the Engine interface. 665 func (p *Pebble) Iterate( 666 start, end roachpb.Key, f func(MVCCKeyValue) (stop bool, err error), 667 ) error { 668 return iterateOnReader(p, start, end, f) 669 } 670 671 // NewIterator implements the Engine interface. 672 func (p *Pebble) NewIterator(opts IterOptions) Iterator { 673 iter := newPebbleIterator(p.db, opts) 674 if iter == nil { 675 panic("couldn't create a new iterator") 676 } 677 return iter 678 } 679 680 // ApplyBatchRepr implements the Engine interface. 681 func (p *Pebble) ApplyBatchRepr(repr []byte, sync bool) error { 682 // batch.SetRepr takes ownership of the underlying slice, so make a copy. 683 reprCopy := make([]byte, len(repr)) 684 copy(reprCopy, repr) 685 686 batch := p.db.NewBatch() 687 if err := batch.SetRepr(reprCopy); err != nil { 688 return err 689 } 690 691 opts := pebble.NoSync 692 if sync { 693 opts = pebble.Sync 694 } 695 return batch.Commit(opts) 696 } 697 698 // Clear implements the Engine interface. 699 func (p *Pebble) Clear(key MVCCKey) error { 700 if len(key.Key) == 0 { 701 return emptyKeyError() 702 } 703 return p.db.Delete(EncodeKey(key), pebble.Sync) 704 } 705 706 // SingleClear implements the Engine interface. 707 func (p *Pebble) SingleClear(key MVCCKey) error { 708 if len(key.Key) == 0 { 709 return emptyKeyError() 710 } 711 return p.db.SingleDelete(EncodeKey(key), pebble.Sync) 712 } 713 714 // ClearRange implements the Engine interface. 715 func (p *Pebble) ClearRange(start, end MVCCKey) error { 716 bufStart := EncodeKey(start) 717 bufEnd := EncodeKey(end) 718 return p.db.DeleteRange(bufStart, bufEnd, pebble.Sync) 719 } 720 721 // ClearIterRange implements the Engine interface. 722 func (p *Pebble) ClearIterRange(iter Iterator, start, end roachpb.Key) error { 723 // Write all the tombstones in one batch. 724 batch := p.NewWriteOnlyBatch() 725 defer batch.Close() 726 727 if err := batch.ClearIterRange(iter, start, end); err != nil { 728 return err 729 } 730 return batch.Commit(true) 731 } 732 733 // Merge implements the Engine interface. 734 func (p *Pebble) Merge(key MVCCKey, value []byte) error { 735 if len(key.Key) == 0 { 736 return emptyKeyError() 737 } 738 return p.db.Merge(EncodeKey(key), value, pebble.Sync) 739 } 740 741 // Put implements the Engine interface. 742 func (p *Pebble) Put(key MVCCKey, value []byte) error { 743 if len(key.Key) == 0 { 744 return emptyKeyError() 745 } 746 return p.db.Set(EncodeKey(key), value, pebble.Sync) 747 } 748 749 // LogData implements the Engine interface. 750 func (p *Pebble) LogData(data []byte) error { 751 return p.db.LogData(data, pebble.Sync) 752 } 753 754 // LogLogicalOp implements the Engine interface. 755 func (p *Pebble) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { 756 // No-op. Logical logging disabled. 757 } 758 759 // Attrs implements the Engine interface. 760 func (p *Pebble) Attrs() roachpb.Attributes { 761 return p.attrs 762 } 763 764 // Capacity implements the Engine interface. 765 func (p *Pebble) Capacity() (roachpb.StoreCapacity, error) { 766 return computeCapacity(p.path, p.maxSize) 767 } 768 769 // Flush implements the Engine interface. 770 func (p *Pebble) Flush() error { 771 return p.db.Flush() 772 } 773 774 // GetStats implements the Engine interface. 775 func (p *Pebble) GetStats() (*Stats, error) { 776 m := p.db.Metrics() 777 778 // Aggregate compaction stats across levels. 779 var ingestedBytes, compactedBytesRead, compactedBytesWritten int64 780 for _, lm := range m.Levels { 781 ingestedBytes += int64(lm.BytesIngested) 782 compactedBytesRead += int64(lm.BytesRead) 783 compactedBytesWritten += int64(lm.BytesCompacted) 784 } 785 786 return &Stats{ 787 BlockCacheHits: m.BlockCache.Hits, 788 BlockCacheMisses: m.BlockCache.Misses, 789 BlockCacheUsage: m.BlockCache.Size, 790 BlockCachePinnedUsage: 0, 791 BloomFilterPrefixChecked: m.Filter.Hits + m.Filter.Misses, 792 BloomFilterPrefixUseful: m.Filter.Hits, 793 MemtableTotalSize: int64(m.MemTable.Size), 794 Flushes: m.Flush.Count, 795 FlushedBytes: int64(m.Levels[0].BytesFlushed), 796 Compactions: m.Compact.Count, 797 IngestedBytes: ingestedBytes, 798 CompactedBytesRead: compactedBytesRead, 799 CompactedBytesWritten: compactedBytesWritten, 800 TableReadersMemEstimate: m.TableCache.Size, 801 PendingCompactionBytesEstimate: int64(m.Compact.EstimatedDebt), 802 L0FileCount: m.Levels[0].NumFiles, 803 }, nil 804 } 805 806 // GetEncryptionRegistries implements the Engine interface. 807 func (p *Pebble) GetEncryptionRegistries() (*EncryptionRegistries, error) { 808 rv := &EncryptionRegistries{} 809 var err error 810 if p.statsHandler != nil { 811 rv.KeyRegistry, err = p.statsHandler.GetDataKeysRegistry() 812 if err != nil { 813 return nil, err 814 } 815 } 816 if p.fileRegistry != nil { 817 rv.FileRegistry, err = protoutil.Marshal(p.fileRegistry.getRegistryCopy()) 818 if err != nil { 819 return nil, err 820 } 821 } 822 return rv, nil 823 } 824 825 // GetEnvStats implements the Engine interface. 826 func (p *Pebble) GetEnvStats() (*EnvStats, error) { 827 // TODO(sumeer): make the stats complete. There are no bytes stats. The TotalFiles is missing 828 // files that are not in the registry (from before encryption was enabled). 829 stats := &EnvStats{} 830 if p.statsHandler == nil { 831 return stats, nil 832 } 833 stats.EncryptionType = p.statsHandler.GetActiveStoreKeyType() 834 var err error 835 stats.EncryptionStatus, err = p.statsHandler.GetEncryptionStatus() 836 if err != nil { 837 return nil, err 838 } 839 fr := p.fileRegistry.getRegistryCopy() 840 activeKeyID, err := p.statsHandler.GetActiveDataKeyID() 841 if err != nil { 842 return nil, err 843 } 844 845 m := p.db.Metrics() 846 stats.TotalFiles = 3 /* CURRENT, MANIFEST, OPTIONS */ 847 stats.TotalFiles += uint64(m.WAL.Files + m.Table.ZombieCount + m.WAL.ObsoleteFiles) 848 stats.TotalBytes = m.WAL.Size + m.Table.ZombieSize 849 for _, l := range m.Levels { 850 stats.TotalFiles += uint64(l.NumFiles) 851 stats.TotalBytes += l.Size 852 } 853 854 sstSizes := make(map[pebble.FileNum]uint64) 855 for _, ssts := range p.db.SSTables() { 856 for _, sst := range ssts { 857 sstSizes[sst.FileNum] = sst.Size 858 } 859 } 860 861 for filePath, entry := range fr.Files { 862 keyID, err := p.statsHandler.GetKeyIDFromSettings(entry.EncryptionSettings) 863 if err != nil { 864 return nil, err 865 } 866 if len(keyID) == 0 { 867 keyID = "plain" 868 } 869 if keyID != activeKeyID { 870 continue 871 } 872 stats.ActiveKeyFiles++ 873 874 filename := p.fs.PathBase(filePath) 875 numStr := strings.TrimSuffix(filename, ".sst") 876 if len(numStr) == len(filename) { 877 continue // not a sstable 878 } 879 u, err := strconv.ParseUint(numStr, 10, 64) 880 if err != nil { 881 return nil, errors.Wrapf(err, "parsing filename %q", errors.Safe(filename)) 882 } 883 stats.ActiveKeyBytes += sstSizes[pebble.FileNum(u)] 884 } 885 return stats, nil 886 } 887 888 // GetAuxiliaryDir implements the Engine interface. 889 func (p *Pebble) GetAuxiliaryDir() string { 890 return p.auxDir 891 } 892 893 // NewBatch implements the Engine interface. 894 func (p *Pebble) NewBatch() Batch { 895 return newPebbleBatch(p.db, p.db.NewIndexedBatch()) 896 } 897 898 // NewReadOnly implements the Engine interface. 899 func (p *Pebble) NewReadOnly() ReadWriter { 900 return &pebbleReadOnly{ 901 parent: p, 902 } 903 } 904 905 // NewWriteOnlyBatch implements the Engine interface. 906 func (p *Pebble) NewWriteOnlyBatch() Batch { 907 return newPebbleBatch(p.db, p.db.NewBatch()) 908 } 909 910 // NewSnapshot implements the Engine interface. 911 func (p *Pebble) NewSnapshot() Reader { 912 return &pebbleSnapshot{ 913 snapshot: p.db.NewSnapshot(), 914 } 915 } 916 917 // Type implements the Engine interface. 918 func (p *Pebble) Type() enginepb.EngineType { 919 return enginepb.EngineTypePebble 920 } 921 922 // IngestExternalFiles implements the Engine interface. 923 func (p *Pebble) IngestExternalFiles(ctx context.Context, paths []string) error { 924 return p.db.Ingest(paths) 925 } 926 927 // PreIngestDelay implements the Engine interface. 928 func (p *Pebble) PreIngestDelay(ctx context.Context) { 929 preIngestDelay(ctx, p, p.settings) 930 } 931 932 // ApproximateDiskBytes implements the Engine interface. 933 func (p *Pebble) ApproximateDiskBytes(from, to roachpb.Key) (uint64, error) { 934 count, err := p.db.EstimateDiskUsage(from, to) 935 if err != nil { 936 return 0, err 937 } 938 return count, nil 939 } 940 941 // Compact implements the Engine interface. 942 func (p *Pebble) Compact() error { 943 return p.db.Compact(nil, EncodeKey(MVCCKeyMax)) 944 } 945 946 // CompactRange implements the Engine interface. 947 func (p *Pebble) CompactRange(start, end roachpb.Key, forceBottommost bool) error { 948 bufStart := EncodeKey(MVCCKey{start, hlc.Timestamp{}}) 949 bufEnd := EncodeKey(MVCCKey{end, hlc.Timestamp{}}) 950 return p.db.Compact(bufStart, bufEnd) 951 } 952 953 // InMem returns true if the receiver is an in-memory engine and false 954 // otherwise. 955 func (p *Pebble) InMem() bool { 956 return p.path == "" 957 } 958 959 // ReadFile implements the Engine interface. 960 func (p *Pebble) ReadFile(filename string) ([]byte, error) { 961 file, err := p.fs.Open(filename) 962 if err != nil { 963 return nil, err 964 } 965 defer file.Close() 966 967 return ioutil.ReadAll(file) 968 } 969 970 // WriteFile writes data to a file in this RocksDB's env. 971 func (p *Pebble) WriteFile(filename string, data []byte) error { 972 file, err := p.fs.Create(filename) 973 if err != nil { 974 return err 975 } 976 defer file.Close() 977 978 _, err = io.Copy(file, bytes.NewReader(data)) 979 return err 980 } 981 982 // Remove implements the FS interface. 983 func (p *Pebble) Remove(filename string) error { 984 return p.fs.Remove(filename) 985 } 986 987 // RemoveAll implements the Engine interface. 988 func (p *Pebble) RemoveAll(dir string) error { 989 return p.fs.RemoveAll(dir) 990 } 991 992 // Link implements the FS interface. 993 func (p *Pebble) Link(oldname, newname string) error { 994 return p.fs.Link(oldname, newname) 995 } 996 997 var _ fs.FS = &Pebble{} 998 999 // Create implements the FS interface. 1000 func (p *Pebble) Create(name string) (fs.File, error) { 1001 // TODO(peter): On RocksDB, the MemEnv allows creating a file when the parent 1002 // directory does not exist. Various tests in the storage package depend on 1003 // this because they are accidentally creating the required directory on the 1004 // actual filesystem instead of in the memory filesystem. See 1005 // diskSideloadedStorage and SSTSnapshotStrategy. 1006 if p.InMem() { 1007 _ = p.fs.MkdirAll(p.fs.PathDir(name), 0755) 1008 } 1009 return p.fs.Create(name) 1010 } 1011 1012 // CreateWithSync implements the FS interface. 1013 func (p *Pebble) CreateWithSync(name string, bytesPerSync int) (fs.File, error) { 1014 // TODO(peter): On RocksDB, the MemEnv allows creating a file when the parent 1015 // directory does not exist. Various tests in the storage package depend on 1016 // this because they are accidentally creating the required directory on the 1017 // actual filesystem instead of in the memory filesystem. See 1018 // diskSideloadedStorage and SSTSnapshotStrategy. 1019 if p.InMem() { 1020 _ = p.fs.MkdirAll(p.fs.PathDir(name), 0755) 1021 } 1022 f, err := p.fs.Create(name) 1023 if err != nil { 1024 return nil, err 1025 } 1026 return vfs.NewSyncingFile(f, vfs.SyncingFileOptions{BytesPerSync: bytesPerSync}), nil 1027 } 1028 1029 // Open implements the FS interface. 1030 func (p *Pebble) Open(name string) (fs.File, error) { 1031 return p.fs.Open(name) 1032 } 1033 1034 // OpenDir implements the FS interface. 1035 func (p *Pebble) OpenDir(name string) (fs.File, error) { 1036 return p.fs.OpenDir(name) 1037 } 1038 1039 // Rename implements the FS interface. 1040 func (p *Pebble) Rename(oldname, newname string) error { 1041 return p.fs.Rename(oldname, newname) 1042 } 1043 1044 // MkdirAll implements the FS interface. 1045 func (p *Pebble) MkdirAll(name string) error { 1046 return p.fs.MkdirAll(name, 0755) 1047 } 1048 1049 // RemoveDir implements the FS interface. 1050 func (p *Pebble) RemoveDir(name string) error { 1051 return p.fs.Remove(name) 1052 } 1053 1054 // List implements the FS interface. 1055 func (p *Pebble) List(name string) ([]string, error) { 1056 dirents, err := p.fs.List(name) 1057 sort.Strings(dirents) 1058 return dirents, err 1059 } 1060 1061 // CreateCheckpoint implements the Engine interface. 1062 func (p *Pebble) CreateCheckpoint(dir string) error { 1063 return p.db.Checkpoint(dir) 1064 } 1065 1066 // GetSSTables implements the WithSSTables interface. 1067 func (p *Pebble) GetSSTables() (sstables SSTableInfos) { 1068 for level, tables := range p.db.SSTables() { 1069 for _, table := range tables { 1070 startKey, _ := DecodeMVCCKey(table.Smallest.UserKey) 1071 endKey, _ := DecodeMVCCKey(table.Largest.UserKey) 1072 info := SSTableInfo{ 1073 Level: level, 1074 Size: int64(table.Size), 1075 Start: startKey, 1076 End: endKey, 1077 } 1078 sstables = append(sstables, info) 1079 } 1080 } 1081 1082 sort.Sort(sstables) 1083 return sstables 1084 } 1085 1086 type pebbleReadOnly struct { 1087 parent *Pebble 1088 prefixIter pebbleIterator 1089 normalIter pebbleIterator 1090 closed bool 1091 } 1092 1093 var _ ReadWriter = &pebbleReadOnly{} 1094 1095 func (p *pebbleReadOnly) Close() { 1096 if p.closed { 1097 panic("closing an already-closed pebbleReadOnly") 1098 } 1099 p.closed = true 1100 p.prefixIter.destroy() 1101 p.normalIter.destroy() 1102 } 1103 1104 func (p *pebbleReadOnly) Closed() bool { 1105 return p.closed 1106 } 1107 1108 // ExportToSst is part of the engine.Reader interface. 1109 func (p *pebbleReadOnly) ExportToSst( 1110 startKey, endKey roachpb.Key, 1111 startTS, endTS hlc.Timestamp, 1112 exportAllRevisions bool, 1113 targetSize, maxSize uint64, 1114 io IterOptions, 1115 ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) { 1116 return pebbleExportToSst(p, startKey, endKey, startTS, endTS, exportAllRevisions, targetSize, maxSize, io) 1117 } 1118 1119 func (p *pebbleReadOnly) Get(key MVCCKey) ([]byte, error) { 1120 if p.closed { 1121 panic("using a closed pebbleReadOnly") 1122 } 1123 return p.parent.Get(key) 1124 } 1125 1126 func (p *pebbleReadOnly) GetProto( 1127 key MVCCKey, msg protoutil.Message, 1128 ) (ok bool, keyBytes, valBytes int64, err error) { 1129 if p.closed { 1130 panic("using a closed pebbleReadOnly") 1131 } 1132 return p.parent.GetProto(key, msg) 1133 } 1134 1135 func (p *pebbleReadOnly) Iterate(start, end roachpb.Key, f func(MVCCKeyValue) (bool, error)) error { 1136 if p.closed { 1137 panic("using a closed pebbleReadOnly") 1138 } 1139 return iterateOnReader(p, start, end, f) 1140 } 1141 1142 func (p *pebbleReadOnly) NewIterator(opts IterOptions) Iterator { 1143 if p.closed { 1144 panic("using a closed pebbleReadOnly") 1145 } 1146 1147 if opts.MinTimestampHint != (hlc.Timestamp{}) { 1148 // Iterators that specify timestamp bounds cannot be cached. 1149 return newPebbleIterator(p.parent.db, opts) 1150 } 1151 1152 iter := &p.normalIter 1153 if opts.Prefix { 1154 iter = &p.prefixIter 1155 } 1156 if iter.inuse { 1157 panic("iterator already in use") 1158 } 1159 1160 if iter.iter != nil { 1161 iter.setOptions(opts) 1162 } else { 1163 iter.init(p.parent.db, opts) 1164 iter.reusable = true 1165 } 1166 1167 iter.inuse = true 1168 return iter 1169 } 1170 1171 // Writer methods are not implemented for pebbleReadOnly. Ideally, the code 1172 // could be refactored so that a Reader could be supplied to evaluateBatch 1173 1174 // Writer is the write interface to an engine's data. 1175 func (p *pebbleReadOnly) ApplyBatchRepr(repr []byte, sync bool) error { 1176 panic("not implemented") 1177 } 1178 1179 func (p *pebbleReadOnly) Clear(key MVCCKey) error { 1180 panic("not implemented") 1181 } 1182 1183 func (p *pebbleReadOnly) SingleClear(key MVCCKey) error { 1184 panic("not implemented") 1185 } 1186 1187 func (p *pebbleReadOnly) ClearRange(start, end MVCCKey) error { 1188 panic("not implemented") 1189 } 1190 1191 func (p *pebbleReadOnly) ClearIterRange(iter Iterator, start, end roachpb.Key) error { 1192 panic("not implemented") 1193 } 1194 1195 func (p *pebbleReadOnly) Merge(key MVCCKey, value []byte) error { 1196 panic("not implemented") 1197 } 1198 1199 func (p *pebbleReadOnly) Put(key MVCCKey, value []byte) error { 1200 panic("not implemented") 1201 } 1202 1203 func (p *pebbleReadOnly) LogData(data []byte) error { 1204 panic("not implemented") 1205 } 1206 1207 func (p *pebbleReadOnly) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { 1208 panic("not implemented") 1209 } 1210 1211 // pebbleSnapshot represents a snapshot created using Pebble.NewSnapshot(). 1212 type pebbleSnapshot struct { 1213 snapshot *pebble.Snapshot 1214 closed bool 1215 } 1216 1217 var _ Reader = &pebbleSnapshot{} 1218 1219 // Close implements the Reader interface. 1220 func (p *pebbleSnapshot) Close() { 1221 _ = p.snapshot.Close() 1222 p.closed = true 1223 } 1224 1225 // Closed implements the Reader interface. 1226 func (p *pebbleSnapshot) Closed() bool { 1227 return p.closed 1228 } 1229 1230 // ExportToSst is part of the engine.Reader interface. 1231 func (p *pebbleSnapshot) ExportToSst( 1232 startKey, endKey roachpb.Key, 1233 startTS, endTS hlc.Timestamp, 1234 exportAllRevisions bool, 1235 targetSize, maxSize uint64, 1236 io IterOptions, 1237 ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) { 1238 return pebbleExportToSst(p, startKey, endKey, startTS, endTS, exportAllRevisions, targetSize, maxSize, io) 1239 } 1240 1241 // Get implements the Reader interface. 1242 func (p *pebbleSnapshot) Get(key MVCCKey) ([]byte, error) { 1243 if len(key.Key) == 0 { 1244 return nil, emptyKeyError() 1245 } 1246 1247 ret, closer, err := p.snapshot.Get(EncodeKey(key)) 1248 if closer != nil { 1249 retCopy := make([]byte, len(ret)) 1250 copy(retCopy, ret) 1251 ret = retCopy 1252 closer.Close() 1253 } 1254 if errors.Is(err, pebble.ErrNotFound) || len(ret) == 0 { 1255 return nil, nil 1256 } 1257 return ret, err 1258 } 1259 1260 // GetProto implements the Reader interface. 1261 func (p *pebbleSnapshot) GetProto( 1262 key MVCCKey, msg protoutil.Message, 1263 ) (ok bool, keyBytes, valBytes int64, err error) { 1264 if len(key.Key) == 0 { 1265 return false, 0, 0, emptyKeyError() 1266 } 1267 encodedKey := EncodeKey(key) 1268 val, closer, err := p.snapshot.Get(encodedKey) 1269 if closer != nil { 1270 if msg != nil { 1271 err = protoutil.Unmarshal(val, msg) 1272 } 1273 keyBytes = int64(len(encodedKey)) 1274 valBytes = int64(len(val)) 1275 closer.Close() 1276 return true, keyBytes, valBytes, err 1277 } 1278 if errors.Is(err, pebble.ErrNotFound) { 1279 return false, 0, 0, nil 1280 } 1281 return false, 0, 0, err 1282 } 1283 1284 // Iterate implements the Reader interface. 1285 func (p *pebbleSnapshot) Iterate( 1286 start, end roachpb.Key, f func(MVCCKeyValue) (stop bool, err error), 1287 ) error { 1288 return iterateOnReader(p, start, end, f) 1289 } 1290 1291 // NewIterator implements the Reader interface. 1292 func (p pebbleSnapshot) NewIterator(opts IterOptions) Iterator { 1293 return newPebbleIterator(p.snapshot, opts) 1294 } 1295 1296 func pebbleExportToSst( 1297 reader Reader, 1298 startKey, endKey roachpb.Key, 1299 startTS, endTS hlc.Timestamp, 1300 exportAllRevisions bool, 1301 targetSize, maxSize uint64, 1302 io IterOptions, 1303 ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) { 1304 sstFile := &MemFile{} 1305 sstWriter := MakeBackupSSTWriter(sstFile) 1306 defer sstWriter.Close() 1307 1308 var rows RowCounter 1309 iter := NewMVCCIncrementalIterator( 1310 reader, 1311 MVCCIncrementalIterOptions{ 1312 IterOptions: io, 1313 StartTime: startTS, 1314 EndTime: endTS, 1315 }) 1316 defer iter.Close() 1317 var curKey roachpb.Key // only used if exportAllRevisions 1318 var resumeKey roachpb.Key 1319 paginated := targetSize > 0 1320 for iter.SeekGE(MakeMVCCMetadataKey(startKey)); ; { 1321 ok, err := iter.Valid() 1322 if err != nil { 1323 // The error may be a WriteIntentError. In which case, returning it will 1324 // cause this command to be retried. 1325 return nil, roachpb.BulkOpSummary{}, nil, err 1326 } 1327 if !ok { 1328 break 1329 } 1330 unsafeKey := iter.UnsafeKey() 1331 if unsafeKey.Key.Compare(endKey) >= 0 { 1332 break 1333 } 1334 unsafeValue := iter.UnsafeValue() 1335 isNewKey := !exportAllRevisions || !unsafeKey.Key.Equal(curKey) 1336 if paginated && exportAllRevisions && isNewKey { 1337 curKey = append(curKey[:0], unsafeKey.Key...) 1338 } 1339 1340 // Skip tombstone (len=0) records when start time is zero (non-incremental) 1341 // and we are not exporting all versions. 1342 skipTombstones := !exportAllRevisions && startTS.IsEmpty() 1343 if len(unsafeValue) > 0 || !skipTombstones { 1344 if err := rows.Count(unsafeKey.Key); err != nil { 1345 return nil, roachpb.BulkOpSummary{}, nil, errors.Wrapf(err, "decoding %s", unsafeKey) 1346 } 1347 curSize := rows.BulkOpSummary.DataSize 1348 reachedTargetSize := curSize > 0 && uint64(curSize) >= targetSize 1349 if paginated && isNewKey && reachedTargetSize { 1350 // Allocate the right size for resumeKey rather than using curKey. 1351 resumeKey = append(make(roachpb.Key, 0, len(unsafeKey.Key)), unsafeKey.Key...) 1352 break 1353 } 1354 if err := sstWriter.Put(unsafeKey, unsafeValue); err != nil { 1355 return nil, roachpb.BulkOpSummary{}, nil, errors.Wrapf(err, "adding key %s", unsafeKey) 1356 } 1357 newSize := curSize + int64(len(unsafeKey.Key)+len(unsafeValue)) 1358 if maxSize > 0 && newSize > int64(maxSize) { 1359 return nil, roachpb.BulkOpSummary{}, nil, 1360 errors.Errorf("export size (%d bytes) exceeds max size (%d bytes)", newSize, maxSize) 1361 } 1362 rows.BulkOpSummary.DataSize = newSize 1363 } 1364 1365 if exportAllRevisions { 1366 iter.Next() 1367 } else { 1368 iter.NextKey() 1369 } 1370 } 1371 1372 if rows.BulkOpSummary.DataSize == 0 { 1373 // If no records were added to the sstable, skip completing it and return a 1374 // nil slice – the export code will discard it anyway (based on 0 DataSize). 1375 return nil, roachpb.BulkOpSummary{}, nil, nil 1376 } 1377 1378 if err := sstWriter.Finish(); err != nil { 1379 return nil, roachpb.BulkOpSummary{}, nil, err 1380 } 1381 1382 return sstFile.Data(), rows.BulkOpSummary, resumeKey, nil 1383 }