github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/rocksdb.go (about) 1 // Copyright 2014 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package storage 12 13 import ( 14 "bytes" 15 "context" 16 "fmt" 17 "io/ioutil" 18 "math" 19 "os" 20 "path/filepath" 21 "runtime" 22 "runtime/debug" 23 "sort" 24 "strings" 25 "sync" 26 "time" 27 "unsafe" 28 29 "github.com/cockroachdb/cockroach/pkg/base" 30 "github.com/cockroachdb/cockroach/pkg/roachpb" 31 "github.com/cockroachdb/cockroach/pkg/settings" 32 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 33 "github.com/cockroachdb/cockroach/pkg/storage/fs" 34 "github.com/cockroachdb/cockroach/pkg/util" 35 "github.com/cockroachdb/cockroach/pkg/util/envutil" 36 "github.com/cockroachdb/cockroach/pkg/util/hlc" 37 "github.com/cockroachdb/cockroach/pkg/util/humanizeutil" 38 "github.com/cockroachdb/cockroach/pkg/util/log" 39 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 40 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 41 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 42 "github.com/cockroachdb/errors" 43 "github.com/cockroachdb/logtags" 44 ) 45 46 // TODO(tamird): why does rocksdb not link jemalloc,snappy statically? 47 48 // #cgo CPPFLAGS: -I../../c-deps/libroach/include 49 // #cgo LDFLAGS: -lroach 50 // #cgo LDFLAGS: -lprotobuf 51 // #cgo LDFLAGS: -lrocksdb 52 // #cgo LDFLAGS: -lsnappy 53 // #cgo linux LDFLAGS: -lrt -lpthread 54 // #cgo windows LDFLAGS: -lshlwapi -lrpcrt4 55 // 56 // #include <stdlib.h> 57 // #include <libroach.h> 58 import "C" 59 60 var minWALSyncInterval = settings.RegisterDurationSetting( 61 "rocksdb.min_wal_sync_interval", 62 "minimum duration between syncs of the RocksDB WAL", 63 0*time.Millisecond, 64 ) 65 66 var rocksdbConcurrency = envutil.EnvOrDefaultInt( 67 "COCKROACH_ROCKSDB_CONCURRENCY", func() int { 68 // Use up to min(numCPU, 4) threads for background RocksDB compactions per 69 // store. 70 const max = 4 71 if n := runtime.NumCPU(); n <= max { 72 return n 73 } 74 return max 75 }()) 76 77 // Set to true to perform expensive iterator debug leak checking. In normal 78 // operation, we perform inexpensive iterator leak checking but those checks do 79 // not indicate where the leak arose. The expensive checking tracks the stack 80 // traces of every iterator allocated. DO NOT ENABLE in production code. 81 const debugIteratorLeak = false 82 83 var rocksdbLogger *log.SecondaryLogger 84 85 // InitRocksDBLogger initializes the logger to use for RocksDB log messages. If 86 // not called, WARNING, ERROR, and FATAL logs will be output to the normal 87 // CockroachDB log. The caller is responsible for ensuring the 88 // Close() method is eventually called on the new logger. 
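//
// A minimal usage sketch (ctx is assumed to be a long-lived server context;
// the variable name is illustrative):
//
//	logger := InitRocksDBLogger(ctx)
//	defer logger.Close()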
89 func InitRocksDBLogger(ctx context.Context) *log.SecondaryLogger { 90 rocksdbLogger = log.NewSecondaryLogger(ctx, nil, "rocksdb", 91 true /* enableGC */, false /* forceSyncWrites */, false /* enableMsgCount */) 92 return rocksdbLogger 93 } 94 95 //export rocksDBLog 96 func rocksDBLog(usePrimaryLog C.bool, sevLvl C.int, s *C.char, n C.int) { 97 sev := log.Severity(sevLvl) 98 if !usePrimaryLog { 99 if rocksdbLogger != nil { 100 // NB: No need for the rocksdb tag if we're logging to a rocksdb specific 101 // file. 102 rocksdbLogger.LogSev(context.Background(), sev, C.GoStringN(s, n)) 103 return 104 } 105 106 // Only log INFO logs to the normal CockroachDB log at --v=3 and 107 // above. This only applies when we're not using the primary log for 108 // RocksDB generated messages (which is utilized by the encryption-at-rest 109 // code). 110 if sev == log.Severity_INFO && !log.V(3) { 111 return 112 } 113 } 114 115 ctx := logtags.AddTag(context.Background(), "rocksdb", nil) 116 switch sev { 117 case log.Severity_WARNING: 118 log.Warningf(ctx, "%v", C.GoStringN(s, n)) 119 case log.Severity_ERROR: 120 log.Errorf(ctx, "%v", C.GoStringN(s, n)) 121 case log.Severity_FATAL: 122 log.Fatalf(ctx, "%v", C.GoStringN(s, n)) 123 default: 124 log.Infof(ctx, "%v", C.GoStringN(s, n)) 125 } 126 } 127 128 //export prettyPrintKey 129 func prettyPrintKey(cKey C.DBKey) *C.char { 130 mvccKey := MVCCKey{ 131 Key: gobytes(unsafe.Pointer(cKey.key.data), int(cKey.key.len)), 132 Timestamp: hlc.Timestamp{ 133 WallTime: int64(cKey.wall_time), 134 Logical: int32(cKey.logical), 135 }, 136 } 137 return C.CString(mvccKey.String()) 138 } 139 140 const ( 141 // RecommendedMaxOpenFiles is the recommended value for RocksDB's 142 // max_open_files option. 143 RecommendedMaxOpenFiles = 10000 144 // MinimumMaxOpenFiles is the minimum value that RocksDB's max_open_files 145 // option can be set to. While this should be set as high as possible, the 146 // minimum total for a single store node must be under 2048 for Windows 147 // compatibility. See: 148 // https://wpdev.uservoice.com/forums/266908-command-prompt-console-bash-on-ubuntu-on-windo/suggestions/17310124-add-ability-to-change-max-number-of-open-files-for 149 MinimumMaxOpenFiles = 1700 150 ) 151 152 // SSTableInfo contains metadata about a single sstable. Note this mirrors 153 // the C.DBSSTable struct contents. 154 type SSTableInfo struct { 155 Level int 156 Size int64 157 Start MVCCKey 158 End MVCCKey 159 } 160 161 // SSTableInfos is a slice of SSTableInfo structures. 162 type SSTableInfos []SSTableInfo 163 164 func (s SSTableInfos) Len() int { 165 return len(s) 166 } 167 168 func (s SSTableInfos) Swap(i, j int) { 169 s[i], s[j] = s[j], s[i] 170 } 171 172 func (s SSTableInfos) Less(i, j int) bool { 173 switch { 174 case s[i].Level < s[j].Level: 175 return true 176 case s[i].Level > s[j].Level: 177 return false 178 case s[i].Size > s[j].Size: 179 return true 180 case s[i].Size < s[j].Size: 181 return false 182 default: 183 return s[i].Start.Less(s[j].Start) 184 } 185 } 186 187 func (s SSTableInfos) String() string { 188 const ( 189 KB = 1 << 10 190 MB = 1 << 20 191 GB = 1 << 30 192 TB = 1 << 40 193 ) 194 195 roundTo := func(val, to int64) int64 { 196 return (val + to/2) / to 197 } 198 199 // We're intentionally not using humanizeutil here as we want a slightly more 200 // compact representation. 
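// For example, roundTo(size, MB) below computes (size+MB/2)/MB, so a ~1.6 MiB
// sstable (1677722 bytes) prints as "2M", while a 100 KiB sstable prints as
// "100K".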
201 humanize := func(size int64) string { 202 switch { 203 case size < MB: 204 return fmt.Sprintf("%dK", roundTo(size, KB)) 205 case size < GB: 206 return fmt.Sprintf("%dM", roundTo(size, MB)) 207 case size < TB: 208 return fmt.Sprintf("%dG", roundTo(size, GB)) 209 default: 210 return fmt.Sprintf("%dT", roundTo(size, TB)) 211 } 212 } 213 214 type levelInfo struct { 215 size int64 216 count int 217 } 218 219 var levels []*levelInfo 220 for _, t := range s { 221 for i := len(levels); i <= t.Level; i++ { 222 levels = append(levels, &levelInfo{}) 223 } 224 info := levels[t.Level] 225 info.size += t.Size 226 info.count++ 227 } 228 229 var maxSize int 230 var maxLevelCount int 231 for _, info := range levels { 232 size := len(humanize(info.size)) 233 if maxSize < size { 234 maxSize = size 235 } 236 count := 1 + int(math.Log10(float64(info.count))) 237 if maxLevelCount < count { 238 maxLevelCount = count 239 } 240 } 241 levelFormat := fmt.Sprintf("%%d [ %%%ds %%%dd ]:", maxSize, maxLevelCount) 242 243 level := -1 244 var buf bytes.Buffer 245 var lastSize string 246 var lastSizeCount int 247 248 flushLastSize := func() { 249 if lastSizeCount > 0 { 250 fmt.Fprintf(&buf, " %s", lastSize) 251 if lastSizeCount > 1 { 252 fmt.Fprintf(&buf, "[%d]", lastSizeCount) 253 } 254 lastSizeCount = 0 255 } 256 } 257 258 maybeFlush := func(newLevel, i int) { 259 if level == newLevel { 260 return 261 } 262 flushLastSize() 263 if buf.Len() > 0 { 264 buf.WriteString("\n") 265 } 266 level = newLevel 267 if level >= 0 { 268 info := levels[level] 269 fmt.Fprintf(&buf, levelFormat, level, humanize(info.size), info.count) 270 } 271 } 272 273 for i, t := range s { 274 maybeFlush(t.Level, i) 275 size := humanize(t.Size) 276 if size == lastSize { 277 lastSizeCount++ 278 } else { 279 flushLastSize() 280 lastSize = size 281 lastSizeCount = 1 282 } 283 } 284 285 maybeFlush(-1, 0) 286 return buf.String() 287 } 288 289 // ReadAmplification returns RocksDB's worst case read amplification, which is 290 // the number of level-0 sstables plus the number of levels, other than level 0, 291 // with at least one sstable. 292 // 293 // This definition comes from here: 294 // https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide#level-style-compaction 295 func (s SSTableInfos) ReadAmplification() int { 296 var readAmp int 297 seenLevel := make(map[int]bool) 298 for _, t := range s { 299 if t.Level == 0 { 300 readAmp++ 301 } else if !seenLevel[t.Level] { 302 readAmp++ 303 seenLevel[t.Level] = true 304 } 305 } 306 return readAmp 307 } 308 309 // SSTableInfosByLevel maintains slices of SSTableInfo objects, one 310 // per level. The slice for each level contains the SSTableInfo 311 // objects for SSTables at that level, sorted by start key. 312 type SSTableInfosByLevel struct { 313 // Each level is a slice of SSTableInfos. 314 levels [][]SSTableInfo 315 } 316 317 // NewSSTableInfosByLevel returns a new SSTableInfosByLevel object 318 // based on the supplied SSTableInfos slice. 319 func NewSSTableInfosByLevel(s SSTableInfos) SSTableInfosByLevel { 320 var result SSTableInfosByLevel 321 for _, t := range s { 322 for i := len(result.levels); i <= t.Level; i++ { 323 result.levels = append(result.levels, []SSTableInfo{}) 324 } 325 result.levels[t.Level] = append(result.levels[t.Level], t) 326 } 327 // Sort each level by start key. 
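// Levels greater than zero hold non-overlapping sstables, so after this sort
// each of those slices is both ordered and disjoint; that is the property
// MaxLevelSpanOverlapsContiguousSSTables relies on when it binary-searches a
// level with sort.Search.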
328 for _, l := range result.levels { 329 sort.Slice(l, func(i, j int) bool { return l[i].Start.Less(l[j].Start) }) 330 } 331 return result 332 } 333 334 // MaxLevel returns the maximum level for which there are SSTables. 335 func (s *SSTableInfosByLevel) MaxLevel() int { 336 return len(s.levels) - 1 337 } 338 339 // MaxLevelSpanOverlapsContiguousSSTables returns the maximum level at 340 // which the specified key span overlaps either none, one, or at most 341 // two contiguous SSTables. Level 0 is returned if no level qualifies. 342 // 343 // This is useful when considering when to merge two compactions. In 344 // this case, the method is called with the "gap" between the two 345 // spans to be compacted. When the result is that the gap span touches 346 // at most two SSTables at a high level, it suggests that merging the 347 // two compactions is a good idea (as the up to two SSTables touched 348 // by the gap span, due to containing endpoints of the existing 349 // compactions, would be rewritten anyway). 350 // 351 // As an example, consider the following sstables in a small database: 352 // 353 // Level 0. 354 // {Level: 0, Size: 20, Start: key("a"), End: key("z")}, 355 // {Level: 0, Size: 15, Start: key("a"), End: key("k")}, 356 // Level 2. 357 // {Level: 2, Size: 200, Start: key("a"), End: key("j")}, 358 // {Level: 2, Size: 100, Start: key("k"), End: key("o")}, 359 // {Level: 2, Size: 100, Start: key("r"), End: key("t")}, 360 // Level 6. 361 // {Level: 6, Size: 201, Start: key("a"), End: key("c")}, 362 // {Level: 6, Size: 200, Start: key("d"), End: key("f")}, 363 // {Level: 6, Size: 300, Start: key("h"), End: key("r")}, 364 // {Level: 6, Size: 405, Start: key("s"), End: key("z")}, 365 // 366 // - The span "a"-"c" overlaps only a single SSTable at the max level 367 // (L6). That's great, so we definitely want to compact that. 368 // - The span "s"-"t" overlaps zero SSTables at the max level (L6). 369 // Again, great! That means we're going to compact the 3rd L2 370 // SSTable and maybe push that directly to L6. 371 func (s *SSTableInfosByLevel) MaxLevelSpanOverlapsContiguousSSTables(span roachpb.Span) int { 372 // Note overlapsMoreTHanTwo should not be called on level 0, where 373 // the SSTables are not guaranteed disjoint. 374 overlapsMoreThanTwo := func(tables []SSTableInfo) bool { 375 // Search to find the first sstable which might overlap the span. 376 i := sort.Search(len(tables), func(i int) bool { return span.Key.Compare(tables[i].End.Key) < 0 }) 377 // If no SSTable is overlapped, return false. 378 if i == -1 || i == len(tables) || span.EndKey.Compare(tables[i].Start.Key) < 0 { 379 return false 380 } 381 // Return true if the span is not subsumed by the combination of 382 // this sstable and the next. This logic is complicated and is 383 // covered in the unittest. There are three successive conditions 384 // which together ensure the span doesn't overlap > 2 SSTables. 385 // 386 // - If the first overlapped SSTable is the last. 387 // - If the span does not exceed the end of the next SSTable. 388 // - If the span does not overlap the start of the next next SSTable. 389 if i >= len(tables)-1 { 390 // First overlapped SSTable is the last (right-most) SSTable. 391 // Span: [c-----f) 392 // SSTs: [a---d) 393 // or 394 // SSTs: [a-----------q) 395 return false 396 } 397 if span.EndKey.Compare(tables[i+1].End.Key) <= 0 { 398 // Span does not reach outside of this SSTable's right neighbor. 399 // Span: [c------f) 400 // SSTs: [a---d) [e-f) ... 
401 return false 402 } 403 if i >= len(tables)-2 { 404 // Span reaches outside of this SSTable's right neighbor, but 405 // there are no more SSTables to the right. 406 // Span: [c-------------x) 407 // SSTs: [a---d) [e---q) 408 return false 409 } 410 if span.EndKey.Compare(tables[i+2].Start.Key) <= 0 { 411 // There's another SSTable two to the right, but the span doesn't 412 // reach into it. 413 // Span: [c------------x) 414 // SSTs: [a---d) [e---q) [x--z) ... 415 return false 416 } 417 418 // Touching at least three SSTables. 419 // Span: [c-------------y) 420 // SSTs: [a---d) [e---q) [x--z) ... 421 return true 422 } 423 // Note that we never consider level 0, where SSTables can overlap. 424 // Level 0 is instead returned as a catch-all which means that there 425 // is no level where the span overlaps only two or fewer SSTables. 426 for i := len(s.levels) - 1; i > 0; i-- { 427 if !overlapsMoreThanTwo(s.levels[i]) { 428 return i 429 } 430 } 431 return 0 432 } 433 434 // RocksDBCache is a wrapper around C.DBCache 435 type RocksDBCache struct { 436 cache *C.DBCache 437 } 438 439 // NewRocksDBCache creates a new cache of the specified size. Note that the 440 // cache is refcounted internally and starts out with a refcount of one (i.e. 441 // Release() should be called after having used the cache). 442 func NewRocksDBCache(cacheSize int64) RocksDBCache { 443 return RocksDBCache{cache: C.DBNewCache(C.uint64_t(cacheSize))} 444 } 445 446 func (c RocksDBCache) ref() RocksDBCache { 447 if c.cache != nil { 448 c.cache = C.DBRefCache(c.cache) 449 } 450 return c 451 } 452 453 // Release releases the cache. Note that the cache will continue to be used 454 // until all of the RocksDB engines it was attached to have been closed, and 455 // that RocksDB engines which use it auto-release when they close. 456 func (c RocksDBCache) Release() { 457 if c.cache != nil { 458 C.DBReleaseCache(c.cache) 459 } 460 } 461 462 // RocksDBConfig holds all configuration parameters and knobs used in setting 463 // up a new RocksDB instance. 464 type RocksDBConfig struct { 465 // StorageConfig contains storage configs for all storage engines. 466 base.StorageConfig 467 // ReadOnly will open the database in read only mode if set to true. 468 ReadOnly bool 469 // MaxOpenFiles controls the maximum number of file descriptors RocksDB 470 // creates. If MaxOpenFiles is zero, this is set to DefaultMaxOpenFiles. 471 MaxOpenFiles uint64 472 // WarnLargeBatchThreshold controls if a log message is printed when a 473 // WriteBatch takes longer than WarnLargeBatchThreshold. If it is set to 474 // zero, no log messages are ever printed. 475 WarnLargeBatchThreshold time.Duration 476 // RocksDBOptions contains RocksDB specific options using a semicolon 477 // separated key-value syntax ("key1=value1; key2=value2"). 478 RocksDBOptions string 479 } 480 481 // RocksDB is a wrapper around a RocksDB database instance. 482 type RocksDB struct { 483 cfg RocksDBConfig 484 rdb *C.DBEngine 485 cache RocksDBCache // Shared cache. 486 // auxDir is used for storing auxiliary files. Ideally it is a subdirectory of Dir. 
487 auxDir string 488 489 commit struct { 490 syncutil.Mutex 491 cond sync.Cond 492 committing bool 493 groupSize int 494 pending []*rocksDBBatch 495 } 496 497 syncer struct { 498 syncutil.Mutex 499 cond sync.Cond 500 closed bool 501 pending []*rocksDBBatch 502 } 503 504 iters struct { 505 syncutil.Mutex 506 m map[*rocksDBIterator][]byte 507 } 508 } 509 510 var _ Engine = &RocksDB{} 511 512 // SetRocksDBOpenHook sets the DBOpenHook function that will be called during 513 // RocksDB initialization. It is intended to be called by CCL code. 514 func SetRocksDBOpenHook(fn unsafe.Pointer) { 515 C.DBSetOpenHook(fn) 516 } 517 518 // NewRocksDB allocates and returns a new RocksDB object. 519 // This creates options and opens the database. If the database 520 // doesn't yet exist at the specified directory, one is initialized 521 // from scratch. 522 // The caller must call the engine's Close method when the engine is no longer 523 // needed. 524 func NewRocksDB(cfg RocksDBConfig, cache RocksDBCache) (*RocksDB, error) { 525 if cfg.Dir == "" { 526 return nil, errors.New("dir must be non-empty") 527 } 528 529 r := &RocksDB{ 530 cfg: cfg, 531 cache: cache.ref(), 532 } 533 534 if err := r.setAuxiliaryDir(filepath.Join(cfg.Dir, base.AuxiliaryDir)); err != nil { 535 return nil, err 536 } 537 538 if err := r.open(); err != nil { 539 return nil, err 540 } 541 return r, nil 542 } 543 544 func newRocksDBInMem(attrs roachpb.Attributes, cacheSize int64) *RocksDB { 545 cache := NewRocksDBCache(cacheSize) 546 // The cache starts out with a refcount of one, and creating the engine 547 // from it adds another refcount, at which point we release one of them. 548 defer cache.Release() 549 550 // TODO(bdarnell): The hard-coded 512 MiB is wrong; see 551 // https://github.com/cockroachdb/cockroach/issues/16750 552 db, err := newMemRocksDB(attrs, cache, 512<<20 /* MaxSize: 512 MiB */) 553 if err != nil { 554 panic(err) 555 } 556 return db 557 } 558 559 func newMemRocksDB(attrs roachpb.Attributes, cache RocksDBCache, maxSize int64) (*RocksDB, error) { 560 r := &RocksDB{ 561 cfg: RocksDBConfig{ 562 StorageConfig: base.StorageConfig{ 563 Attrs: attrs, 564 MaxSize: maxSize, 565 }, 566 }, 567 // dir: empty dir == "mem" RocksDB instance. 568 cache: cache.ref(), 569 } 570 571 // TODO(peter): This is bizarre. We're creating on on-disk temporary 572 // directory for an in-memory filesystem. The reason this is done is because 573 // various users of the auxiliary directory use the os.* routines (which is 574 // invalid!). This needs to be cleaned up. 575 auxDir, err := ioutil.TempDir(os.TempDir(), "cockroach-auxiliary") 576 if err != nil { 577 return nil, err 578 } 579 if err := r.setAuxiliaryDir(auxDir); err != nil { 580 return nil, err 581 } 582 583 if err := r.open(); err != nil { 584 return nil, err 585 } 586 587 return r, nil 588 } 589 590 // String formatter. 591 func (r *RocksDB) String() string { 592 dir := r.cfg.Dir 593 if r.cfg.Dir == "" { 594 dir = "<in-mem>" 595 } 596 attrs := r.Attrs().String() 597 if attrs == "" { 598 attrs = "<no-attributes>" 599 } 600 return fmt.Sprintf("%s=%s", attrs, dir) 601 } 602 603 func (r *RocksDB) open() error { 604 var existingVersion, newVersion storageVersion 605 if len(r.cfg.Dir) != 0 { 606 log.Infof(context.TODO(), "opening rocksdb instance at %q", r.cfg.Dir) 607 608 // Check the version number. 
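// A version outside the supported range [versionMinimum, versionCurrent]
// fails the open with the "incompatible rocksdb data version" error below
// rather than attempting a migration.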
609 var err error 610 if existingVersion, err = getVersion(r.cfg.Dir); err != nil { 611 return err 612 } 613 if existingVersion < versionMinimum || existingVersion > versionCurrent { 614 // Instead of an error, we should call a migration if possible when 615 // one is needed immediately following the DBOpen call. 616 return fmt.Errorf("incompatible rocksdb data version, current:%d, on disk:%d, minimum:%d", 617 versionCurrent, existingVersion, versionMinimum) 618 } 619 620 newVersion = existingVersion 621 if newVersion == versionNoFile { 622 // We currently set the default store version one before the file registry 623 // to allow downgrades to older binaries as long as encryption is not in use. 624 // TODO(mberhault): once enough releases supporting versionFileRegistry have passed, we can upgrade 625 // to it without worry. 626 newVersion = versionBeta20160331 627 } 628 629 // Using the file registry forces the latest version. We can't downgrade! 630 if r.cfg.UseFileRegistry { 631 newVersion = versionCurrent 632 } 633 } else { 634 if log.V(2) { 635 log.Infof(context.TODO(), "opening in memory rocksdb instance") 636 } 637 638 // In memory dbs are always current. 639 existingVersion = versionCurrent 640 } 641 642 maxOpenFiles := uint64(RecommendedMaxOpenFiles) 643 if r.cfg.MaxOpenFiles != 0 { 644 maxOpenFiles = r.cfg.MaxOpenFiles 645 } 646 647 status := C.DBOpen(&r.rdb, goToCSlice([]byte(r.cfg.Dir)), 648 C.DBOptions{ 649 cache: r.cache.cache, 650 num_cpu: C.int(rocksdbConcurrency), 651 max_open_files: C.int(maxOpenFiles), 652 use_file_registry: C.bool(newVersion == versionCurrent), 653 must_exist: C.bool(r.cfg.MustExist), 654 read_only: C.bool(r.cfg.ReadOnly), 655 rocksdb_options: goToCSlice([]byte(r.cfg.RocksDBOptions)), 656 extra_options: goToCSlice(r.cfg.ExtraOptions), 657 }) 658 if err := statusToError(status); err != nil { 659 return errors.Wrap(err, "could not open rocksdb instance") 660 } 661 662 // Update or add the version file if needed and if on-disk. 663 if len(r.cfg.Dir) != 0 && existingVersion < newVersion { 664 if err := writeVersionFile(r.cfg.Dir, newVersion); err != nil { 665 return err 666 } 667 } 668 669 r.commit.cond.L = &r.commit.Mutex 670 r.syncer.cond.L = &r.syncer.Mutex 671 r.iters.m = make(map[*rocksDBIterator][]byte) 672 673 // NB: The sync goroutine acts as a check that the RocksDB instance was 674 // properly closed as the goroutine will leak otherwise. 675 go r.syncLoop() 676 return nil 677 } 678 679 func (r *RocksDB) syncLoop() { 680 s := &r.syncer 681 s.Lock() 682 683 var lastSync time.Time 684 var err error 685 686 for { 687 for len(s.pending) == 0 && !s.closed { 688 s.cond.Wait() 689 } 690 if s.closed { 691 s.Unlock() 692 return 693 } 694 695 var min time.Duration 696 if r.cfg.Settings != nil { 697 min = minWALSyncInterval.Get(&r.cfg.Settings.SV) 698 } 699 if delta := timeutil.Since(lastSync); delta < min { 700 s.Unlock() 701 time.Sleep(min - delta) 702 s.Lock() 703 } 704 705 pending := s.pending 706 s.pending = nil 707 708 s.Unlock() 709 710 // Linux only guarantees we'll be notified of a writeback error once 711 // during a sync call. After sync fails once, we cannot rely on any 712 // future data written to WAL being crash-recoverable. That's because 713 // any future writes will be appended after a potential corruption in 714 // the WAL, and RocksDB's recovery terminates upon encountering any 715 // corruption. So, we must not call `DBSyncWAL` again after it has 716 // failed once. 
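// The err == nil guard below enforces that policy: after the first DBSyncWAL
// failure the sticky error is handed to every subsequent pending batch
// without attempting another sync. In-memory engines (empty Dir) skip WAL
// syncing entirely.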
717 if r.cfg.Dir != "" && err == nil { 718 err = statusToError(C.DBSyncWAL(r.rdb)) 719 lastSync = timeutil.Now() 720 } 721 722 for _, b := range pending { 723 b.commitErr = err 724 b.commitWG.Done() 725 } 726 727 s.Lock() 728 } 729 } 730 731 // Close closes the database by deallocating the underlying handle. 732 func (r *RocksDB) Close() { 733 if r.rdb == nil { 734 log.Errorf(context.TODO(), "closing unopened rocksdb instance") 735 return 736 } 737 if len(r.cfg.Dir) == 0 { 738 if log.V(1) { 739 log.Infof(context.TODO(), "closing in-memory rocksdb instance") 740 } 741 // Remove the temporary directory when the engine is in-memory. 742 if err := os.RemoveAll(r.auxDir); err != nil { 743 log.Warningf(context.TODO(), "%v", err) 744 } 745 } else { 746 log.Infof(context.TODO(), "closing rocksdb instance at %q", r.cfg.Dir) 747 } 748 if r.rdb != nil { 749 if err := statusToError(C.DBClose(r.rdb)); err != nil { 750 if debugIteratorLeak { 751 r.iters.Lock() 752 for _, stack := range r.iters.m { 753 fmt.Printf("%s\n", stack) 754 } 755 r.iters.Unlock() 756 } 757 panic(err) 758 } 759 r.rdb = nil 760 } 761 r.cache.Release() 762 r.syncer.Lock() 763 r.syncer.closed = true 764 r.syncer.cond.Signal() 765 r.syncer.Unlock() 766 } 767 768 // CreateCheckpoint creates a RocksDB checkpoint in the given directory (which 769 // must not exist). This directory should be located on the same file system, or 770 // copies of all data are used instead of hard links, which is very expensive. 771 func (r *RocksDB) CreateCheckpoint(dir string) error { 772 status := C.DBCreateCheckpoint(r.rdb, goToCSlice([]byte(dir))) 773 return errors.Wrap(statusToError(status), "unable to take RocksDB checkpoint") 774 } 775 776 // Closed returns true if the engine is closed. 777 func (r *RocksDB) Closed() bool { 778 return r.rdb == nil 779 } 780 781 // ExportToSst is part of the engine.Reader interface. 782 func (r *RocksDB) ExportToSst( 783 startKey, endKey roachpb.Key, 784 startTS, endTS hlc.Timestamp, 785 exportAllRevisions bool, 786 targetSize, maxSize uint64, 787 io IterOptions, 788 ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) { 789 start := MVCCKey{Key: startKey, Timestamp: startTS} 790 end := MVCCKey{Key: endKey, Timestamp: endTS} 791 792 var data C.DBString 793 var intentErr C.DBString 794 var bulkopSummary C.DBString 795 var resumeKey C.DBString 796 797 err := statusToError(C.DBExportToSst(goToCKey(start), goToCKey(end), 798 C.bool(exportAllRevisions), 799 C.uint64_t(targetSize), C.uint64_t(maxSize), 800 goToCIterOptions(io), r.rdb, &data, &intentErr, &bulkopSummary, &resumeKey)) 801 802 if err != nil { 803 if err.Error() == "WriteIntentError" { 804 var e roachpb.WriteIntentError 805 if err := protoutil.Unmarshal(cStringToGoBytes(intentErr), &e); err != nil { 806 return nil, roachpb.BulkOpSummary{}, nil, errors.Wrap(err, "failed to decode write intent error") 807 } 808 809 return nil, roachpb.BulkOpSummary{}, nil, &e 810 } 811 return nil, roachpb.BulkOpSummary{}, nil, err 812 } 813 814 var summary roachpb.BulkOpSummary 815 if err := protoutil.Unmarshal(cStringToGoBytes(bulkopSummary), &summary); err != nil { 816 return nil, roachpb.BulkOpSummary{}, nil, errors.Wrap(err, "failed to decode BulkopSummary") 817 } 818 819 return cStringToGoBytes(data), summary, roachpb.Key(cStringToGoBytes(resumeKey)), nil 820 } 821 822 // Attrs returns the list of attributes describing this engine. This 823 // may include a specification of disk type (e.g. hdd, ssd, fio, etc.) 
824 // and potentially other labels to identify important attributes of 825 // the engine. 826 func (r *RocksDB) Attrs() roachpb.Attributes { 827 return r.cfg.Attrs 828 } 829 830 // Put sets the given key to the value provided. 831 // 832 // It is safe to modify the contents of the arguments after Put returns. 833 func (r *RocksDB) Put(key MVCCKey, value []byte) error { 834 return dbPut(r.rdb, key, value) 835 } 836 837 // Merge implements the RocksDB merge operator using the function goMergeInit 838 // to initialize missing values and goMerge to merge the old and the given 839 // value into a new value, which is then stored under key. 840 // Currently 64-bit counter logic is implemented. See the documentation of 841 // goMerge and goMergeInit for details. 842 // 843 // It is safe to modify the contents of the arguments after Merge returns. 844 func (r *RocksDB) Merge(key MVCCKey, value []byte) error { 845 return dbMerge(r.rdb, key, value) 846 } 847 848 // LogData is part of the Writer interface. 849 // 850 // It is safe to modify the contents of the arguments after LogData returns. 851 func (r *RocksDB) LogData(data []byte) error { 852 panic("unimplemented") 853 } 854 855 // LogLogicalOp is part of the Writer interface. 856 func (r *RocksDB) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { 857 // No-op. Logical logging disabled. 858 } 859 860 // ApplyBatchRepr atomically applies a set of batched updates. Created by 861 // calling Repr() on a batch. Using this method is equivalent to constructing 862 // and committing a batch whose Repr() equals repr. 863 // 864 // It is safe to modify the contents of the arguments after ApplyBatchRepr 865 // returns. 866 func (r *RocksDB) ApplyBatchRepr(repr []byte, sync bool) error { 867 return dbApplyBatchRepr(r.rdb, repr, sync) 868 } 869 870 // Get returns the value for the given key. 871 func (r *RocksDB) Get(key MVCCKey) ([]byte, error) { 872 return dbGet(r.rdb, key) 873 } 874 875 // GetProto fetches the value at the specified key and unmarshals it. 876 func (r *RocksDB) GetProto( 877 key MVCCKey, msg protoutil.Message, 878 ) (ok bool, keyBytes, valBytes int64, err error) { 879 return dbGetProto(r.rdb, key, msg) 880 } 881 882 // Clear removes the item from the db with the given key. 883 // 884 // It is safe to modify the contents of the arguments after Clear returns. 885 func (r *RocksDB) Clear(key MVCCKey) error { 886 return dbClear(r.rdb, key) 887 } 888 889 // SingleClear removes the most recent item from the db with the given key. 890 // 891 // It is safe to modify the contents of the arguments after SingleClear returns. 892 func (r *RocksDB) SingleClear(key MVCCKey) error { 893 return dbSingleClear(r.rdb, key) 894 } 895 896 // ClearRange removes a set of entries, from start (inclusive) to end 897 // (exclusive). 898 // 899 // It is safe to modify the contents of the arguments after ClearRange returns. 900 func (r *RocksDB) ClearRange(start, end MVCCKey) error { 901 return dbClearRange(r.rdb, start, end) 902 } 903 904 // ClearIterRange removes a set of entries, from start (inclusive) to end 905 // (exclusive). 906 // 907 // It is safe to modify the contents of the arguments after ClearIterRange 908 // returns. 909 func (r *RocksDB) ClearIterRange(iter Iterator, start, end roachpb.Key) error { 910 return dbClearIterRange(r.rdb, iter, start, end) 911 } 912 913 // Iterate iterates from start to end keys, invoking f on each 914 // key/value pair. See engine.Iterate for details. 
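//
// A minimal usage sketch (the bounds are illustrative; per engine.Iterate,
// returning true from f stops the iteration early):
//
//	var count int
//	err := r.Iterate(roachpb.KeyMin, roachpb.KeyMax, func(kv MVCCKeyValue) (bool, error) {
//		count++
//		return false, nil // keep iterating
//	})
//	// use count, handle err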
915 func (r *RocksDB) Iterate(start, end roachpb.Key, f func(MVCCKeyValue) (bool, error)) error { 916 return iterateOnReader(r, start, end, f) 917 } 918 919 // Capacity queries the underlying file system for disk capacity information. 920 func (r *RocksDB) Capacity() (roachpb.StoreCapacity, error) { 921 return computeCapacity(r.cfg.Dir, r.cfg.MaxSize) 922 } 923 924 // Compact forces compaction over the entire database. 925 func (r *RocksDB) Compact() error { 926 return statusToError(C.DBCompact(r.rdb)) 927 } 928 929 // CompactRange forces compaction over a specified range of keys in the database. 930 func (r *RocksDB) CompactRange(start, end roachpb.Key, forceBottommost bool) error { 931 return statusToError(C.DBCompactRange(r.rdb, goToCSlice(start), goToCSlice(end), C.bool(forceBottommost))) 932 } 933 934 // disableAutoCompaction disables automatic compactions. For testing use only. 935 func (r *RocksDB) disableAutoCompaction() error { 936 return statusToError(C.DBDisableAutoCompaction(r.rdb)) 937 } 938 939 // ApproximateDiskBytes returns the approximate on-disk size of the specified key range. 940 func (r *RocksDB) ApproximateDiskBytes(from, to roachpb.Key) (uint64, error) { 941 start := MVCCKey{Key: from} 942 end := MVCCKey{Key: to} 943 var result C.uint64_t 944 err := statusToError(C.DBApproximateDiskBytes(r.rdb, goToCKey(start), goToCKey(end), &result)) 945 return uint64(result), err 946 } 947 948 // Flush causes RocksDB to write all in-memory data to disk immediately. 949 func (r *RocksDB) Flush() error { 950 return statusToError(C.DBFlush(r.rdb)) 951 } 952 953 // NewIterator returns an iterator over this rocksdb engine. 954 func (r *RocksDB) NewIterator(opts IterOptions) Iterator { 955 return newRocksDBIterator(r.rdb, opts, r, r) 956 } 957 958 // NewSnapshot creates a snapshot handle from engine and returns a 959 // read-only rocksDBSnapshot engine. 960 func (r *RocksDB) NewSnapshot() Reader { 961 if r.rdb == nil { 962 panic("RocksDB is not initialized yet") 963 } 964 return &rocksDBSnapshot{ 965 parent: r, 966 handle: C.DBNewSnapshot(r.rdb), 967 } 968 } 969 970 // Type implements the Engine interface. 971 func (r *RocksDB) Type() enginepb.EngineType { 972 return enginepb.EngineTypeRocksDB 973 } 974 975 // NewReadOnly returns a new ReadWriter wrapping this rocksdb engine. 976 func (r *RocksDB) NewReadOnly() ReadWriter { 977 return &rocksDBReadOnly{ 978 parent: r, 979 isClosed: false, 980 } 981 } 982 983 type rocksDBReadOnly struct { 984 parent *RocksDB 985 prefixIter reusableIterator 986 normalIter reusableIterator 987 isClosed bool 988 } 989 990 func (r *rocksDBReadOnly) Close() { 991 if r.isClosed { 992 panic("closing an already-closed rocksDBReadOnly") 993 } 994 r.isClosed = true 995 if i := &r.prefixIter.rocksDBIterator; i.iter != nil { 996 i.destroy() 997 } 998 if i := &r.normalIter.rocksDBIterator; i.iter != nil { 999 i.destroy() 1000 } 1001 } 1002 1003 // Read-only batches are not committed 1004 func (r *rocksDBReadOnly) Closed() bool { 1005 return r.isClosed 1006 } 1007 1008 // ExportToSst is part of the engine.Reader interface. 
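//
// A hypothetical usage sketch for the read-only wrapper above (eng stands in
// for a *RocksDB; the key is illustrative):
//
//	ro := eng.NewReadOnly()
//	defer ro.Close()
//	val, err := ro.Get(MVCCKey{Key: roachpb.Key("a")})
//	// use val, handle err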
1009 func (r *rocksDBReadOnly) ExportToSst( 1010 startKey, endKey roachpb.Key, 1011 startTS, endTS hlc.Timestamp, 1012 exportAllRevisions bool, 1013 targetSize, maxSize uint64, 1014 io IterOptions, 1015 ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) { 1016 return r.parent.ExportToSst(startKey, endKey, startTS, endTS, exportAllRevisions, targetSize, maxSize, io) 1017 } 1018 1019 func (r *rocksDBReadOnly) Get(key MVCCKey) ([]byte, error) { 1020 if r.isClosed { 1021 panic("using a closed rocksDBReadOnly") 1022 } 1023 return dbGet(r.parent.rdb, key) 1024 } 1025 1026 func (r *rocksDBReadOnly) GetProto( 1027 key MVCCKey, msg protoutil.Message, 1028 ) (ok bool, keyBytes, valBytes int64, err error) { 1029 if r.isClosed { 1030 panic("using a closed rocksDBReadOnly") 1031 } 1032 return dbGetProto(r.parent.rdb, key, msg) 1033 } 1034 1035 func (r *rocksDBReadOnly) Iterate( 1036 start, end roachpb.Key, f func(MVCCKeyValue) (bool, error), 1037 ) error { 1038 if r.isClosed { 1039 panic("using a closed rocksDBReadOnly") 1040 } 1041 return iterateOnReader(r, start, end, f) 1042 } 1043 1044 // NewIterator returns an iterator over the underlying engine. Note 1045 // that the returned iterator is cached and re-used for the lifetime of the 1046 // rocksDBReadOnly. A panic will be thrown if multiple prefix or normal (non-prefix) 1047 // iterators are used simultaneously on the same rocksDBReadOnly. 1048 func (r *rocksDBReadOnly) NewIterator(opts IterOptions) Iterator { 1049 if r.isClosed { 1050 panic("using a closed rocksDBReadOnly") 1051 } 1052 if opts.MinTimestampHint != (hlc.Timestamp{}) { 1053 // Iterators that specify timestamp bounds cannot be cached. 1054 return newRocksDBIterator(r.parent.rdb, opts, r, r.parent) 1055 } 1056 iter := &r.normalIter 1057 if opts.Prefix { 1058 iter = &r.prefixIter 1059 } 1060 if iter.rocksDBIterator.iter == nil { 1061 iter.rocksDBIterator.init(r.parent.rdb, opts, r, r.parent) 1062 } else { 1063 iter.rocksDBIterator.setOptions(opts) 1064 } 1065 if iter.inuse { 1066 panic("iterator already in use") 1067 } 1068 iter.inuse = true 1069 return iter 1070 } 1071 1072 // Writer methods are not implemented for rocksDBReadOnly. Ideally, the code 1073 // could be refactored so that a Reader could be supplied to evaluateBatch 1074 1075 // Writer is the write interface to an engine's data. 1076 func (r *rocksDBReadOnly) ApplyBatchRepr(repr []byte, sync bool) error { 1077 panic("not implemented") 1078 } 1079 1080 func (r *rocksDBReadOnly) Clear(key MVCCKey) error { 1081 panic("not implemented") 1082 } 1083 1084 func (r *rocksDBReadOnly) SingleClear(key MVCCKey) error { 1085 panic("not implemented") 1086 } 1087 1088 func (r *rocksDBReadOnly) ClearRange(start, end MVCCKey) error { 1089 panic("not implemented") 1090 } 1091 1092 func (r *rocksDBReadOnly) ClearIterRange(iter Iterator, start, end roachpb.Key) error { 1093 panic("not implemented") 1094 } 1095 1096 func (r *rocksDBReadOnly) Merge(key MVCCKey, value []byte) error { 1097 panic("not implemented") 1098 } 1099 1100 func (r *rocksDBReadOnly) Put(key MVCCKey, value []byte) error { 1101 panic("not implemented") 1102 } 1103 1104 func (r *rocksDBReadOnly) LogData(data []byte) error { 1105 panic("not implemented") 1106 } 1107 1108 func (r *rocksDBReadOnly) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { 1109 panic("not implemented") 1110 } 1111 1112 // NewBatch returns a new batch wrapping this rocksdb engine. 
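//
// A hypothetical write-then-commit sketch (the key and value are
// illustrative):
//
//	b := r.NewBatch()
//	defer b.Close()
//	if err := b.Put(MVCCKey{Key: roachpb.Key("a")}, []byte("v")); err != nil {
//		return err
//	}
//	if err := b.Commit(true /* syncCommit */); err != nil {
//		return err
//	}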
1113 func (r *RocksDB) NewBatch() Batch { 1114 b := newRocksDBBatch(r, false /* writeOnly */) 1115 return b 1116 } 1117 1118 // NewWriteOnlyBatch returns a new write-only batch wrapping this rocksdb 1119 // engine. 1120 func (r *RocksDB) NewWriteOnlyBatch() Batch { 1121 return newRocksDBBatch(r, true /* writeOnly */) 1122 } 1123 1124 // GetSSTables retrieves metadata about this engine's live sstables. 1125 func (r *RocksDB) GetSSTables() SSTableInfos { 1126 var n C.int 1127 tables := C.DBGetSSTables(r.rdb, &n) 1128 // We can't index into tables because it is a pointer, not a slice. The 1129 // hackery below treats the pointer as an array and then constructs a slice 1130 // from it. 1131 1132 tableSize := unsafe.Sizeof(C.DBSSTable{}) 1133 tableVal := func(i int) C.DBSSTable { 1134 return *(*C.DBSSTable)(unsafe.Pointer(uintptr(unsafe.Pointer(tables)) + uintptr(i)*tableSize)) 1135 } 1136 1137 res := make(SSTableInfos, n) 1138 for i := range res { 1139 r := &res[i] 1140 tv := tableVal(i) 1141 r.Level = int(tv.level) 1142 r.Size = int64(tv.size) 1143 r.Start = cToGoKey(tv.start_key) 1144 r.End = cToGoKey(tv.end_key) 1145 if ptr := tv.start_key.key.data; ptr != nil { 1146 C.free(unsafe.Pointer(ptr)) 1147 } 1148 if ptr := tv.end_key.key.data; ptr != nil { 1149 C.free(unsafe.Pointer(ptr)) 1150 } 1151 } 1152 C.free(unsafe.Pointer(tables)) 1153 1154 sort.Sort(res) 1155 return res 1156 } 1157 1158 // WALFileInfo contains metadata about a single write-ahead log file. Note this 1159 // mirrors the C.DBWALFile struct. 1160 type WALFileInfo struct { 1161 LogNumber int64 1162 Size int64 1163 } 1164 1165 // GetSortedWALFiles retrievews information about all of the write-ahead log 1166 // files in this engine in order from oldest to newest. 1167 func (r *RocksDB) GetSortedWALFiles() ([]WALFileInfo, error) { 1168 var n C.int 1169 var files *C.DBWALFile 1170 status := C.DBGetSortedWALFiles(r.rdb, &files, &n) 1171 if err := statusToError(status); err != nil { 1172 return nil, errors.Wrap(err, "could not get sorted WAL files") 1173 } 1174 defer C.free(unsafe.Pointer(files)) 1175 1176 // We can't index into files because it is a pointer, not a slice. The hackery 1177 // below treats the pointer as an array and then constructs a slice from it. 1178 1179 structSize := unsafe.Sizeof(C.DBWALFile{}) 1180 getWALFile := func(i int) *C.DBWALFile { 1181 return (*C.DBWALFile)(unsafe.Pointer(uintptr(unsafe.Pointer(files)) + uintptr(i)*structSize)) 1182 } 1183 1184 res := make([]WALFileInfo, n) 1185 for i := range res { 1186 wf := getWALFile(i) 1187 res[i].LogNumber = int64(wf.log_number) 1188 res[i].Size = int64(wf.size) 1189 } 1190 return res, nil 1191 } 1192 1193 // GetUserProperties fetches the user properties stored in each sstable's 1194 // metadata. 1195 func (r *RocksDB) GetUserProperties() (enginepb.SSTUserPropertiesCollection, error) { 1196 buf := cStringToGoBytes(C.DBGetUserProperties(r.rdb)) 1197 var ssts enginepb.SSTUserPropertiesCollection 1198 if err := protoutil.Unmarshal(buf, &ssts); err != nil { 1199 return enginepb.SSTUserPropertiesCollection{}, err 1200 } 1201 if ssts.Error != "" { 1202 return enginepb.SSTUserPropertiesCollection{}, errors.Newf("%s", ssts.Error) 1203 } 1204 return ssts, nil 1205 } 1206 1207 // GetStats retrieves stats from this engine's RocksDB instance and 1208 // returns it in a new instance of Stats. 
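//
// A hypothetical sketch deriving the block cache hit rate from the returned
// Stats (field names as populated below):
//
//	stats, err := r.GetStats()
//	if err == nil && stats.BlockCacheHits+stats.BlockCacheMisses > 0 {
//		hitRate := float64(stats.BlockCacheHits) /
//			float64(stats.BlockCacheHits+stats.BlockCacheMisses)
//		_ = hitRate
//	}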
1209 func (r *RocksDB) GetStats() (*Stats, error) { 1210 var s C.DBStatsResult 1211 if err := statusToError(C.DBGetStats(r.rdb, &s)); err != nil { 1212 return nil, err 1213 } 1214 return &Stats{ 1215 BlockCacheHits: int64(s.block_cache_hits), 1216 BlockCacheMisses: int64(s.block_cache_misses), 1217 BlockCacheUsage: int64(s.block_cache_usage), 1218 BlockCachePinnedUsage: int64(s.block_cache_pinned_usage), 1219 BloomFilterPrefixChecked: int64(s.bloom_filter_prefix_checked), 1220 BloomFilterPrefixUseful: int64(s.bloom_filter_prefix_useful), 1221 MemtableTotalSize: int64(s.memtable_total_size), 1222 Flushes: int64(s.flushes), 1223 FlushedBytes: int64(s.flush_bytes), 1224 Compactions: int64(s.compactions), 1225 IngestedBytes: 0, // Not exposed by RocksDB. 1226 CompactedBytesRead: int64(s.compact_read_bytes), 1227 CompactedBytesWritten: int64(s.compact_write_bytes), 1228 TableReadersMemEstimate: int64(s.table_readers_mem_estimate), 1229 PendingCompactionBytesEstimate: int64(s.pending_compaction_bytes_estimate), 1230 L0FileCount: int64(s.l0_file_count), 1231 }, nil 1232 } 1233 1234 // GetTickersAndHistograms retrieves maps of all RocksDB tickers and histograms. 1235 // It differs from `GetStats` by getting _every_ ticker and histogram, and by not 1236 // getting anything else (DB properties, for example). 1237 func (r *RocksDB) GetTickersAndHistograms() (*enginepb.TickersAndHistograms, error) { 1238 res := new(enginepb.TickersAndHistograms) 1239 var s C.DBTickersAndHistogramsResult 1240 if err := statusToError(C.DBGetTickersAndHistograms(r.rdb, &s)); err != nil { 1241 return nil, err 1242 } 1243 1244 tickers := (*[MaxArrayLen / C.sizeof_TickerInfo]C.TickerInfo)( 1245 unsafe.Pointer(s.tickers))[:s.tickers_len:s.tickers_len] 1246 res.Tickers = make(map[string]uint64) 1247 for _, ticker := range tickers { 1248 name := cStringToGoString(ticker.name) 1249 value := uint64(ticker.value) 1250 res.Tickers[name] = value 1251 } 1252 C.free(unsafe.Pointer(s.tickers)) 1253 1254 res.Histograms = make(map[string]enginepb.HistogramData) 1255 histograms := (*[MaxArrayLen / C.sizeof_HistogramInfo]C.HistogramInfo)( 1256 unsafe.Pointer(s.histograms))[:s.histograms_len:s.histograms_len] 1257 for _, histogram := range histograms { 1258 name := cStringToGoString(histogram.name) 1259 value := enginepb.HistogramData{ 1260 Mean: float64(histogram.mean), 1261 P50: float64(histogram.p50), 1262 P95: float64(histogram.p95), 1263 P99: float64(histogram.p99), 1264 Max: float64(histogram.max), 1265 Count: uint64(histogram.count), 1266 Sum: uint64(histogram.sum), 1267 } 1268 res.Histograms[name] = value 1269 } 1270 C.free(unsafe.Pointer(s.histograms)) 1271 return res, nil 1272 } 1273 1274 // GetCompactionStats returns the internal RocksDB compaction stats. See 1275 // https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide#rocksdb-statistics. 1276 func (r *RocksDB) GetCompactionStats() string { 1277 s := cStringToGoString(C.DBGetCompactionStats(r.rdb)) + 1278 "estimated_pending_compaction_bytes: " 1279 stats, err := r.GetStats() 1280 if err != nil { 1281 return s + err.Error() 1282 } 1283 return s + humanizeutil.IBytes(stats.PendingCompactionBytesEstimate) 1284 } 1285 1286 // GetEnvStats returns stats for the RocksDB env. This may include encryption stats. 
1287 func (r *RocksDB) GetEnvStats() (*EnvStats, error) { 1288 var s C.DBEnvStatsResult 1289 if err := statusToError(C.DBGetEnvStats(r.rdb, &s)); err != nil { 1290 return nil, err 1291 } 1292 1293 return &EnvStats{ 1294 TotalFiles: uint64(s.total_files), 1295 TotalBytes: uint64(s.total_bytes), 1296 ActiveKeyFiles: uint64(s.active_key_files), 1297 ActiveKeyBytes: uint64(s.active_key_bytes), 1298 EncryptionType: int32(s.encryption_type), 1299 EncryptionStatus: cStringToGoBytes(s.encryption_status), 1300 }, nil 1301 } 1302 1303 // GetEncryptionRegistries returns the file and key registries when encryption is enabled 1304 // on the store. 1305 func (r *RocksDB) GetEncryptionRegistries() (*EncryptionRegistries, error) { 1306 var s C.DBEncryptionRegistries 1307 if err := statusToError(C.DBGetEncryptionRegistries(r.rdb, &s)); err != nil { 1308 return nil, err 1309 } 1310 1311 return &EncryptionRegistries{ 1312 FileRegistry: cStringToGoBytes(s.file_registry), 1313 KeyRegistry: cStringToGoBytes(s.key_registry), 1314 }, nil 1315 } 1316 1317 type rocksDBSnapshot struct { 1318 parent *RocksDB 1319 handle *C.DBEngine 1320 } 1321 1322 // Close releases the snapshot handle. 1323 func (r *rocksDBSnapshot) Close() { 1324 C.DBClose(r.handle) 1325 r.handle = nil 1326 } 1327 1328 // Closed returns true if the engine is closed. 1329 func (r *rocksDBSnapshot) Closed() bool { 1330 return r.handle == nil 1331 } 1332 1333 // ExportToSst is part of the engine.Reader interface. 1334 func (r *rocksDBSnapshot) ExportToSst( 1335 startKey, endKey roachpb.Key, 1336 startTS, endTS hlc.Timestamp, 1337 exportAllRevisions bool, 1338 targetSize, maxSize uint64, 1339 io IterOptions, 1340 ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) { 1341 return r.parent.ExportToSst(startKey, endKey, startTS, endTS, exportAllRevisions, targetSize, maxSize, io) 1342 } 1343 1344 // Get returns the value for the given key, nil otherwise using 1345 // the snapshot handle. 1346 func (r *rocksDBSnapshot) Get(key MVCCKey) ([]byte, error) { 1347 return dbGet(r.handle, key) 1348 } 1349 1350 func (r *rocksDBSnapshot) GetProto( 1351 key MVCCKey, msg protoutil.Message, 1352 ) (ok bool, keyBytes, valBytes int64, err error) { 1353 return dbGetProto(r.handle, key, msg) 1354 } 1355 1356 // Iterate iterates over the keys between start inclusive and end 1357 // exclusive, invoking f() on each key/value pair using the snapshot 1358 // handle. 1359 func (r *rocksDBSnapshot) Iterate( 1360 start, end roachpb.Key, f func(MVCCKeyValue) (bool, error), 1361 ) error { 1362 return iterateOnReader(r, start, end, f) 1363 } 1364 1365 // NewIterator returns a new instance of an Iterator over the 1366 // engine using the snapshot handle. 1367 func (r *rocksDBSnapshot) NewIterator(opts IterOptions) Iterator { 1368 return newRocksDBIterator(r.handle, opts, r, r.parent) 1369 } 1370 1371 // reusableIterator wraps rocksDBIterator and allows reuse of an iterator 1372 // for the lifetime of a batch. 1373 type reusableIterator struct { 1374 rocksDBIterator 1375 inuse bool 1376 } 1377 1378 func (r *reusableIterator) Close() { 1379 // reusableIterator.Close() leaves the underlying rocksdb iterator open until 1380 // the associated batch is closed. 
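// Clearing inuse below only hands the cached iterator back to its owner
// (a rocksDBReadOnly or distinctBatch); the underlying RocksDB iterator is
// destroyed when the owner itself is closed.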
1381 if !r.inuse { 1382 panic("closing idle iterator") 1383 } 1384 r.inuse = false 1385 } 1386 1387 type distinctBatch struct { 1388 *rocksDBBatch 1389 prefixIter reusableIterator 1390 normalIter reusableIterator 1391 } 1392 1393 func (r *distinctBatch) Close() { 1394 if !r.distinctOpen { 1395 panic("distinct batch not open") 1396 } 1397 r.distinctOpen = false 1398 } 1399 1400 // NewIterator returns an iterator over the batch and underlying engine. Note 1401 // that the returned iterator is cached and re-used for the lifetime of the 1402 // batch. A panic will be thrown if multiple prefix or normal (non-prefix) 1403 // iterators are used simultaneously on the same batch. 1404 func (r *distinctBatch) NewIterator(opts IterOptions) Iterator { 1405 if opts.MinTimestampHint != (hlc.Timestamp{}) { 1406 // Iterators that specify timestamp bounds cannot be cached. 1407 if r.writeOnly { 1408 return newRocksDBIterator(r.parent.rdb, opts, r, r.parent) 1409 } 1410 r.ensureBatch() 1411 return newRocksDBIterator(r.batch, opts, r, r.parent) 1412 } 1413 1414 // Use the cached iterator, creating it on first access. 1415 iter := &r.normalIter 1416 if opts.Prefix { 1417 iter = &r.prefixIter 1418 } 1419 if iter.rocksDBIterator.iter == nil { 1420 if r.writeOnly { 1421 iter.rocksDBIterator.init(r.parent.rdb, opts, r, r.parent) 1422 } else { 1423 r.ensureBatch() 1424 iter.rocksDBIterator.init(r.batch, opts, r, r.parent) 1425 } 1426 } else { 1427 iter.rocksDBIterator.setOptions(opts) 1428 } 1429 if iter.inuse { 1430 panic("iterator already in use") 1431 } 1432 iter.inuse = true 1433 return iter 1434 } 1435 1436 func (r *distinctBatch) Get(key MVCCKey) ([]byte, error) { 1437 if r.writeOnly { 1438 return dbGet(r.parent.rdb, key) 1439 } 1440 r.ensureBatch() 1441 return dbGet(r.batch, key) 1442 } 1443 1444 func (r *distinctBatch) GetProto( 1445 key MVCCKey, msg protoutil.Message, 1446 ) (ok bool, keyBytes, valBytes int64, err error) { 1447 if r.writeOnly { 1448 return dbGetProto(r.parent.rdb, key, msg) 1449 } 1450 r.ensureBatch() 1451 return dbGetProto(r.batch, key, msg) 1452 } 1453 1454 func (r *distinctBatch) Iterate(start, end roachpb.Key, f func(MVCCKeyValue) (bool, error)) error { 1455 r.ensureBatch() 1456 return iterateOnReader(r, start, end, f) 1457 } 1458 1459 func (r *distinctBatch) Put(key MVCCKey, value []byte) error { 1460 r.builder.Put(key, value) 1461 return nil 1462 } 1463 1464 func (r *distinctBatch) Merge(key MVCCKey, value []byte) error { 1465 r.builder.Merge(key, value) 1466 return nil 1467 } 1468 1469 func (r *distinctBatch) LogData(data []byte) error { 1470 r.builder.LogData(data) 1471 return nil 1472 } 1473 1474 func (r *distinctBatch) Clear(key MVCCKey) error { 1475 r.builder.Clear(key) 1476 return nil 1477 } 1478 1479 func (r *distinctBatch) SingleClear(key MVCCKey) error { 1480 r.builder.SingleClear(key) 1481 return nil 1482 } 1483 1484 func (r *distinctBatch) ClearRange(start, end MVCCKey) error { 1485 if !r.writeOnly { 1486 panic("readable batch") 1487 } 1488 r.flushMutations() 1489 r.flushes++ // make sure that Repr() doesn't take a shortcut 1490 r.ensureBatch() 1491 return dbClearRange(r.batch, start, end) 1492 } 1493 1494 func (r *distinctBatch) ClearIterRange(iter Iterator, start, end roachpb.Key) error { 1495 r.flushMutations() 1496 r.flushes++ // make sure that Repr() doesn't take a shortcut 1497 r.ensureBatch() 1498 return dbClearIterRange(r.batch, iter, start, end) 1499 } 1500 1501 func (r *distinctBatch) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { 1502 // 
No-op. Logical logging disabled. 1503 } 1504 1505 func (r *distinctBatch) close() { 1506 if r.prefixIter.inuse { 1507 panic("iterator still inuse") 1508 } 1509 if r.normalIter.inuse { 1510 panic("iterator still inuse") 1511 } 1512 if i := &r.prefixIter.rocksDBIterator; i.iter != nil { 1513 i.destroy() 1514 } 1515 if i := &r.normalIter.rocksDBIterator; i.iter != nil { 1516 i.destroy() 1517 } 1518 } 1519 1520 // batchIterator wraps rocksDBIterator and ensures that the buffered mutations 1521 // in a batch are flushed before performing read operations. 1522 type batchIterator struct { 1523 iter rocksDBIterator 1524 batch *rocksDBBatch 1525 } 1526 1527 func (r *batchIterator) Stats() IteratorStats { 1528 return r.iter.Stats() 1529 } 1530 1531 func (r *batchIterator) Close() { 1532 if r.batch == nil { 1533 panic("closing idle iterator") 1534 } 1535 r.batch = nil 1536 r.iter.destroy() 1537 } 1538 1539 func (r *batchIterator) SeekGE(key MVCCKey) { 1540 r.batch.flushMutations() 1541 r.iter.SeekGE(key) 1542 } 1543 1544 func (r *batchIterator) SeekLT(key MVCCKey) { 1545 r.batch.flushMutations() 1546 r.iter.SeekLT(key) 1547 } 1548 1549 func (r *batchIterator) Valid() (bool, error) { 1550 return r.iter.Valid() 1551 } 1552 1553 func (r *batchIterator) Next() { 1554 r.batch.flushMutations() 1555 r.iter.Next() 1556 } 1557 1558 func (r *batchIterator) Prev() { 1559 r.batch.flushMutations() 1560 r.iter.Prev() 1561 } 1562 1563 func (r *batchIterator) NextKey() { 1564 r.batch.flushMutations() 1565 r.iter.NextKey() 1566 } 1567 1568 func (r *batchIterator) ComputeStats( 1569 start, end roachpb.Key, nowNanos int64, 1570 ) (enginepb.MVCCStats, error) { 1571 r.batch.flushMutations() 1572 return r.iter.ComputeStats(start, end, nowNanos) 1573 } 1574 1575 func (r *batchIterator) FindSplitKey( 1576 start, end, minSplitKey roachpb.Key, targetSize int64, 1577 ) (MVCCKey, error) { 1578 r.batch.flushMutations() 1579 return r.iter.FindSplitKey(start, end, minSplitKey, targetSize) 1580 } 1581 1582 func (r *batchIterator) MVCCOpsSpecialized() bool { 1583 return r.iter.MVCCOpsSpecialized() 1584 } 1585 1586 func (r *batchIterator) MVCCGet( 1587 key roachpb.Key, timestamp hlc.Timestamp, opts MVCCGetOptions, 1588 ) (*roachpb.Value, *roachpb.Intent, error) { 1589 r.batch.flushMutations() 1590 return r.iter.MVCCGet(key, timestamp, opts) 1591 } 1592 1593 func (r *batchIterator) MVCCScan( 1594 start, end roachpb.Key, timestamp hlc.Timestamp, opts MVCCScanOptions, 1595 ) (MVCCScanResult, error) { 1596 r.batch.flushMutations() 1597 return r.iter.MVCCScan(start, end, timestamp, opts) 1598 } 1599 1600 func (r *batchIterator) SetUpperBound(key roachpb.Key) { 1601 r.iter.SetUpperBound(key) 1602 } 1603 1604 func (r *batchIterator) Key() MVCCKey { 1605 return r.iter.Key() 1606 } 1607 1608 func (r *batchIterator) Value() []byte { 1609 return r.iter.Value() 1610 } 1611 1612 func (r *batchIterator) ValueProto(msg protoutil.Message) error { 1613 return r.iter.ValueProto(msg) 1614 } 1615 1616 func (r *batchIterator) UnsafeKey() MVCCKey { 1617 return r.iter.UnsafeKey() 1618 } 1619 1620 func (r *batchIterator) UnsafeValue() []byte { 1621 return r.iter.UnsafeValue() 1622 } 1623 1624 func (r *batchIterator) getIter() *C.DBIterator { 1625 return r.iter.iter 1626 } 1627 1628 func (r *batchIterator) CheckForKeyCollisions( 1629 sstData []byte, start, end roachpb.Key, 1630 ) (enginepb.MVCCStats, error) { 1631 return r.iter.CheckForKeyCollisions(sstData, start, end) 1632 } 1633 1634 // reusableBatchIterator wraps batchIterator and makes the Close method 
a no-op 1635 // to allow reuse of the iterator for the lifetime of the batch. The batch must 1636 // call iter.destroy() when it closes itself. 1637 type reusableBatchIterator struct { 1638 batchIterator 1639 } 1640 1641 func (r *reusableBatchIterator) Close() { 1642 // reusableBatchIterator.Close() leaves the underlying rocksdb iterator open 1643 // until the associated batch is closed. 1644 if r.batch == nil { 1645 panic("closing idle iterator") 1646 } 1647 r.batch = nil 1648 } 1649 1650 type rocksDBBatch struct { 1651 parent *RocksDB 1652 batch *C.DBEngine 1653 flushes int 1654 flushedCount int 1655 flushedSize int 1656 prefixIter reusableBatchIterator 1657 normalIter reusableBatchIterator 1658 builder RocksDBBatchBuilder 1659 distinct distinctBatch 1660 distinctOpen bool 1661 distinctNeedsFlush bool 1662 writeOnly bool 1663 syncCommit bool 1664 closed bool 1665 committed bool 1666 commitErr error 1667 commitWG sync.WaitGroup 1668 } 1669 1670 var batchPool = sync.Pool{ 1671 New: func() interface{} { 1672 return &rocksDBBatch{} 1673 }, 1674 } 1675 1676 func newRocksDBBatch(parent *RocksDB, writeOnly bool) *rocksDBBatch { 1677 // Get a new batch from the pool. Batches in the pool may have their closed 1678 // fields set to true to facilitate some sanity check assertions. Reset this 1679 // field and set others. 1680 r := batchPool.Get().(*rocksDBBatch) 1681 r.closed = false 1682 r.parent = parent 1683 r.writeOnly = writeOnly 1684 r.distinct.rocksDBBatch = r 1685 return r 1686 } 1687 1688 func (r *rocksDBBatch) ensureBatch() { 1689 if r.batch == nil { 1690 r.batch = C.DBNewBatch(r.parent.rdb, C.bool(r.writeOnly)) 1691 } 1692 } 1693 1694 func (r *rocksDBBatch) Close() { 1695 if r.closed { 1696 panic("this batch was already closed") 1697 } 1698 r.distinct.close() 1699 if r.prefixIter.batch != nil { 1700 panic("iterator still inuse") 1701 } 1702 if r.normalIter.batch != nil { 1703 panic("iterator still inuse") 1704 } 1705 if i := &r.prefixIter.iter; i.iter != nil { 1706 i.destroy() 1707 } 1708 if i := &r.normalIter.iter; i.iter != nil { 1709 i.destroy() 1710 } 1711 if r.batch != nil { 1712 C.DBClose(r.batch) 1713 r.batch = nil 1714 } 1715 r.builder.reset() 1716 r.closed = true 1717 1718 // Zero all the remaining fields individually. We can't just copy a new 1719 // struct onto r, since r.builder has a sync.NoCopy. 1720 r.batch = nil 1721 r.parent = nil 1722 r.flushes = 0 1723 r.flushedCount = 0 1724 r.flushedSize = 0 1725 r.prefixIter = reusableBatchIterator{} 1726 r.normalIter = reusableBatchIterator{} 1727 r.distinctOpen = false 1728 r.distinctNeedsFlush = false 1729 r.writeOnly = false 1730 r.syncCommit = false 1731 r.committed = false 1732 r.commitErr = nil 1733 r.commitWG = sync.WaitGroup{} 1734 1735 batchPool.Put(r) 1736 } 1737 1738 // Closed returns true if the engine is closed. 1739 func (r *rocksDBBatch) Closed() bool { 1740 return r.closed || r.committed 1741 } 1742 1743 // ExportToSst is part of the engine.Reader interface. 
1744 func (r *rocksDBBatch) ExportToSst( 1745 startKey, endKey roachpb.Key, 1746 startTS, endTS hlc.Timestamp, 1747 exportAllRevisions bool, 1748 targetSize, maxSize uint64, 1749 io IterOptions, 1750 ) ([]byte, roachpb.BulkOpSummary, roachpb.Key, error) { 1751 panic("unimplemented") 1752 } 1753 1754 func (r *rocksDBBatch) Put(key MVCCKey, value []byte) error { 1755 if r.distinctOpen { 1756 panic("distinct batch open") 1757 } 1758 r.distinctNeedsFlush = true 1759 r.builder.Put(key, value) 1760 return nil 1761 } 1762 1763 func (r *rocksDBBatch) Merge(key MVCCKey, value []byte) error { 1764 if r.distinctOpen { 1765 panic("distinct batch open") 1766 } 1767 r.distinctNeedsFlush = true 1768 r.builder.Merge(key, value) 1769 return nil 1770 } 1771 1772 func (r *rocksDBBatch) LogData(data []byte) error { 1773 if r.distinctOpen { 1774 panic("distinct batch open") 1775 } 1776 r.distinctNeedsFlush = true 1777 r.builder.LogData(data) 1778 return nil 1779 } 1780 1781 // ApplyBatchRepr atomically applies a set of batched updates to the current 1782 // batch (the receiver). 1783 func (r *rocksDBBatch) ApplyBatchRepr(repr []byte, sync bool) error { 1784 if r.distinctOpen { 1785 panic("distinct batch open") 1786 } 1787 r.distinctNeedsFlush = true 1788 return r.builder.ApplyRepr(repr) 1789 } 1790 1791 func (r *rocksDBBatch) Get(key MVCCKey) ([]byte, error) { 1792 if r.writeOnly { 1793 panic("write-only batch") 1794 } 1795 if r.distinctOpen { 1796 panic("distinct batch open") 1797 } 1798 r.flushMutations() 1799 r.ensureBatch() 1800 return dbGet(r.batch, key) 1801 } 1802 1803 func (r *rocksDBBatch) GetProto( 1804 key MVCCKey, msg protoutil.Message, 1805 ) (ok bool, keyBytes, valBytes int64, err error) { 1806 if r.writeOnly { 1807 panic("write-only batch") 1808 } 1809 if r.distinctOpen { 1810 panic("distinct batch open") 1811 } 1812 r.flushMutations() 1813 r.ensureBatch() 1814 return dbGetProto(r.batch, key, msg) 1815 } 1816 1817 func (r *rocksDBBatch) Iterate(start, end roachpb.Key, f func(MVCCKeyValue) (bool, error)) error { 1818 if r.writeOnly { 1819 panic("write-only batch") 1820 } 1821 if r.distinctOpen { 1822 panic("distinct batch open") 1823 } 1824 r.flushMutations() 1825 r.ensureBatch() 1826 return iterateOnReader(r, start, end, f) 1827 } 1828 1829 func (r *rocksDBBatch) Clear(key MVCCKey) error { 1830 if r.distinctOpen { 1831 panic("distinct batch open") 1832 } 1833 r.distinctNeedsFlush = true 1834 r.builder.Clear(key) 1835 return nil 1836 } 1837 1838 func (r *rocksDBBatch) SingleClear(key MVCCKey) error { 1839 if r.distinctOpen { 1840 panic("distinct batch open") 1841 } 1842 r.distinctNeedsFlush = true 1843 r.builder.SingleClear(key) 1844 return nil 1845 } 1846 1847 func (r *rocksDBBatch) ClearRange(start, end MVCCKey) error { 1848 if r.distinctOpen { 1849 panic("distinct batch open") 1850 } 1851 r.flushMutations() 1852 r.flushes++ // make sure that Repr() doesn't take a shortcut 1853 r.ensureBatch() 1854 return dbClearRange(r.batch, start, end) 1855 } 1856 1857 func (r *rocksDBBatch) ClearIterRange(iter Iterator, start, end roachpb.Key) error { 1858 if r.distinctOpen { 1859 panic("distinct batch open") 1860 } 1861 r.flushMutations() 1862 r.flushes++ // make sure that Repr() doesn't take a shortcut 1863 r.ensureBatch() 1864 return dbClearIterRange(r.batch, iter, start, end) 1865 } 1866 1867 func (r *rocksDBBatch) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { 1868 // No-op. Logical logging disabled. 
1869 }
1870
1871 // NewIterator returns an iterator over the batch and underlying engine. Note
1872 // that the returned iterator is cached and re-used for the lifetime of the
1873 // batch. A panic will be thrown if multiple prefix or normal (non-prefix)
1874 // iterators are used simultaneously on the same batch.
1875 func (r *rocksDBBatch) NewIterator(opts IterOptions) Iterator {
1876 if r.writeOnly {
1877 panic("write-only batch")
1878 }
1879 if r.distinctOpen {
1880 panic("distinct batch open")
1881 }
1882
1883 if opts.MinTimestampHint != (hlc.Timestamp{}) {
1884 // Iterators that specify timestamp bounds cannot be cached.
1885 r.ensureBatch()
1886 iter := &batchIterator{batch: r}
1887 iter.iter.init(r.batch, opts, r, r.parent)
1888 return iter
1889 }
1890
1891 // Use the cached iterator, creating it on first access.
1892 iter := &r.normalIter
1893 if opts.Prefix {
1894 iter = &r.prefixIter
1895 }
1896 if iter.iter.iter == nil {
1897 r.ensureBatch()
1898 iter.iter.init(r.batch, opts, r, r.parent)
1899 } else {
1900 iter.iter.setOptions(opts)
1901 }
1902 if iter.batch != nil {
1903 panic("iterator already in use")
1904 }
1905 iter.batch = r
1906 return iter
1907 }
1908
1909 const maxBatchGroupSize = 1 << 20 // 1 MiB
1910
1911 // makeBatchGroup adds the specified batch to the pending list of batches to
1912 // commit. Groups are delimited by a nil batch in the pending list. Group
1913 // leaders are the first batch in the pending list and the first batch after a
1914 // nil batch. The size of a group is limited by the maxSize parameter which is
1915 // measured as the number of bytes in the group's batches. The groupSize
1916 // parameter is the size of the current group being formed. Returns the new
1917 // list of pending batches, the new size of the current group and whether the
1918 // batch that was added is the leader of its group.
1919 func makeBatchGroup(
1920 pending []*rocksDBBatch, b *rocksDBBatch, groupSize, maxSize int,
1921 ) (_ []*rocksDBBatch, _ int, leader bool) {
1922 leader = len(pending) == 0
1923 if n := len(b.unsafeRepr()); leader {
1924 groupSize = n
1925 } else if groupSize+n > maxSize {
1926 leader = true
1927 groupSize = n
1928 pending = append(pending, nil)
1929 } else {
1930 groupSize += n
1931 }
1932 pending = append(pending, b)
1933 return pending, groupSize, leader
1934 }
1935
1936 // nextBatchGroup extracts the group of batches from the pending list. See
1937 // makeBatchGroup for an explanation of how groups are encoded into the pending
1938 // list. Returns the next group in the prefix return value, and the remaining
1939 // groups in the suffix return value (the next group is always a prefix of the
1940 // pending argument).
1941 func nextBatchGroup(pending []*rocksDBBatch) (prefix []*rocksDBBatch, suffix []*rocksDBBatch) {
1942 for i := 1; i < len(pending); i++ {
1943 if pending[i] == nil {
1944 return pending[:i], pending[i+1:]
1945 }
1946 }
1947 return pending, pending[len(pending):]
1948 }
1949
1950 func (r *rocksDBBatch) Commit(syncCommit bool) error {
1951 if r.Closed() {
1952 panic("this batch was already committed")
1953 }
1954 r.distinctOpen = false
1955
1956 if r.Empty() {
1957 // Nothing was written to this batch. Fast path.
1958 r.committed = true
1959 return nil
1960 }
1961
1962 // Combine multiple write-only batch commits into a single call to
1963 // RocksDB. RocksDB is supposed to be performing such batching internally,
1964 // but whether Cgo or something else, it isn't achieving the same degree of
1965 // batching.
Instrumentation shows that internally RocksDB almost never
1966 // batches commits together, while the batching below can often batch 20 or
1967 // 30 concurrent commits.
1968 c := &r.parent.commit
1969 r.commitWG.Add(1)
1970 r.syncCommit = syncCommit
1971
1972 // The leader for the commit is the first batch to be added to the pending
1973 // slice. Every batch has an associated wait group which is signaled when
1974 // the commit is complete.
1975 c.Lock()
1976
1977 var leader bool
1978 c.pending, c.groupSize, leader = makeBatchGroup(c.pending, r, c.groupSize, maxBatchGroupSize)
1979
1980 if leader {
1981 // We're the leader of our group. Wait for any running commit to finish and
1982 // for our batch to make it to the head of the pending queue.
1983 for c.committing || c.pending[0] != r {
1984 c.cond.Wait()
1985 }
1986
1987 var pending []*rocksDBBatch
1988 pending, c.pending = nextBatchGroup(c.pending)
1989 c.committing = true
1990 c.Unlock()
1991
1992 // We want the batch that is performing the commit to be write-only in
1993 // order to avoid the (significant) overhead of indexing the operations in
1994 // the other batches when they are applied.
1995 committer := r
1996 merge := pending[1:]
1997 if !r.writeOnly && len(merge) > 0 {
1998 committer = newRocksDBBatch(r.parent, true /* writeOnly */)
1999 defer committer.Close()
2000 merge = pending
2001 }
2002
2003 // Bundle all of the batches together.
2004 var err error
2005 for _, b := range merge {
2006 if err = committer.ApplyBatchRepr(b.unsafeRepr(), false /* sync */); err != nil {
2007 break
2008 }
2009 }
2010
2011 if err == nil {
2012 err = committer.commitInternal(false /* sync */)
2013 }
2014
2015 // We're done committing the batch, let the next group of batches
2016 // proceed.
2017 c.Lock()
2018 c.committing = false
2019 // NB: Multiple leaders can be waiting.
2020 c.cond.Broadcast()
2021 c.Unlock()
2022
2023 // Propagate the error to all of the batches involved in the commit. If a
2024 // batch requires syncing and the commit was successful, add it to the
2025 // syncing list. Note that we're reusing the pending list here for the
2026 // syncing list. We need to be careful to cap the capacity so that
2027 // extending this slice past the length of the pending list will result in
2028 // reallocation. Otherwise we have a race between appending to this list
2029 // while holding the sync lock below, and appending to the commit pending
2030 // list while holding the commit lock above.
2031 syncing := pending[:0:len(pending)]
2032 for _, b := range pending {
2033 if err != nil || !b.syncCommit {
2034 b.commitErr = err
2035 b.commitWG.Done()
2036 } else {
2037 syncing = append(syncing, b)
2038 }
2039 }
2040
2041 if len(syncing) > 0 {
2042 // The commit was successful and one or more of the batches requires
2043 // syncing: notify the sync goroutine.
2044 s := &r.parent.syncer
2045 s.Lock()
2046 if len(s.pending) == 0 {
2047 s.pending = syncing
2048 } else {
2049 s.pending = append(s.pending, syncing...)
2050 }
2051 s.cond.Signal()
2052 s.Unlock()
2053 }
2054 } else {
2055 c.Unlock()
2056 }
2057 // Wait for the commit/sync to finish.
2058 r.commitWG.Wait()
2059 return r.commitErr
2060 }
2061
2062 func (r *rocksDBBatch) commitInternal(sync bool) error {
2063 start := timeutil.Now()
2064 var count, size int
2065
2066 if r.flushes > 0 {
2067 // We've previously flushed mutations to the C++ batch, so we have to flush
2068 // any remaining mutations as well and then commit the batch.
2069 r.flushMutations() 2070 r.ensureBatch() 2071 if err := statusToError(C.DBCommitAndCloseBatch(r.batch, C.bool(sync))); err != nil { 2072 return err 2073 } 2074 r.batch = nil 2075 count, size = r.flushedCount, r.flushedSize 2076 } else if r.builder.Len() > 0 { 2077 count, size = int(r.builder.Count()), r.builder.Len() 2078 2079 // Fast-path which avoids flushing mutations to the C++ batch. Instead, we 2080 // directly apply the mutations to the database. 2081 if err := dbApplyBatchRepr(r.parent.rdb, r.builder.Finish(), sync); err != nil { 2082 return err 2083 } 2084 if r.batch != nil { 2085 C.DBClose(r.batch) 2086 r.batch = nil 2087 } 2088 } else { 2089 panic("commitInternal called on empty batch") 2090 } 2091 r.committed = true 2092 2093 warnLargeBatches := r.parent.cfg.WarnLargeBatchThreshold > 0 2094 if elapsed := timeutil.Since(start); warnLargeBatches && (elapsed >= r.parent.cfg.WarnLargeBatchThreshold) { 2095 log.Warningf(context.TODO(), "batch [%d/%d/%d] commit took %s (>= warning threshold %s)", 2096 count, size, r.flushes, elapsed, r.parent.cfg.WarnLargeBatchThreshold) 2097 } 2098 2099 return nil 2100 } 2101 2102 func (r *rocksDBBatch) Empty() bool { 2103 return r.flushes == 0 && r.builder.Count() == 0 && !r.builder.logData 2104 } 2105 2106 func (r *rocksDBBatch) Len() int { 2107 return len(r.unsafeRepr()) 2108 } 2109 2110 func (r *rocksDBBatch) unsafeRepr() []byte { 2111 if r.flushes == 0 { 2112 // We've never flushed to C++. Return the mutations only. 2113 return r.builder.getRepr() 2114 } 2115 r.flushMutations() 2116 return cSliceToUnsafeGoBytes(C.DBBatchRepr(r.batch)) 2117 } 2118 2119 func (r *rocksDBBatch) Repr() []byte { 2120 if r.flushes == 0 { 2121 // We've never flushed to C++. Return the mutations only. We make a copy 2122 // of the builder's byte slice so that the return []byte is valid even 2123 // if the builder is reset or finished. 2124 repr := r.builder.getRepr() 2125 cpy := make([]byte, len(repr)) 2126 copy(cpy, repr) 2127 return cpy 2128 } 2129 r.flushMutations() 2130 return cSliceToGoBytes(C.DBBatchRepr(r.batch)) 2131 } 2132 2133 func (r *rocksDBBatch) Distinct() ReadWriter { 2134 if r.distinctNeedsFlush { 2135 r.flushMutations() 2136 } 2137 if r.distinctOpen { 2138 panic("distinct batch already open") 2139 } 2140 r.distinctOpen = true 2141 return &r.distinct 2142 } 2143 2144 func (r *rocksDBBatch) flushMutations() { 2145 if r.builder.Count() == 0 { 2146 return 2147 } 2148 r.ensureBatch() 2149 r.distinctNeedsFlush = false 2150 r.flushes++ 2151 r.flushedCount += int(r.builder.Count()) 2152 r.flushedSize += r.builder.Len() 2153 if err := dbApplyBatchRepr(r.batch, r.builder.Finish(), false); err != nil { 2154 panic(err) 2155 } 2156 // Force a seek of the underlying iterator on the next Seek/ReverseSeek. 2157 r.prefixIter.iter.reseek = true 2158 r.normalIter.iter.reseek = true 2159 } 2160 2161 type dbIteratorGetter interface { 2162 getIter() *C.DBIterator 2163 } 2164 2165 type rocksDBIterator struct { 2166 parent *RocksDB 2167 reader Reader 2168 iter *C.DBIterator 2169 valid bool 2170 reseek bool 2171 prefix bool 2172 err error 2173 key C.DBKey 2174 value C.DBSlice 2175 } 2176 2177 // TODO(peter): Is this pool useful now that rocksDBBatch.NewIterator doesn't 2178 // allocate by returning internal pointers? 2179 var iterPool = sync.Pool{ 2180 New: func() interface{} { 2181 return &rocksDBIterator{} 2182 }, 2183 } 2184 2185 // newRocksDBIterator returns a new iterator over the supplied RocksDB 2186 // instance. 
2187 // The caller must call rocksDBIterator.Close() when finished with the
2188 // iterator to free up resources.
2189 func newRocksDBIterator(
2190 rdb *C.DBEngine, opts IterOptions, reader Reader, parent *RocksDB,
2191 ) MVCCIterator {
2192 // In order to prevent content displacement, caching is disabled
2193 // when performing scans. Any options set within the shared read
2194 // options field that should be carried over need to be set here
2195 // as well.
2196 r := iterPool.Get().(*rocksDBIterator)
2197 r.init(rdb, opts, reader, parent)
2198 return r
2199 }
2200
2201 func (r *rocksDBIterator) getIter() *C.DBIterator {
2202 return r.iter
2203 }
2204
2205 func (r *rocksDBIterator) init(rdb *C.DBEngine, opts IterOptions, reader Reader, parent *RocksDB) {
2206 r.parent = parent
2207 if debugIteratorLeak && r.parent != nil {
2208 r.parent.iters.Lock()
2209 r.parent.iters.m[r] = debug.Stack()
2210 r.parent.iters.Unlock()
2211 }
2212
2213 if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 {
2214 panic("iterator must set prefix or upper bound or lower bound")
2215 }
2216
2217 r.iter = C.DBNewIter(rdb, goToCIterOptions(opts))
2218 if r.iter == nil {
2219 panic("unable to create iterator")
2220 }
2221 r.reader = reader
2222 r.prefix = opts.Prefix
2223 }
2224
2225 func (r *rocksDBIterator) setOptions(opts IterOptions) {
2226 if opts.MinTimestampHint != (hlc.Timestamp{}) || opts.MaxTimestampHint != (hlc.Timestamp{}) {
2227 panic("iterator with timestamp hints cannot be reused")
2228 }
2229 if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 {
2230 panic("iterator must set prefix or upper bound or lower bound")
2231 }
2232 C.DBIterSetLowerBound(r.iter, goToCKey(MakeMVCCMetadataKey(opts.LowerBound)))
2233 C.DBIterSetUpperBound(r.iter, goToCKey(MakeMVCCMetadataKey(opts.UpperBound)))
2234 }
2235
2236 func (r *rocksDBIterator) checkEngineOpen() {
2237 if r.reader.Closed() {
2238 panic("iterator used after backing engine closed")
2239 }
2240 }
2241
2242 func (r *rocksDBIterator) destroy() {
2243 if debugIteratorLeak && r.parent != nil {
2244 r.parent.iters.Lock()
2245 delete(r.parent.iters.m, r)
2246 r.parent.iters.Unlock()
2247 }
2248 C.DBIterDestroy(r.iter)
2249 *r = rocksDBIterator{}
2250 }
2251
2252 // The following methods implement the Iterator interface.
2253
2254 func (r *rocksDBIterator) Stats() IteratorStats {
2255 stats := C.DBIterStats(r.iter)
2256 return IteratorStats{
2257 TimeBoundNumSSTs: int(stats.timebound_num_ssts),
2258 InternalDeleteSkippedCount: int(stats.internal_delete_skipped_count),
2259 }
2260 }
2261
2262 func (r *rocksDBIterator) Close() {
2263 r.destroy()
2264 iterPool.Put(r)
2265 }
2266
2267 func (r *rocksDBIterator) SeekGE(key MVCCKey) {
2268 r.checkEngineOpen()
2269 if len(key.Key) == 0 {
2270 // start=Key("") needs special treatment since we need
2271 // to access start[0] in an explicit seek.
2272 r.setState(C.DBIterSeekToFirst(r.iter))
2273 } else {
2274 // We can avoid seeking if we're already at the key we seek.
2275 if r.valid && !r.reseek && key.Equal(r.UnsafeKey()) {
2276 return
2277 }
2278 r.setState(C.DBIterSeek(r.iter, goToCKey(key)))
2279 }
2280 }
2281
2282 func (r *rocksDBIterator) SeekLT(key MVCCKey) {
2283 r.checkEngineOpen()
2284 if len(key.Key) == 0 {
2285 r.setState(C.DBIterSeekToLast(r.iter))
2286 } else {
2287 // SeekForPrev positions the iterator at the last key that is less
2288 // than or equal to key, so we may need to iterate backwards once.
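// For example, with keys a, b, and d present, SeekLT(b) first lands on b
// (SeekForPrev returns the last key <= b) and the Prev() below steps back to
// a, while SeekLT(c) lands on b directly and needs no extra step.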
2289 r.setState(C.DBIterSeekForPrev(r.iter, goToCKey(key))) 2290 if r.valid && key.Equal(r.UnsafeKey()) { 2291 r.Prev() 2292 } 2293 } 2294 } 2295 2296 func (r *rocksDBIterator) Valid() (bool, error) { 2297 return r.valid, r.err 2298 } 2299 2300 func (r *rocksDBIterator) Next() { 2301 r.checkEngineOpen() 2302 r.setState(C.DBIterNext(r.iter, C.bool(false) /* skip_current_key_versions */)) 2303 } 2304 2305 var errReversePrefixIteration = fmt.Errorf("unsupported reverse prefix iteration") 2306 2307 func (r *rocksDBIterator) Prev() { 2308 r.checkEngineOpen() 2309 if r.prefix { 2310 r.valid = false 2311 r.err = errReversePrefixIteration 2312 return 2313 } 2314 r.setState(C.DBIterPrev(r.iter, C.bool(false) /* skip_current_key_versions */)) 2315 } 2316 2317 func (r *rocksDBIterator) NextKey() { 2318 r.checkEngineOpen() 2319 r.setState(C.DBIterNext(r.iter, C.bool(true) /* skip_current_key_versions */)) 2320 } 2321 2322 func (r *rocksDBIterator) Key() MVCCKey { 2323 // The data returned by rocksdb_iter_{key,value} is not meant to be 2324 // freed by the client. It is a direct reference to the data managed 2325 // by the iterator, so it is copied instead of freed. 2326 return cToGoKey(r.key) 2327 } 2328 2329 func (r *rocksDBIterator) Value() []byte { 2330 return cSliceToGoBytes(r.value) 2331 } 2332 2333 func (r *rocksDBIterator) ValueProto(msg protoutil.Message) error { 2334 if r.value.len == 0 { 2335 return nil 2336 } 2337 return protoutil.Unmarshal(r.UnsafeValue(), msg) 2338 } 2339 2340 func (r *rocksDBIterator) UnsafeKey() MVCCKey { 2341 return cToUnsafeGoKey(r.key) 2342 } 2343 2344 func (r *rocksDBIterator) UnsafeValue() []byte { 2345 return cSliceToUnsafeGoBytes(r.value) 2346 } 2347 2348 func (r *rocksDBIterator) clearState() { 2349 r.valid = false 2350 r.reseek = true 2351 r.key = C.DBKey{} 2352 r.value = C.DBSlice{} 2353 r.err = nil 2354 } 2355 2356 func (r *rocksDBIterator) setState(state C.DBIterState) { 2357 r.valid = bool(state.valid) 2358 r.reseek = false 2359 r.key = state.key 2360 r.value = state.value 2361 r.err = statusToError(state.status) 2362 } 2363 2364 func (r *rocksDBIterator) ComputeStats( 2365 start, end roachpb.Key, nowNanos int64, 2366 ) (enginepb.MVCCStats, error) { 2367 r.clearState() 2368 result := C.MVCCComputeStats(r.iter, 2369 goToCKey(MakeMVCCMetadataKey(start)), 2370 goToCKey(MakeMVCCMetadataKey(end)), 2371 C.int64_t(nowNanos)) 2372 stats, err := cStatsToGoStats(result, nowNanos) 2373 if util.RaceEnabled { 2374 // If we've come here via batchIterator, then flushMutations (which forces 2375 // reseek) was called just before C.MVCCComputeStats. Set it here as well 2376 // to match. 2377 r.reseek = true 2378 // C.MVCCComputeStats and ComputeStatsGo must behave identically. 2379 // There are unit tests to ensure that they return the same result, but 2380 // as an additional check, use the race builds to check any edge cases 2381 // that the tests may miss. 
2382 verifyStats, verifyErr := ComputeStatsGo(r, start, end, nowNanos) 2383 if (err != nil) != (verifyErr != nil) { 2384 panic(fmt.Sprintf("C.MVCCComputeStats differed from ComputeStatsGo: err %v vs %v", err, verifyErr)) 2385 } 2386 if !stats.Equal(verifyStats) { 2387 panic(fmt.Sprintf("C.MVCCComputeStats differed from ComputeStatsGo: stats %+v vs %+v", stats, verifyStats)) 2388 } 2389 } 2390 return stats, err 2391 } 2392 2393 func (r *rocksDBIterator) FindSplitKey( 2394 start, end, minSplitKey roachpb.Key, targetSize int64, 2395 ) (MVCCKey, error) { 2396 var splitKey C.DBString 2397 r.clearState() 2398 status := C.MVCCFindSplitKey(r.iter, 2399 goToCKey(MakeMVCCMetadataKey(start)), 2400 goToCKey(MakeMVCCMetadataKey(minSplitKey)), 2401 C.int64_t(targetSize), &splitKey) 2402 if err := statusToError(status); err != nil { 2403 return MVCCKey{}, err 2404 } 2405 return MVCCKey{Key: cStringToGoBytes(splitKey)}, nil 2406 } 2407 2408 func (r *rocksDBIterator) MVCCOpsSpecialized() bool { 2409 // rocksDBIterator provides specialized implementations of MVCCGet and 2410 // MVCCScan. 2411 return true 2412 } 2413 2414 func (r *rocksDBIterator) MVCCGet( 2415 key roachpb.Key, timestamp hlc.Timestamp, opts MVCCGetOptions, 2416 ) (*roachpb.Value, *roachpb.Intent, error) { 2417 if opts.Inconsistent && opts.Txn != nil { 2418 return nil, nil, errors.Errorf("cannot allow inconsistent reads within a transaction") 2419 } 2420 if len(key) == 0 { 2421 return nil, nil, emptyKeyError() 2422 } 2423 2424 r.clearState() 2425 state := C.MVCCGet( 2426 r.iter, goToCSlice(key), goToCTimestamp(timestamp), goToCTxn(opts.Txn), 2427 C.bool(opts.Inconsistent), C.bool(opts.Tombstones), C.bool(opts.FailOnMoreRecent), 2428 ) 2429 2430 if err := statusToError(state.status); err != nil { 2431 return nil, nil, err 2432 } 2433 if err := writeTooOldToError(timestamp, state.write_too_old_timestamp); err != nil { 2434 return nil, nil, err 2435 } 2436 if err := uncertaintyToError(timestamp, state.uncertainty_timestamp, opts.Txn); err != nil { 2437 return nil, nil, err 2438 } 2439 2440 intents, err := buildScanIntents(cSliceToGoBytes(state.intents)) 2441 if err != nil { 2442 return nil, nil, err 2443 } 2444 if !opts.Inconsistent && len(intents) > 0 { 2445 return nil, nil, &roachpb.WriteIntentError{Intents: intents} 2446 } 2447 2448 var intent *roachpb.Intent 2449 if len(intents) > 1 { 2450 return nil, nil, errors.Errorf("expected 0 or 1 intents, got %d", len(intents)) 2451 } else if len(intents) == 1 { 2452 intent = &intents[0] 2453 } 2454 if state.data.len == 0 { 2455 return nil, intent, nil 2456 } 2457 2458 count := state.data.count 2459 if count > 1 { 2460 return nil, nil, errors.Errorf("expected 0 or 1 result, found %d", count) 2461 } 2462 if count == 0 { 2463 return nil, intent, nil 2464 } 2465 2466 // Extract the value from the batch data. 
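// The copied buffer holds a single key/value pair in the MVCCScan batch
// encoding; MVCCScanDecodeKeyValue below splits it back into the MVCC key
// (whose timestamp becomes the value's timestamp) and the raw value bytes.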
2467 repr := copyFromSliceVector(state.data.bufs, state.data.len)
2468 mvccKey, rawValue, _, err := MVCCScanDecodeKeyValue(repr)
2469 if err != nil {
2470 return nil, nil, err
2471 }
2472 value := &roachpb.Value{
2473 RawBytes: rawValue,
2474 Timestamp: mvccKey.Timestamp,
2475 }
2476 return value, intent, nil
2477 }
2478
2479 func (r *rocksDBIterator) MVCCScan(
2480 start, end roachpb.Key, timestamp hlc.Timestamp, opts MVCCScanOptions,
2481 ) (MVCCScanResult, error) {
2482 if opts.Inconsistent && opts.Txn != nil {
2483 return MVCCScanResult{}, errors.Errorf("cannot allow inconsistent reads within a transaction")
2484 }
2485 if len(end) == 0 {
2486 return MVCCScanResult{}, emptyKeyError()
2487 }
2488 if opts.MaxKeys < 0 {
2489 resumeSpan := &roachpb.Span{Key: start, EndKey: end}
2490 return MVCCScanResult{ResumeSpan: resumeSpan}, nil
2491 }
2492
2493 r.clearState()
2494 state := C.MVCCScan(
2495 r.iter, goToCSlice(start), goToCSlice(end), goToCTimestamp(timestamp),
2496 C.int64_t(opts.MaxKeys), C.int64_t(opts.TargetBytes),
2497 goToCTxn(opts.Txn), C.bool(opts.Inconsistent),
2498 C.bool(opts.Reverse), C.bool(opts.Tombstones),
2499 C.bool(opts.FailOnMoreRecent),
2500 )
2501
2502 if err := statusToError(state.status); err != nil {
2503 return MVCCScanResult{}, err
2504 }
2505 if err := writeTooOldToError(timestamp, state.write_too_old_timestamp); err != nil {
2506 return MVCCScanResult{}, err
2507 }
2508 if err := uncertaintyToError(timestamp, state.uncertainty_timestamp, opts.Txn); err != nil {
2509 return MVCCScanResult{}, err
2510 }
2511
2512 kvData := [][]byte{copyFromSliceVector(state.data.bufs, state.data.len)}
2513 numKVs := int64(state.data.count)
2514 numBytes := int64(state.data.bytes)
2515
2516 var resumeSpan *roachpb.Span
2517 if resumeKey := cSliceToGoBytes(state.resume_key); resumeKey != nil {
2518 if opts.Reverse {
2519 resumeSpan = &roachpb.Span{Key: start, EndKey: roachpb.Key(resumeKey).Next()}
2520 } else {
2521 resumeSpan = &roachpb.Span{Key: resumeKey, EndKey: end}
2522 }
2523 }
2524
2525 intents, err := buildScanIntents(cSliceToGoBytes(state.intents))
2526 if err != nil {
2527 return MVCCScanResult{}, err
2528 }
2529 if !opts.Inconsistent && len(intents) > 0 {
2530 return MVCCScanResult{}, &roachpb.WriteIntentError{Intents: intents}
2531 }
2532
2533 return MVCCScanResult{
2534 KVData: kvData,
2535 NumKeys: numKVs,
2536 NumBytes: numBytes,
2537 ResumeSpan: resumeSpan,
2538 Intents: intents,
2539 }, nil
2540 }
2541
2542 func (r *rocksDBIterator) SetUpperBound(key roachpb.Key) {
2543 C.DBIterSetUpperBound(r.iter, goToCKey(MakeMVCCMetadataKey(key)))
2544 }
2545
2546 // CheckForKeyCollisions indicates if the provided SST data collides with this
2547 // iterator in the specified range.
2548 func (r *rocksDBIterator) CheckForKeyCollisions(
2549 sstData []byte, start, end roachpb.Key,
2550 ) (enginepb.MVCCStats, error) {
2551 // Create a C++ iterator over the SST being added. This iterator is used to
2552 // perform a check for key collisions between the SST being ingested, and the
2553 // existing data. As the collision check is in C++ we are unable to use a
2554 // pure go iterator as in verifySSTable.
2555 sst := MakeRocksDBSstFileReader() 2556 defer sst.Close() 2557 emptyStats := enginepb.MVCCStats{} 2558 2559 if err := sst.IngestExternalFile(sstData); err != nil { 2560 return emptyStats, err 2561 } 2562 sstIterator := sst.NewIterator(IterOptions{UpperBound: end}).(*rocksDBIterator) 2563 defer sstIterator.Close() 2564 sstIterator.SeekGE(MakeMVCCMetadataKey(start)) 2565 if ok, err := sstIterator.Valid(); err != nil || !ok { 2566 return emptyStats, errors.Wrap(err, "checking for key collisions") 2567 } 2568 2569 var intentErr C.DBString 2570 var skippedKVStats C.MVCCStatsResult 2571 2572 state := C.DBCheckForKeyCollisions(r.iter, sstIterator.iter, &skippedKVStats, &intentErr) 2573 2574 err := statusToError(state.status) 2575 if err != nil { 2576 if err.Error() == "WriteIntentError" { 2577 var e roachpb.WriteIntentError 2578 if err := protoutil.Unmarshal(cStringToGoBytes(intentErr), &e); err != nil { 2579 return emptyStats, errors.Wrap(err, "failed to decode write intent error") 2580 } 2581 return emptyStats, &e 2582 } else if err.Error() == "InlineError" { 2583 return emptyStats, errors.Errorf("inline values are unsupported when checking for key collisions") 2584 } 2585 err = errors.Wrap(&Error{msg: cToGoKey(state.key).String()}, "ingested key collides with an existing one") 2586 return emptyStats, err 2587 } 2588 2589 skippedStats, err := cStatsToGoStats(skippedKVStats, 0) 2590 return skippedStats, err 2591 } 2592 2593 func copyFromSliceVector(bufs *C.DBSlice, len C.int32_t) []byte { 2594 if bufs == nil { 2595 return nil 2596 } 2597 2598 // Interpret the C pointer as a pointer to a Go array, then slice. 2599 slices := (*[1 << 20]C.DBSlice)(unsafe.Pointer(bufs))[:len:len] 2600 neededBytes := 0 2601 for i := range slices { 2602 neededBytes += int(slices[i].len) 2603 } 2604 data := nonZeroingMakeByteSlice(neededBytes)[:0] 2605 for i := range slices { 2606 data = append(data, cSliceToUnsafeGoBytes(slices[i])...) 2607 } 2608 return data 2609 } 2610 2611 func cStatsToGoStats(stats C.MVCCStatsResult, nowNanos int64) (enginepb.MVCCStats, error) { 2612 ms := enginepb.MVCCStats{} 2613 if err := statusToError(stats.status); err != nil { 2614 return ms, err 2615 } 2616 2617 ms.ContainsEstimates = 0 2618 ms.LiveBytes = int64(stats.live_bytes) 2619 ms.KeyBytes = int64(stats.key_bytes) 2620 ms.ValBytes = int64(stats.val_bytes) 2621 ms.IntentBytes = int64(stats.intent_bytes) 2622 ms.LiveCount = int64(stats.live_count) 2623 ms.KeyCount = int64(stats.key_count) 2624 ms.ValCount = int64(stats.val_count) 2625 ms.IntentCount = int64(stats.intent_count) 2626 ms.IntentAge = int64(stats.intent_age) 2627 ms.GCBytesAge = int64(stats.gc_bytes_age) 2628 ms.SysBytes = int64(stats.sys_bytes) 2629 ms.SysCount = int64(stats.sys_count) 2630 ms.LastUpdateNanos = nowNanos 2631 return ms, nil 2632 } 2633 2634 // goToCSlice converts a go byte slice to a DBSlice. Note that this is 2635 // potentially dangerous as the DBSlice holds a reference to the go 2636 // byte slice memory that the Go GC does not know about. This method 2637 // is only intended for use in converting arguments to C 2638 // functions. The C function must copy any data that it wishes to 2639 // retain once the function returns. 
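// For example, the key and value slices passed to C.DBPut via goToCSlice in
// dbPut below remain owned by Go; the call is safe only because DBPut copies
// the data (by way of MemTable::Add) before returning.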
2640 func goToCSlice(b []byte) C.DBSlice { 2641 if len(b) == 0 { 2642 return C.DBSlice{data: nil, len: 0} 2643 } 2644 return C.DBSlice{ 2645 data: (*C.char)(unsafe.Pointer(&b[0])), 2646 len: C.size_t(len(b)), 2647 } 2648 } 2649 2650 func goToCIgnoredSeqNums(b []enginepb.IgnoredSeqNumRange) C.DBIgnoredSeqNums { 2651 if len(b) == 0 { 2652 return C.DBIgnoredSeqNums{ranges: nil, len: 0} 2653 } 2654 return C.DBIgnoredSeqNums{ 2655 ranges: (*C.DBIgnoredSeqNumRange)(unsafe.Pointer(&b[0])), 2656 len: C.int(len(b)), 2657 } 2658 } 2659 2660 func goToCKey(key MVCCKey) C.DBKey { 2661 return C.DBKey{ 2662 key: goToCSlice(key.Key), 2663 wall_time: C.int64_t(key.Timestamp.WallTime), 2664 logical: C.int32_t(key.Timestamp.Logical), 2665 } 2666 } 2667 2668 func cToGoKey(key C.DBKey) MVCCKey { 2669 // When converting a C.DBKey to an MVCCKey, give the underlying slice an 2670 // extra byte of capacity in anticipation of roachpb.Key.Next() being 2671 // called. The extra byte is trivial extra space, but allows callers to avoid 2672 // an allocation and copy when calling roachpb.Key.Next(). Note that it is 2673 // important that the extra byte contain the value 0 in order for the 2674 // roachpb.Key.Next() fast-path to be invoked. This is true for the code 2675 // below because make() zero initializes all of the bytes. 2676 unsafeKey := cSliceToUnsafeGoBytes(key.key) 2677 safeKey := make([]byte, len(unsafeKey), len(unsafeKey)+1) 2678 copy(safeKey, unsafeKey) 2679 2680 return MVCCKey{ 2681 Key: safeKey, 2682 Timestamp: hlc.Timestamp{ 2683 WallTime: int64(key.wall_time), 2684 Logical: int32(key.logical), 2685 }, 2686 } 2687 } 2688 2689 func cToUnsafeGoKey(key C.DBKey) MVCCKey { 2690 return MVCCKey{ 2691 Key: cSliceToUnsafeGoBytes(key.key), 2692 Timestamp: hlc.Timestamp{ 2693 WallTime: int64(key.wall_time), 2694 Logical: int32(key.logical), 2695 }, 2696 } 2697 } 2698 2699 func cStringToGoString(s C.DBString) string { 2700 if s.data == nil { 2701 return "" 2702 } 2703 // Reinterpret the string as a slice, then cast to string which does a copy. 2704 result := string(cSliceToUnsafeGoBytes(C.DBSlice{s.data, s.len})) 2705 C.free(unsafe.Pointer(s.data)) 2706 return result 2707 } 2708 2709 func cStringToGoBytes(s C.DBString) []byte { 2710 if s.data == nil { 2711 return nil 2712 } 2713 result := gobytes(unsafe.Pointer(s.data), int(s.len)) 2714 C.free(unsafe.Pointer(s.data)) 2715 return result 2716 } 2717 2718 func cSliceToGoBytes(s C.DBSlice) []byte { 2719 if s.data == nil { 2720 return nil 2721 } 2722 return gobytes(unsafe.Pointer(s.data), int(s.len)) 2723 } 2724 2725 func cSliceToUnsafeGoBytes(s C.DBSlice) []byte { 2726 if s.data == nil { 2727 return nil 2728 } 2729 // Interpret the C pointer as a pointer to a Go array, then slice. 
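// The resulting slice aliases the C-owned memory, so it is only valid while
// that memory remains live; callers that need the bytes to outlive the
// DBSlice must copy them (as cSliceToGoBytes does via gobytes).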
2730 return (*[MaxArrayLen]byte)(unsafe.Pointer(s.data))[:s.len:s.len]
2731 }
2732
2733 func goToCTimestamp(ts hlc.Timestamp) C.DBTimestamp {
2734 return C.DBTimestamp{
2735 wall_time: C.int64_t(ts.WallTime),
2736 logical: C.int32_t(ts.Logical),
2737 }
2738 }
2739
2740 func cToGoTimestamp(ts C.DBTimestamp) hlc.Timestamp {
2741 return hlc.Timestamp{
2742 WallTime: int64(ts.wall_time),
2743 Logical: int32(ts.logical),
2744 }
2745 }
2746
2747 func goToCTxn(txn *roachpb.Transaction) C.DBTxn {
2748 var r C.DBTxn
2749 if txn != nil {
2750 r.id = goToCSlice(txn.ID.GetBytesMut())
2751 r.epoch = C.uint32_t(txn.Epoch)
2752 r.sequence = C.int32_t(txn.Sequence)
2753 r.max_timestamp = goToCTimestamp(txn.MaxTimestamp)
2754 r.ignored_seqnums = goToCIgnoredSeqNums(txn.IgnoredSeqNums)
2755 }
2756 return r
2757 }
2758
2759 func goToCIterOptions(opts IterOptions) C.DBIterOptions {
2760 return C.DBIterOptions{
2761 prefix: C.bool(opts.Prefix),
2762 lower_bound: goToCKey(MakeMVCCMetadataKey(opts.LowerBound)),
2763 upper_bound: goToCKey(MakeMVCCMetadataKey(opts.UpperBound)),
2764 min_timestamp_hint: goToCTimestamp(opts.MinTimestampHint),
2765 max_timestamp_hint: goToCTimestamp(opts.MaxTimestampHint),
2766 with_stats: C.bool(opts.WithStats),
2767 }
2768 }
2769
2770 func statusToError(s C.DBStatus) error {
2771 if s.data == nil {
2772 return nil
2773 }
2774 return &Error{msg: cStringToGoString(s)}
2775 }
2776
2777 func writeTooOldToError(readTS hlc.Timestamp, existingCTS C.DBTimestamp) error {
2778 existingTS := cToGoTimestamp(existingCTS)
2779 if !existingTS.IsEmpty() {
2780 // The txn can't write at the existing timestamp, so we provide the
2781 // error with the timestamp immediately after it.
2782 return roachpb.NewWriteTooOldError(readTS, existingTS.Next())
2783 }
2784 return nil
2785 }
2786
2787 func uncertaintyToError(
2788 readTS hlc.Timestamp, existingCTS C.DBTimestamp, txn *roachpb.Transaction,
2789 ) error {
2790 existingTS := cToGoTimestamp(existingCTS)
2791 if !existingTS.IsEmpty() {
2792 return roachpb.NewReadWithinUncertaintyIntervalError(readTS, existingTS, txn)
2793 }
2794 return nil
2795 }
2796
2797 // goMerge takes existing and update byte slices that are expected to
2798 // be marshaled roachpb.Values and merges the two values returning a
2799 // marshaled roachpb.Value or an error.
2800 func goMerge(existing, update []byte) ([]byte, error) {
2801 var result C.DBString
2802 status := C.DBMergeOne(goToCSlice(existing), goToCSlice(update), &result)
2803 if status.data != nil {
2804 return nil, errors.Errorf("%s: existing=%q, update=%q",
2805 cStringToGoString(status), existing, update)
2806 }
2807 return cStringToGoBytes(result), nil
2808 }
2809
2810 // goPartialMerge takes existing and update byte slices that are expected to
2811 // be marshaled roachpb.Values and performs a partial merge using C++ code,
2812 // returning a marshaled roachpb.Value or an error.
2813 func goPartialMerge(existing, update []byte) ([]byte, error) { 2814 var result C.DBString 2815 status := C.DBPartialMergeOne(goToCSlice(existing), goToCSlice(update), &result) 2816 if status.data != nil { 2817 return nil, errors.Errorf("%s: existing=%q, update=%q", 2818 cStringToGoString(status), existing, update) 2819 } 2820 return cStringToGoBytes(result), nil 2821 } 2822 2823 func emptyKeyError() error { 2824 return errors.Errorf("attempted access to empty key") 2825 } 2826 2827 func dbPut(rdb *C.DBEngine, key MVCCKey, value []byte) error { 2828 if len(key.Key) == 0 { 2829 return emptyKeyError() 2830 } 2831 2832 // *Put, *Get, and *Delete call memcpy() (by way of MemTable::Add) 2833 // when called, so we do not need to worry about these byte slices 2834 // being reclaimed by the GC. 2835 return statusToError(C.DBPut(rdb, goToCKey(key), goToCSlice(value))) 2836 } 2837 2838 func dbMerge(rdb *C.DBEngine, key MVCCKey, value []byte) error { 2839 if len(key.Key) == 0 { 2840 return emptyKeyError() 2841 } 2842 2843 // DBMerge calls memcpy() (by way of MemTable::Add) 2844 // when called, so we do not need to worry about these byte slices being 2845 // reclaimed by the GC. 2846 return statusToError(C.DBMerge(rdb, goToCKey(key), goToCSlice(value))) 2847 } 2848 2849 func dbApplyBatchRepr(rdb *C.DBEngine, repr []byte, sync bool) error { 2850 return statusToError(C.DBApplyBatchRepr(rdb, goToCSlice(repr), C.bool(sync))) 2851 } 2852 2853 // dbGet returns the value for the given key. 2854 func dbGet(rdb *C.DBEngine, key MVCCKey) ([]byte, error) { 2855 if len(key.Key) == 0 { 2856 return nil, emptyKeyError() 2857 } 2858 var result C.DBString 2859 err := statusToError(C.DBGet(rdb, goToCKey(key), &result)) 2860 if err != nil { 2861 return nil, err 2862 } 2863 return cStringToGoBytes(result), nil 2864 } 2865 2866 func dbGetProto( 2867 rdb *C.DBEngine, key MVCCKey, msg protoutil.Message, 2868 ) (ok bool, keyBytes, valBytes int64, err error) { 2869 if len(key.Key) == 0 { 2870 err = emptyKeyError() 2871 return 2872 } 2873 var result C.DBString 2874 if err = statusToError(C.DBGet(rdb, goToCKey(key), &result)); err != nil { 2875 return 2876 } 2877 if result.len == 0 { 2878 msg.Reset() 2879 return 2880 } 2881 ok = true 2882 if msg != nil { 2883 // Make a byte slice that is backed by result.data. This slice 2884 // cannot live past the lifetime of this method, but we're only 2885 // using it to unmarshal the roachpb. 2886 data := cSliceToUnsafeGoBytes(C.DBSlice{data: result.data, len: result.len}) 2887 err = protoutil.Unmarshal(data, msg) 2888 } 2889 C.free(unsafe.Pointer(result.data)) 2890 keyBytes = int64(key.EncodedSize()) 2891 valBytes = int64(result.len) 2892 return 2893 } 2894 2895 func dbClear(rdb *C.DBEngine, key MVCCKey) error { 2896 if len(key.Key) == 0 { 2897 return emptyKeyError() 2898 } 2899 return statusToError(C.DBDelete(rdb, goToCKey(key))) 2900 } 2901 2902 func dbSingleClear(rdb *C.DBEngine, key MVCCKey) error { 2903 if len(key.Key) == 0 { 2904 return emptyKeyError() 2905 } 2906 return statusToError(C.DBSingleDelete(rdb, goToCKey(key))) 2907 } 2908 2909 func dbClearRange(rdb *C.DBEngine, start, end MVCCKey) error { 2910 if err := statusToError(C.DBDeleteRange(rdb, goToCKey(start), goToCKey(end))); err != nil { 2911 return err 2912 } 2913 // This is a serious hack. RocksDB generates sstables which cover an 2914 // excessively large amount of the key space when range tombstones are 2915 // present. 
The crux of the problem is that the logic for determining sstable
2916 // boundaries depends on actual keys being present. So we help that logic
2917 // along by adding deletions of the first key covered by the range tombstone,
2918 // and a key near the end of the range (computing the exact previous key is difficult). See
2919 // TestRocksDBDeleteRangeCompaction which verifies that either this hack is
2920 // working, or the upstream problem was fixed in RocksDB.
2921 if err := dbClear(rdb, start); err != nil {
2922 return err
2923 }
2924 prev := make(roachpb.Key, len(end.Key))
2925 copy(prev, end.Key)
2926 if n := len(prev) - 1; prev[n] > 0 {
2927 prev[n]--
2928 } else {
2929 prev = prev[:n]
2930 }
2931 if start.Key.Compare(prev) < 0 {
2932 if err := dbClear(rdb, MakeMVCCMetadataKey(prev)); err != nil {
2933 return err
2934 }
2935 }
2936 return nil
2937 }
2938
2939 func dbClearIterRange(rdb *C.DBEngine, iter Iterator, start, end roachpb.Key) error {
2940 getter, ok := iter.(dbIteratorGetter)
2941 if !ok {
2942 return errors.Errorf("%T is not a RocksDB iterator", iter)
2943 }
2944 return statusToError(C.DBDeleteIterRange(rdb, getter.getIter(),
2945 goToCKey(MakeMVCCMetadataKey(start)), goToCKey(MakeMVCCMetadataKey(end))))
2946 }
2947
2948 // TODO(dan): Rename this to RocksDBSSTFileReader and RocksDBSSTFileWriter.
2949
2950 // RocksDBSstFileReader allows iteration over a number of non-overlapping
2951 // sstables exported by `RocksDBSstFileWriter`.
2952 type RocksDBSstFileReader struct {
2953 rocksDB *RocksDB
2954 filenameCounter int
2955 }
2956
2957 // MakeRocksDBSstFileReader creates a RocksDBSstFileReader backed by an
2958 // in-memory RocksDB instance.
2959 func MakeRocksDBSstFileReader() RocksDBSstFileReader {
2960 // cacheSize was selected because it's used for almost all other newRocksDBInMem
2961 // calls. It has seemed to work well so far, but there's probably more tuning
2962 // to be done here.
2963 const cacheSize = 1 << 20
2964 return RocksDBSstFileReader{rocksDB: newRocksDBInMem(roachpb.Attributes{}, cacheSize)}
2965 }
2966
2967 // IngestExternalFile links a file with the given contents into a database. See
2968 // the RocksDB documentation on `IngestExternalFile` for the various
2969 // restrictions on what can be added.
2970 func (fr *RocksDBSstFileReader) IngestExternalFile(data []byte) error {
2971 if fr.rocksDB == nil {
2972 return errors.New("cannot call IngestExternalFile on a closed reader")
2973 }
2974
2975 filename := fmt.Sprintf("ingest-%d", fr.filenameCounter)
2976 fr.filenameCounter++
2977 if err := fr.rocksDB.WriteFile(filename, data); err != nil {
2978 return err
2979 }
2980
2981 cPaths := make([]*C.char, 1)
2982 cPaths[0] = C.CString(filename)
2983 cPathLen := C.size_t(len(cPaths))
2984 defer C.free(unsafe.Pointer(cPaths[0]))
2985
2986 const noMove = false
2987 return statusToError(C.DBIngestExternalFiles(fr.rocksDB.rdb, &cPaths[0], cPathLen, noMove))
2988 }
2989
2990 // Iterate iterates over the keys between start inclusive and end
2991 // exclusive, invoking f() on each key/value pair.
2992 func (fr *RocksDBSstFileReader) Iterate(
2993 start, end roachpb.Key, f func(MVCCKeyValue) (bool, error),
2994 ) error {
2995 if fr.rocksDB == nil {
2996 return errors.New("cannot call Iterate on a closed reader")
2997 }
2998 return fr.rocksDB.Iterate(start, end, f)
2999 }
3000
3001 // NewIterator returns an iterator over this sst reader.
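// A minimal usage sketch, mirroring CheckForKeyCollisions above (data, start,
// and end are placeholders for the caller's SST bytes and key bounds):
//
//	sst := MakeRocksDBSstFileReader()
//	defer sst.Close()
//	if err := sst.IngestExternalFile(data); err != nil { /* handle err */ }
//	it := sst.NewIterator(IterOptions{UpperBound: end})
//	defer it.Close()
//	it.SeekGE(MakeMVCCMetadataKey(start))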
3002 func (fr *RocksDBSstFileReader) NewIterator(opts IterOptions) Iterator { 3003 return newRocksDBIterator(fr.rocksDB.rdb, opts, fr.rocksDB, fr.rocksDB) 3004 } 3005 3006 // Close finishes the reader. 3007 func (fr *RocksDBSstFileReader) Close() { 3008 if fr.rocksDB == nil { 3009 return 3010 } 3011 fr.rocksDB.Close() 3012 fr.rocksDB = nil 3013 } 3014 3015 // RocksDBSstFileWriter creates a file suitable for importing with 3016 // RocksDBSstFileReader. It implements the Writer interface. 3017 type RocksDBSstFileWriter struct { 3018 fw *C.DBSstFileWriter 3019 // dataSize tracks the total key and value bytes added so far. 3020 dataSize int64 3021 } 3022 3023 var _ Writer = &RocksDBSstFileWriter{} 3024 3025 // MakeRocksDBSstFileWriter creates a new RocksDBSstFileWriter with the default 3026 // configuration. 3027 // 3028 // NOTE: This is deprecated - and should only be used in tests to check for 3029 // equivalence with engine.SSTWriter. 3030 // 3031 // TODO(itsbilal): Move all tests to SSTWriter and then delete this function 3032 // and struct. 3033 func MakeRocksDBSstFileWriter() (RocksDBSstFileWriter, error) { 3034 fw := C.DBSstFileWriterNew() 3035 err := statusToError(C.DBSstFileWriterOpen(fw)) 3036 return RocksDBSstFileWriter{fw: fw}, err 3037 } 3038 3039 // ApplyBatchRepr implements the Writer interface. 3040 func (fw *RocksDBSstFileWriter) ApplyBatchRepr(repr []byte, sync bool) error { 3041 panic("unimplemented") 3042 } 3043 3044 // Clear implements the Writer interface. Note that it inserts a tombstone 3045 // rather than actually remove the entry from the storage engine. An error is 3046 // returned if it is not greater than any previous key used in Put or Clear 3047 // (according to the comparator configured during writer creation). Close 3048 // cannot have been called. 3049 func (fw *RocksDBSstFileWriter) Clear(key MVCCKey) error { 3050 if fw.fw == nil { 3051 return errors.New("cannot call Clear on a closed writer") 3052 } 3053 fw.dataSize += int64(len(key.Key)) 3054 return statusToError(C.DBSstFileWriterDelete(fw.fw, goToCKey(key))) 3055 } 3056 3057 // DataSize returns the total key and value bytes added so far. 3058 func (fw *RocksDBSstFileWriter) DataSize() int64 { 3059 return fw.dataSize 3060 } 3061 3062 // SingleClear implements the Writer interface. 3063 func (fw *RocksDBSstFileWriter) SingleClear(key MVCCKey) error { 3064 panic("unimplemented") 3065 } 3066 3067 // ClearRange implements the Writer interface. Note that it inserts a range deletion 3068 // tombstone rather than actually remove the entries from the storage engine. 3069 // It can be called at any time with respect to Put and Clear. 3070 func (fw *RocksDBSstFileWriter) ClearRange(start, end MVCCKey) error { 3071 if fw.fw == nil { 3072 return errors.New("cannot call ClearRange on a closed writer") 3073 } 3074 fw.dataSize += int64(len(start.Key)) + int64(len(end.Key)) 3075 return statusToError(C.DBSstFileWriterDeleteRange(fw.fw, goToCKey(start), goToCKey(end))) 3076 } 3077 3078 // ClearIterRange implements the Writer interface. 3079 // 3080 // NOTE: This method is fairly expensive as it performs a Cgo call for every 3081 // key deleted. 
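// Each visited key turns into a separate DBSstFileWriterDelete call (via
// fw.Clear), which is why large spans are better served by ClearRange, which
// writes a single range tombstone.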
3082 func (fw *RocksDBSstFileWriter) ClearIterRange(iter Iterator, start, end roachpb.Key) error { 3083 if fw.fw == nil { 3084 return errors.New("cannot call ClearIterRange on a closed writer") 3085 } 3086 mvccEndKey := MakeMVCCMetadataKey(end) 3087 iter.SeekGE(MakeMVCCMetadataKey(start)) 3088 for { 3089 valid, err := iter.Valid() 3090 if err != nil { 3091 return err 3092 } 3093 if !valid || !iter.Key().Less(mvccEndKey) { 3094 break 3095 } 3096 if err := fw.Clear(iter.Key()); err != nil { 3097 return err 3098 } 3099 iter.Next() 3100 } 3101 return nil 3102 } 3103 3104 // Merge implements the Writer interface. 3105 func (fw *RocksDBSstFileWriter) Merge(key MVCCKey, value []byte) error { 3106 panic("unimplemented") 3107 } 3108 3109 // Put implements the Writer interface. It puts a kv entry into the sstable 3110 // being built. An error is returned if it is not greater than any previous key 3111 // used in Put or Clear (according to the comparator configured during writer 3112 // creation). Close cannot have been called. 3113 func (fw *RocksDBSstFileWriter) Put(key MVCCKey, value []byte) error { 3114 if fw.fw == nil { 3115 return errors.New("cannot call Put on a closed writer") 3116 } 3117 fw.dataSize += int64(len(key.Key)) + int64(len(value)) 3118 return statusToError(C.DBSstFileWriterAdd(fw.fw, goToCKey(key), goToCSlice(value))) 3119 } 3120 3121 // LogData implements the Writer interface. 3122 func (fw *RocksDBSstFileWriter) LogData(data []byte) error { 3123 panic("unimplemented") 3124 } 3125 3126 // LogLogicalOp implements the Writer interface. 3127 func (fw *RocksDBSstFileWriter) LogLogicalOp(op MVCCLogicalOpType, details MVCCLogicalOpDetails) { 3128 // No-op. Logical logging disabled. 3129 } 3130 3131 // Truncate truncates the writer's current memory buffer and returns the 3132 // contents it contained. May be called multiple times. The function may not 3133 // truncate and return all keys if the underlying RocksDB blocks have not been 3134 // flushed. Close cannot have been called. 3135 func (fw *RocksDBSstFileWriter) Truncate() ([]byte, error) { 3136 if fw.fw == nil { 3137 return nil, errors.New("cannot call Truncate on a closed writer") 3138 } 3139 var contents C.DBString 3140 if err := statusToError(C.DBSstFileWriterTruncate(fw.fw, &contents)); err != nil { 3141 return nil, err 3142 } 3143 return cStringToGoBytes(contents), nil 3144 } 3145 3146 // Finish finalizes the writer and returns the constructed file's contents. At 3147 // least one kv entry must have been added. 3148 func (fw *RocksDBSstFileWriter) Finish() ([]byte, error) { 3149 if fw.fw == nil { 3150 return nil, errors.New("cannot call Finish on a closed writer") 3151 } 3152 var contents C.DBString 3153 if err := statusToError(C.DBSstFileWriterFinish(fw.fw, &contents)); err != nil { 3154 return nil, err 3155 } 3156 return cStringToGoBytes(contents), nil 3157 } 3158 3159 // Close finishes and frees memory and other resources. Close is idempotent. 3160 func (fw *RocksDBSstFileWriter) Close() { 3161 if fw.fw == nil { 3162 return 3163 } 3164 C.DBSstFileWriterClose(fw.fw) 3165 fw.fw = nil 3166 } 3167 3168 // RunLDB runs RocksDB's ldb command-line tool. The passed 3169 // command-line arguments should not include argv[0]. 3170 func RunLDB(args []string) { 3171 // Prepend "ldb" as argv[0]. 3172 args = append([]string{"ldb"}, args...) 
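// Convert the arguments to NUL-terminated C strings for DBRunLDB, freeing
// them once the call returns.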
3173 argv := make([]*C.char, len(args)) 3174 for i := range args { 3175 argv[i] = C.CString(args[i]) 3176 } 3177 defer func() { 3178 for i := range argv { 3179 C.free(unsafe.Pointer(argv[i])) 3180 } 3181 }() 3182 3183 C.DBRunLDB(C.int(len(argv)), &argv[0]) 3184 } 3185 3186 // RunSSTDump runs RocksDB's sst_dump command-line tool. The passed 3187 // command-line arguments should not include argv[0]. 3188 func RunSSTDump(args []string) { 3189 // Prepend "sst_dump" as argv[0]. 3190 args = append([]string{"sst_dump"}, args...) 3191 argv := make([]*C.char, len(args)) 3192 for i := range args { 3193 argv[i] = C.CString(args[i]) 3194 } 3195 defer func() { 3196 for i := range argv { 3197 C.free(unsafe.Pointer(argv[i])) 3198 } 3199 }() 3200 3201 C.DBRunSSTDump(C.int(len(argv)), &argv[0]) 3202 } 3203 3204 // GetAuxiliaryDir returns the auxiliary storage path for this engine. 3205 func (r *RocksDB) GetAuxiliaryDir() string { 3206 return r.auxDir 3207 } 3208 3209 func (r *RocksDB) setAuxiliaryDir(d string) error { 3210 if !r.cfg.ReadOnly { 3211 if err := os.MkdirAll(d, 0755); err != nil { 3212 return err 3213 } 3214 } 3215 r.auxDir = d 3216 return nil 3217 } 3218 3219 // PreIngestDelay implements the Engine interface. 3220 func (r *RocksDB) PreIngestDelay(ctx context.Context) { 3221 preIngestDelay(ctx, r, r.cfg.Settings) 3222 } 3223 3224 // IngestExternalFiles atomically links a slice of files into the RocksDB 3225 // log-structured merge-tree. 3226 func (r *RocksDB) IngestExternalFiles(ctx context.Context, paths []string) error { 3227 cPaths := make([]*C.char, len(paths)) 3228 for i := range paths { 3229 cPaths[i] = C.CString(paths[i]) 3230 } 3231 defer func() { 3232 for i := range cPaths { 3233 C.free(unsafe.Pointer(cPaths[i])) 3234 } 3235 }() 3236 3237 return statusToError(C.DBIngestExternalFiles( 3238 r.rdb, 3239 &cPaths[0], 3240 C.size_t(len(cPaths)), 3241 C._Bool(true), // move_files 3242 )) 3243 } 3244 3245 // InMem returns true if the receiver is an in-memory engine and false 3246 // otherwise. 3247 func (r *RocksDB) InMem() bool { 3248 return r.cfg.Dir == "" 3249 } 3250 3251 // ReadFile reads the content from a file with the given filename. The file 3252 // must have been opened through Engine.OpenFile. Otherwise an error will be 3253 // returned. 3254 func (r *RocksDB) ReadFile(filename string) ([]byte, error) { 3255 var data C.DBSlice 3256 if err := statusToError(C.DBEnvReadFile(r.rdb, goToCSlice([]byte(filename)), &data)); err != nil { 3257 return nil, notFoundErrOrDefault(err) 3258 } 3259 defer C.free(unsafe.Pointer(data.data)) 3260 return cSliceToGoBytes(data), nil 3261 } 3262 3263 // WriteFile writes data to a file in this RocksDB's env. 3264 func (r *RocksDB) WriteFile(filename string, data []byte) error { 3265 return statusToError(C.DBEnvWriteFile(r.rdb, goToCSlice([]byte(filename)), goToCSlice(data))) 3266 } 3267 3268 // Remove deletes the file with the given filename from this RocksDB's env. 3269 // If the file with given filename doesn't exist, return os.ErrNotExist. 3270 func (r *RocksDB) Remove(filename string) error { 3271 if err := statusToError(C.DBEnvDeleteFile(r.rdb, goToCSlice([]byte(filename)))); err != nil { 3272 return notFoundErrOrDefault(err) 3273 } 3274 return nil 3275 } 3276 3277 // RemoveAll removes path and any children it contains from this RocksDB's 3278 // env. If the path does not exist, RemoveAll returns nil (no error). 
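// Since the RocksDB Env interface cannot distinguish files from directories,
// the implementation first tries to list and recursively remove children,
// then falls back to removing the path as a regular file when listing fails.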
3279 func (r *RocksDB) RemoveAll(path string) error {
3280 // We don't have a reliable way of telling whether a path is a directory
3281 // or a file from the RocksDB Env interface. Assume it's a directory,
3282 // ignoring any resulting error, and delete any of its children.
3283 dirents, listErr := r.List(path)
3284 if listErr == nil {
3285 for _, dirent := range dirents {
3286 err := r.RemoveAll(filepath.Join(path, dirent))
3287 if err != nil {
3288 return err
3289 }
3290 }
3291
3292 // Path should exist, point to a directory and have no children.
3293 return r.RemoveDir(path)
3294 }
3295
3296 // Path might be a file, non-existent, or a directory for which List
3297 // errored for some other reason.
3298 err := r.Remove(path)
3299 if err == nil {
3300 return nil
3301 }
3302 if os.IsNotExist(err) && os.IsNotExist(listErr) {
3303 return nil
3304 }
3305 return listErr
3306 }
3307
3308 // Link creates 'newname' as a hard link to 'oldname'. This uses the Env
3309 // responsible for the file, which may handle extra logic (e.g. copying encryption
3310 // settings for EncryptedEnv).
3311 func (r *RocksDB) Link(oldname, newname string) error {
3312 if err := statusToError(C.DBEnvLinkFile(r.rdb, goToCSlice([]byte(oldname)), goToCSlice([]byte(newname)))); err != nil {
3313 return &os.LinkError{
3314 Op: "link",
3315 Old: oldname,
3316 New: newname,
3317 Err: err,
3318 }
3319 }
3320 return nil
3321 }
3322
3323 // IsValidSplitKey returns whether the key is a valid split key. Certain key
3324 // ranges cannot be split (the meta1 span and the system DB span); split keys
3325 // chosen within any of these ranges are considered invalid. A split key
3326 // equal to Meta2KeyMax (\x03\xff\xff) is also considered invalid.
3327 func IsValidSplitKey(key roachpb.Key) bool {
3328 return bool(C.MVCCIsValidSplitKey(goToCSlice(key)))
3329 }
3330
3331 // lockFile sets a lock on the specified file using RocksDB's file locking interface.
3332 func lockFile(filename string) (C.DBFileLock, error) {
3333 var lock C.DBFileLock
3334 // C.DBLockFile mutates its argument. `lock, statusToError(...)`
3335 // happens to work in gc, but does not work in gccgo.
3336 //
3337 // See https://github.com/golang/go/issues/23188.
3338 err := statusToError(C.DBLockFile(goToCSlice([]byte(filename)), &lock))
3339 return lock, err
3340 }
3341
3342 // unlockFile unlocks the file associated with the specified lock and GCs any allocated memory for the lock.
3343 func unlockFile(lock C.DBFileLock) error {
3344 return statusToError(C.DBUnlockFile(lock))
3345 }
3346
3347 // MVCCScanDecodeKeyValue decodes a key/value pair returned in an MVCCScan
3348 // "batch" (this is not the RocksDB batch repr format), returning both the
3349 // key/value and the suffix of data remaining in the batch.
3350 func MVCCScanDecodeKeyValue(repr []byte) (key MVCCKey, value []byte, orepr []byte, err error) {
3351 k, ts, value, orepr, err := enginepb.ScanDecodeKeyValue(repr)
3352 return MVCCKey{k, ts}, value, orepr, err
3353 }
3354
3355 // MVCCScanDecodeKeyValues decodes all key/value pairs returned in one or more
3356 // MVCCScan "batches" (this is not the RocksDB batch repr format). The provided
3357 // function is called for each key/value pair.
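// For example, each of the KVData buffers returned by rocksDBIterator.MVCCScan
// can be passed here to visit every decoded key/value pair it contains.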
3358 func MVCCScanDecodeKeyValues(repr [][]byte, fn func(key MVCCKey, rawBytes []byte) error) error { 3359 var k MVCCKey 3360 var rawBytes []byte 3361 var err error 3362 for _, data := range repr { 3363 for len(data) > 0 { 3364 k, rawBytes, data, err = MVCCScanDecodeKeyValue(data) 3365 if err != nil { 3366 return err 3367 } 3368 if err = fn(k, rawBytes); err != nil { 3369 return err 3370 } 3371 } 3372 } 3373 return nil 3374 } 3375 3376 func notFoundErrOrDefault(err error) error { 3377 errStr := err.Error() 3378 if strings.Contains(errStr, "No such") || 3379 strings.Contains(errStr, "not found") || 3380 strings.Contains(errStr, "does not exist") || 3381 strings.Contains(errStr, "NotFound:") || 3382 strings.Contains(errStr, "cannot find") { 3383 return os.ErrNotExist 3384 } 3385 return err 3386 } 3387 3388 // rocksdbWritableFile implements the File interface. It is used to interact with the 3389 // DBWritableFile in the corresponding RocksDB env. 3390 type rocksdbWritableFile struct { 3391 file C.DBWritableFile 3392 rdb *C.DBEngine 3393 } 3394 3395 var _ fs.File = &rocksdbWritableFile{} 3396 3397 // Write implements the File interface. 3398 func (f *rocksdbWritableFile) Write(data []byte) (int, error) { 3399 err := statusToError(C.DBEnvAppendFile(f.rdb, f.file, goToCSlice(data))) 3400 return len(data), err 3401 } 3402 3403 // Close implements the File interface. 3404 func (f *rocksdbWritableFile) Close() error { 3405 return statusToError(C.DBEnvCloseFile(f.rdb, f.file)) 3406 } 3407 3408 // Sync implements the File interface. 3409 func (f *rocksdbWritableFile) Sync() error { 3410 return statusToError(C.DBEnvSyncFile(f.rdb, f.file)) 3411 } 3412 3413 // Read implements the File interface. 3414 func (f *rocksdbWritableFile) Read(p []byte) (n int, err error) { 3415 return 0, fmt.Errorf("cannot read file opened for writing") 3416 } 3417 3418 // ReadAt implements the File interface. 3419 func (f *rocksdbWritableFile) ReadAt(p []byte, off int64) (n int, err error) { 3420 return 0, fmt.Errorf("cannot read file opened for writing") 3421 } 3422 3423 // rocksdbReadableFile implements the File interface. It is used to interact with the 3424 // DBReadableFile in the corresponding RocksDB env. 3425 type rocksdbReadableFile struct { 3426 file C.DBReadableFile 3427 rdb *C.DBEngine 3428 offset int64 3429 } 3430 3431 var _ fs.File = &rocksdbReadableFile{} 3432 3433 // Write implements the File interface. 3434 func (f *rocksdbReadableFile) Write(data []byte) (int, error) { 3435 return 0, fmt.Errorf("cannot write file opened for reading") 3436 } 3437 3438 // Close implements the File interface. 3439 func (f *rocksdbReadableFile) Close() error { 3440 return statusToError(C.DBEnvCloseReadableFile(f.rdb, f.file)) 3441 } 3442 3443 // Sync implements the File interface. 3444 func (f *rocksdbReadableFile) Sync() error { 3445 return fmt.Errorf("cannot sync file opened for reading") 3446 } 3447 3448 // Read implements the File interface. 3449 func (f *rocksdbReadableFile) Read(p []byte) (n int, err error) { 3450 n, err = f.ReadAt(p, f.offset) 3451 f.offset += int64(n) 3452 return 3453 } 3454 3455 // ReadAt implements the File interface. 
3456 func (f *rocksdbReadableFile) ReadAt(p []byte, off int64) (int, error) { 3457 var n C.int 3458 err := statusToError(C.DBEnvReadAtFile(f.rdb, f.file, goToCSlice(p), C.int64_t(off), &n)) 3459 return int(n), err 3460 } 3461 3462 type rocksdbDirectory struct { 3463 file C.DBDirectory 3464 rdb *C.DBEngine 3465 } 3466 3467 var _ fs.File = &rocksdbDirectory{} 3468 3469 // Write implements the File interface. 3470 func (f *rocksdbDirectory) Write(data []byte) (int, error) { 3471 return 0, fmt.Errorf("cannot write to directory") 3472 } 3473 3474 // Close implements the File interface. 3475 func (f *rocksdbDirectory) Close() error { 3476 return statusToError(C.DBEnvCloseDirectory(f.rdb, f.file)) 3477 } 3478 3479 // Sync implements the File interface. 3480 func (f *rocksdbDirectory) Sync() error { 3481 return statusToError(C.DBEnvSyncDirectory(f.rdb, f.file)) 3482 } 3483 3484 // Read implements the File interface. 3485 func (f *rocksdbDirectory) Read(p []byte) (n int, err error) { 3486 return 0, fmt.Errorf("cannot read directory") 3487 } 3488 3489 // ReadAt implements the File interface. 3490 func (f *rocksdbDirectory) ReadAt(p []byte, off int64) (n int, err error) { 3491 return 0, fmt.Errorf("cannot read directory") 3492 } 3493 3494 var _ fs.FS = &RocksDB{} 3495 3496 // Create implements the FS interface. 3497 func (r *RocksDB) Create(name string) (fs.File, error) { 3498 return r.CreateWithSync(name, 0) 3499 } 3500 3501 // CreateWithSync implements the FS interface. 3502 func (r *RocksDB) CreateWithSync(name string, bytesPerSync int) (fs.File, error) { 3503 var file C.DBWritableFile 3504 if err := statusToError(C.DBEnvOpenFile( 3505 r.rdb, goToCSlice([]byte(name)), C.uint64_t(bytesPerSync), &file)); err != nil { 3506 return nil, notFoundErrOrDefault(err) 3507 } 3508 return &rocksdbWritableFile{file: file, rdb: r.rdb}, nil 3509 } 3510 3511 // Open implements the FS interface. 3512 func (r *RocksDB) Open(name string) (fs.File, error) { 3513 var file C.DBReadableFile 3514 if err := statusToError(C.DBEnvOpenReadableFile(r.rdb, goToCSlice([]byte(name)), &file)); err != nil { 3515 return nil, notFoundErrOrDefault(err) 3516 } 3517 return &rocksdbReadableFile{file: file, rdb: r.rdb}, nil 3518 } 3519 3520 // OpenDir implements the FS interface. 3521 func (r *RocksDB) OpenDir(name string) (fs.File, error) { 3522 var file C.DBDirectory 3523 if err := statusToError(C.DBEnvOpenDirectory(r.rdb, goToCSlice([]byte(name)), &file)); err != nil { 3524 return nil, notFoundErrOrDefault(err) 3525 } 3526 return &rocksdbDirectory{file: file, rdb: r.rdb}, nil 3527 } 3528 3529 // Rename implements the FS interface. 3530 func (r *RocksDB) Rename(oldname, newname string) error { 3531 return statusToError(C.DBEnvRenameFile(r.rdb, goToCSlice([]byte(oldname)), goToCSlice([]byte(newname)))) 3532 } 3533 3534 // MkdirAll implements the FS interface. 3535 func (r *RocksDB) MkdirAll(path string) error { 3536 path = filepath.Clean(path) 3537 3538 // Skip trailing path separators. 3539 for len(path) > 0 && path[len(path)-1] == filepath.Separator { 3540 path = path[:len(path)-1] 3541 } 3542 // The path may be empty after cleaning and trimming tailing path 3543 // separators. 3544 if path == "" { 3545 return nil 3546 } 3547 3548 // Ensure the parent exists first. 3549 parent, _ := filepath.Split(path) 3550 if parent != "" { 3551 if err := r.MkdirAll(parent); err != nil { 3552 return err 3553 } 3554 } 3555 return statusToError(C.DBEnvCreateDir(r.rdb, goToCSlice([]byte(path)))) 3556 } 3557 3558 // RemoveDir implements the FS interface. 
3559 func (r *RocksDB) RemoveDir(name string) error { 3560 return statusToError(C.DBEnvDeleteDir(r.rdb, goToCSlice([]byte(name)))) 3561 } 3562 3563 // List implements the FS interface. 3564 func (r *RocksDB) List(name string) ([]string, error) { 3565 list := C.DBEnvListDir(r.rdb, goToCSlice([]byte(name))) 3566 n := list.n 3567 names := list.names 3568 // We can't index into names because it is a pointer, not a slice. The 3569 // hackery below treats the pointer as an array and then constructs 3570 // a slice from it. 3571 nameSize := unsafe.Sizeof(C.DBString{}) 3572 nameVal := func(i int) C.DBString { 3573 return *(*C.DBString)(unsafe.Pointer(uintptr(unsafe.Pointer(names)) + uintptr(i)*nameSize)) 3574 } 3575 err := statusToError(list.status) 3576 if err != nil { 3577 err = notFoundErrOrDefault(err) 3578 } 3579 3580 result := make([]string, n) 3581 j := 0 3582 for i := range result { 3583 str := cStringToGoString(nameVal(i)) 3584 if str == "." || str == ".." { 3585 continue 3586 } 3587 result[j] = str 3588 j++ 3589 } 3590 C.free(unsafe.Pointer(names)) 3591 3592 result = result[:j] 3593 sort.Strings(result) 3594 return result, err 3595 } 3596 3597 // ThreadStacks returns the stacks for all threads. The stacks are raw 3598 // addresses, and do not contain symbols. Use addr2line (or atos on Darwin) to 3599 // symbolize. 3600 func ThreadStacks() string { 3601 return cStringToGoString(C.DBDumpThreadStacks()) 3602 }