// Source: github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/bucket.go

//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

package lsmkv

import (
	"bytes"
	"context"
	"fmt"
	"os"
	"path/filepath"
	"sort"
	"sync"
	"time"

	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/sirupsen/logrus"
	"github.com/weaviate/weaviate/adapters/repos/db/lsmkv/segmentindex"
	"github.com/weaviate/weaviate/entities/cyclemanager"
	"github.com/weaviate/weaviate/entities/interval"
	"github.com/weaviate/weaviate/entities/lsmkv"
	"github.com/weaviate/weaviate/entities/storagestate"
	"github.com/weaviate/weaviate/entities/storobj"
)

// Bucket groups an active memtable, an optional memtable that is currently
// being flushed, and a group of on-disk segments, together with the
// thresholds, locks and flags that coordinate them.
type Bucket struct {
	// dir is the directory the bucket's files live in; rootDir is stored as
	// passed to the constructor and only exposed through a getter here.
	dir     string
	rootDir string
	// active receives all writes. flushing is non-nil only while a former
	// active memtable is being written out to a disk segment.
	active   *Memtable
	flushing *Memtable
	disk     *SegmentGroup
	logger   logrus.FieldLogger

	// Lock() means a move from active to flushing is happening, RLock() is
	// normal operation
	flushLock sync.RWMutex
	// haltedFlushTimer throttles the warning that is logged while flushing
	// is halted because the bucket is read-only.
	haltedFlushTimer *interval.BackoffTimer

	// A flush is triggered once the commit log reaches walThreshold, the
	// memtable reaches memtableThreshold, or the memtable has been dirty for
	// at least flushDirtyAfter.
	walThreshold      uint64
	flushDirtyAfter   time.Duration
	memtableThreshold uint64
	// memtableResizer, when set, supplies the initial memtable threshold and
	// a new target after each flush.
	memtableResizer *memtableSizeAdvisor
	strategy        string
	// The strategy the inverted index is supposed to be created with, in
	// case the existing segment files were created with a different one.
	// This can happen when a new strategy was introduced to weaviate, but
	// files were already created using the old implementation.
	// Example: RoaringSet strategy replaces CollectionSet strategy.
	// The field can be used to migrate files of the old strategy to the
	// newer one.
	desiredStrategy  string
	secondaryIndices uint16

	// Optional to avoid syscalls
	mmapContents bool

	// for backward compatibility
	legacyMapSortingBeforeCompaction bool

	// flushCallbackCtrl is the handle of the registered periodic flush
	// callback; it is unregistered on shutdown.
	flushCallbackCtrl cyclemanager.CycleCallbackCtrl

	// status is guarded by statusLock.
	status     storagestate.Status
	statusLock sync.RWMutex

	metrics *Metrics

	// all "replace" buckets support counting through net additions, but not all
	// produce a meaningful count. Typically, the only count we're interested in
	// is that of the bucket that holds objects
	monitorCount bool

	pauseTimer *prometheus.Timer // Times the pause

	// Whether tombstones (set/map/replace types) or deletions (roaringset type)
	// should be kept in the root segment during the compaction process.
	// Since segments are immutable, deletions are added as new entries with
	// tombstones. Tombstones are by default copied to the merged segment, as
	// they can refer to keys/values present in previous segments.
	// Those tombstones can be removed entirely when merging with the root
	// (1st) segment, because there are no previous segments they could relate
	// to. As the info about a key/value being deleted (based on tombstone
	// presence) may be important for some use cases (e.g. replication needs
	// to know if an object (ObjectsBucketLSM) was deleted), keeping
	// tombstones on compaction is optional.
	keepTombstones bool

	// Init and use bloom filter for getting key from bucket segments.
	// As some buckets can be accessed only with cursor (see flat index),
	// where bloom filter is not applicable, it can be disabled.
	// ON by default
	useBloomFilter bool

	// Net additions keep track of number of elements stored in bucket (of type replace).
	// As some buckets don't have to provide Count info (see flat index),
	// tracking additions can be disabled.
	// ON by default
	calcCountNetAdditions bool

	forceCompaction bool
}

// NewBucket initializes a new bucket.
It either loads the state from disk if 110 // it exists, or initializes new state. 111 // 112 // You do not need to ever call NewBucket() yourself, if you are using a 113 // [Store]. In this case the [Store] can manage buckets for you, using methods 114 // such as CreateOrLoadBucket(). 115 func NewBucket(ctx context.Context, dir, rootDir string, logger logrus.FieldLogger, 116 metrics *Metrics, compactionCallbacks, flushCallbacks cyclemanager.CycleCallbackGroup, 117 opts ...BucketOption, 118 ) (*Bucket, error) { 119 beforeAll := time.Now() 120 defaultMemTableThreshold := uint64(10 * 1024 * 1024) 121 defaultWalThreshold := uint64(1024 * 1024 * 1024) 122 defaultFlushAfterDirty := 60 * time.Second 123 defaultStrategy := StrategyReplace 124 125 if err := os.MkdirAll(dir, 0o700); err != nil { 126 return nil, err 127 } 128 129 b := &Bucket{ 130 dir: dir, 131 rootDir: rootDir, 132 memtableThreshold: defaultMemTableThreshold, 133 walThreshold: defaultWalThreshold, 134 flushDirtyAfter: defaultFlushAfterDirty, 135 strategy: defaultStrategy, 136 mmapContents: true, 137 logger: logger, 138 metrics: metrics, 139 useBloomFilter: true, 140 calcCountNetAdditions: true, 141 haltedFlushTimer: interval.NewBackoffTimer(), 142 } 143 144 for _, opt := range opts { 145 if err := opt(b); err != nil { 146 return nil, err 147 } 148 } 149 150 if b.memtableResizer != nil { 151 b.memtableThreshold = uint64(b.memtableResizer.Initial()) 152 } 153 154 sg, err := newSegmentGroup(logger, metrics, compactionCallbacks, 155 sgConfig{ 156 dir: dir, 157 strategy: b.strategy, 158 mapRequiresSorting: b.legacyMapSortingBeforeCompaction, 159 monitorCount: b.monitorCount, 160 mmapContents: b.mmapContents, 161 keepTombstones: b.keepTombstones, 162 forceCompaction: b.forceCompaction, 163 useBloomFilter: b.useBloomFilter, 164 calcCountNetAdditions: b.calcCountNetAdditions, 165 }) 166 if err != nil { 167 return nil, fmt.Errorf("init disk segments: %w", err) 168 } 169 170 // Actual strategy is stored in segment 
files. In case it is SetCollection, 171 // while new implementation uses bitmaps and supposed to be RoaringSet, 172 // bucket and segmentgroup strategy is changed back to SetCollection 173 // (memtables will be created later on, with already modified strategy) 174 // TODO what if only WAL files exists, and there is no segment to get actual strategy? 175 if b.strategy == StrategyRoaringSet && len(sg.segments) > 0 && 176 sg.segments[0].strategy == segmentindex.StrategySetCollection { 177 b.strategy = StrategySetCollection 178 b.desiredStrategy = StrategyRoaringSet 179 sg.strategy = StrategySetCollection 180 } 181 // As of v1.19 property's IndexInterval setting is replaced with 182 // IndexFilterable (roaring set) + IndexSearchable (map) and enabled by default. 183 // Buckets for text/text[] inverted indexes created before 1.19 have strategy 184 // map and name that since 1.19 is used by filterable indeverted index. 185 // Those buckets (roaring set by configuration, but in fact map) have to be 186 // renamed on startup by migrator. 
Here actual strategy is set based on 187 // data found in segment files 188 if b.strategy == StrategyRoaringSet && len(sg.segments) > 0 && 189 sg.segments[0].strategy == segmentindex.StrategyMapCollection { 190 b.strategy = StrategyMapCollection 191 b.desiredStrategy = StrategyRoaringSet 192 sg.strategy = StrategyMapCollection 193 } 194 195 b.disk = sg 196 197 if err := b.mayRecoverFromCommitLogs(ctx); err != nil { 198 return nil, err 199 } 200 201 err = b.setNewActiveMemtable() 202 if err != nil { 203 return nil, err 204 } 205 206 id := "bucket/flush/" + b.dir 207 b.flushCallbackCtrl = flushCallbacks.Register(id, b.flushAndSwitchIfThresholdsMet) 208 209 b.metrics.TrackStartupBucket(beforeAll) 210 211 return b, nil 212 } 213 214 func (b *Bucket) GetDir() string { 215 return b.dir 216 } 217 218 func (b *Bucket) GetRootDir() string { 219 return b.rootDir 220 } 221 222 func (b *Bucket) GetStrategy() string { 223 return b.strategy 224 } 225 226 func (b *Bucket) GetDesiredStrategy() string { 227 return b.desiredStrategy 228 } 229 230 func (b *Bucket) GetSecondaryIndices() uint16 { 231 return b.secondaryIndices 232 } 233 234 func (b *Bucket) GetStatus() storagestate.Status { 235 b.statusLock.RLock() 236 defer b.statusLock.RUnlock() 237 238 return b.status 239 } 240 241 func (b *Bucket) GetMemtableThreshold() uint64 { 242 return b.memtableThreshold 243 } 244 245 func (b *Bucket) GetWalThreshold() uint64 { 246 return b.walThreshold 247 } 248 249 func (b *Bucket) GetFlushCallbackCtrl() cyclemanager.CycleCallbackCtrl { 250 return b.flushCallbackCtrl 251 } 252 253 func (b *Bucket) IterateObjects(ctx context.Context, f func(object *storobj.Object) error) error { 254 i := 0 255 cursor := b.Cursor() 256 defer cursor.Close() 257 258 for k, v := cursor.First(); k != nil; k, v = cursor.Next() { 259 obj, err := storobj.FromBinary(v) 260 if err != nil { 261 return fmt.Errorf("cannot unmarshal object %d, %v", i, err) 262 } 263 if err := f(obj); err != nil { 264 return 
fmt.Errorf("callback on object '%d' failed: %w", obj.DocID, err) 265 } 266 267 i++ 268 } 269 270 return nil 271 } 272 273 func (b *Bucket) IterateMapObjects(ctx context.Context, f func([]byte, []byte, []byte, bool) error) error { 274 cursor := b.MapCursor() 275 defer cursor.Close() 276 277 for kList, vList := cursor.First(); kList != nil; kList, vList = cursor.Next() { 278 for _, v := range vList { 279 if err := f(kList, v.Key, v.Value, v.Tombstone); err != nil { 280 return fmt.Errorf("callback on object '%v' failed: %w", v, err) 281 } 282 } 283 } 284 285 return nil 286 } 287 288 func (b *Bucket) SetMemtableThreshold(size uint64) { 289 b.memtableThreshold = size 290 } 291 292 // Get retrieves the single value for the given key. 293 // 294 // Get is specific to ReplaceStrategy and cannot be used with any of the other 295 // strategies. Use [Bucket.SetList] or [Bucket.MapList] instead. 296 // 297 // Get uses the regular or "primary" key for an object. If a bucket has 298 // secondary indexes, use [Bucket.GetBySecondary] to retrieve an object using 299 // its secondary key 300 func (b *Bucket) Get(key []byte) ([]byte, error) { 301 b.flushLock.RLock() 302 defer b.flushLock.RUnlock() 303 304 v, err := b.active.get(key) 305 if err == nil { 306 // item found and no error, return and stop searching, since the strategy 307 // is replace 308 return v, nil 309 } 310 if errors.Is(err, lsmkv.Deleted) { 311 // deleted in the mem-table (which is always the latest) means we don't 312 // have to check the disk segments, return nil now 313 return nil, nil 314 } 315 316 if !errors.Is(err, lsmkv.NotFound) { 317 panic(fmt.Sprintf("unsupported error in bucket.Get: %v\n", err)) 318 } 319 320 if b.flushing != nil { 321 v, err := b.flushing.get(key) 322 if err == nil { 323 // item found and no error, return and stop searching, since the strategy 324 // is replace 325 return v, nil 326 } 327 if errors.Is(err, lsmkv.Deleted) { 328 // deleted in the now most recent memtable means we don't 
have to check 329 // the disk segments, return nil now 330 return nil, nil 331 } 332 333 if !errors.Is(err, lsmkv.NotFound) { 334 panic("unsupported error in bucket.Get") 335 } 336 } 337 338 return b.disk.get(key) 339 } 340 341 // GetBySecondary retrieves an object using one of its secondary keys. A bucket 342 // can have an infinite number of secondary keys. Specify the secondary key 343 // position as the first argument. 344 // 345 // A real-life example of secondary keys is the Weaviate object store. Objects 346 // are stored with the user-facing ID as their primary key and with the doc-id 347 // (an ever-increasing uint64) as the secondary key. 348 // 349 // Similar to [Bucket.Get], GetBySecondary is limited to ReplaceStrategy. No 350 // equivalent exists for Set and Map, as those do not support secondary 351 // indexes. 352 func (b *Bucket) GetBySecondary(pos int, key []byte) ([]byte, error) { 353 bytes, _, err := b.GetBySecondaryIntoMemory(pos, key, nil) 354 return bytes, err 355 } 356 357 // GetBySecondaryWithBuffer is like [Bucket.GetBySecondary], but also takes a 358 // buffer. It's in the response of the caller to pool the buffer, since the 359 // bucket does not know when the caller is done using it. The return bytes will 360 // likely point to the same memory that's part of the buffer. However, if the 361 // buffer is to small, a larger buffer may also be returned (second arg). 362 func (b *Bucket) GetBySecondaryWithBuffer(pos int, key []byte, buf []byte) ([]byte, []byte, error) { 363 bytes, newBuf, err := b.GetBySecondaryIntoMemory(pos, key, buf) 364 return bytes, newBuf, err 365 } 366 367 // GetBySecondaryIntoMemory copies into the specified memory, and retrieves 368 // an object using one of its secondary keys. A bucket 369 // can have an infinite number of secondary keys. Specify the secondary key 370 // position as the first argument. 371 // 372 // A real-life example of secondary keys is the Weaviate object store. 
Objects 373 // are stored with the user-facing ID as their primary key and with the doc-id 374 // (an ever-increasing uint64) as the secondary key. 375 // 376 // Similar to [Bucket.Get], GetBySecondary is limited to ReplaceStrategy. No 377 // equivalent exists for Set and Map, as those do not support secondary 378 // indexes. 379 func (b *Bucket) GetBySecondaryIntoMemory(pos int, key []byte, buffer []byte) ([]byte, []byte, error) { 380 b.flushLock.RLock() 381 defer b.flushLock.RUnlock() 382 383 v, err := b.active.getBySecondary(pos, key) 384 if err == nil { 385 // item found and no error, return and stop searching, since the strategy 386 // is replace 387 return v, buffer, nil 388 } 389 if errors.Is(err, lsmkv.Deleted) { 390 // deleted in the mem-table (which is always the latest) means we don't 391 // have to check the disk segments, return nil now 392 return nil, buffer, nil 393 } 394 395 if !errors.Is(err, lsmkv.NotFound) { 396 panic("unsupported error in bucket.Get") 397 } 398 399 if b.flushing != nil { 400 v, err := b.flushing.getBySecondary(pos, key) 401 if err == nil { 402 // item found and no error, return and stop searching, since the strategy 403 // is replace 404 return v, buffer, nil 405 } 406 if errors.Is(err, lsmkv.Deleted) { 407 // deleted in the now most recent memtable means we don't have to check 408 // the disk segments, return nil now 409 return nil, buffer, nil 410 } 411 412 if !errors.Is(err, lsmkv.NotFound) { 413 panic("unsupported error in bucket.Get") 414 } 415 } 416 417 return b.disk.getBySecondaryIntoMemory(pos, key, buffer) 418 } 419 420 // SetList returns all Set entries for a given key. 421 // 422 // SetList is specific to the Set Strategy, for Map use [Bucket.MapList], and 423 // for Replace use [Bucket.Get]. 
424 func (b *Bucket) SetList(key []byte) ([][]byte, error) { 425 b.flushLock.RLock() 426 defer b.flushLock.RUnlock() 427 428 var out []value 429 430 v, err := b.disk.getCollection(key) 431 if err != nil { 432 if err != nil && !errors.Is(err, lsmkv.NotFound) { 433 return nil, err 434 } 435 } 436 out = v 437 438 if b.flushing != nil { 439 v, err = b.flushing.getCollection(key) 440 if err != nil { 441 if err != nil && !errors.Is(err, lsmkv.NotFound) { 442 return nil, err 443 } 444 } 445 out = append(out, v...) 446 447 } 448 449 v, err = b.active.getCollection(key) 450 if err != nil { 451 if err != nil && !errors.Is(err, lsmkv.NotFound) { 452 return nil, err 453 } 454 } 455 if len(v) > 0 { 456 // skip the expensive append operation if there was no memtable 457 out = append(out, v...) 458 } 459 460 return newSetDecoder().Do(out), nil 461 } 462 463 // Put creates or replaces a single value for a given key. 464 // 465 // err := bucket.Put([]byte("my_key"), []byte("my_value")) 466 // if err != nil { 467 // /* do something */ 468 // } 469 // 470 // If a bucket has a secondary index configured, you can also specify one or 471 // more secondary keys, like so: 472 // 473 // err := bucket.Put([]byte("my_key"), []byte("my_value"), 474 // WithSecondaryKey(0, []byte("my_alternative_key")), 475 // ) 476 // if err != nil { 477 // /* do something */ 478 // } 479 // 480 // Put is limited to ReplaceStrategy, use [Bucket.SetAdd] for Set or 481 // [Bucket.MapSet] and [Bucket.MapSetMulti]. 482 func (b *Bucket) Put(key, value []byte, opts ...SecondaryKeyOption) error { 483 b.flushLock.RLock() 484 defer b.flushLock.RUnlock() 485 486 return b.active.put(key, value, opts...) 487 } 488 489 // SetAdd adds one or more Set-Entries to a Set for the given key. SetAdd is 490 // entirely agnostic of existing entries, it acts as append-only. This also 491 // makes it agnostic of whether the key already exists or not. 
492 // 493 // Example to add two entries to a set: 494 // 495 // err := bucket.SetAdd([]byte("my_key"), [][]byte{ 496 // []byte("one-set-element"), []byte("another-set-element"), 497 // }) 498 // if err != nil { 499 // /* do something */ 500 // } 501 // 502 // SetAdd is specific to the Set strategy. For Replace, use [Bucket.Put], for 503 // Map use either [Bucket.MapSet] or [Bucket.MapSetMulti]. 504 func (b *Bucket) SetAdd(key []byte, values [][]byte) error { 505 b.flushLock.RLock() 506 defer b.flushLock.RUnlock() 507 508 return b.active.append(key, newSetEncoder().Do(values)) 509 } 510 511 // SetDeleteSingle removes one Set element from the given key. Note that LSM 512 // stores are append only, thus internally this action appends a tombstone. The 513 // entry will not be removed until a compaction has run, and even then a 514 // compaction does not guarantee the removal of the data right away. This is 515 // because an entry could have been created in an older segment than those 516 // present in the compaction. This can be seen as an implementation detail, 517 // unless the caller expects to free disk space by calling this method. Such 518 // freeing is not guaranteed. 519 // 520 // SetDeleteSingle is specific to the Set Strategy. For Replace, you can use 521 // [Bucket.Delete] to delete the entire row, for Maps use [Bucket.MapDeleteKey] 522 // to delete a single map entry. 
523 func (b *Bucket) SetDeleteSingle(key []byte, valueToDelete []byte) error { 524 b.flushLock.RLock() 525 defer b.flushLock.RUnlock() 526 527 return b.active.append(key, []value{ 528 { 529 value: valueToDelete, 530 tombstone: true, 531 }, 532 }) 533 } 534 535 // WasDeleted determines if an object used to exist in the LSM store 536 // 537 // There are 3 different locations that we need to check for the key 538 // in this order: active memtable, flushing memtable, and disk 539 // segment 540 func (b *Bucket) WasDeleted(key []byte) (bool, error) { 541 if !b.keepTombstones { 542 return false, fmt.Errorf("Bucket requires option `keepTombstones` set to check deleted keys") 543 } 544 545 b.flushLock.RLock() 546 defer b.flushLock.RUnlock() 547 548 _, err := b.active.get(key) 549 switch err { 550 case nil: 551 return false, nil 552 case lsmkv.Deleted: 553 return true, nil 554 case lsmkv.NotFound: 555 // We can still check flushing and disk 556 default: 557 return false, fmt.Errorf("unsupported bucket error: %w", err) 558 } 559 560 if b.flushing != nil { 561 _, err := b.flushing.get(key) 562 switch err { 563 case nil: 564 return false, nil 565 case lsmkv.Deleted: 566 return true, nil 567 case lsmkv.NotFound: 568 // We can still check disk 569 default: 570 return false, fmt.Errorf("unsupported bucket error: %w", err) 571 } 572 } 573 574 _, err = b.disk.get(key) 575 switch err { 576 case nil, lsmkv.NotFound: 577 return false, nil 578 case lsmkv.Deleted: 579 return true, nil 580 default: 581 return false, fmt.Errorf("unsupported bucket error: %w", err) 582 } 583 } 584 585 type MapListOptionConfig struct { 586 acceptDuplicates bool 587 legacyRequireManualSorting bool 588 } 589 590 type MapListOption func(c *MapListOptionConfig) 591 592 func MapListAcceptDuplicates() MapListOption { 593 return func(c *MapListOptionConfig) { 594 c.acceptDuplicates = true 595 } 596 } 597 598 func MapListLegacySortingRequired() MapListOption { 599 return func(c *MapListOptionConfig) { 600 
c.legacyRequireManualSorting = true 601 } 602 } 603 604 // MapList returns all map entries for a given row key. The order of map pairs 605 // has no specific meaning. For efficient merge operations, pair entries are 606 // stored sorted on disk, however that is an implementation detail and not a 607 // caller-facing guarantee. 608 // 609 // MapList is specific to the Map strategy, for Sets use [Bucket.SetList], for 610 // Replace use [Bucket.Get]. 611 func (b *Bucket) MapList(key []byte, cfgs ...MapListOption) ([]MapPair, error) { 612 b.flushLock.RLock() 613 defer b.flushLock.RUnlock() 614 615 c := MapListOptionConfig{} 616 for _, cfg := range cfgs { 617 cfg(&c) 618 } 619 620 segments := [][]MapPair{} 621 // before := time.Now() 622 disk, err := b.disk.getCollectionBySegments(key) 623 if err != nil { 624 if err != nil && !errors.Is(err, lsmkv.NotFound) { 625 return nil, err 626 } 627 } 628 629 for i := range disk { 630 segmentDecoded := make([]MapPair, len(disk[i])) 631 for j, v := range disk[i] { 632 if err := segmentDecoded[j].FromBytes(v.value, false); err != nil { 633 return nil, err 634 } 635 // Read "broken" tombstones with length 12 but a non-tombstone value 636 // Related to Issue #4125 637 // TODO: Remove the extra check, as it may interfere future in-disk format changes 638 segmentDecoded[j].Tombstone = v.tombstone || len(v.value) == 12 639 } 640 segments = append(segments, segmentDecoded) 641 } 642 643 // fmt.Printf("--map-list: get all disk segments took %s\n", time.Since(before)) 644 645 // before = time.Now() 646 // fmt.Printf("--map-list: append all disk segments took %s\n", time.Since(before)) 647 648 if b.flushing != nil { 649 v, err := b.flushing.getMap(key) 650 if err != nil { 651 if err != nil && !errors.Is(err, lsmkv.NotFound) { 652 return nil, err 653 } 654 } 655 656 segments = append(segments, v) 657 } 658 659 // before = time.Now() 660 v, err := b.active.getMap(key) 661 if err != nil { 662 if err != nil && !errors.Is(err, lsmkv.NotFound) { 
663 return nil, err 664 } 665 } 666 segments = append(segments, v) 667 // fmt.Printf("--map-list: get all active segments took %s\n", time.Since(before)) 668 669 // before = time.Now() 670 // defer func() { 671 // fmt.Printf("--map-list: run decoder took %s\n", time.Since(before)) 672 // }() 673 674 if c.legacyRequireManualSorting { 675 // Sort to support segments which were stored in an unsorted fashion 676 for i := range segments { 677 sort.Slice(segments[i], func(a, b int) bool { 678 return bytes.Compare(segments[i][a].Key, segments[i][b].Key) == -1 679 }) 680 } 681 } 682 683 return newSortedMapMerger().do(segments) 684 } 685 686 // MapSet writes one [MapPair] into the map for the given row key. It is 687 // agnostic of whether the row key already exists, as well as agnostic of 688 // whether the map key already exists. In both cases it will create the entry 689 // if it does not exist or override if it does. 690 // 691 // Example to add a new MapPair: 692 // 693 // pair := MapPair{Key: []byte("Jane"), Value: []byte("Backend")} 694 // err := bucket.MapSet([]byte("developers"), pair) 695 // if err != nil { 696 // /* do something */ 697 // } 698 // 699 // MapSet is specific to the Map Strategy, for Replace use [Bucket.Put], and for Set use [Bucket.SetAdd] instead. 700 func (b *Bucket) MapSet(rowKey []byte, kv MapPair) error { 701 b.flushLock.RLock() 702 defer b.flushLock.RUnlock() 703 704 return b.active.appendMapSorted(rowKey, kv) 705 } 706 707 // MapSetMulti is the same as [Bucket.MapSet], except that it takes in multiple 708 // [MapPair] objects at the same time. 709 func (b *Bucket) MapSetMulti(rowKey []byte, kvs []MapPair) error { 710 b.flushLock.RLock() 711 defer b.flushLock.RUnlock() 712 713 for _, kv := range kvs { 714 if err := b.active.appendMapSorted(rowKey, kv); err != nil { 715 return err 716 } 717 } 718 719 return nil 720 } 721 722 // MapDeleteKey removes one key-value pair from the given map row. 
Note that 723 // LSM stores are append only, thus internally this action appends a tombstone. 724 // The entry will not be removed until a compaction has run, and even then a 725 // compaction does not guarantee the removal of the data right away. This is 726 // because an entry could have been created in an older segment than those 727 // present in the compaction. This can be seen as an implementation detail, 728 // unless the caller expects to free disk space by calling this method. Such 729 // freeing is not guaranteed. 730 // 731 // MapDeleteKey is specific to the Map Strategy. For Replace, you can use 732 // [Bucket.Delete] to delete the entire row, for Sets use [Bucket.SetDeleteSingle] to delete a single set element. 733 func (b *Bucket) MapDeleteKey(rowKey, mapKey []byte) error { 734 b.flushLock.RLock() 735 defer b.flushLock.RUnlock() 736 737 pair := MapPair{ 738 Key: mapKey, 739 Tombstone: true, 740 } 741 742 return b.active.appendMapSorted(rowKey, pair) 743 } 744 745 // Delete removes the given row. Note that LSM stores are append only, thus 746 // internally this action appends a tombstone. The entry will not be removed 747 // until a compaction has run, and even then a compaction does not guarantee 748 // the removal of the data right away. This is because an entry could have been 749 // created in an older segment than those present in the compaction. This can 750 // be seen as an implementation detail, unless the caller expects to free disk 751 // space by calling this method. Such freeing is not guaranteed. 752 // 753 // Delete is specific to the Replace Strategy. For Maps, you can use 754 // [Bucket.MapDeleteKey] to delete a single key-value pair, for Sets use 755 // [Bucket.SetDeleteSingle] to delete a single set element. 756 func (b *Bucket) Delete(key []byte, opts ...SecondaryKeyOption) error { 757 b.flushLock.RLock() 758 defer b.flushLock.RUnlock() 759 760 return b.active.setTombstone(key, opts...) 
761 } 762 763 // meant to be called from situations where a lock is already held, does not 764 // lock on its own 765 func (b *Bucket) setNewActiveMemtable() error { 766 path := filepath.Join(b.dir, fmt.Sprintf("segment-%d", time.Now().UnixNano())) 767 768 cl, err := newCommitLogger(path) 769 if err != nil { 770 return errors.Wrap(err, "init commit logger") 771 } 772 773 mt, err := newMemtable(path, b.strategy, b.secondaryIndices, cl, b.metrics) 774 if err != nil { 775 return err 776 } 777 778 b.active = mt 779 return nil 780 } 781 782 func (b *Bucket) Count() int { 783 b.flushLock.RLock() 784 defer b.flushLock.RUnlock() 785 786 if b.strategy != StrategyReplace { 787 panic("Count() called on strategy other than 'replace'") 788 } 789 790 memtableCount := 0 791 if b.flushing == nil { 792 // only consider active 793 memtableCount += b.memtableNetCount(b.active.countStats(), nil) 794 } else { 795 flushingCountStats := b.flushing.countStats() 796 activeCountStats := b.active.countStats() 797 deltaActive := b.memtableNetCount(activeCountStats, flushingCountStats) 798 deltaFlushing := b.memtableNetCount(flushingCountStats, nil) 799 800 memtableCount = deltaActive + deltaFlushing 801 } 802 803 diskCount := b.disk.count() 804 805 if b.monitorCount { 806 b.metrics.ObjectCount(memtableCount + diskCount) 807 } 808 return memtableCount + diskCount 809 } 810 811 // CountAsync ignores the current memtable, that makes it async because it only 812 // reflects what has been already flushed. This in turn makes it very cheap to 813 // call, so it can be used for observability purposes where eventual 814 // consistency on the count is fine, but a large cost is not. 
815 func (b *Bucket) CountAsync() int { 816 return b.disk.count() 817 } 818 819 func (b *Bucket) memtableNetCount(stats *countStats, previousMemtable *countStats) int { 820 netCount := 0 821 822 // TODO: this uses regular get, given that this may be called quite commonly, 823 // we might consider building a pure Exists(), which skips reading the value 824 // and only checks for tombstones, etc. 825 for _, key := range stats.upsertKeys { 826 if !b.existsOnDiskAndPreviousMemtable(previousMemtable, key) { 827 netCount++ 828 } 829 } 830 831 for _, key := range stats.tombstonedKeys { 832 if b.existsOnDiskAndPreviousMemtable(previousMemtable, key) { 833 netCount-- 834 } 835 } 836 837 return netCount 838 } 839 840 func (b *Bucket) existsOnDiskAndPreviousMemtable(previous *countStats, key []byte) bool { 841 v, _ := b.disk.get(key) // current implementation can't error 842 if v == nil { 843 // not on disk, but it could still be in the previous memtable 844 return previous.hasUpsert(key) 845 } 846 847 // it exists on disk ,but it could still have been deleted in the previous memtable 848 return !previous.hasTombstone(key) 849 } 850 851 func (b *Bucket) Shutdown(ctx context.Context) error { 852 if err := b.disk.shutdown(ctx); err != nil { 853 return err 854 } 855 856 if err := b.flushCallbackCtrl.Unregister(ctx); err != nil { 857 return fmt.Errorf("long-running flush in progress: %w", ctx.Err()) 858 } 859 860 b.flushLock.Lock() 861 if err := b.active.flush(); err != nil { 862 return err 863 } 864 b.flushLock.Unlock() 865 866 if b.flushing == nil { 867 // active has flushing, no one else was currently flushing, it's safe to 868 // exit 869 return nil 870 } 871 872 // it seems we still need to wait for someone to finish flushing 873 t := time.NewTicker(50 * time.Millisecond) 874 defer t.Stop() 875 for { 876 select { 877 case <-ctx.Done(): 878 return ctx.Err() 879 case <-t.C: 880 if b.flushing == nil { 881 return nil 882 } 883 } 884 } 885 } 886 887 func (b *Bucket) 
flushAndSwitchIfThresholdsMet(shouldAbort cyclemanager.ShouldAbortCallback) bool { 888 b.flushLock.RLock() 889 commitLogSize := b.active.commitlog.Size() 890 memtableTooLarge := b.active.Size() >= b.memtableThreshold 891 walTooLarge := uint64(commitLogSize) >= b.walThreshold 892 dirtyTooLong := b.active.DirtyDuration() >= b.flushDirtyAfter 893 shouldSwitch := memtableTooLarge || walTooLarge || dirtyTooLong 894 895 // If true, the parent shard has indicated that it has 896 // entered an immutable state. During this time, the 897 // bucket should refrain from flushing until its shard 898 // indicates otherwise 899 if shouldSwitch && b.isReadOnly() { 900 if b.haltedFlushTimer.IntervalElapsed() { 901 b.logger.WithField("action", "lsm_memtable_flush"). 902 WithField("path", b.dir). 903 Warn("flush halted due to shard READONLY status") 904 b.haltedFlushTimer.IncreaseInterval() 905 } 906 907 b.flushLock.RUnlock() 908 return false 909 } 910 911 b.flushLock.RUnlock() 912 if shouldSwitch { 913 b.haltedFlushTimer.Reset() 914 cycleLength := b.active.ActiveDuration() 915 if err := b.FlushAndSwitch(); err != nil { 916 b.logger.WithField("action", "lsm_memtable_flush"). 917 WithField("path", b.dir). 918 WithError(err). 919 Errorf("flush and switch failed") 920 } 921 922 if b.memtableResizer != nil { 923 next, ok := b.memtableResizer.NextTarget(int(b.memtableThreshold), cycleLength) 924 if ok { 925 b.memtableThreshold = uint64(next) 926 } 927 } 928 return true 929 } 930 return false 931 } 932 933 // UpdateStatus is used by the parent shard to communicate to the bucket 934 // when the shard has been set to readonly, or when it is ready for 935 // writes. 
936 func (b *Bucket) UpdateStatus(status storagestate.Status) { 937 b.statusLock.Lock() 938 defer b.statusLock.Unlock() 939 940 b.status = status 941 b.disk.UpdateStatus(status) 942 } 943 944 func (b *Bucket) isReadOnly() bool { 945 b.statusLock.Lock() 946 defer b.statusLock.Unlock() 947 948 return b.status == storagestate.StatusReadOnly 949 } 950 951 // FlushAndSwitch is typically called periodically and does not require manual 952 // calling, but there are some situations where this might be intended, such as 953 // in test scenarios or when a force flush is desired. 954 func (b *Bucket) FlushAndSwitch() error { 955 before := time.Now() 956 957 b.logger.WithField("action", "lsm_memtable_flush_start"). 958 WithField("path", b.dir). 959 Trace("start flush and switch") 960 if err := b.atomicallySwitchMemtable(); err != nil { 961 return fmt.Errorf("switch active memtable: %w", err) 962 } 963 964 if err := b.flushing.flush(); err != nil { 965 return fmt.Errorf("flush: %w", err) 966 } 967 968 if err := b.atomicallyAddDiskSegmentAndRemoveFlushing(); err != nil { 969 return fmt.Errorf("add segment and remove flushing: %w", err) 970 } 971 972 took := time.Since(before) 973 b.logger.WithField("action", "lsm_memtable_flush_complete"). 974 WithField("path", b.dir). 975 Trace("finish flush and switch") 976 977 b.logger.WithField("action", "lsm_memtable_flush_complete"). 978 WithField("path", b.dir). 979 WithField("took", took). 
980 Debugf("flush and switch took %s\n", took) 981 982 return nil 983 } 984 985 func (b *Bucket) atomicallyAddDiskSegmentAndRemoveFlushing() error { 986 b.flushLock.Lock() 987 defer b.flushLock.Unlock() 988 989 path := b.flushing.path 990 if err := b.disk.add(path + ".db"); err != nil { 991 return err 992 } 993 b.flushing = nil 994 995 if b.strategy == StrategyReplace && b.monitorCount { 996 // having just flushed the memtable we now have the most up2date count which 997 // is a good place to update the metric 998 b.metrics.ObjectCount(b.disk.count()) 999 } 1000 1001 return nil 1002 } 1003 1004 func (b *Bucket) atomicallySwitchMemtable() error { 1005 b.flushLock.Lock() 1006 defer b.flushLock.Unlock() 1007 1008 b.flushing = b.active 1009 return b.setNewActiveMemtable() 1010 } 1011 1012 func (b *Bucket) Strategy() string { 1013 return b.strategy 1014 } 1015 1016 func (b *Bucket) DesiredStrategy() string { 1017 return b.desiredStrategy 1018 } 1019 1020 // the WAL uses a buffer and isn't written until the buffer size is crossed or 1021 // this function explicitly called. This allows to avoid unnecessary disk 1022 // writes in larger operations, such as batches. It is sufficient to call write 1023 // on the WAL just once. This does not make a batch atomic, but it guarantees 1024 // that the WAL is written before a successful response is returned to the 1025 // user. 1026 func (b *Bucket) WriteWAL() error { 1027 b.flushLock.RLock() 1028 defer b.flushLock.RUnlock() 1029 1030 return b.active.writeWAL() 1031 }