github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/bucket.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package lsmkv
    13  
    14  import (
    15  	"bytes"
    16  	"context"
    17  	"fmt"
    18  	"os"
    19  	"path/filepath"
    20  	"sort"
    21  	"sync"
    22  	"time"
    23  
    24  	"github.com/pkg/errors"
    25  	"github.com/prometheus/client_golang/prometheus"
    26  	"github.com/sirupsen/logrus"
    27  	"github.com/weaviate/weaviate/adapters/repos/db/lsmkv/segmentindex"
    28  	"github.com/weaviate/weaviate/entities/cyclemanager"
    29  	"github.com/weaviate/weaviate/entities/interval"
    30  	"github.com/weaviate/weaviate/entities/lsmkv"
    31  	"github.com/weaviate/weaviate/entities/storagestate"
    32  	"github.com/weaviate/weaviate/entities/storobj"
    33  )
    34  
// Bucket is a single LSM store unit. It is composed of an active (writable)
// memtable, an optional read-only memtable that is currently being flushed,
// and a group of immutable disk segments.
type Bucket struct {
	dir      string // bucket's own directory (segments + WAL files)
	rootDir  string // root directory of the owning store
	active   *Memtable
	flushing *Memtable
	disk     *SegmentGroup
	logger   logrus.FieldLogger

	// Lock() means a move from active to flushing is happening, RLock() is
	// normal operation
	flushLock        sync.RWMutex
	haltedFlushTimer *interval.BackoffTimer

	walThreshold      uint64        // max WAL size before a flush is forced
	flushDirtyAfter   time.Duration // max age of dirty memtable before flush
	memtableThreshold uint64        // max memtable size before a flush is forced
	memtableResizer   *memtableSizeAdvisor
	strategy          string
	// Strategy inverted index is supposed to be created with, but existing
	// segment files were created with different one.
	// It can happen when new strategy were introduced to weaviate, but
	// files are already created using old implementation.
	// Example: RoaringSet strategy replaces CollectionSet strategy.
	// Field can be used for migration files of old strategy to newer one.
	desiredStrategy  string
	secondaryIndices uint16

	// Optional to avoid syscalls
	mmapContents bool

	// for backward compatibility
	legacyMapSortingBeforeCompaction bool

	flushCallbackCtrl cyclemanager.CycleCallbackCtrl

	// status is guarded by statusLock; see GetStatus
	status     storagestate.Status
	statusLock sync.RWMutex

	metrics *Metrics

	// all "replace" buckets support counting through net additions, but not all
	// produce a meaningful count. Typically, the only count we're interested in
	// is that of the bucket that holds objects
	monitorCount bool

	pauseTimer *prometheus.Timer // Times the pause

	// Whether tombstones (set/map/replace types) or deletions (roaringset type)
	// should be kept in root segment during compaction process.
	// Since segments are immutable, deletions are added as new entries with
	// tombstones. Tombstones are by default copied to merged segment, as they
	// can refer to keys/values present in previous segments.
	// Those tombstones can be removed entirely when merging with root (1st) segment,
	// due to lack of previous segments, tombstones may relate to.
	// As info about key/value being deleted (based on tombstone presence) may be important
	// for some use cases (e.g. replication needs to know if object(ObjectsBucketLSM) was deleted)
	// keeping tombstones on compaction is optional
	keepTombstones bool

	// Init and use bloom filter for getting key from bucket segments.
	// As some buckets can be accessed only with cursor (see flat index),
	// where bloom filter is not applicable, it can be disabled.
	// ON by default
	useBloomFilter bool

	// Net additions keep track of number of elements stored in bucket (of type replace).
	// As some buckets don't have to provide Count info (see flat index),
	// tracking additions can be disabled.
	// ON by default
	calcCountNetAdditions bool

	forceCompaction bool
}
   108  
// NewBucket initializes a new bucket. It either loads the state from disk if
// it exists, or initializes new state.
//
// You do not need to ever call NewBucket() yourself, if you are using a
// [Store]. In this case the [Store] can manage buckets for you, using methods
// such as CreateOrLoadBucket().
func NewBucket(ctx context.Context, dir, rootDir string, logger logrus.FieldLogger,
	metrics *Metrics, compactionCallbacks, flushCallbacks cyclemanager.CycleCallbackGroup,
	opts ...BucketOption,
) (*Bucket, error) {
	beforeAll := time.Now()
	// defaults below can be overridden by the BucketOptions applied further down
	defaultMemTableThreshold := uint64(10 * 1024 * 1024)
	defaultWalThreshold := uint64(1024 * 1024 * 1024)
	defaultFlushAfterDirty := 60 * time.Second
	defaultStrategy := StrategyReplace

	if err := os.MkdirAll(dir, 0o700); err != nil {
		return nil, err
	}

	b := &Bucket{
		dir:                   dir,
		rootDir:               rootDir,
		memtableThreshold:     defaultMemTableThreshold,
		walThreshold:          defaultWalThreshold,
		flushDirtyAfter:       defaultFlushAfterDirty,
		strategy:              defaultStrategy,
		mmapContents:          true,
		logger:                logger,
		metrics:               metrics,
		useBloomFilter:        true,
		calcCountNetAdditions: true,
		haltedFlushTimer:      interval.NewBackoffTimer(),
	}

	for _, opt := range opts {
		if err := opt(b); err != nil {
			return nil, err
		}
	}

	// a configured resizer supplies the initial memtable size instead of the default
	if b.memtableResizer != nil {
		b.memtableThreshold = uint64(b.memtableResizer.Initial())
	}

	sg, err := newSegmentGroup(logger, metrics, compactionCallbacks,
		sgConfig{
			dir:                   dir,
			strategy:              b.strategy,
			mapRequiresSorting:    b.legacyMapSortingBeforeCompaction,
			monitorCount:          b.monitorCount,
			mmapContents:          b.mmapContents,
			keepTombstones:        b.keepTombstones,
			forceCompaction:       b.forceCompaction,
			useBloomFilter:        b.useBloomFilter,
			calcCountNetAdditions: b.calcCountNetAdditions,
		})
	if err != nil {
		return nil, fmt.Errorf("init disk segments: %w", err)
	}

	// Actual strategy is stored in segment files. In case it is SetCollection,
	// while new implementation uses bitmaps and supposed to be RoaringSet,
	// bucket and segmentgroup strategy is changed back to SetCollection
	// (memtables will be created later on, with already modified strategy)
	// TODO what if only WAL files exists, and there is no segment to get actual strategy?
	if b.strategy == StrategyRoaringSet && len(sg.segments) > 0 &&
		sg.segments[0].strategy == segmentindex.StrategySetCollection {
		b.strategy = StrategySetCollection
		b.desiredStrategy = StrategyRoaringSet
		sg.strategy = StrategySetCollection
	}
	// As of v1.19 property's IndexInterval setting is replaced with
	// IndexFilterable (roaring set) + IndexSearchable (map) and enabled by default.
	// Buckets for text/text[] inverted indexes created before 1.19 have strategy
	// map and name that since 1.19 is used by filterable inverted index.
	// Those buckets (roaring set by configuration, but in fact map) have to be
	// renamed on startup by migrator. Here actual strategy is set based on
	// data found in segment files
	if b.strategy == StrategyRoaringSet && len(sg.segments) > 0 &&
		sg.segments[0].strategy == segmentindex.StrategyMapCollection {
		b.strategy = StrategyMapCollection
		b.desiredStrategy = StrategyRoaringSet
		sg.strategy = StrategyMapCollection
	}

	b.disk = sg

	// replay any WAL files left over from a previous unclean shutdown
	if err := b.mayRecoverFromCommitLogs(ctx); err != nil {
		return nil, err
	}

	err = b.setNewActiveMemtable()
	if err != nil {
		return nil, err
	}

	// register periodic flush checks with the shared cycle manager
	id := "bucket/flush/" + b.dir
	b.flushCallbackCtrl = flushCallbacks.Register(id, b.flushAndSwitchIfThresholdsMet)

	b.metrics.TrackStartupBucket(beforeAll)

	return b, nil
}
   213  
// GetDir returns the bucket's own data directory.
func (b *Bucket) GetDir() string {
	return b.dir
}
   217  
// GetRootDir returns the root directory of the store that owns this bucket.
func (b *Bucket) GetRootDir() string {
	return b.rootDir
}
   221  
// GetStrategy returns the strategy the bucket currently operates with.
func (b *Bucket) GetStrategy() string {
	return b.strategy
}
   225  
// GetDesiredStrategy returns the strategy the bucket was configured with,
// which may differ from the active one when existing segment files were
// created with an older strategy (see Bucket.desiredStrategy).
func (b *Bucket) GetDesiredStrategy() string {
	return b.desiredStrategy
}
   229  
// GetSecondaryIndices returns the number of secondary indexes configured for
// this bucket.
func (b *Bucket) GetSecondaryIndices() uint16 {
	return b.secondaryIndices
}
   233  
// GetStatus returns the bucket's current storage status under a read lock.
func (b *Bucket) GetStatus() storagestate.Status {
	b.statusLock.RLock()
	defer b.statusLock.RUnlock()

	return b.status
}
   240  
// GetMemtableThreshold returns the memtable size (in bytes) at which a flush
// is triggered.
func (b *Bucket) GetMemtableThreshold() uint64 {
	return b.memtableThreshold
}
   244  
// GetWalThreshold returns the WAL size (in bytes) at which a flush is
// triggered.
func (b *Bucket) GetWalThreshold() uint64 {
	return b.walThreshold
}
   248  
// GetFlushCallbackCtrl returns the control handle for the periodic flush
// callback registered in NewBucket.
func (b *Bucket) GetFlushCallbackCtrl() cyclemanager.CycleCallbackCtrl {
	return b.flushCallbackCtrl
}
   252  
   253  func (b *Bucket) IterateObjects(ctx context.Context, f func(object *storobj.Object) error) error {
   254  	i := 0
   255  	cursor := b.Cursor()
   256  	defer cursor.Close()
   257  
   258  	for k, v := cursor.First(); k != nil; k, v = cursor.Next() {
   259  		obj, err := storobj.FromBinary(v)
   260  		if err != nil {
   261  			return fmt.Errorf("cannot unmarshal object %d, %v", i, err)
   262  		}
   263  		if err := f(obj); err != nil {
   264  			return fmt.Errorf("callback on object '%d' failed: %w", obj.DocID, err)
   265  		}
   266  
   267  		i++
   268  	}
   269  
   270  	return nil
   271  }
   272  
   273  func (b *Bucket) IterateMapObjects(ctx context.Context, f func([]byte, []byte, []byte, bool) error) error {
   274  	cursor := b.MapCursor()
   275  	defer cursor.Close()
   276  
   277  	for kList, vList := cursor.First(); kList != nil; kList, vList = cursor.Next() {
   278  		for _, v := range vList {
   279  			if err := f(kList, v.Key, v.Value, v.Tombstone); err != nil {
   280  				return fmt.Errorf("callback on object '%v' failed: %w", v, err)
   281  			}
   282  		}
   283  	}
   284  
   285  	return nil
   286  }
   287  
// SetMemtableThreshold overrides the memtable size (in bytes) at which a
// flush is triggered.
//
// NOTE(review): this write is not guarded by flushLock or any other lock —
// confirm that callers serialize it against concurrent flush-threshold reads.
func (b *Bucket) SetMemtableThreshold(size uint64) {
	b.memtableThreshold = size
}
   291  
   292  // Get retrieves the single value for the given key.
   293  //
   294  // Get is specific to ReplaceStrategy and cannot be used with any of the other
   295  // strategies. Use [Bucket.SetList] or [Bucket.MapList] instead.
   296  //
   297  // Get uses the regular or "primary" key for an object. If a bucket has
   298  // secondary indexes, use [Bucket.GetBySecondary] to retrieve an object using
   299  // its secondary key
   300  func (b *Bucket) Get(key []byte) ([]byte, error) {
   301  	b.flushLock.RLock()
   302  	defer b.flushLock.RUnlock()
   303  
   304  	v, err := b.active.get(key)
   305  	if err == nil {
   306  		// item found and no error, return and stop searching, since the strategy
   307  		// is replace
   308  		return v, nil
   309  	}
   310  	if errors.Is(err, lsmkv.Deleted) {
   311  		// deleted in the mem-table (which is always the latest) means we don't
   312  		// have to check the disk segments, return nil now
   313  		return nil, nil
   314  	}
   315  
   316  	if !errors.Is(err, lsmkv.NotFound) {
   317  		panic(fmt.Sprintf("unsupported error in bucket.Get: %v\n", err))
   318  	}
   319  
   320  	if b.flushing != nil {
   321  		v, err := b.flushing.get(key)
   322  		if err == nil {
   323  			// item found and no error, return and stop searching, since the strategy
   324  			// is replace
   325  			return v, nil
   326  		}
   327  		if errors.Is(err, lsmkv.Deleted) {
   328  			// deleted in the now most recent memtable  means we don't have to check
   329  			// the disk segments, return nil now
   330  			return nil, nil
   331  		}
   332  
   333  		if !errors.Is(err, lsmkv.NotFound) {
   334  			panic("unsupported error in bucket.Get")
   335  		}
   336  	}
   337  
   338  	return b.disk.get(key)
   339  }
   340  
   341  // GetBySecondary retrieves an object using one of its secondary keys. A bucket
   342  // can have an infinite number of secondary keys. Specify the secondary key
   343  // position as the first argument.
   344  //
   345  // A real-life example of secondary keys is the Weaviate object store. Objects
   346  // are stored with the user-facing ID as their primary key and with the doc-id
   347  // (an ever-increasing uint64) as the secondary key.
   348  //
   349  // Similar to [Bucket.Get], GetBySecondary is limited to ReplaceStrategy. No
   350  // equivalent exists for Set and Map, as those do not support secondary
   351  // indexes.
   352  func (b *Bucket) GetBySecondary(pos int, key []byte) ([]byte, error) {
   353  	bytes, _, err := b.GetBySecondaryIntoMemory(pos, key, nil)
   354  	return bytes, err
   355  }
   356  
   357  // GetBySecondaryWithBuffer is like [Bucket.GetBySecondary], but also takes a
   358  // buffer. It's in the response of the caller to pool the buffer, since the
   359  // bucket does not know when the caller is done using it. The return bytes will
   360  // likely point to the same memory that's part of the buffer. However, if the
   361  // buffer is to small, a larger buffer may also be returned (second arg).
   362  func (b *Bucket) GetBySecondaryWithBuffer(pos int, key []byte, buf []byte) ([]byte, []byte, error) {
   363  	bytes, newBuf, err := b.GetBySecondaryIntoMemory(pos, key, buf)
   364  	return bytes, newBuf, err
   365  }
   366  
   367  // GetBySecondaryIntoMemory copies into the specified memory, and retrieves
   368  // an object using one of its secondary keys. A bucket
   369  // can have an infinite number of secondary keys. Specify the secondary key
   370  // position as the first argument.
   371  //
   372  // A real-life example of secondary keys is the Weaviate object store. Objects
   373  // are stored with the user-facing ID as their primary key and with the doc-id
   374  // (an ever-increasing uint64) as the secondary key.
   375  //
   376  // Similar to [Bucket.Get], GetBySecondary is limited to ReplaceStrategy. No
   377  // equivalent exists for Set and Map, as those do not support secondary
   378  // indexes.
   379  func (b *Bucket) GetBySecondaryIntoMemory(pos int, key []byte, buffer []byte) ([]byte, []byte, error) {
   380  	b.flushLock.RLock()
   381  	defer b.flushLock.RUnlock()
   382  
   383  	v, err := b.active.getBySecondary(pos, key)
   384  	if err == nil {
   385  		// item found and no error, return and stop searching, since the strategy
   386  		// is replace
   387  		return v, buffer, nil
   388  	}
   389  	if errors.Is(err, lsmkv.Deleted) {
   390  		// deleted in the mem-table (which is always the latest) means we don't
   391  		// have to check the disk segments, return nil now
   392  		return nil, buffer, nil
   393  	}
   394  
   395  	if !errors.Is(err, lsmkv.NotFound) {
   396  		panic("unsupported error in bucket.Get")
   397  	}
   398  
   399  	if b.flushing != nil {
   400  		v, err := b.flushing.getBySecondary(pos, key)
   401  		if err == nil {
   402  			// item found and no error, return and stop searching, since the strategy
   403  			// is replace
   404  			return v, buffer, nil
   405  		}
   406  		if errors.Is(err, lsmkv.Deleted) {
   407  			// deleted in the now most recent memtable  means we don't have to check
   408  			// the disk segments, return nil now
   409  			return nil, buffer, nil
   410  		}
   411  
   412  		if !errors.Is(err, lsmkv.NotFound) {
   413  			panic("unsupported error in bucket.Get")
   414  		}
   415  	}
   416  
   417  	return b.disk.getBySecondaryIntoMemory(pos, key, buffer)
   418  }
   419  
   420  // SetList returns all Set entries for a given key.
   421  //
   422  // SetList is specific to the Set Strategy, for Map use [Bucket.MapList], and
   423  // for Replace use [Bucket.Get].
   424  func (b *Bucket) SetList(key []byte) ([][]byte, error) {
   425  	b.flushLock.RLock()
   426  	defer b.flushLock.RUnlock()
   427  
   428  	var out []value
   429  
   430  	v, err := b.disk.getCollection(key)
   431  	if err != nil {
   432  		if err != nil && !errors.Is(err, lsmkv.NotFound) {
   433  			return nil, err
   434  		}
   435  	}
   436  	out = v
   437  
   438  	if b.flushing != nil {
   439  		v, err = b.flushing.getCollection(key)
   440  		if err != nil {
   441  			if err != nil && !errors.Is(err, lsmkv.NotFound) {
   442  				return nil, err
   443  			}
   444  		}
   445  		out = append(out, v...)
   446  
   447  	}
   448  
   449  	v, err = b.active.getCollection(key)
   450  	if err != nil {
   451  		if err != nil && !errors.Is(err, lsmkv.NotFound) {
   452  			return nil, err
   453  		}
   454  	}
   455  	if len(v) > 0 {
   456  		// skip the expensive append operation if there was no memtable
   457  		out = append(out, v...)
   458  	}
   459  
   460  	return newSetDecoder().Do(out), nil
   461  }
   462  
// Put creates or replaces a single value for a given key.
//
//	err := bucket.Put([]byte("my_key"), []byte("my_value"))
//	 if err != nil {
//		/* do something */
//	}
//
// If a bucket has a secondary index configured, you can also specify one or
// more secondary keys, like so:
//
//	err := bucket.Put([]byte("my_key"), []byte("my_value"),
//		WithSecondaryKey(0, []byte("my_alternative_key")),
//	)
//	 if err != nil {
//		/* do something */
//	}
//
// Put is limited to ReplaceStrategy, use [Bucket.SetAdd] for Set or
// [Bucket.MapSet] and [Bucket.MapSetMulti].
func (b *Bucket) Put(key, value []byte, opts ...SecondaryKeyOption) error {
	// read lock: writes go to the active memtable; only a memtable switch
	// takes the write lock
	b.flushLock.RLock()
	defer b.flushLock.RUnlock()

	return b.active.put(key, value, opts...)
}
   488  
// SetAdd adds one or more Set-Entries to a Set for the given key. SetAdd is
// entirely agnostic of existing entries, it acts as append-only. This also
// makes it agnostic of whether the key already exists or not.
//
// Example to add two entries to a set:
//
//	err := bucket.SetAdd([]byte("my_key"), [][]byte{
//		[]byte("one-set-element"), []byte("another-set-element"),
//	})
//	if err != nil {
//		/* do something */
//	}
//
// SetAdd is specific to the Set strategy. For Replace, use [Bucket.Put], for
// Map use either [Bucket.MapSet] or [Bucket.MapSetMulti].
func (b *Bucket) SetAdd(key []byte, values [][]byte) error {
	b.flushLock.RLock()
	defer b.flushLock.RUnlock()

	// encode the raw values into set entries before appending to the memtable
	return b.active.append(key, newSetEncoder().Do(values))
}
   510  
   511  // SetDeleteSingle removes one Set element from the given key. Note that LSM
   512  // stores are append only, thus internally this action appends a tombstone. The
   513  // entry will not be removed until a compaction has run, and even then a
   514  // compaction does not guarantee the removal of the data right away. This is
   515  // because an entry could have been created in an older segment than those
   516  // present in the compaction. This can be seen as an implementation detail,
   517  // unless the caller expects to free disk space by calling this method. Such
   518  // freeing is not guaranteed.
   519  //
   520  // SetDeleteSingle is specific to the Set Strategy. For Replace, you can use
   521  // [Bucket.Delete] to delete the entire row, for Maps use [Bucket.MapDeleteKey]
   522  // to delete a single map entry.
   523  func (b *Bucket) SetDeleteSingle(key []byte, valueToDelete []byte) error {
   524  	b.flushLock.RLock()
   525  	defer b.flushLock.RUnlock()
   526  
   527  	return b.active.append(key, []value{
   528  		{
   529  			value:     valueToDelete,
   530  			tombstone: true,
   531  		},
   532  	})
   533  }
   534  
   535  // WasDeleted determines if an object used to exist in the LSM store
   536  //
   537  // There are 3 different locations that we need to check for the key
   538  // in this order: active memtable, flushing memtable, and disk
   539  // segment
   540  func (b *Bucket) WasDeleted(key []byte) (bool, error) {
   541  	if !b.keepTombstones {
   542  		return false, fmt.Errorf("Bucket requires option `keepTombstones` set to check deleted keys")
   543  	}
   544  
   545  	b.flushLock.RLock()
   546  	defer b.flushLock.RUnlock()
   547  
   548  	_, err := b.active.get(key)
   549  	switch err {
   550  	case nil:
   551  		return false, nil
   552  	case lsmkv.Deleted:
   553  		return true, nil
   554  	case lsmkv.NotFound:
   555  		// We can still check flushing and disk
   556  	default:
   557  		return false, fmt.Errorf("unsupported bucket error: %w", err)
   558  	}
   559  
   560  	if b.flushing != nil {
   561  		_, err := b.flushing.get(key)
   562  		switch err {
   563  		case nil:
   564  			return false, nil
   565  		case lsmkv.Deleted:
   566  			return true, nil
   567  		case lsmkv.NotFound:
   568  			// We can still check disk
   569  		default:
   570  			return false, fmt.Errorf("unsupported bucket error: %w", err)
   571  		}
   572  	}
   573  
   574  	_, err = b.disk.get(key)
   575  	switch err {
   576  	case nil, lsmkv.NotFound:
   577  		return false, nil
   578  	case lsmkv.Deleted:
   579  		return true, nil
   580  	default:
   581  		return false, fmt.Errorf("unsupported bucket error: %w", err)
   582  	}
   583  }
   584  
// MapListOptionConfig collects the toggles that [MapListOption] functions can
// set for a [Bucket.MapList] call.
type MapListOptionConfig struct {
	// keep duplicate map pairs instead of de-duplicating them
	acceptDuplicates bool
	// sort each segment's pairs manually (for segments written unsorted by
	// legacy versions)
	legacyRequireManualSorting bool
}

// MapListOption mutates a MapListOptionConfig; pass one or more to
// [Bucket.MapList].
type MapListOption func(c *MapListOptionConfig)
   591  
// MapListAcceptDuplicates returns an option that makes [Bucket.MapList] keep
// duplicate entries.
func MapListAcceptDuplicates() MapListOption {
	return func(c *MapListOptionConfig) {
		c.acceptDuplicates = true
	}
}
   597  
// MapListLegacySortingRequired returns an option that makes [Bucket.MapList]
// sort each segment's pairs manually, supporting segments written unsorted by
// legacy versions.
func MapListLegacySortingRequired() MapListOption {
	return func(c *MapListOptionConfig) {
		c.legacyRequireManualSorting = true
	}
}
   603  
   604  // MapList returns all map entries for a given row key. The order of map pairs
   605  // has no specific meaning. For efficient merge operations, pair entries are
   606  // stored sorted on disk, however that is an implementation detail and not a
   607  // caller-facing guarantee.
   608  //
   609  // MapList is specific to the Map strategy, for Sets use [Bucket.SetList], for
   610  // Replace use [Bucket.Get].
   611  func (b *Bucket) MapList(key []byte, cfgs ...MapListOption) ([]MapPair, error) {
   612  	b.flushLock.RLock()
   613  	defer b.flushLock.RUnlock()
   614  
   615  	c := MapListOptionConfig{}
   616  	for _, cfg := range cfgs {
   617  		cfg(&c)
   618  	}
   619  
   620  	segments := [][]MapPair{}
   621  	// before := time.Now()
   622  	disk, err := b.disk.getCollectionBySegments(key)
   623  	if err != nil {
   624  		if err != nil && !errors.Is(err, lsmkv.NotFound) {
   625  			return nil, err
   626  		}
   627  	}
   628  
   629  	for i := range disk {
   630  		segmentDecoded := make([]MapPair, len(disk[i]))
   631  		for j, v := range disk[i] {
   632  			if err := segmentDecoded[j].FromBytes(v.value, false); err != nil {
   633  				return nil, err
   634  			}
   635  			// Read "broken" tombstones with length 12 but a non-tombstone value
   636  			// Related to Issue #4125
   637  			// TODO: Remove the extra check, as it may interfere future in-disk format changes
   638  			segmentDecoded[j].Tombstone = v.tombstone || len(v.value) == 12
   639  		}
   640  		segments = append(segments, segmentDecoded)
   641  	}
   642  
   643  	// fmt.Printf("--map-list: get all disk segments took %s\n", time.Since(before))
   644  
   645  	// before = time.Now()
   646  	// fmt.Printf("--map-list: append all disk segments took %s\n", time.Since(before))
   647  
   648  	if b.flushing != nil {
   649  		v, err := b.flushing.getMap(key)
   650  		if err != nil {
   651  			if err != nil && !errors.Is(err, lsmkv.NotFound) {
   652  				return nil, err
   653  			}
   654  		}
   655  
   656  		segments = append(segments, v)
   657  	}
   658  
   659  	// before = time.Now()
   660  	v, err := b.active.getMap(key)
   661  	if err != nil {
   662  		if err != nil && !errors.Is(err, lsmkv.NotFound) {
   663  			return nil, err
   664  		}
   665  	}
   666  	segments = append(segments, v)
   667  	// fmt.Printf("--map-list: get all active segments took %s\n", time.Since(before))
   668  
   669  	// before = time.Now()
   670  	// defer func() {
   671  	// 	fmt.Printf("--map-list: run decoder took %s\n", time.Since(before))
   672  	// }()
   673  
   674  	if c.legacyRequireManualSorting {
   675  		// Sort to support segments which were stored in an unsorted fashion
   676  		for i := range segments {
   677  			sort.Slice(segments[i], func(a, b int) bool {
   678  				return bytes.Compare(segments[i][a].Key, segments[i][b].Key) == -1
   679  			})
   680  		}
   681  	}
   682  
   683  	return newSortedMapMerger().do(segments)
   684  }
   685  
// MapSet writes one [MapPair] into the map for the given row key. It is
// agnostic of whether the row key already exists, as well as agnostic of
// whether the map key already exists. In both cases it will create the entry
// if it does not exist or override if it does.
//
// Example to add a new MapPair:
//
//	pair := MapPair{Key: []byte("Jane"), Value: []byte("Backend")}
//	err := bucket.MapSet([]byte("developers"), pair)
//	if err != nil {
//		/* do something */
//	}
//
// MapSet is specific to the Map Strategy, for Replace use [Bucket.Put], and for Set use [Bucket.SetAdd] instead.
func (b *Bucket) MapSet(rowKey []byte, kv MapPair) error {
	b.flushLock.RLock()
	defer b.flushLock.RUnlock()

	// the memtable keeps map pairs sorted on insert
	return b.active.appendMapSorted(rowKey, kv)
}
   706  
   707  // MapSetMulti is the same as [Bucket.MapSet], except that it takes in multiple
   708  // [MapPair] objects at the same time.
   709  func (b *Bucket) MapSetMulti(rowKey []byte, kvs []MapPair) error {
   710  	b.flushLock.RLock()
   711  	defer b.flushLock.RUnlock()
   712  
   713  	for _, kv := range kvs {
   714  		if err := b.active.appendMapSorted(rowKey, kv); err != nil {
   715  			return err
   716  		}
   717  	}
   718  
   719  	return nil
   720  }
   721  
   722  // MapDeleteKey removes one key-value pair from the given map row. Note that
   723  // LSM stores are append only, thus internally this action appends a tombstone.
   724  // The entry will not be removed until a compaction has run, and even then a
   725  // compaction does not guarantee the removal of the data right away. This is
   726  // because an entry could have been created in an older segment than those
   727  // present in the compaction. This can be seen as an implementation detail,
   728  // unless the caller expects to free disk space by calling this method. Such
   729  // freeing is not guaranteed.
   730  //
   731  // MapDeleteKey is specific to the Map Strategy. For Replace, you can use
   732  // [Bucket.Delete] to delete the entire row, for Sets use [Bucket.SetDeleteSingle] to delete a single set element.
   733  func (b *Bucket) MapDeleteKey(rowKey, mapKey []byte) error {
   734  	b.flushLock.RLock()
   735  	defer b.flushLock.RUnlock()
   736  
   737  	pair := MapPair{
   738  		Key:       mapKey,
   739  		Tombstone: true,
   740  	}
   741  
   742  	return b.active.appendMapSorted(rowKey, pair)
   743  }
   744  
// Delete removes the given row. Note that LSM stores are append only, thus
// internally this action appends a tombstone.  The entry will not be removed
// until a compaction has run, and even then a compaction does not guarantee
// the removal of the data right away. This is because an entry could have been
// created in an older segment than those present in the compaction. This can
// be seen as an implementation detail, unless the caller expects to free disk
// space by calling this method. Such freeing is not guaranteed.
//
// Delete is specific to the Replace Strategy. For Maps, you can use
// [Bucket.MapDeleteKey] to delete a single key-value pair, for Sets use
// [Bucket.SetDeleteSingle] to delete a single set element.
func (b *Bucket) Delete(key []byte, opts ...SecondaryKeyOption) error {
	b.flushLock.RLock()
	defer b.flushLock.RUnlock()

	// the tombstone is written to the active memtable like any other entry
	return b.active.setTombstone(key, opts...)
}
   762  
   763  // meant to be called from situations where a lock is already held, does not
   764  // lock on its own
   765  func (b *Bucket) setNewActiveMemtable() error {
   766  	path := filepath.Join(b.dir, fmt.Sprintf("segment-%d", time.Now().UnixNano()))
   767  
   768  	cl, err := newCommitLogger(path)
   769  	if err != nil {
   770  		return errors.Wrap(err, "init commit logger")
   771  	}
   772  
   773  	mt, err := newMemtable(path, b.strategy, b.secondaryIndices, cl, b.metrics)
   774  	if err != nil {
   775  		return err
   776  	}
   777  
   778  	b.active = mt
   779  	return nil
   780  }
   781  
// Count returns the number of net entries in the bucket: disk-segment count
// plus the net additions/deletions of the memtable(s). Only valid for the
// "replace" strategy; any other strategy panics.
func (b *Bucket) Count() int {
	b.flushLock.RLock()
	defer b.flushLock.RUnlock()

	if b.strategy != StrategyReplace {
		panic("Count() called on strategy other than 'replace'")
	}

	memtableCount := 0
	if b.flushing == nil {
		// only consider active
		memtableCount += b.memtableNetCount(b.active.countStats(), nil)
	} else {
		// the active memtable's net count must be computed relative to the
		// flushing memtable, so keys present in both are not counted twice
		flushingCountStats := b.flushing.countStats()
		activeCountStats := b.active.countStats()
		deltaActive := b.memtableNetCount(activeCountStats, flushingCountStats)
		deltaFlushing := b.memtableNetCount(flushingCountStats, nil)

		memtableCount = deltaActive + deltaFlushing
	}

	diskCount := b.disk.count()

	if b.monitorCount {
		b.metrics.ObjectCount(memtableCount + diskCount)
	}
	return memtableCount + diskCount
}
   810  
// CountAsync ignores the current memtable, that makes it async because it only
// reflects what has been already flushed. This in turn makes it very cheap to
// call, so it can be used for observability purposes where eventual
// consistency on the count is fine, but a large cost is not.
func (b *Bucket) CountAsync() int {
	// disk-only: no flushLock needed, no memtable scan
	return b.disk.count()
}
   818  
   819  func (b *Bucket) memtableNetCount(stats *countStats, previousMemtable *countStats) int {
   820  	netCount := 0
   821  
   822  	// TODO: this uses regular get, given that this may be called quite commonly,
   823  	// we might consider building a pure Exists(), which skips reading the value
   824  	// and only checks for tombstones, etc.
   825  	for _, key := range stats.upsertKeys {
   826  		if !b.existsOnDiskAndPreviousMemtable(previousMemtable, key) {
   827  			netCount++
   828  		}
   829  	}
   830  
   831  	for _, key := range stats.tombstonedKeys {
   832  		if b.existsOnDiskAndPreviousMemtable(previousMemtable, key) {
   833  			netCount--
   834  		}
   835  	}
   836  
   837  	return netCount
   838  }
   839  
   840  func (b *Bucket) existsOnDiskAndPreviousMemtable(previous *countStats, key []byte) bool {
   841  	v, _ := b.disk.get(key) // current implementation can't error
   842  	if v == nil {
   843  		// not on disk, but it could still be in the previous memtable
   844  		return previous.hasUpsert(key)
   845  	}
   846  
   847  	// it exists on disk ,but it could still have been deleted in the previous memtable
   848  	return !previous.hasTombstone(key)
   849  }
   850  
   851  func (b *Bucket) Shutdown(ctx context.Context) error {
   852  	if err := b.disk.shutdown(ctx); err != nil {
   853  		return err
   854  	}
   855  
   856  	if err := b.flushCallbackCtrl.Unregister(ctx); err != nil {
   857  		return fmt.Errorf("long-running flush in progress: %w", ctx.Err())
   858  	}
   859  
   860  	b.flushLock.Lock()
   861  	if err := b.active.flush(); err != nil {
   862  		return err
   863  	}
   864  	b.flushLock.Unlock()
   865  
   866  	if b.flushing == nil {
   867  		// active has flushing, no one else was currently flushing, it's safe to
   868  		// exit
   869  		return nil
   870  	}
   871  
   872  	// it seems we still need to wait for someone to finish flushing
   873  	t := time.NewTicker(50 * time.Millisecond)
   874  	defer t.Stop()
   875  	for {
   876  		select {
   877  		case <-ctx.Done():
   878  			return ctx.Err()
   879  		case <-t.C:
   880  			if b.flushing == nil {
   881  				return nil
   882  			}
   883  		}
   884  	}
   885  }
   886  
// flushAndSwitchIfThresholdsMet triggers FlushAndSwitch when any of the
// configured thresholds is exceeded: memtable size, commit-log (WAL) size, or
// how long the memtable has been dirty. It returns true if a flush cycle ran.
// NOTE(review): the shouldAbort callback is not consulted in this body —
// presumably handled by the cycle manager; confirm against the caller.
func (b *Bucket) flushAndSwitchIfThresholdsMet(shouldAbort cyclemanager.ShouldAbortCallback) bool {
	b.flushLock.RLock()
	// evaluate all three flush triggers under the read lock
	commitLogSize := b.active.commitlog.Size()
	memtableTooLarge := b.active.Size() >= b.memtableThreshold
	walTooLarge := uint64(commitLogSize) >= b.walThreshold
	dirtyTooLong := b.active.DirtyDuration() >= b.flushDirtyAfter
	shouldSwitch := memtableTooLarge || walTooLarge || dirtyTooLong

	// If true, the parent shard has indicated that it has
	// entered an immutable state. During this time, the
	// bucket should refrain from flushing until its shard
	// indicates otherwise
	if shouldSwitch && b.isReadOnly() {
		// the backoff timer rate-limits the warning so a long READONLY phase
		// does not spam the log
		if b.haltedFlushTimer.IntervalElapsed() {
			b.logger.WithField("action", "lsm_memtable_flush").
				WithField("path", b.dir).
				Warn("flush halted due to shard READONLY status")
			b.haltedFlushTimer.IncreaseInterval()
		}

		b.flushLock.RUnlock()
		return false
	}

	// the read lock must be released before FlushAndSwitch, which acquires
	// the write lock internally
	b.flushLock.RUnlock()
	if shouldSwitch {
		b.haltedFlushTimer.Reset()
		// capture how long the memtable was active before the switch replaces
		// it, so the resizer can adapt the next threshold
		cycleLength := b.active.ActiveDuration()
		if err := b.FlushAndSwitch(); err != nil {
			b.logger.WithField("action", "lsm_memtable_flush").
				WithField("path", b.dir).
				WithError(err).
				Errorf("flush and switch failed")
		}

		if b.memtableResizer != nil {
			next, ok := b.memtableResizer.NextTarget(int(b.memtableThreshold), cycleLength)
			if ok {
				b.memtableThreshold = uint64(next)
			}
		}
		return true
	}
	return false
}
   932  
   933  // UpdateStatus is used by the parent shard to communicate to the bucket
   934  // when the shard has been set to readonly, or when it is ready for
   935  // writes.
   936  func (b *Bucket) UpdateStatus(status storagestate.Status) {
   937  	b.statusLock.Lock()
   938  	defer b.statusLock.Unlock()
   939  
   940  	b.status = status
   941  	b.disk.UpdateStatus(status)
   942  }
   943  
   944  func (b *Bucket) isReadOnly() bool {
   945  	b.statusLock.Lock()
   946  	defer b.statusLock.Unlock()
   947  
   948  	return b.status == storagestate.StatusReadOnly
   949  }
   950  
   951  // FlushAndSwitch is typically called periodically and does not require manual
   952  // calling, but there are some situations where this might be intended, such as
   953  // in test scenarios or when a force flush is desired.
   954  func (b *Bucket) FlushAndSwitch() error {
   955  	before := time.Now()
   956  
   957  	b.logger.WithField("action", "lsm_memtable_flush_start").
   958  		WithField("path", b.dir).
   959  		Trace("start flush and switch")
   960  	if err := b.atomicallySwitchMemtable(); err != nil {
   961  		return fmt.Errorf("switch active memtable: %w", err)
   962  	}
   963  
   964  	if err := b.flushing.flush(); err != nil {
   965  		return fmt.Errorf("flush: %w", err)
   966  	}
   967  
   968  	if err := b.atomicallyAddDiskSegmentAndRemoveFlushing(); err != nil {
   969  		return fmt.Errorf("add segment and remove flushing: %w", err)
   970  	}
   971  
   972  	took := time.Since(before)
   973  	b.logger.WithField("action", "lsm_memtable_flush_complete").
   974  		WithField("path", b.dir).
   975  		Trace("finish flush and switch")
   976  
   977  	b.logger.WithField("action", "lsm_memtable_flush_complete").
   978  		WithField("path", b.dir).
   979  		WithField("took", took).
   980  		Debugf("flush and switch took %s\n", took)
   981  
   982  	return nil
   983  }
   984  
   985  func (b *Bucket) atomicallyAddDiskSegmentAndRemoveFlushing() error {
   986  	b.flushLock.Lock()
   987  	defer b.flushLock.Unlock()
   988  
   989  	path := b.flushing.path
   990  	if err := b.disk.add(path + ".db"); err != nil {
   991  		return err
   992  	}
   993  	b.flushing = nil
   994  
   995  	if b.strategy == StrategyReplace && b.monitorCount {
   996  		// having just flushed the memtable we now have the most up2date count which
   997  		// is a good place to update the metric
   998  		b.metrics.ObjectCount(b.disk.count())
   999  	}
  1000  
  1001  	return nil
  1002  }
  1003  
// atomicallySwitchMemtable moves the current active memtable into the
// flushing slot and installs a fresh active memtable, all under the flush
// write-lock so no reader ever observes an inconsistent pair.
func (b *Bucket) atomicallySwitchMemtable() error {
	b.flushLock.Lock()
	defer b.flushLock.Unlock()

	b.flushing = b.active
	return b.setNewActiveMemtable()
}
  1011  
// Strategy returns the strategy the bucket currently operates with.
func (b *Bucket) Strategy() string {
	return b.strategy
}
  1015  
// DesiredStrategy returns the strategy the bucket is supposed to be created
// with; it can differ from Strategy() when existing segment files were
// created with a different strategy (see the Bucket field comments).
func (b *Bucket) DesiredStrategy() string {
	return b.desiredStrategy
}
  1019  
// WriteWAL forces the active memtable's write-ahead log to disk.
//
// The WAL uses a buffer and isn't written until the buffer size is crossed or
// this function explicitly called. This allows to avoid unnecessary disk
// writes in larger operations, such as batches. It is sufficient to call write
// on the WAL just once. This does not make a batch atomic, but it guarantees
// that the WAL is written before a successful response is returned to the
// user.
func (b *Bucket) WriteWAL() error {
	// read lock: writing the WAL does not switch memtables, so concurrent
	// readers are fine, but a flush switch must not happen mid-write
	b.flushLock.RLock()
	defer b.flushLock.RUnlock()

	return b.active.writeWAL()
}