github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/strategies_map_sorted_merger.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package lsmkv
    13  
    14  import (
    15  	"bytes"
    16  
    17  	"github.com/pkg/errors"
    18  )
    19  
    20  type sortedMapMerger struct {
    21  	input   [][]MapPair
    22  	output  []MapPair
    23  	offsets []int
    24  }
    25  
    26  func newSortedMapMerger() *sortedMapMerger {
    27  	return &sortedMapMerger{}
    28  }
    29  
    30  func (s *sortedMapMerger) do(segments [][]MapPair) ([]MapPair, error) {
    31  	if err := s.init(segments); err != nil {
    32  		return nil, errors.Wrap(err, "init sorted map decoder")
    33  	}
    34  
    35  	i := 0
    36  	for {
    37  		match, ok := s.findSegmentWithLowestKey()
    38  		if !ok {
    39  			break
    40  		}
    41  
    42  		if match.Tombstone {
    43  			// the latest version of this key was a tombstone, so we can ignore it
    44  			continue
    45  		}
    46  
    47  		s.output[i] = match
    48  		i++
    49  	}
    50  
    51  	return s.output[:i], nil
    52  }
    53  
    54  // same as .do() but does not remove the tombstone if the most latest version
    55  // of a key is a tombstone. It can thus also be used in compactions
    56  func (s *sortedMapMerger) doKeepTombstones(segments [][]MapPair) ([]MapPair, error) {
    57  	if err := s.init(segments); err != nil {
    58  		return nil, errors.Wrap(err, "init sorted map decoder")
    59  	}
    60  
    61  	i := 0
    62  	for {
    63  		match, ok := s.findSegmentWithLowestKey()
    64  		if !ok {
    65  			break
    66  		}
    67  
    68  		s.output[i] = match
    69  		i++
    70  	}
    71  
    72  	return s.output[:i], nil
    73  }
    74  
    75  // same as .doKeepTombstone() but requires initialization from the outside and
    76  // can thus reuse state from previous rounds without having to allocate again.
    77  // must be pre-faced by a call of reset()
    78  func (s *sortedMapMerger) doKeepTombstonesReusable() ([]MapPair, error) {
    79  	i := 0
    80  	for {
    81  		match, ok := s.findSegmentWithLowestKey()
    82  		if !ok {
    83  			break
    84  		}
    85  
    86  		s.output[i] = match
    87  		i++
    88  	}
    89  
    90  	return s.output[:i], nil
    91  }
    92  
    93  // init is automatically called by .do() or .doKeepTombstones()
    94  func (s *sortedMapMerger) init(segments [][]MapPair) error {
    95  	s.input = segments
    96  
    97  	// all offset pointers initialized at 0 which is where we want to start
    98  	s.offsets = make([]int, len(segments))
    99  
   100  	// The maximum output is the sum of all the input segments if there are only
   101  	// unique keys and zero tombstones. If there are duplicate keys (i.e.
   102  	// updates) or tombstones, we will slice off some elements of the output
   103  	// later, but this way we can be sure each index will always be initialized
   104  	// correctly
   105  	maxOutput := 0
   106  	for _, seg := range segments {
   107  		maxOutput += len(seg)
   108  	}
   109  	s.output = make([]MapPair, maxOutput)
   110  
   111  	return nil
   112  }
   113  
   114  // reset can be manually called if sharing allocated state is desired, such as
   115  // with .doKeepTombstonesReusable()
   116  func (s *sortedMapMerger) reset(segments [][]MapPair) error {
   117  	s.input = segments
   118  
   119  	if cap(s.offsets) >= len(segments) {
   120  		s.offsets = s.offsets[:len(segments)]
   121  
   122  		// it existed before so we need to reset all offsets to 0
   123  		for i := range s.offsets {
   124  			s.offsets[i] = 0
   125  		}
   126  	} else {
   127  		s.offsets = make([]int, len(segments), int(float64(len(segments))*1.25))
   128  	}
   129  
   130  	// The maximum output is the sum of all the input segments if there are only
   131  	// unique keys and zero tombstones. If there are duplicate keys (i.e.
   132  	// updates) or tombstones, we will slice off some elements of the output
   133  	// later, but this way we can be sure each index will always be initialized
   134  	// correctly
   135  	maxOutput := 0
   136  	for _, seg := range segments {
   137  		maxOutput += len(seg)
   138  	}
   139  
   140  	if cap(s.output) >= maxOutput {
   141  		s.output = s.output[:maxOutput]
   142  		// no need to reset any values as all of them will be overridden anyway
   143  	} else {
   144  		s.output = make([]MapPair, maxOutput, int(float64(maxOutput)*1.25))
   145  	}
   146  
   147  	return nil
   148  }
   149  
   150  func (s *sortedMapMerger) findSegmentWithLowestKey() (MapPair, bool) {
   151  	bestSeg := -1
   152  	bestKey := []byte(nil)
   153  
   154  	for segmentID := 0; segmentID < len(s.input); segmentID++ {
   155  		// check if a segment is already exhausted, then skip
   156  		if s.offsets[segmentID] >= len(s.input[segmentID]) {
   157  			continue
   158  		}
   159  
   160  		currKey := s.input[segmentID][s.offsets[segmentID]].Key
   161  		if bestSeg == -1 {
   162  			// first time we're running, no need to compare, just set to current
   163  			bestSeg = segmentID
   164  			bestKey = currKey
   165  			continue
   166  		}
   167  
   168  		cmp := bytes.Compare(currKey, bestKey)
   169  		if cmp > 0 {
   170  			// the segment we are currently looking at has a higher key than our
   171  			// current best so we can completely ignore it
   172  			continue
   173  		}
   174  
   175  		if cmp < 0 {
   176  			// the segment we are currently looking at is a better match than the
   177  			// previous, this means, we have found a new favorite, but the previous
   178  			// best will still be valid in a future round
   179  			bestSeg = segmentID
   180  			bestKey = currKey
   181  		}
   182  
   183  		if cmp == 0 {
   184  			// this the most interesting case: we are looking at a duplicate key. In
   185  			// this case the rightmost ("latest") segment takes precedence, however,
   186  			// we must make sure that the previous match gets discarded, otherwise we
   187  			// will find it again in the next round.
   188  			//
   189  			// We can simply increase the offset before updating the bestSeg pointer,
   190  			// which means we will never look at this element again
   191  			s.offsets[bestSeg]++
   192  
   193  			// now that the old element is discarded, we can update our pointers
   194  			bestSeg = segmentID
   195  			bestKey = currKey
   196  		}
   197  	}
   198  
   199  	if bestSeg == -1 {
   200  		// we didn't find anything, looks like we have exhausted all segments
   201  		return MapPair{}, false
   202  	}
   203  
   204  	// we can now be sure that bestSeg,bestKey is the latest version of the
   205  	// lowest key, there is only one job left to do: increase the offset, so we
   206  	// never find this segment again
   207  	bestMatch := s.input[bestSeg][s.offsets[bestSeg]]
   208  	s.offsets[bestSeg]++
   209  
   210  	return bestMatch, true
   211  }