github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/strategies_map.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package lsmkv
    13  
    14  import (
    15  	"bytes"
    16  	"encoding/binary"
    17  	"math"
    18  
    19  	"github.com/pkg/errors"
    20  )
    21  
    22  type mapDecoder struct{}
    23  
    24  func newMapDecoder() *mapDecoder {
    25  	return &mapDecoder{}
    26  }
    27  
    28  func (m *mapDecoder) Do(in []value, acceptDuplicates bool) ([]MapPair, error) {
    29  	// if acceptDuplicates {
    30  	// 	return m.doSimplified(in)
    31  	// }
    32  
    33  	seenKeys := map[string]uint{}
    34  	kvs := make([]MapPair, len(in))
    35  
    36  	// unmarshalling := time.Duration(0)
    37  
    38  	// beforeFirst := time.Now()
    39  	for i, pair := range in {
    40  		kv := MapPair{}
    41  		// beforeUnmarshal := time.Now()
    42  		err := kv.FromBytes(pair.value, pair.tombstone)
    43  		if err != nil {
    44  			return nil, err
    45  		}
    46  		// unmarshalling += time.Since(beforeUnmarshal)
    47  		kv.Tombstone = pair.tombstone
    48  		kvs[i] = kv
    49  		count := seenKeys[string(kv.Key)]
    50  		seenKeys[string(kv.Key)] = count + 1
    51  	}
    52  	// fmt.Printf("first decoder loop took %s\n", time.Since(beforeFirst))
    53  	// fmt.Printf("unmarshalling in first loop took %s\n", unmarshalling)
    54  
    55  	// beforeSecond := time.Now()
    56  	out := make([]MapPair, len(in))
    57  	i := 0
    58  	for _, pair := range kvs {
    59  		count := seenKeys[string(pair.Key)]
    60  		if count != 1 {
    61  			seenKeys[string(pair.Key)] = count - 1
    62  			continue
    63  
    64  		}
    65  
    66  		if pair.Tombstone {
    67  			continue
    68  		}
    69  
    70  		out[i] = pair
    71  		i++
    72  	}
    73  	// fmt.Printf("second decoder loop took %s\n", time.Since(beforeSecond))
    74  
    75  	return out[:i], nil
    76  }
    77  
    78  type tombstone struct {
    79  	pos int
    80  	key []byte
    81  }
    82  
    83  func (m *mapDecoder) doSimplified(in []value) ([]MapPair, error) {
    84  	out := make([]MapPair, len(in))
    85  
    86  	var tombstones []tombstone
    87  
    88  	i := 0
    89  	for _, raw := range in {
    90  		if raw.tombstone {
    91  			mp := MapPair{}
    92  			mp.FromBytes(raw.value, true)
    93  			tombstones = append(tombstones, tombstone{pos: i, key: mp.Key})
    94  			continue
    95  		}
    96  
    97  		out[i].FromBytes(raw.value, raw.tombstone)
    98  		i++
    99  	}
   100  
   101  	out = out[:i]
   102  
   103  	if len(tombstones) > 0 {
   104  		out = m.removeTombstonesFromResults(out, tombstones)
   105  	}
   106  
   107  	return out, nil
   108  }
   109  
   110  func (m *mapDecoder) removeTombstonesFromResults(candidates []MapPair,
   111  	tombstones []tombstone,
   112  ) []MapPair {
   113  	after := make([]MapPair, len(candidates))
   114  	newPos := 0
   115  	for origPos, candidate := range candidates {
   116  
   117  		skip := false
   118  		for _, tombstone := range tombstones {
   119  			if tombstone.pos > origPos && bytes.Equal(tombstone.key, candidate.Key) {
   120  				skip = true
   121  			}
   122  		}
   123  
   124  		if skip {
   125  			continue
   126  		}
   127  
   128  		after[newPos] = candidate
   129  		newPos++
   130  	}
   131  
   132  	return after[:newPos]
   133  }
   134  
   135  // DoPartial keeps "unused" tombstones
   136  func (m *mapDecoder) DoPartial(in []value) ([]MapPair, error) {
   137  	seenKeys := map[string]uint{}
   138  	kvs := make([]MapPair, len(in))
   139  
   140  	for i, pair := range in {
   141  		kv := MapPair{}
   142  		err := kv.FromBytes(pair.value, pair.tombstone)
   143  		if err != nil {
   144  			return nil, err
   145  		}
   146  		kv.Tombstone = pair.tombstone
   147  		kvs[i] = kv
   148  		count := seenKeys[string(kv.Key)]
   149  		seenKeys[string(kv.Key)] = count + 1
   150  	}
   151  
   152  	out := make([]MapPair, len(in))
   153  	i := 0
   154  	for _, pair := range kvs {
   155  		count := seenKeys[string(pair.Key)]
   156  		if count != 1 {
   157  			seenKeys[string(pair.Key)] = count - 1
   158  			continue
   159  
   160  		}
   161  
   162  		out[i] = pair
   163  		i++
   164  	}
   165  
   166  	return out[:i], nil
   167  }
   168  
   169  type MapPair struct {
   170  	Key       []byte
   171  	Value     []byte
   172  	Tombstone bool
   173  }
   174  
   175  // Size() returns the exact size in bytes that will be used when Bytes() is
   176  // called
   177  func (kv MapPair) Size() int {
   178  	// each field uses a uint16 (2 bytes) length indicator
   179  	return 2 + len(kv.Key) + 2 + len(kv.Value)
   180  }
   181  
   182  func (kv MapPair) EncodeBytes(buf []byte) error {
   183  	if len(buf) != kv.Size() {
   184  		return errors.Errorf("buffer has size %d, but MapPair has size %d",
   185  			len(buf), kv.Size())
   186  	}
   187  
   188  	// make sure the 2 byte length indicators will never overflow:
   189  	if len(kv.Key) >= math.MaxUint16 {
   190  		return errors.Errorf("mapCollection key must be smaller than %d",
   191  			math.MaxUint16)
   192  	}
   193  	keyLen := uint16(len(kv.Key))
   194  
   195  	if len(kv.Value) >= math.MaxUint16 {
   196  		return errors.Errorf("mapCollection value must be smaller than %d",
   197  			math.MaxUint16)
   198  	}
   199  	valueLen := uint16(len(kv.Value))
   200  
   201  	offset := 0
   202  	binary.LittleEndian.PutUint16(buf[offset:offset+2], keyLen)
   203  	offset += 2
   204  	copy(buf[offset:], kv.Key)
   205  	offset += len(kv.Key)
   206  
   207  	binary.LittleEndian.PutUint16(buf[offset:offset+2], valueLen)
   208  	offset += 2
   209  	copy(buf[offset:], kv.Value)
   210  
   211  	return nil
   212  }
   213  
   214  func (kv MapPair) Bytes() ([]byte, error) {
   215  	// make sure the 2 byte length indicators will never overflow:
   216  	if len(kv.Key) >= math.MaxUint16 {
   217  		return nil, errors.Errorf("mapCollection key must be smaller than %d",
   218  			math.MaxUint16)
   219  	}
   220  	keyLen := uint16(len(kv.Key))
   221  
   222  	if len(kv.Value) >= math.MaxUint16 {
   223  		return nil, errors.Errorf("mapCollection value must be smaller than %d",
   224  			math.MaxUint16)
   225  	}
   226  	valueLen := uint16(len(kv.Value))
   227  
   228  	out := bytes.NewBuffer(nil)
   229  
   230  	lenBuf := make([]byte, 2) // can be reused for both key and value len
   231  	binary.LittleEndian.PutUint16(lenBuf, keyLen)
   232  	if _, err := out.Write(lenBuf); err != nil {
   233  		return nil, errors.Wrap(err, "write map key length indicator")
   234  	}
   235  
   236  	if _, err := out.Write(kv.Key); err != nil {
   237  		return nil, errors.Wrap(err, "write map key")
   238  	}
   239  
   240  	binary.LittleEndian.PutUint16(lenBuf, valueLen)
   241  	if _, err := out.Write(lenBuf); err != nil {
   242  		return nil, errors.Wrap(err, "write map value length indicator")
   243  	}
   244  
   245  	if _, err := out.Write(kv.Value); err != nil {
   246  		return nil, errors.Wrap(err, "write map value")
   247  	}
   248  
   249  	return out.Bytes(), nil
   250  }
   251  
   252  func (kv *MapPair) FromBytes(in []byte, keyOnly bool) error {
   253  	var read uint16
   254  
   255  	// NOTE: A previous implementation was using copy statements in here to avoid
   256  	// sharing the memory. The general idea of that is good (protect against the
   257  	// mmaped memory being removed from a completed compaction), however this is
   258  	// the wrong place. By the time we are in this method, we can no longer
   259  	// control the memory safety of the "in" argument. Thus, such a copy must
   260  	// happen at a much earlier scope when a lock is held that protects against
   261  	// removing the segment. Such an implementation can now be found in
   262  	// segment_collection_strategy.go as part of the *segment.getCollection
   263  	// method. As a result all memory used here can now be considered read-only
   264  	// and is safe to be used indefinitely.
   265  
   266  	keyLen := binary.LittleEndian.Uint16(in[:2])
   267  	read += 2 // uint16 -> 2 bytes
   268  
   269  	kv.Key = in[read : read+keyLen]
   270  	read += keyLen
   271  
   272  	if keyOnly {
   273  		return nil
   274  	}
   275  
   276  	valueLen := binary.LittleEndian.Uint16(in[read : read+2])
   277  	read += 2
   278  
   279  	kv.Value = in[read : read+valueLen]
   280  	read += valueLen
   281  
   282  	if read != uint16(len(in)) {
   283  		return errors.Errorf("inconsistent map pair: read %d out of %d bytes",
   284  			read, len(in))
   285  	}
   286  
   287  	return nil
   288  }
   289  
   290  func (kv *MapPair) FromBytesReusable(in []byte, keyOnly bool) error {
   291  	var read uint16
   292  
   293  	keyLen := binary.LittleEndian.Uint16(in[:2])
   294  	read += 2 // uint16 -> 2 bytes
   295  
   296  	if int(keyLen) > cap(kv.Key) {
   297  		kv.Key = make([]byte, keyLen)
   298  	} else {
   299  		kv.Key = kv.Key[:keyLen]
   300  	}
   301  	copy(kv.Key, in[read:read+keyLen])
   302  	read += keyLen
   303  
   304  	if keyOnly {
   305  		return nil
   306  	}
   307  
   308  	valueLen := binary.LittleEndian.Uint16(in[read : read+2])
   309  	read += 2
   310  
   311  	if int(valueLen) > cap(kv.Value) {
   312  		kv.Value = make([]byte, valueLen)
   313  	} else {
   314  		kv.Value = kv.Value[:valueLen]
   315  	}
   316  	copy(kv.Value, in[read:read+valueLen])
   317  	read += valueLen
   318  
   319  	if read != uint16(len(in)) {
   320  		return errors.Errorf("inconsistent map pair: read %d out of %d bytes",
   321  			read, len(in))
   322  	}
   323  
   324  	return nil
   325  }
   326  
   327  type mapEncoder struct {
   328  	pairBuf []value
   329  }
   330  
   331  func newMapEncoder() *mapEncoder {
   332  	return &mapEncoder{}
   333  }
   334  
   335  func (m *mapEncoder) Do(kv MapPair) ([]value, error) {
   336  	v, err := kv.Bytes()
   337  	if err != nil {
   338  		return nil, err
   339  	}
   340  
   341  	out := make([]value, 1)
   342  	out[0] = value{
   343  		tombstone: kv.Tombstone,
   344  		value:     v,
   345  	}
   346  
   347  	return out, nil
   348  }
   349  
   350  func (m *mapEncoder) DoMulti(kvs []MapPair) ([]value, error) {
   351  	out := make([]value, len(kvs))
   352  
   353  	for i, kv := range kvs {
   354  		v := make([]byte, kv.Size())
   355  		err := kv.EncodeBytes(v)
   356  		if err != nil {
   357  			return nil, err
   358  		}
   359  
   360  		out[i] = value{
   361  			tombstone: kv.Tombstone,
   362  			value:     v,
   363  		}
   364  	}
   365  
   366  	return out, nil
   367  }
   368  
   369  // DoMultiReusable reuses a MapPair buffer that it exposes to the caller on
   370  // this request. Warning: The caller must make sure that they no longer access
   371  // the return value once they call this method a second time, otherwise they
   372  // risk overwriting a previous result. The intended usage for example in a loop
   373  // where each loop copies the results, for example using a bufio.Writer.
   374  func (m *mapEncoder) DoMultiReusable(kvs []MapPair) ([]value, error) {
   375  	m.resizeBuffer(len(kvs))
   376  
   377  	for i, kv := range kvs {
   378  		m.resizeValueAtBuffer(i, kv.Size())
   379  		err := kv.EncodeBytes(m.pairBuf[i].value)
   380  		if err != nil {
   381  			return nil, err
   382  		}
   383  
   384  		m.pairBuf[i].tombstone = kv.Tombstone
   385  	}
   386  
   387  	return m.pairBuf, nil
   388  }
   389  
   390  func (m *mapEncoder) resizeBuffer(size int) {
   391  	if cap(m.pairBuf) >= size {
   392  		m.pairBuf = m.pairBuf[:size]
   393  	} else {
   394  		m.pairBuf = make([]value, size, int(float64(size)*1.25))
   395  	}
   396  }
   397  
   398  func (m *mapEncoder) resizeValueAtBuffer(pos, size int) {
   399  	if cap(m.pairBuf[pos].value) >= size {
   400  		m.pairBuf[pos].value = m.pairBuf[pos].value[:size]
   401  	} else {
   402  		m.pairBuf[pos].value = make([]byte, size, int(float64(size)*1.25))
   403  	}
   404  }