github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/strategies_set.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package lsmkv
    13  
    14  type setDecoder struct{}
    15  
    16  func newSetDecoder() *setDecoder {
    17  	return &setDecoder{}
    18  }
    19  
    20  func (s *setDecoder) Do(in []value) [][]byte {
    21  	// check if there are tombstones, if not, we can simply take the list without
    22  	// further processing
    23  	var tombstones int
    24  	for _, value := range in {
    25  		if value.tombstone {
    26  			tombstones++
    27  		}
    28  	}
    29  
    30  	if tombstones == 0 {
    31  		return s.doWithoutTombstones(in)
    32  	}
    33  
    34  	// there are tombstones, we need to remove them
    35  	// TODO: The logic below can be improved since don't care about the "latest"
    36  	// write on a set, as all writes are per definition identical. Any write that
    37  	// is not followed by a tombstone is fine
    38  	count := make(map[string]uint, len(in))
    39  	for _, value := range in {
    40  		count[string(value.value)] = count[string(value.value)] + 1
    41  	}
    42  	out := make([][]byte, len(in))
    43  
    44  	i := 0
    45  	for _, value := range in {
    46  		if count[string(value.value)] != 1 {
    47  			count[string(value.value)] = count[string(value.value)] - 1
    48  			continue
    49  		}
    50  
    51  		if value.tombstone {
    52  			continue
    53  		}
    54  
    55  		out[i] = value.value
    56  		i++
    57  	}
    58  
    59  	return out[:i]
    60  }
    61  
    62  func (s *setDecoder) doWithoutTombstones(in []value) [][]byte {
    63  	out := make([][]byte, len(in))
    64  	for i := range in {
    65  		out[i] = in[i].value
    66  	}
    67  
    68  	// take an arbitrary cutoff for when it is worth to remove duplicates. The
    69  	// assumption is that on larger lists, duplicates are more likely to be
    70  	// tolerated, for example, because the point is to build an allow list for a
    71  	// secondary index where a duplicate does not matter. If the amount is
    72  	// smaller than the cutoff this is more likely to be relevant to a user.
    73  	//
    74  	// As the list gets longer, removing duplicates gets a lot more expensive,
    75  	// hence it makes sense to skip the de-duplication, if we can be reasonably
    76  	// sure that it does not matter
    77  	if len(out) <= 1000 {
    78  		return s.deduplicateResults(out)
    79  	}
    80  
    81  	return out
    82  }
    83  
    84  func (s *setDecoder) deduplicateResults(in [][]byte) [][]byte {
    85  	out := make([][]byte, len(in))
    86  
    87  	seen := map[string]struct{}{}
    88  
    89  	i := 0
    90  	for _, elem := range in {
    91  		if _, ok := seen[string(elem)]; ok {
    92  			continue
    93  		}
    94  
    95  		out[i] = elem
    96  		seen[string(elem)] = struct{}{}
    97  		i++
    98  	}
    99  
   100  	return out[:i]
   101  }
   102  
   103  // DoPartial keeps any extra tombstones, but does not keep tombstones which
   104  // were "consumed"
   105  func (s *setDecoder) DoPartial(in []value) []value {
   106  	count := map[string]uint{}
   107  	for _, value := range in {
   108  		count[string(value.value)] = count[string(value.value)] + 1
   109  	}
   110  
   111  	out := make([]value, len(in))
   112  
   113  	i := 0
   114  	for _, value := range in {
   115  		if count[string(value.value)] != 1 {
   116  			count[string(value.value)] = count[string(value.value)] - 1
   117  			continue
   118  		}
   119  
   120  		out[i] = value
   121  		i++
   122  	}
   123  
   124  	return out[:i]
   125  }
   126  
   127  type setEncoder struct{}
   128  
   129  func newSetEncoder() *setEncoder {
   130  	return &setEncoder{}
   131  }
   132  
   133  func (s *setEncoder) Do(in [][]byte) []value {
   134  	out := make([]value, len(in))
   135  	for i, v := range in {
   136  		out[i] = value{
   137  			tombstone: false,
   138  			value:     v,
   139  		}
   140  	}
   141  
   142  	return out
   143  }