github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/strategies_set.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package lsmkv 13 14 type setDecoder struct{} 15 16 func newSetDecoder() *setDecoder { 17 return &setDecoder{} 18 } 19 20 func (s *setDecoder) Do(in []value) [][]byte { 21 // check if there are tombstones, if not, we can simply take the list without 22 // further processing 23 var tombstones int 24 for _, value := range in { 25 if value.tombstone { 26 tombstones++ 27 } 28 } 29 30 if tombstones == 0 { 31 return s.doWithoutTombstones(in) 32 } 33 34 // there are tombstones, we need to remove them 35 // TODO: The logic below can be improved since don't care about the "latest" 36 // write on a set, as all writes are per definition identical. Any write that 37 // is not followed by a tombstone is fine 38 count := make(map[string]uint, len(in)) 39 for _, value := range in { 40 count[string(value.value)] = count[string(value.value)] + 1 41 } 42 out := make([][]byte, len(in)) 43 44 i := 0 45 for _, value := range in { 46 if count[string(value.value)] != 1 { 47 count[string(value.value)] = count[string(value.value)] - 1 48 continue 49 } 50 51 if value.tombstone { 52 continue 53 } 54 55 out[i] = value.value 56 i++ 57 } 58 59 return out[:i] 60 } 61 62 func (s *setDecoder) doWithoutTombstones(in []value) [][]byte { 63 out := make([][]byte, len(in)) 64 for i := range in { 65 out[i] = in[i].value 66 } 67 68 // take an arbitrary cutoff for when it is worth to remove duplicates. The 69 // assumption is that on larger lists, duplicates are more likely to be 70 // tolerated, for example, because the point is to build an allow list for a 71 // secondary index where a duplicate does not matter. If the amount is 72 // smaller than the cutoff this is more likely to be relevant to a user. 73 // 74 // As the list gets longer, removing duplicates gets a lot more expensive, 75 // hence it makes sense to skip the de-duplication, if we can be reasonably 76 // sure that it does not matter 77 if len(out) <= 1000 { 78 return s.deduplicateResults(out) 79 } 80 81 return out 82 } 83 84 func (s *setDecoder) deduplicateResults(in [][]byte) [][]byte { 85 out := make([][]byte, len(in)) 86 87 seen := map[string]struct{}{} 88 89 i := 0 90 for _, elem := range in { 91 if _, ok := seen[string(elem)]; ok { 92 continue 93 } 94 95 out[i] = elem 96 seen[string(elem)] = struct{}{} 97 i++ 98 } 99 100 return out[:i] 101 } 102 103 // DoPartial keeps any extra tombstones, but does not keep tombstones which 104 // were "consumed" 105 func (s *setDecoder) DoPartial(in []value) []value { 106 count := map[string]uint{} 107 for _, value := range in { 108 count[string(value.value)] = count[string(value.value)] + 1 109 } 110 111 out := make([]value, len(in)) 112 113 i := 0 114 for _, value := range in { 115 if count[string(value.value)] != 1 { 116 count[string(value.value)] = count[string(value.value)] - 1 117 continue 118 } 119 120 out[i] = value 121 i++ 122 } 123 124 return out[:i] 125 } 126 127 type setEncoder struct{} 128 129 func newSetEncoder() *setEncoder { 130 return &setEncoder{} 131 } 132 133 func (s *setEncoder) Do(in [][]byte) []value { 134 out := make([]value, len(in)) 135 for i, v := range in { 136 out[i] = value{ 137 tombstone: false, 138 value: v, 139 } 140 } 141 142 return out 143 }