github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/strategies_map.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package lsmkv 13 14 import ( 15 "bytes" 16 "encoding/binary" 17 "math" 18 19 "github.com/pkg/errors" 20 ) 21 22 type mapDecoder struct{} 23 24 func newMapDecoder() *mapDecoder { 25 return &mapDecoder{} 26 } 27 28 func (m *mapDecoder) Do(in []value, acceptDuplicates bool) ([]MapPair, error) { 29 // if acceptDuplicates { 30 // return m.doSimplified(in) 31 // } 32 33 seenKeys := map[string]uint{} 34 kvs := make([]MapPair, len(in)) 35 36 // unmarshalling := time.Duration(0) 37 38 // beforeFirst := time.Now() 39 for i, pair := range in { 40 kv := MapPair{} 41 // beforeUnmarshal := time.Now() 42 err := kv.FromBytes(pair.value, pair.tombstone) 43 if err != nil { 44 return nil, err 45 } 46 // unmarshalling += time.Since(beforeUnmarshal) 47 kv.Tombstone = pair.tombstone 48 kvs[i] = kv 49 count := seenKeys[string(kv.Key)] 50 seenKeys[string(kv.Key)] = count + 1 51 } 52 // fmt.Printf("first decoder loop took %s\n", time.Since(beforeFirst)) 53 // fmt.Printf("unmarshalling in first loop took %s\n", unmarshalling) 54 55 // beforeSecond := time.Now() 56 out := make([]MapPair, len(in)) 57 i := 0 58 for _, pair := range kvs { 59 count := seenKeys[string(pair.Key)] 60 if count != 1 { 61 seenKeys[string(pair.Key)] = count - 1 62 continue 63 64 } 65 66 if pair.Tombstone { 67 continue 68 } 69 70 out[i] = pair 71 i++ 72 } 73 // fmt.Printf("second decoder loop took %s\n", time.Since(beforeSecond)) 74 75 return out[:i], nil 76 } 77 78 type tombstone struct { 79 pos int 80 key []byte 81 } 82 83 func (m *mapDecoder) doSimplified(in []value) ([]MapPair, error) { 84 out := make([]MapPair, len(in)) 85 86 var tombstones []tombstone 87 88 i := 0 89 for _, raw := range in { 90 if raw.tombstone { 91 mp := MapPair{} 92 mp.FromBytes(raw.value, true) 93 tombstones = append(tombstones, tombstone{pos: i, key: mp.Key}) 94 continue 95 } 96 97 out[i].FromBytes(raw.value, raw.tombstone) 98 i++ 99 } 100 101 out = out[:i] 102 103 if len(tombstones) > 0 { 104 out = m.removeTombstonesFromResults(out, tombstones) 105 } 106 107 return out, nil 108 } 109 110 func (m *mapDecoder) removeTombstonesFromResults(candidates []MapPair, 111 tombstones []tombstone, 112 ) []MapPair { 113 after := make([]MapPair, len(candidates)) 114 newPos := 0 115 for origPos, candidate := range candidates { 116 117 skip := false 118 for _, tombstone := range tombstones { 119 if tombstone.pos > origPos && bytes.Equal(tombstone.key, candidate.Key) { 120 skip = true 121 } 122 } 123 124 if skip { 125 continue 126 } 127 128 after[newPos] = candidate 129 newPos++ 130 } 131 132 return after[:newPos] 133 } 134 135 // DoPartial keeps "unused" tombstones 136 func (m *mapDecoder) DoPartial(in []value) ([]MapPair, error) { 137 seenKeys := map[string]uint{} 138 kvs := make([]MapPair, len(in)) 139 140 for i, pair := range in { 141 kv := MapPair{} 142 err := kv.FromBytes(pair.value, pair.tombstone) 143 if err != nil { 144 return nil, err 145 } 146 kv.Tombstone = pair.tombstone 147 kvs[i] = kv 148 count := seenKeys[string(kv.Key)] 149 seenKeys[string(kv.Key)] = count + 1 150 } 151 152 out := make([]MapPair, len(in)) 153 i := 0 154 for _, pair := range kvs { 155 count := seenKeys[string(pair.Key)] 156 if count != 1 { 157 seenKeys[string(pair.Key)] = count - 1 158 continue 159 160 } 161 162 out[i] = pair 163 i++ 164 } 165 166 return out[:i], nil 167 } 168 169 type MapPair struct { 170 Key []byte 171 Value []byte 172 Tombstone bool 173 } 174 175 // Size() returns the exact size in bytes that will be used when Bytes() is 176 // called 177 func (kv MapPair) Size() int { 178 // each field uses a uint16 (2 bytes) length indicator 179 return 2 + len(kv.Key) + 2 + len(kv.Value) 180 } 181 182 func (kv MapPair) EncodeBytes(buf []byte) error { 183 if len(buf) != kv.Size() { 184 return errors.Errorf("buffer has size %d, but MapPair has size %d", 185 len(buf), kv.Size()) 186 } 187 188 // make sure the 2 byte length indicators will never overflow: 189 if len(kv.Key) >= math.MaxUint16 { 190 return errors.Errorf("mapCollection key must be smaller than %d", 191 math.MaxUint16) 192 } 193 keyLen := uint16(len(kv.Key)) 194 195 if len(kv.Value) >= math.MaxUint16 { 196 return errors.Errorf("mapCollection value must be smaller than %d", 197 math.MaxUint16) 198 } 199 valueLen := uint16(len(kv.Value)) 200 201 offset := 0 202 binary.LittleEndian.PutUint16(buf[offset:offset+2], keyLen) 203 offset += 2 204 copy(buf[offset:], kv.Key) 205 offset += len(kv.Key) 206 207 binary.LittleEndian.PutUint16(buf[offset:offset+2], valueLen) 208 offset += 2 209 copy(buf[offset:], kv.Value) 210 211 return nil 212 } 213 214 func (kv MapPair) Bytes() ([]byte, error) { 215 // make sure the 2 byte length indicators will never overflow: 216 if len(kv.Key) >= math.MaxUint16 { 217 return nil, errors.Errorf("mapCollection key must be smaller than %d", 218 math.MaxUint16) 219 } 220 keyLen := uint16(len(kv.Key)) 221 222 if len(kv.Value) >= math.MaxUint16 { 223 return nil, errors.Errorf("mapCollection value must be smaller than %d", 224 math.MaxUint16) 225 } 226 valueLen := uint16(len(kv.Value)) 227 228 out := bytes.NewBuffer(nil) 229 230 lenBuf := make([]byte, 2) // can be reused for both key and value len 231 binary.LittleEndian.PutUint16(lenBuf, keyLen) 232 if _, err := out.Write(lenBuf); err != nil { 233 return nil, errors.Wrap(err, "write map key length indicator") 234 } 235 236 if _, err := out.Write(kv.Key); err != nil { 237 return nil, errors.Wrap(err, "write map key") 238 } 239 240 binary.LittleEndian.PutUint16(lenBuf, valueLen) 241 if _, err := out.Write(lenBuf); err != nil { 242 return nil, errors.Wrap(err, "write map value length indicator") 243 } 244 245 if _, err := out.Write(kv.Value); err != nil { 246 return nil, errors.Wrap(err, "write map value") 247 } 248 249 return out.Bytes(), nil 250 } 251 252 func (kv *MapPair) FromBytes(in []byte, keyOnly bool) error { 253 var read uint16 254 255 // NOTE: A previous implementation was using copy statements in here to avoid 256 // sharing the memory. The general idea of that is good (protect against the 257 // mmaped memory being removed from a completed compaction), however this is 258 // the wrong place. By the time we are in this method, we can no longer 259 // control the memory safety of the "in" argument. Thus, such a copy must 260 // happen at a much earlier scope when a lock is held that protects against 261 // removing the segment. Such an implementation can now be found in 262 // segment_collection_strategy.go as part of the *segment.getCollection 263 // method. As a result all memory used here can now be considered read-only 264 // and is safe to be used indefinitely. 265 266 keyLen := binary.LittleEndian.Uint16(in[:2]) 267 read += 2 // uint16 -> 2 bytes 268 269 kv.Key = in[read : read+keyLen] 270 read += keyLen 271 272 if keyOnly { 273 return nil 274 } 275 276 valueLen := binary.LittleEndian.Uint16(in[read : read+2]) 277 read += 2 278 279 kv.Value = in[read : read+valueLen] 280 read += valueLen 281 282 if read != uint16(len(in)) { 283 return errors.Errorf("inconsistent map pair: read %d out of %d bytes", 284 read, len(in)) 285 } 286 287 return nil 288 } 289 290 func (kv *MapPair) FromBytesReusable(in []byte, keyOnly bool) error { 291 var read uint16 292 293 keyLen := binary.LittleEndian.Uint16(in[:2]) 294 read += 2 // uint16 -> 2 bytes 295 296 if int(keyLen) > cap(kv.Key) { 297 kv.Key = make([]byte, keyLen) 298 } else { 299 kv.Key = kv.Key[:keyLen] 300 } 301 copy(kv.Key, in[read:read+keyLen]) 302 read += keyLen 303 304 if keyOnly { 305 return nil 306 } 307 308 valueLen := binary.LittleEndian.Uint16(in[read : read+2]) 309 read += 2 310 311 if int(valueLen) > cap(kv.Value) { 312 kv.Value = make([]byte, valueLen) 313 } else { 314 kv.Value = kv.Value[:valueLen] 315 } 316 copy(kv.Value, in[read:read+valueLen]) 317 read += valueLen 318 319 if read != uint16(len(in)) { 320 return errors.Errorf("inconsistent map pair: read %d out of %d bytes", 321 read, len(in)) 322 } 323 324 return nil 325 } 326 327 type mapEncoder struct { 328 pairBuf []value 329 } 330 331 func newMapEncoder() *mapEncoder { 332 return &mapEncoder{} 333 } 334 335 func (m *mapEncoder) Do(kv MapPair) ([]value, error) { 336 v, err := kv.Bytes() 337 if err != nil { 338 return nil, err 339 } 340 341 out := make([]value, 1) 342 out[0] = value{ 343 tombstone: kv.Tombstone, 344 value: v, 345 } 346 347 return out, nil 348 } 349 350 func (m *mapEncoder) DoMulti(kvs []MapPair) ([]value, error) { 351 out := make([]value, len(kvs)) 352 353 for i, kv := range kvs { 354 v := make([]byte, kv.Size()) 355 err := kv.EncodeBytes(v) 356 if err != nil { 357 return nil, err 358 } 359 360 out[i] = value{ 361 tombstone: kv.Tombstone, 362 value: v, 363 } 364 } 365 366 return out, nil 367 } 368 369 // DoMultiReusable reuses a MapPair buffer that it exposes to the caller on 370 // this request. Warning: The caller must make sure that they no longer access 371 // the return value once they call this method a second time, otherwise they 372 // risk overwriting a previous result. The intended usage for example in a loop 373 // where each loop copies the results, for example using a bufio.Writer. 374 func (m *mapEncoder) DoMultiReusable(kvs []MapPair) ([]value, error) { 375 m.resizeBuffer(len(kvs)) 376 377 for i, kv := range kvs { 378 m.resizeValueAtBuffer(i, kv.Size()) 379 err := kv.EncodeBytes(m.pairBuf[i].value) 380 if err != nil { 381 return nil, err 382 } 383 384 m.pairBuf[i].tombstone = kv.Tombstone 385 } 386 387 return m.pairBuf, nil 388 } 389 390 func (m *mapEncoder) resizeBuffer(size int) { 391 if cap(m.pairBuf) >= size { 392 m.pairBuf = m.pairBuf[:size] 393 } else { 394 m.pairBuf = make([]value, size, int(float64(size)*1.25)) 395 } 396 } 397 398 func (m *mapEncoder) resizeValueAtBuffer(pos, size int) { 399 if cap(m.pairBuf[pos].value) >= size { 400 m.pairBuf[pos].value = m.pairBuf[pos].value[:size] 401 } else { 402 m.pairBuf[pos].value = make([]byte, size, int(float64(size)*1.25)) 403 } 404 }