github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/segment_serialization.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package lsmkv 13 14 import ( 15 "encoding/binary" 16 "fmt" 17 "io" 18 19 "github.com/pkg/errors" 20 "github.com/weaviate/weaviate/adapters/repos/db/lsmkv/segmentindex" 21 "github.com/weaviate/weaviate/usecases/byteops" 22 ) 23 24 // a single node of strategy "replace" 25 type segmentReplaceNode struct { 26 tombstone bool 27 value []byte 28 primaryKey []byte 29 secondaryIndexCount uint16 30 secondaryKeys [][]byte 31 offset int 32 } 33 34 func (s *segmentReplaceNode) KeyIndexAndWriteTo(w io.Writer) (segmentindex.Key, error) { 35 out := segmentindex.Key{} 36 written := 0 37 38 buf := make([]byte, 9) 39 if s.tombstone { 40 buf[0] = 1 41 } else { 42 buf[0] = 0 43 } 44 45 valueLength := uint64(len(s.value)) 46 binary.LittleEndian.PutUint64(buf[1:9], valueLength) 47 if _, err := w.Write(buf); err != nil { 48 return out, err 49 } 50 51 written += 9 52 53 n, err := w.Write(s.value) 54 if err != nil { 55 return out, errors.Wrapf(err, "write node value") 56 } 57 written += n 58 59 keyLength := uint32(len(s.primaryKey)) 60 binary.LittleEndian.PutUint32(buf[0:4], keyLength) 61 if _, err := w.Write(buf[0:4]); err != nil { 62 return out, err 63 } 64 written += 4 65 66 n, err = w.Write(s.primaryKey) 67 if err != nil { 68 return out, errors.Wrapf(err, "write node key") 69 } 70 written += n 71 72 for j := 0; j < int(s.secondaryIndexCount); j++ { 73 var secondaryKeyLength uint32 74 if j < len(s.secondaryKeys) { 75 secondaryKeyLength = uint32(len(s.secondaryKeys[j])) 76 } 77 78 // write the key length in any case 79 binary.LittleEndian.PutUint32(buf[0:4], secondaryKeyLength) 80 if _, err := w.Write(buf[0:4]); err != nil { 81 return out, err 82 } 83 written += 4 84 85 if secondaryKeyLength == 0 { 86 // we're done here 87 continue 88 } 89 90 // only write the key if it exists 91 n, err = w.Write(s.secondaryKeys[j]) 92 if err != nil { 93 return out, errors.Wrapf(err, "write secondary key %d", j) 94 } 95 written += n 96 } 97 98 return segmentindex.Key{ 99 ValueStart: s.offset, 100 ValueEnd: s.offset + written, 101 Key: s.primaryKey, 102 SecondaryKeys: s.secondaryKeys, 103 }, nil 104 } 105 106 func ParseReplaceNode(r io.Reader, secondaryIndexCount uint16) (segmentReplaceNode, error) { 107 out := segmentReplaceNode{} 108 109 // 9 bytes is the most we can ever read uninterrupted, i.e. without a dynamic 110 // read in between. 111 tmpBuf := make([]byte, 9) 112 if n, err := io.ReadFull(r, tmpBuf); err != nil { 113 return out, errors.Wrap(err, "read tombstone and value length") 114 } else { 115 out.offset += n 116 } 117 118 out.tombstone = tmpBuf[0] == 0x1 119 valueLength := binary.LittleEndian.Uint64(tmpBuf[1:9]) 120 out.value = make([]byte, valueLength) 121 if n, err := io.ReadFull(r, out.value); err != nil { 122 return out, errors.Wrap(err, "read value") 123 } else { 124 out.offset += n 125 } 126 127 if n, err := io.ReadFull(r, tmpBuf[0:4]); err != nil { 128 return out, errors.Wrap(err, "read key length encoding") 129 } else { 130 out.offset += n 131 } 132 133 keyLength := binary.LittleEndian.Uint32(tmpBuf[0:4]) 134 out.primaryKey = make([]byte, keyLength) 135 if n, err := io.ReadFull(r, out.primaryKey); err != nil { 136 return out, errors.Wrap(err, "read key") 137 } else { 138 out.offset += n 139 } 140 141 if secondaryIndexCount > 0 { 142 out.secondaryKeys = make([][]byte, secondaryIndexCount) 143 } 144 145 for j := 0; j < int(secondaryIndexCount); j++ { 146 if n, err := io.ReadFull(r, tmpBuf[0:4]); err != nil { 147 return out, errors.Wrap(err, "read secondary key length encoding") 148 } else { 149 out.offset += n 150 } 151 secKeyLen := binary.LittleEndian.Uint32(tmpBuf[0:4]) 152 if secKeyLen == 0 { 153 continue 154 } 155 156 out.secondaryKeys[j] = make([]byte, secKeyLen) 157 if n, err := io.ReadFull(r, out.secondaryKeys[j]); err != nil { 158 return out, errors.Wrap(err, "read secondary key") 159 } else { 160 out.offset += n 161 } 162 } 163 164 return out, nil 165 } 166 167 func ParseReplaceNodeIntoPread(r io.Reader, secondaryIndexCount uint16, out *segmentReplaceNode) (err error) { 168 out.offset = 0 169 170 if err := binary.Read(r, binary.LittleEndian, &out.tombstone); err != nil { 171 return errors.Wrap(err, "read tombstone") 172 } 173 out.offset += 1 174 175 var valueLength uint64 176 if err := binary.Read(r, binary.LittleEndian, &valueLength); err != nil { 177 return errors.Wrap(err, "read value length encoding") 178 } 179 out.offset += 8 180 181 if int(valueLength) > cap(out.value) { 182 out.value = make([]byte, valueLength) 183 } else { 184 out.value = out.value[:valueLength] 185 } 186 187 if n, err := io.ReadFull(r, out.value); err != nil { 188 return errors.Wrap(err, "read value") 189 } else { 190 out.offset += n 191 } 192 193 var keyLength uint32 194 if err := binary.Read(r, binary.LittleEndian, &keyLength); err != nil { 195 return errors.Wrap(err, "read key length encoding") 196 } 197 out.offset += 4 198 199 out.primaryKey = make([]byte, keyLength) 200 if n, err := io.ReadFull(r, out.primaryKey); err != nil { 201 return errors.Wrap(err, "read key") 202 } else { 203 out.offset += n 204 } 205 206 if secondaryIndexCount > 0 { 207 out.secondaryKeys = make([][]byte, secondaryIndexCount) 208 } 209 210 for j := 0; j < int(secondaryIndexCount); j++ { 211 var secKeyLen uint32 212 if err := binary.Read(r, binary.LittleEndian, &secKeyLen); err != nil { 213 return errors.Wrap(err, "read secondary key length encoding") 214 } 215 out.offset += 4 216 217 if secKeyLen == 0 { 218 continue 219 } 220 221 out.secondaryKeys[j] = make([]byte, secKeyLen) 222 if n, err := io.ReadFull(r, out.secondaryKeys[j]); err != nil { 223 return errors.Wrap(err, "read secondary key") 224 } else { 225 out.offset += n 226 } 227 } 228 229 return nil 230 } 231 232 func ParseReplaceNodeIntoMMAP(r *byteops.ReadWriter, secondaryIndexCount uint16, out *segmentReplaceNode) error { 233 out.tombstone = r.ReadUint8() == 0x01 234 valueLength := r.ReadUint64() 235 236 if int(valueLength) > cap(out.value) { 237 out.value = make([]byte, valueLength) 238 } else { 239 out.value = out.value[:valueLength] 240 } 241 242 if _, err := r.CopyBytesFromBuffer(valueLength, out.value); err != nil { 243 return err 244 } 245 246 // Note: In a previous version (prior to 247 // https://github.com/weaviate/weaviate/pull/3660) this was a copy. The 248 // mentioned PR optimizes the Replace Cursor which led to this now being 249 // shared memory. After internal review, we believe this is safe to do. The 250 // cursor gives no guarantees about memory after calling .next(). Before 251 // .next() is called, this should be safe. Nevertheless, we are leaving this 252 // note in case a future bug appears, as this should make this spot easier to 253 // find. 254 out.primaryKey = r.ReadBytesFromBufferWithUint32LengthIndicator() 255 256 if secondaryIndexCount > 0 { 257 out.secondaryKeys = make([][]byte, secondaryIndexCount) 258 } 259 260 for j := 0; j < int(secondaryIndexCount); j++ { 261 // Note: In a previous version (prior to 262 // https://github.com/weaviate/weaviate/pull/3660) this was a copy. The 263 // mentioned PR optimizes the Replace Cursor which led to this now being 264 // shared memory. After internal review, we believe this is safe to do. The 265 // cursor gives no guarantees about memory after calling .next(). Before 266 // .next() is called, this should be safe. Nevertheless, we are leaving this 267 // note in case a future bug appears, as this should make this spot easier to 268 // find. 269 out.secondaryKeys[j] = r.ReadBytesFromBufferWithUint32LengthIndicator() 270 } 271 272 out.offset = int(r.Position) 273 return nil 274 } 275 276 // collection strategy does not support secondary keys at this time 277 type segmentCollectionNode struct { 278 values []value 279 primaryKey []byte 280 offset int 281 } 282 283 func (s segmentCollectionNode) KeyIndexAndWriteTo(w io.Writer) (segmentindex.Key, error) { 284 out := segmentindex.Key{} 285 written := 0 286 valueLen := uint64(len(s.values)) 287 buf := make([]byte, 9) 288 binary.LittleEndian.PutUint64(buf, valueLen) 289 if _, err := w.Write(buf[0:8]); err != nil { 290 return out, errors.Wrapf(err, "write values len for node") 291 } 292 written += 8 293 294 for i, value := range s.values { 295 if value.tombstone { 296 buf[0] = 0x01 297 } else { 298 buf[0] = 0x00 299 } 300 301 valueLen := uint64(len(value.value)) 302 binary.LittleEndian.PutUint64(buf[1:9], valueLen) 303 if _, err := w.Write(buf[0:9]); err != nil { 304 return out, errors.Wrapf(err, "write len of value %d", i) 305 } 306 written += 9 307 308 n, err := w.Write(value.value) 309 if err != nil { 310 return out, errors.Wrapf(err, "write value %d", i) 311 } 312 written += n 313 } 314 315 keyLength := uint32(len(s.primaryKey)) 316 binary.LittleEndian.PutUint32(buf[0:4], keyLength) 317 if _, err := w.Write(buf[0:4]); err != nil { 318 return out, errors.Wrapf(err, "write key length encoding for node") 319 } 320 written += 4 321 322 n, err := w.Write(s.primaryKey) 323 if err != nil { 324 return out, errors.Wrapf(err, "write node") 325 } 326 written += n 327 328 out = segmentindex.Key{ 329 ValueStart: s.offset, 330 ValueEnd: s.offset + written, 331 Key: s.primaryKey, 332 } 333 334 return out, nil 335 } 336 337 // ParseCollectionNode reads from r and parses the collection values into a segmentCollectionNode 338 // 339 // When only given an offset, r is constructed as a *bufio.Reader to avoid first reading the 340 // entire segment (could be GBs). Each consecutive read will be buffered to avoid excessive 341 // syscalls. 342 // 343 // When we already have a finite and manageable []byte (i.e. when we have already seeked to an 344 // lsmkv node and have start+end offset), r should be constructed as a *bytes.Reader, since the 345 // contents have already been `pread` from the segment contentFile. 346 func ParseCollectionNode(r io.Reader) (segmentCollectionNode, error) { 347 out := segmentCollectionNode{} 348 // 9 bytes is the most we can ever read uninterrupted, i.e. without a dynamic 349 // read in between. 350 tmpBuf := make([]byte, 9) 351 352 if n, err := io.ReadFull(r, tmpBuf[0:8]); err != nil { 353 return out, errors.Wrap(err, "read values len") 354 } else { 355 out.offset += n 356 } 357 358 valuesLen := binary.LittleEndian.Uint64(tmpBuf[0:8]) 359 out.values = make([]value, valuesLen) 360 for i := range out.values { 361 if n, err := io.ReadFull(r, tmpBuf[0:9]); err != nil { 362 return out, errors.Wrap(err, "read value tombstone and len") 363 } else { 364 out.offset += n 365 } 366 out.values[i].tombstone = tmpBuf[0] == 0x1 367 valueLen := binary.LittleEndian.Uint64(tmpBuf[1:9]) 368 out.values[i].value = make([]byte, valueLen) 369 n, err := io.ReadFull(r, out.values[i].value) 370 if err != nil { 371 return out, errors.Wrap(err, "read value") 372 } 373 out.offset += n 374 } 375 376 if n, err := io.ReadFull(r, tmpBuf[0:4]); err != nil { 377 return out, errors.Wrap(err, "read key len") 378 } else { 379 out.offset += n 380 } 381 keyLen := binary.LittleEndian.Uint32(tmpBuf[0:4]) 382 out.primaryKey = make([]byte, keyLen) 383 n, err := io.ReadFull(r, out.primaryKey) 384 if err != nil { 385 return out, errors.Wrap(err, "read key") 386 } 387 out.offset += n 388 389 return out, nil 390 } 391 392 // ParseCollectionNodeInto takes the []byte slice and parses it into the 393 // specified node. It does not perform any copies and the caller must be aware 394 // that memory may be shared between the two. As a result, the caller must make 395 // sure that they do not modify "in" while "node" is still in use. A safer 396 // alternative is to use ParseCollectionNode. 397 // 398 // The primary intention of this function is to provide a way to reuse buffers 399 // when the lifetime is controlled tightly, for example in cursors used within 400 // compactions. Use at your own risk! 401 // 402 // If the buffers of the provided node have enough capacity they will be 403 // reused. Only if the capacity is not enough, will an allocation occur. This 404 // allocation uses 25% overhead to avoid future allocations for nodes of 405 // similar size. 406 // 407 // As a result calling this method only makes sense if you plan on calling it 408 // multiple times. Calling it just once on an uninitialized node does not have 409 // major advantages over calling ParseCollectionNode. 410 func ParseCollectionNodeInto(r io.Reader, node *segmentCollectionNode) error { 411 // offset is only the local offset relative to "in". In the end we need to 412 // update the global offset. 413 offset := 0 414 415 buf := make([]byte, 9) 416 _, err := io.ReadFull(r, buf[0:8]) 417 if err != nil { 418 return fmt.Errorf("read values len: %w", err) 419 } 420 421 valuesLen := binary.LittleEndian.Uint64(buf[0:8]) 422 offset += 8 423 424 resizeValuesOfCollectionNode(node, valuesLen) 425 for i := range node.values { 426 _, err = io.ReadFull(r, buf) 427 if err != nil { 428 return fmt.Errorf("read values len: %w", err) 429 } 430 431 node.values[i].tombstone = buf[0] == 0x1 432 offset += 1 433 434 valueLen := binary.LittleEndian.Uint64(buf[1:9]) 435 offset += 8 436 437 resizeValueOfCollectionNodeAtPos(node, i, valueLen) 438 439 _, err = io.ReadFull(r, node.values[i].value) 440 if err != nil { 441 return fmt.Errorf("read node value: %w", err) 442 } 443 444 offset += int(valueLen) 445 } 446 447 _, err = io.ReadFull(r, buf[0:4]) 448 if err != nil { 449 return fmt.Errorf("read values len: %w", err) 450 } 451 keyLen := binary.LittleEndian.Uint32(buf) 452 offset += 4 453 454 resizeKeyOfCollectionNode(node, keyLen) 455 _, err = io.ReadFull(r, node.primaryKey) 456 if err != nil { 457 return fmt.Errorf("read primary key: %w", err) 458 } 459 offset += int(keyLen) 460 461 node.offset = offset 462 return nil 463 } 464 465 func resizeValuesOfCollectionNode(node *segmentCollectionNode, size uint64) { 466 if cap(node.values) >= int(size) { 467 node.values = node.values[:size] 468 } else { 469 // Allocate with 25% overhead to reduce chance of having to do multiple 470 // allocations sequentially. 471 node.values = make([]value, size, int(float64(size)*1.25)) 472 } 473 } 474 475 func resizeValueOfCollectionNodeAtPos(node *segmentCollectionNode, pos int, 476 size uint64, 477 ) { 478 if cap(node.values[pos].value) >= int(size) { 479 node.values[pos].value = node.values[pos].value[:size] 480 } else { 481 // Allocate with 25% overhead to reduce chance of having to do multiple 482 // allocations sequentially. 483 node.values[pos].value = make([]byte, size, int(float64(size)*1.25)) 484 } 485 } 486 487 func resizeKeyOfCollectionNode(node *segmentCollectionNode, size uint32) { 488 if cap(node.primaryKey) >= int(size) { 489 node.primaryKey = node.primaryKey[:size] 490 } else { 491 // Allocate with 25% overhead to reduce chance of having to do multiple 492 // allocations sequentially. 493 node.primaryKey = make([]byte, size, int(float64(size)*1.25)) 494 } 495 }