github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/segment_serialization.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package lsmkv
    13  
    14  import (
    15  	"encoding/binary"
    16  	"fmt"
    17  	"io"
    18  
    19  	"github.com/pkg/errors"
    20  	"github.com/weaviate/weaviate/adapters/repos/db/lsmkv/segmentindex"
    21  	"github.com/weaviate/weaviate/usecases/byteops"
    22  )
    23  
// a single node of strategy "replace"
//
// It serves both as the write-side representation (serialized via
// KeyIndexAndWriteTo) and as the parse target of the ParseReplaceNode*
// functions.
type segmentReplaceNode struct {
	// tombstone marks the key as deleted; serialized as a single 0x00/0x01 byte
	tombstone           bool
	// value is the node's payload bytes
	value               []byte
	// primaryKey is the key the node is indexed under
	primaryKey          []byte
	// secondaryIndexCount is the number of secondary-key slots written/read,
	// even when individual secondary keys are absent (length 0 on the wire)
	secondaryIndexCount uint16
	secondaryKeys       [][]byte
	// offset: on the write path this is the node's start position used to
	// compute the index key's ValueStart/ValueEnd; on the parse path it is
	// set to the number of bytes consumed
	offset              int
}
    33  
    34  func (s *segmentReplaceNode) KeyIndexAndWriteTo(w io.Writer) (segmentindex.Key, error) {
    35  	out := segmentindex.Key{}
    36  	written := 0
    37  
    38  	buf := make([]byte, 9)
    39  	if s.tombstone {
    40  		buf[0] = 1
    41  	} else {
    42  		buf[0] = 0
    43  	}
    44  
    45  	valueLength := uint64(len(s.value))
    46  	binary.LittleEndian.PutUint64(buf[1:9], valueLength)
    47  	if _, err := w.Write(buf); err != nil {
    48  		return out, err
    49  	}
    50  
    51  	written += 9
    52  
    53  	n, err := w.Write(s.value)
    54  	if err != nil {
    55  		return out, errors.Wrapf(err, "write node value")
    56  	}
    57  	written += n
    58  
    59  	keyLength := uint32(len(s.primaryKey))
    60  	binary.LittleEndian.PutUint32(buf[0:4], keyLength)
    61  	if _, err := w.Write(buf[0:4]); err != nil {
    62  		return out, err
    63  	}
    64  	written += 4
    65  
    66  	n, err = w.Write(s.primaryKey)
    67  	if err != nil {
    68  		return out, errors.Wrapf(err, "write node key")
    69  	}
    70  	written += n
    71  
    72  	for j := 0; j < int(s.secondaryIndexCount); j++ {
    73  		var secondaryKeyLength uint32
    74  		if j < len(s.secondaryKeys) {
    75  			secondaryKeyLength = uint32(len(s.secondaryKeys[j]))
    76  		}
    77  
    78  		// write the key length in any case
    79  		binary.LittleEndian.PutUint32(buf[0:4], secondaryKeyLength)
    80  		if _, err := w.Write(buf[0:4]); err != nil {
    81  			return out, err
    82  		}
    83  		written += 4
    84  
    85  		if secondaryKeyLength == 0 {
    86  			// we're done here
    87  			continue
    88  		}
    89  
    90  		// only write the key if it exists
    91  		n, err = w.Write(s.secondaryKeys[j])
    92  		if err != nil {
    93  			return out, errors.Wrapf(err, "write secondary key %d", j)
    94  		}
    95  		written += n
    96  	}
    97  
    98  	return segmentindex.Key{
    99  		ValueStart:    s.offset,
   100  		ValueEnd:      s.offset + written,
   101  		Key:           s.primaryKey,
   102  		SecondaryKeys: s.secondaryKeys,
   103  	}, nil
   104  }
   105  
   106  func ParseReplaceNode(r io.Reader, secondaryIndexCount uint16) (segmentReplaceNode, error) {
   107  	out := segmentReplaceNode{}
   108  
   109  	// 9 bytes is the most we can ever read uninterrupted, i.e. without a dynamic
   110  	// read in between.
   111  	tmpBuf := make([]byte, 9)
   112  	if n, err := io.ReadFull(r, tmpBuf); err != nil {
   113  		return out, errors.Wrap(err, "read tombstone and value length")
   114  	} else {
   115  		out.offset += n
   116  	}
   117  
   118  	out.tombstone = tmpBuf[0] == 0x1
   119  	valueLength := binary.LittleEndian.Uint64(tmpBuf[1:9])
   120  	out.value = make([]byte, valueLength)
   121  	if n, err := io.ReadFull(r, out.value); err != nil {
   122  		return out, errors.Wrap(err, "read value")
   123  	} else {
   124  		out.offset += n
   125  	}
   126  
   127  	if n, err := io.ReadFull(r, tmpBuf[0:4]); err != nil {
   128  		return out, errors.Wrap(err, "read key length encoding")
   129  	} else {
   130  		out.offset += n
   131  	}
   132  
   133  	keyLength := binary.LittleEndian.Uint32(tmpBuf[0:4])
   134  	out.primaryKey = make([]byte, keyLength)
   135  	if n, err := io.ReadFull(r, out.primaryKey); err != nil {
   136  		return out, errors.Wrap(err, "read key")
   137  	} else {
   138  		out.offset += n
   139  	}
   140  
   141  	if secondaryIndexCount > 0 {
   142  		out.secondaryKeys = make([][]byte, secondaryIndexCount)
   143  	}
   144  
   145  	for j := 0; j < int(secondaryIndexCount); j++ {
   146  		if n, err := io.ReadFull(r, tmpBuf[0:4]); err != nil {
   147  			return out, errors.Wrap(err, "read secondary key length encoding")
   148  		} else {
   149  			out.offset += n
   150  		}
   151  		secKeyLen := binary.LittleEndian.Uint32(tmpBuf[0:4])
   152  		if secKeyLen == 0 {
   153  			continue
   154  		}
   155  
   156  		out.secondaryKeys[j] = make([]byte, secKeyLen)
   157  		if n, err := io.ReadFull(r, out.secondaryKeys[j]); err != nil {
   158  			return out, errors.Wrap(err, "read secondary key")
   159  		} else {
   160  			out.offset += n
   161  		}
   162  	}
   163  
   164  	return out, nil
   165  }
   166  
   167  func ParseReplaceNodeIntoPread(r io.Reader, secondaryIndexCount uint16, out *segmentReplaceNode) (err error) {
   168  	out.offset = 0
   169  
   170  	if err := binary.Read(r, binary.LittleEndian, &out.tombstone); err != nil {
   171  		return errors.Wrap(err, "read tombstone")
   172  	}
   173  	out.offset += 1
   174  
   175  	var valueLength uint64
   176  	if err := binary.Read(r, binary.LittleEndian, &valueLength); err != nil {
   177  		return errors.Wrap(err, "read value length encoding")
   178  	}
   179  	out.offset += 8
   180  
   181  	if int(valueLength) > cap(out.value) {
   182  		out.value = make([]byte, valueLength)
   183  	} else {
   184  		out.value = out.value[:valueLength]
   185  	}
   186  
   187  	if n, err := io.ReadFull(r, out.value); err != nil {
   188  		return errors.Wrap(err, "read value")
   189  	} else {
   190  		out.offset += n
   191  	}
   192  
   193  	var keyLength uint32
   194  	if err := binary.Read(r, binary.LittleEndian, &keyLength); err != nil {
   195  		return errors.Wrap(err, "read key length encoding")
   196  	}
   197  	out.offset += 4
   198  
   199  	out.primaryKey = make([]byte, keyLength)
   200  	if n, err := io.ReadFull(r, out.primaryKey); err != nil {
   201  		return errors.Wrap(err, "read key")
   202  	} else {
   203  		out.offset += n
   204  	}
   205  
   206  	if secondaryIndexCount > 0 {
   207  		out.secondaryKeys = make([][]byte, secondaryIndexCount)
   208  	}
   209  
   210  	for j := 0; j < int(secondaryIndexCount); j++ {
   211  		var secKeyLen uint32
   212  		if err := binary.Read(r, binary.LittleEndian, &secKeyLen); err != nil {
   213  			return errors.Wrap(err, "read secondary key length encoding")
   214  		}
   215  		out.offset += 4
   216  
   217  		if secKeyLen == 0 {
   218  			continue
   219  		}
   220  
   221  		out.secondaryKeys[j] = make([]byte, secKeyLen)
   222  		if n, err := io.ReadFull(r, out.secondaryKeys[j]); err != nil {
   223  			return errors.Wrap(err, "read secondary key")
   224  		} else {
   225  			out.offset += n
   226  		}
   227  	}
   228  
   229  	return nil
   230  }
   231  
   232  func ParseReplaceNodeIntoMMAP(r *byteops.ReadWriter, secondaryIndexCount uint16, out *segmentReplaceNode) error {
   233  	out.tombstone = r.ReadUint8() == 0x01
   234  	valueLength := r.ReadUint64()
   235  
   236  	if int(valueLength) > cap(out.value) {
   237  		out.value = make([]byte, valueLength)
   238  	} else {
   239  		out.value = out.value[:valueLength]
   240  	}
   241  
   242  	if _, err := r.CopyBytesFromBuffer(valueLength, out.value); err != nil {
   243  		return err
   244  	}
   245  
   246  	// Note: In a previous version (prior to
   247  	// https://github.com/weaviate/weaviate/pull/3660) this was a copy. The
   248  	// mentioned PR optimizes the Replace Cursor which led to this now being
   249  	// shared memory. After internal review, we believe this is safe to do. The
   250  	// cursor gives no guarantees about memory after calling .next(). Before
   251  	// .next() is called, this should be safe. Nevertheless, we are leaving this
   252  	// note in case a future bug appears, as this should make this spot easier to
   253  	// find.
   254  	out.primaryKey = r.ReadBytesFromBufferWithUint32LengthIndicator()
   255  
   256  	if secondaryIndexCount > 0 {
   257  		out.secondaryKeys = make([][]byte, secondaryIndexCount)
   258  	}
   259  
   260  	for j := 0; j < int(secondaryIndexCount); j++ {
   261  		// Note: In a previous version (prior to
   262  		// https://github.com/weaviate/weaviate/pull/3660) this was a copy. The
   263  		// mentioned PR optimizes the Replace Cursor which led to this now being
   264  		// shared memory. After internal review, we believe this is safe to do. The
   265  		// cursor gives no guarantees about memory after calling .next(). Before
   266  		// .next() is called, this should be safe. Nevertheless, we are leaving this
   267  		// note in case a future bug appears, as this should make this spot easier to
   268  		// find.
   269  		out.secondaryKeys[j] = r.ReadBytesFromBufferWithUint32LengthIndicator()
   270  	}
   271  
   272  	out.offset = int(r.Position)
   273  	return nil
   274  }
   275  
// collection strategy does not support secondary keys at this time
type segmentCollectionNode struct {
	// values is the ordered list of (tombstone, payload) entries for the key
	values     []value
	// primaryKey is the key the node is indexed under
	primaryKey []byte
	// offset: on the write path this is the node's start position used to
	// compute the index key's ValueStart/ValueEnd; on the parse path it is
	// set to the number of bytes consumed
	offset     int
}
   282  
   283  func (s segmentCollectionNode) KeyIndexAndWriteTo(w io.Writer) (segmentindex.Key, error) {
   284  	out := segmentindex.Key{}
   285  	written := 0
   286  	valueLen := uint64(len(s.values))
   287  	buf := make([]byte, 9)
   288  	binary.LittleEndian.PutUint64(buf, valueLen)
   289  	if _, err := w.Write(buf[0:8]); err != nil {
   290  		return out, errors.Wrapf(err, "write values len for node")
   291  	}
   292  	written += 8
   293  
   294  	for i, value := range s.values {
   295  		if value.tombstone {
   296  			buf[0] = 0x01
   297  		} else {
   298  			buf[0] = 0x00
   299  		}
   300  
   301  		valueLen := uint64(len(value.value))
   302  		binary.LittleEndian.PutUint64(buf[1:9], valueLen)
   303  		if _, err := w.Write(buf[0:9]); err != nil {
   304  			return out, errors.Wrapf(err, "write len of value %d", i)
   305  		}
   306  		written += 9
   307  
   308  		n, err := w.Write(value.value)
   309  		if err != nil {
   310  			return out, errors.Wrapf(err, "write value %d", i)
   311  		}
   312  		written += n
   313  	}
   314  
   315  	keyLength := uint32(len(s.primaryKey))
   316  	binary.LittleEndian.PutUint32(buf[0:4], keyLength)
   317  	if _, err := w.Write(buf[0:4]); err != nil {
   318  		return out, errors.Wrapf(err, "write key length encoding for node")
   319  	}
   320  	written += 4
   321  
   322  	n, err := w.Write(s.primaryKey)
   323  	if err != nil {
   324  		return out, errors.Wrapf(err, "write node")
   325  	}
   326  	written += n
   327  
   328  	out = segmentindex.Key{
   329  		ValueStart: s.offset,
   330  		ValueEnd:   s.offset + written,
   331  		Key:        s.primaryKey,
   332  	}
   333  
   334  	return out, nil
   335  }
   336  
   337  // ParseCollectionNode reads from r and parses the collection values into a segmentCollectionNode
   338  //
   339  // When only given an offset, r is constructed as a *bufio.Reader to avoid first reading the
   340  // entire segment (could be GBs). Each consecutive read will be buffered to avoid excessive
   341  // syscalls.
   342  //
   343  // When we already have a finite and manageable []byte (i.e. when we have already seeked to an
   344  // lsmkv node and have start+end offset), r should be constructed as a *bytes.Reader, since the
   345  // contents have already been `pread` from the segment contentFile.
   346  func ParseCollectionNode(r io.Reader) (segmentCollectionNode, error) {
   347  	out := segmentCollectionNode{}
   348  	// 9 bytes is the most we can ever read uninterrupted, i.e. without a dynamic
   349  	// read in between.
   350  	tmpBuf := make([]byte, 9)
   351  
   352  	if n, err := io.ReadFull(r, tmpBuf[0:8]); err != nil {
   353  		return out, errors.Wrap(err, "read values len")
   354  	} else {
   355  		out.offset += n
   356  	}
   357  
   358  	valuesLen := binary.LittleEndian.Uint64(tmpBuf[0:8])
   359  	out.values = make([]value, valuesLen)
   360  	for i := range out.values {
   361  		if n, err := io.ReadFull(r, tmpBuf[0:9]); err != nil {
   362  			return out, errors.Wrap(err, "read value tombstone and len")
   363  		} else {
   364  			out.offset += n
   365  		}
   366  		out.values[i].tombstone = tmpBuf[0] == 0x1
   367  		valueLen := binary.LittleEndian.Uint64(tmpBuf[1:9])
   368  		out.values[i].value = make([]byte, valueLen)
   369  		n, err := io.ReadFull(r, out.values[i].value)
   370  		if err != nil {
   371  			return out, errors.Wrap(err, "read value")
   372  		}
   373  		out.offset += n
   374  	}
   375  
   376  	if n, err := io.ReadFull(r, tmpBuf[0:4]); err != nil {
   377  		return out, errors.Wrap(err, "read key len")
   378  	} else {
   379  		out.offset += n
   380  	}
   381  	keyLen := binary.LittleEndian.Uint32(tmpBuf[0:4])
   382  	out.primaryKey = make([]byte, keyLen)
   383  	n, err := io.ReadFull(r, out.primaryKey)
   384  	if err != nil {
   385  		return out, errors.Wrap(err, "read key")
   386  	}
   387  	out.offset += n
   388  
   389  	return out, nil
   390  }
   391  
   392  // ParseCollectionNodeInto takes the []byte slice and parses it into the
   393  // specified node. It does not perform any copies and the caller must be aware
   394  // that memory may be shared between the two. As a result, the caller must make
   395  // sure that they do not modify "in" while "node" is still in use. A safer
   396  // alternative is to use ParseCollectionNode.
   397  //
   398  // The primary intention of this function is to provide a way to reuse buffers
   399  // when the lifetime is controlled tightly, for example in cursors used within
   400  // compactions. Use at your own risk!
   401  //
   402  // If the buffers of the provided node have enough capacity they will be
   403  // reused. Only if the capacity is not enough, will an allocation occur. This
   404  // allocation uses 25% overhead to avoid future allocations for nodes of
   405  // similar size.
   406  //
   407  // As a result calling this method only makes sense if you plan on calling it
   408  // multiple times. Calling it just once on an uninitialized node does not have
   409  // major advantages over calling ParseCollectionNode.
   410  func ParseCollectionNodeInto(r io.Reader, node *segmentCollectionNode) error {
   411  	// offset is only the local offset relative to "in". In the end we need to
   412  	// update the global offset.
   413  	offset := 0
   414  
   415  	buf := make([]byte, 9)
   416  	_, err := io.ReadFull(r, buf[0:8])
   417  	if err != nil {
   418  		return fmt.Errorf("read values len: %w", err)
   419  	}
   420  
   421  	valuesLen := binary.LittleEndian.Uint64(buf[0:8])
   422  	offset += 8
   423  
   424  	resizeValuesOfCollectionNode(node, valuesLen)
   425  	for i := range node.values {
   426  		_, err = io.ReadFull(r, buf)
   427  		if err != nil {
   428  			return fmt.Errorf("read values len: %w", err)
   429  		}
   430  
   431  		node.values[i].tombstone = buf[0] == 0x1
   432  		offset += 1
   433  
   434  		valueLen := binary.LittleEndian.Uint64(buf[1:9])
   435  		offset += 8
   436  
   437  		resizeValueOfCollectionNodeAtPos(node, i, valueLen)
   438  
   439  		_, err = io.ReadFull(r, node.values[i].value)
   440  		if err != nil {
   441  			return fmt.Errorf("read node value: %w", err)
   442  		}
   443  
   444  		offset += int(valueLen)
   445  	}
   446  
   447  	_, err = io.ReadFull(r, buf[0:4])
   448  	if err != nil {
   449  		return fmt.Errorf("read values len: %w", err)
   450  	}
   451  	keyLen := binary.LittleEndian.Uint32(buf)
   452  	offset += 4
   453  
   454  	resizeKeyOfCollectionNode(node, keyLen)
   455  	_, err = io.ReadFull(r, node.primaryKey)
   456  	if err != nil {
   457  		return fmt.Errorf("read primary key: %w", err)
   458  	}
   459  	offset += int(keyLen)
   460  
   461  	node.offset = offset
   462  	return nil
   463  }
   464  
   465  func resizeValuesOfCollectionNode(node *segmentCollectionNode, size uint64) {
   466  	if cap(node.values) >= int(size) {
   467  		node.values = node.values[:size]
   468  	} else {
   469  		// Allocate with 25% overhead to reduce chance of having to do multiple
   470  		// allocations sequentially.
   471  		node.values = make([]value, size, int(float64(size)*1.25))
   472  	}
   473  }
   474  
   475  func resizeValueOfCollectionNodeAtPos(node *segmentCollectionNode, pos int,
   476  	size uint64,
   477  ) {
   478  	if cap(node.values[pos].value) >= int(size) {
   479  		node.values[pos].value = node.values[pos].value[:size]
   480  	} else {
   481  		// Allocate with 25% overhead to reduce chance of having to do multiple
   482  		// allocations sequentially.
   483  		node.values[pos].value = make([]byte, size, int(float64(size)*1.25))
   484  	}
   485  }
   486  
   487  func resizeKeyOfCollectionNode(node *segmentCollectionNode, size uint32) {
   488  	if cap(node.primaryKey) >= int(size) {
   489  		node.primaryKey = node.primaryKey[:size]
   490  	} else {
   491  		// Allocate with 25% overhead to reduce chance of having to do multiple
   492  		// allocations sequentially.
   493  		node.primaryKey = make([]byte, size, int(float64(size)*1.25))
   494  	}
   495  }