github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/segment_key_and_tombstone_extractor.go (about)

     1  //                           _       _
     2  // __      _____  __ ___   ___  __ _| |_ ___
     3  // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
     4  //  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
     5  //   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
     6  //
     7  //  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
     8  //
     9  //  CONTACT: hello@weaviate.io
    10  //
    11  
    12  package lsmkv
    13  
    14  import (
    15  	"encoding/binary"
    16  )
    17  
// bufferedKeyAndTombstoneExtractor is a tool to build up the count stats for
// disk segments (i.e. all the keys in this segment as well as whether they
// contain a tombstone or not). It tries to be relatively memory-efficient
// while doing a whole-segment disk scan. It uses a primitive []byte buffer
// for its output which needs to be allocated just once. It can only read until
// the buffer is full, then it needs to call a callback fn which can do
// something with the data. After the callback function has been called on each
// key, the output buffer is reset. If the input segment is not at EOF yet,
// this cycle repeats.
type bufferedKeyAndTombstoneExtractor struct {
	// outputBuffer holds the extracted entries back to back. Each entry is
	// stored as 1 tombstone byte, a 4-byte little-endian primary key length
	// and the primary key bytes.
	outputBuffer []byte

	// outputBufferOffset is the write position in outputBuffer while entries
	// are being read, and the read position while the buffer is flushed to
	// the callback.
	outputBufferOffset uint64

	// offset is the current read position in rawSegment.
	offset uint64

	// end is the position in rawSegment at which the scan stops; reading
	// continues only while offset < end.
	end uint64

	// rawSegment is the input the entries are read from.
	rawSegment []byte

	// secondaryIndexCount is the number of length-prefixed secondary keys
	// that follow each primary key and are skipped while reading.
	secondaryIndexCount uint16

	// callback is invoked for every key/tombstone pair when the output
	// buffer is flushed.
	callback keyAndTombstoneCallbackFn

	// callbackCycle counts how many times the output buffer has been
	// flushed.
	callbackCycle int
}
    37  
// keyAndTombstoneCallbackFn is invoked once per extracted entry with the
// entry's primary key and whether its tombstone byte was set (0x01). The key
// slice points into the extractor's reusable output buffer, which is
// overwritten by later read cycles, so implementations must copy the key if
// they need to keep it beyond the call.
type keyAndTombstoneCallbackFn func(key []byte, tombstone bool)
    39  
    40  func newBufferedKeyAndTombstoneExtractor(rawSegment []byte, initialOffset uint64,
    41  	end uint64, outputBufferSize uint64, secondaryIndexCount uint16,
    42  	callback keyAndTombstoneCallbackFn,
    43  ) *bufferedKeyAndTombstoneExtractor {
    44  	return &bufferedKeyAndTombstoneExtractor{
    45  		rawSegment:          rawSegment,
    46  		offset:              initialOffset,
    47  		end:                 end,
    48  		outputBuffer:        make([]byte, outputBufferSize),
    49  		outputBufferOffset:  0,
    50  		secondaryIndexCount: secondaryIndexCount,
    51  		callback:            callback,
    52  	}
    53  }
    54  
    55  func (e *bufferedKeyAndTombstoneExtractor) do() {
    56  	for {
    57  		if e.offset >= e.end {
    58  			break
    59  		}
    60  
    61  		// returns false if the output buffer ran full
    62  		ok := e.readSingleEntry()
    63  		if !ok {
    64  			e.flushAndCallback()
    65  		}
    66  	}
    67  
    68  	// one final callback
    69  	e.flushAndCallback()
    70  }
    71  
    72  // returns true if the cycle completed, returns false if the cycle did not
    73  // complete because the output buffer was full. In that case, the offsets have
    74  // been reset to the values they had at the beginning of the cycle
    75  func (e *bufferedKeyAndTombstoneExtractor) readSingleEntry() bool {
    76  	// if we discover during an iteration that the next entry can't fit in the
    77  	// buffer anymore, we must return to the start of this iteration, so that
    78  	// the this work can be picked up here once the buffer has been flushed
    79  	offsetAtLoopStart := e.offset
    80  	outputOffsetAtLoopStart := e.outputBufferOffset
    81  
    82  	// the first output size check is static, as we will always read 5 bytes,
    83  	// no matter what. If they can't even fit, we can abort right away
    84  	if !e.outputBufferCanFit(5) {
    85  		e.offset = offsetAtLoopStart
    86  		e.outputBufferOffset = outputOffsetAtLoopStart
    87  		return false
    88  	}
    89  
    90  	// copy tombstone value into output buffer
    91  	e.outputBuffer[e.outputBufferOffset] = e.rawSegment[e.offset]
    92  	e.offset++
    93  	e.outputBufferOffset++
    94  
    95  	valueLen := binary.LittleEndian.Uint64(e.rawSegment[e.offset : e.offset+8])
    96  	e.offset += 8
    97  
    98  	// we're not actually interested in the value, so we can skip it entirely
    99  	e.offset += valueLen
   100  
   101  	primaryKeyLen := binary.LittleEndian.Uint32(e.rawSegment[e.offset : e.offset+4])
   102  	if !e.outputBufferCanFit(uint64(primaryKeyLen) + 4) {
   103  		e.offset = offsetAtLoopStart
   104  		e.outputBufferOffset = outputOffsetAtLoopStart
   105  		return false
   106  	}
   107  
   108  	// copy the primary key len indicator into the output buffer
   109  	copy(e.outputBuffer[e.outputBufferOffset:e.outputBufferOffset+4],
   110  		e.rawSegment[e.offset:e.offset+4])
   111  	e.offset += 4
   112  	e.outputBufferOffset += 4
   113  
   114  	// then copy the key itself
   115  	copy(e.outputBuffer[e.outputBufferOffset:e.outputBufferOffset+uint64(primaryKeyLen)], e.rawSegment[e.offset:e.offset+uint64(primaryKeyLen)])
   116  	e.offset += uint64(primaryKeyLen)
   117  	e.outputBufferOffset += uint64(primaryKeyLen)
   118  
   119  	for i := uint16(0); i < e.secondaryIndexCount; i++ {
   120  		secKeyLen := binary.LittleEndian.Uint32(e.rawSegment[e.offset : e.offset+4])
   121  		e.offset += 4
   122  		e.offset += uint64(secKeyLen)
   123  	}
   124  
   125  	return true
   126  }
   127  
   128  func (e *bufferedKeyAndTombstoneExtractor) outputBufferCanFit(size uint64) bool {
   129  	return (uint64(len(e.outputBuffer)) - e.outputBufferOffset) >= size
   130  }
   131  
   132  // flushAndCallback calls the callback fn for each key/tombstone pair in the
   133  // buffer, then resets the buffer offset, making it ready to be overwritten in
   134  // the next cycle
   135  func (e *bufferedKeyAndTombstoneExtractor) flushAndCallback() {
   136  	end := e.outputBufferOffset
   137  	e.outputBufferOffset = 0
   138  	for e.outputBufferOffset < end {
   139  		var tombstone bool
   140  		if e.outputBuffer[e.outputBufferOffset] == 0x01 {
   141  			tombstone = true
   142  		}
   143  
   144  		e.outputBufferOffset++
   145  
   146  		primaryKeyLen := binary.LittleEndian.Uint32(e.outputBuffer[e.outputBufferOffset : e.outputBufferOffset+4])
   147  
   148  		e.outputBufferOffset += 4
   149  
   150  		e.callback(e.outputBuffer[e.outputBufferOffset:e.outputBufferOffset+uint64(primaryKeyLen)],
   151  			tombstone)
   152  		e.outputBufferOffset += uint64(primaryKeyLen)
   153  	}
   154  
   155  	// reset outputBufferOffset for next batch
   156  	e.outputBufferOffset = 0
   157  
   158  	e.callbackCycle++
   159  }