github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/segment_key_and_tombstone_extractor.go (about) 1 // _ _ 2 // __ _____ __ ___ ___ __ _| |_ ___ 3 // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ 4 // \ V V / __/ (_| |\ V /| | (_| | || __/ 5 // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| 6 // 7 // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. 8 // 9 // CONTACT: hello@weaviate.io 10 // 11 12 package lsmkv 13 14 import ( 15 "encoding/binary" 16 ) 17 18 // bufferedKeyAndTombstoneExtractor is a tool to build up the count stats for 19 // disk segments (i.e. all the keys in this segment as well as whether they 20 // contain a tombstone or not). It tries to be relatively memory-efficient 21 // while doing a whole-segment disk scan. It uses a primitive []byte buffer 22 // for its output which needs to be allocated just once. It can only read until 23 // the buffer is full, then it needs to call a callback fn which can do 24 // something with the data. After the callback function has been called on each 25 // key, the output buffer is reset. If the input segment it not at EOF yet, 26 // this cycle repeats 27 type bufferedKeyAndTombstoneExtractor struct { 28 outputBuffer []byte 29 outputBufferOffset uint64 30 offset uint64 31 end uint64 32 rawSegment []byte 33 secondaryIndexCount uint16 34 callback keyAndTombstoneCallbackFn 35 callbackCycle int 36 } 37 38 type keyAndTombstoneCallbackFn func(key []byte, tombstone bool) 39 40 func newBufferedKeyAndTombstoneExtractor(rawSegment []byte, initialOffset uint64, 41 end uint64, outputBufferSize uint64, secondaryIndexCount uint16, 42 callback keyAndTombstoneCallbackFn, 43 ) *bufferedKeyAndTombstoneExtractor { 44 return &bufferedKeyAndTombstoneExtractor{ 45 rawSegment: rawSegment, 46 offset: initialOffset, 47 end: end, 48 outputBuffer: make([]byte, outputBufferSize), 49 outputBufferOffset: 0, 50 secondaryIndexCount: secondaryIndexCount, 51 callback: callback, 52 } 53 } 54 55 func (e *bufferedKeyAndTombstoneExtractor) do() { 56 for { 57 if e.offset >= e.end { 58 break 59 } 60 61 // returns false if the output buffer ran full 62 ok := e.readSingleEntry() 63 if !ok { 64 e.flushAndCallback() 65 } 66 } 67 68 // one final callback 69 e.flushAndCallback() 70 } 71 72 // returns true if the cycle completed, returns false if the cycle did not 73 // complete because the output buffer was full. In that case, the offsets have 74 // been reset to the values they had at the beginning of the cycle 75 func (e *bufferedKeyAndTombstoneExtractor) readSingleEntry() bool { 76 // if we discover during an iteration that the next entry can't fit in the 77 // buffer anymore, we must return to the start of this iteration, so that 78 // the this work can be picked up here once the buffer has been flushed 79 offsetAtLoopStart := e.offset 80 outputOffsetAtLoopStart := e.outputBufferOffset 81 82 // the first output size check is static, as we will always read 5 bytes, 83 // no matter what. If they can't even fit, we can abort right away 84 if !e.outputBufferCanFit(5) { 85 e.offset = offsetAtLoopStart 86 e.outputBufferOffset = outputOffsetAtLoopStart 87 return false 88 } 89 90 // copy tombstone value into output buffer 91 e.outputBuffer[e.outputBufferOffset] = e.rawSegment[e.offset] 92 e.offset++ 93 e.outputBufferOffset++ 94 95 valueLen := binary.LittleEndian.Uint64(e.rawSegment[e.offset : e.offset+8]) 96 e.offset += 8 97 98 // we're not actually interested in the value, so we can skip it entirely 99 e.offset += valueLen 100 101 primaryKeyLen := binary.LittleEndian.Uint32(e.rawSegment[e.offset : e.offset+4]) 102 if !e.outputBufferCanFit(uint64(primaryKeyLen) + 4) { 103 e.offset = offsetAtLoopStart 104 e.outputBufferOffset = outputOffsetAtLoopStart 105 return false 106 } 107 108 // copy the primary key len indicator into the output buffer 109 copy(e.outputBuffer[e.outputBufferOffset:e.outputBufferOffset+4], 110 e.rawSegment[e.offset:e.offset+4]) 111 e.offset += 4 112 e.outputBufferOffset += 4 113 114 // then copy the key itself 115 copy(e.outputBuffer[e.outputBufferOffset:e.outputBufferOffset+uint64(primaryKeyLen)], e.rawSegment[e.offset:e.offset+uint64(primaryKeyLen)]) 116 e.offset += uint64(primaryKeyLen) 117 e.outputBufferOffset += uint64(primaryKeyLen) 118 119 for i := uint16(0); i < e.secondaryIndexCount; i++ { 120 secKeyLen := binary.LittleEndian.Uint32(e.rawSegment[e.offset : e.offset+4]) 121 e.offset += 4 122 e.offset += uint64(secKeyLen) 123 } 124 125 return true 126 } 127 128 func (e *bufferedKeyAndTombstoneExtractor) outputBufferCanFit(size uint64) bool { 129 return (uint64(len(e.outputBuffer)) - e.outputBufferOffset) >= size 130 } 131 132 // flushAndCallback calls the callback fn for each key/tombstone pair in the 133 // buffer, then resets the buffer offset, making it ready to be overwritten in 134 // the next cycle 135 func (e *bufferedKeyAndTombstoneExtractor) flushAndCallback() { 136 end := e.outputBufferOffset 137 e.outputBufferOffset = 0 138 for e.outputBufferOffset < end { 139 var tombstone bool 140 if e.outputBuffer[e.outputBufferOffset] == 0x01 { 141 tombstone = true 142 } 143 144 e.outputBufferOffset++ 145 146 primaryKeyLen := binary.LittleEndian.Uint32(e.outputBuffer[e.outputBufferOffset : e.outputBufferOffset+4]) 147 148 e.outputBufferOffset += 4 149 150 e.callback(e.outputBuffer[e.outputBufferOffset:e.outputBufferOffset+uint64(primaryKeyLen)], 151 tombstone) 152 e.outputBufferOffset += uint64(primaryKeyLen) 153 } 154 155 // reset outputBufferOffset for next batch 156 e.outputBufferOffset = 0 157 158 e.callbackCycle++ 159 }