github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/compactor_map.go

//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

package lsmkv

import (
	"bufio"
	"bytes"
	"io"
	"sort"

	"github.com/pkg/errors"
	"github.com/weaviate/weaviate/adapters/repos/db/lsmkv/segmentindex"
)

type compactorMap struct {
	// c1 is always the older segment, so when there is a conflict c2 wins
	// (because of the replace strategy)
	c1 *segmentCursorCollectionReusable
	c2 *segmentCursorCollectionReusable

	// the level matching those of the cursors
	currentLevel        uint16
	secondaryIndexCount uint16
	// Tells whether tombstones or keys without corresponding values can be
	// removed from the merged segment.
	// (the left segment is the root (1st) one, keepTombstones is off for the bucket)
	cleanupTombstones bool

	w    io.WriteSeeker
	bufw *bufio.Writer

	scratchSpacePath string

	// for backward-compatibility with states where the disk state for maps was
	// not guaranteed to be sorted yet
	requiresSorting bool
}

func newCompactorMapCollection(w io.WriteSeeker,
	c1, c2 *segmentCursorCollectionReusable, level, secondaryIndexCount uint16,
	scratchSpacePath string, requiresSorting bool, cleanupTombstones bool,
) *compactorMap {
	return &compactorMap{
		c1:                  c1,
		c2:                  c2,
		w:                   w,
		bufw:                bufio.NewWriterSize(w, 256*1024),
		currentLevel:        level,
		cleanupTombstones:   cleanupTombstones,
		secondaryIndexCount: secondaryIndexCount,
		scratchSpacePath:    scratchSpacePath,
		requiresSorting:     requiresSorting,
	}
}

func (c *compactorMap) do() error {
	if err := c.init(); err != nil {
		return errors.Wrap(err, "init")
	}

	kis, err := c.writeKeys()
	if err != nil {
		return errors.Wrap(err, "write keys")
	}

	if err := c.writeIndices(kis); err != nil {
		return errors.Wrap(err, "write index")
	}

	// flush buffered, so we can safely seek on underlying writer
	if err := c.bufw.Flush(); err != nil {
		return errors.Wrap(err, "flush buffered")
	}

	var dataEnd uint64 = segmentindex.HeaderSize
	if len(kis) > 0 {
		dataEnd = uint64(kis[len(kis)-1].ValueEnd)
	}

	if err := c.writeHeader(c.currentLevel, 0, c.secondaryIndexCount,
		dataEnd); err != nil {
		return errors.Wrap(err, "write header")
	}

	return nil
}

func (c *compactorMap) init() error {
	// write a dummy header, we don't know the contents of the actual header yet,
	// we will seek to the beginning and overwrite the actual header at the very
	// end

	if _, err := c.bufw.Write(make([]byte, segmentindex.HeaderSize)); err != nil {
		return errors.Wrap(err, "write empty header")
	}

	return nil
}

func (c *compactorMap) writeKeys() ([]segmentindex.Key, error) {
	key1, value1, _ := c.c1.first()
	key2, value2, _ := c.c2.first()

	// the (dummy) header was already written, this is our initial offset
	offset := segmentindex.HeaderSize

	var kis []segmentindex.Key
	pairs := newReusableMapPairs()
	me := newMapEncoder()
	ssm := newSortedMapMerger()

	for {
		if key1 == nil && key2 == nil {
			break
		}
		if bytes.Equal(key1, key2) {
			pairs.ResizeLeft(len(value1))
			pairs.ResizeRight(len(value2))
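
			// Both segments contain this key: decode the raw stored values into
			// the reusable MapPair buffers (left = older segment c1, right =
			// newer segment c2) so the two sides can be merged pair by pair below.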
			for i, v := range value1 {
				if err := pairs.left[i].FromBytes(v.value, false); err != nil {
					return nil, err
				}
				pairs.left[i].Tombstone = v.tombstone
			}

			for i, v := range value2 {
				if err := pairs.right[i].FromBytes(v.value, false); err != nil {
					return nil, err
				}
				pairs.right[i].Tombstone = v.tombstone
			}

			if c.requiresSorting {
				sort.Slice(pairs.left, func(a, b int) bool {
					return bytes.Compare(pairs.left[a].Key, pairs.left[b].Key) < 0
				})
				sort.Slice(pairs.right, func(a, b int) bool {
					return bytes.Compare(pairs.right[a].Key, pairs.right[b].Key) < 0
				})
			}

			ssm.reset([][]MapPair{pairs.left, pairs.right})
			mergedPairs, err := ssm.doKeepTombstonesReusable()
			if err != nil {
				return nil, err
			}

			mergedEncoded, err := me.DoMultiReusable(mergedPairs)
			if err != nil {
				return nil, err
			}

			if values, skip := c.cleanupValues(mergedEncoded); !skip {
				ki, err := c.writeIndividualNode(offset, key2, values)
				if err != nil {
					return nil, errors.Wrap(err, "write individual node (equal keys)")
				}

				offset = ki.ValueEnd
				kis = append(kis, ki)
			}
			// advance both!
			key1, value1, _ = c.c1.next()
			key2, value2, _ = c.c2.next()
			continue
		}

		if (key1 != nil && bytes.Compare(key1, key2) == -1) || key2 == nil {
			// key 1 is smaller
			if values, skip := c.cleanupValues(value1); !skip {
				ki, err := c.writeIndividualNode(offset, key1, values)
				if err != nil {
					return nil, errors.Wrap(err, "write individual node (key1 smaller)")
				}

				offset = ki.ValueEnd
				kis = append(kis, ki)
			}
			key1, value1, _ = c.c1.next()
		} else {
			// key 2 is smaller
			if values, skip := c.cleanupValues(value2); !skip {
				ki, err := c.writeIndividualNode(offset, key2, values)
				if err != nil {
					return nil, errors.Wrap(err, "write individual node (key2 smaller)")
				}

				offset = ki.ValueEnd
				kis = append(kis, ki)
			}
			key2, value2, _ = c.c2.next()
		}
	}

	return kis, nil
}

func (c *compactorMap) writeIndividualNode(offset int, key []byte,
	values []value,
) (segmentindex.Key, error) {
	// NOTE: There are no guarantees in the cursor logic that any memory is valid
	// for more than a single iteration. Every time you call next() to advance
	// the cursor, any memory might be reused.
	//
	// This includes the key buffer, which was the cause of
	// https://github.com/weaviate/weaviate/issues/3517
	//
	// The previous logic happened to create a new assignment in each iteration,
	// but that was never an explicit guarantee. A change in v1.21 (for
	// pread/mmap) added a reusable buffer for the key, which surfaced this bug.
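	//
	// Therefore, copy the key into freshly allocated memory so the index entry
	// written below remains valid after the cursor reuses its buffer.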
	keyCopy := make([]byte, len(key))
	copy(keyCopy, key)

	return segmentCollectionNode{
		values:     values,
		primaryKey: keyCopy,
		offset:     offset,
	}.KeyIndexAndWriteTo(c.bufw)
}

func (c *compactorMap) writeIndices(keys []segmentindex.Key) error {
	indices := segmentindex.Indexes{
		Keys:                keys,
		SecondaryIndexCount: c.secondaryIndexCount,
		ScratchSpacePath:    c.scratchSpacePath,
	}

	_, err := indices.WriteTo(c.bufw)
	return err
}

// writeHeader assumes that everything has been written to the underlying
// writer and it is now safe to seek to the beginning and override the initial
// header
func (c *compactorMap) writeHeader(level, version, secondaryIndices uint16,
	startOfIndex uint64,
) error {
	if _, err := c.w.Seek(0, io.SeekStart); err != nil {
		return errors.Wrap(err, "seek to beginning to write header")
	}

	h := &segmentindex.Header{
		Level:            level,
		Version:          version,
		SecondaryIndices: secondaryIndices,
		Strategy:         segmentindex.StrategyMapCollection,
		IndexStart:       startOfIndex,
	}

	if _, err := h.WriteTo(c.w); err != nil {
		return err
	}

	return nil
}

// cleanupValues removes values with the tombstone flag set from the input
// slice. The output slice may be smaller than the input one. A returned skip
// of true means no values are left, so the key can be omitted from the segment.
// WARN: the method can alter the input slice by swapping its elements and
// reducing its length (but not its capacity).
func (c *compactorMap) cleanupValues(values []value) (vals []value, skip bool) {
	if !c.cleanupTombstones {
		return values, false
	}

	// Reuse the input slice to avoid allocating new memory: rearrange it so
	// that tombstoned values are moved to the end, then reduce its length.
	last := 0
	for i := 0; i < len(values); i++ {
		if !values[i].tombstone {
			// Swap both elements instead of overwriting `last` with `i`.
			// Overwriting would result in `values[last].value` pointing to the
			// same slice as `values[i].value`. If the `values` slice is reused by
			// multiple nodes (as happens for map cursors, where
			// `segmentCursorCollectionReusable` uses `segmentCollectionNode` as a
			// buffer), populating `values[i].value` would then also overwrite
			// `values[last].value`. Swapping makes sure `values[i].value` and
			// `values[last].value` point to different slices.
			values[last], values[i] = values[i], values[last]
			last++
		}
	}

	if last == 0 {
		return nil, true
	}
	return values[:last], false
}
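
// For illustration (assuming cleanupTombstones is enabled): an input of
// [a, b(tombstone), c] is compacted in place by cleanupValues to [a, c] with
// skip=false, while an input where every element carries a tombstone returns
// (nil, true), so the whole key is dropped from the merged segment.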