github.com/weaviate/weaviate@v1.24.6/adapters/repos/db/lsmkv/segment_group.go

//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

package lsmkv

import (
	"context"
	"errors"
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"strings"
	"sync"

	"github.com/sirupsen/logrus"
	"github.com/weaviate/weaviate/adapters/repos/db/roaringset"
	"github.com/weaviate/weaviate/entities/cyclemanager"
	"github.com/weaviate/weaviate/entities/lsmkv"
	"github.com/weaviate/weaviate/entities/storagestate"
)

type SegmentGroup struct {
	segments []*segment

	// Lock() for changing the currently active segments, RLock() for normal
	// operation
	maintenanceLock sync.RWMutex
	dir             string

	strategy string

	compactionCallbackCtrl cyclemanager.CycleCallbackCtrl

	logger logrus.FieldLogger

	// for backward-compatibility with states where the disk state for maps was
	// not guaranteed to be sorted yet
	mapRequiresSorting bool

	status     storagestate.Status
	statusLock sync.Mutex
	metrics    *Metrics

	// all "replace" buckets support counting through net additions, but not all
	// produce a meaningful count. Typically, the only count we're interested in
	// is that of the bucket that holds objects
	monitorCount bool

	mmapContents            bool
	keepTombstones          bool // see bucket for more details
	useBloomFilter          bool // see bucket for more details
	calcCountNetAdditions   bool // see bucket for more details
	compactLeftOverSegments bool // see bucket for more details
}
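
// The read/write split on maintenanceLock in practice: lookups only take an
// RLock and walk the current segment slice, while anything that swaps the
// slice itself (e.g. adding a freshly flushed segment, or shutdown) takes the
// full Lock. A minimal, hypothetical reader following that convention:
//
//	sg.maintenanceLock.RLock()
//	defer sg.maintenanceLock.RUnlock()
//	for i := len(sg.segments) - 1; i >= 0; i-- {
//		// read from sg.segments[i]; never mutate the slice here
//	}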

type sgConfig struct {
	dir                   string
	strategy              string
	mapRequiresSorting    bool
	monitorCount          bool
	mmapContents          bool
	keepTombstones        bool
	useBloomFilter        bool
	calcCountNetAdditions bool
	forceCompaction       bool
}

func newSegmentGroup(logger logrus.FieldLogger, metrics *Metrics,
	compactionCallbacks cyclemanager.CycleCallbackGroup, cfg sgConfig,
) (*SegmentGroup, error) {
	list, err := os.ReadDir(cfg.dir)
	if err != nil {
		return nil, err
	}

	sg := &SegmentGroup{
		segments:                make([]*segment, len(list)),
		dir:                     cfg.dir,
		logger:                  logger,
		metrics:                 metrics,
		monitorCount:            cfg.monitorCount,
		mapRequiresSorting:      cfg.mapRequiresSorting,
		strategy:                cfg.strategy,
		mmapContents:            cfg.mmapContents,
		keepTombstones:          cfg.keepTombstones,
		useBloomFilter:          cfg.useBloomFilter,
		calcCountNetAdditions:   cfg.calcCountNetAdditions,
		compactLeftOverSegments: cfg.forceCompaction,
	}

	segmentIndex := 0

	segmentsAlreadyRecoveredFromCompaction := make(map[string]struct{})

	// Note: it's important to process the compacted segments first.
	// TODO: a single iteration may be possible

	for _, entry := range list {
		if filepath.Ext(entry.Name()) != ".tmp" {
			continue
		}

		potentialCompactedSegmentFileName := strings.TrimSuffix(entry.Name(), ".tmp")

		if filepath.Ext(potentialCompactedSegmentFileName) != ".db" {
			// another kind of temporary file; ignore it at this point, but it may need to be deleted...
			continue
		}

		jointSegments := segmentID(potentialCompactedSegmentFileName)
		jointSegmentsIDs := strings.Split(jointSegments, "_")

		if len(jointSegmentsIDs) != 2 {
			return nil, fmt.Errorf("invalid compacted segment file name %q", entry.Name())
		}

		leftSegmentFilename := fmt.Sprintf("segment-%s.db", jointSegmentsIDs[0])
		rightSegmentFilename := fmt.Sprintf("segment-%s.db", jointSegmentsIDs[1])

		leftSegmentPath := filepath.Join(sg.dir, leftSegmentFilename)
		rightSegmentPath := filepath.Join(sg.dir, rightSegmentFilename)

		leftSegmentFound, err := fileExists(leftSegmentPath)
		if err != nil {
			return nil, fmt.Errorf("check for presence of segment %s: %w", leftSegmentFilename, err)
		}

		rightSegmentFound, err := fileExists(rightSegmentPath)
		if err != nil {
			return nil, fmt.Errorf("check for presence of segment %s: %w", rightSegmentFilename, err)
		}

		if leftSegmentFound && rightSegmentFound {
			if err := os.Remove(filepath.Join(sg.dir, entry.Name())); err != nil {
				return nil, fmt.Errorf("delete partially compacted segment %q: %w", entry.Name(), err)
			}
			continue
		}

		if leftSegmentFound && !rightSegmentFound {
			return nil, fmt.Errorf("missing right segment %q", rightSegmentFilename)
		}

		if !leftSegmentFound && rightSegmentFound {
			rightSegment, err := newSegment(rightSegmentPath, logger,
				metrics, sg.makeExistsOnLower(segmentIndex),
				sg.mmapContents, sg.useBloomFilter, sg.calcCountNetAdditions, true)
			if err != nil {
				return nil, fmt.Errorf("init segment %s: %w", rightSegmentFilename, err)
			}

			err = rightSegment.drop()
			if err != nil {
				return nil, fmt.Errorf("delete already compacted right segment %s: %w", rightSegmentFilename, err)
			}
		}

		if err := os.Rename(filepath.Join(sg.dir, entry.Name()), rightSegmentPath); err != nil {
			return nil, fmt.Errorf("rename compacted segment file %q as %q: %w", entry.Name(), rightSegmentFilename, err)
		}

		segment, err := newSegment(rightSegmentPath, logger,
			metrics, sg.makeExistsOnLower(segmentIndex),
			sg.mmapContents, sg.useBloomFilter, sg.calcCountNetAdditions, true)
		if err != nil {
			return nil, fmt.Errorf("init segment %s: %w", rightSegmentFilename, err)
		}

		sg.segments[segmentIndex] = segment
		segmentIndex++

		segmentsAlreadyRecoveredFromCompaction[rightSegmentFilename] = struct{}{}
	}
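
	// To illustrate the pass above, assuming segmentID (defined elsewhere in
	// this package) strips the "segment-" prefix and the ".db" extension: a
	// compaction of segment-17.db and segment-18.db is first written to
	// segment-17_18.db.tmp. On startup this pass then decides:
	//
	//	both originals still present -> the .tmp is incomplete, delete it
	//	only the left one present    -> invalid state, fail with an error
	//	only the right one present   -> the right original is a compaction
	//	                                leftover; drop it, then rename the
	//	                                .tmp to segment-18.db and mount it
	//	neither original present     -> rename the .tmp to segment-18.db
	//	                                and mount it
	//
	// The second pass below mounts the remaining, fully written ".db" segments.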

	for _, entry := range list {
		if filepath.Ext(entry.Name()) != ".db" {
			// skip, this could be a commit log, etc.
			continue
		}

		_, alreadyRecoveredFromCompaction := segmentsAlreadyRecoveredFromCompaction[entry.Name()]
		if alreadyRecoveredFromCompaction {
			// the .db file was already removed and restored from a compacted segment
			continue
		}

		// before we can mount this file, we need to check if a WAL exists for it.
		// If yes, we must assume that the flush never finished, as otherwise the
		// WAL would have been deleted. Thus we must remove it.
		walFileName := strings.TrimSuffix(entry.Name(), ".db") + ".wal"
		ok, err := fileExists(filepath.Join(sg.dir, walFileName))
		if err != nil {
			return nil, fmt.Errorf("check for presence of wals for segment %s: %w",
				entry.Name(), err)
		}
		if ok {
			// the segment will be recovered from the WAL
			err := os.Remove(filepath.Join(sg.dir, entry.Name()))
			if err != nil {
				return nil, fmt.Errorf("delete partially written segment %s: %w", entry.Name(), err)
			}

			logger.WithField("action", "lsm_segment_init").
				WithField("path", filepath.Join(sg.dir, entry.Name())).
				WithField("wal_path", walFileName).
				Info("Discarded (partially written) LSM segment, because an active WAL for " +
					"the same segment was found. A recovery from the WAL will follow.")

			continue
		}

		segment, err := newSegment(filepath.Join(sg.dir, entry.Name()), logger,
			metrics, sg.makeExistsOnLower(segmentIndex),
			sg.mmapContents, sg.useBloomFilter, sg.calcCountNetAdditions, false)
		if err != nil {
			return nil, fmt.Errorf("init segment %s: %w", entry.Name(), err)
		}

		sg.segments[segmentIndex] = segment
		segmentIndex++
	}

	sg.segments = sg.segments[:segmentIndex]

	if sg.monitorCount {
		sg.metrics.ObjectCount(sg.count())
	}

	id := "segmentgroup/compaction/" + sg.dir
	sg.compactionCallbackCtrl = compactionCallbacks.Register(id, sg.compactIfLevelsMatch)

	return sg, nil
}
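
// makeExistsOnLower below builds the existsOnLowerSegmentsFn each segment
// receives at init time: for the segment at index i it answers whether a key
// is already present in any of the older segments (indices 0..i-1). A minimal,
// hypothetical use:
//
//	existsFn := sg.makeExistsOnLower(2)
//	ok, err := existsFn([]byte("my-key")) // true if segments[0] or segments[1]
//	                                      // hold a live value for "my-key"
//
// Note that a tombstone in a lower segment counts as "does not exist", because
// the check goes through getWithUpperSegmentBoundary, which returns nil for
// deleted keys.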

func (sg *SegmentGroup) makeExistsOnLower(nextSegmentIndex int) existsOnLowerSegmentsFn {
	return func(key []byte) (bool, error) {
		if nextSegmentIndex == 0 {
			// this is already the lowest possible segment, we can guarantee that
			// any key in this segment is previously unseen.
			return false, nil
		}

		v, err := sg.getWithUpperSegmentBoundary(key, nextSegmentIndex-1)
		if err != nil {
			return false, fmt.Errorf("check exists on segments lower than %d: %w",
				nextSegmentIndex, err)
		}

		return v != nil, nil
	}
}

func (sg *SegmentGroup) add(path string) error {
	sg.maintenanceLock.Lock()
	defer sg.maintenanceLock.Unlock()

	newSegmentIndex := len(sg.segments)
	segment, err := newSegment(path, sg.logger,
		sg.metrics, sg.makeExistsOnLower(newSegmentIndex),
		sg.mmapContents, sg.useBloomFilter, sg.calcCountNetAdditions, true)
	if err != nil {
		return fmt.Errorf("init segment %s: %w", path, err)
	}

	sg.segments = append(sg.segments, segment)
	return nil
}

func (sg *SegmentGroup) get(key []byte) ([]byte, error) {
	sg.maintenanceLock.RLock()
	defer sg.maintenanceLock.RUnlock()

	return sg.getWithUpperSegmentBoundary(key, len(sg.segments)-1)
}

// not thread-safe on its own, as the assumption is that this is called by a
// lock holder, e.g. from within .get()
func (sg *SegmentGroup) getWithUpperSegmentBoundary(key []byte, topMostSegment int) ([]byte, error) {
	// assumes "replace" strategy

	// start with the latest segment and exit as soon as something is found,
	// thus making sure the latest takes precedence
	for i := topMostSegment; i >= 0; i-- {
		v, err := sg.segments[i].get(key)
		if err != nil {
			if errors.Is(err, lsmkv.NotFound) {
				continue
			}

			if errors.Is(err, lsmkv.Deleted) {
				return nil, nil
			}

			panic(fmt.Sprintf("unsupported error in segmentGroup.get(): %v", err))
		}

		return v, nil
	}

	return nil, nil
}
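
// The newest-to-oldest iteration above gives "replace" semantics across
// segments: the most recent entry for a key wins, and a tombstone in a newer
// segment masks older values. For illustration, with hypothetical contents:
//
//	segments[0]: put("k", "v1")
//	segments[1]: delete("k") // tombstone
//	segments[2]: (no entry for "k")
//
//	v, err := sg.get([]byte("k")) // v == nil, err == nil: the tombstone in
//	                              // segments[1] hides "v1" from segments[0]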

func (sg *SegmentGroup) getBySecondaryIntoMemory(pos int, key []byte, buffer []byte) ([]byte, []byte, error) {
	sg.maintenanceLock.RLock()
	defer sg.maintenanceLock.RUnlock()

	// assumes "replace" strategy

	// start with the latest segment and exit as soon as something is found,
	// thus making sure the latest takes precedence
	for i := len(sg.segments) - 1; i >= 0; i-- {
		v, err, allocatedBuff := sg.segments[i].getBySecondaryIntoMemory(pos, key, buffer)
		if err != nil {
			if errors.Is(err, lsmkv.NotFound) {
				continue
			}

			if errors.Is(err, lsmkv.Deleted) {
				return nil, nil, nil
			}

			panic(fmt.Sprintf("unsupported error in segmentGroup.getBySecondaryIntoMemory(): %v", err))
		}

		return v, allocatedBuff, nil
	}

	return nil, nil, nil
}

func (sg *SegmentGroup) getCollection(key []byte) ([]value, error) {
	sg.maintenanceLock.RLock()
	defer sg.maintenanceLock.RUnlock()

	var out []value

	// start with the first segment and do not exit early
	for _, segment := range sg.segments {
		v, err := segment.getCollection(key)
		if err != nil {
			if errors.Is(err, lsmkv.NotFound) {
				continue
			}

			return nil, err
		}

		if len(out) == 0 {
			out = v
		} else {
			out = append(out, v...)
		}
	}

	return out, nil
}

func (sg *SegmentGroup) getCollectionBySegments(key []byte) ([][]value, error) {
	sg.maintenanceLock.RLock()
	defer sg.maintenanceLock.RUnlock()

	out := make([][]value, len(sg.segments))

	i := 0
	// start with the first segment and do not exit early
	for _, segment := range sg.segments {
		v, err := segment.getCollection(key)
		if err != nil {
			if errors.Is(err, lsmkv.NotFound) {
				continue
			}

			return nil, err
		}

		out[i] = v
		i++
	}

	return out[:i], nil
}

func (sg *SegmentGroup) roaringSetGet(key []byte) (roaringset.BitmapLayers, error) {
	sg.maintenanceLock.RLock()
	defer sg.maintenanceLock.RUnlock()

	var out roaringset.BitmapLayers

	// start with the first segment and do not exit early
	for _, segment := range sg.segments {
		rs, err := segment.roaringSetGet(key)
		if err != nil {
			if errors.Is(err, lsmkv.NotFound) {
				continue
			}

			return nil, err
		}

		out = append(out, rs)
	}

	return out, nil
}

func (sg *SegmentGroup) count() int {
	sg.maintenanceLock.RLock()
	defer sg.maintenanceLock.RUnlock()

	count := 0
	for _, seg := range sg.segments {
		count += seg.countNetAdditions
	}

	return count
}

func (sg *SegmentGroup) shutdown(ctx context.Context) error {
	if err := sg.compactionCallbackCtrl.Unregister(ctx); err != nil {
		return fmt.Errorf("long-running compaction in progress: %w", ctx.Err())
	}

	// The lock is acquired only after the compaction cycle has been asked to
	// stop, to avoid an occasional deadlock: the compaction logic running in
	// the cycle also requires the maintenance lock.
	//
	// If shutdown grabbed the lock first and a compaction started right after,
	// the compaction would block waiting for the same lock, eventually stalling
	// the entire cycle loop and preventing it from reading the stop signal. If
	// the stop signal cannot be read, shutdown never receives the stop result
	// and cannot proceed, so the maintenance lock would never be released.
	sg.maintenanceLock.Lock()
	defer sg.maintenanceLock.Unlock()

	for i, seg := range sg.segments {
		if err := seg.close(); err != nil {
			return err
		}

		sg.segments[i] = nil
	}

	// make sure the segment list itself is set to nil. In case a memtable still
	// flushes after closing, it might otherwise try to read from the disk
	// segment list and run into nil-pointer problems.
	sg.segments = nil

	return nil
}

func (sg *SegmentGroup) UpdateStatus(status storagestate.Status) {
	sg.statusLock.Lock()
	defer sg.statusLock.Unlock()

	sg.status = status
}

func (sg *SegmentGroup) isReadyOnly() bool {
	sg.statusLock.Lock()
	defer sg.statusLock.Unlock()

	return sg.status == storagestate.StatusReadOnly
}

func fileExists(path string) (bool, error) {
	_, err := os.Stat(path)
	if err == nil {
		return true, nil
	}

	if errors.Is(err, fs.ErrNotExist) {
		return false, nil
	}

	return false, err
}
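
// fileExists treats only fs.ErrNotExist as a definite "no"; any other Stat
// failure (e.g. a permission error) is returned to the caller, so that startup
// fails loudly instead of silently skipping a segment. A hypothetical caller:
//
//	ok, err := fileExists(filepath.Join(sg.dir, "segment-17.db"))
//	if err != nil {
//		return fmt.Errorf("check for presence of segment: %w", err)
//	}
//	if ok {
//		// the segment file is present on disk
//	}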