// Source: github.com/scottcagno/storage@v1.8.0/pkg/lsmt/lsmtree.go

package lsmt

import (
	"bytes"
	"fmt"
	"github.com/scottcagno/storage/pkg/bloom"
	"github.com/scottcagno/storage/pkg/lsmt/binary"
	"github.com/scottcagno/storage/pkg/lsmt/mtbl"
	"github.com/scottcagno/storage/pkg/lsmt/sstable"
	"github.com/scottcagno/storage/pkg/lsmt/wal"
	"github.com/scottcagno/storage/pkg/util"
	"hash/crc32"
	"os"
	"path/filepath"
	"sync"
)

// version is mixed into the checksum that OpenLSMTree writes to (and later
// verifies against) the ".sum.txt" file in the base directory. Changing it
// makes existing directories fail the check with ErrBadChecksum.
const version = "v1.7.0"

// Tombstone is the value that marks an entry as deleted. It is a nil byte
// slice, which is the same value Del stores for a removed key.
var Tombstone = []byte(nil)

// LSMTree is a log-structured merge tree. Writes go to the write-ahead
// commit log and the mem-table; reads consult the bloom filter, then the
// mem-table, then the ss-tables on disk.
type LSMTree struct {
	conf    *LSMConfig          // conf is the checked configuration the tree was opened with
	walbase string              // walbase is the write-ahead commit log base filepath
	sstbase string              // sstbase is the ss-table and index base filepath where data resides
	lock    sync.RWMutex        // lock is a mutex that synchronizes access to the data
	wacl    *wal.WAL            // wacl is the write-ahead commit log
	memt    *mtbl.RBTree        // memt is the main mem-table (red-black tree) instance
	sstm    *sstable.SSTManager // sstm is the sorted-strings table manager
	bloom   *bloom.BloomFilter  // bloom is a bloom filter
	logger  *Logger             // logger is a logger for the lsm-tree
}

// OpenLSMTree opens or creates an LSMTree instance. It resolves the
// configured base directory to an absolute path, verifies (or creates) the
// checksum file, creates the write-ahead log and ss-table directories,
// opens the write-ahead log and the ss-table manager, replays the log into
// the mem-table and finally populates the bloom filter.
36 func OpenLSMTree(c *LSMConfig) (*LSMTree, error) { 37 // check lsm config 38 conf := checkLSMConfig(c) 39 // make sure we are working with absolute paths 40 base, err := filepath.Abs(conf.BaseDir) 41 if err != nil { 42 return nil, err 43 } 44 // sanitize any path separators 45 base = filepath.ToSlash(base) 46 // check for checksum file 47 err = checkChecksum(version, base) 48 if err != nil { 49 return nil, err 50 } 51 // create log base directory 52 walbase := filepath.Join(base, defaultWalDir) 53 err = os.MkdirAll(walbase, os.ModeDir) 54 if err != nil { 55 return nil, err 56 } 57 // create data base directory 58 sstbase := filepath.Join(base, defaultSstDir) 59 err = os.MkdirAll(sstbase, os.ModeDir) 60 if err != nil { 61 return nil, err 62 } 63 // open write-ahead commit log 64 wacl, err := wal.OpenWAL(&wal.WALConfig{ 65 BasePath: walbase, 66 MaxFileSize: conf.FlushThreshold, 67 SyncOnWrite: conf.SyncOnWrite, 68 }) 69 if err != nil { 70 return nil, err 71 } 72 // open ss-table-manager 73 sstm, err := sstable.OpenSSTManager(sstbase) 74 if err != nil { 75 return nil, err 76 } 77 // create lsm-tree instance and return 78 lsmt := &LSMTree{ 79 conf: conf, 80 walbase: walbase, 81 sstbase: sstbase, 82 wacl: wacl, 83 memt: mtbl.NewRBTree(), 84 sstm: sstm, 85 bloom: bloom.NewBloomFilter(conf.BloomFilterSize), 86 logger: NewLogger(conf.LoggingLevel), 87 } 88 // load mem-table with commit log data 89 err = lsmt.loadFromWriteAheadCommitLog() 90 if err != nil { 91 return nil, err 92 } 93 // populate bloom filter 94 err = lsmt.populateBloomFilter() 95 if err != nil { 96 return nil, err 97 } 98 // return lsm-tree 99 return lsmt, nil 100 } 101 102 func CalcCRC(d []byte) uint32 { 103 return crc32.Checksum(d, crc32.MakeTable(crc32.Koopman)) 104 } 105 106 func calculateChecksum(against string) (uint32, string) { 107 // calculate checksum 108 const d = `Reality is only a Rorschach ink-blot, you know.` 109 n := CalcCRC([]byte(d + against)) 110 return n, fmt.Sprintf("checksum: %d", 
n) 111 } 112 113 func checkChecksum(against, base string) error { 114 // sanitize the path 115 path := filepath.Join(base, ".sum.txt") 116 // check to see if the path is there 117 if _, err := os.Stat(path); os.IsNotExist(err) { 118 // if not, initialize it 119 err = os.MkdirAll(base, os.ModeDir) 120 if err != nil { 121 return err 122 } 123 // calculate checksum 124 _, str := calculateChecksum(against) 125 // then write the calculated checksum out to a new file 126 err = os.WriteFile(path, []byte(str), 0666) 127 if err != nil { 128 return err 129 } 130 // return (nil is good) 131 return nil 132 } 133 // file exists, so lets read the checksum file 134 data, err := os.ReadFile(path) 135 if err != nil { 136 return err 137 } 138 // calculate checksum 139 _, str := calculateChecksum(against) 140 if str != string(data) { 141 return ErrBadChecksum 142 } 143 // return (nil is good) 144 return nil 145 } 146 147 // loadFromWriteAheadCommitLog loads any entries from the 148 // segmented write-ahead commit file back into the mem-table 149 func (lsm *LSMTree) loadFromWriteAheadCommitLog() error { 150 // lock 151 lsm.lock.Lock() 152 defer lsm.lock.Unlock() 153 // log info 154 lsm.logger.Info("adding write-ahead log entries to mem-table") 155 // scan through the write-ahead log... 156 err := lsm.wacl.Scan(func(e *binary.Entry) bool { 157 // ... and insert data back into the mem-table 158 lsm.memt.Put(e) 159 return true 160 }) 161 if err != nil { 162 // log error 163 lsm.logger.Error("scanning write-ahead log: %s", err) 164 return err 165 } 166 return nil 167 } 168 169 // populateBloomFilter attempts to read through the keys in the 170 // mem-table, and then the ss-table(s) and fill out the bloom 171 // filter as thoroughly as possible. 
172 func (lsm *LSMTree) populateBloomFilter() error { 173 // lock 174 lsm.lock.Lock() 175 defer lsm.lock.Unlock() 176 // log info 177 lsm.logger.Info("adding ss-table entries to bloom filter") 178 // add entries from linear ss-table scan 179 err := lsm.sstm.Scan(sstable.ScanNewToOld, func(e *binary.Entry) bool { 180 // make sure entry is not a tombstone 181 if e != nil { 182 if e.Value != nil && !bytes.Equal(e.Value, Tombstone) { 183 // add entry to bloom filter 184 lsm.bloom.Set(e.Key) 185 } else if bytes.Equal(e.Value, Tombstone) { 186 // remove entry from bloom filter 187 lsm.bloom.Unset(e.Key) 188 } 189 } 190 return true 191 }) 192 if err != nil { 193 // log error 194 lsm.logger.Error("scanning s-tables: %s", err) 195 return err 196 } 197 // log info 198 lsm.logger.Info("adding mem-table entries to bloom filter") 199 // add entries from mem-table 200 lsm.memt.Scan(func(e *binary.Entry) bool { 201 // make sure entry is not a tombstone 202 if e != nil { 203 if e.Value != nil && !bytes.Equal(e.Value, Tombstone) { 204 // add entry to bloom filter 205 lsm.bloom.Set(e.Key) 206 } else if bytes.Equal(e.Value, Tombstone) { 207 // remove entry from bloom filter 208 lsm.bloom.Unset(e.Key) 209 } 210 } 211 return true 212 }) 213 return nil 214 } 215 216 // cycleWAL closes the current (open) active write-ahead commit 217 // log--removes all the files on disk and opens a fresh one 218 func (lsm *LSMTree) cycleWAL() error { 219 // let's reset the write-ahead commit log 220 err := lsm.wacl.CloseAndRemove() 221 if err != nil { 222 // log error 223 lsm.logger.Error("closing and removing write-ahead log: %s", err) 224 return err 225 } 226 // open a fresh write-ahead commit log 227 lsm.wacl, err = wal.OpenWAL(&wal.WALConfig{ 228 BasePath: lsm.walbase, 229 MaxFileSize: lsm.conf.FlushThreshold, 230 SyncOnWrite: lsm.conf.SyncOnWrite, 231 }) 232 if err != nil { 233 // log error 234 lsm.logger.Error("opening fresh write-ahead log: %s", err) 235 return err 236 } 237 return nil 238 } 239 
240 func (lsm *LSMTree) needFlush(memTableSize int64) error { 241 if memTableSize > lsm.conf.FlushThreshold { 242 return ErrFlushThreshold 243 } 244 return nil 245 } 246 247 func (lsm *LSMTree) FlushToSSTableAndCycleWAL(memt *mtbl.RBTree) error { 248 /* 249 // check err properly 250 if err != nil { 251 // make sure it's the mem-table doesn't need flushing 252 if err != ErrFlushThreshold { 253 return err 254 } 255 // looks like it needs a flush 256 err = lsm.sstm.FlushToSSTable(lsm.memt) 257 if err != nil { 258 return err 259 } 260 // let's reset the write-ahead commit log 261 err = lsm.cycleWAL() 262 if err != nil { 263 return err 264 } 265 } 266 */ 267 // attempt to flush 268 err := lsm.sstm.FlushToSSTable(memt) 269 if err != nil { 270 return err 271 } 272 // let's reset the write-ahead commit log 273 err = lsm.cycleWAL() 274 if err != nil { 275 return err 276 } 277 // no error, simply return 278 return nil 279 } 280 281 // checkEntry ensures the entry does not violate the max key and value config 282 func (lsm *LSMTree) checkEntry(e *binary.Entry) error { 283 // init err 284 var err error 285 // key checks 286 err = checkKey(e.Key, lsm.conf.MaxKeySize) 287 if err != nil { 288 return err 289 } 290 // value checks 291 err = checkValue(e.Value, lsm.conf.MaxValueSize) 292 if err != nil { 293 return err 294 } 295 return nil 296 } 297 298 func checkKey(k []byte, max int64) error { 299 if k == nil || len(k) < minKeySizeAllowed { 300 return ErrBadKey 301 } 302 if int64(len(k)) > max { 303 return ErrKeyTooLarge 304 } 305 return nil 306 } 307 308 func checkValue(v []byte, max int64) error { 309 if v == nil || len(v) < minValueSizeAllowed { 310 return ErrBadValue 311 } 312 if int64(len(v)) > max { 313 return ErrValueTooLarge 314 } 315 return nil 316 } 317 318 // Has returns a boolean signaling weather or not the key 319 // is in the LSMTree. It should be noted that in some cases 320 // this may return a false positive, but it should never 321 // return a false negative. 
322 func (lsm *LSMTree) Has(k string) bool { 323 // check key 324 err := checkKey([]byte(k), lsm.conf.MaxKeySize) 325 if err != nil { 326 return false 327 } 328 // check bloom filter 329 if ok := lsm.bloom.MayHave([]byte(k)); !ok { 330 // definitely not in the bloom filter 331 return false 332 } 333 // low probability of false positive, 334 // but let's check the mem-table 335 if ok := lsm.memt.HasKey(k); ok { 336 // definitely in the mem-table, return true. 337 // it should be noted that we cannot return 338 // false from here, because if we do we are 339 // saying that it is not in the mem-table, but 340 // it still could be found on disk.... 341 return true 342 } 343 // so I suppose at this point it's really 344 // unlikely to be found, but let's search 345 // anyway, because well... why not? 346 // do linear semi-binary-ish search 347 de, err := lsm.sstm.LinearSearch(k) 348 // check err 349 if err != nil && err == binary.ErrEntryNotFound { 350 // definitely not in the ss-table 351 return false 352 } 353 // otherwise, check value (in case of tombstone) 354 if de == nil || de.Value == nil { 355 // definitely not in the ss-table 356 return false 357 } 358 // otherwise, we found it homey! 359 return true 360 } 361 362 // Put takes a key and a value and adds them to the LSMTree. If 363 // the entry already exists, it should overwrite the old entry. 
364 func (lsm *LSMTree) Put(k string, v []byte) error { 365 // lock 366 lsm.lock.Lock() 367 defer lsm.lock.Unlock() 368 // create binary entry 369 e := &binary.Entry{Key: []byte(k), Value: v} 370 // check entry 371 err := lsm.checkEntry(e) 372 if err != nil { 373 return err 374 } 375 // write entry to the write-ahead commit log 376 _, err = lsm.wacl.Write(e) 377 if err != nil { 378 return err 379 } 380 // write entry to mem-table 381 _, needFlush := lsm.memt.UpsertAndCheckIfFull(e, lsm.conf.FlushThreshold) 382 // check if we should do a flush 383 if needFlush { 384 // log info 385 lsm.logger.Info("mem-table needs flush, attempting to flush now", err) 386 // attempt to flush 387 err = lsm.FlushToSSTableAndCycleWAL(lsm.memt) 388 if err != nil { 389 // log error 390 lsm.logger.Error("flushing mem-table: %s", err) 391 return err 392 } 393 } 394 // add to bloom filter 395 lsm.bloom.Set([]byte(k)) 396 return nil 397 } 398 399 // Get takes a key and attempts to find a match in the LSMTree. If 400 // a match cannot be found Get returns a nil value and ErrNotFound. 401 // Get first checks the bloom filter, then the mem-table. If it is 402 // still not found it attempts to do a binary search on the for the 403 // key in the ss-index and if that yields no result it will try to 404 // find the entry by doing a linear search of the ss-table itself. 405 func (lsm *LSMTree) Get(k string) ([]byte, error) { 406 // read lock 407 lsm.lock.RLock() 408 defer lsm.lock.RUnlock() 409 // check key 410 err := checkKey([]byte(k), lsm.conf.MaxKeySize) 411 if err != nil { 412 return nil, err 413 } 414 // check bloom filter 415 if ok := lsm.bloom.MayHave([]byte(k)); !ok { 416 // definitely not in the lsm tree 417 return nil, ErrNotFound 418 } 419 // according to the bloom filter, it "may" be in 420 // tree, so lets start by searching the mem-table 421 e, found := lsm.memt.Get(&binary.Entry{Key: []byte(k)}) 422 if found && e.Value != nil { 423 // we found it! 
424 return e.Value, nil 425 } 426 // we did not find it in the mem-table 427 // need to check error for tombstone 428 if e != nil && e.Value == nil { 429 // found tombstone entry (means this entry was 430 // deleted) so we can end our search here; just 431 // MAKE SURE you check for tombstone errors!!! 432 return nil, ErrNotFound 433 } 434 // check sparse index, and ss-tables, young to old 435 de, err := lsm.sstm.Search(k) 436 if err != nil { 437 // if we get a bad entry, it most likely means 438 // that our sparse index couldn't find it, but 439 // there is still a chance it may be on disk 440 if err == binary.ErrBadEntry { 441 // do linear semi-binary-ish search 442 de, err = lsm.sstm.LinearSearch(k) 443 // check err 444 if err != nil && err == binary.ErrEntryNotFound { 445 return nil, ErrNotFound 446 } 447 // otherwise, check value (in case of tombstone) 448 if de == nil || de.Value == nil { 449 return nil, ErrNotFound 450 } 451 // otherwise, we found it homey! 452 return de.Value, nil 453 } 454 // -> IF YOU ARE HERE... 455 // Then the value may not be here (or you didn't check 456 // all the potential errors that can be returned), dummy 457 return nil, err 458 } 459 // check to make sure entry is not a tombstone 460 if de == nil || de.Value == nil { 461 return nil, ErrNotFound 462 } 463 // may have found it 464 return de.Value, nil 465 } 466 467 // GetLinear takes a key and attempts to find a match in the LSMTree. If 468 // a match cannot be found Get returns a nil value and ErrNotFound. 469 // Get first checks the bloom filter, then the mem-table. If it is 470 // still not found [this is where it differs from Get] it attempts 471 // to do a linear search directly of the ss-table itself. It can be 472 // a bit quicker [if you know that your data is not memory resident.] 
473 func (lsm *LSMTree) GetLinear(k string) ([]byte, error) { 474 // read lock 475 lsm.lock.RLock() 476 defer lsm.lock.RUnlock() 477 // check key 478 err := checkKey([]byte(k), lsm.conf.MaxKeySize) 479 if err != nil { 480 return nil, err 481 } 482 // check bloom filter 483 if ok := lsm.bloom.MayHave([]byte(k)); !ok { 484 // definitely not in the lsm tree 485 return nil, ErrNotFound 486 } 487 // according to the bloom filter, it "may" be in 488 // tree, so lets start by searching the mem-table 489 e, found := lsm.memt.Get(&binary.Entry{Key: []byte(k)}) 490 if found && e.Value != nil { 491 // we found it! 492 return e.Value, nil 493 } 494 // we did not find it in the mem-table 495 // need to check error for tombstone 496 if e != nil && e.Value == nil { 497 // found tombstone entry (means this entry was 498 // deleted) so we can end our search here; just 499 // MAKE SURE you check for tombstone errors!!! 500 return nil, ErrNotFound 501 } 502 // do linear semi-binary-ish search 503 de, err := lsm.sstm.LinearSearch(k) 504 // check err 505 if err != nil && err == binary.ErrEntryNotFound { 506 return nil, ErrNotFound 507 } 508 // otherwise, check value (in case of tombstone) 509 if de == nil || de.Value == nil { 510 return nil, ErrNotFound 511 } 512 // otherwise, we found it homey! 513 return de.Value, nil 514 } 515 516 // Del takes a key and overwrites the record with a tomstone or 517 // a 'deleted' or nil entry. It leaves the key in the LSMTree 518 // so that future table versions can properly merge. 
519 func (lsm *LSMTree) Del(k string) error { 520 // lock 521 lsm.lock.Lock() 522 defer lsm.lock.Unlock() 523 // check key 524 err := checkKey([]byte(k), lsm.conf.MaxKeySize) 525 if err != nil { 526 return err 527 } 528 // create binary entry 529 e := &binary.Entry{Key: []byte(k), Value: nil} 530 // write entry to the write-ahead commit log 531 _, err = lsm.wacl.Write(e) 532 if err != nil { 533 return err 534 } 535 // write entry to mem-table 536 _, needFlush := lsm.memt.UpsertAndCheckIfFull(e, lsm.conf.FlushThreshold) 537 // check if we should do a flush 538 if needFlush { 539 // log info 540 lsm.logger.Info("mem-table needs flush, attempting to flush now", err) 541 // attempt to flush 542 err = lsm.FlushToSSTableAndCycleWAL(lsm.memt) 543 if err != nil { 544 // log error 545 lsm.logger.Error("flushing mem-table: %s", err) 546 return err 547 } 548 } 549 // update sparse index 550 lsm.sstm.CheckDeleteInSparseIndex(k) 551 // remove from bloom filter 552 lsm.bloom.Unset([]byte(k)) 553 return nil 554 } 555 556 const ( 557 ScanNewToOld int = sstable.ScanNewToOld 558 ScanOldToNew int = sstable.ScanOldToNew 559 ) 560 561 type ScanDirection = sstable.ScanDirection 562 563 // Scan takes a scan direction and an iteration function and scans the ss-tables 564 // in the provided direction (young to old, or old to young) and provides you with 565 // a pointer to each entry during iteration. *It should be noted that modification 566 // of the entry pointer has unknown effects. 
567 func (lsm *LSMTree) Scan(direction int, iter func(e *binary.Entry) bool) error { 568 // lock 569 lsm.lock.Lock() 570 defer lsm.lock.Unlock() 571 // ss-table-manager scan method 572 return lsm.sstm.Scan(sstable.ScanDirection(direction), iter) 573 } 574 575 // Sync forces a sync 576 func (lsm *LSMTree) Sync() error { 577 // lock 578 lsm.lock.Lock() 579 defer lsm.lock.Unlock() 580 // sync write-ahead commit log 581 err := lsm.wacl.Sync() 582 if err != nil { 583 return err 584 } 585 return nil 586 } 587 588 // PutBatch takes a batch of entries and adds all of them at 589 // one time. It acts a bit like a transaction. If you have a 590 // configuration option of SyncOnWrite: true it will be disabled 591 // temporarily and the batch will sync at the end of all the 592 // writes. This is to give a slight performance advantage. It 593 // should be worth noting that very large batches may have an 594 // impact on performance and may also cause frequent ss-table 595 // flushes which may result in fragmentation. 596 func (lsm *LSMTree) PutBatch(batch *binary.Batch) error { 597 // lock 598 lsm.lock.Lock() 599 defer lsm.lock.Unlock() 600 // write batch to the write-ahead commit log 601 err := lsm.wacl.WriteBatch(batch) 602 if err != nil { 603 return err 604 } 605 // write batch to mem-table 606 _, needFlush := lsm.memt.UpsertBatchAndCheckIfFull(batch, lsm.conf.FlushThreshold) 607 // check if we should do a flush 608 if needFlush { 609 // log info 610 lsm.logger.Info("mem-table needs flush, attempting to flush now", err) 611 // attempt to flush 612 err = lsm.FlushToSSTableAndCycleWAL(lsm.memt) 613 if err != nil { 614 // log error 615 lsm.logger.Error("flushing mem-table: %s", err) 616 return err 617 } 618 } 619 // add to bloom filter 620 for _, e := range batch.Entries { 621 lsm.bloom.Set(e.Key) 622 } 623 return nil 624 } 625 626 // GetBatch attempts to find entries matching the keys provided. If a matching 627 // entry is found, it is added to the batch that is returned. 
If a matching 628 // entry cannot be found it is simply skipped and not added to the batch. GetBatch 629 // will return a nil error if all the matching entries were found. If it found 630 // some but not all, GetBatch will return ErrIncompleteSet along with the batch 631 // of entries that it could find. If it could not find any matches at all, the 632 // batch will be nil and GetBatch will return an ErrNotFound 633 func (lsm *LSMTree) GetBatch(keys ...string) (*binary.Batch, error) { 634 // create batch to return 635 batch := binary.NewBatch() 636 // iterate over keys 637 for _, key := range keys { 638 // check bloom filter 639 if ok := lsm.bloom.MayHave([]byte(key)); !ok { 640 // definitely not in the lsm tree 641 continue // skip and look for next key 642 } 643 // according to the bloom filter, it "may" be in 644 // tree, so lets start by searching the mem-table 645 e, found := lsm.memt.Get(&binary.Entry{Key: []byte(key)}) 646 if found && e.Value != nil { 647 // we found a match! add match to batch, and... 
648 batch.WriteEntry(e) 649 continue // skip and lok for next key 650 } 651 // we did not find it in the mem-table 652 // need to check error for tombstone 653 if e == nil || (e.Value == nil || bytes.Equal(e.Value, Tombstone)) { 654 // found tombstone entry (means this entry was 655 // deleted) so we can end our search here 656 continue // skip and look for the next key 657 } 658 // boom filter says maybe, checked the mem-table with 659 // no luck apparently, so now let us check the sparse 660 // index and see what we come up with 661 de, err := lsm.sstm.Search(key) 662 if err != nil { 663 // if we get a bad entry, it most likely means 664 // that our sparse index couldn't find it, but 665 // there is still a chance it may be on disk 666 if err == binary.ErrBadEntry { 667 // do linear semi-binary-ish search 668 de, err = lsm.sstm.LinearSearch(key) 669 // check err 670 if err != nil && err == binary.ErrEntryNotFound { 671 return nil, ErrNotFound 672 } 673 if de == nil || de.Value == nil { 674 continue // skip and look for the next key 675 } 676 // otherwise, we found it homey! add match to batch, and... 677 batch.WriteEntry(de) 678 continue // skip and lok for next key 679 } 680 // -> IF YOU ARE HERE... 681 // Then the value may not be here (or you didn't check 682 // all the potential errors that can be returned), dummy 683 continue // skip and lok for next key 684 } 685 // check to make sure entry is not a tombstone 686 if de == nil || de.Value == nil { 687 continue // skip and lok for next key 688 } 689 // may have found it; add match to batch, and... 690 batch.WriteEntry(de) 691 continue // skip and lok for next key 692 } 693 // check the batch 694 if batch.Len() == 0 { 695 // nothing at all was found 696 return nil, ErrNotFound 697 } 698 if batch.Len() == len(keys) { 699 // we found all the potential matches! 
700 return batch, nil 701 } 702 return batch, ErrIncompleteSet 703 } 704 705 func (lsm *LSMTree) Stats() (*LSMTreeStats, error) { 706 return &LSMTreeStats{ 707 Config: lsm.conf, 708 MtEntries: lsm.memt.Count(), 709 MtSize: lsm.memt.Size(), 710 BfEntries: lsm.bloom.Count(), 711 BfSize: int64(util.Sizeof(lsm.bloom)), 712 }, nil 713 } 714 715 func (lsm *LSMTree) Close() error { 716 // close write-ahead commit log 717 err := lsm.wacl.Close() 718 if err != nil { 719 return err 720 } 721 // close sst-manager 722 err = lsm.sstm.Close() 723 if err != nil { 724 return err 725 } 726 return nil 727 }