// github.com/rosedblabs/rosedb/v2@v2.3.7-0.20240423093736-a89ea823e5b9/db.go

package rosedb

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"regexp"
	"sync"
	"time"

	"github.com/bwmarrin/snowflake"
	"github.com/gofrs/flock"
	"github.com/robfig/cron/v3"
	"github.com/rosedblabs/rosedb/v2/index"
	"github.com/rosedblabs/rosedb/v2/utils"
	"github.com/rosedblabs/wal"
)

const (
	fileLockName       = "FLOCK"
	dataFileNameSuffix = ".SEG"
	hintFileNameSuffix = ".HINT"
	mergeFinNameSuffix = ".MERGEFIN"
)

// DB represents a ROSEDB database instance.
// It is built on the bitcask model, which is a log-structured storage.
// It uses WAL to write data, and uses an in-memory index to store the key
// and the position of the data in the WAL;
// the index is rebuilt when the database is opened.
//
// The main advantage of ROSEDB is that writing, reading, and deleting data
// are very fast, because each of these operations needs only a single disk IO.
//
// But since all keys and their positions (the index) must be kept in memory,
// the total data size is limited by the memory size.
//
// So if your memory can hold almost all the keys, ROSEDB is the perfect storage engine for you.
type DB struct {
	dataFiles        *wal.WAL // data files are a set of segment files in WAL.
	hintFile         *wal.WAL // hint file is used to store the key and the position for fast startup.
	index            index.Indexer
	options          Options
	fileLock         *flock.Flock
	mu               sync.RWMutex
	closed           bool
	mergeRunning     uint32 // indicate if the database is merging
	batchPool        sync.Pool
	recordPool       sync.Pool
	encodeHeader     []byte
	watchCh          chan *Event // user consume channel for watch events
	watcher          *Watcher
	expiredCursorKey []byte     // the position up to which DeleteExpiredKeys has executed.
	cronScheduler    *cron.Cron // cron scheduler for auto merge task
}

// Stat represents the statistics of the database.
type Stat struct {
	// Total number of keys
	KeysNum int
	// Total disk size of database directory
	DiskSize int64
}
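
// A minimal sketch of reading the statistics above, assuming an already
// opened *DB named db (not part of the original file). KeysNum reflects the
// size of the in-memory index, DiskSize the on-disk size of the directory.
func exampleStat(db *DB) {
	stat := db.Stat()
	fmt.Printf("keys: %d, disk size: %d bytes\n", stat.KeysNum, stat.DiskSize)
}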
// Open a database with the specified options.
// If the database directory does not exist, it will be created automatically.
//
// Multiple processes cannot use the same database directory at the same time,
// otherwise it will return ErrDatabaseIsUsing.
//
// It will open the wal files in the database directory and load the index from them.
// Returns the DB instance, or an error if any.
func Open(options Options) (*DB, error) {
	// check options
	if err := checkOptions(options); err != nil {
		return nil, err
	}

	// create data directory if it does not exist
	if _, err := os.Stat(options.DirPath); err != nil {
		if err := os.MkdirAll(options.DirPath, os.ModePerm); err != nil {
			return nil, err
		}
	}

	// create file lock, prevent multiple processes from using the same database directory
	fileLock := flock.New(filepath.Join(options.DirPath, fileLockName))
	hold, err := fileLock.TryLock()
	if err != nil {
		return nil, err
	}
	if !hold {
		return nil, ErrDatabaseIsUsing
	}

	// load merge files if they exist
	if err = loadMergeFiles(options.DirPath); err != nil {
		return nil, err
	}

	// init DB instance
	db := &DB{
		index:        index.NewIndexer(),
		options:      options,
		fileLock:     fileLock,
		batchPool:    sync.Pool{New: newBatch},
		recordPool:   sync.Pool{New: newRecord},
		encodeHeader: make([]byte, maxLogRecordHeaderSize),
	}

	// open data files
	if db.dataFiles, err = db.openWalFiles(); err != nil {
		return nil, err
	}

	// load index
	if err = db.loadIndex(); err != nil {
		return nil, err
	}

	// enable watch
	if options.WatchQueueSize > 0 {
		db.watchCh = make(chan *Event, 100)
		db.watcher = NewWatcher(options.WatchQueueSize)
		// run a goroutine to synchronize event information
		go db.watcher.sendEvent(db.watchCh)
	}

	// enable auto merge task
	if len(options.AutoMergeCronExpr) > 0 {
		db.cronScheduler = cron.New(
			cron.WithParser(
				cron.NewParser(cron.SecondOptional | cron.Minute | cron.Hour |
					cron.Dom | cron.Month | cron.Dow | cron.Descriptor),
			),
		)
		_, err = db.cronScheduler.AddFunc(options.AutoMergeCronExpr, func() {
			// maybe we should deal with different errors with different logic,
			// but a background task can't report its error to the caller, so ignore it.
			// after auto merge, we should close and reopen the db.
			_ = db.Merge(true)
		})
		if err != nil {
			return nil, err
		}
		db.cronScheduler.Start()
	}

	return db, nil
}

func (db *DB) openWalFiles() (*wal.WAL, error) {
	// open data files from WAL
	walFiles, err := wal.Open(wal.Options{
		DirPath:        db.options.DirPath,
		SegmentSize:    db.options.SegmentSize,
		SegmentFileExt: dataFileNameSuffix,
		BlockCache:     db.options.BlockCache,
		Sync:           db.options.Sync,
		BytesPerSync:   db.options.BytesPerSync,
	})
	if err != nil {
		return nil, err
	}
	return walFiles, nil
}

func (db *DB) loadIndex() error {
	// load index from hint file
	if err := db.loadIndexFromHintFile(); err != nil {
		return err
	}
	// load index from data files
	if err := db.loadIndexFromWAL(); err != nil {
		return err
	}
	return nil
}
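
// A minimal sketch of opening and closing a database, assuming DefaultOptions
// from options.go and a writable directory (the path below is hypothetical).
// Open acquires the FLOCK file, so a second process opening the same
// directory receives ErrDatabaseIsUsing.
func exampleOpen() {
	options := DefaultOptions
	options.DirPath = "/tmp/rosedb_example" // hypothetical path
	db, err := Open(options)
	if err != nil {
		panic(err)
	}
	defer func() { _ = db.Close() }()
}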
// Close the database, close all data files and release the file lock.
// Set the closed flag to true.
// The DB instance cannot be used after closing.
func (db *DB) Close() error {
	db.mu.Lock()
	defer db.mu.Unlock()

	if err := db.closeFiles(); err != nil {
		return err
	}

	// release file lock
	if err := db.fileLock.Unlock(); err != nil {
		return err
	}

	// close watch channel
	if db.options.WatchQueueSize > 0 {
		close(db.watchCh)
	}

	// close auto merge cron scheduler
	if db.cronScheduler != nil {
		db.cronScheduler.Stop()
	}

	db.closed = true
	return nil
}

// closeFiles closes all data files and the hint file.
func (db *DB) closeFiles() error {
	// close wal
	if err := db.dataFiles.Close(); err != nil {
		return err
	}
	// close hint file if it exists
	if db.hintFile != nil {
		if err := db.hintFile.Close(); err != nil {
			return err
		}
	}
	return nil
}

// Sync all data files to the underlying storage.
func (db *DB) Sync() error {
	db.mu.Lock()
	defer db.mu.Unlock()

	return db.dataFiles.Sync()
}

// Stat returns the statistics of the database.
func (db *DB) Stat() *Stat {
	db.mu.Lock()
	defer db.mu.Unlock()

	diskSize, err := utils.DirSize(db.options.DirPath)
	if err != nil {
		panic(fmt.Sprintf("rosedb: get database directory size error: %v", err))
	}

	return &Stat{
		KeysNum:  db.index.Size(),
		DiskSize: diskSize,
	}
}

// Put a key-value pair into the database.
// Actually, it will open a new batch and commit it.
// You can think of it as a batch with only one Put operation.
func (db *DB) Put(key []byte, value []byte) error {
	batch := db.batchPool.Get().(*Batch)
	defer func() {
		batch.reset()
		db.batchPool.Put(batch)
	}()
	// This is a single put operation, so we can set Sync to false.
	// The data will be written to the WAL,
	// and the WAL file will be synced to disk according to the DB options.
	batch.init(false, false, db)
	if err := batch.Put(key, value); err != nil {
		_ = batch.Rollback()
		return err
	}
	return batch.Commit()
}

// PutWithTTL puts a key-value pair into the database, with a ttl.
// Actually, it will open a new batch and commit it.
// You can think of it as a batch with only one PutWithTTL operation.
func (db *DB) PutWithTTL(key []byte, value []byte, ttl time.Duration) error {
	batch := db.batchPool.Get().(*Batch)
	defer func() {
		batch.reset()
		db.batchPool.Put(batch)
	}()
	// This is a single put operation, so we can set Sync to false.
	// The data will be written to the WAL,
	// and the WAL file will be synced to disk according to the DB options.
	batch.init(false, false, db)
	if err := batch.PutWithTTL(key, value, ttl); err != nil {
		_ = batch.Rollback()
		return err
	}
	return batch.Commit()
}

// Get the value of the specified key from the database.
// Actually, it will open a new batch and commit it.
// You can think of it as a batch with only one Get operation.
func (db *DB) Get(key []byte) ([]byte, error) {
	batch := db.batchPool.Get().(*Batch)
	batch.init(true, false, db)
	defer func() {
		_ = batch.Commit()
		batch.reset()
		db.batchPool.Put(batch)
	}()
	return batch.Get(key)
}
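
// A short sketch of the single-operation API above, assuming an open *DB
// named db: each call opens a one-operation batch internally, so no explicit
// batch management is needed.
func examplePutGet(db *DB) {
	if err := db.Put([]byte("name"), []byte("rosedb")); err != nil {
		panic(err)
	}
	value, err := db.Get([]byte("name"))
	if err != nil {
		panic(err)
	}
	fmt.Printf("name = %s\n", value)
	// PutWithTTL writes a value that expires after the given duration.
	_ = db.PutWithTTL([]byte("session"), []byte("token"), time.Minute)
}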
// Delete the specified key from the database.
// Actually, it will open a new batch and commit it.
// You can think of it as a batch with only one Delete operation.
func (db *DB) Delete(key []byte) error {
	batch := db.batchPool.Get().(*Batch)
	defer func() {
		batch.reset()
		db.batchPool.Put(batch)
	}()
	// This is a single delete operation, so we can set Sync to false.
	// The data will be written to the WAL,
	// and the WAL file will be synced to disk according to the DB options.
	batch.init(false, false, db)
	if err := batch.Delete(key); err != nil {
		_ = batch.Rollback()
		return err
	}
	return batch.Commit()
}

// Exist checks if the specified key exists in the database.
// Actually, it will open a new batch and commit it.
// You can think of it as a batch with only one Exist operation.
func (db *DB) Exist(key []byte) (bool, error) {
	batch := db.batchPool.Get().(*Batch)
	batch.init(true, false, db)
	defer func() {
		_ = batch.Commit()
		batch.reset()
		db.batchPool.Put(batch)
	}()
	return batch.Exist(key)
}

// Expire sets the ttl of the key.
func (db *DB) Expire(key []byte, ttl time.Duration) error {
	batch := db.batchPool.Get().(*Batch)
	defer func() {
		batch.reset()
		db.batchPool.Put(batch)
	}()
	// This is a single expire operation, so we can set Sync to false.
	// The data will be written to the WAL,
	// and the WAL file will be synced to disk according to the DB options.
	batch.init(false, false, db)
	if err := batch.Expire(key, ttl); err != nil {
		_ = batch.Rollback()
		return err
	}
	return batch.Commit()
}

// TTL gets the ttl of the key.
func (db *DB) TTL(key []byte) (time.Duration, error) {
	batch := db.batchPool.Get().(*Batch)
	batch.init(true, false, db)
	defer func() {
		_ = batch.Commit()
		batch.reset()
		db.batchPool.Put(batch)
	}()
	return batch.TTL(key)
}

// Persist removes the ttl of the key.
// If the key does not exist or is expired, it will return ErrKeyNotFound.
func (db *DB) Persist(key []byte) error {
	batch := db.batchPool.Get().(*Batch)
	defer func() {
		batch.reset()
		db.batchPool.Put(batch)
	}()
	// This is a single persist operation, so we can set Sync to false.
	// The data will be written to the WAL,
	// and the WAL file will be synced to disk according to the DB options.
	batch.init(false, false, db)
	if err := batch.Persist(key); err != nil {
		_ = batch.Rollback()
		return err
	}
	return batch.Commit()
}

// Watch returns the channel on which watch events are delivered.
// It returns ErrWatchDisabled if the watch feature is not enabled
// (options.WatchQueueSize <= 0).
func (db *DB) Watch() (<-chan *Event, error) {
	if db.options.WatchQueueSize <= 0 {
		return nil, ErrWatchDisabled
	}
	return db.watchCh, nil
}

// Ascend calls handleFn for each key/value pair in the db in ascending order.
func (db *DB) Ascend(handleFn func(k []byte, v []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	db.index.Ascend(func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		chunk, err := db.dataFiles.Read(pos)
		if err != nil {
			return false, err
		}
		if value := db.checkValue(chunk); value != nil {
			return handleFn(key, value)
		}
		return true, nil
	})
}
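
// A sketch of the ttl lifecycle above, assuming an open *DB named db:
// Expire sets a ttl on an existing key, TTL reads the remaining duration,
// and Persist removes the ttl again.
func exampleTTL(db *DB) {
	_ = db.Put([]byte("k"), []byte("v"))
	if err := db.Expire([]byte("k"), 10*time.Second); err != nil {
		panic(err)
	}
	remaining, err := db.TTL([]byte("k"))
	if err != nil {
		panic(err)
	}
	fmt.Printf("ttl remaining: %v\n", remaining)
	_ = db.Persist([]byte("k")) // the key no longer expires
}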
// AscendRange calls handleFn for each key/value pair in the db within the range [startKey, endKey] in ascending order.
func (db *DB) AscendRange(startKey, endKey []byte, handleFn func(k []byte, v []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	db.index.AscendRange(startKey, endKey, func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		chunk, err := db.dataFiles.Read(pos)
		if err != nil {
			return false, err
		}
		if value := db.checkValue(chunk); value != nil {
			return handleFn(key, value)
		}
		return true, nil
	})
}

// AscendGreaterOrEqual calls handleFn for each key/value pair in the db with keys greater than or equal to the given key.
func (db *DB) AscendGreaterOrEqual(key []byte, handleFn func(k []byte, v []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	db.index.AscendGreaterOrEqual(key, func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		chunk, err := db.dataFiles.Read(pos)
		if err != nil {
			return false, err
		}
		if value := db.checkValue(chunk); value != nil {
			return handleFn(key, value)
		}
		return true, nil
	})
}

// AscendKeys calls handleFn for each key in the db in ascending order.
// Since the expiry time is stored in the value, if you want to filter expired keys,
// you need to set the parameter filterExpired to true. But the performance will be affected,
// because the value of each key must be read to determine whether it is expired.
func (db *DB) AscendKeys(pattern []byte, filterExpired bool, handleFn func(k []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	var reg *regexp.Regexp
	if len(pattern) > 0 {
		reg = regexp.MustCompile(string(pattern))
	}

	db.index.Ascend(func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		if reg == nil || reg.Match(key) {
			var invalid bool
			if filterExpired {
				chunk, err := db.dataFiles.Read(pos)
				if err != nil {
					return false, err
				}
				if value := db.checkValue(chunk); value == nil {
					invalid = true
				}
			}
			if invalid {
				return true, nil
			}
			return handleFn(key)
		}
		return true, nil
	})
}

// Descend calls handleFn for each key/value pair in the db in descending order.
func (db *DB) Descend(handleFn func(k []byte, v []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	db.index.Descend(func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		chunk, err := db.dataFiles.Read(pos)
		if err != nil {
			return false, err
		}
		if value := db.checkValue(chunk); value != nil {
			return handleFn(key, value)
		}
		return true, nil
	})
}

// DescendRange calls handleFn for each key/value pair in the db within the range [startKey, endKey] in descending order.
func (db *DB) DescendRange(startKey, endKey []byte, handleFn func(k []byte, v []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	db.index.DescendRange(startKey, endKey, func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		chunk, err := db.dataFiles.Read(pos)
		if err != nil {
			return false, err
		}
		if value := db.checkValue(chunk); value != nil {
			return handleFn(key, value)
		}
		return true, nil
	})
}
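
// A sketch of ordered iteration with the methods above, assuming an open *DB
// named db and a hypothetical "user:" key prefix. The handler returns
// (true, nil) to continue and (false, nil) to stop early.
func exampleIterate(db *DB) {
	db.Ascend(func(k []byte, v []byte) (bool, error) {
		fmt.Printf("%s = %s\n", k, v)
		return true, nil
	})
	// AscendKeys matches keys against a regular expression; passing true for
	// filterExpired additionally reads each value to skip expired keys.
	db.AscendKeys([]byte("^user:"), true, func(k []byte) (bool, error) {
		fmt.Printf("key: %s\n", k)
		return true, nil
	})
}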
// DescendLessOrEqual calls handleFn for each key/value pair in the db with keys less than or equal to the given key.
func (db *DB) DescendLessOrEqual(key []byte, handleFn func(k []byte, v []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	db.index.DescendLessOrEqual(key, func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		chunk, err := db.dataFiles.Read(pos)
		if err != nil {
			return false, err
		}
		if value := db.checkValue(chunk); value != nil {
			return handleFn(key, value)
		}
		return true, nil
	})
}

// DescendKeys calls handleFn for each key in the db in descending order.
// Since the expiry time is stored in the value, if you want to filter expired keys,
// you need to set the parameter filterExpired to true. But the performance will be affected,
// because the value of each key must be read to determine whether it is expired.
func (db *DB) DescendKeys(pattern []byte, filterExpired bool, handleFn func(k []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	var reg *regexp.Regexp
	if len(pattern) > 0 {
		reg = regexp.MustCompile(string(pattern))
	}

	db.index.Descend(func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		if reg == nil || reg.Match(key) {
			var invalid bool
			if filterExpired {
				chunk, err := db.dataFiles.Read(pos)
				if err != nil {
					return false, err
				}
				if value := db.checkValue(chunk); value == nil {
					invalid = true
				}
			}
			if invalid {
				return true, nil
			}
			return handleFn(key)
		}
		return true, nil
	})
}

// checkValue decodes the chunk and returns the value if the record
// is neither deleted nor expired, otherwise nil.
func (db *DB) checkValue(chunk []byte) []byte {
	record := decodeLogRecord(chunk)
	now := time.Now().UnixNano()
	if record.Type != LogRecordDeleted && !record.IsExpired(now) {
		return record.Value
	}
	return nil
}

func checkOptions(options Options) error {
	if options.DirPath == "" {
		return errors.New("database dir path is empty")
	}
	if options.SegmentSize <= 0 {
		return errors.New("database data file size must be greater than 0")
	}

	if len(options.AutoMergeCronExpr) > 0 {
		if _, err := cron.NewParser(cron.SecondOptional | cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow | cron.Descriptor).
			Parse(options.AutoMergeCronExpr); err != nil {
			return fmt.Errorf("database auto merge cron expression is invalid, err: %s", err)
		}
	}

	return nil
}
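
// A sketch of enabling the auto-merge task validated above, assuming
// DefaultOptions from options.go (the path is hypothetical). With the
// SecondOptional parser, both 5- and 6-field expressions are accepted;
// "0 0 3 * * *" runs a merge every day at 03:00:00.
func exampleAutoMerge() {
	options := DefaultOptions
	options.DirPath = "/tmp/rosedb_example_merge" // hypothetical path
	options.AutoMergeCronExpr = "0 0 3 * * *"
	db, err := Open(options)
	if err != nil {
		panic(err)
	}
	defer func() { _ = db.Close() }()
}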
// loadIndexFromWAL loads the index from the WAL.
// It will iterate over all the WAL files and read data
// from them to rebuild the index.
func (db *DB) loadIndexFromWAL() error {
	mergeFinSegmentId, err := getMergeFinSegmentId(db.options.DirPath)
	if err != nil {
		return err
	}
	indexRecords := make(map[uint64][]*IndexRecord)
	now := time.Now().UnixNano()
	// get a reader for the WAL
	reader := db.dataFiles.NewReader()
	for {
		// if the current segment id is less than or equal to the mergeFinSegmentId,
		// we can skip this segment because it has been merged,
		// and the index can be loaded from the hint file directly.
		if reader.CurrentSegmentId() <= mergeFinSegmentId {
			reader.SkipCurrentSegment()
			continue
		}

		chunk, position, err := reader.Next()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}
		// decode and get log record
		record := decodeLogRecord(chunk)

		// if we get the end of a batch,
		// all records in this batch are ready to be indexed.
		if record.Type == LogRecordBatchFinished {
			batchId, err := snowflake.ParseBytes(record.Key)
			if err != nil {
				return err
			}
			for _, idxRecord := range indexRecords[uint64(batchId)] {
				if idxRecord.recordType == LogRecordNormal {
					db.index.Put(idxRecord.key, idxRecord.position)
				}
				if idxRecord.recordType == LogRecordDeleted {
					db.index.Delete(idxRecord.key)
				}
			}
			// delete indexRecords according to batchId after indexing
			delete(indexRecords, uint64(batchId))
		} else if record.Type == LogRecordNormal && record.BatchId == mergeFinishedBatchID {
			// if the record is a normal record and the batch id is mergeFinishedBatchID,
			// the record was rewritten by a merge operation,
			// so put the record into the index directly.
			db.index.Put(record.Key, position)
		} else {
			// expired records should not be indexed
			if record.IsExpired(now) {
				db.index.Delete(record.Key)
				continue
			}
			// put the record into the temporary indexRecords
			indexRecords[record.BatchId] = append(indexRecords[record.BatchId],
				&IndexRecord{
					key:        record.Key,
					recordType: record.Type,
					position:   position,
				})
		}
	}
	return nil
}

// DeleteExpiredKeys scans the entire index in ascending order to delete expired keys.
// It is a time-consuming operation, so we need to specify a timeout
// to prevent the DB from being unavailable for a long time.
func (db *DB) DeleteExpiredKeys(timeout time.Duration) error {
	// set timeout
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	done := make(chan struct{}, 1)

	var innerErr error
	now := time.Now().UnixNano()
	go func(ctx context.Context) {
		db.mu.Lock()
		defer db.mu.Unlock()
		for {
			// stop scanning once the timeout has been reached,
			// so the write lock is not held any longer than necessary.
			if ctx.Err() != nil {
				return
			}

			// select 100 keys from the db.index
			positions := make([]*wal.ChunkPosition, 0, 100)
			db.index.AscendGreaterOrEqual(db.expiredCursorKey, func(k []byte, pos *wal.ChunkPosition) (bool, error) {
				positions = append(positions, pos)
				if len(positions) >= 100 {
					return false, nil
				}
				return true, nil
			})

			// if all keys in db.index have been traversed, len(positions) will be 0.
			if len(positions) == 0 {
				db.expiredCursorKey = nil
				done <- struct{}{}
				return
			}

			// delete from the index if the key is expired.
			for _, pos := range positions {
				chunk, err := db.dataFiles.Read(pos)
				if err != nil {
					innerErr = err
					done <- struct{}{}
					return
				}
				record := decodeLogRecord(chunk)
				if record.IsExpired(now) {
					db.index.Delete(record.Key)
				}
				db.expiredCursorKey = record.Key
			}
		}
	}(ctx)

	select {
	case <-ctx.Done():
		// the scan did not finish in time; the cursor is kept,
		// so the next call resumes where this one stopped.
		return ctx.Err()
	case <-done:
		return innerErr
	}
}
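
// A small usage sketch of the sweep above, assuming an open *DB named db:
// the scan holds the write lock, so the timeout bounds how long concurrent
// reads and writes can be blocked, and a follow-up call resumes from the cursor.
func exampleDeleteExpiredKeys(db *DB) {
	if err := db.DeleteExpiredKeys(5 * time.Second); err != nil {
		fmt.Printf("expired-key sweep stopped early: %v\n", err)
	}
}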