github.com/rosedblabs/rosedb/v2@v2.3.7-0.20240423093736-a89ea823e5b9/db.go

package rosedb

import (
	"context"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"
	"regexp"
	"sync"
	"time"

	"github.com/bwmarrin/snowflake"
	"github.com/gofrs/flock"
	"github.com/robfig/cron/v3"
	"github.com/rosedblabs/rosedb/v2/index"
	"github.com/rosedblabs/rosedb/v2/utils"
	"github.com/rosedblabs/wal"
)

const (
	fileLockName       = "FLOCK"
	dataFileNameSuffix = ".SEG"
	hintFileNameSuffix = ".HINT"
	mergeFinNameSuffix = ".MERGEFIN"
)

// DB represents a ROSEDB database instance.
// It is built on the bitcask model, a log-structured storage design.
// It uses a WAL to write data, and keeps an in-memory index that maps each key
// to the position of its data in the WAL;
// the index is rebuilt when the database is opened.
//
// The main advantage of ROSEDB is that it is very fast to write, read, and delete data,
// because each operation needs only a single disk IO.
//
// But since all keys and their positions (the index) must be held in memory,
// the total data size is limited by the memory size.
//
// So if your memory can hold almost all the keys, ROSEDB is the perfect storage engine for you.
type DB struct {
	dataFiles        *wal.WAL // data files are a set of segment files in the WAL.
	hintFile         *wal.WAL // hint file is used to store the keys and positions for fast startup.
	index            index.Indexer
	options          Options
	fileLock         *flock.Flock
	mu               sync.RWMutex
	closed           bool
	mergeRunning     uint32 // indicates whether the database is merging
	batchPool        sync.Pool
	recordPool       sync.Pool
	encodeHeader     []byte
	watchCh          chan *Event // channel from which users consume watch events
	watcher          *Watcher
	expiredCursorKey []byte     // the cursor where the last DeleteExpiredKeys call stopped
	cronScheduler    *cron.Cron // cron scheduler for the auto merge task
}

// Stat represents the statistics of the database.
type Stat struct {
	// Total number of keys
	KeysNum int
	// Total disk size of the database directory
	DiskSize int64
}

// Open a database with the specified options.
// If the database directory does not exist, it will be created automatically.
//
// Multiple processes cannot use the same database directory at the same time,
// otherwise Open will return ErrDatabaseIsUsing.
//
// It will open the wal files in the database directory and load the index from them.
// Return the DB instance, or an error if any.
func Open(options Options) (*DB, error) {
	// check options
	if err := checkOptions(options); err != nil {
		return nil, err
	}

	// create the data directory if it does not exist
	if _, err := os.Stat(options.DirPath); err != nil {
		if err := os.MkdirAll(options.DirPath, os.ModePerm); err != nil {
			return nil, err
		}
	}

	// create a file lock to prevent multiple processes from using the same database directory
	fileLock := flock.New(filepath.Join(options.DirPath, fileLockName))
	hold, err := fileLock.TryLock()
	if err != nil {
		return nil, err
	}
	if !hold {
		return nil, ErrDatabaseIsUsing
	}

	// load merge files if they exist
	if err = loadMergeFiles(options.DirPath); err != nil {
		return nil, err
	}

	// init DB instance
	db := &DB{
		index:        index.NewIndexer(),
		options:      options,
		fileLock:     fileLock,
		batchPool:    sync.Pool{New: newBatch},
		recordPool:   sync.Pool{New: newRecord},
		encodeHeader: make([]byte, maxLogRecordHeaderSize),
	}

	// open data files
	if db.dataFiles, err = db.openWalFiles(); err != nil {
		return nil, err
	}

	// load index
	if err = db.loadIndex(); err != nil {
		return nil, err
	}

	// enable watch
	if options.WatchQueueSize > 0 {
		db.watchCh = make(chan *Event, 100)
		db.watcher = NewWatcher(options.WatchQueueSize)
		// run a goroutine to synchronize event information
		go db.watcher.sendEvent(db.watchCh)
	}

	// enable the auto merge task
	if len(options.AutoMergeCronExpr) > 0 {
		db.cronScheduler = cron.New(
			cron.WithParser(
				cron.NewParser(cron.SecondOptional | cron.Minute | cron.Hour |
					cron.Dom | cron.Month | cron.Dow | cron.Descriptor),
			),
		)
		_, err = db.cronScheduler.AddFunc(options.AutoMergeCronExpr, func() {
			// maybe we should deal with different errors with different logic,
			// but a background task has no way to report its error, so it is ignored here.
			// passing true makes Merge reopen the db files after the merge completes.
			_ = db.Merge(true)
		})
		if err != nil {
			return nil, err
		}
		db.cronScheduler.Start()
	}

	return db, nil
}
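
// A minimal usage sketch for Open and Close (illustrative only). It assumes
// DefaultOptions is defined in this package's options.go; the directory path
// below is a placeholder.
func exampleOpen() error {
	options := DefaultOptions
	options.DirPath = "/tmp/rosedb_example"
	db, err := Open(options)
	if err != nil {
		return err
	}
	// the file lock on the directory is held until Close is called
	return db.Close()
}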

func (db *DB) openWalFiles() (*wal.WAL, error) {
	// open data files from the WAL
	walFiles, err := wal.Open(wal.Options{
		DirPath:        db.options.DirPath,
		SegmentSize:    db.options.SegmentSize,
		SegmentFileExt: dataFileNameSuffix,
		BlockCache:     db.options.BlockCache,
		Sync:           db.options.Sync,
		BytesPerSync:   db.options.BytesPerSync,
	})
	if err != nil {
		return nil, err
	}
	return walFiles, nil
}

func (db *DB) loadIndex() error {
	// load index from the hint file
	if err := db.loadIndexFromHintFile(); err != nil {
		return err
	}
	// load index from data files
	if err := db.loadIndexFromWAL(); err != nil {
		return err
	}
	return nil
}

// Close the database, close all data files and release the file lock.
// Set the closed flag to true.
// The DB instance cannot be used after closing.
func (db *DB) Close() error {
	db.mu.Lock()
	defer db.mu.Unlock()

	if err := db.closeFiles(); err != nil {
		return err
	}

	// release file lock
	if err := db.fileLock.Unlock(); err != nil {
		return err
	}

	// close watch channel
	if db.options.WatchQueueSize > 0 {
		close(db.watchCh)
	}

	// stop the auto merge cron scheduler
	if db.cronScheduler != nil {
		db.cronScheduler.Stop()
	}

	db.closed = true
	return nil
}

// closeFiles closes all data files and the hint file
func (db *DB) closeFiles() error {
	// close wal
	if err := db.dataFiles.Close(); err != nil {
		return err
	}
	// close hint file if it exists
	if db.hintFile != nil {
		if err := db.hintFile.Close(); err != nil {
			return err
		}
	}
	return nil
}

// Sync all data files to the underlying storage.
func (db *DB) Sync() error {
	db.mu.Lock()
	defer db.mu.Unlock()

	return db.dataFiles.Sync()
}

// Stat returns the statistics of the database.
func (db *DB) Stat() *Stat {
	db.mu.Lock()
	defer db.mu.Unlock()

	diskSize, err := utils.DirSize(db.options.DirPath)
	if err != nil {
		panic(fmt.Sprintf("rosedb: get database directory size error: %v", err))
	}

	return &Stat{
		KeysNum:  db.index.Size(),
		DiskSize: diskSize,
	}
}
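
// A small sketch of reading database statistics. Note that Stat panics if the
// directory size cannot be read, so callers may want to guard against that.
func exampleStat(db *DB) {
	stat := db.Stat()
	fmt.Printf("keys: %d, disk size: %d bytes\n", stat.KeysNum, stat.DiskSize)
}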

// Put a key-value pair into the database.
// Actually, it will open a new batch and commit it.
// You can think of it as a batch with only one Put operation.
func (db *DB) Put(key []byte, value []byte) error {
	batch := db.batchPool.Get().(*Batch)
	defer func() {
		batch.reset()
		db.batchPool.Put(batch)
	}()
	// This is a single put operation, so we can set Sync to false.
	// Because the data will be written to the WAL,
	// and the WAL file will be synced to disk according to the DB options.
	batch.init(false, false, db)
	if err := batch.Put(key, value); err != nil {
		_ = batch.Rollback()
		return err
	}
	return batch.Commit()
}

// PutWithTTL puts a key-value pair into the database, with a ttl.
// Actually, it will open a new batch and commit it.
// You can think of it as a batch with only one PutWithTTL operation.
func (db *DB) PutWithTTL(key []byte, value []byte, ttl time.Duration) error {
	batch := db.batchPool.Get().(*Batch)
	defer func() {
		batch.reset()
		db.batchPool.Put(batch)
	}()
	// This is a single put operation, so we can set Sync to false.
	// Because the data will be written to the WAL,
	// and the WAL file will be synced to disk according to the DB options.
	batch.init(false, false, db)
	if err := batch.PutWithTTL(key, value, ttl); err != nil {
		_ = batch.Rollback()
		return err
	}
	return batch.Commit()
}
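
// A sketch of writing a key that expires automatically (illustrative only).
// Once the ttl has elapsed, reading the key returns ErrKeyNotFound.
func examplePutWithTTL(db *DB) error {
	if err := db.PutWithTTL([]byte("session"), []byte("token"), time.Minute); err != nil {
		return err
	}
	ttl, err := db.TTL([]byte("session")) // remaining time to live
	if err != nil {
		return err
	}
	fmt.Println("remaining ttl:", ttl)
	return nil
}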

// Get the value of the specified key from the database.
// Actually, it will open a new batch and commit it.
// You can think of it as a batch with only one Get operation.
func (db *DB) Get(key []byte) ([]byte, error) {
	batch := db.batchPool.Get().(*Batch)
	batch.init(true, false, db)
	defer func() {
		_ = batch.Commit()
		batch.reset()
		db.batchPool.Put(batch)
	}()
	return batch.Get(key)
}

// Delete the specified key from the database.
// Actually, it will open a new batch and commit it.
// You can think of it as a batch with only one Delete operation.
func (db *DB) Delete(key []byte) error {
	batch := db.batchPool.Get().(*Batch)
	defer func() {
		batch.reset()
		db.batchPool.Put(batch)
	}()
	// This is a single delete operation, so we can set Sync to false.
	// Because the data will be written to the WAL,
	// and the WAL file will be synced to disk according to the DB options.
	batch.init(false, false, db)
	if err := batch.Delete(key); err != nil {
		_ = batch.Rollback()
		return err
	}
	return batch.Commit()
}

// Exist checks if the specified key exists in the database.
// Actually, it will open a new batch and commit it.
// You can think of it as a batch with only one Exist operation.
func (db *DB) Exist(key []byte) (bool, error) {
	batch := db.batchPool.Get().(*Batch)
	batch.init(true, false, db)
	defer func() {
		_ = batch.Commit()
		batch.reset()
		db.batchPool.Put(batch)
	}()
	return batch.Exist(key)
}
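
// A sketch of the basic single-operation round trip. Each call below opens an
// implicit one-operation batch and commits it, as described above.
func exampleRoundTrip(db *DB) error {
	if err := db.Put([]byte("name"), []byte("rosedb")); err != nil {
		return err
	}
	value, err := db.Get([]byte("name"))
	if err != nil {
		return err
	}
	fmt.Println("value:", string(value))

	ok, err := db.Exist([]byte("name"))
	if err != nil {
		return err
	}
	fmt.Println("exists:", ok)
	return db.Delete([]byte("name"))
}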

// Expire sets the ttl of the key.
func (db *DB) Expire(key []byte, ttl time.Duration) error {
	batch := db.batchPool.Get().(*Batch)
	defer func() {
		batch.reset()
		db.batchPool.Put(batch)
	}()
	// This is a single expire operation, so we can set Sync to false.
	// Because the data will be written to the WAL,
	// and the WAL file will be synced to disk according to the DB options.
	batch.init(false, false, db)
	if err := batch.Expire(key, ttl); err != nil {
		_ = batch.Rollback()
		return err
	}
	return batch.Commit()
}

// TTL gets the ttl of the key.
func (db *DB) TTL(key []byte) (time.Duration, error) {
	batch := db.batchPool.Get().(*Batch)
	batch.init(true, false, db)
	defer func() {
		_ = batch.Commit()
		batch.reset()
		db.batchPool.Put(batch)
	}()
	return batch.TTL(key)
}

// Persist removes the ttl of the key.
// If the key does not exist or has expired, it will return ErrKeyNotFound.
func (db *DB) Persist(key []byte) error {
	batch := db.batchPool.Get().(*Batch)
	defer func() {
		batch.reset()
		db.batchPool.Put(batch)
	}()
	// This is a single persist operation, so we can set Sync to false.
	// Because the data will be written to the WAL,
	// and the WAL file will be synced to disk according to the DB options.
	batch.init(false, false, db)
	if err := batch.Persist(key); err != nil {
		_ = batch.Rollback()
		return err
	}
	return batch.Commit()
}
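
// A sketch of attaching a ttl to an existing key and then removing it again.
// As documented above, Persist returns ErrKeyNotFound if the key is absent or
// has expired.
func exampleExpirePersist(db *DB) error {
	if err := db.Put([]byte("cache"), []byte("value")); err != nil {
		return err
	}
	if err := db.Expire([]byte("cache"), 10*time.Second); err != nil {
		return err
	}
	// Persist removes the ttl, so the key no longer expires.
	return db.Persist([]byte("cache"))
}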

// Watch returns a channel from which the caller can consume watch events.
// It returns ErrWatchDisabled if the watch feature is not enabled,
// i.e. options.WatchQueueSize <= 0.
func (db *DB) Watch() (<-chan *Event, error) {
	if db.options.WatchQueueSize <= 0 {
		return nil, ErrWatchDisabled
	}
	return db.watchCh, nil
}
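
// A sketch of consuming watch events. It assumes the database was opened with
// options.WatchQueueSize > 0; the event channel is closed by Close.
func exampleWatch(db *DB) error {
	eventCh, err := db.Watch()
	if err != nil {
		return err
	}
	go func() {
		for event := range eventCh {
			// each event describes a single write to the database;
			// see the Event type in watch.go for its exact fields
			fmt.Printf("event: %+v\n", event)
		}
	}()
	return nil
}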

// Ascend calls handleFn for each key/value pair in the db in ascending order.
func (db *DB) Ascend(handleFn func(k []byte, v []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	db.index.Ascend(func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		chunk, err := db.dataFiles.Read(pos)
		if err != nil {
			return false, err
		}
		if value := db.checkValue(chunk); value != nil {
			return handleFn(key, value)
		}
		return true, nil
	})
}
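
// A sketch of a full ascending scan. Returning false from the handler stops
// the iteration early; the same pattern applies to the Descend* variants below.
func exampleAscend(db *DB) {
	db.Ascend(func(k []byte, v []byte) (bool, error) {
		fmt.Printf("%s = %s\n", k, v)
		return true, nil // keep iterating
	})
}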

// AscendRange calls handleFn for each key/value pair in the db within the range [startKey, endKey] in ascending order.
func (db *DB) AscendRange(startKey, endKey []byte, handleFn func(k []byte, v []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	db.index.AscendRange(startKey, endKey, func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		chunk, err := db.dataFiles.Read(pos)
		if err != nil {
			return false, err
		}
		if value := db.checkValue(chunk); value != nil {
			return handleFn(key, value)
		}
		return true, nil
	})
}

// AscendGreaterOrEqual calls handleFn for each key/value pair in the db with keys greater than or equal to the given key.
func (db *DB) AscendGreaterOrEqual(key []byte, handleFn func(k []byte, v []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	db.index.AscendGreaterOrEqual(key, func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		chunk, err := db.dataFiles.Read(pos)
		if err != nil {
			return false, err
		}
		if value := db.checkValue(chunk); value != nil {
			return handleFn(key, value)
		}
		return true, nil
	})
}

// AscendKeys calls handleFn for each key in the db in ascending order.
// Since the expiry time is stored in the value, set the parameter
// filterExpired to true if you want to filter out expired keys. Note that
// this affects performance, because the value of each key must be read to
// determine whether it has expired.
func (db *DB) AscendKeys(pattern []byte, filterExpired bool, handleFn func(k []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	var reg *regexp.Regexp
	if len(pattern) > 0 {
		reg = regexp.MustCompile(string(pattern))
	}

	db.index.Ascend(func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		if reg == nil || reg.Match(key) {
			var invalid bool
			if filterExpired {
				chunk, err := db.dataFiles.Read(pos)
				if err != nil {
					return false, err
				}
				if value := db.checkValue(chunk); value == nil {
					invalid = true
				}
			}
			if invalid {
				return true, nil
			}
			return handleFn(key)
		}
		return true, nil
	})
}
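
// A sketch of listing keys that match a pattern. The pattern is compiled as a
// Go regular expression; the "user:" prefix is a placeholder. Passing
// filterExpired=true skips expired keys at the cost of reading each value.
func exampleAscendKeys(db *DB) {
	db.AscendKeys([]byte(`^user:`), true, func(k []byte) (bool, error) {
		fmt.Println(string(k))
		return true, nil
	})
}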

// Descend calls handleFn for each key/value pair in the db in descending order.
func (db *DB) Descend(handleFn func(k []byte, v []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	db.index.Descend(func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		chunk, err := db.dataFiles.Read(pos)
		if err != nil {
			return false, err
		}
		if value := db.checkValue(chunk); value != nil {
			return handleFn(key, value)
		}
		return true, nil
	})
}

// DescendRange calls handleFn for each key/value pair in the db within the range [startKey, endKey] in descending order.
func (db *DB) DescendRange(startKey, endKey []byte, handleFn func(k []byte, v []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	db.index.DescendRange(startKey, endKey, func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		chunk, err := db.dataFiles.Read(pos)
		if err != nil {
			return false, err
		}
		if value := db.checkValue(chunk); value != nil {
			return handleFn(key, value)
		}
		return true, nil
	})
}

// DescendLessOrEqual calls handleFn for each key/value pair in the db with keys less than or equal to the given key.
func (db *DB) DescendLessOrEqual(key []byte, handleFn func(k []byte, v []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	db.index.DescendLessOrEqual(key, func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		chunk, err := db.dataFiles.Read(pos)
		if err != nil {
			return false, err
		}
		if value := db.checkValue(chunk); value != nil {
			return handleFn(key, value)
		}
		return true, nil
	})
}

// DescendKeys calls handleFn for each key in the db in descending order.
// Since the expiry time is stored in the value, set the parameter
// filterExpired to true if you want to filter out expired keys. Note that
// this affects performance, because the value of each key must be read to
// determine whether it has expired.
func (db *DB) DescendKeys(pattern []byte, filterExpired bool, handleFn func(k []byte) (bool, error)) {
	db.mu.RLock()
	defer db.mu.RUnlock()

	var reg *regexp.Regexp
	if len(pattern) > 0 {
		reg = regexp.MustCompile(string(pattern))
	}

	db.index.Descend(func(key []byte, pos *wal.ChunkPosition) (bool, error) {
		if reg == nil || reg.Match(key) {
			var invalid bool
			if filterExpired {
				chunk, err := db.dataFiles.Read(pos)
				if err != nil {
					return false, err
				}
				if value := db.checkValue(chunk); value == nil {
					invalid = true
				}
			}
			if invalid {
				return true, nil
			}
			return handleFn(key)
		}
		return true, nil
	})
}

// checkValue decodes the chunk and returns the record's value if it is
// neither deleted nor expired; otherwise it returns nil.
func (db *DB) checkValue(chunk []byte) []byte {
	record := decodeLogRecord(chunk)
	now := time.Now().UnixNano()
	if record.Type != LogRecordDeleted && !record.IsExpired(now) {
		return record.Value
	}
	return nil
}

func checkOptions(options Options) error {
	if options.DirPath == "" {
		return errors.New("database dir path is empty")
	}
	if options.SegmentSize <= 0 {
		return errors.New("database data file size must be greater than 0")
	}

	if len(options.AutoMergeCronExpr) > 0 {
		if _, err := cron.NewParser(cron.SecondOptional | cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow | cron.Descriptor).
			Parse(options.AutoMergeCronExpr); err != nil {
			return fmt.Errorf("database auto merge cron expression is invalid, err: %v", err)
		}
	}

	return nil
}
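
// A sketch of options that pass checkOptions. Because the parser is built with
// cron.SecondOptional, both 5-field and 6-field expressions are accepted; the
// 6-field expression below would run the auto merge task daily at 03:00:00.
// DefaultOptions is assumed to be defined in options.go.
func exampleOptions() Options {
	options := DefaultOptions
	options.DirPath = "/tmp/rosedb_example"
	options.AutoMergeCronExpr = "0 0 3 * * *" // sec min hour dom month dow
	return options
}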

// loadIndexFromWAL loads the index from the WAL.
// It will iterate over all the WAL files and read data
// from them to rebuild the index.
func (db *DB) loadIndexFromWAL() error {
	mergeFinSegmentId, err := getMergeFinSegmentId(db.options.DirPath)
	if err != nil {
		return err
	}
	indexRecords := make(map[uint64][]*IndexRecord)
	now := time.Now().UnixNano()
	// get a reader for the WAL
	reader := db.dataFiles.NewReader()
	for {
		// if the current segment id is less than or equal to the mergeFinSegmentId,
		// we can skip this segment because it has already been merged,
		// and its index can be loaded from the hint file directly.
		if reader.CurrentSegmentId() <= mergeFinSegmentId {
			reader.SkipCurrentSegment()
			continue
		}

		chunk, position, err := reader.Next()
		if err != nil {
			if err == io.EOF {
				break
			}
			return err
		}
		// decode and get log record
		record := decodeLogRecord(chunk)

		// if we reach the end of a batch,
		// all records in this batch are ready to be indexed.
		if record.Type == LogRecordBatchFinished {
			batchId, err := snowflake.ParseBytes(record.Key)
			if err != nil {
				return err
			}
			for _, idxRecord := range indexRecords[uint64(batchId)] {
				if idxRecord.recordType == LogRecordNormal {
					db.index.Put(idxRecord.key, idxRecord.position)
				}
				if idxRecord.recordType == LogRecordDeleted {
					db.index.Delete(idxRecord.key)
				}
			}
			// delete indexRecords according to batchId after indexing
			delete(indexRecords, uint64(batchId))
		} else if record.Type == LogRecordNormal && record.BatchId == mergeFinishedBatchID {
			// if the record is a normal record and the batch id is mergeFinishedBatchID,
			// the record was produced by the merge operation,
			// so put the record into the index directly.
			db.index.Put(record.Key, position)
		} else {
			// expired records should not be indexed
			if record.IsExpired(now) {
				db.index.Delete(record.Key)
				continue
			}
			// put the record into the temporary indexRecords
			indexRecords[record.BatchId] = append(indexRecords[record.BatchId],
				&IndexRecord{
					key:        record.Key,
					recordType: record.Type,
					position:   position,
				})
		}
	}
	return nil
}

// DeleteExpiredKeys scans the entire index in ascending order to delete expired keys.
// It is a time-consuming operation, so we need to specify a timeout
// to prevent the DB from being unavailable for a long time.
func (db *DB) DeleteExpiredKeys(timeout time.Duration) error {
	// set timeout
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	done := make(chan struct{}, 1)

	var innerErr error
	now := time.Now().UnixNano()
	go func(ctx context.Context) {
		db.mu.Lock()
		defer db.mu.Unlock()
		for {
			// select 100 keys from the db.index
			positions := make([]*wal.ChunkPosition, 0, 100)
			db.index.AscendGreaterOrEqual(db.expiredCursorKey, func(k []byte, pos *wal.ChunkPosition) (bool, error) {
				positions = append(positions, pos)
				if len(positions) >= 100 {
					return false, nil
				}
				return true, nil
			})

			// if all keys in db.index have been traversed, len(positions) will be 0.
			if len(positions) == 0 {
				db.expiredCursorKey = nil
				done <- struct{}{}
				return
			}

			// delete the key from the index if it is expired.
			for _, pos := range positions {
				chunk, err := db.dataFiles.Read(pos)
				if err != nil {
					innerErr = err
					done <- struct{}{}
					return
				}
				record := decodeLogRecord(chunk)
				if record.IsExpired(now) {
					db.index.Delete(record.Key)
				}
				db.expiredCursorKey = record.Key
			}
		}
	}(ctx)

	select {
	case <-ctx.Done():
		return innerErr
	case <-done:
		return nil
	}
}
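
// A sketch of sweeping expired keys periodically. The sweep holds the write
// lock, so the timeout should be kept short; db.expiredCursorKey lets a later
// call resume roughly where the previous one stopped.
func exampleSweepExpiredKeys(db *DB) {
	ticker := time.NewTicker(time.Hour)
	defer ticker.Stop()
	for range ticker.C {
		if err := db.DeleteExpiredKeys(100 * time.Millisecond); err != nil {
			fmt.Println("delete expired keys:", err)
		}
	}
}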