github.com/klaytn/klaytn@v1.12.1/storage/database/sharded_database.go (about)

     1  // Copyright 2019 The klaytn Authors
     2  // This file is part of the klaytn library.
     3  //
     4  // The klaytn library is free software: you can redistribute it and/or modify
     5  // it under the terms of the GNU Lesser General Public License as published by
     6  // the Free Software Foundation, either version 3 of the License, or
     7  // (at your option) any later version.
     8  //
     9  // The klaytn library is distributed in the hope that it will be useful,
    10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    11  // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    12  // GNU Lesser General Public License for more details.
    13  //
    14  // You should have received a copy of the GNU Lesser General Public License
    15  // along with the klaytn library. If not, see <http://www.gnu.org/licenses/>.
    16  
    17  package database
    18  
    19  import (
    20  	"bytes"
    21  	"container/heap"
    22  	"context"
    23  	"fmt"
    24  	"path"
    25  	"strconv"
    26  	"sync"
    27  
    28  	"github.com/klaytn/klaytn/common"
    29  	"github.com/pkg/errors"
    30  )
    31  
// errKeyLengthZero is returned by shardIndexByKey when the given key is empty.
// The shard is selected from the first byte of the key, so an empty key
// cannot be mapped to a shard.
var errKeyLengthZero = fmt.Errorf("database key for sharded database should be greater than 0")
    33  
// numShardsLimit is the maximum number of shards accepted by newShardedDB.
// The shard index is derived from a single byte of the key (see
// shardIndexByKey), so more than 256 shards could never be addressed.
const numShardsLimit = 256
    35  
// shardedDB is a database that spreads its entries over several underlying
// databases (shards). The shard of an entry is selected from the first byte
// of its key, see shardIndexByKey.
type shardedDB struct {
	// fn is the base directory given in DBConfig.Dir. Each shard is created
	// in a sub-directory of it named after the shard index.
	fn string
	// shards holds the underlying databases, indexed by shard number.
	shards []Database
	// numShards is the number of shards requested at creation time.
	numShards uint

	// sdbBatchTaskCh is the queue of batch write tasks consumed by the
	// batchWriteWorker goroutines. It is closed in Close.
	sdbBatchTaskCh chan sdbBatchTask
}
    43  
// sdbBatchTask is a unit of work handed to a batchWriteWorker: one shard batch
// to be written, together with the channel on which the result is reported.
//
// NOTE: shardedDBBatch.Write builds this struct with a positional literal,
// so the order of the fields must not be changed.
type sdbBatchTask struct {
	batch    Batch               // A batch that each worker executes.
	index    int                 // Index of given batch.
	resultCh chan sdbBatchResult // Batch result channel for each shardedDBBatch.
}
    49  
// sdbBatchResult is the outcome of a single sdbBatchTask, sent back by a
// batchWriteWorker on the task's resultCh.
//
// NOTE: batchWriteWorker builds this struct with a positional literal,
// so the order of the fields must not be changed.
type sdbBatchResult struct {
	index int   // Index of the batch result.
	err   error // Error from the batch write operation.
}
    54  
    55  // newShardedDB creates database with numShards shards, or partitions.
    56  // The type of database is specified DBConfig.DBType.
    57  func newShardedDB(dbc *DBConfig, et DBEntryType, numShards uint) (*shardedDB, error) {
    58  	if numShards == 0 {
    59  		logger.Crit("numShards should be greater than 0!")
    60  	}
    61  
    62  	if numShards > numShardsLimit {
    63  		logger.Crit(fmt.Sprintf("numShards should be equal to or smaller than %v, but it is %v.", numShardsLimit, numShards))
    64  	}
    65  
    66  	if !IsPow2(numShards) {
    67  		logger.Crit(fmt.Sprintf("numShards should be power of two, but it is %v", numShards))
    68  	}
    69  
    70  	shards := make([]Database, 0, numShards)
    71  	sdbBatchTaskCh := make(chan sdbBatchTask, numShards*2)
    72  	sdbLevelDBCacheSize := dbc.LevelDBCacheSize / int(numShards)
    73  	sdbOpenFilesLimit := dbc.OpenFilesLimit / int(numShards)
    74  	sdbRocksDBCacheSize := GetDefaultRocksDBConfig().CacheSize / uint64(numShards)
    75  	sdbRocksDBMaxOpenFiles := GetDefaultRocksDBConfig().MaxOpenFiles / int(numShards)
    76  	if dbc.RocksDBConfig != nil {
    77  		sdbRocksDBCacheSize = dbc.RocksDBConfig.CacheSize / uint64(numShards)
    78  		sdbRocksDBMaxOpenFiles = dbc.RocksDBConfig.MaxOpenFiles / int(numShards)
    79  	}
    80  	for i := 0; i < int(numShards); i++ {
    81  		copiedDBC := *dbc
    82  		copiedDBC.Dir = path.Join(copiedDBC.Dir, strconv.Itoa(i))
    83  		copiedDBC.LevelDBCacheSize = sdbLevelDBCacheSize
    84  		copiedDBC.OpenFilesLimit = sdbOpenFilesLimit
    85  		if copiedDBC.RocksDBConfig != nil {
    86  			copiedDBC.RocksDBConfig.CacheSize = sdbRocksDBCacheSize
    87  			copiedDBC.RocksDBConfig.MaxOpenFiles = sdbRocksDBMaxOpenFiles
    88  		}
    89  
    90  		db, err := newDatabase(&copiedDBC, et)
    91  		if err != nil {
    92  			return nil, err
    93  		}
    94  		shards = append(shards, db)
    95  		go batchWriteWorker(sdbBatchTaskCh)
    96  	}
    97  
    98  	logger.Info("Created a sharded database", "dbType", et, "numShards", numShards)
    99  	return &shardedDB{
   100  		fn: dbc.Dir, shards: shards,
   101  		numShards: numShards, sdbBatchTaskCh: sdbBatchTaskCh,
   102  	}, nil
   103  }
   104  
   105  // batchWriteWorker executes passed batch tasks.
   106  func batchWriteWorker(batchTasks <-chan sdbBatchTask) {
   107  	for task := range batchTasks {
   108  		task.resultCh <- sdbBatchResult{task.index, task.batch.Write()}
   109  	}
   110  }
   111  
   112  // IsPow2 checks if the given number is power of two or not.
   113  func IsPow2(num uint) bool {
   114  	return (num & (num - 1)) == 0
   115  }
   116  
   117  // shardIndexByKey returns shard index derived from the given key.
   118  // If len(key) is zero, it returns errKeyLengthZero.
   119  func shardIndexByKey(key []byte, numShards uint) (int, error) {
   120  	if len(key) == 0 {
   121  		return 0, errKeyLengthZero
   122  	}
   123  
   124  	return int(key[0]) & (int(numShards) - 1), nil
   125  }
   126  
   127  // getShardByKey returns the shard corresponding to the given key.
   128  func (db *shardedDB) getShardByKey(key []byte) (Database, error) {
   129  	if shardIndex, err := shardIndexByKey(key, uint(db.numShards)); err != nil {
   130  		return nil, err
   131  	} else {
   132  		return db.shards[shardIndex], nil
   133  	}
   134  }
   135  
   136  func (db *shardedDB) Put(key []byte, value []byte) error {
   137  	if shard, err := db.getShardByKey(key); err != nil {
   138  		return err
   139  	} else {
   140  		return shard.Put(key, value)
   141  	}
   142  }
   143  
   144  func (db *shardedDB) Get(key []byte) ([]byte, error) {
   145  	if shard, err := db.getShardByKey(key); err != nil {
   146  		return nil, err
   147  	} else {
   148  		return shard.Get(key)
   149  	}
   150  }
   151  
   152  func (db *shardedDB) Has(key []byte) (bool, error) {
   153  	if shard, err := db.getShardByKey(key); err != nil {
   154  		return false, err
   155  	} else {
   156  		return shard.Has(key)
   157  	}
   158  }
   159  
   160  func (db *shardedDB) Delete(key []byte) error {
   161  	if shard, err := db.getShardByKey(key); err != nil {
   162  		return err
   163  	} else {
   164  		return shard.Delete(key)
   165  	}
   166  }
   167  
   168  func (db *shardedDB) Close() {
   169  	close(db.sdbBatchTaskCh)
   170  
   171  	for _, shard := range db.shards {
   172  		shard.Close()
   173  	}
   174  }
   175  
// Buffer sizes of the channels used by the sharded database iterators.
// Channels that are too small slow down the iterator.
const (
	shardedDBCombineChanSize = 1024 // Size of resultCh
	shardedDBSubChannelSize  = 128  // Size of each sub-channel of resultChs
)
   181  
// shardedDBIterator iterates over the items of all shards of a shardedDB
// through a single result channel.
// When created by NewIterator, the items are delivered in serial in
// binary-alphabetical order; when created by NewIteratorUnsorted, they are
// delivered in whatever order the shard workers produce them.
type shardedDBIterator struct {
	// parallelIterator owns the per-shard iterators and their workers.
	parallelIterator shardedDBParallelIterator

	// resultCh is the channel Next() reads from.
	resultCh chan common.Entry
	key      []byte // current key
	value    []byte // current value
}
   191  
   192  // NewIterator creates a binary-alphabetical iterator over a subset
   193  // of database content with a particular key prefix, starting at a particular
   194  // initial key (or after, if it does not exist).
   195  func (db *shardedDB) NewIterator(prefix []byte, start []byte) Iterator {
   196  	it := &shardedDBIterator{
   197  		parallelIterator: db.NewParallelIterator(context.TODO(), prefix, start, nil),
   198  		resultCh:         make(chan common.Entry, shardedDBCombineChanSize),
   199  	}
   200  
   201  	go it.runCombineWorker()
   202  
   203  	return it
   204  }
   205  
   206  // NewIteratorUnsorted creates a iterator over the entire keyspace contained within
   207  // the key-value database. This is useful when you want to get items fast in serial.
   208  // If you want to get ordered items in serial, checkout shardedDB.NewIterator()
   209  // If you want to get items in parallel from channels, checkout shardedDB.NewParallelIterator()
   210  // IteratorUnsorted is a implementation of Iterator and data are accessed with
   211  // Next(), Key() and Value() methods. With ChanIterator, data can be accessed with
   212  // channels. The channels are gained with Channels() method.
   213  func (db *shardedDB) NewIteratorUnsorted(prefix []byte, start []byte) Iterator {
   214  	resultCh := make(chan common.Entry, shardedDBCombineChanSize)
   215  	return &shardedDBIterator{
   216  		parallelIterator: db.NewParallelIterator(context.TODO(), prefix, start, resultCh),
   217  		resultCh:         resultCh,
   218  	}
   219  }
   220  
   221  // runCombineWorker fetches any key/value from resultChs and put the data in resultCh
   222  // in binary-alphabetical order.
   223  func (it *shardedDBIterator) runCombineWorker() {
   224  	// creates min-priority queue smallest values from each iterators
   225  	entries := &entryHeap{}
   226  	heap.Init(entries)
   227  	for i, ch := range it.parallelIterator.resultChs {
   228  		if e, ok := <-ch; ok {
   229  			heap.Push(entries, entryWithShardNum{e, i})
   230  		}
   231  	}
   232  
   233  chanIter:
   234  	for len(*entries) != 0 {
   235  		// check if done
   236  		select {
   237  		case <-it.parallelIterator.ctx.Done():
   238  			logger.Trace("[shardedDBIterator] combine worker ends due to ctx")
   239  			break chanIter
   240  		default:
   241  		}
   242  
   243  		// look for smallest key
   244  		minEntry := heap.Pop(entries).(entryWithShardNum)
   245  
   246  		// fill resultCh with smallest key
   247  		it.resultCh <- minEntry.Entry
   248  
   249  		// fill used entry with new entry
   250  		// skip this if channel is closed
   251  		if e, ok := <-it.parallelIterator.resultChs[minEntry.shardNum]; ok {
   252  			heap.Push(entries, entryWithShardNum{e, minEntry.shardNum})
   253  		}
   254  	}
   255  	logger.Trace("[shardedDBIterator] combine worker finished")
   256  	close(it.resultCh)
   257  }
   258  
   259  // Next gets the next item from iterators.
   260  func (it *shardedDBIterator) Next() bool {
   261  	e, ok := <-it.resultCh
   262  	if !ok {
   263  		logger.Debug("[shardedDBIterator] Next is called on closed channel")
   264  		return false
   265  	}
   266  	it.key, it.value = e.Key, e.Val
   267  	return true
   268  }
   269  
   270  func (it *shardedDBIterator) Error() error {
   271  	for i, iter := range it.parallelIterator.iterators {
   272  		if iter.Error() != nil {
   273  			logger.Error("[shardedDBIterator] error from iterator",
   274  				"err", iter.Error(), "shardNum", i, "key", it.key, "val", it.value)
   275  			return iter.Error()
   276  		}
   277  	}
   278  	return nil
   279  }
   280  
// Key returns the key of the entry fetched by the last successful Next().
// It is nil before the first successful call of Next().
func (it *shardedDBIterator) Key() []byte {
	return it.key
}
   284  
// Value returns the value of the entry fetched by the last successful Next().
// It is nil before the first successful call of Next().
func (it *shardedDBIterator) Value() []byte {
	return it.value
}
   288  
// Release cancels the context of the underlying parallel iterator, which
// signals its workers (and the combine worker, if any) to stop.
// The shard iterators themselves are released by their workers.
func (it *shardedDBIterator) Release() {
	it.parallelIterator.cancel()
}
   292  
// entryWithShardNum is a database entry tagged with the number of the shard
// it was read from. runCombineWorker uses the shard number to know which
// shard channel to read next after the entry has been consumed.
//
// NOTE: runCombineWorker builds this struct with a positional literal,
// so the order of the fields must not be changed.
type entryWithShardNum struct {
	common.Entry
	shardNum int
}
   297  
   298  type entryHeap []entryWithShardNum
   299  
   300  func (e entryHeap) Len() int {
   301  	return len(e)
   302  }
   303  
   304  func (e entryHeap) Less(i, j int) bool {
   305  	return bytes.Compare(e[i].Key, e[j].Key) < 0
   306  }
   307  
   308  func (e entryHeap) Swap(i, j int) {
   309  	e[i], e[j] = e[j], e[i]
   310  }
   311  
   312  func (e *entryHeap) Push(x interface{}) {
   313  	*e = append(*e, x.(entryWithShardNum))
   314  }
   315  
   316  func (e *entryHeap) Pop() interface{} {
   317  	old := *e
   318  	n := len(old)
   319  	element := old[n-1]
   320  	*e = old[0 : n-1]
   321  	return element
   322  }
   323  
// shardedDBParallelIterator creates iterators for each shard DB.
// Every iterator is drained by its own worker goroutine (runChanWorker) that
// pushes the entries into a channel; the channels can be obtained with
// Channels(). Each iterator fetches values in binary-alphabetical order.
// This is useful when you want to operate on each item in parallel.
type shardedDBParallelIterator struct {
	// ctx is a cancellable child of the context given to NewParallelIterator;
	// cancel stops the workers.
	ctx    context.Context
	cancel context.CancelFunc

	// iterators holds one iterator per shard, indexed by shard number.
	iterators []Iterator

	combinedChan bool // all workers put items to one resultChan
	shardNum     int  // num of shards left to iterate
	// shardNumMu guards shardNum. It is a pointer so that copies of the
	// struct share the same mutex.
	shardNumMu *sync.Mutex
	// resultChs holds the channel of each shard. When combinedChan is set,
	// every element is the same channel.
	resultChs []chan common.Entry
}
   339  
   340  // NewParallelIterator creates iterators for each shard DB. This is useful when you
   341  // want to operate on each items in parallel.
   342  // If `resultCh` is given, all items are written to `resultCh`, unsorted with a
   343  // particular key prefix, starting at a particular initial key. If `resultCh`
   344  // is not given, new channels are created for each DB. Items are written to
   345  // corresponding channels in binary-alphabetical order. The channels can be
   346  // gained by calling `Channels()`.
   347  //
   348  // If you want to get ordered items in serial, checkout shardedDB.NewIterator()
   349  // If you want to get unordered items in serial with Iterator Interface,
   350  // checkout shardedDB.NewIteratorUnsorted().
   351  func (db *shardedDB) NewParallelIterator(ctx context.Context, prefix []byte, start []byte, resultCh chan common.Entry) shardedDBParallelIterator {
   352  	if ctx == nil {
   353  		ctx = context.TODO()
   354  	}
   355  
   356  	it := shardedDBParallelIterator{
   357  		ctx:          ctx,
   358  		cancel:       nil,
   359  		iterators:    make([]Iterator, len(db.shards)),
   360  		combinedChan: resultCh != nil,
   361  		shardNum:     len(db.shards),
   362  		shardNumMu:   &sync.Mutex{},
   363  		resultChs:    make([]chan common.Entry, len(db.shards)),
   364  	}
   365  	it.ctx, it.cancel = context.WithCancel(ctx)
   366  
   367  	for i, shard := range db.shards {
   368  		it.iterators[i] = shard.NewIterator(prefix, start)
   369  		if resultCh == nil {
   370  			it.resultChs[i] = make(chan common.Entry, shardedDBSubChannelSize)
   371  		} else {
   372  			it.resultChs[i] = resultCh
   373  		}
   374  		go it.runChanWorker(it.ctx, it.iterators[i], it.resultChs[i])
   375  	}
   376  
   377  	return it
   378  }
   379  
   380  // runChanWorker runs a worker. The worker gets key/value pair from
   381  // `it` and push the value to `resultCh`.
   382  // `iterator.Release()` is called after all iterating is finished.
   383  // `resultCh` is closed after the iterating is finished.
   384  func (sit *shardedDBParallelIterator) runChanWorker(ctx context.Context, it Iterator, resultCh chan common.Entry) {
   385  iter:
   386  	for it.Next() {
   387  		select {
   388  		case <-ctx.Done():
   389  			break iter
   390  		default:
   391  		}
   392  		key := make([]byte, len(it.Key()))
   393  		val := make([]byte, len(it.Value()))
   394  		copy(key, it.Key())
   395  		copy(val, it.Value())
   396  		resultCh <- common.Entry{Key: key, Val: val}
   397  	}
   398  	// Release the iterator. There is nothing to iterate anymore.
   399  	it.Release()
   400  	// Close `resultCh`. If it is `combinedChan`, the close only happens
   401  	// when this is the last living worker.
   402  	sit.shardNumMu.Lock()
   403  	defer sit.shardNumMu.Unlock()
   404  	if sit.shardNum--; sit.combinedChan && sit.shardNum > 0 {
   405  		return
   406  	}
   407  	close(resultCh)
   408  }
   409  
// Channels returns the channels the workers write the entries to, one element
// per shard. If a result channel was passed to NewParallelIterator, every
// element is that same channel.
func (it *shardedDBParallelIterator) Channels() []chan common.Entry {
	return it.resultChs
}
   414  
// Release signals all workers to stop by cancelling the iterator's context.
// Even after Release() is called, there could be some items left in the
// channels.
// Each iterator.Release() is called in `runChanWorker`.
func (it *shardedDBParallelIterator) Release() {
	it.cancel()
}
   421  
   422  func (db *shardedDB) NewBatch() Batch {
   423  	batches := make([]Batch, 0, db.numShards)
   424  	for i := 0; i < int(db.numShards); i++ {
   425  		batches = append(batches, db.shards[i].NewBatch())
   426  	}
   427  
   428  	return &shardedDBBatch{
   429  		batches: batches, numBatches: db.numShards,
   430  		taskCh: db.sdbBatchTaskCh, resultCh: make(chan sdbBatchResult, db.numShards),
   431  	}
   432  }
   433  
// Type returns ShardedDB, regardless of the type of the underlying shards.
func (db *shardedDB) Type() DBType {
	return ShardedDB
}
   437  
   438  func (db *shardedDB) Meter(prefix string) {
   439  	for index, shard := range db.shards {
   440  		shard.Meter(prefix + strconv.Itoa(index))
   441  	}
   442  }
   443  
   444  func (db *shardedDB) TryCatchUpWithPrimary() error {
   445  	for _, shard := range db.shards {
   446  		if err := shard.TryCatchUpWithPrimary(); err != nil {
   447  			return err
   448  		}
   449  	}
   450  	return nil
   451  }
   452  
// shardedDBBatch is the Batch of a shardedDB. It keeps one batch per shard;
// every operation is routed to the batch of the shard its key belongs to.
type shardedDBBatch struct {
	// batches holds the batch of each shard, indexed by shard number.
	batches []Batch
	// numBatches is the number of shards; it is used to derive the shard
	// index from a key.
	numBatches uint

	// taskCh is the task channel of the owning shardedDB; Write hands the
	// shard batches to the batch write workers through it.
	taskCh chan sdbBatchTask
	// resultCh receives the results of the tasks issued by Write.
	resultCh chan sdbBatchResult
}
   460  
   461  func (sdbBatch *shardedDBBatch) Put(key []byte, value []byte) error {
   462  	if ShardIndex, err := shardIndexByKey(key, sdbBatch.numBatches); err != nil {
   463  		return err
   464  	} else {
   465  		return sdbBatch.batches[ShardIndex].Put(key, value)
   466  	}
   467  }
   468  
   469  func (sdbBatch *shardedDBBatch) Delete(key []byte) error {
   470  	if ShardIndex, err := shardIndexByKey(key, sdbBatch.numBatches); err != nil {
   471  		return err
   472  	} else {
   473  		return sdbBatch.batches[ShardIndex].Delete(key)
   474  	}
   475  }
   476  
   477  // ValueSize is called to determine whether to write batches when it exceeds
   478  // certain limit. shardedDB returns the largest size of its batches to
   479  // write all batches at once when one of batch exceeds the limit.
   480  func (sdbBatch *shardedDBBatch) ValueSize() int {
   481  	maxSize := 0
   482  	for _, batch := range sdbBatch.batches {
   483  		if batch.ValueSize() > maxSize {
   484  			maxSize = batch.ValueSize()
   485  		}
   486  	}
   487  	return maxSize
   488  }
   489  
   490  // Write passes the list of batch tasks to taskCh so batch can be processed
   491  // by underlying workers. Write waits until all workers return the result.
   492  func (sdbBatch *shardedDBBatch) Write() error {
   493  	for index, batch := range sdbBatch.batches {
   494  		sdbBatch.taskCh <- sdbBatchTask{batch, index, sdbBatch.resultCh}
   495  	}
   496  
   497  	var err error
   498  	for range sdbBatch.batches {
   499  		if batchResult := <-sdbBatch.resultCh; batchResult.err != nil {
   500  			logger.Error("Error while writing sharded batch", "index", batchResult.index, "err", batchResult.err)
   501  			err = batchResult.err
   502  		}
   503  	}
   504  	// Leave logs for each error but only return the last one.
   505  	return err
   506  }
   507  
   508  func (sdbBatch *shardedDBBatch) Reset() {
   509  	for _, batch := range sdbBatch.batches {
   510  		batch.Reset()
   511  	}
   512  }
   513  
   514  func (sdbBatch *shardedDBBatch) Release() {
   515  	for _, batch := range sdbBatch.batches {
   516  		batch.Release()
   517  	}
   518  }
   519  
   520  func (sdbBatch *shardedDBBatch) Replay(w KeyValueWriter) error {
   521  	for _, batch := range sdbBatch.batches {
   522  		if err := batch.Replay(w); err != nil {
   523  			return err
   524  		}
   525  	}
   526  	return nil
   527  }
   528  
   529  func (db *shardedDB) Stat(property string) (string, error) {
   530  	stats := ""
   531  	errs := ""
   532  	for idx, shard := range db.shards {
   533  		stat, err := shard.Stat(property)
   534  		if err == nil {
   535  			headInfo := fmt.Sprintf(" [shard%d:%s]\n", idx, shard.Type())
   536  			stats += headInfo + stat
   537  		} else {
   538  			errs += fmt.Sprintf("shard[%d]: %s", idx, err.Error())
   539  		}
   540  	}
   541  	if errs == "" {
   542  		return stats, nil
   543  	} else {
   544  		return stats, errors.New(errs)
   545  	}
   546  }
   547  
   548  func (db *shardedDB) Compact(start []byte, limit []byte) error {
   549  	errs := ""
   550  	for idx, shard := range db.shards {
   551  		if err := shard.Compact(start, limit); err != nil {
   552  			errs += fmt.Sprintf("shard[%d]: %s", idx, err.Error())
   553  		}
   554  	}
   555  	if errs == "" {
   556  		return nil
   557  	} else {
   558  		return errors.New(errs)
   559  	}
   560  }