github.com/klaytn/klaytn@v1.12.1/storage/database/sharded_database.go

// Copyright 2019 The klaytn Authors
// This file is part of the klaytn library.
//
// The klaytn library is free software: you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// The klaytn library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with the klaytn library. If not, see <http://www.gnu.org/licenses/>.

package database

import (
	"bytes"
	"container/heap"
	"context"
	"fmt"
	"path"
	"strconv"
	"sync"

	"github.com/klaytn/klaytn/common"
	"github.com/pkg/errors"
)

var errKeyLengthZero = fmt.Errorf("database key for sharded database should be greater than 0")

const numShardsLimit = 256

type shardedDB struct {
	fn        string
	shards    []Database
	numShards uint

	sdbBatchTaskCh chan sdbBatchTask
}

type sdbBatchTask struct {
	batch    Batch               // A batch that each worker executes.
	index    int                 // Index of the given batch.
	resultCh chan sdbBatchResult // Batch result channel for each shardedDBBatch.
}

type sdbBatchResult struct {
	index int   // Index of the batch result.
	err   error // Error from the batch write operation.
}
// newShardedDB creates a database divided into numShards shards, or partitions.
// The type of each underlying database is specified by DBConfig.DBType.
func newShardedDB(dbc *DBConfig, et DBEntryType, numShards uint) (*shardedDB, error) {
	if numShards == 0 {
		logger.Crit("numShards should be greater than 0!")
	}

	if numShards > numShardsLimit {
		logger.Crit(fmt.Sprintf("numShards should be equal to or smaller than %v, but it is %v.", numShardsLimit, numShards))
	}

	if !IsPow2(numShards) {
		logger.Crit(fmt.Sprintf("numShards should be power of two, but it is %v", numShards))
	}

	shards := make([]Database, 0, numShards)
	sdbBatchTaskCh := make(chan sdbBatchTask, numShards*2)
	sdbLevelDBCacheSize := dbc.LevelDBCacheSize / int(numShards)
	sdbOpenFilesLimit := dbc.OpenFilesLimit / int(numShards)
	sdbRocksDBCacheSize := GetDefaultRocksDBConfig().CacheSize / uint64(numShards)
	sdbRocksDBMaxOpenFiles := GetDefaultRocksDBConfig().MaxOpenFiles / int(numShards)
	if dbc.RocksDBConfig != nil {
		sdbRocksDBCacheSize = dbc.RocksDBConfig.CacheSize / uint64(numShards)
		sdbRocksDBMaxOpenFiles = dbc.RocksDBConfig.MaxOpenFiles / int(numShards)
	}
	for i := 0; i < int(numShards); i++ {
		copiedDBC := *dbc
		copiedDBC.Dir = path.Join(copiedDBC.Dir, strconv.Itoa(i))
		copiedDBC.LevelDBCacheSize = sdbLevelDBCacheSize
		copiedDBC.OpenFilesLimit = sdbOpenFilesLimit
		if copiedDBC.RocksDBConfig != nil {
			// Deep-copy the RocksDB config: copiedDBC is a shallow copy of *dbc,
			// so mutating the shared pointer would also change the caller's config.
			copiedRocksDBConfig := *copiedDBC.RocksDBConfig
			copiedRocksDBConfig.CacheSize = sdbRocksDBCacheSize
			copiedRocksDBConfig.MaxOpenFiles = sdbRocksDBMaxOpenFiles
			copiedDBC.RocksDBConfig = &copiedRocksDBConfig
		}

		db, err := newDatabase(&copiedDBC, et)
		if err != nil {
			return nil, err
		}
		shards = append(shards, db)
		go batchWriteWorker(sdbBatchTaskCh)
	}

	logger.Info("Created a sharded database", "dbType", et, "numShards", numShards)
	return &shardedDB{
		fn: dbc.Dir, shards: shards,
		numShards: numShards, sdbBatchTaskCh: sdbBatchTaskCh,
	}, nil
}

// batchWriteWorker executes the batch tasks passed through the channel.
func batchWriteWorker(batchTasks <-chan sdbBatchTask) {
	for task := range batchTasks {
		task.resultCh <- sdbBatchResult{task.index, task.batch.Write()}
	}
}

// IsPow2 checks whether the given number is a power of two.
func IsPow2(num uint) bool {
	return (num & (num - 1)) == 0
}

// shardIndexByKey returns the shard index derived from the given key.
// If len(key) is zero, it returns errKeyLengthZero.
func shardIndexByKey(key []byte, numShards uint) (int, error) {
	if len(key) == 0 {
		return 0, errKeyLengthZero
	}

	return int(key[0]) & (int(numShards) - 1), nil
}
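// exampleShardIndex is an illustrative sketch (the function and its sample
// keys are hypothetical, not part of the library API). It shows how
// shardIndexByKey routes keys: because numShards is validated to be a power
// of two, the bitwise AND behaves like key[0] % numShards, so only the first
// byte of a key decides which shard it lands in.
func exampleShardIndex() {
	// With 4 shards, the two low bits of the first key byte pick the shard.
	for _, key := range [][]byte{{0x00}, {0x01}, {0xAB}, {0xFE}} {
		if idx, err := shardIndexByKey(key, 4); err == nil {
			// 0x00 -> shard 0, 0x01 -> shard 1, 0xAB -> shard 3, 0xFE -> shard 2
			fmt.Printf("key %#x -> shard %d\n", key, idx)
		}
	}
}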
// getShardByKey returns the shard corresponding to the given key.
func (db *shardedDB) getShardByKey(key []byte) (Database, error) {
	if shardIndex, err := shardIndexByKey(key, db.numShards); err != nil {
		return nil, err
	} else {
		return db.shards[shardIndex], nil
	}
}

func (db *shardedDB) Put(key []byte, value []byte) error {
	if shard, err := db.getShardByKey(key); err != nil {
		return err
	} else {
		return shard.Put(key, value)
	}
}

func (db *shardedDB) Get(key []byte) ([]byte, error) {
	if shard, err := db.getShardByKey(key); err != nil {
		return nil, err
	} else {
		return shard.Get(key)
	}
}

func (db *shardedDB) Has(key []byte) (bool, error) {
	if shard, err := db.getShardByKey(key); err != nil {
		return false, err
	} else {
		return shard.Has(key)
	}
}

func (db *shardedDB) Delete(key []byte) error {
	if shard, err := db.getShardByKey(key); err != nil {
		return err
	} else {
		return shard.Delete(key)
	}
}

func (db *shardedDB) Close() {
	close(db.sdbBatchTaskCh)

	for _, shard := range db.shards {
		shard.Close()
	}
}

// Undersized channels slow down the iterator.
const (
	shardedDBCombineChanSize = 1024 // Size of resultCh
	shardedDBSubChannelSize  = 128  // Size of each sub-channel of resultChs
)

// shardedDBIterator iterates over all items of each shard DB.
// This is useful when you want to get items serially in binary-alphabetical order.
type shardedDBIterator struct {
	parallelIterator shardedDBParallelIterator

	resultCh chan common.Entry
	key      []byte // current key
	value    []byte // current value
}

// NewIterator creates a binary-alphabetical iterator over a subset
// of database content with a particular key prefix, starting at a particular
// initial key (or after, if it does not exist).
func (db *shardedDB) NewIterator(prefix []byte, start []byte) Iterator {
	it := &shardedDBIterator{
		parallelIterator: db.NewParallelIterator(context.TODO(), prefix, start, nil),
		resultCh:         make(chan common.Entry, shardedDBCombineChanSize),
	}

	go it.runCombineWorker()

	return it
}

// NewIteratorUnsorted creates an iterator over the entire keyspace contained within
// the key-value database. This is useful when you want to get items quickly in serial.
// If you want ordered items in serial, see shardedDB.NewIterator().
// If you want to get items in parallel from channels, see shardedDB.NewParallelIterator().
// The unsorted iterator is an implementation of Iterator, so data is accessed with the
// Next(), Key() and Value() methods. With ChanIterator, data can instead be accessed
// through channels, obtained with the Channels() method.
func (db *shardedDB) NewIteratorUnsorted(prefix []byte, start []byte) Iterator {
	resultCh := make(chan common.Entry, shardedDBCombineChanSize)
	return &shardedDBIterator{
		parallelIterator: db.NewParallelIterator(context.TODO(), prefix, start, resultCh),
		resultCh:         resultCh,
	}
}
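// exampleIterate is an illustrative sketch (the function is hypothetical). It
// contrasts the two serial iterators: NewIterator merges the per-shard streams
// into one globally ordered stream, while NewIteratorUnsorted drains shards as
// entries arrive, trading order for speed. The db value is assumed to be an
// already-constructed *shardedDB.
func exampleIterate(db *shardedDB) {
	it := db.NewIterator(nil, nil) // sorted; swap in NewIteratorUnsorted for speed
	defer it.Release()
	for it.Next() {
		// Entries are delivered as copies made by the shard workers.
		fmt.Printf("%x: %x\n", it.Key(), it.Value())
	}
}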
// runCombineWorker fetches key/value entries from resultChs and puts them into
// resultCh in binary-alphabetical order.
func (it *shardedDBIterator) runCombineWorker() {
	// create a min-priority queue holding the smallest entry from each iterator
	entries := &entryHeap{}
	heap.Init(entries)
	for i, ch := range it.parallelIterator.resultChs {
		if e, ok := <-ch; ok {
			heap.Push(entries, entryWithShardNum{e, i})
		}
	}

chanIter:
	for len(*entries) != 0 {
		// check if done
		select {
		case <-it.parallelIterator.ctx.Done():
			logger.Trace("[shardedDBIterator] combine worker ends due to ctx")
			break chanIter
		default:
		}

		// look for the smallest key
		minEntry := heap.Pop(entries).(entryWithShardNum)

		// fill resultCh with the smallest key
		it.resultCh <- minEntry.Entry

		// replace the consumed entry with a new one from the same shard;
		// skip this if the channel is closed
		if e, ok := <-it.parallelIterator.resultChs[minEntry.shardNum]; ok {
			heap.Push(entries, entryWithShardNum{e, minEntry.shardNum})
		}
	}
	logger.Trace("[shardedDBIterator] combine worker finished")
	close(it.resultCh)
}

// Next gets the next item from the iterators.
func (it *shardedDBIterator) Next() bool {
	e, ok := <-it.resultCh
	if !ok {
		logger.Debug("[shardedDBIterator] Next is called on closed channel")
		return false
	}
	it.key, it.value = e.Key, e.Val
	return true
}

func (it *shardedDBIterator) Error() error {
	for i, iter := range it.parallelIterator.iterators {
		if iter.Error() != nil {
			logger.Error("[shardedDBIterator] error from iterator",
				"err", iter.Error(), "shardNum", i, "key", it.key, "val", it.value)
			return iter.Error()
		}
	}
	return nil
}

func (it *shardedDBIterator) Key() []byte {
	return it.key
}

func (it *shardedDBIterator) Value() []byte {
	return it.value
}

func (it *shardedDBIterator) Release() {
	it.parallelIterator.cancel()
}

type entryWithShardNum struct {
	common.Entry
	shardNum int
}

type entryHeap []entryWithShardNum

func (e entryHeap) Len() int {
	return len(e)
}

func (e entryHeap) Less(i, j int) bool {
	return bytes.Compare(e[i].Key, e[j].Key) < 0
}

func (e entryHeap) Swap(i, j int) {
	e[i], e[j] = e[j], e[i]
}

func (e *entryHeap) Push(x interface{}) {
	*e = append(*e, x.(entryWithShardNum))
}

func (e *entryHeap) Pop() interface{} {
	old := *e
	n := len(old)
	element := old[n-1]
	*e = old[0 : n-1]
	return element
}
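// exampleHeapOrder is an illustrative sketch (the function and its sample keys
// are hypothetical). It demonstrates the k-way merge invariant runCombineWorker
// relies on: the heap always surfaces the entry with the smallest key across
// all shards, so popping repeatedly yields a globally sorted stream.
func exampleHeapOrder() {
	entries := &entryHeap{}
	heap.Init(entries)
	heap.Push(entries, entryWithShardNum{common.Entry{Key: []byte{0x02}}, 0})
	heap.Push(entries, entryWithShardNum{common.Entry{Key: []byte{0x01}}, 1})
	heap.Push(entries, entryWithShardNum{common.Entry{Key: []byte{0x03}}, 2})
	for entries.Len() > 0 {
		e := heap.Pop(entries).(entryWithShardNum)
		fmt.Printf("shard %d key %x\n", e.shardNum, e.Key) // keys come out 01, 02, 03
	}
}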
// shardedDBParallelIterator creates iterators for each shard DB.
// Channels subscribing to each iterator can be obtained with Channels().
// Each iterator fetches values in binary-alphabetical order.
// This is useful when you want to operate on each item in parallel.
type shardedDBParallelIterator struct {
	ctx    context.Context
	cancel context.CancelFunc

	iterators []Iterator

	combinedChan bool // all workers put items into one result channel
	shardNum     int  // number of shards left to iterate
	shardNumMu   *sync.Mutex
	resultChs    []chan common.Entry
}

// NewParallelIterator creates iterators for each shard DB. This is useful when
// you want to operate on each item in parallel.
// If `resultCh` is given, all items with a particular key prefix, starting at a
// particular initial key, are written to `resultCh`, unsorted. If `resultCh`
// is not given, a new channel is created for each DB, and items are written to
// the corresponding channel in binary-alphabetical order. The channels can be
// obtained by calling `Channels()`.
//
// If you want ordered items in serial, see shardedDB.NewIterator().
// If you want unordered items in serial with the Iterator interface,
// see shardedDB.NewIteratorUnsorted().
func (db *shardedDB) NewParallelIterator(ctx context.Context, prefix []byte, start []byte, resultCh chan common.Entry) shardedDBParallelIterator {
	if ctx == nil {
		ctx = context.TODO()
	}

	it := shardedDBParallelIterator{
		ctx:          ctx,
		cancel:       nil,
		iterators:    make([]Iterator, len(db.shards)),
		combinedChan: resultCh != nil,
		shardNum:     len(db.shards),
		shardNumMu:   &sync.Mutex{},
		resultChs:    make([]chan common.Entry, len(db.shards)),
	}
	it.ctx, it.cancel = context.WithCancel(ctx)

	for i, shard := range db.shards {
		it.iterators[i] = shard.NewIterator(prefix, start)
		if resultCh == nil {
			it.resultChs[i] = make(chan common.Entry, shardedDBSubChannelSize)
		} else {
			it.resultChs[i] = resultCh
		}
		go it.runChanWorker(it.ctx, it.iterators[i], it.resultChs[i])
	}

	return it
}

// runChanWorker runs a worker that reads key/value pairs from `it` and pushes
// them to `resultCh`.
// `iterator.Release()` is called after all iterating is finished.
// `resultCh` is closed after the iterating is finished.
func (sit *shardedDBParallelIterator) runChanWorker(ctx context.Context, it Iterator, resultCh chan common.Entry) {
iter:
	for it.Next() {
		select {
		case <-ctx.Done():
			break iter
		default:
		}
		key := make([]byte, len(it.Key()))
		val := make([]byte, len(it.Value()))
		copy(key, it.Key())
		copy(val, it.Value())
		resultCh <- common.Entry{Key: key, Val: val}
	}
	// Release the iterator. There is nothing to iterate anymore.
	it.Release()
	// Close `resultCh`. If it is a `combinedChan`, close it only
	// when this is the last living worker.
	sit.shardNumMu.Lock()
	defer sit.shardNumMu.Unlock()
	if sit.shardNum--; sit.combinedChan && sit.shardNum > 0 {
		return
	}
	close(resultCh)
}

// Channels returns the channels that consumers can subscribe to.
func (it *shardedDBParallelIterator) Channels() []chan common.Entry {
	return it.resultChs
}
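// exampleParallelConsume is an illustrative sketch (the function is
// hypothetical). It shows the intended use of NewParallelIterator without a
// combined channel: one goroutine per shard channel, each draining entries
// until its runChanWorker closes the channel. The db value is assumed to be
// an already-constructed *shardedDB.
func exampleParallelConsume(db *shardedDB) {
	it := db.NewParallelIterator(context.Background(), nil, nil, nil)
	var wg sync.WaitGroup
	for i, ch := range it.Channels() {
		wg.Add(1)
		go func(shard int, ch chan common.Entry) {
			defer wg.Done()
			for e := range ch { // each channel is closed by its worker
				fmt.Printf("shard %d: %x\n", shard, e.Key)
			}
		}(i, ch)
	}
	wg.Wait()
	it.Release()
}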
// Release stops all iterators, channels and workers.
// Even after Release() is called, there can still be items left in the channels.
// Each iterator's Release() is called in `runChanWorker`.
func (it *shardedDBParallelIterator) Release() {
	it.cancel()
}

func (db *shardedDB) NewBatch() Batch {
	batches := make([]Batch, 0, db.numShards)
	for i := 0; i < int(db.numShards); i++ {
		batches = append(batches, db.shards[i].NewBatch())
	}

	return &shardedDBBatch{
		batches: batches, numBatches: db.numShards,
		taskCh: db.sdbBatchTaskCh, resultCh: make(chan sdbBatchResult, db.numShards),
	}
}

func (db *shardedDB) Type() DBType {
	return ShardedDB
}

func (db *shardedDB) Meter(prefix string) {
	for index, shard := range db.shards {
		shard.Meter(prefix + strconv.Itoa(index))
	}
}

func (db *shardedDB) TryCatchUpWithPrimary() error {
	for _, shard := range db.shards {
		if err := shard.TryCatchUpWithPrimary(); err != nil {
			return err
		}
	}
	return nil
}

type shardedDBBatch struct {
	batches    []Batch
	numBatches uint

	taskCh   chan sdbBatchTask
	resultCh chan sdbBatchResult
}

func (sdbBatch *shardedDBBatch) Put(key []byte, value []byte) error {
	if shardIndex, err := shardIndexByKey(key, sdbBatch.numBatches); err != nil {
		return err
	} else {
		return sdbBatch.batches[shardIndex].Put(key, value)
	}
}

func (sdbBatch *shardedDBBatch) Delete(key []byte) error {
	if shardIndex, err := shardIndexByKey(key, sdbBatch.numBatches); err != nil {
		return err
	} else {
		return sdbBatch.batches[shardIndex].Delete(key)
	}
}

// ValueSize is used to decide when to flush a batch that has exceeded a
// certain size limit. shardedDB returns the largest size among its sub-batches
// so that all batches are written at once when any one of them exceeds the limit.
func (sdbBatch *shardedDBBatch) ValueSize() int {
	maxSize := 0
	for _, batch := range sdbBatch.batches {
		if batch.ValueSize() > maxSize {
			maxSize = batch.ValueSize()
		}
	}
	return maxSize
}
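// exampleBatchFlush is an illustrative sketch (the function and the
// flushThreshold value are hypothetical). It shows why ValueSize reports the
// largest sub-batch: callers typically flush once any shard's batch grows past
// a threshold, writing all shards at once and then resetting.
func exampleBatchFlush(db *shardedDB, kvs []common.Entry) error {
	const flushThreshold = 1024 * 1024 // hypothetical 1 MiB flush point
	batch := db.NewBatch()
	defer batch.Release()
	for _, kv := range kvs {
		if err := batch.Put(kv.Key, kv.Val); err != nil {
			return err
		}
		// ValueSize is the largest sub-batch, so this flushes as soon as
		// any single shard's batch crosses the threshold.
		if batch.ValueSize() >= flushThreshold {
			if err := batch.Write(); err != nil {
				return err
			}
			batch.Reset()
		}
	}
	return batch.Write()
}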
// Write passes the list of batch tasks to taskCh so the batches can be
// processed by the underlying workers. Write waits until all workers have
// returned their results.
func (sdbBatch *shardedDBBatch) Write() error {
	for index, batch := range sdbBatch.batches {
		sdbBatch.taskCh <- sdbBatchTask{batch, index, sdbBatch.resultCh}
	}

	var err error
	for range sdbBatch.batches {
		if batchResult := <-sdbBatch.resultCh; batchResult.err != nil {
			logger.Error("Error while writing sharded batch", "index", batchResult.index, "err", batchResult.err)
			err = batchResult.err
		}
	}
	// Leave a log for each error but only return the last one.
	return err
}

func (sdbBatch *shardedDBBatch) Reset() {
	for _, batch := range sdbBatch.batches {
		batch.Reset()
	}
}

func (sdbBatch *shardedDBBatch) Release() {
	for _, batch := range sdbBatch.batches {
		batch.Release()
	}
}

func (sdbBatch *shardedDBBatch) Replay(w KeyValueWriter) error {
	for _, batch := range sdbBatch.batches {
		if err := batch.Replay(w); err != nil {
			return err
		}
	}
	return nil
}

func (db *shardedDB) Stat(property string) (string, error) {
	stats := ""
	errs := ""
	for idx, shard := range db.shards {
		stat, err := shard.Stat(property)
		if err == nil {
			headInfo := fmt.Sprintf(" [shard%d:%s]\n", idx, shard.Type())
			stats += headInfo + stat
		} else {
			errs += fmt.Sprintf("shard[%d]: %s", idx, err.Error())
		}
	}
	if errs == "" {
		return stats, nil
	}
	return stats, errors.New(errs)
}

func (db *shardedDB) Compact(start []byte, limit []byte) error {
	errs := ""
	for idx, shard := range db.shards {
		if err := shard.Compact(start, limit); err != nil {
			errs += fmt.Sprintf("shard[%d]: %s", idx, err.Error())
		}
	}
	if errs == "" {
		return nil
	}
	return errors.New(errs)
}
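// exampleCompactAll is an illustrative sketch (the function is hypothetical).
// Compact fans the request out to every shard and collects per-shard failures
// into one error; nil bounds are assumed here to follow the usual key-value
// store convention of compacting the entire key range.
func exampleCompactAll(db *shardedDB) {
	if err := db.Compact(nil, nil); err != nil {
		logger.Error("sharded compaction failed", "err", err)
	}
}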