github.com/klaytn/klaytn@v1.10.2/storage/database/sharded_database.go (about) 1 // Copyright 2019 The klaytn Authors 2 // This file is part of the klaytn library. 3 // 4 // The klaytn library is free software: you can redistribute it and/or modify 5 // it under the terms of the GNU Lesser General Public License as published by 6 // the Free Software Foundation, either version 3 of the License, or 7 // (at your option) any later version. 8 // 9 // The klaytn library is distributed in the hope that it will be useful, 10 // but WITHOUT ANY WARRANTY; without even the implied warranty of 11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 // GNU Lesser General Public License for more details. 13 // 14 // You should have received a copy of the GNU Lesser General Public License 15 // along with the klaytn library. If not, see <http://www.gnu.org/licenses/>. 16 17 package database 18 19 import ( 20 "bytes" 21 "container/heap" 22 "context" 23 "fmt" 24 "path" 25 "strconv" 26 "sync" 27 28 "github.com/klaytn/klaytn/common" 29 ) 30 31 var errKeyLengthZero = fmt.Errorf("database key for sharded database should be greater than 0") 32 33 const numShardsLimit = 256 34 35 type shardedDB struct { 36 fn string 37 shards []Database 38 numShards uint 39 40 sdbBatchTaskCh chan sdbBatchTask 41 } 42 43 type sdbBatchTask struct { 44 batch Batch // A batch that each worker executes. 45 index int // Index of given batch. 46 resultCh chan sdbBatchResult // Batch result channel for each shardedDBBatch. 47 } 48 49 type sdbBatchResult struct { 50 index int // Index of the batch result. 51 err error // Error from the batch write operation. 52 } 53 54 // newShardedDB creates database with numShards shards, or partitions. 55 // The type of database is specified DBConfig.DBType. 56 func newShardedDB(dbc *DBConfig, et DBEntryType, numShards uint) (*shardedDB, error) { 57 if numShards == 0 { 58 logger.Crit("numShards should be greater than 0!") 59 } 60 61 if numShards > numShardsLimit { 62 logger.Crit(fmt.Sprintf("numShards should be equal to or smaller than %v, but it is %v.", numShardsLimit, numShards)) 63 } 64 65 if !IsPow2(numShards) { 66 logger.Crit(fmt.Sprintf("numShards should be power of two, but it is %v", numShards)) 67 } 68 69 shards := make([]Database, 0, numShards) 70 sdbBatchTaskCh := make(chan sdbBatchTask, numShards*2) 71 sdbLevelDBCacheSize := dbc.LevelDBCacheSize / int(numShards) 72 sdbOpenFilesLimit := dbc.OpenFilesLimit / int(numShards) 73 for i := 0; i < int(numShards); i++ { 74 copiedDBC := *dbc 75 copiedDBC.Dir = path.Join(copiedDBC.Dir, strconv.Itoa(i)) 76 copiedDBC.LevelDBCacheSize = sdbLevelDBCacheSize 77 copiedDBC.OpenFilesLimit = sdbOpenFilesLimit 78 79 db, err := newDatabase(&copiedDBC, et) 80 if err != nil { 81 return nil, err 82 } 83 shards = append(shards, db) 84 go batchWriteWorker(sdbBatchTaskCh) 85 } 86 87 logger.Info("Created a sharded database", "dbType", et, "numShards", numShards) 88 return &shardedDB{ 89 fn: dbc.Dir, shards: shards, 90 numShards: numShards, sdbBatchTaskCh: sdbBatchTaskCh, 91 }, nil 92 } 93 94 // batchWriteWorker executes passed batch tasks. 95 func batchWriteWorker(batchTasks <-chan sdbBatchTask) { 96 for task := range batchTasks { 97 task.resultCh <- sdbBatchResult{task.index, task.batch.Write()} 98 } 99 } 100 101 // IsPow2 checks if the given number is power of two or not. 102 func IsPow2(num uint) bool { 103 return (num & (num - 1)) == 0 104 } 105 106 // shardIndexByKey returns shard index derived from the given key. 107 // If len(key) is zero, it returns errKeyLengthZero. 108 func shardIndexByKey(key []byte, numShards uint) (int, error) { 109 if len(key) == 0 { 110 return 0, errKeyLengthZero 111 } 112 113 return int(key[0]) & (int(numShards) - 1), nil 114 } 115 116 // getShardByKey returns the shard corresponding to the given key. 117 func (db *shardedDB) getShardByKey(key []byte) (Database, error) { 118 if shardIndex, err := shardIndexByKey(key, uint(db.numShards)); err != nil { 119 return nil, err 120 } else { 121 return db.shards[shardIndex], nil 122 } 123 } 124 125 func (db *shardedDB) Put(key []byte, value []byte) error { 126 if shard, err := db.getShardByKey(key); err != nil { 127 return err 128 } else { 129 return shard.Put(key, value) 130 } 131 } 132 133 func (db *shardedDB) Get(key []byte) ([]byte, error) { 134 if shard, err := db.getShardByKey(key); err != nil { 135 return nil, err 136 } else { 137 return shard.Get(key) 138 } 139 } 140 141 func (db *shardedDB) Has(key []byte) (bool, error) { 142 if shard, err := db.getShardByKey(key); err != nil { 143 return false, err 144 } else { 145 return shard.Has(key) 146 } 147 } 148 149 func (db *shardedDB) Delete(key []byte) error { 150 if shard, err := db.getShardByKey(key); err != nil { 151 return err 152 } else { 153 return shard.Delete(key) 154 } 155 } 156 157 func (db *shardedDB) Close() { 158 close(db.sdbBatchTaskCh) 159 160 for _, shard := range db.shards { 161 shard.Close() 162 } 163 } 164 165 // Not enough size of channel slows down the iterator 166 const shardedDBCombineChanSize = 1024 // Size of resultCh 167 const shardedDBSubChannelSize = 128 // Size of each sub-channel of resultChs 168 169 // shardedDBIterator iterates all items of each shardDB. 170 // This is useful when you want to get items in serial in binary-alphabetigcal order. 171 type shardedDBIterator struct { 172 parallelIterator shardedDBParallelIterator 173 174 resultCh chan common.Entry 175 key []byte // current key 176 value []byte // current value 177 } 178 179 // NewIterator creates a binary-alphabetical iterator over a subset 180 // of database content with a particular key prefix, starting at a particular 181 // initial key (or after, if it does not exist). 182 func (db *shardedDB) NewIterator(prefix []byte, start []byte) Iterator { 183 it := &shardedDBIterator{ 184 parallelIterator: db.NewParallelIterator(context.TODO(), prefix, start, nil), 185 resultCh: make(chan common.Entry, shardedDBCombineChanSize), 186 } 187 188 go it.runCombineWorker() 189 190 return it 191 } 192 193 // NewIteratorUnsorted creates a iterator over the entire keyspace contained within 194 // the key-value database. This is useful when you want to get items fast in serial. 195 // If you want to get ordered items in serial, checkout shardedDB.NewIterator() 196 // If you want to get items in parallel from channels, checkout shardedDB.NewParallelIterator() 197 // IteratorUnsorted is a implementation of Iterator and data are accessed with 198 // Next(), Key() and Value() methods. With ChanIterator, data can be accessed with 199 // channels. The channels are gained with Channels() method. 200 func (db *shardedDB) NewIteratorUnsorted(prefix []byte, start []byte) Iterator { 201 resultCh := make(chan common.Entry, shardedDBCombineChanSize) 202 return &shardedDBIterator{ 203 parallelIterator: db.NewParallelIterator(context.TODO(), prefix, start, resultCh), 204 resultCh: resultCh, 205 } 206 } 207 208 // runCombineWorker fetches any key/value from resultChs and put the data in resultCh 209 // in binary-alphabetical order. 210 func (it *shardedDBIterator) runCombineWorker() { 211 // creates min-priority queue smallest values from each iterators 212 entries := &entryHeap{} 213 heap.Init(entries) 214 for i, ch := range it.parallelIterator.resultChs { 215 if e, ok := <-ch; ok { 216 heap.Push(entries, entryWithShardNum{e, i}) 217 } 218 } 219 220 chanIter: 221 for len(*entries) != 0 { 222 // check if done 223 select { 224 case <-it.parallelIterator.ctx.Done(): 225 logger.Trace("[shardedDBIterator] combine worker ends due to ctx") 226 break chanIter 227 default: 228 } 229 230 // look for smallest key 231 minEntry := heap.Pop(entries).(entryWithShardNum) 232 233 // fill resultCh with smallest key 234 it.resultCh <- minEntry.Entry 235 236 // fill used entry with new entry 237 // skip this if channel is closed 238 if e, ok := <-it.parallelIterator.resultChs[minEntry.shardNum]; ok { 239 heap.Push(entries, entryWithShardNum{e, minEntry.shardNum}) 240 } 241 } 242 logger.Trace("[shardedDBIterator] combine worker finished") 243 close(it.resultCh) 244 } 245 246 // Next gets the next item from iterators. 247 func (it *shardedDBIterator) Next() bool { 248 e, ok := <-it.resultCh 249 if !ok { 250 logger.Debug("[shardedDBIterator] Next is called on closed channel") 251 return false 252 } 253 it.key, it.value = e.Key, e.Val 254 return true 255 } 256 257 func (it *shardedDBIterator) Error() error { 258 for i, iter := range it.parallelIterator.iterators { 259 if iter.Error() != nil { 260 logger.Error("[shardedDBIterator] error from iterator", 261 "err", iter.Error(), "shardNum", i, "key", it.key, "val", it.value) 262 return iter.Error() 263 } 264 } 265 return nil 266 } 267 268 func (it *shardedDBIterator) Key() []byte { 269 return it.key 270 } 271 272 func (it *shardedDBIterator) Value() []byte { 273 return it.value 274 } 275 276 func (it *shardedDBIterator) Release() { 277 it.parallelIterator.cancel() 278 } 279 280 type entryWithShardNum struct { 281 common.Entry 282 shardNum int 283 } 284 285 type entryHeap []entryWithShardNum 286 287 func (e entryHeap) Len() int { 288 return len(e) 289 } 290 291 func (e entryHeap) Less(i, j int) bool { 292 return bytes.Compare(e[i].Key, e[j].Key) < 0 293 } 294 295 func (e entryHeap) Swap(i, j int) { 296 e[i], e[j] = e[j], e[i] 297 } 298 299 func (e *entryHeap) Push(x interface{}) { 300 *e = append(*e, x.(entryWithShardNum)) 301 } 302 303 func (e *entryHeap) Pop() interface{} { 304 old := *e 305 n := len(old) 306 element := old[n-1] 307 *e = old[0 : n-1] 308 return element 309 } 310 311 // shardedDBParallelIterator creates iterators for each shard DB. 312 // Channels subscribing each iterators can be gained. 313 // Each iterators fetch values in binary-alphabetical order. 314 // This is useful when you want to operate on each items in parallel. 315 type shardedDBParallelIterator struct { 316 ctx context.Context 317 cancel context.CancelFunc 318 319 iterators []Iterator 320 321 combinedChan bool // all workers put items to one resultChan 322 shardNum int // num of shards left to iterate 323 shardNumMu *sync.Mutex 324 resultChs []chan common.Entry 325 } 326 327 // NewParallelIterator creates iterators for each shard DB. This is useful when you 328 // want to operate on each items in parallel. 329 // If `resultCh` is given, all items are written to `resultCh`, unsorted with a 330 // particular key prefix, starting at a particular initial key. If `resultCh` 331 // is not given, new channels are created for each DB. Items are written to 332 // corresponding channels in binary-alphabetical order. The channels can be 333 // gained by calling `Channels()`. 334 // 335 // If you want to get ordered items in serial, checkout shardedDB.NewIterator() 336 // If you want to get unordered items in serial with Iterator Interface, 337 // checkout shardedDB.NewIteratorUnsorted(). 338 func (db *shardedDB) NewParallelIterator(ctx context.Context, prefix []byte, start []byte, resultCh chan common.Entry) shardedDBParallelIterator { 339 if ctx == nil { 340 ctx = context.TODO() 341 } 342 343 it := shardedDBParallelIterator{ 344 ctx: ctx, 345 cancel: nil, 346 iterators: make([]Iterator, len(db.shards)), 347 combinedChan: resultCh != nil, 348 shardNum: len(db.shards), 349 shardNumMu: &sync.Mutex{}, 350 resultChs: make([]chan common.Entry, len(db.shards)), 351 } 352 it.ctx, it.cancel = context.WithCancel(ctx) 353 354 for i, shard := range db.shards { 355 it.iterators[i] = shard.NewIterator(prefix, start) 356 if resultCh == nil { 357 it.resultChs[i] = make(chan common.Entry, shardedDBSubChannelSize) 358 } else { 359 it.resultChs[i] = resultCh 360 } 361 go it.runChanWorker(it.ctx, it.iterators[i], it.resultChs[i]) 362 } 363 364 return it 365 } 366 367 // runChanWorker runs a worker. The worker gets key/value pair from 368 // `it` and push the value to `resultCh`. 369 // `iterator.Release()` is called after all iterating is finished. 370 // `resultCh` is closed after the iterating is finished. 371 func (sit *shardedDBParallelIterator) runChanWorker(ctx context.Context, it Iterator, resultCh chan common.Entry) { 372 iter: 373 for it.Next() { 374 select { 375 case <-ctx.Done(): 376 break iter 377 default: 378 } 379 key := make([]byte, len(it.Key())) 380 val := make([]byte, len(it.Value())) 381 copy(key, it.Key()) 382 copy(val, it.Value()) 383 resultCh <- common.Entry{Key: key, Val: val} 384 } 385 // Release the iterator. There is nothing to iterate anymore. 386 it.Release() 387 // Close `resultCh`. If it is `combinedChan`, the close only happens 388 // when this is the last living worker. 389 sit.shardNumMu.Lock() 390 defer sit.shardNumMu.Unlock() 391 if sit.shardNum--; sit.combinedChan && sit.shardNum > 0 { 392 return 393 } 394 close(resultCh) 395 } 396 397 // Channels returns channels that can subscribe on. 398 func (it *shardedDBParallelIterator) Channels() []chan common.Entry { 399 return it.resultChs 400 } 401 402 // Release stops all iterators, channels and workers 403 // Even Release() is called, there could be some items left in the channel. 404 // Each iterator.Release() is called in `runChanWorker`. 405 func (it *shardedDBParallelIterator) Release() { 406 it.cancel() 407 } 408 409 func (db *shardedDB) NewBatch() Batch { 410 batches := make([]Batch, 0, db.numShards) 411 for i := 0; i < int(db.numShards); i++ { 412 batches = append(batches, db.shards[i].NewBatch()) 413 } 414 415 return &shardedDBBatch{ 416 batches: batches, numBatches: db.numShards, 417 taskCh: db.sdbBatchTaskCh, resultCh: make(chan sdbBatchResult, db.numShards), 418 } 419 } 420 421 func (db *shardedDB) Type() DBType { 422 return ShardedDB 423 } 424 425 func (db *shardedDB) Meter(prefix string) { 426 for index, shard := range db.shards { 427 shard.Meter(prefix + strconv.Itoa(index)) 428 } 429 } 430 431 type shardedDBBatch struct { 432 batches []Batch 433 numBatches uint 434 435 taskCh chan sdbBatchTask 436 resultCh chan sdbBatchResult 437 } 438 439 func (sdbBatch *shardedDBBatch) Put(key []byte, value []byte) error { 440 if ShardIndex, err := shardIndexByKey(key, sdbBatch.numBatches); err != nil { 441 return err 442 } else { 443 return sdbBatch.batches[ShardIndex].Put(key, value) 444 } 445 } 446 447 func (sdbBatch *shardedDBBatch) Delete(key []byte) error { 448 if ShardIndex, err := shardIndexByKey(key, sdbBatch.numBatches); err != nil { 449 return err 450 } else { 451 return sdbBatch.batches[ShardIndex].Delete(key) 452 } 453 } 454 455 // ValueSize is called to determine whether to write batches when it exceeds 456 // certain limit. shardedDB returns the largest size of its batches to 457 // write all batches at once when one of batch exceeds the limit. 458 func (sdbBatch *shardedDBBatch) ValueSize() int { 459 maxSize := 0 460 for _, batch := range sdbBatch.batches { 461 if batch.ValueSize() > maxSize { 462 maxSize = batch.ValueSize() 463 } 464 } 465 return maxSize 466 } 467 468 // Write passes the list of batch tasks to taskCh so batch can be processed 469 // by underlying workers. Write waits until all workers return the result. 470 func (sdbBatch *shardedDBBatch) Write() error { 471 for index, batch := range sdbBatch.batches { 472 sdbBatch.taskCh <- sdbBatchTask{batch, index, sdbBatch.resultCh} 473 } 474 475 var err error 476 for range sdbBatch.batches { 477 if batchResult := <-sdbBatch.resultCh; batchResult.err != nil { 478 logger.Error("Error while writing sharded batch", "index", batchResult.index, "err", batchResult.err) 479 err = batchResult.err 480 } 481 } 482 // Leave logs for each error but only return the last one. 483 return err 484 } 485 486 func (sdbBatch *shardedDBBatch) Reset() { 487 for _, batch := range sdbBatch.batches { 488 batch.Reset() 489 } 490 } 491 492 func (sdbBatch *shardedDBBatch) Replay(w KeyValueWriter) error { 493 for _, batch := range sdbBatch.batches { 494 if err := batch.Replay(w); err != nil { 495 return err 496 } 497 } 498 return nil 499 }