
     1  // Copyright 2018 The go-ethereum Authors
     2  // This file is part of the go-ethereum library.
     3  //
     4  // The go-ethereum library is free software: you can redistribute it and/or modify
     5  // it under the terms of the GNU Lesser General Public License as published by
     6  // the Free Software Foundation, either version 3 of the License, or
     7  // (at your option) any later version.
     8  //
     9  // The go-ethereum library is distributed in the hope that it will be useful,
    10  // but WITHOUT ANY WARRANTY; without even the implied warranty of
    12  // GNU Lesser General Public License for more details.
    13  //
    14  // You should have received a copy of the GNU Lesser General Public License
    15  // along with the go-ethereum library. If not, see <>.
    17  /*
    18  Counting number of items in garbage collection index
    20  The number of items in garbage collection index is not the same as the number of
    21  chunks in retrieval index (total number of stored chunks). Chunk can be garbage
    22  collected only when it is set to a synced state by ModSetSync, and only then can
    23  be counted into garbage collection size, which determines whether a number of
chunks should be removed from the storage by the garbage collection. This opens a
    25  possibility that the storage size exceeds the limit if files are locally
    26  uploaded and the node is not connected to other nodes or there is a problem with
    27  syncing.
    29  Tracking of garbage collection size (gcSize) is focused on performance. Key
    30  points:
    32   1. counting the number of key/value pairs in LevelDB takes around 0.7s for 1e6
    on a very fast ssd (unacceptably long time in reality)
    34   2. locking leveldb batch writes with a global mutex (serial batch writes) is
    35      not acceptable, we should use locking per chunk address
    37  Because of point 1. we cannot count the number of items in garbage collection
    38  index in New constructor as it could last very long for realistic scenarios
    39  where limit is 5e6 and nodes are running on slower hdd disks or cloud providers
    40  with low IOPS.
    42  Point 2. is a performance optimization to allow parallel batch writes with
getters, putters and setters. Every single batch that they create contains only
    44  information related to a single chunk, no relations with other chunks or shared
    45  statistical data (like gcSize). This approach avoids race conditions on writing
    46  batches in parallel, but creates a problem of synchronizing statistical data
    47  values like gcSize. With global mutex lock, any data could be written by any
batch, but would not utilize the full potential of leveldb parallel writes.
To mitigate these two problems, the implementation of counting and persisting
    51  gcSize is split into two parts. One is the in-memory value (gcSize) that is fast
    52  to read and write with a dedicated mutex (gcSizeMu) if the batch which adds or
    53  removes items from garbage collection index is successful. The second part is
    54  the reliable persistence of this value to leveldb database, as storedGCSize
    55  field. This database field is saved by writeGCSizeWorker and writeGCSize
functions when in-memory gcSize variable is changed, but not too often to avoid
very frequent database writes. These database writes are triggered by
writeGCSizeTrigger when a call is made to function incGCSize. The trigger ensures
that database writes are done only when gcSize is changed (contrary to
simpler periodic writes or checks). A backoff of 10s in writeGCSizeWorker
    61  ensures that no frequent batch writes are made. Saving the storedGCSize on
    62  database Close function ensures that in-memory gcSize is persisted when database
    63  is closed.
    65  This persistence must be resilient to failures like panics. For this purpose, a
    66  collection of hashes that are added to the garbage collection index, but still
    67  not persisted to storedGCSize, must be tracked to count them in when DB is
    68  constructed again with New function after the failure (swarm node restarts). On
    69  every batch write that adds a new item to garbage collection index, the same
    70  hash is added to gcUncountedHashesIndex. This ensures that there is a persisted
    71  information which hashes were added to the garbage collection index. But, when
the storedGCSize is saved by writeGCSize function, these values are removed in
the same batch in which storedGCSize is changed to ensure consistency. When a
panic happens, or the database Close method is not called, the database storage
    75  contains all information to reliably and efficiently get the correct number of
    76  items in garbage collection index. This is performed in the New function when
    77  all hashes in gcUncountedHashesIndex are counted, added to the storedGCSize and
    78  saved to the disk before the database is constructed again. Index
    79  gcUncountedHashesIndex is acting as dirty bit for recovery that provides
    80  information what needs to be corrected. With a simple dirty bit, the whole
garbage collection index would have to be counted on recovery instead of only the items in
gcUncountedHashesIndex. Because of the triggering mechanism of writeGCSizeWorker
    83  and relatively short backoff time, the number of hashes in
    84  gcUncountedHashesIndex should be low and it should take a very short time to
    85  recover from the previous failure. If there was no failure and
    86  gcUncountedHashesIndex is empty, which is the usual case, New function will take
    87  the minimal time to return.
    88  */
    90  package localstore
    92  import (
    93  	"time"
    95  	""
    96  	""
    97  	""
    98  )
   100  var (
   101  	// gcTargetRatio defines the target number of items
   102  	// in garbage collection index that will not be removed
   103  	// on garbage collection. The target number of items
   104  	// is calculated by gcTarget function. This value must be
   105  	// in range (0,1]. For example, with 0.9 value,
   106  	// garbage collection will leave 90% of defined capacity
   107  	// in database after its run. This prevents frequent
   108  	// garbage collection runs.
   109  	gcTargetRatio = 0.9
   110  	// gcBatchSize limits the number of chunks in a single
   111  	// leveldb batch on garbage collection.
   112  	gcBatchSize int64 = 1000
   113  )
   115  // collectGarbageWorker is a long running function that waits for
   116  // collectGarbageTrigger channel to signal a garbage collection
   117  // run. GC run iterates on gcIndex and removes older items
   118  // form retrieval and other indexes.
   119  func (db *DB) collectGarbageWorker() {
   120  	for {
   121  		select {
   122  		case <-db.collectGarbageTrigger:
   123  			// run a single collect garbage run and
   124  			// if done is false, gcBatchSize is reached and
   125  			// another collect garbage run is needed
   126  			collectedCount, done, err := db.collectGarbage()
   127  			if err != nil {
   128  				log.Error("localstore collect garbage", "err", err)
   129  			}
   130  			// check if another gc run is needed
   131  			if !done {
   132  				db.triggerGarbageCollection()
   133  			}
   135  			if testHookCollectGarbage != nil {
   136  				testHookCollectGarbage(collectedCount)
   137  			}
   138  		case <-db.close:
   139  			return
   140  		}
   141  	}
   142  }
   144  // collectGarbage removes chunks from retrieval and other
   145  // indexes if maximal number of chunks in database is reached.
   146  // This function returns the number of removed chunks. If done
   147  // is false, another call to this function is needed to collect
   148  // the rest of the garbage as the batch size limit is reached.
   149  // This function is called in collectGarbageWorker.
   150  func (db *DB) collectGarbage() (collectedCount int64, done bool, err error) {
   151  	batch := new(leveldb.Batch)
   152  	target := db.gcTarget()
   154  	done = true
   155  	err = db.gcIndex.Iterate(func(item shed.Item) (stop bool, err error) {
   156  		// protect parallel updates
   157  		unlock, err := db.lockAddr(item.Address)
   158  		if err != nil {
   159  			return false, err
   160  		}
   161  		defer unlock()
   163  		gcSize := db.getGCSize()
   164  		if gcSize-collectedCount <= target {
   165  			return true, nil
   166  		}
   167  		// delete from retrieve, pull, gc
   168  		db.retrievalDataIndex.DeleteInBatch(batch, item)
   169  		db.retrievalAccessIndex.DeleteInBatch(batch, item)
   170  		db.pullIndex.DeleteInBatch(batch, item)
   171  		db.gcIndex.DeleteInBatch(batch, item)
   172  		collectedCount++
   173  		if collectedCount >= gcBatchSize {
   174  			// bach size limit reached,
   175  			// another gc run is needed
   176  			done = false
   177  			return true, nil
   178  		}
   179  		return false, nil
   180  	}, nil)
   181  	if err != nil {
   182  		return 0, false, err
   183  	}
   185  	err = db.shed.WriteBatch(batch)
   186  	if err != nil {
   187  		return 0, false, err
   188  	}
   189  	// batch is written, decrement gcSize
   190  	db.incGCSize(-collectedCount)
   191  	return collectedCount, done, nil
   192  }
   194  // gcTrigger retruns the absolute value for garbage collection
   195  // target value, calculated from db.capacity and gcTargetRatio.
   196  func (db *DB) gcTarget() (target int64) {
   197  	return int64(float64(db.capacity) * gcTargetRatio)
   198  }
   200  // incGCSize increments gcSize by the provided number.
   201  // If count is negative, it will decrement gcSize.
   202  func (db *DB) incGCSize(count int64) {
   203  	if count == 0 {
   204  		return
   205  	}
   207  	db.gcSizeMu.Lock()
   208  	new := db.gcSize + count
   209  	db.gcSize = new
   210  	db.gcSizeMu.Unlock()
   212  	select {
   213  	case db.writeGCSizeTrigger <- struct{}{}:
   214  	default:
   215  	}
   216  	if new >= db.capacity {
   217  		db.triggerGarbageCollection()
   218  	}
   219  }
   221  // getGCSize returns gcSize value by locking it
   222  // with gcSizeMu mutex.
   223  func (db *DB) getGCSize() (count int64) {
   224  	db.gcSizeMu.RLock()
   225  	count = db.gcSize
   226  	db.gcSizeMu.RUnlock()
   227  	return count
   228  }
   230  // triggerGarbageCollection signals collectGarbageWorker
   231  // to call collectGarbage.
   232  func (db *DB) triggerGarbageCollection() {
   233  	select {
   234  	case db.collectGarbageTrigger <- struct{}{}:
   235  	case <-db.close:
   236  	default:
   237  	}
   238  }
   240  // writeGCSizeWorker writes gcSize on trigger event
   241  // and waits writeGCSizeDelay after each write.
   242  // It implements a linear backoff with delay of
   243  // writeGCSizeDelay duration to avoid very frequent
   244  // database operations.
   245  func (db *DB) writeGCSizeWorker() {
   246  	for {
   247  		select {
   248  		case <-db.writeGCSizeTrigger:
   249  			err := db.writeGCSize(db.getGCSize())
   250  			if err != nil {
   251  				log.Error("localstore write gc size", "err", err)
   252  			}
   253  			// Wait some time before writing gc size in the next
   254  			// iteration. This prevents frequent I/O operations.
   255  			select {
   256  			case <-time.After(10 * time.Second):
   257  			case <-db.close:
   258  				return
   259  			}
   260  		case <-db.close:
   261  			return
   262  		}
   263  	}
   264  }
   266  // writeGCSize stores the number of items in gcIndex.
   267  // It removes all hashes from gcUncountedHashesIndex
   268  // not to include them on the next DB initialization
   269  // (New function) when gcSize is counted.
   270  func (db *DB) writeGCSize(gcSize int64) (err error) {
   271  	const maxBatchSize = 1000
   273  	batch := new(leveldb.Batch)
   274  	db.storedGCSize.PutInBatch(batch, uint64(gcSize))
   275  	batchSize := 1
   277  	// use only one iterator as it acquires its snapshot
   278  	// not to remove hashes from index that are added
   279  	// after stored gc size is written
   280  	err = db.gcUncountedHashesIndex.Iterate(func(item shed.Item) (stop bool, err error) {
   281  		db.gcUncountedHashesIndex.DeleteInBatch(batch, item)
   282  		batchSize++
   283  		if batchSize >= maxBatchSize {
   284  			err = db.shed.WriteBatch(batch)
   285  			if err != nil {
   286  				return false, err
   287  			}
   288  			batch.Reset()
   289  			batchSize = 0
   290  		}
   291  		return false, nil
   292  	}, nil)
   293  	if err != nil {
   294  		return err
   295  	}
   296  	return db.shed.WriteBatch(batch)
   297  }
   299  // testHookCollectGarbage is a hook that can provide
   300  // information when a garbage collection run is done
   301  // and how many items it removed.
   302  var testHookCollectGarbage func(collectedCount int64)