github.com/ethersphere/bee/v2@v2.2.0/pkg/storer/reserve.go

     1  // Copyright 2023 The Swarm Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package storer
     6  
     7  import (
     8  	"context"
     9  	"encoding/hex"
    10  	"errors"
    11  	"fmt"
    12  	"math"
    13  	"slices"
    14  	"sync"
    15  	"sync/atomic"
    16  	"time"
    17  
    18  	"github.com/ethersphere/bee/v2/pkg/postage"
    19  	"github.com/ethersphere/bee/v2/pkg/storage"
    20  	"github.com/ethersphere/bee/v2/pkg/storage/storageutil"
    21  	"github.com/ethersphere/bee/v2/pkg/storer/internal/reserve"
    22  	"github.com/ethersphere/bee/v2/pkg/storer/internal/transaction"
    23  	"github.com/ethersphere/bee/v2/pkg/swarm"
    24  )
    25  
    26  const (
    27  	reserveOverCapacity = "reserveOverCapacity"
    28  	reserveUnreserved   = "reserveUnreserved"
    29  	batchExpiry         = "batchExpiry"
    30  	batchExpiryDone     = "batchExpiryDone"
    31  )
    32  
    33  var errMaxRadius = errors.New("max radius reached")
    34  var reserveSizeWithinRadius atomic.Uint64
    35  
    36  type Syncer interface {
     37  	// SyncRate returns the number of active historical syncing jobs.
    38  	SyncRate() float64
    39  	Start(context.Context)
    40  }
    41  
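// threshold is the reserve size (half of the configured capacity) below which
// the reserve worker may decrease the storage radius.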
    42  func threshold(capacity int) int { return capacity * 5 / 10 }
    43  
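// startReserveWorkers starts the reserve worker, waits for the warmup period,
// sets the initial storage radius and then starts the syncer.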
    44  func (db *DB) startReserveWorkers(
    45  	ctx context.Context,
    46  	radius func() (uint8, error),
    47  ) {
    48  	ctx, cancel := context.WithCancel(ctx)
    49  	go func() {
    50  		<-db.quit
    51  		cancel()
    52  	}()
    53  
    54  	db.inFlight.Add(1)
    55  	go db.reserveWorker(ctx)
    56  
    57  	select {
    58  	case <-time.After(db.opts.reserveWarmupDuration):
    59  	case <-db.quit:
    60  		return
    61  	}
    62  
    63  	r, err := radius()
    64  	if err != nil {
    65  		db.logger.Error(err, "reserve worker initial radius")
    66  		return // node shutdown
    67  	}
    68  
    69  	err = db.reserve.SetRadius(r)
    70  	if err != nil {
    71  		db.logger.Error(err, "reserve set radius")
    72  	} else {
    73  		db.metrics.StorageRadius.Set(float64(r))
    74  	}
    75  
     76  	// syncing can begin now that the reserve worker is running
    77  	db.syncer.Start(ctx)
    78  }
    79  
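// countWithinRadius counts the reserve chunks whose bin is at least the current
// storage radius, and triggers eviction of batches that no longer exist in the
// batchstore.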
    80  func (db *DB) countWithinRadius(ctx context.Context) (int, error) {
    81  
    82  	count := 0
    83  	missing := 0
    84  	radius := db.StorageRadius()
    85  
    86  	evictBatches := make(map[string]bool)
    87  
    88  	err := db.reserve.IterateChunksItems(0, func(ci *reserve.ChunkBinItem) (bool, error) {
    89  		if ci.Bin >= radius {
    90  			count++
    91  		}
    92  
    93  		if exists, err := db.batchstore.Exists(ci.BatchID); err == nil && !exists {
    94  			missing++
    95  			evictBatches[string(ci.BatchID)] = true
    96  		}
    97  		return false, nil
    98  	})
    99  	if err != nil {
   100  		return 0, err
   101  	}
   102  
   103  	for batch := range evictBatches {
   104  		db.logger.Debug("reserve: invalid batch", "batch_id", hex.EncodeToString([]byte(batch)))
   105  		err = errors.Join(err, db.EvictBatch(ctx, []byte(batch)))
   106  	}
   107  
   108  	db.metrics.ReserveSizeWithinRadius.Set(float64(count))
   109  	db.metrics.ReserveMissingBatch.Set(float64(missing))
   110  	reserveSizeWithinRadius.Store(uint64(count))
   111  
   112  	return count, err
   113  }
   114  
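// reserveWorker runs in the background: it evicts expired batches, unreserves
// chunks when the reserve goes over capacity, and periodically decreases the
// storage radius when the reserve is below the threshold and no syncing is in
// progress.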
   115  func (db *DB) reserveWorker(ctx context.Context) {
   116  	defer db.inFlight.Done()
   117  
   118  	batchExpiryTrigger, batchExpiryUnsub := db.events.Subscribe(batchExpiry)
   119  	defer batchExpiryUnsub()
   120  
   121  	overCapTrigger, overCapUnsub := db.events.Subscribe(reserveOverCapacity)
   122  	defer overCapUnsub()
   123  
   124  	thresholdTicker := time.NewTicker(db.opts.reserveWakeupDuration)
   125  	defer thresholdTicker.Stop()
   126  
   127  	_, _ = db.countWithinRadius(ctx)
   128  
   129  	for {
   130  		select {
   131  		case <-ctx.Done():
   132  			return
   133  		case <-batchExpiryTrigger:
   134  
   135  			err := db.evictExpiredBatches(ctx)
   136  			if err != nil {
   137  				db.logger.Warning("reserve worker evict expired batches", "error", err)
   138  			}
   139  
   140  			db.events.Trigger(batchExpiryDone)
   141  
   142  			if !db.reserve.IsWithinCapacity() {
   143  				db.events.Trigger(reserveOverCapacity)
   144  			}
   145  
   146  		case <-overCapTrigger:
   147  
   148  			db.metrics.OverCapTriggerCount.Inc()
   149  			if err := db.unreserve(ctx); err != nil {
   150  				db.logger.Warning("reserve worker unreserve", "error", err)
   151  			}
   152  
   153  		case <-thresholdTicker.C:
   154  
   155  			radius := db.reserve.Radius()
   156  			count, err := db.countWithinRadius(ctx)
   157  			if err != nil {
   158  				db.logger.Warning("reserve worker count within radius", "error", err)
   159  				continue
   160  			}
   161  
   162  			if count < threshold(db.reserve.Capacity()) && db.syncer.SyncRate() == 0 && radius > db.opts.minimumRadius {
   163  				radius--
   164  				if err := db.reserve.SetRadius(radius); err != nil {
   165  					db.logger.Error(err, "reserve set radius")
   166  				}
   167  				db.metrics.StorageRadius.Set(float64(radius))
   168  				db.logger.Info("reserve radius decrease", "radius", radius)
   169  			}
   170  		}
   171  	}
   172  }
   173  
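// evictExpiredBatches fully evicts all batches that have been marked as expired
// and removes their expiredBatchItem markers.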
   174  func (db *DB) evictExpiredBatches(ctx context.Context) error {
   175  
   176  	batches, err := db.getExpiredBatches()
   177  	if err != nil {
   178  		return err
   179  	}
   180  
   181  	for _, batchID := range batches {
   182  		evicted, err := db.evictBatch(ctx, batchID, math.MaxInt, swarm.MaxBins)
   183  		if err != nil {
   184  			return err
   185  		}
   186  		if evicted > 0 {
   187  			db.logger.Debug("evicted expired batch", "batch_id", hex.EncodeToString(batchID), "total_evicted", evicted)
   188  		}
   189  		err = db.storage.Run(ctx, func(st transaction.Store) error {
   190  			return st.IndexStore().Delete(&expiredBatchItem{BatchID: batchID})
   191  		})
   192  		if err != nil {
   193  			return err
   194  		}
   195  	}
   196  
   197  	return nil
   198  }
   199  
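// getExpiredBatches returns the IDs of all batches that have been marked as
// expired.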
   200  func (db *DB) getExpiredBatches() ([][]byte, error) {
   201  	var batchesToEvict [][]byte
   202  	err := db.storage.IndexStore().Iterate(storage.Query{
   203  		Factory:      func() storage.Item { return new(expiredBatchItem) },
   204  		ItemProperty: storage.QueryItemID,
   205  	}, func(result storage.Result) (bool, error) {
   206  		batchesToEvict = append(batchesToEvict, []byte(result.ID))
   207  		return false, nil
   208  	})
   209  	if err != nil {
   210  		return nil, err
   211  	}
   212  	return batchesToEvict, nil
   213  }
   214  
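// evictBatch evicts up to evictCount chunks of the given batch from all bins
// below upToBin, recording metrics and logging the result.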
   215  func (db *DB) evictBatch(
   216  	ctx context.Context,
   217  	batchID []byte,
   218  	evictCount int,
   219  	upToBin uint8,
   220  ) (evicted int, err error) {
   221  	dur := captureDuration(time.Now())
   222  	defer func() {
   223  		db.metrics.ReserveSize.Set(float64(db.reserve.Size()))
   224  		db.metrics.MethodCallsDuration.WithLabelValues("reserve", "EvictBatch").Observe(dur())
   225  		if err == nil {
   226  			db.metrics.MethodCalls.WithLabelValues("reserve", "EvictBatch", "success").Inc()
   227  		} else {
   228  			db.metrics.MethodCalls.WithLabelValues("reserve", "EvictBatch", "failure").Inc()
   229  		}
   230  		if upToBin == swarm.MaxBins {
   231  			db.metrics.ExpiredChunkCount.Add(float64(evicted))
   232  		} else {
   233  			db.metrics.EvictedChunkCount.Add(float64(evicted))
   234  		}
   235  		db.logger.Debug(
   236  			"reserve eviction",
   237  			"uptoBin", upToBin,
   238  			"evicted", evicted,
   239  			"batchID", hex.EncodeToString(batchID),
   240  			"new_size", db.reserve.Size(),
   241  		)
   242  	}()
   243  
   244  	return db.reserve.EvictBatchBin(ctx, batchID, evictCount, upToBin)
   245  }
   246  
    247  // EvictBatch evicts all chunks belonging to a batch from the reserve by marking the batch as expired; the eviction itself happens asynchronously in the reserve worker.
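//
// A minimal usage sketch (ctx and batchID are assumed to be in scope):
//
//	if err := db.EvictBatch(ctx, batchID); err != nil {
//		// the batch could not be marked as expired
//	}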
   248  func (db *DB) EvictBatch(ctx context.Context, batchID []byte) error {
   249  	if db.reserve == nil {
   250  		// if reserve is not configured, do nothing
   251  		return nil
   252  	}
   253  
   254  	err := db.storage.Run(ctx, func(tx transaction.Store) error {
   255  		return tx.IndexStore().Put(&expiredBatchItem{BatchID: batchID})
   256  	})
   257  	if err != nil {
   258  		return fmt.Errorf("save expired batch: %w", err)
   259  	}
   260  
   261  	db.events.Trigger(batchExpiry)
   262  	return nil
   263  }
   264  
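// ReserveGet retrieves a chunk from the reserve by address, batch ID and stamp
// hash.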
   265  func (db *DB) ReserveGet(ctx context.Context, addr swarm.Address, batchID []byte, stampHash []byte) (ch swarm.Chunk, err error) {
   266  	dur := captureDuration(time.Now())
   267  	defer func() {
   268  		db.metrics.MethodCallsDuration.WithLabelValues("reserve", "ReserveGet").Observe(dur())
   269  		if err == nil || errors.Is(err, storage.ErrNotFound) {
   270  			db.metrics.MethodCalls.WithLabelValues("reserve", "ReserveGet", "success").Inc()
   271  		} else {
   272  			db.metrics.MethodCalls.WithLabelValues("reserve", "ReserveGet", "failure").Inc()
   273  			db.logger.Debug("reserve get error", "error", err)
   274  		}
   275  	}()
   276  
   277  	return db.reserve.Get(ctx, addr, batchID, stampHash)
   278  }
   279  
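// ReserveHas reports whether a chunk with the given address, batch ID and stamp
// hash is present in the reserve.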
   280  func (db *DB) ReserveHas(addr swarm.Address, batchID []byte, stampHash []byte) (has bool, err error) {
   281  	dur := captureDuration(time.Now())
   282  	defer func() {
   283  		db.metrics.MethodCallsDuration.WithLabelValues("reserve", "ReserveHas").Observe(dur())
   284  		if err == nil {
   285  			db.metrics.MethodCalls.WithLabelValues("reserve", "ReserveHas", "success").Inc()
   286  		} else {
   287  			db.metrics.MethodCalls.WithLabelValues("reserve", "ReserveHas", "failure").Inc()
   288  			db.logger.Debug("reserve has error", "error", err)
   289  		}
   290  	}()
   291  
   292  	return db.reserve.Has(addr, batchID, stampHash)
   293  }
   294  
   295  // ReservePutter returns a Putter for inserting chunks into the reserve.
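//
// A minimal usage sketch (ctx and chunk are assumed to be in scope):
//
//	putter := db.ReservePutter()
//	if err := putter.Put(ctx, chunk); err != nil {
//		// handle the failed reserve put
//	}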
   296  func (db *DB) ReservePutter() storage.Putter {
   297  	return putterWithMetrics{
   298  		storage.PutterFunc(
   299  			func(ctx context.Context, chunk swarm.Chunk) error {
   300  				err := db.reserve.Put(ctx, chunk)
   301  				if err != nil {
   302  					db.logger.Debug("reserve put error", "error", err)
   303  					return fmt.Errorf("reserve putter.Put: %w", err)
   304  				}
   305  				db.reserveBinEvents.Trigger(string(db.po(chunk.Address())))
   306  				if !db.reserve.IsWithinCapacity() {
   307  					db.events.Trigger(reserveOverCapacity)
   308  				}
   309  				db.metrics.ReserveSize.Set(float64(db.reserve.Size()))
   310  				return nil
   311  			},
   312  		),
   313  		db.metrics,
   314  		"reserve",
   315  	}
   316  }
   317  
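// unreserve evicts chunks, batch by batch, from bins below the current storage
// radius until the eviction target is met; if a full pass over all batches is
// not enough, the radius is increased and eviction continues.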
   318  func (db *DB) unreserve(ctx context.Context) (err error) {
   319  	dur := captureDuration(time.Now())
   320  	defer func() {
   321  		db.metrics.MethodCallsDuration.WithLabelValues("reserve", "unreserve").Observe(dur())
   322  		if err == nil {
   323  			db.metrics.MethodCalls.WithLabelValues("reserve", "unreserve", "success").Inc()
   324  		} else {
   325  			db.metrics.MethodCalls.WithLabelValues("reserve", "unreserve", "failure").Inc()
   326  		}
   327  	}()
   328  
   329  	radius := db.reserve.Radius()
   330  	defer db.events.Trigger(reserveUnreserved)
   331  
   332  	target := db.reserve.EvictionTarget()
   333  	if target <= 0 {
   334  		return nil
   335  	}
   336  
   337  	db.logger.Info("unreserve start", "target", target, "radius", radius)
   338  
   339  	batchExpiry, unsub := db.events.Subscribe(batchExpiry)
   340  	defer unsub()
   341  
   342  	totalEvicted := 0
   343  
   344  	var batches [][]byte
   345  	err = db.batchstore.Iterate(func(b *postage.Batch) (bool, error) {
   346  		batches = append(batches, b.ID)
   347  		return false, nil
   348  	})
   349  	if err != nil {
   350  		return err
   351  	}
   352  
   353  	for radius < swarm.MaxBins {
   354  
   355  		for _, b := range batches {
   356  
   357  			select {
   358  			case <-batchExpiry:
   359  				db.logger.Debug("stopping unreserve, received batch expiration signal")
   360  				return nil
   361  			default:
   362  			}
   363  
   364  			evict := target - totalEvicted
   365  			if evict < int(db.opts.reserveMinEvictCount) { // evict at least a min count
   366  				evict = int(db.opts.reserveMinEvictCount)
   367  			}
   368  
   369  			binEvicted, err := db.evictBatch(ctx, b, evict, radius)
   370  			// eviction happens in batches, so we need to keep track of the total
   371  			// number of chunks evicted even if there was an error
   372  			totalEvicted += binEvicted
   373  
    374  			// an error here indicates a critical failure, e.g. a batch commit
    375  			// error, which is not recoverable
   376  			if err != nil {
   377  				return err
   378  			}
   379  
   380  			if totalEvicted >= target {
   381  				db.logger.Info("unreserve finished", "evicted", totalEvicted, "radius", radius)
   382  				return nil
   383  			}
   384  		}
   385  
   386  		radius++
   387  		db.logger.Info("reserve radius increase", "radius", radius)
   388  		_ = db.reserve.SetRadius(radius)
   389  		db.metrics.StorageRadius.Set(float64(radius))
   390  	}
   391  
   392  	return errMaxRadius
   393  }
   394  
   395  // ReserveLastBinIDs returns all of the highest binIDs from all the bins in the reserve and the epoch time of the reserve.
   396  func (db *DB) ReserveLastBinIDs() ([]uint64, uint64, error) {
   397  	if db.reserve == nil {
   398  		return nil, 0, nil
   399  	}
   400  
   401  	return db.reserve.LastBinIDs()
   402  }
   403  
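// ReserveIterateChunks iterates over all chunks in the reserve, passing each
// chunk to cb; iteration stops when cb returns true or an error.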
   404  func (db *DB) ReserveIterateChunks(cb func(swarm.Chunk) (bool, error)) error {
   405  	return db.reserve.IterateChunks(0, cb)
   406  }
   407  
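// StorageRadius returns the current storage radius, or 0 if the reserve is not
// configured.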
   408  func (db *DB) StorageRadius() uint8 {
   409  	if db.reserve == nil {
   410  		return 0
   411  	}
   412  	return db.reserve.Radius()
   413  }
   414  
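// ReserveSize returns the number of chunks currently in the reserve, or 0 if
// the reserve is not configured.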
   415  func (db *DB) ReserveSize() int {
   416  	if db.reserve == nil {
   417  		return 0
   418  	}
   419  	return db.reserve.Size()
   420  }
   421  
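// ReserveSizeWithinRadius returns the last computed number of reserve chunks
// whose bin is at least the storage radius (updated by countWithinRadius).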
   422  func (db *DB) ReserveSizeWithinRadius() uint64 {
   423  	return reserveSizeWithinRadius.Load()
   424  }
   425  
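// IsWithinStorageRadius reports whether the address falls within the node's
// storage radius, i.e. its proximity to the base address is at least the
// current radius.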
   426  func (db *DB) IsWithinStorageRadius(addr swarm.Address) bool {
   427  	if db.reserve == nil {
   428  		return false
   429  	}
   430  	return swarm.Proximity(addr.Bytes(), db.baseAddr.Bytes()) >= db.reserve.Radius()
   431  }
   432  
    433  // BinC is the result returned on the SubscribeBin channel; it contains the chunk address, bin ID, batch ID and stamp hash.
   434  type BinC struct {
   435  	Address   swarm.Address
   436  	BinID     uint64
   437  	BatchID   []byte
   438  	StampHash []byte
   439  }
   440  
    441  // SubscribeBin returns a channel that feeds all the chunks in the reserve from a certain bin, starting from the given bin ID.
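//
// A minimal consumption sketch (ctx, bin and start are assumed to be in scope):
//
//	out, unsub, errC := db.SubscribeBin(ctx, bin, start)
//	defer unsub()
//	for {
//		select {
//		case c, ok := <-out:
//			if !ok {
//				return // subscription closed
//			}
//			_ = c // process the chunk descriptor
//		case err := <-errC:
//			_ = err // subscription terminated
//			return
//		}
//	}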
   442  func (db *DB) SubscribeBin(ctx context.Context, bin uint8, start uint64) (<-chan *BinC, func(), <-chan error) {
   443  	out := make(chan *BinC)
   444  	done := make(chan struct{})
   445  	errC := make(chan error, 1)
   446  
   447  	db.inFlight.Add(1)
   448  	go func() {
   449  		defer db.inFlight.Done()
   450  
   451  		trigger, unsub := db.reserveBinEvents.Subscribe(string(bin))
   452  		defer unsub()
   453  		defer close(out)
   454  
   455  		for {
   456  
   457  			err := db.reserve.IterateBin(bin, start, func(a swarm.Address, binID uint64, batchID, stampHash []byte) (bool, error) {
   458  				select {
   459  				case out <- &BinC{Address: a, BinID: binID, BatchID: batchID, StampHash: stampHash}:
   460  					start = binID + 1
   461  				case <-done:
   462  					return true, nil
   463  				case <-db.quit:
   464  					return false, ErrDBQuit
   465  				case <-ctx.Done():
   466  					return false, ctx.Err()
   467  				}
   468  
   469  				return false, nil
   470  			})
   471  			if err != nil {
   472  				errC <- err
   473  				return
   474  			}
   475  
   476  			select {
   477  			case <-trigger:
   478  			case <-done:
   479  				return
   480  			case <-db.quit:
   481  				errC <- ErrDBQuit
   482  				return
   483  			case <-ctx.Done():
    484  				errC <- ctx.Err()
   485  				return
   486  			}
   487  		}
   488  	}()
   489  
   490  	var doneOnce sync.Once
   491  	return out, func() {
   492  		doneOnce.Do(func() { close(done) })
   493  	}, errC
   494  }
   495  
   496  // expiredBatchItem is a storage.Item implementation for expired batches.
   497  type expiredBatchItem struct {
   498  	BatchID []byte
   499  }
   500  
   501  // ID implements storage.Item.
   502  func (e *expiredBatchItem) ID() string {
   503  	return string(e.BatchID)
   504  }
   505  
   506  // Namespace implements storage.Item.
   507  func (e *expiredBatchItem) Namespace() string {
   508  	return "expiredBatchItem"
   509  }
   510  
   511  // Marshal implements storage.Item.
   512  // It is a no-op as expiredBatchItem is not serialized.
   513  func (e *expiredBatchItem) Marshal() ([]byte, error) {
   514  	return nil, nil
   515  }
   516  
   517  // Unmarshal implements storage.Item.
   518  // It is a no-op as expiredBatchItem is not serialized.
   519  func (e *expiredBatchItem) Unmarshal(_ []byte) error {
   520  	return nil
   521  }
   522  
   523  // Clone implements storage.Item.
   524  func (e *expiredBatchItem) Clone() storage.Item {
   525  	if e == nil {
   526  		return nil
   527  	}
   528  	return &expiredBatchItem{
   529  		BatchID: slices.Clone(e.BatchID),
   530  	}
   531  }
   532  
   533  // String implements storage.Item.
   534  func (e *expiredBatchItem) String() string {
   535  	return storageutil.JoinFields(e.Namespace(), e.ID())
   536  }
   537  
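// po returns the proximity order of the given address relative to the node's
// base address.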
   538  func (db *DB) po(addr swarm.Address) uint8 {
   539  	return swarm.Proximity(db.baseAddr.Bytes(), addr.Bytes())
   540  }