github.com/ethersphere/bee/v2@v2.2.0/pkg/salud/salud.go

// Copyright 2023 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package salud monitors the connected peers, calculates certain thresholds,
// and marks as unhealthy the peers that fall short of the thresholds, in order
// to maintain network salud (health).
package salud

import (
	"context"
	"sort"
	"sync"
	"time"

	"github.com/ethersphere/bee/v2/pkg/log"
	"github.com/ethersphere/bee/v2/pkg/status"
	"github.com/ethersphere/bee/v2/pkg/storer"
	"github.com/ethersphere/bee/v2/pkg/swarm"
	"github.com/ethersphere/bee/v2/pkg/topology"
	"go.uber.org/atomic"
)

// loggerName is the tree path name of the logger for this package.
const loggerName = "salud"

const (
	wakeup                 = time.Minute * 5
	requestTimeout         = time.Second * 10
	DefaultMinPeersPerBin  = 4
	DefaultDurPercentile   = 0.4 // consider 40% as healthy, lower percentile = stricter duration check
	DefaultConnsPercentile = 0.8 // consider 80% as healthy, lower percentile = stricter conns check
)
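
// To illustrate how the percentile constants translate into thresholds, here
// is a minimal, hypothetical sketch (not part of the package API): for ten
// ascending response durations and DefaultDurPercentile = 0.4, the threshold
// is the element at index 4, so peers slower than that value would be flagged.
func examplePercentileThreshold() time.Duration {
	durs := []time.Duration{ // pretend these were measured
		1 * time.Second, 2 * time.Second, 3 * time.Second, 4 * time.Second,
		5 * time.Second, 6 * time.Second, 7 * time.Second, 8 * time.Second,
		9 * time.Second, 10 * time.Second,
	}
	sort.Slice(durs, func(i, j int) bool { return durs[i] < durs[j] })
	index := int(float64(len(durs)) * DefaultDurPercentile) // 10 * 0.4 = 4
	return durs[index]                                      // 5s: slower peers fail the check
}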

type topologyDriver interface {
	UpdatePeerHealth(peer swarm.Address, health bool, dur time.Duration)
	topology.PeerIterator
}

type peerStatus interface {
	PeerSnapshot(ctx context.Context, peer swarm.Address) (*status.Snapshot, error)
}

type reserve interface {
	storer.RadiusChecker
	ReserveSize() int
}

type service struct {
	wg            sync.WaitGroup
	quit          chan struct{}
	logger        log.Logger
	topology      topologyDriver
	status        peerStatus
	metrics       metrics
	isSelfHealthy *atomic.Bool
	reserve       reserve

	radiusSubsMtx sync.Mutex
	radiusC       []chan uint8
}

func New(
	status peerStatus,
	topology topologyDriver,
	reserve reserve,
	logger log.Logger,
	warmup time.Duration,
	mode string,
	minPeersPerbin int,
	durPercentile float64,
	connsPercentile float64,
) *service {
	metrics := newMetrics()

	s := &service{
		quit:          make(chan struct{}),
		logger:        logger.WithName(loggerName).Register(),
		status:        status,
		topology:      topology,
		metrics:       metrics,
		isSelfHealthy: atomic.NewBool(true),
		reserve:       reserve,
	}

	s.wg.Add(1)
	go s.worker(warmup, mode, minPeersPerbin, durPercentile, connsPercentile)

	return s
}
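
// A minimal wiring sketch (hypothetical; the concrete status, topology, and
// reserve implementations and the "full" mode string are assumptions, not
// values mandated by this package): construct the service with the package
// defaults and shut it down alongside the node.
func exampleNew(st peerStatus, top topologyDriver, res reserve, logger log.Logger) {
	svc := New(st, top, res, logger,
		5*time.Minute, // warmup before the first health round
		"full",        // only peers reporting this bee mode are considered
		DefaultMinPeersPerBin, DefaultDurPercentile, DefaultConnsPercentile)
	defer svc.Close()
}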

func (s *service) worker(warmup time.Duration, mode string, minPeersPerbin int, durPercentile float64, connsPercentile float64) {
	defer s.wg.Done()

	select {
	case <-s.quit:
		return
	case <-time.After(warmup):
	}

	for {
		s.salud(mode, minPeersPerbin, durPercentile, connsPercentile)

		select {
		case <-s.quit:
			return
		case <-time.After(wakeup):
		}
	}
}

func (s *service) Close() error {
	close(s.quit)
	s.wg.Wait()
	return nil
}

type peer struct {
	status   *status.Snapshot
	dur      time.Duration
	addr     swarm.Address
	bin      uint8
	neighbor bool
}

// salud acquires the status snapshot of every peer and computes the nth
// percentiles of response duration and connected peer count, the most common
// storage radius, and the most common batch commitment; based on these values
// it marks as unhealthy the peers that fall outside the allowed thresholds.
func (s *service) salud(mode string, minPeersPerbin int, durPercentile float64, connsPercentile float64) {
	var (
		mtx      sync.Mutex
		wg       sync.WaitGroup
		totaldur float64
		peers    []peer
		bins     [swarm.MaxBins]int
	)

	_ = s.topology.EachConnectedPeer(func(addr swarm.Address, bin uint8) (stop bool, jumpToNext bool, err error) {
		wg.Add(1)
		go func() {
			defer wg.Done()

			ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
			defer cancel()

			start := time.Now()
			snapshot, err := s.status.PeerSnapshot(ctx, addr)
			dur := time.Since(start)

			if err != nil {
				s.topology.UpdatePeerHealth(addr, false, dur)
				return
			}

			if snapshot.BeeMode != mode {
				return
			}

			mtx.Lock()
			bins[bin]++
			totaldur += dur.Seconds()
			peers = append(peers, peer{snapshot, dur, addr, bin, s.reserve.IsWithinStorageRadius(addr)})
			mtx.Unlock()
		}()
		return false, false, nil
	}, topology.Select{})

	wg.Wait()

	if len(peers) == 0 {
		return
	}

	networkRadius, nHoodRadius := s.radius(peers)
	avgDur := totaldur / float64(len(peers))
	pDur := percentileDur(peers, durPercentile)
	pConns := percentileConns(peers, connsPercentile)
	commitment := commitment(peers)

	s.metrics.AvgDur.Set(avgDur)
	s.metrics.PDur.Set(pDur)
	s.metrics.PConns.Set(float64(pConns))
	s.metrics.NetworkRadius.Set(float64(networkRadius))
	s.metrics.NeighborhoodRadius.Set(float64(nHoodRadius))
	s.metrics.Commitment.Set(float64(commitment))

	s.logger.Debug("computed", "avg_dur", avgDur, "pDur", pDur, "pConns", pConns, "network_radius", networkRadius, "neighborhood_radius", nHoodRadius, "batch_commitment", commitment)

	for _, peer := range peers {

		var healthy bool

		// every bin should maintain at least minPeersPerbin peers, healthy or not
		if bins[peer.bin] <= minPeersPerbin {
			s.metrics.Healthy.Inc()
			s.topology.UpdatePeerHealth(peer.addr, true, peer.dur)
			continue
		}

		if networkRadius > 0 && peer.status.StorageRadius < uint32(networkRadius-1) {
			s.logger.Debug("radius health failure", "radius", peer.status.StorageRadius, "peer_address", peer.addr)
		} else if peer.dur.Seconds() > pDur {
			s.logger.Debug("response duration exceeds threshold", "duration", peer.dur, "peer_address", peer.addr)
		} else if peer.status.ConnectedPeers < pConns {
			s.logger.Debug("connections count below threshold", "connections", peer.status.ConnectedPeers, "peer_address", peer.addr)
		} else if peer.status.BatchCommitment != commitment {
			s.logger.Debug("batch commitment check failure", "commitment", peer.status.BatchCommitment, "peer_address", peer.addr)
		} else {
			healthy = true
		}

		s.topology.UpdatePeerHealth(peer.addr, healthy, peer.dur)
		if healthy {
			s.metrics.Healthy.Inc()
		} else {
			s.metrics.Unhealthy.Inc()
			bins[peer.bin]--
		}
	}

	selfHealth := true
	if nHoodRadius == networkRadius && s.reserve.StorageRadius() != networkRadius {
		selfHealth = false
		s.logger.Warning("node is unhealthy due to storage radius discrepancy", "self_radius", s.reserve.StorageRadius(), "network_radius", networkRadius)
	}

	s.isSelfHealthy.Store(selfHealth)

	s.publishRadius(networkRadius)
}

// IsHealthy reports whether the node itself is considered healthy, i.e. its
// storage radius does not diverge from the network storage radius.
func (s *service) IsHealthy() bool {
	return s.isSelfHealthy.Load()
}

func (s *service) publishRadius(r uint8) {
	s.radiusSubsMtx.Lock()
	defer s.radiusSubsMtx.Unlock()
	for _, cb := range s.radiusC {
		// non-blocking send: a subscriber whose buffer is full simply misses
		// this update and will pick up the next one
		select {
		case cb <- r:
		default:
		}
	}
}

// SubscribeNetworkStorageRadius returns a channel on which the most recently
// computed network storage radius is published, along with a function to
// cancel the subscription.
func (s *service) SubscribeNetworkStorageRadius() (<-chan uint8, func()) {
	s.radiusSubsMtx.Lock()
	defer s.radiusSubsMtx.Unlock()

	c := make(chan uint8, 1)
	s.radiusC = append(s.radiusC, c)

	return c, func() {
		s.radiusSubsMtx.Lock()
		defer s.radiusSubsMtx.Unlock()
		for i, cc := range s.radiusC {
			if c == cc {
				s.radiusC = append(s.radiusC[:i], s.radiusC[i+1:]...)
				break
			}
		}
	}
}
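
// A minimal subscriber sketch (hypothetical; the quit channel is an
// assumption): consume radius updates until shutdown, then release the
// subscription so the service stops publishing to it.
func exampleSubscribe(s *service, quit <-chan struct{}) {
	c, unsubscribe := s.SubscribeNetworkStorageRadius()
	defer unsubscribe()

	for {
		select {
		case r := <-c:
			_ = r // react to the new network storage radius
		case <-quit:
			return
		}
	}
}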

// percentileDur finds the p percentile of response duration.
// Less is better. Note that it sorts peers in place and assumes 0 <= p < 1.
func percentileDur(peers []peer, p float64) float64 {
	index := int(float64(len(peers)) * p)

	sort.Slice(peers, func(i, j int) bool {
		return peers[i].dur < peers[j].dur // ascending
	})

	return peers[index].dur.Seconds()
}

// percentileConns finds the p percentile of connection count.
// More is better. Note that it sorts peers in place and assumes 0 <= p < 1.
func percentileConns(peers []peer, p float64) uint64 {
	index := int(float64(len(peers)) * p)

	sort.Slice(peers, func(i, j int) bool {
		return peers[i].status.ConnectedPeers > peers[j].status.ConnectedPeers // descending
	})

	return peers[index].status.ConnectedPeers
}

// radius finds the most common storage radius across all peers (network
// radius) and across neighborhood peers (neighborhood radius).
func (s *service) radius(peers []peer) (uint8, uint8) {
	var networkRadius [swarm.MaxBins]int
	var nHoodRadius [swarm.MaxBins]int

	for _, peer := range peers {
		if peer.status.StorageRadius < uint32(swarm.MaxBins) {
			if peer.neighbor {
				nHoodRadius[peer.status.StorageRadius]++
			}
			networkRadius[peer.status.StorageRadius]++
		}
	}

	networkR := maxIndex(networkRadius[:])
	hoodR := maxIndex(nHoodRadius[:])

	return uint8(networkR), uint8(hoodR)
}
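
// A worked sketch (hypothetical data, not part of the package API): with three
// peers reporting storage radius 8 and one reporting 9, the most frequent
// bucket wins, so the computed radius is 8.
func exampleMostCommonRadius() uint8 {
	var counts [swarm.MaxBins]int
	for _, r := range []uint8{8, 8, 9, 8} {
		counts[r]++
	}
	return uint8(maxIndex(counts[:])) // 8
}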

// commitment finds the most common batch commitment.
func commitment(peers []peer) uint64 {
	commitments := make(map[uint64]int)

	for _, peer := range peers {
		commitments[peer.status.BatchCommitment]++
	}

	var (
		maxCount             = 0
		maxCommitment uint64 = 0
	)

	for commitment, count := range commitments {
		if count > maxCount {
			maxCommitment = commitment
			maxCount = count
		}
	}

	return maxCommitment
}

// maxIndex returns the index of the largest value in n.
func maxIndex(n []int) int {
	maxValue := 0
	index := 0
	for i, c := range n {
		if c > maxValue {
			maxValue = c
			index = i
		}
	}

	return index
}