github.com/ethersphere/bee/v2@v2.2.0/pkg/salud/salud.go

// Copyright 2023 The Swarm Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package salud monitors the connected peers, calculates certain thresholds, and marks as unhealthy
// peers that fall short of the thresholds, in order to maintain network salud (health).
package salud

import (
	"context"
	"sort"
	"sync"
	"time"

	"github.com/ethersphere/bee/v2/pkg/log"
	"github.com/ethersphere/bee/v2/pkg/status"
	"github.com/ethersphere/bee/v2/pkg/storer"
	"github.com/ethersphere/bee/v2/pkg/swarm"
	"github.com/ethersphere/bee/v2/pkg/topology"
	"go.uber.org/atomic"
)

// loggerName is the tree path name of the logger for this package.
const loggerName = "salud"

const (
	wakeup                 = time.Minute * 5
	requestTimeout         = time.Second * 10
	DefaultMinPeersPerBin  = 4
	DefaultDurPercentile   = 0.4 // consider 40% as healthy, lower percentile = stricter duration check
	DefaultConnsPercentile = 0.8 // consider 80% as healthy, lower percentile = stricter conns check
)

type topologyDriver interface {
	UpdatePeerHealth(peer swarm.Address, health bool, dur time.Duration)
	topology.PeerIterator
}

type peerStatus interface {
	PeerSnapshot(ctx context.Context, peer swarm.Address) (*status.Snapshot, error)
}

type reserve interface {
	storer.RadiusChecker
	ReserveSize() int
}

type service struct {
	wg            sync.WaitGroup
	quit          chan struct{}
	logger        log.Logger
	topology      topologyDriver
	status        peerStatus
	metrics       metrics
	isSelfHealthy *atomic.Bool
	reserve       reserve

	radiusSubsMtx sync.Mutex
	radiusC       []chan uint8
}

func New(
	status peerStatus,
	topology topologyDriver,
	reserve reserve,
	logger log.Logger,
	warmup time.Duration,
	mode string,
	minPeersPerbin int,
	durPercentile float64,
	connsPercentile float64,
) *service {

	metrics := newMetrics()

	s := &service{
		quit:          make(chan struct{}),
		logger:        logger.WithName(loggerName).Register(),
		status:        status,
		topology:      topology,
		metrics:       metrics,
		isSelfHealthy: atomic.NewBool(true),
		reserve:       reserve,
	}

	s.wg.Add(1)
	go s.worker(warmup, mode, minPeersPerbin, durPercentile, connsPercentile)

	return s

}

func (s *service) worker(warmup time.Duration, mode string, minPeersPerbin int, durPercentile float64, connsPercentile float64) {
	defer s.wg.Done()

	select {
	case <-s.quit:
		return
	case <-time.After(warmup):
	}

	for {

		s.salud(mode, minPeersPerbin, durPercentile, connsPercentile)

		select {
		case <-s.quit:
			return
		case <-time.After(wakeup):
		}
	}
}

func (s *service) Close() error {
	close(s.quit)
	s.wg.Wait()
	return nil
}

type peer struct {
	status   *status.Snapshot
	dur      time.Duration
	addr     swarm.Address
	bin      uint8
	neighbor bool
}

// salud acquires the status snapshot of every peer and computes an nth percentile of response duration and connected
// peer count, the most common storage radius, and the most common batch commitment, and, based on these values,
// marks as unhealthy peers that fall outside the allowed thresholds.
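//
// As an illustration of the percentile thresholds (derived from percentileDur and
// percentileConns below): with durPercentile = 0.4 and 10 responsive peers, the
// duration threshold is the response time of the 5th fastest peer (index 4 after
// an ascending sort), so peers slower than that fail the duration check; with
// connsPercentile = 0.8, the connection threshold is the 9th highest connected
// peer count, so peers with fewer connections fail the connectedness check.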
func (s *service) salud(mode string, minPeersPerbin int, durPercentile float64, connsPercentile float64) {

	var (
		mtx      sync.Mutex
		wg       sync.WaitGroup
		totaldur float64
		peers    []peer
		bins     [swarm.MaxBins]int
	)

	_ = s.topology.EachConnectedPeer(func(addr swarm.Address, bin uint8) (stop bool, jumpToNext bool, err error) {
		wg.Add(1)
		go func() {
			defer wg.Done()

			ctx, cancel := context.WithTimeout(context.Background(), requestTimeout)
			defer cancel()

			start := time.Now()
			snapshot, err := s.status.PeerSnapshot(ctx, addr)
			dur := time.Since(start)

			if err != nil {
				s.topology.UpdatePeerHealth(addr, false, dur)
				return
			}

			if snapshot.BeeMode != mode {
				return
			}

			mtx.Lock()
			bins[bin]++
			totaldur += dur.Seconds()
			peers = append(peers, peer{snapshot, dur, addr, bin, s.reserve.IsWithinStorageRadius(addr)})
			mtx.Unlock()
		}()
		return false, false, nil
	}, topology.Select{})

	wg.Wait()

	if len(peers) == 0 {
		return
	}

	networkRadius, nHoodRadius := s.radius(peers)
	avgDur := totaldur / float64(len(peers))
	pDur := percentileDur(peers, durPercentile)
	pConns := percentileConns(peers, connsPercentile)
	commitment := commitment(peers)

	s.metrics.AvgDur.Set(avgDur)
	s.metrics.PDur.Set(pDur)
	s.metrics.PConns.Set(float64(pConns))
	s.metrics.NetworkRadius.Set(float64(networkRadius))
	s.metrics.NeighborhoodRadius.Set(float64(nHoodRadius))
	s.metrics.Commitment.Set(float64(commitment))

	s.logger.Debug("computed", "avg_dur", avgDur, "pDur", pDur, "pConns", pConns, "network_radius", networkRadius, "neighborhood_radius", nHoodRadius, "batch_commitment", commitment)

	for _, peer := range peers {

		var healthy bool

		// every bin should have at least some peers, healthy or not
		if bins[peer.bin] <= minPeersPerbin {
			s.metrics.Healthy.Inc()
			s.topology.UpdatePeerHealth(peer.addr, true, peer.dur)
			continue
		}

		if networkRadius > 0 && peer.status.StorageRadius < uint32(networkRadius-1) {
			s.logger.Debug("radius health failure", "radius", peer.status.StorageRadius, "peer_address", peer.addr)
		} else if peer.dur.Seconds() > pDur {
			s.logger.Debug("response duration below threshold", "duration", peer.dur, "peer_address", peer.addr)
		} else if peer.status.ConnectedPeers < pConns {
			s.logger.Debug("connections count below threshold", "connections", peer.status.ConnectedPeers, "peer_address", peer.addr)
		} else if peer.status.BatchCommitment != commitment {
			s.logger.Debug("batch commitment check failure", "commitment", peer.status.BatchCommitment, "peer_address", peer.addr)
		} else {
			healthy = true
		}

		s.topology.UpdatePeerHealth(peer.addr, healthy, peer.dur)
		if healthy {
			s.metrics.Healthy.Inc()
		} else {
			s.metrics.Unhealthy.Inc()
			bins[peer.bin]--
		}
	}

	selfHealth := true
	if nHoodRadius == networkRadius && s.reserve.StorageRadius() != networkRadius {
		selfHealth = false
		s.logger.Warning("node is unhealthy due to storage radius discrepancy", "self_radius", s.reserve.StorageRadius(), "network_radius", networkRadius)
	}

	s.isSelfHealthy.Store(selfHealth)

	s.publishRadius(networkRadius)
}

func (s *service) IsHealthy() bool {
	return s.isSelfHealthy.Load()
}

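// publishRadius sends the most recently computed network storage radius to all
// subscribers. Sends are non-blocking: if a subscriber's buffered channel still
// holds an unread value, the new value is dropped for that subscriber.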
func (s *service) publishRadius(r uint8) {
	s.radiusSubsMtx.Lock()
	defer s.radiusSubsMtx.Unlock()
	for _, cb := range s.radiusC {
		select {
		case cb <- r:
		default:
		}
	}
}

// SubscribeNetworkStorageRadius returns a channel on which the most recently
// computed network storage radius is published after each round, along with a
// function that cancels the subscription.
func (s *service) SubscribeNetworkStorageRadius() (<-chan uint8, func()) {
	s.radiusSubsMtx.Lock()
	defer s.radiusSubsMtx.Unlock()

	c := make(chan uint8, 1)
	s.radiusC = append(s.radiusC, c)

	return c, func() {
		s.radiusSubsMtx.Lock()
		defer s.radiusSubsMtx.Unlock()
		for i, cc := range s.radiusC {
			if c == cc {
				s.radiusC = append(s.radiusC[:i], s.radiusC[i+1:]...)
				break
			}
		}
	}
}

// percentileDur finds the p percentile of response duration.
// Less is better.
func percentileDur(peers []peer, p float64) float64 {

	index := int(float64(len(peers)) * p)

	sort.Slice(peers, func(i, j int) bool {
		return peers[i].dur < peers[j].dur // ascending
	})

	return peers[index].dur.Seconds()
}

// percentileConns finds the p percentile of connection count.
// More is better.
func percentileConns(peers []peer, p float64) uint64 {

	index := int(float64(len(peers)) * p)

	sort.Slice(peers, func(i, j int) bool {
		return peers[i].status.ConnectedPeers > peers[j].status.ConnectedPeers // descending
	})

	return peers[index].status.ConnectedPeers
}

// radius finds the most common storage radius across all peers (the network
// radius) and across neighborhood peers (the neighborhood radius).
func (s *service) radius(peers []peer) (uint8, uint8) {

	var networkRadius [swarm.MaxBins]int
	var nHoodRadius [swarm.MaxBins]int

	for _, peer := range peers {
		if peer.status.StorageRadius < uint32(swarm.MaxBins) {
			if peer.neighbor {
				nHoodRadius[peer.status.StorageRadius]++
			}
			networkRadius[peer.status.StorageRadius]++
		}
	}

	networkR := maxIndex(networkRadius[:])
	hoodR := maxIndex(nHoodRadius[:])

	return uint8(networkR), uint8(hoodR)
}

// commitment finds the most common batch commitment.
func commitment(peers []peer) uint64 {

	commitments := make(map[uint64]int)

	for _, peer := range peers {
		commitments[peer.status.BatchCommitment]++
	}

	var (
		maxCount             = 0
		maxCommitment uint64 = 0
	)

	for commitment, count := range commitments {
		if count > maxCount {
			maxCommitment = commitment
			maxCount = count
		}
	}

	return maxCommitment
}

// maxIndex returns the index of the largest value in n.
func maxIndex(n []int) int {

	maxValue := 0
	index := 0
	for i, c := range n {
		if c > maxValue {
			maxValue = c
			index = i
		}
	}

	return index
}
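// Usage sketch for SubscribeNetworkStorageRadius (illustrative only, not part of
// the package API; it assumes a consumer goroutine that holds the service s and
// its own quit channel):
//
//	radiusC, unsub := s.SubscribeNetworkStorageRadius()
//	defer unsub()
//	for {
//		select {
//		case r := <-radiusC:
//			_ = r // react to the most recently published network storage radius
//		case <-quit:
//			return
//		}
//	}
//
// Because the subscription channel is buffered with capacity 1 and publishes
// never block, a slow consumer only ever observes the latest value.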