github.com/ethersphere/bee/v2@v2.2.0/pkg/topology/kademlia/internal/metrics/metrics.go (about) 1 // Copyright 2021 The Swarm Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 // Package metrics provides service for collecting various metrics about peers. 6 // It is intended to be used with the kademlia where the metrics are collected. 7 package metrics 8 9 import ( 10 "encoding/json" 11 "errors" 12 "fmt" 13 "sync" 14 "time" 15 16 "github.com/ethersphere/bee/v2/pkg/p2p" 17 "github.com/ethersphere/bee/v2/pkg/shed" 18 "github.com/ethersphere/bee/v2/pkg/swarm" 19 "github.com/syndtr/goleveldb/leveldb" 20 ) 21 22 const ewmaSmoothing = 0.1 23 24 // PeerConnectionDirection represents peer connection direction. 25 type PeerConnectionDirection string 26 27 const ( 28 PeerConnectionDirectionInbound PeerConnectionDirection = "inbound" 29 PeerConnectionDirectionOutbound PeerConnectionDirection = "outbound" 30 ) 31 32 // RecordOp is a definition of a peer metrics Record 33 // operation whose execution modifies a specific metrics. 34 type RecordOp func(*Counters) 35 36 // PeerLogIn will first update the current last seen to the give time t and as 37 // the second it'll set the direction of the session connection to the given 38 // value. The force flag will force the peer re-login if he's already logged in. 39 // The time is set as Unix timestamp ignoring the timezone. The operation will 40 // panic if the given time is before the Unix epoch. 41 func PeerLogIn(t time.Time, dir PeerConnectionDirection) RecordOp { 42 return func(cs *Counters) { 43 cs.Lock() 44 defer cs.Unlock() 45 46 if cs.isLoggedIn { 47 return // Ignore when the peer is already logged in. 48 } 49 cs.isLoggedIn = true 50 51 ls := t.UnixNano() 52 if ls < 0 { 53 panic(fmt.Errorf("time before unix epoch: %s", t)) 54 } 55 cs.sessionConnDirection = dir 56 cs.lastSeenTimestamp = ls 57 } 58 } 59 60 // PeerLogOut will first update the connection session and total duration with 61 // the difference of the given time t and the current last seen value. As the 62 // second it'll also update the last seen peer metrics to the given time t. 63 // The time is set as Unix timestamp ignoring the timezone. The operation will 64 // panic if the given time is before the Unix epoch. 65 func PeerLogOut(t time.Time) RecordOp { 66 return func(cs *Counters) { 67 cs.Lock() 68 defer cs.Unlock() 69 70 if !cs.isLoggedIn { 71 return // Ignore when the peer is not logged in. 72 } 73 cs.isLoggedIn = false 74 75 curLs := cs.lastSeenTimestamp 76 newLs := t.UnixNano() 77 if newLs < 0 { 78 panic(fmt.Errorf("time before unix epoch: %s", t)) 79 } 80 81 cs.sessionConnDuration = time.Duration(newLs - curLs) 82 cs.connTotalDuration += cs.sessionConnDuration 83 cs.lastSeenTimestamp = newLs 84 } 85 } 86 87 // IncSessionConnectionRetry increments the session connection retry 88 // counter by 1. 89 func IncSessionConnectionRetry() RecordOp { 90 return func(cs *Counters) { 91 cs.Lock() 92 defer cs.Unlock() 93 94 cs.sessionConnRetry++ 95 } 96 } 97 98 // PeerLatency records the average peer latency. 99 func PeerLatency(t time.Duration) RecordOp { 100 return func(cs *Counters) { 101 cs.Lock() 102 defer cs.Unlock() 103 // short circuit the first measurement 104 if cs.latencyEWMA == 0 { 105 cs.latencyEWMA = t 106 return 107 } 108 v := (ewmaSmoothing * float64(t)) + (1-ewmaSmoothing)*float64(cs.latencyEWMA) 109 cs.latencyEWMA = time.Duration(v) 110 } 111 } 112 113 // PeerReachability updates the last reachability status. 114 func PeerReachability(s p2p.ReachabilityStatus) RecordOp { 115 return func(cs *Counters) { 116 cs.Lock() 117 defer cs.Unlock() 118 cs.ReachabilityStatus = s 119 } 120 } 121 122 // PeerHealth updates the last health status of a peers. 123 func PeerHealth(isHealty bool) RecordOp { 124 return func(cs *Counters) { 125 cs.Lock() 126 defer cs.Unlock() 127 cs.Healthy = isHealty 128 } 129 } 130 131 // Snapshot represents a snapshot of peers' metrics counters. 132 type Snapshot struct { 133 LastSeenTimestamp int64 134 SessionConnectionRetry uint64 135 ConnectionTotalDuration time.Duration 136 SessionConnectionDuration time.Duration 137 SessionConnectionDirection PeerConnectionDirection 138 LatencyEWMA time.Duration 139 Reachability p2p.ReachabilityStatus 140 Healthy bool 141 } 142 143 // persistentCounters is a helper struct used for persisting selected counters. 144 type persistentCounters struct { 145 PeerAddress swarm.Address `json:"peerAddress"` 146 LastSeenTimestamp int64 `json:"lastSeenTimestamp"` 147 ConnTotalDuration time.Duration `json:"connTotalDuration"` 148 } 149 150 // Counters represents a collection of peer metrics 151 // mainly collected for statistics and debugging. 152 type Counters struct { 153 sync.Mutex 154 155 // Bookkeeping. 156 isLoggedIn bool 157 peerAddress swarm.Address 158 159 // Counters. 160 lastSeenTimestamp int64 161 connTotalDuration time.Duration 162 sessionConnRetry uint64 163 sessionConnDuration time.Duration 164 sessionConnDirection PeerConnectionDirection 165 latencyEWMA time.Duration 166 ReachabilityStatus p2p.ReachabilityStatus 167 Healthy bool 168 } 169 170 // UnmarshalJSON unmarshal just the persistent counters. 171 func (cs *Counters) UnmarshalJSON(b []byte) (err error) { 172 var val persistentCounters 173 if err := json.Unmarshal(b, &val); err != nil { 174 return err 175 } 176 cs.Lock() 177 cs.peerAddress = val.PeerAddress 178 cs.lastSeenTimestamp = val.LastSeenTimestamp 179 cs.connTotalDuration = val.ConnTotalDuration 180 cs.Unlock() 181 return nil 182 } 183 184 // MarshalJSON marshals just the persistent counters. 185 func (cs *Counters) MarshalJSON() ([]byte, error) { 186 cs.Lock() 187 val := persistentCounters{ 188 PeerAddress: cs.peerAddress, 189 LastSeenTimestamp: cs.lastSeenTimestamp, 190 ConnTotalDuration: cs.connTotalDuration, 191 } 192 cs.Unlock() 193 return json.Marshal(val) 194 } 195 196 // snapshot returns current snapshot of counters referenced to the given t. 197 func (cs *Counters) snapshot(t time.Time) *Snapshot { 198 cs.Lock() 199 defer cs.Unlock() 200 201 connTotalDuration := cs.connTotalDuration 202 sessionConnDuration := cs.sessionConnDuration 203 if cs.isLoggedIn { 204 sessionConnDuration = t.Sub(time.Unix(0, cs.lastSeenTimestamp)) 205 connTotalDuration += sessionConnDuration 206 } 207 208 return &Snapshot{ 209 LastSeenTimestamp: cs.lastSeenTimestamp, 210 SessionConnectionRetry: cs.sessionConnRetry, 211 ConnectionTotalDuration: connTotalDuration, 212 SessionConnectionDuration: sessionConnDuration, 213 SessionConnectionDirection: cs.sessionConnDirection, 214 LatencyEWMA: cs.latencyEWMA, 215 Reachability: cs.ReachabilityStatus, 216 Healthy: cs.Healthy, 217 } 218 } 219 220 // NewCollector is a convenient constructor for creating new Collector. 221 func NewCollector(db *shed.DB) (*Collector, error) { 222 const name = "kademlia-counters" 223 224 c := new(Collector) 225 226 val, err := db.NewStructField(name) 227 if err != nil { 228 return nil, fmt.Errorf("field initialization for %q failed: %w", name, err) 229 } 230 c.persistence = &val 231 232 counters := make(map[string]persistentCounters) 233 if err := val.Get(&counters); err != nil && !errors.Is(err, leveldb.ErrNotFound) { 234 return nil, err 235 } 236 237 for _, val := range counters { 238 c.counters.Store(val.PeerAddress.ByteString(), &Counters{ 239 peerAddress: val.PeerAddress, 240 lastSeenTimestamp: val.LastSeenTimestamp, 241 connTotalDuration: val.ConnTotalDuration, 242 }) 243 } 244 245 return c, nil 246 } 247 248 // Collector collects various metrics about 249 // peers specified be the swarm.Address. 250 type Collector struct { 251 counters sync.Map 252 persistence *shed.StructField 253 } 254 255 // Record records a set of metrics for peer specified by the given address. 256 func (c *Collector) Record(addr swarm.Address, rop ...RecordOp) { 257 val, _ := c.counters.LoadOrStore(addr.ByteString(), &Counters{peerAddress: addr}) 258 for _, op := range rop { 259 op(val.(*Counters)) 260 } 261 } 262 263 // Snapshot returns the current state of the metrics collector for peer(s). 264 // The given time t is used to calculate the duration of the current session, 265 // if any. If an address or a set of addresses is specified then only metrics 266 // related to them will be returned, otherwise metrics for all peers will be 267 // returned. If the peer is still logged in, the session-related counters will 268 // be evaluated against the last seen time, which equals to the login time. If 269 // the peer is logged out, then the session counters will reflect its last 270 // session. 271 func (c *Collector) Snapshot(t time.Time, addresses ...swarm.Address) map[string]*Snapshot { 272 snapshot := make(map[string]*Snapshot) 273 274 for _, addr := range addresses { 275 val, ok := c.counters.Load(addr.ByteString()) 276 if !ok { 277 continue 278 } 279 cs := val.(*Counters) 280 snapshot[addr.ByteString()] = cs.snapshot(t) 281 } 282 283 if len(addresses) == 0 { 284 c.counters.Range(func(key, val interface{}) bool { 285 cs := val.(*Counters) 286 snapshot[cs.peerAddress.ByteString()] = cs.snapshot(t) 287 return true 288 }) 289 } 290 291 return snapshot 292 } 293 294 // IsUnreachable returns true if the peer is unreachable. 295 func (c *Collector) IsUnreachable(addr swarm.Address) bool { 296 val, ok := c.counters.Load(addr.ByteString()) 297 if !ok { 298 return true 299 } 300 cs := val.(*Counters) 301 302 cs.Lock() 303 defer cs.Unlock() 304 305 return cs.ReachabilityStatus != p2p.ReachabilityStatusPublic 306 } 307 308 // ExcludeOp is a function type used to filter peers on certain fields. 309 type ExcludeOp func(*Counters) bool 310 311 // Reachable is used to filter reachable or unreachable peers based on r. 312 func Reachability(filterReachable bool) ExcludeOp { 313 return func(cs *Counters) bool { 314 reachble := cs.ReachabilityStatus == p2p.ReachabilityStatusPublic 315 if filterReachable { 316 return reachble 317 } 318 return !reachble 319 } 320 } 321 322 // Unreachable is used to filter unhealthy peers. 323 func Health(filterHealthy bool) ExcludeOp { 324 return func(cs *Counters) bool { 325 if filterHealthy { 326 return cs.Healthy 327 } 328 return !cs.Healthy 329 } 330 } 331 332 // Exclude returns false if the addr passes all exclusion operations. 333 func (c *Collector) Exclude(addr swarm.Address, fop ...ExcludeOp) bool { 334 val, ok := c.counters.Load(addr.ByteString()) 335 if !ok { 336 return true 337 } 338 cs := val.(*Counters) 339 cs.Lock() 340 defer cs.Unlock() 341 342 for _, f := range fop { 343 if f(cs) { 344 return true 345 } 346 } 347 348 return false 349 } 350 351 // Inspect allows inspecting current snapshot for the given 352 // peer address by executing the inspection function. 353 func (c *Collector) Inspect(addr swarm.Address) *Snapshot { 354 snapshots := c.Snapshot(time.Now(), addr) 355 return snapshots[addr.ByteString()] 356 } 357 358 // Flush sync the dirty in memory counters for all peers by flushing their 359 // values to the underlying storage. 360 func (c *Collector) Flush() error { 361 counters := make(map[string]interface{}) 362 c.counters.Range(func(key, val interface{}) bool { 363 cs := val.(*Counters) 364 counters[cs.peerAddress.ByteString()] = val 365 return true 366 }) 367 368 if err := c.persistence.Put(counters); err != nil { 369 return fmt.Errorf("unable to persist counters: %w", err) 370 } 371 return nil 372 } 373 374 // Finalize tries to log out all ongoing peer sessions. 375 func (c *Collector) Finalize(t time.Time, remove bool) error { 376 c.counters.Range(func(_, val interface{}) bool { 377 cs := val.(*Counters) 378 PeerLogOut(t)(cs) 379 return true 380 }) 381 382 if err := c.Flush(); err != nil { 383 return err 384 } 385 386 if remove { 387 c.counters.Range(func(_, val interface{}) bool { 388 cs := val.(*Counters) 389 c.counters.Delete(cs.peerAddress.ByteString()) 390 return true 391 }) 392 } 393 394 return nil 395 }