github.com/ethersphere/bee/v2@v2.2.0/pkg/topology/kademlia/internal/metrics/metrics.go (about)

     1  // Copyright 2021 The Swarm Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
// Package metrics provides a service for collecting various metrics about peers.
// It is intended to be used with kademlia, where the metrics are collected.
     7  package metrics
     8  
     9  import (
    10  	"encoding/json"
    11  	"errors"
    12  	"fmt"
    13  	"sync"
    14  	"time"
    15  
    16  	"github.com/ethersphere/bee/v2/pkg/p2p"
    17  	"github.com/ethersphere/bee/v2/pkg/shed"
    18  	"github.com/ethersphere/bee/v2/pkg/swarm"
    19  	"github.com/syndtr/goleveldb/leveldb"
    20  )
    21  
// ewmaSmoothing is the weight given to the newest latency sample when the
// exponentially weighted moving average is updated in PeerLatency; the
// previous average is weighted by 1-ewmaSmoothing.
const ewmaSmoothing = 0.1
    23  
// PeerConnectionDirection represents the direction of a peer connection.
// It is recorded per session by PeerLogIn.
type PeerConnectionDirection string

// Known values of PeerConnectionDirection.
const (
	PeerConnectionDirectionInbound  PeerConnectionDirection = "inbound"
	PeerConnectionDirectionOutbound PeerConnectionDirection = "outbound"
)
    31  
// RecordOp is a peer metrics record operation: a function that, when
// executed, modifies specific metrics of the Counters it is given.
// RecordOps are applied to a peer's Counters by Collector.Record.
type RecordOp func(*Counters)
    35  
    36  // PeerLogIn will first update the current last seen to the give time t and as
    37  // the second it'll set the direction of the session connection to the given
    38  // value. The force flag will force the peer re-login if he's already logged in.
    39  // The time is set as Unix timestamp ignoring the timezone. The operation will
    40  // panic if the given time is before the Unix epoch.
    41  func PeerLogIn(t time.Time, dir PeerConnectionDirection) RecordOp {
    42  	return func(cs *Counters) {
    43  		cs.Lock()
    44  		defer cs.Unlock()
    45  
    46  		if cs.isLoggedIn {
    47  			return // Ignore when the peer is already logged in.
    48  		}
    49  		cs.isLoggedIn = true
    50  
    51  		ls := t.UnixNano()
    52  		if ls < 0 {
    53  			panic(fmt.Errorf("time before unix epoch: %s", t))
    54  		}
    55  		cs.sessionConnDirection = dir
    56  		cs.lastSeenTimestamp = ls
    57  	}
    58  }
    59  
    60  // PeerLogOut will first update the connection session and total duration with
    61  // the difference of the given time t and the current last seen value. As the
    62  // second it'll also update the last seen peer metrics to the given time t.
    63  // The time is set as Unix timestamp ignoring the timezone. The operation will
    64  // panic if the given time is before the Unix epoch.
    65  func PeerLogOut(t time.Time) RecordOp {
    66  	return func(cs *Counters) {
    67  		cs.Lock()
    68  		defer cs.Unlock()
    69  
    70  		if !cs.isLoggedIn {
    71  			return // Ignore when the peer is not logged in.
    72  		}
    73  		cs.isLoggedIn = false
    74  
    75  		curLs := cs.lastSeenTimestamp
    76  		newLs := t.UnixNano()
    77  		if newLs < 0 {
    78  			panic(fmt.Errorf("time before unix epoch: %s", t))
    79  		}
    80  
    81  		cs.sessionConnDuration = time.Duration(newLs - curLs)
    82  		cs.connTotalDuration += cs.sessionConnDuration
    83  		cs.lastSeenTimestamp = newLs
    84  	}
    85  }
    86  
    87  // IncSessionConnectionRetry increments the session connection retry
    88  // counter by 1.
    89  func IncSessionConnectionRetry() RecordOp {
    90  	return func(cs *Counters) {
    91  		cs.Lock()
    92  		defer cs.Unlock()
    93  
    94  		cs.sessionConnRetry++
    95  	}
    96  }
    97  
    98  // PeerLatency records the average peer latency.
    99  func PeerLatency(t time.Duration) RecordOp {
   100  	return func(cs *Counters) {
   101  		cs.Lock()
   102  		defer cs.Unlock()
   103  		// short circuit the first measurement
   104  		if cs.latencyEWMA == 0 {
   105  			cs.latencyEWMA = t
   106  			return
   107  		}
   108  		v := (ewmaSmoothing * float64(t)) + (1-ewmaSmoothing)*float64(cs.latencyEWMA)
   109  		cs.latencyEWMA = time.Duration(v)
   110  	}
   111  }
   112  
   113  // PeerReachability updates the last reachability status.
   114  func PeerReachability(s p2p.ReachabilityStatus) RecordOp {
   115  	return func(cs *Counters) {
   116  		cs.Lock()
   117  		defer cs.Unlock()
   118  		cs.ReachabilityStatus = s
   119  	}
   120  }
   121  
   122  // PeerHealth updates the last health status of a peers.
   123  func PeerHealth(isHealty bool) RecordOp {
   124  	return func(cs *Counters) {
   125  		cs.Lock()
   126  		defer cs.Unlock()
   127  		cs.Healthy = isHealty
   128  	}
   129  }
   130  
// Snapshot represents a snapshot of a peer's metrics counters, as produced
// by Counters.snapshot for a given reference time.
type Snapshot struct {
	// LastSeenTimestamp is the Unix time, in nanoseconds, recorded by the
	// most recent login or logout (or restored from persistence).
	LastSeenTimestamp int64
	// SessionConnectionRetry is the value of the session connection retry
	// counter.
	SessionConnectionRetry uint64
	// ConnectionTotalDuration is the accumulated connection duration. When
	// the peer is logged in it also includes the ongoing session up to the
	// snapshot reference time.
	ConnectionTotalDuration time.Duration
	// SessionConnectionDuration is the duration of the ongoing session up
	// to the snapshot reference time when the peer is logged in, otherwise
	// the duration stored at the last logout.
	SessionConnectionDuration time.Duration
	// SessionConnectionDirection is the direction recorded at login.
	SessionConnectionDirection PeerConnectionDirection
	// LatencyEWMA is the exponentially weighted moving average of the
	// recorded latency samples.
	LatencyEWMA time.Duration
	// Reachability is the last recorded reachability status.
	Reachability p2p.ReachabilityStatus
	// Healthy is the last recorded health status.
	Healthy bool
}
   142  
// persistentCounters is a helper struct used for persisting selected counters.
// It is the JSON form produced by Counters.MarshalJSON and consumed by
// Counters.UnmarshalJSON and NewCollector; counters not listed here are not
// carried over by those functions.
type persistentCounters struct {
	PeerAddress       swarm.Address `json:"peerAddress"`
	LastSeenTimestamp int64         `json:"lastSeenTimestamp"`
	ConnTotalDuration time.Duration `json:"connTotalDuration"`
}
   149  
// Counters represents a collection of peer metrics
// mainly collected for statistics and debugging.
//
// The embedded mutex is taken by the RecordOps, by snapshot and by the JSON
// methods of this file whenever they read or write the fields below.
type Counters struct {
	sync.Mutex

	// Bookkeeping.
	isLoggedIn  bool          // set by PeerLogIn, cleared by PeerLogOut
	peerAddress swarm.Address // address of the peer the counters belong to

	// Counters.
	lastSeenTimestamp    int64                   // Unix time in nanoseconds of the last login/logout
	connTotalDuration    time.Duration           // sum of the durations of all logged-out sessions
	sessionConnRetry     uint64                  // incremented by IncSessionConnectionRetry
	sessionConnDuration  time.Duration           // duration of the last logged-out session
	sessionConnDirection PeerConnectionDirection // direction recorded at the last login
	latencyEWMA          time.Duration           // moving average maintained by PeerLatency
	ReachabilityStatus   p2p.ReachabilityStatus  // last value set by PeerReachability
	Healthy              bool                    // last value set by PeerHealth
}
   169  
   170  // UnmarshalJSON unmarshal just the persistent counters.
   171  func (cs *Counters) UnmarshalJSON(b []byte) (err error) {
   172  	var val persistentCounters
   173  	if err := json.Unmarshal(b, &val); err != nil {
   174  		return err
   175  	}
   176  	cs.Lock()
   177  	cs.peerAddress = val.PeerAddress
   178  	cs.lastSeenTimestamp = val.LastSeenTimestamp
   179  	cs.connTotalDuration = val.ConnTotalDuration
   180  	cs.Unlock()
   181  	return nil
   182  }
   183  
   184  // MarshalJSON marshals just the persistent counters.
   185  func (cs *Counters) MarshalJSON() ([]byte, error) {
   186  	cs.Lock()
   187  	val := persistentCounters{
   188  		PeerAddress:       cs.peerAddress,
   189  		LastSeenTimestamp: cs.lastSeenTimestamp,
   190  		ConnTotalDuration: cs.connTotalDuration,
   191  	}
   192  	cs.Unlock()
   193  	return json.Marshal(val)
   194  }
   195  
   196  // snapshot returns current snapshot of counters referenced to the given t.
   197  func (cs *Counters) snapshot(t time.Time) *Snapshot {
   198  	cs.Lock()
   199  	defer cs.Unlock()
   200  
   201  	connTotalDuration := cs.connTotalDuration
   202  	sessionConnDuration := cs.sessionConnDuration
   203  	if cs.isLoggedIn {
   204  		sessionConnDuration = t.Sub(time.Unix(0, cs.lastSeenTimestamp))
   205  		connTotalDuration += sessionConnDuration
   206  	}
   207  
   208  	return &Snapshot{
   209  		LastSeenTimestamp:          cs.lastSeenTimestamp,
   210  		SessionConnectionRetry:     cs.sessionConnRetry,
   211  		ConnectionTotalDuration:    connTotalDuration,
   212  		SessionConnectionDuration:  sessionConnDuration,
   213  		SessionConnectionDirection: cs.sessionConnDirection,
   214  		LatencyEWMA:                cs.latencyEWMA,
   215  		Reachability:               cs.ReachabilityStatus,
   216  		Healthy:                    cs.Healthy,
   217  	}
   218  }
   219  
   220  // NewCollector is a convenient constructor for creating new Collector.
   221  func NewCollector(db *shed.DB) (*Collector, error) {
   222  	const name = "kademlia-counters"
   223  
   224  	c := new(Collector)
   225  
   226  	val, err := db.NewStructField(name)
   227  	if err != nil {
   228  		return nil, fmt.Errorf("field initialization for %q failed: %w", name, err)
   229  	}
   230  	c.persistence = &val
   231  
   232  	counters := make(map[string]persistentCounters)
   233  	if err := val.Get(&counters); err != nil && !errors.Is(err, leveldb.ErrNotFound) {
   234  		return nil, err
   235  	}
   236  
   237  	for _, val := range counters {
   238  		c.counters.Store(val.PeerAddress.ByteString(), &Counters{
   239  			peerAddress:       val.PeerAddress,
   240  			lastSeenTimestamp: val.LastSeenTimestamp,
   241  			connTotalDuration: val.ConnTotalDuration,
   242  		})
   243  	}
   244  
   245  	return c, nil
   246  }
   247  
// Collector collects various metrics about
// peers specified by the swarm.Address.
type Collector struct {
	// counters maps the byte string of a peer address to its *Counters.
	counters sync.Map
	// persistence is the struct field the counters are flushed to and
	// loaded from.
	persistence *shed.StructField
}
   254  
   255  // Record records a set of metrics for peer specified by the given address.
   256  func (c *Collector) Record(addr swarm.Address, rop ...RecordOp) {
   257  	val, _ := c.counters.LoadOrStore(addr.ByteString(), &Counters{peerAddress: addr})
   258  	for _, op := range rop {
   259  		op(val.(*Counters))
   260  	}
   261  }
   262  
   263  // Snapshot returns the current state of the metrics collector for peer(s).
   264  // The given time t is used to calculate the duration of the current session,
   265  // if any. If an address or a set of addresses is specified then only metrics
   266  // related to them will be returned, otherwise metrics for all peers will be
   267  // returned. If the peer is still logged in, the session-related counters will
   268  // be evaluated against the last seen time, which equals to the login time. If
   269  // the peer is logged out, then the session counters will reflect its last
   270  // session.
   271  func (c *Collector) Snapshot(t time.Time, addresses ...swarm.Address) map[string]*Snapshot {
   272  	snapshot := make(map[string]*Snapshot)
   273  
   274  	for _, addr := range addresses {
   275  		val, ok := c.counters.Load(addr.ByteString())
   276  		if !ok {
   277  			continue
   278  		}
   279  		cs := val.(*Counters)
   280  		snapshot[addr.ByteString()] = cs.snapshot(t)
   281  	}
   282  
   283  	if len(addresses) == 0 {
   284  		c.counters.Range(func(key, val interface{}) bool {
   285  			cs := val.(*Counters)
   286  			snapshot[cs.peerAddress.ByteString()] = cs.snapshot(t)
   287  			return true
   288  		})
   289  	}
   290  
   291  	return snapshot
   292  }
   293  
   294  // IsUnreachable returns true if the peer is unreachable.
   295  func (c *Collector) IsUnreachable(addr swarm.Address) bool {
   296  	val, ok := c.counters.Load(addr.ByteString())
   297  	if !ok {
   298  		return true
   299  	}
   300  	cs := val.(*Counters)
   301  
   302  	cs.Lock()
   303  	defer cs.Unlock()
   304  
   305  	return cs.ReachabilityStatus != p2p.ReachabilityStatusPublic
   306  }
   307  
// ExcludeOp is a function type used to filter peers on certain fields.
// It returns true when the peer with the given counters should be excluded.
// Collector.Exclude calls it while holding the Counters lock, so an
// ExcludeOp must read the fields directly and must not lock the Counters.
type ExcludeOp func(*Counters) bool
   310  
   311  // Reachable is used to filter reachable or unreachable peers based on r.
   312  func Reachability(filterReachable bool) ExcludeOp {
   313  	return func(cs *Counters) bool {
   314  		reachble := cs.ReachabilityStatus == p2p.ReachabilityStatusPublic
   315  		if filterReachable {
   316  			return reachble
   317  		}
   318  		return !reachble
   319  	}
   320  }
   321  
   322  // Unreachable is used to filter unhealthy peers.
   323  func Health(filterHealthy bool) ExcludeOp {
   324  	return func(cs *Counters) bool {
   325  		if filterHealthy {
   326  			return cs.Healthy
   327  		}
   328  		return !cs.Healthy
   329  	}
   330  }
   331  
   332  // Exclude returns false if the addr passes all exclusion operations.
   333  func (c *Collector) Exclude(addr swarm.Address, fop ...ExcludeOp) bool {
   334  	val, ok := c.counters.Load(addr.ByteString())
   335  	if !ok {
   336  		return true
   337  	}
   338  	cs := val.(*Counters)
   339  	cs.Lock()
   340  	defer cs.Unlock()
   341  
   342  	for _, f := range fop {
   343  		if f(cs) {
   344  			return true
   345  		}
   346  	}
   347  
   348  	return false
   349  }
   350  
   351  // Inspect allows inspecting current snapshot for the given
   352  // peer address by executing the inspection function.
   353  func (c *Collector) Inspect(addr swarm.Address) *Snapshot {
   354  	snapshots := c.Snapshot(time.Now(), addr)
   355  	return snapshots[addr.ByteString()]
   356  }
   357  
   358  // Flush sync the dirty in memory counters for all peers by flushing their
   359  // values to the underlying storage.
   360  func (c *Collector) Flush() error {
   361  	counters := make(map[string]interface{})
   362  	c.counters.Range(func(key, val interface{}) bool {
   363  		cs := val.(*Counters)
   364  		counters[cs.peerAddress.ByteString()] = val
   365  		return true
   366  	})
   367  
   368  	if err := c.persistence.Put(counters); err != nil {
   369  		return fmt.Errorf("unable to persist counters: %w", err)
   370  	}
   371  	return nil
   372  }
   373  
   374  // Finalize tries to log out all ongoing peer sessions.
   375  func (c *Collector) Finalize(t time.Time, remove bool) error {
   376  	c.counters.Range(func(_, val interface{}) bool {
   377  		cs := val.(*Counters)
   378  		PeerLogOut(t)(cs)
   379  		return true
   380  	})
   381  
   382  	if err := c.Flush(); err != nil {
   383  		return err
   384  	}
   385  
   386  	if remove {
   387  		c.counters.Range(func(_, val interface{}) bool {
   388  			cs := val.(*Counters)
   389  			c.counters.Delete(cs.peerAddress.ByteString())
   390  			return true
   391  		})
   392  	}
   393  
   394  	return nil
   395  }