github.com/celestiaorg/celestia-node@v0.15.0-beta.1/share/p2p/peers/metrics.go (about)

     1  package peers
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	pubsub "github.com/libp2p/go-libp2p-pubsub"
    10  	"github.com/libp2p/go-libp2p/core/peer"
    11  	"go.opentelemetry.io/otel"
    12  	"go.opentelemetry.io/otel/attribute"
    13  	"go.opentelemetry.io/otel/metric"
    14  
    15  	"github.com/celestiaorg/celestia-node/libs/utils"
    16  	"github.com/celestiaorg/celestia-node/share/p2p/shrexsub"
    17  )
    18  
const (
	// Attribute keys used on the getPeer and doneResult counters.
	isInstantKey  = "is_instant"
	doneResultKey = "done_result"

	// sourceKey is the attribute key naming where a peer came from; the
	// peerSource constants are its values.
	sourceKey                  = "source"
	sourceShrexSub  peerSource = "shrexsub"
	sourceFullNodes peerSource = "full_nodes"

	// blacklistPeerReasonKey is the attribute key on the blacklisted peers
	// gauge; the blacklistPeerReason constants are its values.
	blacklistPeerReasonKey                     = "blacklist_reason"
	reasonInvalidHash      blacklistPeerReason = "invalid_hash"
	reasonMisbehave        blacklistPeerReason = "misbehave"

	// validationResultKey is the attribute key on the validation result
	// counter; validationObserver maps pubsub validation results onto the
	// three values below.
	validationResultKey = "validation_result"
	validationAccept    = "accept"
	validationReject    = "reject"
	validationIgnore    = "ignore"

	// peerStatusKey is the attribute key on the full nodes pool gauge; the
	// peerStatus constants are its values.
	peerStatusKey                 = "peer_status"
	peerStatusActive   peerStatus = "active"
	peerStatusCooldown peerStatus = "cooldown"

	// poolStatusKey is the attribute key on the shrex pools gauge; the
	// poolStatus constants are its values.
	poolStatusKey                    = "pool_status"
	poolStatusCreated     poolStatus = "created"
	poolStatusValidated   poolStatus = "validated"
	poolStatusBlacklisted poolStatus = "blacklisted"
	// Pool status model:
	//
	//	      created (unvalidated)
	//	       /               \
	//	validated           blacklisted
)
    49  
// meter is the OpenTelemetry meter used to create every instrument in this file.
var meter = otel.Meter("shrex_peer_manager")

// blacklistPeerReason is the value type of the blacklist_reason attribute.
type blacklistPeerReason string

// peerStatus is the value type of the peer_status attribute.
type peerStatus string

// poolStatus is the value type of the pool_status attribute.
type poolStatus string

// peerSource is the value type of the source attribute.
type peerSource string
    59  
// metrics groups the instruments reported by the peer manager.
//
// Methods on *metrics check for a nil receiver first, so a nil *metrics can be
// used when metrics are disabled: the observe* methods return immediately and
// validationObserver returns the validator unchanged.
type metrics struct {
	getPeer                  metric.Int64Counter   // attributes: source, is_instant
	getPeerWaitTimeHistogram metric.Int64Histogram // attributes: source
	getPeerPoolSizeHistogram metric.Int64Histogram // attributes: source
	doneResult               metric.Int64Counter   // attributes: source, done_result
	validationResult         metric.Int64Counter   // attributes: validation_result

	shrexPools    metric.Int64ObservableGauge // attributes: pool_status
	fullNodesPool metric.Int64ObservableGauge // attributes: peer_status
	// blacklistedPeersByReason maps a blacklistPeerReason key to an int
	// amount; it is written by observeBlacklistPeers and read by the callback
	// registered in initMetrics.
	blacklistedPeersByReason sync.Map
	blacklistedPeers         metric.Int64ObservableGauge // attributes: blacklist_reason
}
    72  
    73  func initMetrics(manager *Manager) (*metrics, error) {
    74  	getPeer, err := meter.Int64Counter("peer_manager_get_peer_counter",
    75  		metric.WithDescription("get peer counter"))
    76  	if err != nil {
    77  		return nil, err
    78  	}
    79  
    80  	getPeerWaitTimeHistogram, err := meter.Int64Histogram("peer_manager_get_peer_ms_time_hist",
    81  		metric.WithDescription("get peer time histogram(ms), observed only for async get(is_instant = false)"))
    82  	if err != nil {
    83  		return nil, err
    84  	}
    85  
    86  	getPeerPoolSizeHistogram, err := meter.Int64Histogram("peer_manager_get_peer_pool_size_hist",
    87  		metric.WithDescription("amount of available active peers in pool at time when get was called"))
    88  	if err != nil {
    89  		return nil, err
    90  	}
    91  
    92  	doneResult, err := meter.Int64Counter("peer_manager_done_result_counter",
    93  		metric.WithDescription("done results counter"))
    94  	if err != nil {
    95  		return nil, err
    96  	}
    97  
    98  	validationResult, err := meter.Int64Counter("peer_manager_validation_result_counter",
    99  		metric.WithDescription("validation result counter"))
   100  	if err != nil {
   101  		return nil, err
   102  	}
   103  
   104  	shrexPools, err := meter.Int64ObservableGauge("peer_manager_pools_gauge",
   105  		metric.WithDescription("pools amount"))
   106  	if err != nil {
   107  		return nil, err
   108  	}
   109  
   110  	fullNodesPool, err := meter.Int64ObservableGauge("peer_manager_full_nodes_gauge",
   111  		metric.WithDescription("full nodes pool peers amount"))
   112  	if err != nil {
   113  		return nil, err
   114  	}
   115  
   116  	blacklisted, err := meter.Int64ObservableGauge("peer_manager_blacklisted_peers",
   117  		metric.WithDescription("blacklisted peers amount"))
   118  	if err != nil {
   119  		return nil, err
   120  	}
   121  
   122  	metrics := &metrics{
   123  		getPeer:                  getPeer,
   124  		getPeerWaitTimeHistogram: getPeerWaitTimeHistogram,
   125  		doneResult:               doneResult,
   126  		validationResult:         validationResult,
   127  		shrexPools:               shrexPools,
   128  		fullNodesPool:            fullNodesPool,
   129  		getPeerPoolSizeHistogram: getPeerPoolSizeHistogram,
   130  		blacklistedPeers:         blacklisted,
   131  	}
   132  
   133  	callback := func(ctx context.Context, observer metric.Observer) error {
   134  		for poolStatus, count := range manager.shrexPools() {
   135  			observer.ObserveInt64(shrexPools, count,
   136  				metric.WithAttributes(
   137  					attribute.String(poolStatusKey, string(poolStatus))))
   138  		}
   139  
   140  		observer.ObserveInt64(fullNodesPool, int64(manager.fullNodes.len()),
   141  			metric.WithAttributes(
   142  				attribute.String(peerStatusKey, string(peerStatusActive))))
   143  		observer.ObserveInt64(fullNodesPool, int64(manager.fullNodes.cooldown.len()),
   144  			metric.WithAttributes(
   145  				attribute.String(peerStatusKey, string(peerStatusCooldown))))
   146  
   147  		metrics.blacklistedPeersByReason.Range(func(key, value any) bool {
   148  			reason := key.(blacklistPeerReason)
   149  			amount := value.(int)
   150  			observer.ObserveInt64(blacklisted, int64(amount),
   151  				metric.WithAttributes(
   152  					attribute.String(blacklistPeerReasonKey, string(reason))))
   153  			return true
   154  		})
   155  		return nil
   156  	}
   157  	_, err = meter.RegisterCallback(callback, shrexPools, fullNodesPool, blacklisted)
   158  	if err != nil {
   159  		return nil, fmt.Errorf("registering metrics callback: %w", err)
   160  	}
   161  	return metrics, nil
   162  }
   163  
   164  func (m *metrics) observeGetPeer(
   165  	ctx context.Context,
   166  	source peerSource, poolSize int, waitTime time.Duration,
   167  ) {
   168  	if m == nil {
   169  		return
   170  	}
   171  	ctx = utils.ResetContextOnError(ctx)
   172  	m.getPeer.Add(ctx, 1,
   173  		metric.WithAttributes(
   174  			attribute.String(sourceKey, string(source)),
   175  			attribute.Bool(isInstantKey, waitTime == 0)))
   176  	if source == sourceShrexSub {
   177  		m.getPeerPoolSizeHistogram.Record(ctx, int64(poolSize),
   178  			metric.WithAttributes(
   179  				attribute.String(sourceKey, string(source))))
   180  	}
   181  
   182  	// record wait time only for async gets
   183  	if waitTime > 0 {
   184  		m.getPeerWaitTimeHistogram.Record(ctx, waitTime.Milliseconds(),
   185  			metric.WithAttributes(
   186  				attribute.String(sourceKey, string(source))))
   187  	}
   188  }
   189  
   190  func (m *metrics) observeDoneResult(source peerSource, result result) {
   191  	if m == nil {
   192  		return
   193  	}
   194  
   195  	ctx := context.Background()
   196  	m.doneResult.Add(ctx, 1,
   197  		metric.WithAttributes(
   198  			attribute.String(sourceKey, string(source)),
   199  			attribute.String(doneResultKey, string(result))))
   200  }
   201  
   202  // validationObserver is a middleware that observes validation results as metrics
   203  func (m *metrics) validationObserver(validator shrexsub.ValidatorFn) shrexsub.ValidatorFn {
   204  	if m == nil {
   205  		return validator
   206  	}
   207  	return func(ctx context.Context, id peer.ID, n shrexsub.Notification) pubsub.ValidationResult {
   208  		res := validator(ctx, id, n)
   209  
   210  		var resStr string
   211  		switch res {
   212  		case pubsub.ValidationAccept:
   213  			resStr = validationAccept
   214  		case pubsub.ValidationReject:
   215  			resStr = validationReject
   216  		case pubsub.ValidationIgnore:
   217  			resStr = validationIgnore
   218  		default:
   219  			resStr = "unknown"
   220  		}
   221  
   222  		ctx = utils.ResetContextOnError(ctx)
   223  
   224  		m.validationResult.Add(ctx, 1,
   225  			metric.WithAttributes(
   226  				attribute.String(validationResultKey, resStr)))
   227  		return res
   228  	}
   229  }
   230  
   231  // observeBlacklistPeers stores amount of blacklisted peers by reason
   232  func (m *metrics) observeBlacklistPeers(reason blacklistPeerReason, amount int) {
   233  	if m == nil {
   234  		return
   235  	}
   236  	for {
   237  		prevVal, loaded := m.blacklistedPeersByReason.LoadOrStore(reason, amount)
   238  		if !loaded {
   239  			return
   240  		}
   241  
   242  		newVal := prevVal.(int) + amount
   243  		if m.blacklistedPeersByReason.CompareAndSwap(reason, prevVal, newVal) {
   244  			return
   245  		}
   246  	}
   247  }
   248  
   249  // shrexPools collects amount of shrex pools by poolStatus
   250  func (m *Manager) shrexPools() map[poolStatus]int64 {
   251  	m.lock.Lock()
   252  	defer m.lock.Unlock()
   253  
   254  	shrexPools := make(map[poolStatus]int64)
   255  	for _, p := range m.pools {
   256  		if !p.isValidatedDataHash.Load() {
   257  			shrexPools[poolStatusCreated]++
   258  			continue
   259  		}
   260  
   261  		// pool is validated but not synced
   262  		shrexPools[poolStatusValidated]++
   263  	}
   264  
   265  	shrexPools[poolStatusBlacklisted] = int64(len(m.blacklistedHashes))
   266  	return shrexPools
   267  }