github.com/cilium/cilium@v1.16.2/pkg/bgpv1/metrics/metrics.go (about)

     1  // SPDX-License-Identifier: Apache-2.0
     2  // Copyright Authors of Cilium
     3  
     4  package metrics
     5  
     6  import (
     7  	"context"
     8  	"net/netip"
     9  	"strconv"
    10  
    11  	"github.com/cilium/hive/cell"
    12  	"github.com/prometheus/client_golang/prometheus"
    13  	"github.com/sirupsen/logrus"
    14  
    15  	"github.com/cilium/cilium/pkg/bgpv1/agent"
    16  	"github.com/cilium/cilium/pkg/bgpv1/types"
    17  	"github.com/cilium/cilium/pkg/metrics"
    18  	"github.com/cilium/cilium/pkg/option"
    19  	"github.com/cilium/cilium/pkg/time"
    20  )
    21  
    22  const (
    23  	LabelVRouter  = "vrouter"
    24  	LabelNeighbor = "neighbor"
    25  	LabelAfi      = "afi"
    26  	LabelSafi     = "safi"
    27  
    28  	metricsSubsystem = "bgp_control_plane"
    29  )
    30  
    31  type collector struct {
    32  	SessionState          *prometheus.Desc
    33  	TotalAdvertisedRoutes *prometheus.Desc
    34  	TotalReceivedRoutes   *prometheus.Desc
    35  
    36  	in collectorIn
    37  }
    38  
    39  type collectorIn struct {
    40  	cell.In
    41  
    42  	Logger        logrus.FieldLogger
    43  	DaemonConfig  *option.DaemonConfig
    44  	Registry      *metrics.Registry
    45  	RouterManager agent.BGPRouterManager
    46  }
    47  
    48  // RegisterCollector registers the BGP Control Plane metrics collector to the
    49  // global prometheus registry. We don't rely on the metrics.Metric because the
    50  // collectors we can provide through metrics.Metric needs to implement
    51  // prometheus.Collector per metric which is not optimal in our case. We can
    52  // retrieve the multiple metrics from the single call to
    53  // RouterManager.GetPeers() and it is wasteful to call the same function
    54  // multiple times for each metric. Thus, we provide a raw Collector through
    55  // MustRegister interface. We may want to revisit this in the future.
    56  func RegisterCollector(in collectorIn) {
    57  	// Don't provide the collector if BGP control plane is disabled
    58  	if !in.DaemonConfig.EnableBGPControlPlane {
    59  		return
    60  	}
    61  	in.Registry.MustRegister(&collector{
    62  		SessionState: prometheus.NewDesc(
    63  			prometheus.BuildFQName(metrics.Namespace, metricsSubsystem, "session_state"),
    64  			"Current state of the BGP session with the peer, Up = 1 or Down = 0",
    65  			[]string{LabelVRouter, LabelNeighbor}, nil,
    66  		),
    67  		TotalAdvertisedRoutes: prometheus.NewDesc(
    68  			prometheus.BuildFQName(metrics.Namespace, metricsSubsystem, "advertised_routes"),
    69  			"Number of routes advertised to the peer",
    70  			[]string{LabelVRouter, LabelNeighbor, LabelAfi, LabelSafi}, nil,
    71  		),
    72  		TotalReceivedRoutes: prometheus.NewDesc(
    73  			prometheus.BuildFQName(metrics.Namespace, metricsSubsystem, "received_routes"),
    74  			"Number of routes received from the peer",
    75  			[]string{LabelVRouter, LabelNeighbor, LabelAfi, LabelSafi}, nil,
    76  		),
    77  		in: in,
    78  	})
    79  }
    80  
    81  func (c *collector) Describe(ch chan<- *prometheus.Desc) {
    82  	ch <- c.SessionState
    83  	ch <- c.TotalAdvertisedRoutes
    84  	ch <- c.TotalReceivedRoutes
    85  }
    86  
    87  func (c *collector) Collect(ch chan<- prometheus.Metric) {
    88  	// We defensively set a 5 sec timeout here. When the underlying router
    89  	// is not responsive, we cannot make a progress. 5 sec is chosen to be
    90  	// a too long time that we should never hit for normal cases. We should
    91  	// revisit this timeout when the metrics collection starts to involve a
    92  	// network communication.
    93  	ctx, cancel := context.WithTimeout(context.Background(), time.Second*5)
    94  	peers, err := c.in.RouterManager.GetPeers(ctx)
    95  	cancel()
    96  	if err != nil {
    97  		c.in.Logger.WithError(err).Error("Failed to retrieve BGP peer information. Metrics is not collected.")
    98  		return
    99  	}
   100  
   101  	for _, peer := range peers {
   102  		if peer == nil {
   103  			continue
   104  		}
   105  
   106  		vrouterLabel := strconv.FormatInt(peer.LocalAsn, 10)
   107  
   108  		addr, err := netip.ParseAddr(peer.PeerAddress)
   109  		if err != nil {
   110  			continue
   111  		}
   112  
   113  		neighborLabel := netip.AddrPortFrom(addr, uint16(peer.PeerPort)).String()
   114  
   115  		// Collect session state metrics
   116  		var up float64
   117  		if peer.SessionState == types.SessionEstablished.String() {
   118  			up = 1
   119  		} else {
   120  			up = 0
   121  		}
   122  		ch <- prometheus.MustNewConstMetric(
   123  			c.SessionState,
   124  			prometheus.GaugeValue,
   125  			up,
   126  			vrouterLabel,
   127  			neighborLabel,
   128  		)
   129  
   130  		// Collect route metrics per address family
   131  		for _, family := range peer.Families {
   132  			if family == nil {
   133  				continue
   134  			}
   135  			ch <- prometheus.MustNewConstMetric(
   136  				c.TotalAdvertisedRoutes,
   137  				prometheus.GaugeValue,
   138  				float64(family.Advertised),
   139  				vrouterLabel,
   140  				neighborLabel,
   141  				family.Afi,
   142  				family.Safi,
   143  			)
   144  			ch <- prometheus.MustNewConstMetric(
   145  				c.TotalReceivedRoutes,
   146  				prometheus.GaugeValue,
   147  				float64(family.Received),
   148  				vrouterLabel,
   149  				neighborLabel,
   150  				family.Afi,
   151  				family.Safi,
   152  			)
   153  		}
   154  	}
   155  }