go.ligato.io/vpp-agent/v3@v3.5.0/plugins/telemetry/prometheus.go (about)

     1  //  Copyright (c) 2019 Cisco and/or its affiliates.
     2  //
     3  //  Licensed under the Apache License, Version 2.0 (the "License");
     4  //  you may not use this file except in compliance with the License.
     5  //  You may obtain a copy of the License at:
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  //  Unless required by applicable law or agreed to in writing, software
    10  //  distributed under the License is distributed on an "AS IS" BASIS,
    11  //  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  //  See the License for the specific language governing permissions and
    13  //  limitations under the License.
    14  
    15  package telemetry
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"strconv"
    21  
    22  	"github.com/prometheus/client_golang/prometheus"
    23  )
    24  
const (
	// registryPath is the registry path passed to the Prometheus plugin when
	// registering every gauge vector created in this file.
	registryPath = "/metrics"

	// vppMetricsNamespace is the Prometheus namespace set on every gauge
	// vector created in this file.
	vppMetricsNamespace = "vpp"

	// agentLabel is the name of the constant label whose value is the agent
	// label reported by the service label plugin.
	agentLabel = "agent"
)
    34  
// Runtime metrics: subsystem name, variable label names and metric names of
// the gauges fed from the handler's runtime info.
const (
	// runtimeMetricsNamespace is the Prometheus subsystem of the runtime
	// gauges and also the key looked up in p.skipped to disable the group.
	runtimeMetricsNamespace = "runtime"

	// Variable label names attached to every runtime gauge.
	runtimeThreadLabel   = "thread"
	runtimeThreadIDLabel = "threadID"
	runtimeItemLabel     = "item"

	// Metric names; one gauge vector is created per name.
	runtimeCallsMetric          = "calls"
	runtimeVectorsMetric        = "vectors"
	runtimeSuspendsMetric       = "suspends"
	runtimeClocksMetric         = "clocks"
	runtimeVectorsPerCallMetric = "vectors_per_call"
)
    49  
// Memory metrics: subsystem name, variable label names and metric names of
// the gauges fed from the handler's per-thread memory info.
const (
	// memoryMetricsNamespace is the Prometheus subsystem of the memory gauges
	// and also the key looked up in p.skipped to disable the group.
	memoryMetricsNamespace = "memory"

	// Variable label names attached to every memory gauge.
	memoryThreadLabel   = "thread"
	memoryThreadIDLabel = "threadID"

	// Metric names; one gauge vector is created per name.
	// NOTE(review): memoryObjectsMetric is registered by registerPrometheus
	// but updatePrometheus never assigns it a value.
	memoryObjectsMetric         = "objects"
	memoryUsedMetric            = "used"
	memoryTotalMetric           = "total"
	memoryFreeMetric            = "free"
	memoryTrimmableMetric       = "trimmable"
	memoryFreeChunksMetric      = "free_chunks"
	memoryFreeFastbinBlksMetric = "free_fastbin_blks"
	memoryMaxTotalAlloc         = "max_total_allocated"
	memorySizeMetric            = "size"
	memoryPagesMetric           = "pages"
)
    68  
// Buffers metrics: subsystem name, variable label names and metric names of
// the gauges fed from the handler's buffers info.
const (
	// buffersMetricsNamespace is the Prometheus subsystem of the buffers
	// gauges and also the key looked up in p.skipped to disable the group.
	buffersMetricsNamespace = "buffers"

	// Variable label names attached to every buffers gauge.
	buffersThreadIDLabel = "threadID"
	buffersItemLabel     = "item"
	buffersIndexLabel    = "index"

	// Metric names; one gauge vector is created per name.
	buffersSizeMetric     = "size"
	buffersAllocMetric    = "alloc"
	buffersFreeMetric     = "free"
	buffersNumAllocMetric = "num_alloc"
	buffersNumFreeMetric  = "num_free"
)
    83  
// Node metrics: subsystem name, variable label names and metric name of the
// gauges fed from the handler's node counters.
const (
	// nodeMetricsNamespace is the Prometheus subsystem of the node counter
	// gauges and also the key looked up in p.skipped to disable the group.
	nodeMetricsNamespace = "nodes"

	// Variable label names: the item label receives the counter's node, the
	// reason label receives the counter's name.
	nodeCounterItemLabel   = "item"
	nodeCounterReasonLabel = "reason"

	// Name of the single node counter metric.
	nodeCounterCounterMetric = "counter"
)
    93  
// Interface metrics: subsystem name, variable label names and metric names of
// the gauges fed from the handler's interface stats.
const (
	// ifMetricsNamespace is the Prometheus subsystem of the interface gauges
	// and also the key looked up in p.skipped to disable the group.
	ifMetricsNamespace = "interfaces"

	// Variable label names attached to every interface gauge.
	ifCounterNameLabel  = "name"
	ifCounterIndexLabel = "index"

	// Metric names; one gauge vector is created per name.
	ifCounterRxPackets = "rx_packets"
	ifCounterRxBytes   = "rx_bytes"
	ifCounterRxErrors  = "rx_errors"
	ifCounterTxPackets = "tx_packets"
	ifCounterTxBytes   = "tx_bytes"
	ifCounterTxErrors  = "tx_errors"
	ifCounterDrops     = "drops"
	ifCounterPunts     = "punts"
	ifCounterIP4       = "ip4"
	ifCounterIP6       = "ip6"
	ifCounterRxNoBuf   = "rx_no_buf"
	ifCounterRxMiss    = "rx_miss"
)
   114  
// prometheusMetrics groups the Prometheus state of the telemetry plugin. For
// each metric group it holds the gauge vectors keyed by metric name and a
// cache of the gauges already resolved for the entities reported so far.
// NOTE(review): presumably embedded in Plugin, since registerPrometheus and
// updatePrometheus reach these fields through p — confirm.
type prometheusMetrics struct {
	// Runtime group.
	runtimeGaugeVecs map[string]*prometheus.GaugeVec
	runtimeStats     map[string]*runtimeStats

	// Memory group.
	memoryGaugeVecs map[string]*prometheus.GaugeVec
	memoryStats     map[string]*memoryStats

	// Buffers group.
	buffersGaugeVecs map[string]*prometheus.GaugeVec
	buffersStats     map[string]*buffersStats

	// Node counters group.
	nodeCounterGaugeVecs map[string]*prometheus.GaugeVec
	nodeCounterStats     map[string]*nodeCounterStats

	// Interface counters group.
	ifCounterGaugeVecs map[string]*prometheus.GaugeVec
	ifCounterStats     map[string]*ifCounterStats
}
   131  
// runtimeStats caches the gauges of one runtime item together with the
// thread and item identity it was first reported with.
type runtimeStats struct {
	threadName string
	threadID   uint
	itemName   string
	// metrics maps a runtime metric name to the gauge labelled for this item.
	metrics map[string]prometheus.Gauge
}
   138  
// memoryStats caches the gauges of one thread's memory counters together
// with the thread identity it was first reported with.
type memoryStats struct {
	threadName string
	threadID   uint
	// metrics maps a memory metric name to the gauge labelled for this thread.
	metrics map[string]prometheus.Gauge
}
   144  
// buffersStats caches the gauges of one buffers item together with the
// thread ID, name and index it was first reported with.
type buffersStats struct {
	threadID  uint
	itemName  string
	itemIndex uint
	// metrics maps a buffers metric name to the gauge labelled for this item.
	metrics map[string]prometheus.Gauge
}
   151  
// nodeCounterStats caches the gauges of one node counter.
type nodeCounterStats struct {
	// itemName is the counter's name, i.e. the value exported under the
	// reason label (the item label carries the counter's node instead).
	itemName string
	// metrics maps a node metric name to the gauge labelled for this counter.
	metrics map[string]prometheus.Gauge
}
   156  
// ifCounterStats caches the gauges of one interface.
type ifCounterStats struct {
	// name is the interface name the gauges were created for.
	name string
	// metrics maps an interface metric name to the gauge labelled for this
	// interface.
	metrics map[string]prometheus.Gauge
}
   161  
   162  func (p *Plugin) registerPrometheus() error {
   163  	p.Log.Debugf("registering prometheus registry path: %v", registryPath)
   164  
   165  	// Runtime metrics
   166  	p.runtimeGaugeVecs = make(map[string]*prometheus.GaugeVec)
   167  	p.runtimeStats = make(map[string]*runtimeStats)
   168  
   169  	for _, metric := range [][2]string{
   170  		{runtimeCallsMetric, "Number of calls"},
   171  		{runtimeVectorsMetric, "Number of vectors"},
   172  		{runtimeSuspendsMetric, "Number of suspends"},
   173  		{runtimeClocksMetric, "Number of clocks"},
   174  		{runtimeVectorsPerCallMetric, "Number of vectors per call"},
   175  	} {
   176  		name := metric[0]
   177  		p.runtimeGaugeVecs[name] = prometheus.NewGaugeVec(prometheus.GaugeOpts{
   178  			Namespace: vppMetricsNamespace,
   179  			Subsystem: runtimeMetricsNamespace,
   180  			Name:      name,
   181  			Help:      metric[1],
   182  			ConstLabels: prometheus.Labels{
   183  				agentLabel: p.ServiceLabel.GetAgentLabel(),
   184  			},
   185  		}, []string{runtimeItemLabel, runtimeThreadLabel, runtimeThreadIDLabel})
   186  	}
   187  
   188  	// register created vectors to prometheus
   189  	for name, metric := range p.runtimeGaugeVecs {
   190  		if err := p.Prometheus.Register(registryPath, metric); err != nil {
   191  			p.Log.Errorf("failed to register %v metric: %v", name, err)
   192  			return err
   193  		}
   194  	}
   195  
   196  	// Memory metrics
   197  	p.memoryGaugeVecs = make(map[string]*prometheus.GaugeVec)
   198  	p.memoryStats = make(map[string]*memoryStats)
   199  
   200  	for _, metric := range [][2]string{
   201  		{memoryObjectsMetric, "Number of objects"},
   202  		{memoryUsedMetric, "Used memory"},
   203  		{memoryTotalMetric, "Total memory"},
   204  		{memoryFreeMetric, "Free memory"},
   205  		{memorySizeMetric, "Size"},
   206  		{memoryPagesMetric, "Pages"},
   207  		{memoryTrimmableMetric, "Trimmable"},
   208  		{memoryFreeChunksMetric, "Free Chunks"},
   209  		{memoryFreeFastbinBlksMetric, "Free Fastbin Bulks"},
   210  		{memoryMaxTotalAlloc, "Max Total Allocations"},
   211  	} {
   212  		name := metric[0]
   213  		p.memoryGaugeVecs[name] = prometheus.NewGaugeVec(prometheus.GaugeOpts{
   214  			Namespace: vppMetricsNamespace,
   215  			Subsystem: memoryMetricsNamespace,
   216  			Name:      name,
   217  			Help:      metric[1],
   218  			ConstLabels: prometheus.Labels{
   219  				agentLabel: p.ServiceLabel.GetAgentLabel(),
   220  			},
   221  		}, []string{memoryThreadLabel, memoryThreadIDLabel})
   222  
   223  	}
   224  
   225  	// register created vectors to prometheus
   226  	for name, metric := range p.memoryGaugeVecs {
   227  		if err := p.Prometheus.Register(registryPath, metric); err != nil {
   228  			p.Log.Errorf("failed to register %v metric: %v", name, err)
   229  			return err
   230  		}
   231  	}
   232  
   233  	// Buffers metrics
   234  	p.buffersGaugeVecs = make(map[string]*prometheus.GaugeVec)
   235  	p.buffersStats = make(map[string]*buffersStats)
   236  
   237  	for _, metric := range [][2]string{
   238  		{buffersSizeMetric, "Size of buffer"},
   239  		{buffersAllocMetric, "Allocated"},
   240  		{buffersFreeMetric, "Free"},
   241  		{buffersNumAllocMetric, "Number of allocated"},
   242  		{buffersNumFreeMetric, "Number of free"},
   243  	} {
   244  		name := metric[0]
   245  		p.buffersGaugeVecs[name] = prometheus.NewGaugeVec(prometheus.GaugeOpts{
   246  			Namespace: vppMetricsNamespace,
   247  			Subsystem: buffersMetricsNamespace,
   248  			Name:      name,
   249  			Help:      metric[1],
   250  			ConstLabels: prometheus.Labels{
   251  				agentLabel: p.ServiceLabel.GetAgentLabel(),
   252  			},
   253  		}, []string{buffersThreadIDLabel, buffersItemLabel, buffersIndexLabel})
   254  
   255  	}
   256  
   257  	// register created vectors to prometheus
   258  	for name, metric := range p.buffersGaugeVecs {
   259  		if err := p.Prometheus.Register(registryPath, metric); err != nil {
   260  			p.Log.Errorf("failed to register %v metric: %v", name, err)
   261  			return err
   262  		}
   263  	}
   264  
   265  	// Node counters metrics
   266  	p.nodeCounterGaugeVecs = make(map[string]*prometheus.GaugeVec)
   267  	p.nodeCounterStats = make(map[string]*nodeCounterStats)
   268  
   269  	for _, metric := range [][2]string{
   270  		{nodeCounterCounterMetric, "Counter"},
   271  	} {
   272  		name := metric[0]
   273  		p.nodeCounterGaugeVecs[name] = prometheus.NewGaugeVec(prometheus.GaugeOpts{
   274  			Namespace: vppMetricsNamespace,
   275  			Subsystem: nodeMetricsNamespace,
   276  			Name:      name,
   277  			Help:      metric[1],
   278  			ConstLabels: prometheus.Labels{
   279  				agentLabel: p.ServiceLabel.GetAgentLabel(),
   280  			},
   281  		}, []string{nodeCounterItemLabel, nodeCounterReasonLabel})
   282  
   283  	}
   284  
   285  	// register created vectors to prometheus
   286  	for name, metric := range p.nodeCounterGaugeVecs {
   287  		if err := p.Prometheus.Register(registryPath, metric); err != nil {
   288  			p.Log.Errorf("failed to register %v metric: %v", name, err)
   289  			return err
   290  		}
   291  	}
   292  
   293  	// Interface counter metrics
   294  	p.ifCounterGaugeVecs = make(map[string]*prometheus.GaugeVec)
   295  	p.ifCounterStats = make(map[string]*ifCounterStats)
   296  
   297  	for _, metric := range [][2]string{
   298  		{ifCounterRxPackets, "RX packets"},
   299  		{ifCounterRxBytes, "RX bytes"},
   300  		{ifCounterRxErrors, "RX errors"},
   301  		{ifCounterTxPackets, "TX packets"},
   302  		{ifCounterTxBytes, "TX bytes"},
   303  		{ifCounterTxErrors, "TX errors"},
   304  		{ifCounterDrops, "Drops"},
   305  		{ifCounterPunts, "Punts"},
   306  		{ifCounterIP4, "IP4"},
   307  		{ifCounterIP6, "IP6"},
   308  		{ifCounterRxNoBuf, "RX nobuf"},
   309  		{ifCounterRxMiss, "RX miss"},
   310  	} {
   311  		name := metric[0]
   312  		p.ifCounterGaugeVecs[name] = prometheus.NewGaugeVec(prometheus.GaugeOpts{
   313  			Namespace: vppMetricsNamespace,
   314  			Subsystem: ifMetricsNamespace,
   315  			Name:      name,
   316  			Help:      metric[1],
   317  			ConstLabels: prometheus.Labels{
   318  				agentLabel: p.ServiceLabel.GetAgentLabel(),
   319  			},
   320  		}, []string{ifCounterNameLabel, ifCounterIndexLabel})
   321  
   322  	}
   323  
   324  	// register created vectors to prometheus
   325  	for name, metric := range p.ifCounterGaugeVecs {
   326  		if err := p.Prometheus.Register(registryPath, metric); err != nil {
   327  			p.Log.Errorf("failed to register %v metric: %v", name, err)
   328  			return err
   329  		}
   330  	}
   331  
   332  	return nil
   333  }
   334  
   335  func (p *Plugin) updatePrometheus(ctx context.Context) {
   336  	p.tracef("running update")
   337  
   338  	if !p.skipped[runtimeMetricsNamespace] {
   339  		// Update runtime
   340  		runtimeInfo, err := p.handler.GetRuntimeInfo(ctx)
   341  		if err != nil {
   342  			p.Log.Errorf("GetRuntimeInfo failed: %v", err)
   343  		} else {
   344  			p.tracef("runtime info: %+v", runtimeInfo)
   345  			for _, thread := range runtimeInfo.GetThreads() {
   346  				for _, item := range thread.Items {
   347  					stats, ok := p.runtimeStats[item.Name]
   348  					if !ok {
   349  						stats = &runtimeStats{
   350  							threadID:   thread.ID,
   351  							threadName: thread.Name,
   352  							itemName:   item.Name,
   353  							metrics:    map[string]prometheus.Gauge{},
   354  						}
   355  						p.runtimeStats[item.Name] = stats
   356  
   357  						// add gauges with corresponding labels into vectors
   358  						for k, vec := range p.runtimeGaugeVecs {
   359  							stats.metrics[k], err = vec.GetMetricWith(prometheus.Labels{
   360  								runtimeItemLabel:     item.Name,
   361  								runtimeThreadLabel:   thread.Name,
   362  								runtimeThreadIDLabel: strconv.Itoa(int(thread.ID)),
   363  							})
   364  							if err != nil {
   365  								p.Log.Error(err)
   366  							}
   367  						}
   368  					}
   369  
   370  					stats.metrics[runtimeCallsMetric].Set(float64(item.Calls))
   371  					stats.metrics[runtimeVectorsMetric].Set(float64(item.Vectors))
   372  					stats.metrics[runtimeSuspendsMetric].Set(float64(item.Suspends))
   373  					stats.metrics[runtimeClocksMetric].Set(item.Clocks)
   374  					stats.metrics[runtimeVectorsPerCallMetric].Set(item.VectorsPerCall)
   375  				}
   376  			}
   377  		}
   378  	}
   379  
   380  	if !p.skipped[buffersMetricsNamespace] {
   381  		// Update buffers
   382  		buffersInfo, err := p.handler.GetBuffersInfo(ctx)
   383  		if err != nil {
   384  			p.Log.Errorf("GetBuffersInfo failed: %v", err)
   385  		} else {
   386  			p.tracef("buffers info: %+v", buffersInfo)
   387  			for _, item := range buffersInfo.GetItems() {
   388  				stats, ok := p.buffersStats[item.Name]
   389  				if !ok {
   390  					stats = &buffersStats{
   391  						threadID:  item.ThreadID,
   392  						itemName:  item.Name,
   393  						itemIndex: item.Index,
   394  						metrics:   map[string]prometheus.Gauge{},
   395  					}
   396  					p.buffersStats[item.Name] = stats
   397  
   398  					// add gauges with corresponding labels into vectors
   399  					for k, vec := range p.buffersGaugeVecs {
   400  						stats.metrics[k], err = vec.GetMetricWith(prometheus.Labels{
   401  							buffersThreadIDLabel: strconv.Itoa(int(item.ThreadID)),
   402  							buffersItemLabel:     item.Name,
   403  							buffersIndexLabel:    strconv.Itoa(int(item.Index)),
   404  						})
   405  						if err != nil {
   406  							p.Log.Error(err)
   407  						}
   408  					}
   409  				}
   410  
   411  				stats.metrics[buffersSizeMetric].Set(float64(item.Size))
   412  				stats.metrics[buffersAllocMetric].Set(float64(item.Alloc))
   413  				stats.metrics[buffersFreeMetric].Set(float64(item.Free))
   414  				stats.metrics[buffersNumAllocMetric].Set(float64(item.NumAlloc))
   415  				stats.metrics[buffersNumFreeMetric].Set(float64(item.NumFree))
   416  			}
   417  		}
   418  	}
   419  
   420  	if !p.skipped[memoryMetricsNamespace] {
   421  		// Update memory
   422  		memoryInfo, err := p.handler.GetMemory(ctx)
   423  		if err != nil {
   424  			p.Log.Errorf("GetMemory failed: %v", err)
   425  		} else {
   426  			p.tracef("memory info: %+v", memoryInfo)
   427  			for _, thread := range memoryInfo.GetThreads() {
   428  				stats, ok := p.memoryStats[thread.Name]
   429  				if !ok {
   430  					stats = &memoryStats{
   431  						threadName: thread.Name,
   432  						threadID:   thread.ID,
   433  						metrics:    map[string]prometheus.Gauge{},
   434  					}
   435  					p.memoryStats[thread.Name] = stats
   436  
   437  					// add gauges with corresponding labels into vectors
   438  					for k, vec := range p.memoryGaugeVecs {
   439  						stats.metrics[k], err = vec.GetMetricWith(prometheus.Labels{
   440  							memoryThreadLabel:   thread.Name,
   441  							memoryThreadIDLabel: strconv.Itoa(int(thread.ID)),
   442  						})
   443  						if err != nil {
   444  							p.Log.Error(err)
   445  						}
   446  					}
   447  				}
   448  
   449  				stats.metrics[memoryUsedMetric].Set(float64(thread.Used))
   450  				stats.metrics[memoryTotalMetric].Set(float64(thread.Total))
   451  				stats.metrics[memoryFreeMetric].Set(float64(thread.Free))
   452  				stats.metrics[memorySizeMetric].Set(float64(thread.Size))
   453  				stats.metrics[memoryPagesMetric].Set(float64(thread.Pages))
   454  				stats.metrics[memoryTrimmableMetric].Set(float64(thread.Trimmable))
   455  				stats.metrics[memoryFreeChunksMetric].Set(float64(thread.FreeChunks))
   456  				stats.metrics[memoryFreeFastbinBlksMetric].Set(float64(thread.FreeFastbinBlks))
   457  				stats.metrics[memoryMaxTotalAlloc].Set(float64(thread.MaxTotalAlloc))
   458  			}
   459  		}
   460  	}
   461  
   462  	if !p.skipped[nodeMetricsNamespace] {
   463  		// Update node counters
   464  		nodeCountersInfo, err := p.handler.GetNodeCounters(ctx)
   465  		if err != nil {
   466  			p.Log.Errorf("GetNodeCounters failed: %v", err)
   467  		} else {
   468  			p.tracef("node counters info: %+v", nodeCountersInfo)
   469  			for _, item := range nodeCountersInfo.GetCounters() {
   470  				stats, ok := p.nodeCounterStats[item.Name]
   471  				if !ok {
   472  					stats = &nodeCounterStats{
   473  						itemName: item.Name,
   474  						metrics:  map[string]prometheus.Gauge{},
   475  					}
   476  					p.nodeCounterStats[item.Name] = stats
   477  
   478  					// add gauges with corresponding labels into vectors
   479  					for k, vec := range p.nodeCounterGaugeVecs {
   480  						stats.metrics[k], err = vec.GetMetricWith(prometheus.Labels{
   481  							nodeCounterItemLabel:   item.Node,
   482  							nodeCounterReasonLabel: item.Name,
   483  						})
   484  						if err != nil {
   485  							p.Log.Error(err)
   486  						}
   487  					}
   488  				}
   489  
   490  				stats.metrics[nodeCounterCounterMetric].Set(float64(item.Value))
   491  			}
   492  		}
   493  	}
   494  
   495  	if !p.skipped[ifMetricsNamespace] {
   496  		// Update interface counters
   497  		ifStats, err := p.handler.GetInterfaceStats(ctx)
   498  		if err != nil {
   499  			p.Log.Errorf("GetInterfaceStats failed: %v", err)
   500  			return
   501  		} else {
   502  			p.tracef("interface stats: %+v", ifStats)
   503  			if ifStats == nil {
   504  				return
   505  			}
   506  			for _, item := range ifStats.Interfaces {
   507  				stats, ok := p.ifCounterStats[item.InterfaceName]
   508  				if !ok {
   509  					stats = &ifCounterStats{
   510  						name:    item.InterfaceName,
   511  						metrics: map[string]prometheus.Gauge{},
   512  					}
   513  					p.ifCounterStats[item.InterfaceName] = stats
   514  
   515  					// add gauges with corresponding labels into vectors
   516  					for k, vec := range p.ifCounterGaugeVecs {
   517  						stats.metrics[k], err = vec.GetMetricWith(prometheus.Labels{
   518  							ifCounterNameLabel:  item.InterfaceName,
   519  							ifCounterIndexLabel: fmt.Sprint(item.InterfaceIndex),
   520  						})
   521  						if err != nil {
   522  							p.Log.Error(err)
   523  						}
   524  					}
   525  				}
   526  
   527  				stats.metrics[ifCounterRxPackets].Set(float64(item.Rx.Packets))
   528  				stats.metrics[ifCounterRxBytes].Set(float64(item.Rx.Bytes))
   529  				stats.metrics[ifCounterRxErrors].Set(float64(item.RxErrors))
   530  				stats.metrics[ifCounterTxPackets].Set(float64(item.Tx.Packets))
   531  				stats.metrics[ifCounterTxBytes].Set(float64(item.Tx.Bytes))
   532  				stats.metrics[ifCounterTxErrors].Set(float64(item.TxErrors))
   533  				stats.metrics[ifCounterDrops].Set(float64(item.Drops))
   534  				stats.metrics[ifCounterPunts].Set(float64(item.Punts))
   535  				stats.metrics[ifCounterIP4].Set(float64(item.IP4))
   536  				stats.metrics[ifCounterIP6].Set(float64(item.IP6))
   537  				stats.metrics[ifCounterRxNoBuf].Set(float64(item.RxNoBuf))
   538  				stats.metrics[ifCounterRxMiss].Set(float64(item.RxMiss))
   539  			}
   540  		}
   541  	}
   542  
   543  	p.tracef("update complete")
   544  }