github.com/netdata/go.d.plugin@v0.58.1/modules/consul/collect_metrics.go (about)

     1  // SPDX-License-Identifier: GPL-3.0-or-later
     2  
     3  package consul
     4  
     5  import (
     6  	"fmt"
     7  	"math"
     8  	"strconv"
     9  	"strings"
    10  
    11  	"github.com/netdata/go.d.plugin/pkg/prometheus"
    12  )
    13  
    14  func (c *Consul) collectMetricsPrometheus(mx map[string]int64) error {
    15  	mfs, err := c.prom.Scrape()
    16  	if err != nil {
    17  		return err
    18  	}
    19  
    20  	// Key Metrics (https://developer.hashicorp.com/consul/docs/agent/telemetry#key-metrics)
    21  
    22  	// prometheus metrics are messy:
    23  	// - if 'disable_hostname' is false (default):
    24  	//   - consul_autopilot_failure_tolerance => consul_hostname_autopilot_failure_tolerance
    25  	//   - both are exposed
    26  	//   - only the one with the hostname has the correct value
    27  	// - 1.14.3 (it probably has something to do with cloud management version):
    28  	//   - runtime_sys_bytes => runtime_sys_bytes_sys_bytes; consul_autopilot_healthy => consul_autopilot_healthy_healthy
    29  	//   - both are exposed
    30  	//   - only the one with the double name has the correct value
    31  
    32  	if c.isServer() {
    33  		c.collectSummary(mx, mfs, "raft_thread_main_saturation")
    34  		c.collectSummary(mx, mfs, "raft_thread_fsm_saturation")
    35  		c.collectSummary(mx, mfs, "raft_boltdb_logsPerBatch")
    36  		c.collectSummary(mx, mfs, "kvs_apply")
    37  		c.collectSummary(mx, mfs, "txn_apply")
    38  		c.collectSummary(mx, mfs, "raft_boltdb_storeLogs")
    39  		c.collectSummary(mx, mfs, "raft_rpc_installSnapshot") // make sense for followers only
    40  		c.collectSummary(mx, mfs, "raft_commitTime")          // make sense for leader only
    41  		c.collectSummary(mx, mfs, "raft_leader_lastContact")  // make sense for leader only
    42  
    43  		c.collectCounter(mx, mfs, "raft_apply", precision) // make sense for leader only
    44  		c.collectCounter(mx, mfs, "raft_state_candidate", 1)
    45  		c.collectCounter(mx, mfs, "raft_state_leader", 1)
    46  
    47  		c.collectGaugeBool(mx, mfs, "autopilot_healthy", "autopilot_healthy_healthy")
    48  		c.collectGaugeBool(mx, mfs, "server_isLeader", "server_isLeader_isLeader")
    49  		c.collectGauge(mx, mfs, "autopilot_failure_tolerance", 1, "autopilot_failure_tolerance_failure_tolerance")
    50  		c.collectGauge(mx, mfs, "raft_fsm_lastRestoreDuration", 1)
    51  		c.collectGauge(mx, mfs, "raft_leader_oldestLogAge", 1, "raft_leader_oldestLogAge_oldestLogAge")
    52  		c.collectGauge(mx, mfs, "raft_boltdb_freelistBytes", 1, "raft_boltdb_freelistBytes_freelistBytes")
    53  
    54  		if isLeader, ok := c.isLeader(mfs); ok {
    55  			if isLeader && !c.hasLeaderCharts {
    56  				c.addLeaderCharts()
    57  				c.hasLeaderCharts = true
    58  			}
    59  			if !isLeader && c.hasLeaderCharts {
    60  				c.removeLeaderCharts()
    61  				c.hasLeaderCharts = false
    62  			}
    63  			if !isLeader && !c.hasFollowerCharts {
    64  				c.addFollowerCharts()
    65  				c.hasFollowerCharts = true
    66  			}
    67  			if isLeader && c.hasFollowerCharts {
    68  				c.removeFollowerCharts()
    69  				c.hasFollowerCharts = false
    70  			}
    71  		}
    72  	}
    73  
    74  	c.collectGauge(mx, mfs, "system_licenseExpiration", 3600, "system_licenseExpiration_licenseExpiration")
    75  
    76  	c.collectCounter(mx, mfs, "client_rpc", 1)
    77  	c.collectCounter(mx, mfs, "client_rpc_exceeded", 1)
    78  	c.collectCounter(mx, mfs, "client_rpc_failed", 1)
    79  
    80  	c.collectGauge(mx, mfs, "runtime_alloc_bytes", 1, "runtime_alloc_bytes_alloc_bytes")
    81  	c.collectGauge(mx, mfs, "runtime_sys_bytes", 1, "runtime_sys_bytes_sys_bytes")
    82  	c.collectGauge(mx, mfs, "runtime_total_gc_pause_ns", 1, "runtime_total_gc_pause_ns_total_gc_pause_ns")
    83  
    84  	return nil
    85  }
    86  
    87  func (c *Consul) isLeader(mfs prometheus.MetricFamilies) (bool, bool) {
    88  	var mf *prometheus.MetricFamily
    89  	for _, v := range []string{"server_isLeader_isLeader", "server_isLeader"} {
    90  		if mf = mfs.GetGauge(c.promMetricNameWithHostname(v)); mf != nil {
    91  			break
    92  		}
    93  		if mf = mfs.GetGauge(c.promMetricName(v)); mf != nil {
    94  			break
    95  		}
    96  	}
    97  
    98  	if mf == nil {
    99  		return false, false
   100  	}
   101  
   102  	return mf.Metrics()[0].Gauge().Value() == 1, true
   103  }
   104  
   105  func (c *Consul) collectGauge(mx map[string]int64, mfs prometheus.MetricFamilies, name string, mul float64, aliases ...string) {
   106  	var mf *prometheus.MetricFamily
   107  	for _, v := range append(aliases, name) {
   108  		if mf = mfs.GetGauge(c.promMetricNameWithHostname(v)); mf != nil {
   109  			break
   110  		}
   111  		if mf = mfs.GetGauge(c.promMetricName(v)); mf != nil {
   112  			break
   113  		}
   114  	}
   115  
   116  	if mf == nil {
   117  		return
   118  	}
   119  
   120  	v := mf.Metrics()[0].Gauge().Value()
   121  
   122  	if !math.IsNaN(v) {
   123  		mx[name] = int64(v * mul)
   124  	}
   125  }
   126  
   127  func (c *Consul) collectGaugeBool(mx map[string]int64, mfs prometheus.MetricFamilies, name string, aliases ...string) {
   128  	var mf *prometheus.MetricFamily
   129  	for _, v := range append(aliases, name) {
   130  		if mf = mfs.GetGauge(c.promMetricNameWithHostname(v)); mf != nil {
   131  			break
   132  		}
   133  		if mf = mfs.GetGauge(c.promMetricName(v)); mf != nil {
   134  			break
   135  		}
   136  	}
   137  
   138  	if mf == nil {
   139  		return
   140  	}
   141  
   142  	v := mf.Metrics()[0].Gauge().Value()
   143  
   144  	if !math.IsNaN(v) {
   145  		mx[name+"_yes"] = boolToInt(v == 1)
   146  		mx[name+"_no"] = boolToInt(v == 0)
   147  	}
   148  }
   149  
   150  func (c *Consul) collectCounter(mx map[string]int64, mfs prometheus.MetricFamilies, name string, mul float64) {
   151  	mf := mfs.GetCounter(c.promMetricName(name))
   152  	if mf == nil {
   153  		return
   154  	}
   155  
   156  	v := mf.Metrics()[0].Counter().Value()
   157  
   158  	if !math.IsNaN(v) {
   159  		mx[name] = int64(v * mul)
   160  	}
   161  }
   162  
   163  func (c *Consul) collectSummary(mx map[string]int64, mfs prometheus.MetricFamilies, name string) {
   164  	mf := mfs.GetSummary(c.promMetricName(name))
   165  	if mf == nil {
   166  		return
   167  	}
   168  
   169  	m := mf.Metrics()[0]
   170  
   171  	for _, q := range m.Summary().Quantiles() {
   172  		v := q.Value()
   173  		// MaxAge is 10 seconds (hardcoded)
   174  		// https://github.com/hashicorp/go-metrics/blob/b6d5c860c07ef6eeec89f4a662c7b452dd4d0c93/prometheus/prometheus.go#L227
   175  		if math.IsNaN(v) {
   176  			v = 0
   177  		}
   178  
   179  		id := fmt.Sprintf("%s_quantile=%s", name, formatFloat(q.Quantile()))
   180  		mx[id] = int64(v * precision * precision)
   181  	}
   182  
   183  	mx[name+"_sum"] = int64(m.Summary().Sum() * precision)
   184  	mx[name+"_count"] = int64(m.Summary().Count())
   185  }
   186  
   187  func (c *Consul) promMetricName(name string) string {
   188  	px := c.cfg.DebugConfig.Telemetry.MetricsPrefix
   189  	return px + "_" + name
   190  }
   191  
   192  var forbiddenCharsReplacer = strings.NewReplacer(" ", "_", ".", "_", "=", "_", "-", "_", "/", "_")
   193  
   194  // controlled by 'disable_hostname'
   195  // https://developer.hashicorp.com/consul/docs/agent/config/config-files#telemetry-disable_hostname
   196  func (c *Consul) promMetricNameWithHostname(name string) string {
   197  	px := c.cfg.DebugConfig.Telemetry.MetricsPrefix
   198  	node := forbiddenCharsReplacer.Replace(c.cfg.Config.NodeName)
   199  
   200  	return px + "_" + node + "_" + name
   201  }
   202  
   203  func formatFloat(v float64) string {
   204  	return strconv.FormatFloat(v, 'f', -1, 64)
   205  }