k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/proxy/metrics/metrics.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package metrics
    18  
    19  import (
    20  	"sync"
    21  	"time"
    22  
    23  	"k8s.io/component-base/metrics"
    24  	"k8s.io/component-base/metrics/legacyregistry"
    25  	"k8s.io/klog/v2"
    26  	kubeproxyconfig "k8s.io/kubernetes/pkg/proxy/apis/config"
    27  	"k8s.io/kubernetes/pkg/proxy/util/nfacct"
    28  )
    29  
// kubeProxySubsystem is the Subsystem value set on the kube-proxy metric
// options declared in this file.
const kubeProxySubsystem = "kubeproxy"
    31  
var (
	// SyncProxyRulesLatency is the latency of one round of kube-proxy syncing proxy
	// rules. (With the iptables proxy, this includes both full and partial syncs.)
	SyncProxyRulesLatency = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_duration_seconds",
			Help:           "SyncProxyRules latency in seconds",
			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
			StabilityLevel: metrics.ALPHA,
		},
	)

	// SyncFullProxyRulesLatency is the latency of one round of full rule syncing.
	// RegisterMetrics only registers it for the iptables proxy mode.
	SyncFullProxyRulesLatency = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_full_proxy_rules_duration_seconds",
			Help:           "SyncProxyRules latency in seconds for full resyncs",
			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
			StabilityLevel: metrics.ALPHA,
		},
	)

	// SyncPartialProxyRulesLatency is the latency of one round of partial rule syncing.
	// RegisterMetrics only registers it for the iptables proxy mode.
	SyncPartialProxyRulesLatency = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_partial_proxy_rules_duration_seconds",
			Help:           "SyncProxyRules latency in seconds for partial resyncs",
			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
			StabilityLevel: metrics.ALPHA,
		},
	)

	// SyncProxyRulesLastTimestamp is the timestamp at which proxy rules were
	// last successfully synced.
	SyncProxyRulesLastTimestamp = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_last_timestamp_seconds",
			Help:           "The last time proxy rules were successfully synced",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// NetworkProgrammingLatency is defined as the time it took to program the network - from the time
	// the service or pod has changed to the time the change was propagated and the proper kube-proxy
	// rules were synced. Exported for each endpoints object that was part of the rules sync.
	// See https://github.com/kubernetes/community/blob/master/sig-scalability/slos/network_programming_latency.md
	// Note that the metric is partially based on the time exported by the endpoints controller on
	// the master machine. The measurement may be inaccurate if there is clock drift between the
	// node and the master machine.
	NetworkProgrammingLatency = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem: kubeProxySubsystem,
			Name:      "network_programming_duration_seconds",
			Help:      "In Cluster Network Programming Latency in seconds",
			Buckets: metrics.MergeBuckets(
				metrics.LinearBuckets(0.25, 0.25, 2), // 0.25s, 0.50s
				metrics.LinearBuckets(1, 1, 59),      // 1s, 2s, 3s, ... 59s
				metrics.LinearBuckets(60, 5, 12),     // 60s, 65s, 70s, ... 115s
				metrics.LinearBuckets(120, 30, 7),    // 2min, 2.5min, 3min, ..., 5min
			),
			StabilityLevel: metrics.ALPHA,
		},
	)

	// EndpointChangesPending is the number of pending endpoint changes that
	// have not yet been synced to the proxy.
	EndpointChangesPending = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_endpoint_changes_pending",
			Help:           "Pending proxy rules Endpoint changes",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// EndpointChangesTotal is the cumulative number of endpoint changes that
	// the proxy has seen.
	EndpointChangesTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_endpoint_changes_total",
			Help:           "Cumulative proxy rules Endpoint changes",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// ServiceChangesPending is the number of pending service changes that
	// have not yet been synced to the proxy.
	ServiceChangesPending = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_service_changes_pending",
			Help:           "Pending proxy rules Service changes",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// ServiceChangesTotal is the cumulative number of service changes that
	// the proxy has seen.
	ServiceChangesTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_service_changes_total",
			Help:           "Cumulative proxy rules Service changes",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// iptablesCTStateInvalidDroppedPacketsDescription describes the metric for the number of
	// packets dropped by iptables which were marked INVALID by conntrack. It is emitted by
	// ctStateInvalidPacketsCollector rather than registered as a regular counter.
	iptablesCTStateInvalidDroppedPacketsDescription = metrics.NewDesc(
		"kubeproxy_iptables_ct_state_invalid_dropped_packets_total",
		"packets dropped by iptables to work around conntrack problems",
		nil, nil, metrics.ALPHA, "")
	// IPTablesCTStateInvalidDroppedNFAcctCounter is the name of the nfacct counter that
	// ctStateInvalidPacketsCollector looks up; its packet count becomes the value of the
	// metric described above.
	IPTablesCTStateInvalidDroppedNFAcctCounter = "ct_state_invalid_dropped_pkts"

	// IPTablesRestoreFailuresTotal is the cumulative number of iptables restore
	// failures that the proxy has seen. RegisterMetrics registers it for both the
	// iptables and the ipvs proxy modes.
	IPTablesRestoreFailuresTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_iptables_restore_failures_total",
			Help:           "Cumulative proxy iptables restore failures",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// IPTablesPartialRestoreFailuresTotal is the number of iptables *partial* restore
	// failures (resulting in a fall back to a full restore) that the proxy has seen.
	IPTablesPartialRestoreFailuresTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_iptables_partial_restore_failures_total",
			Help:           "Cumulative proxy iptables partial restore failures",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// IPTablesRulesTotal is the total number of iptables rules that the iptables
	// proxy has installed, partitioned by the "table" label.
	IPTablesRulesTotal = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_iptables_total",
			Help:           "Total number of iptables rules owned by kube-proxy",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"table"},
	)

	// IPTablesRulesLastSync is the number of iptables rules that the iptables proxy
	// updated in the last sync, partitioned by the "table" label.
	IPTablesRulesLastSync = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_iptables_last",
			Help:           "Number of iptables rules written by kube-proxy in last sync",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"table"},
	)

	// NFTablesSyncFailuresTotal is the cumulative number of nftables sync
	// failures that the proxy has seen.
	NFTablesSyncFailuresTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_nftables_sync_failures_total",
			Help:           "Cumulative proxy nftables sync failures",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// NFTablesCleanupFailuresTotal is the cumulative number of nftables stale
	// chain cleanup failures that the proxy has seen.
	NFTablesCleanupFailuresTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_nftables_cleanup_failures_total",
			Help:           "Cumulative proxy nftables cleanup failures",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// ProxyHealthzTotal counts the responses returned to healthz probes,
	// partitioned by HTTP status via the "code" label.
	ProxyHealthzTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "proxy_healthz_total",
			Help:           "Cumulative proxy healthz HTTP status",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"code"},
	)

	// ProxyLivezTotal counts the responses returned to livez probes,
	// partitioned by HTTP status via the "code" label.
	ProxyLivezTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "proxy_livez_total",
			Help:           "Cumulative proxy livez HTTP status",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"code"},
	)

	// SyncProxyRulesLastQueuedTimestamp is the last time a proxy sync was
	// requested. If this is much larger than
	// kubeproxy_sync_proxy_rules_last_timestamp_seconds, then something is hung.
	SyncProxyRulesLastQueuedTimestamp = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_last_queued_timestamp_seconds",
			Help:           "The last time a sync of proxy rules was queued",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// SyncProxyRulesNoLocalEndpointsTotal is the number of services that have a
	// Local traffic policy but no endpoints, partitioned by the "traffic_policy"
	// label. This can be caused by an internal traffic policy with no available
	// local workload. Despite the "_total" suffix in its name, it is a gauge.
	SyncProxyRulesNoLocalEndpointsTotal = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_no_local_endpoints_total",
			Help:           "Number of services with a Local traffic policy and no endpoints",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"traffic_policy"},
	)
)
   269  
   270  var registerMetricsOnce sync.Once
   271  
   272  // RegisterMetrics registers kube-proxy metrics.
   273  func RegisterMetrics(mode kubeproxyconfig.ProxyMode) {
   274  	registerMetricsOnce.Do(func() {
   275  		// Core kube-proxy metrics for all backends
   276  		legacyregistry.MustRegister(SyncProxyRulesLatency)
   277  		legacyregistry.MustRegister(SyncProxyRulesLastQueuedTimestamp)
   278  		legacyregistry.MustRegister(SyncProxyRulesLastTimestamp)
   279  		legacyregistry.MustRegister(EndpointChangesPending)
   280  		legacyregistry.MustRegister(EndpointChangesTotal)
   281  		legacyregistry.MustRegister(ServiceChangesPending)
   282  		legacyregistry.MustRegister(ServiceChangesTotal)
   283  		legacyregistry.MustRegister(ProxyHealthzTotal)
   284  		legacyregistry.MustRegister(ProxyLivezTotal)
   285  
   286  		// FIXME: winkernel does not implement these
   287  		legacyregistry.MustRegister(NetworkProgrammingLatency)
   288  		legacyregistry.MustRegister(SyncProxyRulesNoLocalEndpointsTotal)
   289  
   290  		switch mode {
   291  		case kubeproxyconfig.ProxyModeIPTables:
   292  			legacyregistry.CustomMustRegister(newCTStateInvalidPacketsCollector())
   293  			legacyregistry.MustRegister(SyncFullProxyRulesLatency)
   294  			legacyregistry.MustRegister(SyncPartialProxyRulesLatency)
   295  			legacyregistry.MustRegister(IPTablesRestoreFailuresTotal)
   296  			legacyregistry.MustRegister(IPTablesPartialRestoreFailuresTotal)
   297  			legacyregistry.MustRegister(IPTablesRulesTotal)
   298  			legacyregistry.MustRegister(IPTablesRulesLastSync)
   299  
   300  		case kubeproxyconfig.ProxyModeIPVS:
   301  			legacyregistry.MustRegister(IPTablesRestoreFailuresTotal)
   302  
   303  		case kubeproxyconfig.ProxyModeNFTables:
   304  			legacyregistry.MustRegister(NFTablesSyncFailuresTotal)
   305  			legacyregistry.MustRegister(NFTablesCleanupFailuresTotal)
   306  
   307  		case kubeproxyconfig.ProxyModeKernelspace:
   308  			// currently no winkernel-specific metrics
   309  		}
   310  	})
   311  }
   312  
   313  // SinceInSeconds gets the time since the specified start in seconds.
   314  func SinceInSeconds(start time.Time) float64 {
   315  	return time.Since(start).Seconds()
   316  }
   317  
   318  var _ metrics.StableCollector = &ctStateInvalidPacketsCollector{}
   319  
   320  func newCTStateInvalidPacketsCollector() *ctStateInvalidPacketsCollector {
   321  	client, err := nfacct.New()
   322  	if err != nil {
   323  		klog.ErrorS(err, "failed to initialize nfacct client")
   324  	}
   325  	return &ctStateInvalidPacketsCollector{client: client}
   326  }
   327  
// ctStateInvalidPacketsCollector is a custom collector that emits the metric
// described by iptablesCTStateInvalidDroppedPacketsDescription, reading its
// value from an nfacct counter each time it is collected.
type ctStateInvalidPacketsCollector struct {
	metrics.BaseStableCollector
	// client is used to look up the nfacct counter. CollectWithStability
	// emits nothing when it is nil.
	client nfacct.Interface
}
   332  
   333  // DescribeWithStability implements the metrics.StableCollector interface.
   334  func (c *ctStateInvalidPacketsCollector) DescribeWithStability(ch chan<- *metrics.Desc) {
   335  	ch <- iptablesCTStateInvalidDroppedPacketsDescription
   336  }
   337  
   338  // CollectWithStability implements the metrics.StableCollector interface.
   339  func (c *ctStateInvalidPacketsCollector) CollectWithStability(ch chan<- metrics.Metric) {
   340  	if c.client != nil {
   341  		counter, err := c.client.Get(IPTablesCTStateInvalidDroppedNFAcctCounter)
   342  		if err != nil {
   343  			klog.ErrorS(err, "failed to collect nfacct counter")
   344  		} else {
   345  			metric, err := metrics.NewConstMetric(iptablesCTStateInvalidDroppedPacketsDescription, metrics.CounterValue, float64(counter.Packets))
   346  			if err != nil {
   347  				klog.ErrorS(err, "failed to create constant metric")
   348  			} else {
   349  				ch <- metric
   350  			}
   351  		}
   352  	}
   353  }