k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/proxy/metrics/metrics.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
	"sync"
	"time"

	"k8s.io/component-base/metrics"
	"k8s.io/component-base/metrics/legacyregistry"
	"k8s.io/klog/v2"
	kubeproxyconfig "k8s.io/kubernetes/pkg/proxy/apis/config"
	"k8s.io/kubernetes/pkg/proxy/util/nfacct"
)

const kubeProxySubsystem = "kubeproxy"

var (
	// SyncProxyRulesLatency is the latency of one round of kube-proxy syncing proxy
	// rules. (With the iptables proxy, this includes both full and partial syncs.)
	SyncProxyRulesLatency = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_duration_seconds",
			Help:           "SyncProxyRules latency in seconds",
			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
			StabilityLevel: metrics.ALPHA,
		},
	)

	// SyncFullProxyRulesLatency is the latency of one round of full rule syncing.
	SyncFullProxyRulesLatency = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_full_proxy_rules_duration_seconds",
			Help:           "SyncProxyRules latency in seconds for full resyncs",
			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
			StabilityLevel: metrics.ALPHA,
		},
	)

	// SyncPartialProxyRulesLatency is the latency of one round of partial rule syncing.
	SyncPartialProxyRulesLatency = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_partial_proxy_rules_duration_seconds",
			Help:           "SyncProxyRules latency in seconds for partial resyncs",
			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
			StabilityLevel: metrics.ALPHA,
		},
	)

	// SyncProxyRulesLastTimestamp is the timestamp proxy rules were last
	// successfully synced.
	SyncProxyRulesLastTimestamp = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_last_timestamp_seconds",
			Help:           "The last time proxy rules were successfully synced",
			StabilityLevel: metrics.ALPHA,
		},
	)
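	// A minimal sketch of how a proxy backend drives the two metrics above
	// from its sync loop (the surrounding sync function is illustrative and
	// not part of this file):
	//
	//	start := time.Now()
	//	defer func() {
	//		SyncProxyRulesLatency.Observe(SinceInSeconds(start))
	//	}()
	//	// ... program the dataplane ...
	//	SyncProxyRulesLastTimestamp.SetToCurrentTime()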
	// NetworkProgrammingLatency is defined as the time it took to program the network - from the time
	// the service or pod has changed to the time the change was propagated and the proper kube-proxy
	// rules were synced. Exported for each Endpoints object that was part of the rules sync.
	// See https://github.com/kubernetes/community/blob/master/sig-scalability/slos/network_programming_latency.md
	// Note that the metric is partially based on the time exported by the endpoints controller on
	// the master machine. The measurement may be inaccurate if there is a clock drift between the
	// node and master machine.
	NetworkProgrammingLatency = metrics.NewHistogram(
		&metrics.HistogramOpts{
			Subsystem: kubeProxySubsystem,
			Name:      "network_programming_duration_seconds",
			Help:      "In Cluster Network Programming Latency in seconds",
			Buckets: metrics.MergeBuckets(
				metrics.LinearBuckets(0.25, 0.25, 2), // 0.25s, 0.50s
				metrics.LinearBuckets(1, 1, 59),      // 1s, 2s, 3s, ... 59s
				metrics.LinearBuckets(60, 5, 12),     // 60s, 65s, 70s, ... 115s
				metrics.LinearBuckets(120, 30, 7),    // 2min, 2.5min, 3min, ..., 5min
			),
			StabilityLevel: metrics.ALPHA,
		},
	)

	// EndpointChangesPending is the number of pending endpoint changes that
	// have not yet been synced to the proxy.
	EndpointChangesPending = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_endpoint_changes_pending",
			Help:           "Pending proxy rules Endpoint changes",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// EndpointChangesTotal is the number of endpoint changes that the proxy
	// has seen.
	EndpointChangesTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_endpoint_changes_total",
			Help:           "Cumulative proxy rules Endpoint changes",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// ServiceChangesPending is the number of pending service changes that
	// have not yet been synced to the proxy.
	ServiceChangesPending = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_service_changes_pending",
			Help:           "Pending proxy rules Service changes",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// ServiceChangesTotal is the number of service changes that the proxy has
	// seen.
	ServiceChangesTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_service_changes_total",
			Help:           "Cumulative proxy rules Service changes",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// iptablesCTStateInvalidDroppedPacketsDescription describes the metric for the
	// number of packets dropped by iptables because conntrack marked them INVALID.
	iptablesCTStateInvalidDroppedPacketsDescription = metrics.NewDesc(
		"kubeproxy_iptables_ct_state_invalid_dropped_packets_total",
		"packets dropped by iptables to work around conntrack problems",
		nil, nil, metrics.ALPHA, "")

	// IPTablesCTStateInvalidDroppedNFAcctCounter is the name of the nfacct counter
	// that the collector at the bottom of this file reads to populate that metric.
	IPTablesCTStateInvalidDroppedNFAcctCounter = "ct_state_invalid_dropped_pkts"

	// IPTablesRestoreFailuresTotal is the number of iptables restore failures that the proxy has
	// seen.
	IPTablesRestoreFailuresTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_iptables_restore_failures_total",
			Help:           "Cumulative proxy iptables restore failures",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// IPTablesPartialRestoreFailuresTotal is the number of iptables *partial* restore
	// failures (resulting in a fall back to a full restore) that the proxy has seen.
	IPTablesPartialRestoreFailuresTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_iptables_partial_restore_failures_total",
			Help:           "Cumulative proxy iptables partial restore failures",
			StabilityLevel: metrics.ALPHA,
		},
	)
	// IPTablesRulesTotal is the total number of iptables rules that the iptables
	// proxy has installed.
	IPTablesRulesTotal = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_iptables_total",
			Help:           "Total number of iptables rules owned by kube-proxy",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"table"},
	)

	// IPTablesRulesLastSync is the number of iptables rules that the iptables proxy
	// updated in the last sync.
	IPTablesRulesLastSync = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_iptables_last",
			Help:           "Number of iptables rules written by kube-proxy in last sync",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"table"},
	)

	// NFTablesSyncFailuresTotal is the number of nftables sync failures that the
	// proxy has seen.
	NFTablesSyncFailuresTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_nftables_sync_failures_total",
			Help:           "Cumulative proxy nftables sync failures",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// NFTablesCleanupFailuresTotal is the number of nftables stale chain cleanup
	// failures that the proxy has seen.
	NFTablesCleanupFailuresTotal = metrics.NewCounter(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_nftables_cleanup_failures_total",
			Help:           "Cumulative proxy nftables cleanup failures",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// ProxyHealthzTotal is the cumulative number of healthz probe responses,
	// partitioned by HTTP status code.
	ProxyHealthzTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "proxy_healthz_total",
			Help:           "Cumulative proxy healthz HTTP status",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"code"},
	)

	// ProxyLivezTotal is the cumulative number of livez probe responses,
	// partitioned by HTTP status code.
	ProxyLivezTotal = metrics.NewCounterVec(
		&metrics.CounterOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "proxy_livez_total",
			Help:           "Cumulative proxy livez HTTP status",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"code"},
	)

	// SyncProxyRulesLastQueuedTimestamp is the last time a proxy sync was
	// requested. If this is much larger than
	// kubeproxy_sync_proxy_rules_last_timestamp_seconds, then something is hung.
	SyncProxyRulesLastQueuedTimestamp = metrics.NewGauge(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_last_queued_timestamp_seconds",
			Help:           "The last time a sync of proxy rules was queued",
			StabilityLevel: metrics.ALPHA,
		},
	)

	// SyncProxyRulesNoLocalEndpointsTotal is the total number of rules that do
	// not have an available endpoint. This can be caused by an internal or
	// external "Local" traffic policy with no available local workload.
	SyncProxyRulesNoLocalEndpointsTotal = metrics.NewGaugeVec(
		&metrics.GaugeOpts{
			Subsystem:      kubeProxySubsystem,
			Name:           "sync_proxy_rules_no_local_endpoints_total",
			Help:           "Number of services with a Local traffic policy and no endpoints",
			StabilityLevel: metrics.ALPHA,
		},
		[]string{"traffic_policy"},
	)
)
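// The hang-detection hint on SyncProxyRulesLastQueuedTimestamp can be turned
// into a query. A sketch in PromQL (illustrative, not an official alerting
// rule; the 60s threshold is an arbitrary example):
//
//	kubeproxy_sync_proxy_rules_last_queued_timestamp_seconds
//	  - kubeproxy_sync_proxy_rules_last_timestamp_seconds > 60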
var registerMetricsOnce sync.Once

// RegisterMetrics registers kube-proxy metrics.
func RegisterMetrics(mode kubeproxyconfig.ProxyMode) {
	registerMetricsOnce.Do(func() {
		// Core kube-proxy metrics for all backends
		legacyregistry.MustRegister(SyncProxyRulesLatency)
		legacyregistry.MustRegister(SyncProxyRulesLastQueuedTimestamp)
		legacyregistry.MustRegister(SyncProxyRulesLastTimestamp)
		legacyregistry.MustRegister(EndpointChangesPending)
		legacyregistry.MustRegister(EndpointChangesTotal)
		legacyregistry.MustRegister(ServiceChangesPending)
		legacyregistry.MustRegister(ServiceChangesTotal)
		legacyregistry.MustRegister(ProxyHealthzTotal)
		legacyregistry.MustRegister(ProxyLivezTotal)

		// FIXME: winkernel does not implement these
		legacyregistry.MustRegister(NetworkProgrammingLatency)
		legacyregistry.MustRegister(SyncProxyRulesNoLocalEndpointsTotal)

		switch mode {
		case kubeproxyconfig.ProxyModeIPTables:
			legacyregistry.CustomMustRegister(newCTStateInvalidPacketsCollector())
			legacyregistry.MustRegister(SyncFullProxyRulesLatency)
			legacyregistry.MustRegister(SyncPartialProxyRulesLatency)
			legacyregistry.MustRegister(IPTablesRestoreFailuresTotal)
			legacyregistry.MustRegister(IPTablesPartialRestoreFailuresTotal)
			legacyregistry.MustRegister(IPTablesRulesTotal)
			legacyregistry.MustRegister(IPTablesRulesLastSync)

		case kubeproxyconfig.ProxyModeIPVS:
			legacyregistry.MustRegister(IPTablesRestoreFailuresTotal)

		case kubeproxyconfig.ProxyModeNFTables:
			legacyregistry.MustRegister(NFTablesSyncFailuresTotal)
			legacyregistry.MustRegister(NFTablesCleanupFailuresTotal)

		case kubeproxyconfig.ProxyModeKernelspace:
			// currently no winkernel-specific metrics
		}
	})
}

// SinceInSeconds gets the time since the specified start in seconds.
func SinceInSeconds(start time.Time) float64 {
	return time.Since(start).Seconds()
}

var _ metrics.StableCollector = &ctStateInvalidPacketsCollector{}

// newCTStateInvalidPacketsCollector returns a collector for the nfacct counter
// named by IPTablesCTStateInvalidDroppedNFAcctCounter. If the nfacct client
// cannot be initialized, the error is logged and the collector reports nothing.
func newCTStateInvalidPacketsCollector() *ctStateInvalidPacketsCollector {
	client, err := nfacct.New()
	if err != nil {
		klog.ErrorS(err, "failed to initialize nfacct client")
	}
	return &ctStateInvalidPacketsCollector{client: client}
}

// ctStateInvalidPacketsCollector exposes the kernel's nfacct count of packets
// dropped due to conntrack's INVALID state as a constant counter metric.
type ctStateInvalidPacketsCollector struct {
	metrics.BaseStableCollector
	client nfacct.Interface
}

// DescribeWithStability implements the metrics.StableCollector interface.
func (c *ctStateInvalidPacketsCollector) DescribeWithStability(ch chan<- *metrics.Desc) {
	ch <- iptablesCTStateInvalidDroppedPacketsDescription
}

// CollectWithStability implements the metrics.StableCollector interface.
func (c *ctStateInvalidPacketsCollector) CollectWithStability(ch chan<- metrics.Metric) {
	if c.client != nil {
		counter, err := c.client.Get(IPTablesCTStateInvalidDroppedNFAcctCounter)
		if err != nil {
			klog.ErrorS(err, "failed to collect nfacct counter")
		} else {
			metric, err := metrics.NewConstMetric(iptablesCTStateInvalidDroppedPacketsDescription, metrics.CounterValue, float64(counter.Packets))
			if err != nil {
				klog.ErrorS(err, "failed to create constant metric")
			} else {
				ch <- metric
			}
		}
	}
}
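// RegisterMetrics is expected to be called once at proxy startup, before any
// backend begins syncing rules. A sketch of the call site (illustrative; the
// real invocation lives in the kube-proxy server setup code):
//
//	metrics.RegisterMetrics(kubeproxyconfig.ProxyModeIPTables)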