k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/cluster/images/etcd-version-monitor/etcd-version-monitor.go (about) 1 /* 2 Copyright 2017 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package main 18 19 import ( 20 "bytes" 21 "encoding/json" 22 "errors" 23 goflag "flag" 24 "fmt" 25 "net/http" 26 "time" 27 28 "github.com/gogo/protobuf/proto" 29 dto "github.com/prometheus/client_model/go" 30 "github.com/spf13/pflag" 31 32 "k8s.io/component-base/metrics" 33 "k8s.io/component-base/metrics/testutil" 34 "k8s.io/klog/v2" 35 ) 36 37 // Initialize the prometheus instrumentation and client related flags. 38 var ( 39 listenAddress string 40 metricsPath string 41 etcdVersionScrapeURI string 42 etcdMetricsScrapeURI string 43 scrapeTimeout time.Duration 44 ) 45 46 func registerFlags(fs *pflag.FlagSet) { 47 fs.StringVar(&listenAddress, "listen-address", "localhost:9101", "Address to listen on for serving prometheus metrics") 48 fs.StringVar(&metricsPath, "metrics-path", "/metrics", "Path under which prometheus metrics are to be served") 49 fs.StringVar(&etcdVersionScrapeURI, "etcd-version-scrape-uri", "http://localhost:2379/version", "URI to scrape etcd version info") 50 fs.StringVar(&etcdMetricsScrapeURI, "etcd-metrics-scrape-uri", "http://localhost:2379/metrics", "URI to scrape etcd metrics") 51 fs.DurationVar(&scrapeTimeout, "scrape-timeout", 15*time.Second, "Timeout for trying to get stats from etcd") 52 } 53 54 const ( 55 namespace = "etcd" // For prefixing prometheus metrics 56 ) 57 58 // Initialize prometheus metrics to be exported. 59 var ( 60 // Register all custom metrics with a dedicated registry to keep them separate. 61 customMetricRegistry = metrics.NewKubeRegistry() 62 63 // Custom etcd version metric since etcd 3.2- does not export one. 64 // This will be replaced by https://github.com/etcd-io/etcd/pull/8960 in etcd 3.3. 65 etcdVersion = metrics.NewGaugeVec( 66 &metrics.GaugeOpts{ 67 Namespace: namespace, 68 Name: "version_info", 69 Help: "Etcd server's binary version", 70 StabilityLevel: metrics.ALPHA, 71 }, 72 []string{"binary_version"}) 73 74 gatherer = &monitorGatherer{ 75 // Rewrite rules for etcd metrics that are exported by default. 76 exported: map[string]*exportedMetric{ 77 // etcd 3.0 metric format for total grpc requests with renamed method and service labels. 78 "etcd_grpc_requests_total": { 79 rewriters: []rewriteFunc{ 80 func(mf *dto.MetricFamily) (*dto.MetricFamily, error) { 81 mf = deepCopyMetricFamily(mf) 82 renameLabels(mf, map[string]string{ 83 "grpc_method": "method", 84 "grpc_service": "service", 85 }) 86 return mf, nil 87 }, 88 }, 89 }, 90 // etcd 3.1+ metric format for total grpc requests. 91 "grpc_server_handled_total": { 92 rewriters: []rewriteFunc{ 93 // Export the metric exactly as-is. For 3.1+ metrics, we will 94 // pass all metrics directly through. 95 identity, 96 // Write to the etcd 3.0 metric format for backward compatibility. 97 func(mf *dto.MetricFamily) (*dto.MetricFamily, error) { 98 mf = deepCopyMetricFamily(mf) 99 renameMetric(mf, "etcd_grpc_requests_total") 100 renameLabels(mf, map[string]string{ 101 "grpc_method": "method", 102 "grpc_service": "service", 103 }) 104 filterMetricsByLabels(mf, map[string]string{ 105 "grpc_type": "unary", 106 }) 107 groupCounterMetricsByLabels(mf, map[string]bool{ 108 "grpc_type": true, 109 "grpc_code": true, 110 }) 111 return mf, nil 112 }, 113 }, 114 }, 115 116 // etcd 3.0 metric format for grpc request latencies, 117 // rewritten to the etcd 3.1+ format. 118 "etcd_grpc_unary_requests_duration_seconds": { 119 rewriters: []rewriteFunc{ 120 func(mf *dto.MetricFamily) (*dto.MetricFamily, error) { 121 mf = deepCopyMetricFamily(mf) 122 renameMetric(mf, "grpc_server_handling_seconds") 123 tpeName := "grpc_type" 124 tpeVal := "unary" 125 for _, m := range mf.Metric { 126 m.Label = append(m.Label, &dto.LabelPair{Name: &tpeName, Value: &tpeVal}) 127 } 128 return mf, nil 129 }, 130 }, 131 }, 132 // etcd 3.1+ metric format for total grpc requests. 133 "grpc_server_handling_seconds": {}, 134 }, 135 } 136 ) 137 138 // monitorGatherer is a custom metric gatherer for prometheus that exports custom metrics 139 // defined by this monitor as well as rewritten etcd metrics. 140 type monitorGatherer struct { 141 exported map[string]*exportedMetric 142 } 143 144 // exportedMetric identifies a metric that is exported and defines how it is rewritten before 145 // it is exported. 146 type exportedMetric struct { 147 rewriters []rewriteFunc 148 } 149 150 // rewriteFunc rewrites metrics before they are exported. 151 type rewriteFunc func(mf *dto.MetricFamily) (*dto.MetricFamily, error) 152 153 func (m *monitorGatherer) Gather() ([]*dto.MetricFamily, error) { 154 etcdMetrics, err := scrapeMetrics() 155 if err != nil { 156 return nil, err 157 } 158 exported, err := m.rewriteExportedMetrics(etcdMetrics) 159 if err != nil { 160 return nil, err 161 } 162 custom, err := customMetricRegistry.Gather() 163 if err != nil { 164 return nil, err 165 } 166 result := make([]*dto.MetricFamily, 0, len(exported)+len(custom)) 167 result = append(result, exported...) 168 result = append(result, custom...) 169 return result, nil 170 } 171 172 func (m *monitorGatherer) rewriteExportedMetrics(metrics map[string]*dto.MetricFamily) ([]*dto.MetricFamily, error) { 173 results := make([]*dto.MetricFamily, 0, len(metrics)) 174 for n, mf := range metrics { 175 if e, ok := m.exported[n]; ok { 176 // Apply rewrite rules for metrics that have them. 177 if e.rewriters == nil { 178 results = append(results, mf) 179 } else { 180 for _, rewriter := range e.rewriters { 181 new, err := rewriter(mf) 182 if err != nil { 183 return nil, err 184 } 185 results = append(results, new) 186 } 187 } 188 } else { 189 // Proxy all metrics without any rewrite rules directly. 190 results = append(results, mf) 191 } 192 } 193 return results, nil 194 } 195 196 // EtcdVersion struct for unmarshalling the json response from etcd's /version endpoint. 197 type EtcdVersion struct { 198 BinaryVersion string `json:"etcdserver"` 199 ClusterVersion string `json:"etcdcluster"` 200 } 201 202 // Function for fetching etcd version info and feeding it to the prometheus metric. 203 func getVersion(lastSeenBinaryVersion *string) error { 204 // Create the get request for the etcd version endpoint. 205 req, err := http.NewRequest("GET", etcdVersionScrapeURI, nil) 206 if err != nil { 207 return fmt.Errorf("failed to create GET request for etcd version: %v", err) 208 } 209 210 // Send the get request and receive a response. 211 client := &http.Client{} 212 resp, err := client.Do(req) 213 if err != nil { 214 return fmt.Errorf("failed to receive GET response for etcd version: %v", err) 215 } 216 defer resp.Body.Close() 217 218 // Obtain EtcdVersion from the JSON response. 219 var version EtcdVersion 220 if err := json.NewDecoder(resp.Body).Decode(&version); err != nil { 221 return fmt.Errorf("failed to decode etcd version JSON: %v", err) 222 } 223 224 // Return without updating the version if it stayed the same since last time. 225 if *lastSeenBinaryVersion == version.BinaryVersion { 226 return nil 227 } 228 229 // Delete the metric for the previous version. 230 if *lastSeenBinaryVersion != "" { 231 deleted := etcdVersion.Delete(metrics.Labels{"binary_version": *lastSeenBinaryVersion}) 232 if !deleted { 233 return errors.New("failed to delete previous version's metric") 234 } 235 } 236 237 // Record the new version in a metric. 238 etcdVersion.With(metrics.Labels{ 239 "binary_version": version.BinaryVersion, 240 }).Set(0) 241 *lastSeenBinaryVersion = version.BinaryVersion 242 return nil 243 } 244 245 // Periodically fetches etcd version info. 246 func getVersionPeriodically(stopCh <-chan struct{}) { 247 lastSeenBinaryVersion := "" 248 for { 249 if err := getVersion(&lastSeenBinaryVersion); err != nil { 250 klog.Errorf("Failed to fetch etcd version: %v", err) 251 } 252 select { 253 case <-stopCh: 254 return 255 case <-time.After(scrapeTimeout): 256 } 257 } 258 } 259 260 // scrapeMetrics scrapes the prometheus metrics from the etcd metrics URI. 261 func scrapeMetrics() (map[string]*dto.MetricFamily, error) { 262 req, err := http.NewRequest("GET", etcdMetricsScrapeURI, nil) 263 if err != nil { 264 return nil, fmt.Errorf("failed to create GET request for etcd metrics: %v", err) 265 } 266 267 // Send the get request and receive a response. 268 client := &http.Client{} 269 resp, err := client.Do(req) 270 if err != nil { 271 return nil, fmt.Errorf("failed to receive GET response for etcd metrics: %v", err) 272 } 273 defer resp.Body.Close() 274 275 return testutil.TextToMetricFamilies(resp.Body) 276 } 277 278 func renameMetric(mf *dto.MetricFamily, name string) { 279 mf.Name = &name 280 } 281 282 func renameLabels(mf *dto.MetricFamily, nameMapping map[string]string) { 283 for _, m := range mf.Metric { 284 for _, lbl := range m.Label { 285 if alias, ok := nameMapping[*lbl.Name]; ok { 286 lbl.Name = &alias 287 } 288 } 289 } 290 } 291 292 func filterMetricsByLabels(mf *dto.MetricFamily, labelValues map[string]string) { 293 buf := mf.Metric[:0] 294 for _, m := range mf.Metric { 295 shouldRemove := false 296 for _, lbl := range m.Label { 297 if val, ok := labelValues[*lbl.Name]; ok && val != *lbl.Value { 298 shouldRemove = true 299 break 300 } 301 } 302 if !shouldRemove { 303 buf = append(buf, m) 304 } 305 } 306 mf.Metric = buf 307 } 308 309 func groupCounterMetricsByLabels(mf *dto.MetricFamily, names map[string]bool) { 310 buf := mf.Metric[:0] 311 deleteLabels(mf, names) 312 byLabels := map[string]*dto.Metric{} 313 for _, m := range mf.Metric { 314 if metric, ok := byLabels[labelsKey(m.Label)]; ok { 315 metric.Counter.Value = proto.Float64(*metric.Counter.Value + *m.Counter.Value) 316 } else { 317 byLabels[labelsKey(m.Label)] = m 318 buf = append(buf, m) 319 } 320 } 321 mf.Metric = buf 322 } 323 324 func labelsKey(lbls []*dto.LabelPair) string { 325 var buf bytes.Buffer 326 for i, lbl := range lbls { 327 buf.WriteString(lbl.String()) 328 if i < len(lbls)-1 { 329 buf.WriteString(",") 330 } 331 } 332 return buf.String() 333 } 334 335 func deleteLabels(mf *dto.MetricFamily, names map[string]bool) { 336 for _, m := range mf.Metric { 337 buf := m.Label[:0] 338 for _, lbl := range m.Label { 339 shouldRemove := names[*lbl.Name] 340 if !shouldRemove { 341 buf = append(buf, lbl) 342 } 343 } 344 m.Label = buf 345 } 346 } 347 348 func identity(mf *dto.MetricFamily) (*dto.MetricFamily, error) { 349 return mf, nil 350 } 351 352 func deepCopyMetricFamily(mf *dto.MetricFamily) *dto.MetricFamily { 353 r := &dto.MetricFamily{} 354 r.Name = mf.Name 355 r.Help = mf.Help 356 r.Type = mf.Type 357 r.Metric = make([]*dto.Metric, len(mf.Metric)) 358 for i, m := range mf.Metric { 359 r.Metric[i] = deepCopyMetric(m) 360 } 361 return r 362 } 363 364 func deepCopyMetric(m *dto.Metric) *dto.Metric { 365 r := &dto.Metric{} 366 r.Label = make([]*dto.LabelPair, len(m.Label)) 367 for i, lp := range m.Label { 368 r.Label[i] = deepCopyLabelPair(lp) 369 } 370 r.Gauge = m.Gauge 371 r.Counter = m.Counter 372 r.Summary = m.Summary 373 r.Untyped = m.Untyped 374 r.Histogram = m.Histogram 375 r.TimestampMs = m.TimestampMs 376 return r 377 } 378 379 func deepCopyLabelPair(lp *dto.LabelPair) *dto.LabelPair { 380 r := &dto.LabelPair{} 381 r.Name = lp.Name 382 r.Value = lp.Value 383 return r 384 } 385 386 func main() { 387 // Register the commandline flags passed to the tool. 388 registerFlags(pflag.CommandLine) 389 pflag.CommandLine.AddGoFlagSet(goflag.CommandLine) 390 pflag.Parse() 391 392 // Register the metrics we defined above with prometheus. 393 customMetricRegistry.MustRegister(etcdVersion) 394 395 // Spawn threads for periodically scraping etcd version metrics. 396 stopCh := make(chan struct{}) 397 defer close(stopCh) 398 go getVersionPeriodically(stopCh) 399 400 // Serve our metrics on listenAddress/metricsPath. 401 klog.Infof("Listening on: %v", listenAddress) 402 http.Handle(metricsPath, metrics.HandlerFor(gatherer, metrics.HandlerOpts{})) 403 klog.Errorf("Stopped listening/serving metrics: %v", http.ListenAndServe(listenAddress, nil)) 404 }