k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/cluster/images/etcd-version-monitor/etcd-version-monitor.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package main
    18  
    19  import (
    20  	"bytes"
    21  	"encoding/json"
    22  	"errors"
    23  	goflag "flag"
    24  	"fmt"
    25  	"net/http"
    26  	"time"
    27  
    28  	"github.com/gogo/protobuf/proto"
    29  	dto "github.com/prometheus/client_model/go"
    30  	"github.com/spf13/pflag"
    31  
    32  	"k8s.io/component-base/metrics"
    33  	"k8s.io/component-base/metrics/testutil"
    34  	"k8s.io/klog/v2"
    35  )
    36  
    37  // Initialize the prometheus instrumentation and client related flags.
    38  var (
    39  	listenAddress        string
    40  	metricsPath          string
    41  	etcdVersionScrapeURI string
    42  	etcdMetricsScrapeURI string
    43  	scrapeTimeout        time.Duration
    44  )
    45  
    46  func registerFlags(fs *pflag.FlagSet) {
    47  	fs.StringVar(&listenAddress, "listen-address", "localhost:9101", "Address to listen on for serving prometheus metrics")
    48  	fs.StringVar(&metricsPath, "metrics-path", "/metrics", "Path under which prometheus metrics are to be served")
    49  	fs.StringVar(&etcdVersionScrapeURI, "etcd-version-scrape-uri", "http://localhost:2379/version", "URI to scrape etcd version info")
    50  	fs.StringVar(&etcdMetricsScrapeURI, "etcd-metrics-scrape-uri", "http://localhost:2379/metrics", "URI to scrape etcd metrics")
    51  	fs.DurationVar(&scrapeTimeout, "scrape-timeout", 15*time.Second, "Timeout for trying to get stats from etcd")
    52  }
    53  
    54  const (
    55  	namespace = "etcd" // For prefixing prometheus metrics
    56  )
    57  
    58  // Initialize prometheus metrics to be exported.
    59  var (
    60  	// Register all custom metrics with a dedicated registry to keep them separate.
    61  	customMetricRegistry = metrics.NewKubeRegistry()
    62  
    63  	// Custom etcd version metric since etcd 3.2- does not export one.
    64  	// This will be replaced by https://github.com/etcd-io/etcd/pull/8960 in etcd 3.3.
    65  	etcdVersion = metrics.NewGaugeVec(
    66  		&metrics.GaugeOpts{
    67  			Namespace:      namespace,
    68  			Name:           "version_info",
    69  			Help:           "Etcd server's binary version",
    70  			StabilityLevel: metrics.ALPHA,
    71  		},
    72  		[]string{"binary_version"})
    73  
    74  	gatherer = &monitorGatherer{
    75  		// Rewrite rules for etcd metrics that are exported by default.
    76  		exported: map[string]*exportedMetric{
    77  			// etcd 3.0 metric format for total grpc requests with renamed method and service labels.
    78  			"etcd_grpc_requests_total": {
    79  				rewriters: []rewriteFunc{
    80  					func(mf *dto.MetricFamily) (*dto.MetricFamily, error) {
    81  						mf = deepCopyMetricFamily(mf)
    82  						renameLabels(mf, map[string]string{
    83  							"grpc_method":  "method",
    84  							"grpc_service": "service",
    85  						})
    86  						return mf, nil
    87  					},
    88  				},
    89  			},
    90  			// etcd 3.1+ metric format for total grpc requests.
    91  			"grpc_server_handled_total": {
    92  				rewriters: []rewriteFunc{
    93  					// Export the metric exactly as-is. For 3.1+ metrics, we will
    94  					// pass all metrics directly through.
    95  					identity,
    96  					// Write to the etcd 3.0 metric format for backward compatibility.
    97  					func(mf *dto.MetricFamily) (*dto.MetricFamily, error) {
    98  						mf = deepCopyMetricFamily(mf)
    99  						renameMetric(mf, "etcd_grpc_requests_total")
   100  						renameLabels(mf, map[string]string{
   101  							"grpc_method":  "method",
   102  							"grpc_service": "service",
   103  						})
   104  						filterMetricsByLabels(mf, map[string]string{
   105  							"grpc_type": "unary",
   106  						})
   107  						groupCounterMetricsByLabels(mf, map[string]bool{
   108  							"grpc_type": true,
   109  							"grpc_code": true,
   110  						})
   111  						return mf, nil
   112  					},
   113  				},
   114  			},
   115  
   116  			// etcd 3.0 metric format for grpc request latencies,
   117  			// rewritten to the etcd 3.1+ format.
   118  			"etcd_grpc_unary_requests_duration_seconds": {
   119  				rewriters: []rewriteFunc{
   120  					func(mf *dto.MetricFamily) (*dto.MetricFamily, error) {
   121  						mf = deepCopyMetricFamily(mf)
   122  						renameMetric(mf, "grpc_server_handling_seconds")
   123  						tpeName := "grpc_type"
   124  						tpeVal := "unary"
   125  						for _, m := range mf.Metric {
   126  							m.Label = append(m.Label, &dto.LabelPair{Name: &tpeName, Value: &tpeVal})
   127  						}
   128  						return mf, nil
   129  					},
   130  				},
   131  			},
   132  			// etcd 3.1+ metric format for total grpc requests.
   133  			"grpc_server_handling_seconds": {},
   134  		},
   135  	}
   136  )
   137  
   138  // monitorGatherer is a custom metric gatherer for prometheus that exports custom metrics
   139  // defined by this monitor as well as rewritten etcd metrics.
   140  type monitorGatherer struct {
   141  	exported map[string]*exportedMetric
   142  }
   143  
   144  // exportedMetric identifies a metric that is exported and defines how it is rewritten before
   145  // it is exported.
   146  type exportedMetric struct {
   147  	rewriters []rewriteFunc
   148  }
   149  
   150  // rewriteFunc rewrites metrics before they are exported.
   151  type rewriteFunc func(mf *dto.MetricFamily) (*dto.MetricFamily, error)
   152  
   153  func (m *monitorGatherer) Gather() ([]*dto.MetricFamily, error) {
   154  	etcdMetrics, err := scrapeMetrics()
   155  	if err != nil {
   156  		return nil, err
   157  	}
   158  	exported, err := m.rewriteExportedMetrics(etcdMetrics)
   159  	if err != nil {
   160  		return nil, err
   161  	}
   162  	custom, err := customMetricRegistry.Gather()
   163  	if err != nil {
   164  		return nil, err
   165  	}
   166  	result := make([]*dto.MetricFamily, 0, len(exported)+len(custom))
   167  	result = append(result, exported...)
   168  	result = append(result, custom...)
   169  	return result, nil
   170  }
   171  
   172  func (m *monitorGatherer) rewriteExportedMetrics(metrics map[string]*dto.MetricFamily) ([]*dto.MetricFamily, error) {
   173  	results := make([]*dto.MetricFamily, 0, len(metrics))
   174  	for n, mf := range metrics {
   175  		if e, ok := m.exported[n]; ok {
   176  			// Apply rewrite rules for metrics that have them.
   177  			if e.rewriters == nil {
   178  				results = append(results, mf)
   179  			} else {
   180  				for _, rewriter := range e.rewriters {
   181  					new, err := rewriter(mf)
   182  					if err != nil {
   183  						return nil, err
   184  					}
   185  					results = append(results, new)
   186  				}
   187  			}
   188  		} else {
   189  			// Proxy all metrics without any rewrite rules directly.
   190  			results = append(results, mf)
   191  		}
   192  	}
   193  	return results, nil
   194  }
   195  
   196  // EtcdVersion struct for unmarshalling the json response from etcd's /version endpoint.
   197  type EtcdVersion struct {
   198  	BinaryVersion  string `json:"etcdserver"`
   199  	ClusterVersion string `json:"etcdcluster"`
   200  }
   201  
   202  // Function for fetching etcd version info and feeding it to the prometheus metric.
   203  func getVersion(lastSeenBinaryVersion *string) error {
   204  	// Create the get request for the etcd version endpoint.
   205  	req, err := http.NewRequest("GET", etcdVersionScrapeURI, nil)
   206  	if err != nil {
   207  		return fmt.Errorf("failed to create GET request for etcd version: %v", err)
   208  	}
   209  
   210  	// Send the get request and receive a response.
   211  	client := &http.Client{}
   212  	resp, err := client.Do(req)
   213  	if err != nil {
   214  		return fmt.Errorf("failed to receive GET response for etcd version: %v", err)
   215  	}
   216  	defer resp.Body.Close()
   217  
   218  	// Obtain EtcdVersion from the JSON response.
   219  	var version EtcdVersion
   220  	if err := json.NewDecoder(resp.Body).Decode(&version); err != nil {
   221  		return fmt.Errorf("failed to decode etcd version JSON: %v", err)
   222  	}
   223  
   224  	// Return without updating the version if it stayed the same since last time.
   225  	if *lastSeenBinaryVersion == version.BinaryVersion {
   226  		return nil
   227  	}
   228  
   229  	// Delete the metric for the previous version.
   230  	if *lastSeenBinaryVersion != "" {
   231  		deleted := etcdVersion.Delete(metrics.Labels{"binary_version": *lastSeenBinaryVersion})
   232  		if !deleted {
   233  			return errors.New("failed to delete previous version's metric")
   234  		}
   235  	}
   236  
   237  	// Record the new version in a metric.
   238  	etcdVersion.With(metrics.Labels{
   239  		"binary_version": version.BinaryVersion,
   240  	}).Set(0)
   241  	*lastSeenBinaryVersion = version.BinaryVersion
   242  	return nil
   243  }
   244  
   245  // Periodically fetches etcd version info.
   246  func getVersionPeriodically(stopCh <-chan struct{}) {
   247  	lastSeenBinaryVersion := ""
   248  	for {
   249  		if err := getVersion(&lastSeenBinaryVersion); err != nil {
   250  			klog.Errorf("Failed to fetch etcd version: %v", err)
   251  		}
   252  		select {
   253  		case <-stopCh:
   254  			return
   255  		case <-time.After(scrapeTimeout):
   256  		}
   257  	}
   258  }
   259  
   260  // scrapeMetrics scrapes the prometheus metrics from the etcd metrics URI.
   261  func scrapeMetrics() (map[string]*dto.MetricFamily, error) {
   262  	req, err := http.NewRequest("GET", etcdMetricsScrapeURI, nil)
   263  	if err != nil {
   264  		return nil, fmt.Errorf("failed to create GET request for etcd metrics: %v", err)
   265  	}
   266  
   267  	// Send the get request and receive a response.
   268  	client := &http.Client{}
   269  	resp, err := client.Do(req)
   270  	if err != nil {
   271  		return nil, fmt.Errorf("failed to receive GET response for etcd metrics: %v", err)
   272  	}
   273  	defer resp.Body.Close()
   274  
   275  	return testutil.TextToMetricFamilies(resp.Body)
   276  }
   277  
   278  func renameMetric(mf *dto.MetricFamily, name string) {
   279  	mf.Name = &name
   280  }
   281  
   282  func renameLabels(mf *dto.MetricFamily, nameMapping map[string]string) {
   283  	for _, m := range mf.Metric {
   284  		for _, lbl := range m.Label {
   285  			if alias, ok := nameMapping[*lbl.Name]; ok {
   286  				lbl.Name = &alias
   287  			}
   288  		}
   289  	}
   290  }
   291  
   292  func filterMetricsByLabels(mf *dto.MetricFamily, labelValues map[string]string) {
   293  	buf := mf.Metric[:0]
   294  	for _, m := range mf.Metric {
   295  		shouldRemove := false
   296  		for _, lbl := range m.Label {
   297  			if val, ok := labelValues[*lbl.Name]; ok && val != *lbl.Value {
   298  				shouldRemove = true
   299  				break
   300  			}
   301  		}
   302  		if !shouldRemove {
   303  			buf = append(buf, m)
   304  		}
   305  	}
   306  	mf.Metric = buf
   307  }
   308  
   309  func groupCounterMetricsByLabels(mf *dto.MetricFamily, names map[string]bool) {
   310  	buf := mf.Metric[:0]
   311  	deleteLabels(mf, names)
   312  	byLabels := map[string]*dto.Metric{}
   313  	for _, m := range mf.Metric {
   314  		if metric, ok := byLabels[labelsKey(m.Label)]; ok {
   315  			metric.Counter.Value = proto.Float64(*metric.Counter.Value + *m.Counter.Value)
   316  		} else {
   317  			byLabels[labelsKey(m.Label)] = m
   318  			buf = append(buf, m)
   319  		}
   320  	}
   321  	mf.Metric = buf
   322  }
   323  
   324  func labelsKey(lbls []*dto.LabelPair) string {
   325  	var buf bytes.Buffer
   326  	for i, lbl := range lbls {
   327  		buf.WriteString(lbl.String())
   328  		if i < len(lbls)-1 {
   329  			buf.WriteString(",")
   330  		}
   331  	}
   332  	return buf.String()
   333  }
   334  
   335  func deleteLabels(mf *dto.MetricFamily, names map[string]bool) {
   336  	for _, m := range mf.Metric {
   337  		buf := m.Label[:0]
   338  		for _, lbl := range m.Label {
   339  			shouldRemove := names[*lbl.Name]
   340  			if !shouldRemove {
   341  				buf = append(buf, lbl)
   342  			}
   343  		}
   344  		m.Label = buf
   345  	}
   346  }
   347  
   348  func identity(mf *dto.MetricFamily) (*dto.MetricFamily, error) {
   349  	return mf, nil
   350  }
   351  
   352  func deepCopyMetricFamily(mf *dto.MetricFamily) *dto.MetricFamily {
   353  	r := &dto.MetricFamily{}
   354  	r.Name = mf.Name
   355  	r.Help = mf.Help
   356  	r.Type = mf.Type
   357  	r.Metric = make([]*dto.Metric, len(mf.Metric))
   358  	for i, m := range mf.Metric {
   359  		r.Metric[i] = deepCopyMetric(m)
   360  	}
   361  	return r
   362  }
   363  
   364  func deepCopyMetric(m *dto.Metric) *dto.Metric {
   365  	r := &dto.Metric{}
   366  	r.Label = make([]*dto.LabelPair, len(m.Label))
   367  	for i, lp := range m.Label {
   368  		r.Label[i] = deepCopyLabelPair(lp)
   369  	}
   370  	r.Gauge = m.Gauge
   371  	r.Counter = m.Counter
   372  	r.Summary = m.Summary
   373  	r.Untyped = m.Untyped
   374  	r.Histogram = m.Histogram
   375  	r.TimestampMs = m.TimestampMs
   376  	return r
   377  }
   378  
   379  func deepCopyLabelPair(lp *dto.LabelPair) *dto.LabelPair {
   380  	r := &dto.LabelPair{}
   381  	r.Name = lp.Name
   382  	r.Value = lp.Value
   383  	return r
   384  }
   385  
   386  func main() {
   387  	// Register the commandline flags passed to the tool.
   388  	registerFlags(pflag.CommandLine)
   389  	pflag.CommandLine.AddGoFlagSet(goflag.CommandLine)
   390  	pflag.Parse()
   391  
   392  	// Register the metrics we defined above with prometheus.
   393  	customMetricRegistry.MustRegister(etcdVersion)
   394  
   395  	// Spawn threads for periodically scraping etcd version metrics.
   396  	stopCh := make(chan struct{})
   397  	defer close(stopCh)
   398  	go getVersionPeriodically(stopCh)
   399  
   400  	// Serve our metrics on listenAddress/metricsPath.
   401  	klog.Infof("Listening on: %v", listenAddress)
   402  	http.Handle(metricsPath, metrics.HandlerFor(gatherer, metrics.HandlerOpts{}))
   403  	klog.Errorf("Stopped listening/serving metrics: %v", http.ListenAndServe(listenAddress, nil))
   404  }