k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/etcd_metrics.go (about)

     1  /*
     2  Copyright 2018 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package common
    18  
    19  import (
    20  	"fmt"
    21  	"math"
    22  	"os"
    23  	"sync"
    24  	"time"
    25  
    26  	"github.com/prometheus/common/model"
    27  	"k8s.io/klog/v2"
    28  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    29  	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
    30  	"k8s.io/perf-tests/clusterloader2/pkg/provider"
    31  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    32  )
    33  
    34  const (
    35  	etcdMetricsMetricName = "EtcdMetrics"
    36  )
    37  
    38  func init() {
    39  	if err := measurement.Register(etcdMetricsMetricName, createEtcdMetricsMeasurement); err != nil {
    40  		klog.Fatalf("Cannot register %s: %v", etcdMetricsMetricName, err)
    41  	}
    42  }
    43  
    44  func createEtcdMetricsMeasurement() measurement.Measurement {
    45  	return &etcdMetricsMeasurement{
    46  		stopCh:  make(chan struct{}),
    47  		wg:      &sync.WaitGroup{},
    48  		metrics: newEtcdMetrics(),
    49  	}
    50  }
    51  
// etcdMetricsMeasurement collects etcd metrics from the cluster masters.
//
// The embedded Mutex is held while metrics is updated. isRunning is set when
// collection starts and cleared by Dispose. stopCh is closed by Dispose to make
// the collector goroutines exit, and wg lets Dispose wait for them. metrics
// accumulates the values that are reported in the summary.
type etcdMetricsMeasurement struct {
	sync.Mutex
	isRunning bool
	stopCh    chan struct{}
	wg        *sync.WaitGroup
	metrics   *etcdMetrics
}
    59  
    60  // Execute supports two actions:
    61  // - start - Starts collecting etcd metrics.
    62  // - gather - Gathers and prints etcd metrics summary.
    63  func (e *etcdMetricsMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
    64  	provider := config.ClusterFramework.GetClusterConfig().Provider
    65  	// Etcd is only exposed on localhost level. We are using ssh method
    66  	if !provider.Features().SupportSSHToMaster {
    67  		klog.Warningf("not grabbing etcd metrics through master SSH: unsupported for provider, %s", config.ClusterFramework.GetClusterConfig().Provider.Name())
    68  		return nil, nil
    69  	}
    70  
    71  	action, err := util.GetString(config.Params, "action")
    72  	if err != nil {
    73  		return nil, err
    74  	}
    75  
    76  	hosts := config.ClusterFramework.GetClusterConfig().MasterIPs
    77  	if len(hosts) < 1 {
    78  		klog.Warningf("ETCD measurements will be disabled due to no MasterIps: %v", hosts)
    79  		return nil, nil
    80  	}
    81  
    82  	etcdInsecurePort := config.ClusterFramework.GetClusterConfig().EtcdInsecurePort
    83  	switch action {
    84  	case "start":
    85  		klog.V(2).Infof("%s: starting etcd metrics collecting...", e)
    86  		waitTime, err := util.GetDurationOrDefault(config.Params, "waitTime", time.Minute)
    87  		if err != nil {
    88  			return nil, err
    89  		}
    90  		for _, h := range hosts {
    91  			e.startCollecting(h, provider, waitTime, etcdInsecurePort)
    92  		}
    93  		return nil, nil
    94  	case "gather":
    95  		for _, h := range hosts {
    96  			if err = e.stopAndSummarize(h, provider, etcdInsecurePort); err != nil {
    97  				return nil, err
    98  			}
    99  		}
   100  		content, err := util.PrettyPrintJSON(e.metrics)
   101  		if err != nil {
   102  			return nil, err
   103  		}
   104  		summary := measurement.CreateSummary(etcdMetricsMetricName, "json", content)
   105  		return []measurement.Summary{summary}, nil
   106  	default:
   107  		return nil, fmt.Errorf("unknown action %v", action)
   108  	}
   109  }
   110  
   111  // Dispose cleans up after the measurement.
   112  func (e *etcdMetricsMeasurement) Dispose() {
   113  	if e.isRunning {
   114  		e.isRunning = false
   115  		close(e.stopCh)
   116  		e.wg.Wait()
   117  	}
   118  }
   119  
   120  func (e *etcdMetricsMeasurement) String() string {
   121  	return etcdMetricsMetricName
   122  }
   123  
   124  func (e *etcdMetricsMeasurement) startCollecting(host string, provider provider.Provider, interval time.Duration, port int) {
   125  	e.isRunning = true
   126  	e.wg.Add(1)
   127  
   128  	collectEtcdDatabaseSize := func() error {
   129  		dbSize, err := e.getEtcdDatabaseSize(host, provider, port)
   130  		if err != nil {
   131  			return err
   132  		}
   133  
   134  		e.Lock()
   135  		defer e.Unlock()
   136  		e.metrics.MaxDatabaseSize = math.Max(e.metrics.MaxDatabaseSize, dbSize)
   137  
   138  		return nil
   139  	}
   140  	go func() {
   141  		defer e.wg.Done()
   142  		for {
   143  			select {
   144  			case <-time.After(interval):
   145  				err := collectEtcdDatabaseSize()
   146  				if err != nil {
   147  					klog.Errorf("%s: failed to collect etcd database size", e)
   148  					continue
   149  				}
   150  			case <-e.stopCh:
   151  				return
   152  			}
   153  		}
   154  	}()
   155  }
   156  
   157  func (e *etcdMetricsMeasurement) stopAndSummarize(host string, provider provider.Provider, port int) error {
   158  	defer e.Dispose()
   159  	// Do some one-off collection of metrics.
   160  	samples, err := e.getEtcdMetrics(host, provider, port)
   161  	if err != nil {
   162  		return err
   163  	}
   164  
   165  	collectEtcdMetrics := func(sample *model.Sample) {
   166  		var hist *measurementutil.HistogramVec
   167  		switch sample.Metric[model.MetricNameLabel] {
   168  		case "etcd_disk_backend_commit_duration_seconds_bucket":
   169  			hist = &e.metrics.BackendCommitDuration
   170  		case "etcd_debugging_snap_save_total_duration_seconds_bucket":
   171  			hist = &e.metrics.SnapshotSaveTotalDuration
   172  		case "etcd_disk_wal_fsync_duration_seconds_bucket":
   173  			hist = &e.metrics.WalFsyncDuration
   174  		case "etcd_network_peer_round_trip_time_seconds_bucket":
   175  			hist = &e.metrics.PeerRoundTripTime
   176  		default:
   177  			return
   178  		}
   179  
   180  		e.Lock()
   181  		measurementutil.ConvertSampleToBucket(sample, hist)
   182  		e.Unlock()
   183  	}
   184  	for _, sample := range samples {
   185  		collectEtcdMetrics(sample)
   186  	}
   187  	return nil
   188  }
   189  
   190  func (e *etcdMetricsMeasurement) getEtcdMetrics(host string, provider provider.Provider, port int) ([]*model.Sample, error) {
   191  
   192  	// In https://github.com/kubernetes/kubernetes/pull/74690, mTLS is enabled for etcd server
   193  	// in order to bypass TLS credential requirement when checking etc /metrics and /health, you
   194  	// need to provide the insecure http port number to access etcd, http://localhost:2382 for
   195  	// example.
   196  	cmd := fmt.Sprintf("curl http://localhost:%d/metrics", port)
   197  	samples, err := e.sshEtcdMetrics(cmd, host, provider)
   198  	if err == nil {
   199  		return samples, nil
   200  	}
   201  	klog.Warningf("%s: call on %d port (%s) failed due to %v. Falling back to default 2379 port.", e, port, cmd, err)
   202  
   203  	// Use old endpoint if new one fails, "2379" is hard-coded here as well, it is kept as is since
   204  	// we don't want to bloat the cluster config only for a fall-back attempt.
   205  	etcdCert, etcdKey, etcdHost := os.Getenv("ETCD_CERTIFICATE"), os.Getenv("ETCD_KEY"), os.Getenv("ETCD_HOST")
   206  	if etcdHost == "" {
   207  		etcdHost = "localhost"
   208  	}
   209  	if etcdCert == "" || etcdKey == "" {
   210  		klog.Warning("empty etcd cert or key, using http")
   211  		cmd = fmt.Sprintf("curl http://%s:2379/metrics", etcdHost)
   212  	} else {
   213  		cmd = fmt.Sprintf("curl -k --cert %s --key %s https://%s:2379/metrics", etcdCert, etcdKey, etcdHost)
   214  	}
   215  
   216  	return e.sshEtcdMetrics(cmd, host, provider)
   217  }
   218  
   219  func (e *etcdMetricsMeasurement) sshEtcdMetrics(cmd, host string, provider provider.Provider) ([]*model.Sample, error) {
   220  	sshResult, err := measurementutil.SSH(cmd, host+":22", provider)
   221  	if err != nil {
   222  		return nil, fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err)
   223  	} else if sshResult.Code != 0 {
   224  		return nil, fmt.Errorf("failed running command: %s on the host: %s, result: %+v", cmd, host, sshResult)
   225  	}
   226  	data := sshResult.Stdout
   227  
   228  	return measurementutil.ExtractMetricSamples(data)
   229  }
   230  
   231  func (e *etcdMetricsMeasurement) getEtcdDatabaseSize(host string, provider provider.Provider, port int) (float64, error) {
   232  	samples, err := e.getEtcdMetrics(host, provider, port)
   233  	if err != nil {
   234  		return 0, err
   235  	}
   236  	for _, sample := range samples {
   237  		if sample.Metric[model.MetricNameLabel] == "etcd_debugging_mvcc_db_total_size_in_bytes" ||
   238  			sample.Metric[model.MetricNameLabel] == "etcd_mvcc_db_total_size_in_bytes" {
   239  			return float64(sample.Value), nil
   240  		}
   241  	}
   242  	return 0, fmt.Errorf("couldn't find etcd database size metric")
   243  }
   244  
// etcdMetrics is the payload serialized to JSON as the measurement summary.
//
// The four histogram fields are filled during gather from the etcd bucket
// metrics etcd_disk_backend_commit_duration_seconds,
// etcd_debugging_snap_save_total_duration_seconds,
// etcd_network_peer_round_trip_time_seconds and
// etcd_disk_wal_fsync_duration_seconds respectively. MaxDatabaseSize is the
// largest database size value observed by the periodic collectors.
type etcdMetrics struct {
	BackendCommitDuration     measurementutil.HistogramVec `json:"backendCommitDuration"`
	SnapshotSaveTotalDuration measurementutil.HistogramVec `json:"snapshotSaveTotalDuration"`
	PeerRoundTripTime         measurementutil.HistogramVec `json:"peerRoundTripTime"`
	WalFsyncDuration          measurementutil.HistogramVec `json:"walFsyncDuration"`
	MaxDatabaseSize           float64                      `json:"maxDatabaseSize"`
}
   252  
   253  func newEtcdMetrics() *etcdMetrics {
   254  	return &etcdMetrics{
   255  		BackendCommitDuration:     make(measurementutil.HistogramVec, 0),
   256  		SnapshotSaveTotalDuration: make(measurementutil.HistogramVec, 0),
   257  		PeerRoundTripTime:         make(measurementutil.HistogramVec, 0),
   258  		WalFsyncDuration:          make(measurementutil.HistogramVec, 0),
   259  	}
   260  }