k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/etcd_metrics.go (about) 1 /* 2 Copyright 2018 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package common 18 19 import ( 20 "fmt" 21 "math" 22 "os" 23 "sync" 24 "time" 25 26 "github.com/prometheus/common/model" 27 "k8s.io/klog/v2" 28 "k8s.io/perf-tests/clusterloader2/pkg/measurement" 29 measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util" 30 "k8s.io/perf-tests/clusterloader2/pkg/provider" 31 "k8s.io/perf-tests/clusterloader2/pkg/util" 32 ) 33 34 const ( 35 etcdMetricsMetricName = "EtcdMetrics" 36 ) 37 38 func init() { 39 if err := measurement.Register(etcdMetricsMetricName, createEtcdMetricsMeasurement); err != nil { 40 klog.Fatalf("Cannot register %s: %v", etcdMetricsMetricName, err) 41 } 42 } 43 44 func createEtcdMetricsMeasurement() measurement.Measurement { 45 return &etcdMetricsMeasurement{ 46 stopCh: make(chan struct{}), 47 wg: &sync.WaitGroup{}, 48 metrics: newEtcdMetrics(), 49 } 50 } 51 52 type etcdMetricsMeasurement struct { 53 sync.Mutex 54 isRunning bool 55 stopCh chan struct{} 56 wg *sync.WaitGroup 57 metrics *etcdMetrics 58 } 59 60 // Execute supports two actions: 61 // - start - Starts collecting etcd metrics. 62 // - gather - Gathers and prints etcd metrics summary. 63 func (e *etcdMetricsMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) { 64 provider := config.ClusterFramework.GetClusterConfig().Provider 65 // Etcd is only exposed on localhost level. We are using ssh method 66 if !provider.Features().SupportSSHToMaster { 67 klog.Warningf("not grabbing etcd metrics through master SSH: unsupported for provider, %s", config.ClusterFramework.GetClusterConfig().Provider.Name()) 68 return nil, nil 69 } 70 71 action, err := util.GetString(config.Params, "action") 72 if err != nil { 73 return nil, err 74 } 75 76 hosts := config.ClusterFramework.GetClusterConfig().MasterIPs 77 if len(hosts) < 1 { 78 klog.Warningf("ETCD measurements will be disabled due to no MasterIps: %v", hosts) 79 return nil, nil 80 } 81 82 etcdInsecurePort := config.ClusterFramework.GetClusterConfig().EtcdInsecurePort 83 switch action { 84 case "start": 85 klog.V(2).Infof("%s: starting etcd metrics collecting...", e) 86 waitTime, err := util.GetDurationOrDefault(config.Params, "waitTime", time.Minute) 87 if err != nil { 88 return nil, err 89 } 90 for _, h := range hosts { 91 e.startCollecting(h, provider, waitTime, etcdInsecurePort) 92 } 93 return nil, nil 94 case "gather": 95 for _, h := range hosts { 96 if err = e.stopAndSummarize(h, provider, etcdInsecurePort); err != nil { 97 return nil, err 98 } 99 } 100 content, err := util.PrettyPrintJSON(e.metrics) 101 if err != nil { 102 return nil, err 103 } 104 summary := measurement.CreateSummary(etcdMetricsMetricName, "json", content) 105 return []measurement.Summary{summary}, nil 106 default: 107 return nil, fmt.Errorf("unknown action %v", action) 108 } 109 } 110 111 // Dispose cleans up after the measurement. 112 func (e *etcdMetricsMeasurement) Dispose() { 113 if e.isRunning { 114 e.isRunning = false 115 close(e.stopCh) 116 e.wg.Wait() 117 } 118 } 119 120 func (e *etcdMetricsMeasurement) String() string { 121 return etcdMetricsMetricName 122 } 123 124 func (e *etcdMetricsMeasurement) startCollecting(host string, provider provider.Provider, interval time.Duration, port int) { 125 e.isRunning = true 126 e.wg.Add(1) 127 128 collectEtcdDatabaseSize := func() error { 129 dbSize, err := e.getEtcdDatabaseSize(host, provider, port) 130 if err != nil { 131 return err 132 } 133 134 e.Lock() 135 defer e.Unlock() 136 e.metrics.MaxDatabaseSize = math.Max(e.metrics.MaxDatabaseSize, dbSize) 137 138 return nil 139 } 140 go func() { 141 defer e.wg.Done() 142 for { 143 select { 144 case <-time.After(interval): 145 err := collectEtcdDatabaseSize() 146 if err != nil { 147 klog.Errorf("%s: failed to collect etcd database size", e) 148 continue 149 } 150 case <-e.stopCh: 151 return 152 } 153 } 154 }() 155 } 156 157 func (e *etcdMetricsMeasurement) stopAndSummarize(host string, provider provider.Provider, port int) error { 158 defer e.Dispose() 159 // Do some one-off collection of metrics. 160 samples, err := e.getEtcdMetrics(host, provider, port) 161 if err != nil { 162 return err 163 } 164 165 collectEtcdMetrics := func(sample *model.Sample) { 166 var hist *measurementutil.HistogramVec 167 switch sample.Metric[model.MetricNameLabel] { 168 case "etcd_disk_backend_commit_duration_seconds_bucket": 169 hist = &e.metrics.BackendCommitDuration 170 case "etcd_debugging_snap_save_total_duration_seconds_bucket": 171 hist = &e.metrics.SnapshotSaveTotalDuration 172 case "etcd_disk_wal_fsync_duration_seconds_bucket": 173 hist = &e.metrics.WalFsyncDuration 174 case "etcd_network_peer_round_trip_time_seconds_bucket": 175 hist = &e.metrics.PeerRoundTripTime 176 default: 177 return 178 } 179 180 e.Lock() 181 measurementutil.ConvertSampleToBucket(sample, hist) 182 e.Unlock() 183 } 184 for _, sample := range samples { 185 collectEtcdMetrics(sample) 186 } 187 return nil 188 } 189 190 func (e *etcdMetricsMeasurement) getEtcdMetrics(host string, provider provider.Provider, port int) ([]*model.Sample, error) { 191 192 // In https://github.com/kubernetes/kubernetes/pull/74690, mTLS is enabled for etcd server 193 // in order to bypass TLS credential requirement when checking etc /metrics and /health, you 194 // need to provide the insecure http port number to access etcd, http://localhost:2382 for 195 // example. 196 cmd := fmt.Sprintf("curl http://localhost:%d/metrics", port) 197 samples, err := e.sshEtcdMetrics(cmd, host, provider) 198 if err == nil { 199 return samples, nil 200 } 201 klog.Warningf("%s: call on %d port (%s) failed due to %v. Falling back to default 2379 port.", e, port, cmd, err) 202 203 // Use old endpoint if new one fails, "2379" is hard-coded here as well, it is kept as is since 204 // we don't want to bloat the cluster config only for a fall-back attempt. 205 etcdCert, etcdKey, etcdHost := os.Getenv("ETCD_CERTIFICATE"), os.Getenv("ETCD_KEY"), os.Getenv("ETCD_HOST") 206 if etcdHost == "" { 207 etcdHost = "localhost" 208 } 209 if etcdCert == "" || etcdKey == "" { 210 klog.Warning("empty etcd cert or key, using http") 211 cmd = fmt.Sprintf("curl http://%s:2379/metrics", etcdHost) 212 } else { 213 cmd = fmt.Sprintf("curl -k --cert %s --key %s https://%s:2379/metrics", etcdCert, etcdKey, etcdHost) 214 } 215 216 return e.sshEtcdMetrics(cmd, host, provider) 217 } 218 219 func (e *etcdMetricsMeasurement) sshEtcdMetrics(cmd, host string, provider provider.Provider) ([]*model.Sample, error) { 220 sshResult, err := measurementutil.SSH(cmd, host+":22", provider) 221 if err != nil { 222 return nil, fmt.Errorf("unexpected error (code: %d) in ssh connection to master: %#v", sshResult.Code, err) 223 } else if sshResult.Code != 0 { 224 return nil, fmt.Errorf("failed running command: %s on the host: %s, result: %+v", cmd, host, sshResult) 225 } 226 data := sshResult.Stdout 227 228 return measurementutil.ExtractMetricSamples(data) 229 } 230 231 func (e *etcdMetricsMeasurement) getEtcdDatabaseSize(host string, provider provider.Provider, port int) (float64, error) { 232 samples, err := e.getEtcdMetrics(host, provider, port) 233 if err != nil { 234 return 0, err 235 } 236 for _, sample := range samples { 237 if sample.Metric[model.MetricNameLabel] == "etcd_debugging_mvcc_db_total_size_in_bytes" || 238 sample.Metric[model.MetricNameLabel] == "etcd_mvcc_db_total_size_in_bytes" { 239 return float64(sample.Value), nil 240 } 241 } 242 return 0, fmt.Errorf("couldn't find etcd database size metric") 243 } 244 245 type etcdMetrics struct { 246 BackendCommitDuration measurementutil.HistogramVec `json:"backendCommitDuration"` 247 SnapshotSaveTotalDuration measurementutil.HistogramVec `json:"snapshotSaveTotalDuration"` 248 PeerRoundTripTime measurementutil.HistogramVec `json:"peerRoundTripTime"` 249 WalFsyncDuration measurementutil.HistogramVec `json:"walFsyncDuration"` 250 MaxDatabaseSize float64 `json:"maxDatabaseSize"` 251 } 252 253 func newEtcdMetrics() *etcdMetrics { 254 return &etcdMetrics{ 255 BackendCommitDuration: make(measurementutil.HistogramVec, 0), 256 SnapshotSaveTotalDuration: make(measurementutil.HistogramVec, 0), 257 PeerRoundTripTime: make(measurementutil.HistogramVec, 0), 258 WalFsyncDuration: make(measurementutil.HistogramVec, 0), 259 } 260 }