k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/kube_state_metrics_measurement.go

k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/kube_state_metrics_measurement.go (about)

     1  /*
     2  Copyright 2021 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package common
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sync"
    23  	"time"
    24  
    25  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    26  	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
    27  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    28  
    29  	"github.com/prometheus/common/model"
    30  	clientset "k8s.io/client-go/kubernetes"
    31  	"k8s.io/klog/v2"
    32  )
    33  
    34  const (
    35  	ksmLatencyName               = "KubeStateMetricsLatency"
    36  	ksmRequestDurationMetricName = model.LabelValue("http_request_duration_seconds_bucket")
    37  	probeIntervalDefault         = 30 * time.Second
    38  	ksmNamespace                 = "kube-state-metrics-perf-test"
    39  	ksmServiceName               = "kube-state-metrics"
    40  	ksmSelfPort                  = 8081
    41  	ksmMetricsPort               = 8080
    42  )
    43  
    44  type ksmLatencyMeasurement struct {
    45  	ctx            context.Context
    46  	cancel         func()
    47  	isRunning      bool
    48  	namespace      string
    49  	serviceName    string
    50  	metricsPort    int
    51  	selfPort       int
    52  	initialLatency *measurementutil.Histogram
    53  	wg             sync.WaitGroup
    54  }
    55  
    56  func init() {
    57  	if err := measurement.Register(ksmLatencyName, CreateKSMLatencyMeasurement); err != nil {
    58  		klog.Fatalf("Cannot register %s: %v", ksmLatencyName, err)
    59  	}
    60  }
    61  
    62  // CreateKSMLatencyMeasurement creates a new Kube State
    63  // Metrics Measurement.
    64  func CreateKSMLatencyMeasurement() measurement.Measurement {
    65  	ctx, cancel := context.WithCancel(context.Background())
    66  	return &ksmLatencyMeasurement{
    67  		namespace:   ksmNamespace,
    68  		serviceName: ksmServiceName,
    69  		selfPort:    ksmSelfPort,
    70  		metricsPort: ksmMetricsPort,
    71  		ctx:         ctx,
    72  		cancel:      cancel,
    73  	}
    74  }
    75  
    76  // Execute supports two actions:
    77  // - start - starts goroutine and queries /metrics every probeIntervalDefault interval,
    78  // it also collects initial latency metrics.
    79  // - gather - gathers latency metrics and creates a latency summary.
    80  func (m *ksmLatencyMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
    81  	if !config.CloudProvider.Features().SupportKubeStateMetrics {
    82  		klog.Infof("not executing KSMLatencyMeasurement: unsupported for provider, %s", config.ClusterFramework.GetClusterConfig().Provider.Name())
    83  		return nil, nil
    84  	}
    85  	action, err := util.GetString(config.Params, "action")
    86  	if err != nil {
    87  		return nil, err
    88  	}
    89  	client := config.ClusterFramework.GetClientSets().GetClient()
    90  	switch action {
    91  	case "start":
    92  		if m.isRunning {
    93  			klog.V(2).Infof("%s: measurement already running", m)
    94  			return nil, nil
    95  		}
    96  		// Start executing calls towards the kube-state-metrics /metrics endpoint
    97  		// every probeIntervalDefault until gather is called.
    98  		// probeIntervalDefault equals the scrapping interval we suggest.
    99  		// If we cannot get metrics for two minutes we are already going over
   100  		// the scrape interval so we should cancel.
   101  		m.startQuerying(m.ctx, client, probeIntervalDefault)
   102  		// Retrieve initial latency when first call is done.
   103  		m.initialLatency, err = m.retrieveKSMLatencyMetrics(m.ctx, client)
   104  		return nil, err
   105  	case "gather":
   106  		defer m.cancel()
   107  		return m.createKSMLatencySummary(m.ctx, client)
   108  	default:
   109  		return nil, fmt.Errorf("unknown action %v", action)
   110  	}
   111  }
   112  
   113  func (m *ksmLatencyMeasurement) stop() error {
   114  	if !m.isRunning {
   115  		return fmt.Errorf("%s: measurement was not running", m)
   116  	}
   117  	m.cancel()
   118  	m.wg.Wait()
   119  	return nil
   120  }
   121  
   122  // createKSMLatencyReport gathers the latency one last time and creates the summary based on the Quantile from the sub histograms.
   123  // Afterwards it creates the Summary Report.
   124  func (m *ksmLatencyMeasurement) createKSMLatencySummary(ctx context.Context, client clientset.Interface) ([]measurement.Summary, error) {
   125  	latestLatency, err := m.retrieveKSMLatencyMetrics(ctx, client)
   126  	if err != nil {
   127  		return nil, err
   128  	}
   129  	if err = m.stop(); err != nil {
   130  		return nil, err
   131  	}
   132  	// We want to subtract the latest histogram from the first one we collect.
   133  	finalLatency := HistogramSub(latestLatency, m.initialLatency)
   134  	// Pretty Print the report.
   135  	result := &measurementutil.LatencyMetric{}
   136  	if err = SetQuantileFromHistogram(result, finalLatency); err != nil {
   137  		return nil, err
   138  	}
   139  	content, err := util.PrettyPrintJSON(result)
   140  	if err != nil {
   141  		return nil, err
   142  	}
   143  	// Create Summary.
   144  	return []measurement.Summary{measurement.CreateSummary(ksmLatencyName, "json", content)}, nil
   145  }
   146  
   147  // startQuerying queries /metrics endpoint of kube-state-metrics kube_ metrics every interval
   148  // and stops when stop is called.
   149  func (m *ksmLatencyMeasurement) startQuerying(ctx context.Context, client clientset.Interface, interval time.Duration) {
   150  	m.isRunning = true
   151  	m.wg.Add(1)
   152  	go m.queryLoop(ctx, client, interval)
   153  }
   154  
   155  func (m *ksmLatencyMeasurement) queryLoop(ctx context.Context, client clientset.Interface, interval time.Duration) {
   156  	defer m.wg.Done()
   157  	for {
   158  		select {
   159  		case <-ctx.Done():
   160  			return
   161  		case <-time.After(interval):
   162  			var output string
   163  			output, err := m.getMetricsFromService(ctx, client, m.metricsPort)
   164  			if err != nil {
   165  				klog.V(2).Infof("error during fetching metrics from service: %v", err)
   166  			}
   167  			if output == "" {
   168  				klog.V(2).Infof("/metrics endpoint of kube-state-metrics returned no data in namespace: %s from service: %s port: %d", m.namespace, m.serviceName, m.metricsPort)
   169  			}
   170  
   171  		}
   172  	}
   173  }
   174  
   175  func (m *ksmLatencyMeasurement) retrieveKSMLatencyMetrics(ctx context.Context, c clientset.Interface) (*measurementutil.Histogram, error) {
   176  	ksmHist := measurementutil.NewHistogram(nil)
   177  	output, err := m.getMetricsFromService(ctx, c, m.selfPort)
   178  	if err != nil {
   179  		return ksmHist, err
   180  	}
   181  	samples, err := measurementutil.ExtractMetricSamples(output)
   182  	if err != nil {
   183  		return ksmHist, err
   184  	}
   185  	for _, sample := range samples {
   186  		switch sample.Metric[model.MetricNameLabel] {
   187  		case ksmRequestDurationMetricName:
   188  			measurementutil.ConvertSampleToHistogram(sample, ksmHist)
   189  		}
   190  	}
   191  	return ksmHist, nil
   192  }
   193  
   194  func (m *ksmLatencyMeasurement) getMetricsFromService(ctx context.Context, client clientset.Interface, port int) (string, error) {
   195  	ctx, cancel := context.WithTimeout(ctx, 2*time.Minute)
   196  	defer cancel()
   197  	out, err := client.CoreV1().RESTClient().Get().
   198  		Resource("services").
   199  		SubResource("proxy").
   200  		Namespace(m.namespace).
   201  		Name(fmt.Sprintf("%v:%v", m.serviceName, port)).
   202  		Suffix("metrics").
   203  		Do(ctx).Raw()
   204  	return string(out), err
   205  }
   206  
   207  // Dispose cleans up after the measurement.
   208  func (m *ksmLatencyMeasurement) Dispose() {
   209  	if err := m.stop(); err != nil {
   210  		klog.V(2).Infof("error during dispose call: %v", err)
   211  	}
   212  }
   213  
   214  // String returns string representation of this measurement.
   215  func (m *ksmLatencyMeasurement) String() string {
   216  	return ksmLatencyName
   217  }