k8s.io/kubernetes@v1.29.3/test/integration/apiserver/flowcontrol/concurrency_util_test.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package flowcontrol

import (
	"context"
	"fmt"
	"io"
	"math"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/prometheus/common/expfmt"
	"github.com/prometheus/common/model"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apiserver/pkg/authorization/authorizer"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/cmd/kube-apiserver/app/options"
	"k8s.io/kubernetes/pkg/controlplane"
	"k8s.io/kubernetes/test/integration/framework"
	"k8s.io/kubernetes/test/utils/ktesting"
)

const (
	nominalConcurrencyLimitMetricsName = "apiserver_flowcontrol_nominal_limit_seats"
	requestExecutionSecondsSumName     = "apiserver_flowcontrol_request_execution_seconds_sum"
	requestExecutionSecondsCountName   = "apiserver_flowcontrol_request_execution_seconds_count"
	priorityLevelSeatUtilSumName       = "apiserver_flowcontrol_priority_level_seat_utilization_sum"
	priorityLevelSeatUtilCountName     = "apiserver_flowcontrol_priority_level_seat_utilization_count"
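	// Timing knobs: every request from a noxu test user is delayed by
	// fakeworkDuration in the authorizer; measurement starts after
	// testWarmUpTime and covers a window of testTime.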
	fakeworkDuration                   = 200 * time.Millisecond
	testWarmUpTime                     = 2 * time.Second
	testTime                           = 10 * time.Second
)

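// SumAndCount holds a running sum of observations together with their count,
// mirroring a Prometheus _sum/_count series pair.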
type SumAndCount struct {
	Sum   float64
	Count int
}

type plMetrics struct {
	execSeconds    SumAndCount
	seatUtil       SumAndCount
	availableSeats int
}

// metricSnapshot maps from a priority level label to
// a plMetrics struct containing APF metrics of interest
type metricSnapshot map[string]plMetrics

// clientLatencyMeasurement accumulates client-observed request latencies.
type clientLatencyMeasurement struct {
	SumAndCount
	SumSq float64 // latency sum of squares
	Mu    sync.Mutex
}

func (clm *clientLatencyMeasurement) reset() {
	clm.Mu.Lock()
	defer clm.Mu.Unlock()
	clm.Sum = 0
	clm.Count = 0
	clm.SumSq = 0
}

func (clm *clientLatencyMeasurement) update(duration float64) {
	clm.Mu.Lock()
	defer clm.Mu.Unlock()
	clm.Count++
	clm.Sum += duration
	clm.SumSq += duration * duration
}

func (clm *clientLatencyMeasurement) getStats() clientLatencyStats {
	clm.Mu.Lock()
	defer clm.Mu.Unlock()
	mean := clm.Sum / float64(clm.Count)
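	// The population sum of squared deviations is
	//   ss = sum((x_i - mean)^2) = sumsq - 2*mean*sum + count*mean^2
	// and, since mean = sum/count, the last two terms collapse to -mean*sum.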
	ss := clm.SumSq - mean*clm.Sum // reduced from ss := sumsq - 2*mean*sum + float64(count)*mean*mean
	// Clamp ss at 0 in case floating-point rounding produced a small negative value
	if ss < 0 {
		ss = 0
	}
	stdDev := math.Sqrt(ss / float64(clm.Count))
	cv := stdDev / mean
	return clientLatencyStats{mean: mean, stdDev: stdDev, cv: cv}
}

type clientLatencyStats struct {
	mean   float64 // latency average
	stdDev float64 // latency population standard deviation
	cv     float64 // latency coefficient of variation
}

type plMetricAvg struct {
	reqExecution float64 // average request execution time
	seatUtil     float64 // average seat utilization
}

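// intervalMetricAvg computes per-interval averages from two cumulative
// snapshots: the _sum/_count series only ever grow, so differencing the
// snapshots and dividing yields the average over just the test window.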
func intervalMetricAvg(snapshot0, snapshot1 metricSnapshot, plLabel string) plMetricAvg {
	plmT0 := snapshot0[plLabel]
	plmT1 := snapshot1[plLabel]
	return plMetricAvg{
		reqExecution: (plmT1.execSeconds.Sum - plmT0.execSeconds.Sum) / float64(plmT1.execSeconds.Count-plmT0.execSeconds.Count),
		seatUtil:     (plmT1.seatUtil.Sum - plmT0.seatUtil.Sum) / float64(plmT1.seatUtil.Count-plmT0.seatUtil.Count),
	}
}

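// noxuDelayingAuthorizer wraps another authorizer, adding a fixed delay to
// requests from the noxu test users so that request execution time dwarfs
// the uncontrolled overheads elsewhere in the system.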
type noxuDelayingAuthorizer struct {
	Authorizer authorizer.Authorizer
}

func (d *noxuDelayingAuthorizer) Authorize(ctx context.Context, a authorizer.Attributes) (authorizer.Decision, string, error) {
	if a.GetUser().GetName() == "noxu1" || a.GetUser().GetName() == "noxu2" {
		time.Sleep(fakeworkDuration) // simulate extra work with a sleep
	}
	return d.Authorizer.Authorize(ctx, a)
}

// TestConcurrencyIsolation tests the concurrency isolation between priority levels.
// The test defines two priority levels and corresponding flow schemas. To one
// priority level it sends many more concurrent requests than the configuration
// allows to execute at once, while sending fewer than allowed to the other.
// The primary check is that the small ("mouse") flow gets all the seats it wants,
// modulated by the recognition that there are uncontrolled overheads in the system.
//
// This test differs from TestPriorityLevelIsolation, which checks throughput rather
// than concurrency. To mitigate the effects of system noise, a delaying authorizer is
// used to artificially increase request execution time, making the noise relatively
// insignificant. Secondarily, this test also checks the observed seat utilizations
// against values derived from assuming that the throughput observed by the client
// equals the execution throughput observed by the server.
func TestConcurrencyIsolation(t *testing.T) {
	_, ctx := ktesting.NewTestContext(t)
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	_, kubeConfig, closeFn := framework.StartTestServer(ctx, t, framework.TestServerSetup{
		ModifyServerRunOptions: func(opts *options.ServerRunOptions) {
			// Ensure all clients are allowed to send requests.
			opts.Authorization.Modes = []string{"AlwaysAllow"}
			opts.GenericServerRunOptions.MaxRequestsInFlight = 10
			opts.GenericServerRunOptions.MaxMutatingRequestsInFlight = 10
		},
		ModifyServerConfig: func(config *controlplane.Config) {
			// Wrap default authorizer with one that delays requests from noxu clients
			config.GenericConfig.Authorization.Authorizer = &noxuDelayingAuthorizer{config.GenericConfig.Authorization.Authorizer}
		},
	})
	defer closeFn()

	loopbackClient := clientset.NewForConfigOrDie(kubeConfig)
	noxu1Client := getClientFor(kubeConfig, "noxu1")
	noxu2Client := getClientFor(kubeConfig, "noxu2")

	queueLength := 50
	concurrencyShares := 100

	plNoxu1, _, err := createPriorityLevelAndBindingFlowSchemaForUser(
		loopbackClient, "noxu1", concurrencyShares, queueLength)
	if err != nil {
		t.Fatal(err) // the returned objects are dereferenced below, so fail fast
	}
	plNoxu2, _, err := createPriorityLevelAndBindingFlowSchemaForUser(
		loopbackClient, "noxu2", concurrencyShares, queueLength)
	if err != nil {
		t.Fatal(err)
	}

	stopCh := make(chan struct{})
	wg := sync.WaitGroup{}

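	// streamRequests (defined elsewhere in this package) runs the given request
	// function in a loop on each of the requested goroutines until stopCh closes.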
	// "elephant"
	noxu1NumGoroutines := 5 + queueLength
	var noxu1LatMeasure clientLatencyMeasurement
	wg.Add(noxu1NumGoroutines)
	streamRequests(noxu1NumGoroutines, func() {
		start := time.Now()
		_, err := noxu1Client.CoreV1().Namespaces().Get(ctx, "default", metav1.GetOptions{})
		duration := time.Since(start).Seconds()
		noxu1LatMeasure.update(duration)
		if err != nil {
			t.Error(err)
		}
	}, &wg, stopCh)
	// "mouse"
	noxu2NumGoroutines := 3
	var noxu2LatMeasure clientLatencyMeasurement
	wg.Add(noxu2NumGoroutines)
	streamRequests(noxu2NumGoroutines, func() {
		start := time.Now()
		_, err := noxu2Client.CoreV1().Namespaces().Get(ctx, "default", metav1.GetOptions{})
		duration := time.Since(start).Seconds()
		noxu2LatMeasure.update(duration)
		if err != nil {
			t.Error(err)
		}
	}, &wg, stopCh)

	// Warm up
	time.Sleep(testWarmUpTime)

	noxu1LatMeasure.reset()
	noxu2LatMeasure.reset()
	snapshot0, err := getRequestMetricsSnapshot(loopbackClient)
	if err != nil {
		t.Error(err)
	}
	time.Sleep(testTime) // after warming up, the test enters a steady state
	snapshot1, err := getRequestMetricsSnapshot(loopbackClient)
	if err != nil {
		t.Error(err)
	}
	close(stopCh)

	// Check the assumptions of the test
	noxu1T0 := snapshot0[plNoxu1.Name]
	noxu1T1 := snapshot1[plNoxu1.Name]
	noxu2T0 := snapshot0[plNoxu2.Name]
	noxu2T1 := snapshot1[plNoxu2.Name]
	if noxu1T0.seatUtil.Count >= noxu1T1.seatUtil.Count || noxu2T0.seatUtil.Count >= noxu2T1.seatUtil.Count {
		t.Errorf("SeatUtilCount check failed: noxu1 t0 count %d, t1 count %d; noxu2 t0 count %d, t1 count %d",
			noxu1T0.seatUtil.Count, noxu1T1.seatUtil.Count, noxu2T0.seatUtil.Count, noxu2T1.seatUtil.Count)
	}
	t.Logf("noxu1 priority level concurrency limit: %d", noxu1T0.availableSeats)
	t.Logf("noxu2 priority level concurrency limit: %d", noxu2T0.availableSeats)
	if (noxu1T0.availableSeats != noxu1T1.availableSeats) || (noxu2T0.availableSeats != noxu2T1.availableSeats) {
		t.Errorf("The number of available seats changed: noxu1 (%d, %d) noxu2 (%d, %d)",
			noxu1T0.availableSeats, noxu1T1.availableSeats, noxu2T0.availableSeats, noxu2T1.availableSeats)
	}
	if (noxu1T0.availableSeats <= 4) || (noxu2T0.availableSeats <= 4) {
		t.Errorf("The numbers of available seats for the test priority levels are too small: (%d, %d). Expecting numbers > 4",
			noxu1T0.availableSeats, noxu2T0.availableSeats)
	}
	// No requests should be rejected under normal conditions
	_, rejectedReqCounts, err := getRequestCountOfPriorityLevel(loopbackClient)
	if err != nil {
		t.Error(err)
	}
	if rejectedReqCounts[plNoxu1.Name] > 0 {
		t.Errorf(`%d requests from the "elephant" stream were rejected unexpectedly`, rejectedReqCounts[plNoxu1.Name])
	}
	if rejectedReqCounts[plNoxu2.Name] > 0 {
		t.Errorf(`%d requests from the "mouse" stream were rejected unexpectedly`, rejectedReqCounts[plNoxu2.Name])
	}

	// Calculate APF server-side metric averages over the test interval
	noxu1Avg := intervalMetricAvg(snapshot0, snapshot1, plNoxu1.Name)
	noxu2Avg := intervalMetricAvg(snapshot0, snapshot1, plNoxu2.Name)
	t.Logf("\nnoxu1 avg request execution time %v\nnoxu2 avg request execution time %v", noxu1Avg.reqExecution, noxu2Avg.reqExecution)
	t.Logf("\nnoxu1 avg seat utilization %v\nnoxu2 avg seat utilization %v", noxu1Avg.seatUtil, noxu2Avg.seatUtil)

	// Wait for the client goroutines to finish before computing the client-side request latency statistics
	wg.Wait()
	noxu1LatStats := noxu1LatMeasure.getStats()
	noxu2LatStats := noxu2LatMeasure.getStats()
	t.Logf("noxu1 client request count %d duration mean %v stddev %v cv %v", noxu1LatMeasure.Count, noxu1LatStats.mean, noxu1LatStats.stdDev, noxu1LatStats.cv)
	t.Logf("noxu2 client request count %d duration mean %v stddev %v cv %v", noxu2LatMeasure.Count, noxu2LatStats.mean, noxu2LatStats.stdDev, noxu2LatStats.cv)

	// Calculate server-side observed concurrency
	noxu1ObservedConcurrency := noxu1Avg.seatUtil * float64(noxu1T0.availableSeats)
	noxu2ObservedConcurrency := noxu2Avg.seatUtil * float64(noxu2T0.availableSeats)
	// Expected concurrency is derived from the assumption of equal throughput on the client side and the server side.
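	// By Little's law, each stream's steady-state throughput as seen by the client is
	// numGoroutines/meanLatency requests per second; multiplying that by the average
	// server-side execution time gives the expected concurrency on the server.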
	noxu1ExpectedConcurrency := float64(noxu1NumGoroutines) * noxu1Avg.reqExecution / noxu1LatStats.mean
	noxu2ExpectedConcurrency := float64(noxu2NumGoroutines) * noxu2Avg.reqExecution / noxu2LatStats.mean
	t.Logf("Concurrency of noxu1:noxu2 - expected (%v:%v), observed (%v:%v)", noxu1ExpectedConcurrency, noxu2ExpectedConcurrency, noxu1ObservedConcurrency, noxu2ObservedConcurrency)

	// There are uncontrolled overheads that introduce noise into the system. The coefficient of variation (CV), that is,
	// standard deviation divided by mean, for a class of traffic is a characterization of all the noise that applies to
	// that class. We found that noxu1 generally has a much bigger CV than noxu2. This makes sense, because noxu1 exercises
	// more behavior, namely waiting in queues. So we take the minimum of the two as an indicator of the relative amount
	// of noise that comes from all the other behavior. Currently, we use 2 times the experienced coefficient of variation
	// as the margin of error.
	margin := 2 * math.Min(noxu1LatStats.cv, noxu2LatStats.cv)
	t.Logf("Error margin is %v", margin)

	isConcurrencyExpected := func(name string, observed float64, expected float64) bool {
		relativeErr := math.Abs(expected-observed) / expected
		t.Logf("%v relative error is %v", name, relativeErr)
		return relativeErr <= margin
	}
	if !isConcurrencyExpected(plNoxu1.Name, noxu1ObservedConcurrency, noxu1ExpectedConcurrency) {
		t.Errorf("Concurrency observed by noxu1 is off. Expected: %v, observed: %v", noxu1ExpectedConcurrency, noxu1ObservedConcurrency)
	}
	if !isConcurrencyExpected(plNoxu2.Name, noxu2ObservedConcurrency, noxu2ExpectedConcurrency) {
		t.Errorf("Concurrency observed by noxu2 is off. Expected: %v, observed: %v", noxu2ExpectedConcurrency, noxu2ObservedConcurrency)
	}

	// Check the server-side APF seat utilization measurements
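	// The "elephant" saturates its priority level, so its average seat utilization
	// should be close to 1. The "mouse" keeps only noxu2NumGoroutines seats busy,
	// so its expected utilization is that count divided by its available seats.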
	if math.Abs(1-noxu1Avg.seatUtil) > 0.05 {
		t.Errorf("noxu1Avg.seatUtil=%v is too far from expected=1.0", noxu1Avg.seatUtil)
	}
	noxu2ExpectedSeatUtil := float64(noxu2NumGoroutines) / float64(noxu2T0.availableSeats)
	if math.Abs(noxu2ExpectedSeatUtil-noxu2Avg.seatUtil) > 0.05 {
		t.Errorf("noxu2Avg.seatUtil=%v is too far from expected=%v", noxu2Avg.seatUtil, noxu2ExpectedSeatUtil)
	}
}

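// getRequestMetricsSnapshot scrapes the apiserver's /metrics endpoint and collates,
// per priority level, the cumulative execution-seconds and seat-utilization
// sums/counts together with the nominal seat limit.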
func getRequestMetricsSnapshot(c clientset.Interface) (metricSnapshot, error) {
	resp, err := getMetrics(c)
	if err != nil {
		return nil, err
	}

	dec := expfmt.NewDecoder(strings.NewReader(string(resp)), expfmt.FmtText)
	decoder := expfmt.SampleDecoder{
		Dec:  dec,
		Opts: &expfmt.DecodeOptions{},
	}

	snapshot := metricSnapshot{}

	for {
		var v model.Vector
		if err := decoder.Decode(&v); err != nil {
			if err == io.EOF {
				// Expected loop termination condition.
				return snapshot, nil
			}
			return nil, fmt.Errorf("failed decoding metrics: %w", err)
		}
		for _, metric := range v {
			plLabel := string(metric.Metric[labelPriorityLevel])
			entry := plMetrics{}
			if v, ok := snapshot[plLabel]; ok {
				entry = v
			}
			switch name := string(metric.Metric[model.MetricNameLabel]); name {
			case requestExecutionSecondsSumName:
				entry.execSeconds.Sum = float64(metric.Value)
			case requestExecutionSecondsCountName:
				entry.execSeconds.Count = int(metric.Value)
			case priorityLevelSeatUtilSumName:
				entry.seatUtil.Sum = float64(metric.Value)
			case priorityLevelSeatUtilCountName:
				entry.seatUtil.Count = int(metric.Value)
			case nominalConcurrencyLimitMetricsName:
				entry.availableSeats = int(metric.Value)
			}
			snapshot[plLabel] = entry
		}
	}
}