k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/integration/apiserver/flowcontrol/concurrency_util_test.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package flowcontrol

import (
	"context"
	"fmt"
	"io"
	"math"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/prometheus/common/expfmt"
	"github.com/prometheus/common/model"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apiserver/pkg/authorization/authorizer"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/cmd/kube-apiserver/app/options"
	"k8s.io/kubernetes/pkg/controlplane"
	"k8s.io/kubernetes/test/integration/framework"
	"k8s.io/kubernetes/test/utils/ktesting"
)

const (
	nominalConcurrencyLimitMetricsName = "apiserver_flowcontrol_nominal_limit_seats"
	requestExecutionSecondsSumName     = "apiserver_flowcontrol_request_execution_seconds_sum"
	requestExecutionSecondsCountName   = "apiserver_flowcontrol_request_execution_seconds_count"
	priorityLevelSeatUtilSumName       = "apiserver_flowcontrol_priority_level_seat_utilization_sum"
	priorityLevelSeatUtilCountName     = "apiserver_flowcontrol_priority_level_seat_utilization_count"
	fakeworkDuration                   = 200 * time.Millisecond
	testWarmUpTime                     = 2 * time.Second
	testTime                           = 10 * time.Second
)

type SumAndCount struct {
	Sum   float64
	Count int
}

type plMetrics struct {
	execSeconds    SumAndCount
	seatUtil       SumAndCount
	availableSeats int
}

// metricSnapshot maps from a priority level label to
// a plMetrics struct containing the APF metrics of interest.
type metricSnapshot map[string]plMetrics

// clientLatencyMeasurement accumulates client-side request latency
// measurements; it is safe for concurrent use.
type clientLatencyMeasurement struct {
	SumAndCount
	SumSq float64 // latency sum of squares
	Mu    sync.Mutex
}

func (clm *clientLatencyMeasurement) reset() {
	clm.Mu.Lock()
	defer clm.Mu.Unlock()
	clm.Sum = 0
	clm.Count = 0
	clm.SumSq = 0
}

func (clm *clientLatencyMeasurement) update(duration float64) {
	clm.Mu.Lock()
	defer clm.Mu.Unlock()
	clm.Count += 1
	clm.Sum += duration
	clm.SumSq += duration * duration
}

func (clm *clientLatencyMeasurement) getStats() clientLatencyStats {
	clm.Mu.Lock()
	defer clm.Mu.Unlock()
	mean := clm.Sum / float64(clm.Count)
	// Reduced from ss := sumsq - 2*mean*sum + float64(count)*mean*mean,
	// using the fact that sum == float64(count)*mean.
	ss := clm.SumSq - mean*clm.Sum
	// Clamp ss to 0 in case floating point error produced a negative value.
	if ss < 0 {
		ss = 0
	}
	stdDev := math.Sqrt(ss / float64(clm.Count))
	cv := stdDev / mean
	return clientLatencyStats{mean: mean, stdDev: stdDev, cv: cv}
}
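
// As a quick numeric check of the reduced sum-of-squares identity above
// (an illustration only, with made-up samples, not part of the test): for
// durations {1, 2, 3}, Sum = 6, SumSq = 14, and mean = 2, so
// ss = SumSq - mean*Sum = 14 - 12 = 2, which matches the direct computation
// Σ(x−μ)² = 1 + 0 + 1 = 2 and gives stdDev = √(2/3) ≈ 0.816, cv ≈ 0.408.
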
type clientLatencyStats struct {
	mean   float64 // latency average
	stdDev float64 // latency population standard deviation
	cv     float64 // latency coefficient of variation
}

type plMetricAvg struct {
	reqExecution float64 // average request execution time
	seatUtil     float64 // average seat utilization
}

// intervalMetricAvg computes the per-request averages over the interval
// between two snapshots, using the deltas of the metric sums and counts.
func intervalMetricAvg(snapshot0, snapshot1 metricSnapshot, plLabel string) plMetricAvg {
	plmT0 := snapshot0[plLabel]
	plmT1 := snapshot1[plLabel]
	return plMetricAvg{
		reqExecution: (plmT1.execSeconds.Sum - plmT0.execSeconds.Sum) / float64(plmT1.execSeconds.Count-plmT0.execSeconds.Count),
		seatUtil:     (plmT1.seatUtil.Sum - plmT0.seatUtil.Sum) / float64(plmT1.seatUtil.Count-plmT0.seatUtil.Count),
	}
}
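
// For example (made-up numbers, illustration only): if between the two
// snapshots execSeconds.Sum grew by 30 seconds while execSeconds.Count grew
// by 100 requests, reqExecution would be 30/100 = 0.3 seconds per request.
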
type noxuDelayingAuthorizer struct {
	Authorizer authorizer.Authorizer
}

func (d *noxuDelayingAuthorizer) Authorize(ctx context.Context, a authorizer.Attributes) (authorizer.Decision, string, error) {
	if a.GetUser().GetName() == "noxu1" || a.GetUser().GetName() == "noxu2" {
		time.Sleep(fakeworkDuration) // simulate fake work with sleep
	}
	return d.Authorizer.Authorize(ctx, a)
}

// TestConcurrencyIsolation tests the concurrency isolation between priority levels.
// For this purpose the test defines two priority levels and corresponding flow schemas.
// To one priority level, this test sends many more concurrent requests than the configuration
// allows to execute at once, while sending fewer than allowed to the other priority level.
// The primary check is that the low ("mouse") flow gets all the seats it wants; the check is
// modulated to allow for uncontrolled overheads in the system.
//
// This test differs from TestPriorityLevelIsolation in that TestPriorityLevelIsolation checks
// throughput instead of concurrency. To mitigate the effects of system noise, a delaying
// authorizer is used to artificially increase request execution time, making the system noise
// relatively insignificant.
// Secondarily, this test checks the observed seat utilizations against values derived from the
// expectation that the throughput observed by the client equals the execution throughput
// observed by the server.
func TestConcurrencyIsolation(t *testing.T) {
	tCtx := ktesting.Init(t)
	_, kubeConfig, closeFn := framework.StartTestServer(tCtx, t, framework.TestServerSetup{
		ModifyServerRunOptions: func(opts *options.ServerRunOptions) {
			// Ensure all clients are allowed to send requests.
			opts.Authorization.Modes = []string{"AlwaysAllow"}
			opts.GenericServerRunOptions.MaxRequestsInFlight = 10
			opts.GenericServerRunOptions.MaxMutatingRequestsInFlight = 10
		},
		ModifyServerConfig: func(config *controlplane.Config) {
			// Wrap the default authorizer with one that delays requests from the noxu clients.
			config.ControlPlane.Generic.Authorization.Authorizer = &noxuDelayingAuthorizer{config.ControlPlane.Generic.Authorization.Authorizer}
		},
	})
	defer closeFn()

	loopbackClient := clientset.NewForConfigOrDie(kubeConfig)
	noxu1Client := getClientFor(kubeConfig, "noxu1")
	noxu2Client := getClientFor(kubeConfig, "noxu2")

	queueLength := 50
	concurrencyShares := 100

	plNoxu1, _, err := createPriorityLevelAndBindingFlowSchemaForUser(
		loopbackClient, "noxu1", concurrencyShares, queueLength)
	if err != nil {
		t.Error(err)
	}
	plNoxu2, _, err := createPriorityLevelAndBindingFlowSchemaForUser(
		loopbackClient, "noxu2", concurrencyShares, queueLength)
	if err != nil {
		t.Error(err)
	}

	stopCh := make(chan struct{})
	wg := sync.WaitGroup{}

	// "elephant": more concurrent clients than the priority level can execute at once
	noxu1NumGoroutines := 5 + queueLength
	var noxu1LatMeasure clientLatencyMeasurement
	wg.Add(noxu1NumGoroutines)
	streamRequests(noxu1NumGoroutines, func() {
		start := time.Now()
		_, err := noxu1Client.CoreV1().Namespaces().Get(tCtx, "default", metav1.GetOptions{})
		duration := time.Since(start).Seconds()
		noxu1LatMeasure.update(duration)
		if err != nil {
			t.Error(err)
		}
	}, &wg, stopCh)
	// "mouse": fewer concurrent clients than the priority level allows
	noxu2NumGoroutines := 3
	var noxu2LatMeasure clientLatencyMeasurement
	wg.Add(noxu2NumGoroutines)
	streamRequests(noxu2NumGoroutines, func() {
		start := time.Now()
		_, err := noxu2Client.CoreV1().Namespaces().Get(tCtx, "default", metav1.GetOptions{})
		duration := time.Since(start).Seconds()
		noxu2LatMeasure.update(duration)
		if err != nil {
			t.Error(err)
		}
	}, &wg, stopCh)

	// Warm up
	time.Sleep(testWarmUpTime)

	noxu1LatMeasure.reset()
	noxu2LatMeasure.reset()
	snapshot0, err := getRequestMetricsSnapshot(loopbackClient)
	if err != nil {
		t.Error(err)
	}
	time.Sleep(testTime) // after warming up, the test enters a steady state
	snapshot1, err := getRequestMetricsSnapshot(loopbackClient)
	if err != nil {
		t.Error(err)
	}
	close(stopCh)

	// Check the assumptions of the test
	noxu1T0 := snapshot0[plNoxu1.Name]
	noxu1T1 := snapshot1[plNoxu1.Name]
	noxu2T0 := snapshot0[plNoxu2.Name]
	noxu2T1 := snapshot1[plNoxu2.Name]
	if noxu1T0.seatUtil.Count >= noxu1T1.seatUtil.Count || noxu2T0.seatUtil.Count >= noxu2T1.seatUtil.Count {
		t.Errorf("SeatUtilCount check failed: noxu1 t0 count %d, t1 count %d; noxu2 t0 count %d, t1 count %d",
			noxu1T0.seatUtil.Count, noxu1T1.seatUtil.Count, noxu2T0.seatUtil.Count, noxu2T1.seatUtil.Count)
	}
	t.Logf("noxu1 priority level concurrency limit: %d", noxu1T0.availableSeats)
	t.Logf("noxu2 priority level concurrency limit: %d", noxu2T0.availableSeats)
	if (noxu1T0.availableSeats != noxu1T1.availableSeats) || (noxu2T0.availableSeats != noxu2T1.availableSeats) {
		t.Errorf("The number of available seats changed: noxu1 (%d, %d) noxu2 (%d, %d)",
			noxu1T0.availableSeats, noxu1T1.availableSeats, noxu2T0.availableSeats, noxu2T1.availableSeats)
	}
	if (noxu1T0.availableSeats <= 4) || (noxu2T0.availableSeats <= 4) {
		t.Errorf("The numbers of available seats for the test client priority levels are too small: (%d, %d). Expecting numbers > 4",
			noxu1T0.availableSeats, noxu2T0.availableSeats)
	}
	// No requests should be rejected under normal circumstances
	_, rejectedReqCounts, err := getRequestCountOfPriorityLevel(loopbackClient)
	if err != nil {
		t.Error(err)
	}
	if rejectedReqCounts[plNoxu1.Name] > 0 {
		t.Errorf(`%d requests from the "elephant" stream were rejected unexpectedly`, rejectedReqCounts[plNoxu1.Name])
	}
	if rejectedReqCounts[plNoxu2.Name] > 0 {
		t.Errorf(`%d requests from the "mouse" stream were rejected unexpectedly`, rejectedReqCounts[plNoxu2.Name])
	}

	// Calculate the APF server-side metric averages during the test interval
	noxu1Avg := intervalMetricAvg(snapshot0, snapshot1, plNoxu1.Name)
	noxu2Avg := intervalMetricAvg(snapshot0, snapshot1, plNoxu2.Name)
	t.Logf("\nnoxu1 avg request execution time %v\nnoxu2 avg request execution time %v", noxu1Avg.reqExecution, noxu2Avg.reqExecution)
	t.Logf("\nnoxu1 avg seat utilization %v\nnoxu2 avg seat utilization %v", noxu1Avg.seatUtil, noxu2Avg.seatUtil)

	// Wait until the client goroutines finish before computing the client-side request latency statistics
	wg.Wait()
	noxu1LatStats := noxu1LatMeasure.getStats()
	noxu2LatStats := noxu2LatMeasure.getStats()
	t.Logf("noxu1 client request count %d duration mean %v stddev %v cv %v", noxu1LatMeasure.Count, noxu1LatStats.mean, noxu1LatStats.stdDev, noxu1LatStats.cv)
	t.Logf("noxu2 client request count %d duration mean %v stddev %v cv %v", noxu2LatMeasure.Count, noxu2LatStats.mean, noxu2LatStats.stdDev, noxu2LatStats.cv)

	// Calculate the server-side observed concurrency
	noxu1ObservedConcurrency := noxu1Avg.seatUtil * float64(noxu1T0.availableSeats)
	noxu2ObservedConcurrency := noxu2Avg.seatUtil * float64(noxu2T0.availableSeats)
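
	// The expected concurrency below follows from Little's law under the
	// equal-throughput assumption stated in the comment that follows (this
	// derivation is an editorial note, not part of the original test): each
	// goroutine issues requests back to back, so a stream of n goroutines
	// completes roughly n/meanLatency requests per second, and multiplying
	// that rate by the mean server-side execution time gives the average
	// number of requests executing on the server at once.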
	// Expected concurrency is derived from the assumption of equal throughput on the client side and the server side
	noxu1ExpectedConcurrency := float64(noxu1NumGoroutines) * noxu1Avg.reqExecution / noxu1LatStats.mean
	noxu2ExpectedConcurrency := float64(noxu2NumGoroutines) * noxu2Avg.reqExecution / noxu2LatStats.mean
	t.Logf("Concurrency of noxu1:noxu2 - expected (%v:%v), observed (%v:%v)", noxu1ExpectedConcurrency, noxu2ExpectedConcurrency, noxu1ObservedConcurrency, noxu2ObservedConcurrency)

	// There are uncontrolled overheads that introduce noise into the system. The coefficient of variation (CV), that is,
	// standard deviation divided by mean, for a class of traffic is a characterization of all the noise that applied to
	// that class. We found that noxu1 generally had a much bigger CV than noxu2. This makes sense, because noxu1 probes
	// more behavior, namely the waiting in queues. So we take the minimum of the two as an indicator of the relative
	// amount of noise that comes from all the other behavior. Currently, we use 2 times the experienced coefficient of
	// variation as the margin of error.
	margin := 2 * math.Min(noxu1LatStats.cv, noxu2LatStats.cv)
	t.Logf("Error margin is %v", margin)

	isConcurrencyExpected := func(name string, observed float64, expected float64) bool {
		relativeErr := math.Abs(expected-observed) / expected
		t.Logf("%v relative error is %v", name, relativeErr)
		return relativeErr <= margin
	}
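	// For instance (made-up numbers, illustration only): with cv values of
	// 0.30 and 0.10 the margin is 2*0.10 = 0.20, so an observed concurrency
	// of 4.5 against an expected 5.0 has relative error |5.0-4.5|/5.0 = 0.10
	// and passes, while an observed 3.5 (relative error 0.30) would fail.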
	if !isConcurrencyExpected(plNoxu1.Name, noxu1ObservedConcurrency, noxu1ExpectedConcurrency) {
		t.Errorf("Concurrency observed by noxu1 is off. Expected: %v, observed: %v", noxu1ExpectedConcurrency, noxu1ObservedConcurrency)
	}
	if !isConcurrencyExpected(plNoxu2.Name, noxu2ObservedConcurrency, noxu2ExpectedConcurrency) {
		t.Errorf("Concurrency observed by noxu2 is off. Expected: %v, observed: %v", noxu2ExpectedConcurrency, noxu2ObservedConcurrency)
	}

	// Check the server-side APF seat utilization measurements
	if math.Abs(1-noxu1Avg.seatUtil) > 0.05 {
		t.Errorf("noxu1Avg.seatUtil=%v is too far from expected=1.0", noxu1Avg.seatUtil)
	}
	noxu2ExpectedSeatUtil := float64(noxu2NumGoroutines) / float64(noxu2T0.availableSeats)
	if math.Abs(noxu2ExpectedSeatUtil-noxu2Avg.seatUtil) > 0.05 {
		t.Errorf("noxu2Avg.seatUtil=%v is too far from expected=%v", noxu2Avg.seatUtil, noxu2ExpectedSeatUtil)
	}
}

// getRequestMetricsSnapshot scrapes the apiserver's metrics endpoint and
// collects, for each priority level label, the APF metrics of interest.
func getRequestMetricsSnapshot(c clientset.Interface) (metricSnapshot, error) {
	resp, err := getMetrics(c)
	if err != nil {
		return nil, err
	}

	dec := expfmt.NewDecoder(strings.NewReader(string(resp)), expfmt.NewFormat(expfmt.TypeTextPlain))
	decoder := expfmt.SampleDecoder{
		Dec:  dec,
		Opts: &expfmt.DecodeOptions{},
	}

	snapshot := metricSnapshot{}

	for {
		var v model.Vector
		if err := decoder.Decode(&v); err != nil {
			if err == io.EOF {
				// Expected loop termination condition.
				return snapshot, nil
			}
			return nil, fmt.Errorf("failed decoding metrics: %v", err)
		}
		for _, metric := range v {
			plLabel := string(metric.Metric[labelPriorityLevel])
			entry := plMetrics{}
			if v, ok := snapshot[plLabel]; ok {
				entry = v
			}
			switch name := string(metric.Metric[model.MetricNameLabel]); name {
			case requestExecutionSecondsSumName:
				entry.execSeconds.Sum = float64(metric.Value)
			case requestExecutionSecondsCountName:
				entry.execSeconds.Count = int(metric.Value)
			case priorityLevelSeatUtilSumName:
				entry.seatUtil.Sum = float64(metric.Value)
			case priorityLevelSeatUtilCountName:
				entry.seatUtil.Count = int(metric.Value)
			case nominalConcurrencyLimitMetricsName:
				entry.availableSeats = int(metric.Value)
			}
			snapshot[plLabel] = entry
		}
	}
}
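
// A hypothetical usage sketch of the snapshot helpers above (an editorial
// illustration, not part of the original test; c and plName stand for a
// clientset.Interface and a priority level name):
//
//	snap0, err := getRequestMetricsSnapshot(c)
//	// handle err
//	time.Sleep(10 * time.Second) // the measurement window
//	snap1, err := getRequestMetricsSnapshot(c)
//	// handle err
//	avg := intervalMetricAvg(snap0, snap1, plName)
//	// avg.reqExecution and avg.seatUtil are per-request averages over the window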