k8s.io/kubernetes@v1.29.3/test/integration/apiserver/flowcontrol/concurrency_util_test.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package flowcontrol

import (
	"context"
	"fmt"
	"io"
	"math"
	"strings"
	"sync"
	"testing"
	"time"

	"github.com/prometheus/common/expfmt"
	"github.com/prometheus/common/model"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apiserver/pkg/authorization/authorizer"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/kubernetes/cmd/kube-apiserver/app/options"
	"k8s.io/kubernetes/pkg/controlplane"
	"k8s.io/kubernetes/test/integration/framework"
	"k8s.io/kubernetes/test/utils/ktesting"
)

const (
	nominalConcurrencyLimitMetricsName = "apiserver_flowcontrol_nominal_limit_seats"
	requestExecutionSecondsSumName     = "apiserver_flowcontrol_request_execution_seconds_sum"
	requestExecutionSecondsCountName   = "apiserver_flowcontrol_request_execution_seconds_count"
	priorityLevelSeatUtilSumName       = "apiserver_flowcontrol_priority_level_seat_utilization_sum"
	priorityLevelSeatUtilCountName     = "apiserver_flowcontrol_priority_level_seat_utilization_count"
	fakeworkDuration                   = 200 * time.Millisecond
	testWarmUpTime                     = 2 * time.Second
	testTime                           = 10 * time.Second
)

type SumAndCount struct {
	Sum   float64
	Count int
}

type plMetrics struct {
	execSeconds    SumAndCount
	seatUtil       SumAndCount
	availableSeats int
}

// metricSnapshot maps from a priority level label to
// a plMetrics struct containing APF metrics of interest.
type metricSnapshot map[string]plMetrics

// clientLatencyMeasurement accumulates client-side request latency samples.
type clientLatencyMeasurement struct {
	SumAndCount
	SumSq float64 // latency sum of squares
	Mu    sync.Mutex
}

func (clm *clientLatencyMeasurement) reset() {
	clm.Mu.Lock()
	defer clm.Mu.Unlock()
	clm.Sum = 0
	clm.Count = 0
	clm.SumSq = 0
}

func (clm *clientLatencyMeasurement) update(duration float64) {
	clm.Mu.Lock()
	defer clm.Mu.Unlock()
	clm.Count += 1
	clm.Sum += duration
	clm.SumSq += duration * duration
}

func (clm *clientLatencyMeasurement) getStats() clientLatencyStats {
	clm.Mu.Lock()
	defer clm.Mu.Unlock()
	mean := clm.Sum / float64(clm.Count)
	// Reduced from ss := sumSq - 2*mean*sum + count*mean*mean, using mean = sum/count.
	ss := clm.SumSq - mean*clm.Sum
	// Clamp ss at 0 in case floating-point error produced a negative value.
	if ss < 0 {
		ss = 0
	}
	stdDev := math.Sqrt(ss / float64(clm.Count))
	cv := stdDev / mean
	return clientLatencyStats{mean: mean, stdDev: stdDev, cv: cv}
}

type clientLatencyStats struct {
	mean   float64 // latency average
	stdDev float64 // latency population standard deviation
	cv     float64 // latency coefficient of variation
}

type plMetricAvg struct {
	reqExecution float64 // average request execution time
	seatUtil     float64 // average seat utilization
}

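// A minimal usage sketch for clientLatencyMeasurement (illustrative only, not
// part of the test). The reduction in getStats follows the population-variance
// identity Σ(x-mean)² = sumSq - 2·mean·sum + n·mean², whose last two terms
// collapse to -mean·sum because mean = sum/n:
//
//	var clm clientLatencyMeasurement
//	for _, d := range []float64{0.1, 0.2, 0.4} {
//		clm.update(d)
//	}
//	stats := clm.getStats() // mean ≈ 0.233, stdDev ≈ 0.125, cv ≈ 0.535
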
// intervalMetricAvg computes, for the given priority level, the average request
// execution time and seat utilization over the interval between two snapshots,
// using the differences of the cumulative sums and counts.
func intervalMetricAvg(snapshot0, snapshot1 metricSnapshot, plLabel string) plMetricAvg {
	plmT0 := snapshot0[plLabel]
	plmT1 := snapshot1[plLabel]
	return plMetricAvg{
		reqExecution: (plmT1.execSeconds.Sum - plmT0.execSeconds.Sum) / float64(plmT1.execSeconds.Count-plmT0.execSeconds.Count),
		seatUtil:     (plmT1.seatUtil.Sum - plmT0.seatUtil.Sum) / float64(plmT1.seatUtil.Count-plmT0.seatUtil.Count),
	}
}

type noxuDelayingAuthorizer struct {
	Authorizer authorizer.Authorizer
}

func (d *noxuDelayingAuthorizer) Authorize(ctx context.Context, a authorizer.Attributes) (authorizer.Decision, string, error) {
	if a.GetUser().GetName() == "noxu1" || a.GetUser().GetName() == "noxu2" {
		time.Sleep(fakeworkDuration) // simulate fake work with sleep
	}
	return d.Authorizer.Authorize(ctx, a)
}

// TestConcurrencyIsolation tests the concurrency isolation between priority levels.
// The test defines two priority levels for this purpose, and corresponding flow schemas.
// To one priority level, this test sends many more concurrent requests than the configuration
// allows to execute at once, while sending fewer than allowed to the other priority level.
// The primary check is that the low flow gets all the seats it wants, but it is modulated by
// recognizing that there are uncontrolled overheads in the system.
//
// This test differs from TestPriorityLevelIsolation, which checks throughput rather than
// concurrency. To mitigate the effects of system noise, a delaying authorizer is used to
// artificially increase request execution time, making the noise relatively insignificant.
// Secondarily, this test also checks the observed seat utilizations against values derived
// from expecting that the throughput observed by the client equals the execution throughput
// observed by the server.
func TestConcurrencyIsolation(t *testing.T) {
	_, ctx := ktesting.NewTestContext(t)
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	_, kubeConfig, closeFn := framework.StartTestServer(ctx, t, framework.TestServerSetup{
		ModifyServerRunOptions: func(opts *options.ServerRunOptions) {
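			// The two in-flight limits below determine the total seat supply that
			// APF divides among priority levels in proportion to their nominal
			// concurrency shares; here the supply is 10 + 10 = 20 seats.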
			// Ensure all clients are allowed to send requests.
			opts.Authorization.Modes = []string{"AlwaysAllow"}
			opts.GenericServerRunOptions.MaxRequestsInFlight = 10
			opts.GenericServerRunOptions.MaxMutatingRequestsInFlight = 10
		},
		ModifyServerConfig: func(config *controlplane.Config) {
			// Wrap the default authorizer with one that delays requests from the noxu clients.
			config.GenericConfig.Authorization.Authorizer = &noxuDelayingAuthorizer{config.GenericConfig.Authorization.Authorizer}
		},
	})
	defer closeFn()

	loopbackClient := clientset.NewForConfigOrDie(kubeConfig)
	noxu1Client := getClientFor(kubeConfig, "noxu1")
	noxu2Client := getClientFor(kubeConfig, "noxu2")

	queueLength := 50
	concurrencyShares := 100

	plNoxu1, _, err := createPriorityLevelAndBindingFlowSchemaForUser(
		loopbackClient, "noxu1", concurrencyShares, queueLength)
	if err != nil {
		t.Error(err)
	}
	plNoxu2, _, err := createPriorityLevelAndBindingFlowSchemaForUser(
		loopbackClient, "noxu2", concurrencyShares, queueLength)
	if err != nil {
		t.Error(err)
	}

	stopCh := make(chan struct{})
	wg := sync.WaitGroup{}

	// "elephant": oversubscribes its priority level with more concurrent requests than it has seats.
	noxu1NumGoroutines := 5 + queueLength
	var noxu1LatMeasure clientLatencyMeasurement
	wg.Add(noxu1NumGoroutines)
	streamRequests(noxu1NumGoroutines, func() {
		start := time.Now()
		_, err := noxu1Client.CoreV1().Namespaces().Get(ctx, "default", metav1.GetOptions{})
		duration := time.Since(start).Seconds()
		noxu1LatMeasure.update(duration)
		if err != nil {
			t.Error(err)
		}
	}, &wg, stopCh)
	// "mouse": asks for less concurrency than its priority level has seats.
	noxu2NumGoroutines := 3
	var noxu2LatMeasure clientLatencyMeasurement
	wg.Add(noxu2NumGoroutines)
	streamRequests(noxu2NumGoroutines, func() {
		start := time.Now()
		_, err := noxu2Client.CoreV1().Namespaces().Get(ctx, "default", metav1.GetOptions{})
		duration := time.Since(start).Seconds()
		noxu2LatMeasure.update(duration)
		if err != nil {
			t.Error(err)
		}
	}, &wg, stopCh)

	// Warm up
	time.Sleep(testWarmUpTime)

	noxu1LatMeasure.reset()
	noxu2LatMeasure.reset()
	snapshot0, err := getRequestMetricsSnapshot(loopbackClient)
	if err != nil {
		t.Error(err)
	}
	time.Sleep(testTime) // after warming up, the test enters a steady state
	snapshot1, err := getRequestMetricsSnapshot(loopbackClient)
	if err != nil {
		t.Error(err)
	}
	close(stopCh)

	// Check the assumptions of the test
	noxu1T0 := snapshot0[plNoxu1.Name]
	noxu1T1 := snapshot1[plNoxu1.Name]
	noxu2T0 := snapshot0[plNoxu2.Name]
	noxu2T1 := snapshot1[plNoxu2.Name]
	if noxu1T0.seatUtil.Count >= noxu1T1.seatUtil.Count || noxu2T0.seatUtil.Count >= noxu2T1.seatUtil.Count {
		t.Errorf("SeatUtilCount check failed: noxu1 t0 count %d, t1 count %d; noxu2 t0 count %d, t1 count %d",
			noxu1T0.seatUtil.Count, noxu1T1.seatUtil.Count, noxu2T0.seatUtil.Count, noxu2T1.seatUtil.Count)
	}
	t.Logf("noxu1 priority level concurrency limit: %d", noxu1T0.availableSeats)
	t.Logf("noxu2 priority level concurrency limit: %d", noxu2T0.availableSeats)
	if (noxu1T0.availableSeats != noxu1T1.availableSeats) || (noxu2T0.availableSeats != noxu2T1.availableSeats) {
		t.Errorf("The number of available seats changed: noxu1 (%d, %d) noxu2 (%d, %d)",
			noxu1T0.availableSeats, noxu1T1.availableSeats, noxu2T0.availableSeats, noxu2T1.availableSeats)
	}
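	// availableSeats is populated from the apiserver_flowcontrol_nominal_limit_seats
	// gauge (see getRequestMetricsSnapshot below), so the check above confirms the
	// configuration stayed stable over the interval, and the check below confirms
	// each test priority level was apportioned a usable number of seats.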
	if (noxu1T0.availableSeats <= 4) || (noxu2T0.availableSeats <= 4) {
		t.Errorf("The number of available seats for the test client priority levels is too small: (%d, %d). Expecting a number > 4",
			noxu1T0.availableSeats, noxu2T0.availableSeats)
	}
	// No requests should be rejected under normal circumstances.
	_, rejectedReqCounts, err := getRequestCountOfPriorityLevel(loopbackClient)
	if err != nil {
		t.Error(err)
	}
	if rejectedReqCounts[plNoxu1.Name] > 0 {
		t.Errorf(`%d requests from the "elephant" stream were rejected unexpectedly`, rejectedReqCounts[plNoxu1.Name])
	}
	if rejectedReqCounts[plNoxu2.Name] > 0 {
		t.Errorf(`%d requests from the "mouse" stream were rejected unexpectedly`, rejectedReqCounts[plNoxu2.Name])
	}

	// Calculate the APF server-side metric averages during the test interval.
	noxu1Avg := intervalMetricAvg(snapshot0, snapshot1, plNoxu1.Name)
	noxu2Avg := intervalMetricAvg(snapshot0, snapshot1, plNoxu2.Name)
	t.Logf("\nnoxu1 avg request execution time %v\nnoxu2 avg request execution time %v", noxu1Avg.reqExecution, noxu2Avg.reqExecution)
	t.Logf("\nnoxu1 avg seat utilization %v\nnoxu2 avg seat utilization %v", noxu1Avg.seatUtil, noxu2Avg.seatUtil)

	// Wait until the client goroutines finish before computing the client-side request latency statistics.
	wg.Wait()
	noxu1LatStats := noxu1LatMeasure.getStats()
	noxu2LatStats := noxu2LatMeasure.getStats()
	t.Logf("noxu1 client request count %d duration mean %v stddev %v cv %v", noxu1LatMeasure.Count, noxu1LatStats.mean, noxu1LatStats.stdDev, noxu1LatStats.cv)
	t.Logf("noxu2 client request count %d duration mean %v stddev %v cv %v", noxu2LatMeasure.Count, noxu2LatStats.mean, noxu2LatStats.stdDev, noxu2LatStats.cv)

	// Calculate the server-side observed concurrency.
	noxu1ObservedConcurrency := noxu1Avg.seatUtil * float64(noxu1T0.availableSeats)
	noxu2ObservedConcurrency := noxu2Avg.seatUtil * float64(noxu2T0.availableSeats)
	// Expected concurrency is derived from the assumption that client-side and server-side
	// throughput are equal (Little's law): each stream completes numGoroutines/meanLatency
	// requests per second, and concurrency = throughput * average execution time.
	noxu1ExpectedConcurrency := float64(noxu1NumGoroutines) * noxu1Avg.reqExecution / noxu1LatStats.mean
	noxu2ExpectedConcurrency := float64(noxu2NumGoroutines) * noxu2Avg.reqExecution / noxu2LatStats.mean
	t.Logf("Concurrency of noxu1:noxu2 - expected (%v:%v), observed (%v:%v)", noxu1ExpectedConcurrency, noxu2ExpectedConcurrency, noxu1ObservedConcurrency, noxu2ObservedConcurrency)

	// There are uncontrolled overheads that introduce noise into the system. The coefficient
	// of variation (CV), that is, standard deviation divided by mean, for a class of traffic
	// is a characterization of all the noise that applied to that class. We found that noxu1
	// generally had a much bigger CV than noxu2. This makes sense, because noxu1 probes more
	// behavior, namely the waiting in queues. So we take the minimum of the two as an
	// indicator of the relative amount of noise that comes from all the other behavior.
	// Currently, we use 2 times the experienced coefficient of variation as the margin of error.
	margin := 2 * math.Min(noxu1LatStats.cv, noxu2LatStats.cv)
	t.Logf("Error margin is %v", margin)

	isConcurrencyExpected := func(name string, observed float64, expected float64) bool {
		relativeErr := math.Abs(expected-observed) / expected
		t.Logf("%v relative error is %v", name, relativeErr)
		return relativeErr <= margin
	}
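	// Worked example with hypothetical numbers: if the mouse's 3 goroutines see a mean
	// client latency of 0.6s while the server measures 0.4s of execution per request,
	// the stream sustains 3/0.6 = 5 requests/s and should occupy 5 * 0.4 = 2 seats on
	// average, which matches 3 * 0.4 / 0.6 as computed above.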
	if !isConcurrencyExpected(plNoxu1.Name, noxu1ObservedConcurrency, noxu1ExpectedConcurrency) {
		t.Errorf("Concurrency observed by noxu1 is off. Expected: %v, observed: %v", noxu1ExpectedConcurrency, noxu1ObservedConcurrency)
	}
	if !isConcurrencyExpected(plNoxu2.Name, noxu2ObservedConcurrency, noxu2ExpectedConcurrency) {
		t.Errorf("Concurrency observed by noxu2 is off. Expected: %v, observed: %v", noxu2ExpectedConcurrency, noxu2ObservedConcurrency)
	}

	// Check the server-side APF seat utilization measurements.
	if math.Abs(1-noxu1Avg.seatUtil) > 0.05 {
		t.Errorf("noxu1Avg.seatUtil=%v is too far from expected=1.0", noxu1Avg.seatUtil)
	}
	noxu2ExpectedSeatUtil := float64(noxu2NumGoroutines) / float64(noxu2T0.availableSeats)
	if math.Abs(noxu2ExpectedSeatUtil-noxu2Avg.seatUtil) > 0.05 {
		t.Errorf("noxu2Avg.seatUtil=%v is too far from expected=%v", noxu2Avg.seatUtil, noxu2ExpectedSeatUtil)
	}
}

func getRequestMetricsSnapshot(c clientset.Interface) (metricSnapshot, error) {
	resp, err := getMetrics(c)
	if err != nil {
		return nil, err
	}

	dec := expfmt.NewDecoder(strings.NewReader(string(resp)), expfmt.FmtText)
	decoder := expfmt.SampleDecoder{
		Dec:  dec,
		Opts: &expfmt.DecodeOptions{},
	}

	snapshot := metricSnapshot{}

	for {
		var v model.Vector
		if err := decoder.Decode(&v); err != nil {
			if err == io.EOF {
				// Expected loop termination condition.
				return snapshot, nil
			}
			return nil, fmt.Errorf("failed decoding metrics: %v", err)
		}
		for _, metric := range v {
			plLabel := string(metric.Metric[labelPriorityLevel])
			entry := plMetrics{}
			if v, ok := snapshot[plLabel]; ok {
				entry = v
			}
			switch name := string(metric.Metric[model.MetricNameLabel]); name {
			case requestExecutionSecondsSumName:
				entry.execSeconds.Sum = float64(metric.Value)
			case requestExecutionSecondsCountName:
				entry.execSeconds.Count = int(metric.Value)
			case priorityLevelSeatUtilSumName:
				entry.seatUtil.Sum = float64(metric.Value)
			case priorityLevelSeatUtilCountName:
				entry.seatUtil.Count = int(metric.Value)
			case nominalConcurrencyLimitMetricsName:
				entry.availableSeats = int(metric.Value)
			}
			snapshot[plLabel] = entry
		}
	}
}

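// For reference, the decoder above turns Prometheus text-format samples such as
// the following (other labels elided; the values are made up)
//
//	apiserver_flowcontrol_nominal_limit_seats{priority_level="noxu1"} 7
//	apiserver_flowcontrol_request_execution_seconds_sum{priority_level="noxu1"} 12.3
//
// into snapshot["noxu1"].availableSeats == 7 and snapshot["noxu1"].execSeconds.Sum == 12.3.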