k8s.io/kubernetes@v1.29.3/test/e2e/network/service_latency.go (about) 1 /* 2 Copyright 2015 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package network 18 19 import ( 20 "context" 21 "fmt" 22 "sort" 23 "strings" 24 "time" 25 26 v1 "k8s.io/api/core/v1" 27 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 "k8s.io/apimachinery/pkg/runtime" 29 "k8s.io/apimachinery/pkg/util/sets" 30 "k8s.io/apimachinery/pkg/watch" 31 "k8s.io/client-go/kubernetes" 32 "k8s.io/client-go/tools/cache" 33 "k8s.io/client-go/util/flowcontrol" 34 "k8s.io/kubernetes/test/e2e/framework" 35 e2erc "k8s.io/kubernetes/test/e2e/framework/rc" 36 "k8s.io/kubernetes/test/e2e/network/common" 37 testutils "k8s.io/kubernetes/test/utils" 38 imageutils "k8s.io/kubernetes/test/utils/image" 39 admissionapi "k8s.io/pod-security-admission/api" 40 41 "github.com/onsi/ginkgo/v2" 42 ) 43 44 type durations []time.Duration 45 46 func (d durations) Len() int { return len(d) } 47 func (d durations) Less(i, j int) bool { return d[i] < d[j] } 48 func (d durations) Swap(i, j int) { d[i], d[j] = d[j], d[i] } 49 50 var _ = common.SIGDescribe("Service endpoints latency", func() { 51 f := framework.NewDefaultFramework("svc-latency") 52 f.NamespacePodSecurityLevel = admissionapi.LevelBaseline 53 54 /* 55 Release: v1.9 56 Testname: Service endpoint latency, thresholds 57 Description: Run 100 iterations of create service with the Pod running the pause image, measure the time it takes for creating the service and the endpoint with the service name is available. These durations are captured for 100 iterations, then the durations are sorted to compute 50th, 90th and 99th percentile. The single server latency MUST not exceed liberally set thresholds of 20s for 50th percentile and 50s for the 90th percentile. 58 */ 59 framework.ConformanceIt("should not be very high", func(ctx context.Context) { 60 const ( 61 // These are very generous criteria. Ideally we will 62 // get this much lower in the future. See issue 63 // #10436. 64 limitMedian = time.Second * 20 65 limitTail = time.Second * 50 66 67 // Numbers chosen to make the test complete in a short amount 68 // of time. This sample size is not actually large enough to 69 // reliably measure tails (it may give false positives, but not 70 // false negatives), but it should catch low hanging fruit. 71 // 72 // Note that these are fixed and do not depend on the 73 // size of the cluster. Setting parallelTrials larger 74 // distorts the measurements. Perhaps this wouldn't be 75 // true on HA clusters. 76 totalTrials = 200 77 parallelTrials = 15 78 minSampleSize = 100 79 80 // Acceptable failure ratio for getting service latencies. 81 acceptableFailureRatio = .05 82 ) 83 84 // Turn off rate limiting--it interferes with our measurements. 85 cfg, err := framework.LoadConfig() 86 if err != nil { 87 framework.Failf("Unable to load config: %v", err) 88 } 89 cfg.RateLimiter = flowcontrol.NewFakeAlwaysRateLimiter() 90 f.ClientSet = kubernetes.NewForConfigOrDie(cfg) 91 92 failing := sets.NewString() 93 d, err := runServiceLatencies(ctx, f, parallelTrials, totalTrials, acceptableFailureRatio) 94 if err != nil { 95 failing.Insert(fmt.Sprintf("Not all RC/pod/service trials succeeded: %v", err)) 96 } 97 dSorted := durations(d) 98 sort.Sort(dSorted) 99 n := len(dSorted) 100 if n < minSampleSize { 101 failing.Insert(fmt.Sprintf("Did not get a good sample size: %v", dSorted)) 102 } 103 if n < 2 { 104 failing.Insert("Less than two runs succeeded; aborting.") 105 framework.Failf(strings.Join(failing.List(), "\n")) 106 } 107 percentile := func(p int) time.Duration { 108 est := n * p / 100 109 if est >= n { 110 return dSorted[n-1] 111 } 112 return dSorted[est] 113 } 114 framework.Logf("Latencies: %v", dSorted) 115 p50 := percentile(50) 116 p90 := percentile(90) 117 p99 := percentile(99) 118 framework.Logf("50 %%ile: %v", p50) 119 framework.Logf("90 %%ile: %v", p90) 120 framework.Logf("99 %%ile: %v", p99) 121 framework.Logf("Total sample count: %v", len(dSorted)) 122 123 if p50 > limitMedian { 124 failing.Insert("Median latency should be less than " + limitMedian.String()) 125 } 126 if p99 > limitTail { 127 failing.Insert("Tail (99 percentile) latency should be less than " + limitTail.String()) 128 } 129 if failing.Len() > 0 { 130 errList := strings.Join(failing.List(), "\n") 131 helpfulInfo := fmt.Sprintf("\n50, 90, 99 percentiles: %v %v %v", p50, p90, p99) 132 framework.Failf(errList + helpfulInfo) 133 } 134 }) 135 }) 136 137 func runServiceLatencies(ctx context.Context, f *framework.Framework, inParallel, total int, acceptableFailureRatio float32) (output []time.Duration, err error) { 138 cfg := testutils.RCConfig{ 139 Client: f.ClientSet, 140 Image: imageutils.GetPauseImageName(), 141 Name: "svc-latency-rc", 142 Namespace: f.Namespace.Name, 143 Replicas: 1, 144 PollInterval: time.Second, 145 } 146 if err := e2erc.RunRC(ctx, cfg); err != nil { 147 return nil, err 148 } 149 150 // Run a single watcher, to reduce the number of API calls we have to 151 // make; this is to minimize the timing error. It's how kube-proxy 152 // consumes the endpoints data, so it seems like the right thing to 153 // test. 154 endpointQueries := newQuerier() 155 startEndpointWatcher(ctx, f, endpointQueries) 156 defer close(endpointQueries.stop) 157 158 // run one test and throw it away-- this is to make sure that the pod's 159 // ready status has propagated. 160 _, err = singleServiceLatency(ctx, f, cfg.Name, endpointQueries) 161 framework.ExpectNoError(err) 162 163 // These channels are never closed, and each attempt sends on exactly 164 // one of these channels, so the sum of the things sent over them will 165 // be exactly total. 166 errs := make(chan error, total) 167 durations := make(chan time.Duration, total) 168 169 blocker := make(chan struct{}, inParallel) 170 for i := 0; i < total; i++ { 171 go func() { 172 defer ginkgo.GinkgoRecover() 173 blocker <- struct{}{} 174 defer func() { <-blocker }() 175 if d, err := singleServiceLatency(ctx, f, cfg.Name, endpointQueries); err != nil { 176 errs <- err 177 } else { 178 durations <- d 179 } 180 }() 181 } 182 183 errCount := 0 184 for i := 0; i < total; i++ { 185 select { 186 case e := <-errs: 187 framework.Logf("Got error: %v", e) 188 errCount++ 189 case d := <-durations: 190 output = append(output, d) 191 } 192 } 193 if errCount != 0 { 194 framework.Logf("Got %d errors out of %d tries", errCount, total) 195 errRatio := float32(errCount) / float32(total) 196 if errRatio > acceptableFailureRatio { 197 return output, fmt.Errorf("error ratio %g is higher than the acceptable ratio %g", errRatio, acceptableFailureRatio) 198 } 199 } 200 return output, nil 201 } 202 203 type endpointQuery struct { 204 endpointsName string 205 endpoints *v1.Endpoints 206 result chan<- struct{} 207 } 208 209 type endpointQueries struct { 210 requests map[string]*endpointQuery 211 212 stop chan struct{} 213 requestChan chan *endpointQuery 214 seenChan chan *v1.Endpoints 215 } 216 217 func newQuerier() *endpointQueries { 218 eq := &endpointQueries{ 219 requests: map[string]*endpointQuery{}, 220 221 stop: make(chan struct{}, 100), 222 requestChan: make(chan *endpointQuery), 223 seenChan: make(chan *v1.Endpoints, 100), 224 } 225 go eq.join() 226 return eq 227 } 228 229 // join merges the incoming streams of requests and added endpoints. It has 230 // nice properties like: 231 // - remembering an endpoint if it happens to arrive before it is requested. 232 // - closing all outstanding requests (returning nil) if it is stopped. 233 func (eq *endpointQueries) join() { 234 defer func() { 235 // Terminate all pending requests, so that no goroutine will 236 // block indefinitely. 237 for _, req := range eq.requests { 238 if req.result != nil { 239 close(req.result) 240 } 241 } 242 }() 243 244 for { 245 select { 246 case <-eq.stop: 247 return 248 case req := <-eq.requestChan: 249 if cur, ok := eq.requests[req.endpointsName]; ok && cur.endpoints != nil { 250 // We've already gotten the result, so we can 251 // immediately satisfy this request. 252 delete(eq.requests, req.endpointsName) 253 req.endpoints = cur.endpoints 254 close(req.result) 255 } else { 256 // Save this request. 257 eq.requests[req.endpointsName] = req 258 } 259 case got := <-eq.seenChan: 260 if req, ok := eq.requests[got.Name]; ok { 261 if req.result != nil { 262 // Satisfy a request. 263 delete(eq.requests, got.Name) 264 req.endpoints = got 265 close(req.result) 266 } 267 // We've already recorded a result, but 268 // haven't gotten the request yet. Only 269 // keep the first result. 270 } else { 271 // We haven't gotten the corresponding request 272 // yet, save this result. 273 eq.requests[got.Name] = &endpointQuery{ 274 endpoints: got, 275 } 276 } 277 } 278 } 279 } 280 281 // request blocks until the requested endpoint is seen. 282 func (eq *endpointQueries) request(endpointsName string) *v1.Endpoints { 283 result := make(chan struct{}) 284 req := &endpointQuery{ 285 endpointsName: endpointsName, 286 result: result, 287 } 288 eq.requestChan <- req 289 <-result 290 return req.endpoints 291 } 292 293 // marks e as added; does not block. 294 func (eq *endpointQueries) added(e *v1.Endpoints) { 295 eq.seenChan <- e 296 } 297 298 // blocks until it has finished syncing. 299 func startEndpointWatcher(ctx context.Context, f *framework.Framework, q *endpointQueries) { 300 _, controller := cache.NewInformer( 301 &cache.ListWatch{ 302 ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { 303 obj, err := f.ClientSet.CoreV1().Endpoints(f.Namespace.Name).List(ctx, options) 304 return runtime.Object(obj), err 305 }, 306 WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) { 307 return f.ClientSet.CoreV1().Endpoints(f.Namespace.Name).Watch(ctx, options) 308 }, 309 }, 310 &v1.Endpoints{}, 311 0, 312 cache.ResourceEventHandlerFuncs{ 313 AddFunc: func(obj interface{}) { 314 if e, ok := obj.(*v1.Endpoints); ok { 315 if len(e.Subsets) > 0 && len(e.Subsets[0].Addresses) > 0 { 316 q.added(e) 317 } 318 } 319 }, 320 UpdateFunc: func(old, cur interface{}) { 321 if e, ok := cur.(*v1.Endpoints); ok { 322 if len(e.Subsets) > 0 && len(e.Subsets[0].Addresses) > 0 { 323 q.added(e) 324 } 325 } 326 }, 327 }, 328 ) 329 330 go controller.Run(q.stop) 331 332 // Wait for the controller to sync, so that we don't count any warm-up time. 333 for !controller.HasSynced() { 334 time.Sleep(100 * time.Millisecond) 335 } 336 } 337 338 func singleServiceLatency(ctx context.Context, f *framework.Framework, name string, q *endpointQueries) (time.Duration, error) { 339 // Make a service that points to that pod. 340 svc := &v1.Service{ 341 ObjectMeta: metav1.ObjectMeta{ 342 GenerateName: "latency-svc-", 343 }, 344 Spec: v1.ServiceSpec{ 345 Ports: []v1.ServicePort{{Protocol: v1.ProtocolTCP, Port: 80}}, 346 Selector: map[string]string{"name": name}, 347 Type: v1.ServiceTypeClusterIP, 348 SessionAffinity: v1.ServiceAffinityNone, 349 }, 350 } 351 startTime := time.Now() 352 gotSvc, err := f.ClientSet.CoreV1().Services(f.Namespace.Name).Create(ctx, svc, metav1.CreateOptions{}) 353 if err != nil { 354 return 0, err 355 } 356 framework.Logf("Created: %v", gotSvc.Name) 357 358 if e := q.request(gotSvc.Name); e == nil { 359 return 0, fmt.Errorf("never got a result for endpoint %v", gotSvc.Name) 360 } 361 stopTime := time.Now() 362 d := stopTime.Sub(startTime) 363 framework.Logf("Got endpoints: %v [%v]", gotSvc.Name, d) 364 return d, nil 365 }