k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/service_creation_latency.go (about)

     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package common
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"net"
    23  	"sync"
    24  	"time"
    25  
    26  	corev1 "k8s.io/api/core/v1"
    27  	"k8s.io/apimachinery/pkg/api/equality"
    28  	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	"k8s.io/apimachinery/pkg/runtime"
    30  	"k8s.io/apimachinery/pkg/watch"
    31  	clientset "k8s.io/client-go/kubernetes"
    32  	"k8s.io/client-go/tools/cache"
    33  	"k8s.io/klog/v2"
    34  
    35  	"k8s.io/apimachinery/pkg/util/wait"
    36  	"k8s.io/perf-tests/clusterloader2/pkg/execservice"
    37  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    38  	measurementutil "k8s.io/perf-tests/clusterloader2/pkg/measurement/util"
    39  	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/checker"
    40  	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/informer"
    41  	"k8s.io/perf-tests/clusterloader2/pkg/measurement/util/workerqueue"
    42  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    43  )
    44  
    45  const (
    46  	serviceCreationLatencyName           = "ServiceCreationLatency"
    47  	serviceCreationLatencyWorkers        = 10
    48  	defaultServiceCreationLatencyTimeout = 10 * time.Minute
    49  	defaultCheckInterval                 = 10 * time.Second
    50  	pingBackoff                          = 1 * time.Second
    51  	pingChecks                           = 3
    52  
    53  	creatingPhase     = "creating"
    54  	ipAssigningPhase  = "ipAssigning"
    55  	reachabilityPhase = "reachability"
    56  	deletingPhase     = "deleting"
    57  	deletedPhase      = "deleted"
    58  )
    59  
    60  func init() {
    61  	if err := measurement.Register(serviceCreationLatencyName, createServiceCreationLatencyMeasurement); err != nil {
    62  		klog.Fatalf("cant register service %v", err)
    63  	}
    64  }
    65  
    66  func createServiceCreationLatencyMeasurement() measurement.Measurement {
    67  	return &serviceCreationLatencyMeasurement{
    68  		selector:      util.NewObjectSelector(),
    69  		queue:         workerqueue.NewWorkerQueue(serviceCreationLatencyWorkers),
    70  		creationTimes: measurementutil.NewObjectTransitionTimes(serviceCreationLatencyName),
    71  		pingCheckers:  checker.NewMap(),
    72  	}
    73  }
    74  
    75  type serviceCreationLatencyMeasurement struct {
    76  	selector      *util.ObjectSelector
    77  	waitTimeout   time.Duration
    78  	stopCh        chan struct{}
    79  	isRunning     bool
    80  	queue         workerqueue.Interface
    81  	client        clientset.Interface
    82  	creationTimes *measurementutil.ObjectTransitionTimes
    83  	pingCheckers  checker.Map
    84  	lock          sync.Mutex
    85  }
    86  
    87  // Execute executes service startup latency measurement actions.
    88  // Services can be specified by field and/or label selectors.
    89  // If namespace is not passed by parameter, all-namespace scope is assumed.
    90  // "start" action starts observation of the services.
    91  // "waitForReady" waits until all services are reachable.
    92  // "waitForDeletion" waits until all services are deleted
    93  // "gather" returns service created latency summary.
    94  // This measurement only works for services with ClusterIP, NodePort and LoadBalancer type.
    95  func (s *serviceCreationLatencyMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
    96  	s.client = config.ClusterFramework.GetClientSets().GetClient()
    97  	action, err := util.GetString(config.Params, "action")
    98  	if err != nil {
    99  		return nil, err
   100  	}
   101  	if !config.ClusterLoaderConfig.ExecServiceConfig.Enable {
   102  		return nil, fmt.Errorf("enable-exec-service flag not enabled")
   103  	}
   104  
   105  	switch action {
   106  	case "start":
   107  		if err := s.selector.Parse(config.Params); err != nil {
   108  			return nil, err
   109  		}
   110  		s.waitTimeout, err = util.GetDurationOrDefault(config.Params, "waitTimeout", defaultServiceCreationLatencyTimeout)
   111  		if err != nil {
   112  			return nil, err
   113  		}
   114  		return nil, s.start()
   115  	case "waitForReady":
   116  		return nil, s.waitForReady()
   117  	case "waitForDeletion":
   118  		return nil, s.waitForDeletion()
   119  	case "gather":
   120  		return s.gather(config.Identifier)
   121  	default:
   122  		return nil, fmt.Errorf("unknown action %v", action)
   123  	}
   124  }
   125  
   126  // Dispose cleans up after the measurement.
   127  func (s *serviceCreationLatencyMeasurement) Dispose() {
   128  	if s.isRunning {
   129  		s.isRunning = false
   130  		close(s.stopCh)
   131  	}
   132  	s.queue.Stop()
   133  	s.lock.Lock()
   134  	defer s.lock.Unlock()
   135  	s.pingCheckers.Dispose()
   136  }
   137  
   138  // String returns a string representation of the metric.
   139  func (s *serviceCreationLatencyMeasurement) String() string {
   140  	return serviceCreationLatencyName + ": " + s.selector.String()
   141  }
   142  
   143  func (s *serviceCreationLatencyMeasurement) start() error {
   144  	if s.isRunning {
   145  		klog.V(2).Infof("%s: service creation latency measurement already running", s)
   146  		return nil
   147  	}
   148  	klog.V(2).Infof("%s: starting service creation latency measurement...", s)
   149  
   150  	s.isRunning = true
   151  	s.stopCh = make(chan struct{})
   152  
   153  	i := informer.NewInformer(
   154  		&cache.ListWatch{
   155  			ListFunc: func(options v1.ListOptions) (runtime.Object, error) {
   156  				s.selector.ApplySelectors(&options)
   157  				return s.client.CoreV1().Services(s.selector.Namespace).List(context.TODO(), options)
   158  			},
   159  			WatchFunc: func(options v1.ListOptions) (watch.Interface, error) {
   160  				s.selector.ApplySelectors(&options)
   161  				return s.client.CoreV1().Services(s.selector.Namespace).Watch(context.TODO(), options)
   162  			},
   163  		},
   164  		func(oldObj, newObj interface{}) {
   165  			f := func() {
   166  				s.handleObject(oldObj, newObj)
   167  			}
   168  			s.queue.Add(&f)
   169  		},
   170  	)
   171  	return informer.StartAndSync(i, s.stopCh, informerSyncTimeout)
   172  }
   173  
   174  func (s *serviceCreationLatencyMeasurement) waitForReady() error {
   175  	return wait.Poll(defaultCheckInterval, s.waitTimeout, func() (bool, error) {
   176  		for _, svcType := range []corev1.ServiceType{corev1.ServiceTypeClusterIP, corev1.ServiceTypeNodePort, corev1.ServiceTypeLoadBalancer} {
   177  			reachable := s.creationTimes.Count(phaseName(reachabilityPhase, svcType))
   178  			created := s.creationTimes.Count(phaseName(creatingPhase, svcType))
   179  			klog.V(2).Infof("%s type %s: %d created, %d reachable", s, svcType, created, reachable)
   180  			if created != reachable {
   181  				return false, nil
   182  			}
   183  		}
   184  		return true, nil
   185  	})
   186  }
   187  
   188  func (s *serviceCreationLatencyMeasurement) waitForDeletion() error {
   189  	return wait.Poll(defaultCheckInterval, s.waitTimeout, func() (bool, error) {
   190  		for _, svcType := range []corev1.ServiceType{corev1.ServiceTypeClusterIP, corev1.ServiceTypeNodePort, corev1.ServiceTypeLoadBalancer} {
   191  			deleted := s.creationTimes.Count(phaseName(deletedPhase, svcType))
   192  			created := s.creationTimes.Count(phaseName(creatingPhase, svcType))
   193  			klog.V(2).Infof("%s type %s: %d created, %d deleted", s, svcType, created, deleted)
   194  			if created != deleted {
   195  				return false, nil
   196  			}
   197  		}
   198  		return true, nil
   199  	})
   200  }
   201  
   202  var serviceCreationTransitions = map[string]measurementutil.Transition{
   203  	"create_to_available_clusterip": {
   204  		From: phaseName(creatingPhase, corev1.ServiceTypeClusterIP),
   205  		To:   phaseName(reachabilityPhase, corev1.ServiceTypeClusterIP),
   206  	},
   207  	"create_to_available_nodeport": {
   208  		From: phaseName(creatingPhase, corev1.ServiceTypeNodePort),
   209  		To:   phaseName(reachabilityPhase, corev1.ServiceTypeNodePort),
   210  	},
   211  	"create_to_assigned_loadbalancer": {
   212  		From: phaseName(creatingPhase, corev1.ServiceTypeLoadBalancer),
   213  		To:   phaseName(ipAssigningPhase, corev1.ServiceTypeLoadBalancer),
   214  	},
   215  	"assigned_to_available_loadbalancer": {
   216  		From: phaseName(ipAssigningPhase, corev1.ServiceTypeLoadBalancer),
   217  		To:   phaseName(reachabilityPhase, corev1.ServiceTypeLoadBalancer),
   218  	},
   219  	"create_to_available_loadbalancer": {
   220  		From: phaseName(creatingPhase, corev1.ServiceTypeLoadBalancer),
   221  		To:   phaseName(reachabilityPhase, corev1.ServiceTypeLoadBalancer),
   222  	},
   223  	"delete_loadbalancer": {
   224  		From: phaseName(deletingPhase, corev1.ServiceTypeLoadBalancer),
   225  		To:   phaseName(deletedPhase, corev1.ServiceTypeLoadBalancer),
   226  	},
   227  }
   228  
   229  func (s *serviceCreationLatencyMeasurement) gather(identifier string) ([]measurement.Summary, error) {
   230  	klog.V(2).Infof("%s: gathering service created latency measurement...", s)
   231  	if !s.isRunning {
   232  		return nil, fmt.Errorf("metric %s has not been started", s)
   233  	}
   234  
   235  	// NOTE: For ClusterIP or NodePort type of service, the cluster ip or node port is assigned as part of service creation API call, so the ipAssigning phase is no sense.
   236  	serviceCreationLatency := s.creationTimes.CalculateTransitionsLatency(serviceCreationTransitions, measurementutil.MatchAll)
   237  
   238  	content, err := util.PrettyPrintJSON(measurementutil.LatencyMapToPerfData(serviceCreationLatency))
   239  	if err != nil {
   240  		return nil, err
   241  	}
   242  	summary := measurement.CreateSummary(fmt.Sprintf("%s_%s", serviceCreationLatencyName, identifier), "json", content)
   243  	return []measurement.Summary{summary}, nil
   244  }
   245  
   246  func (s *serviceCreationLatencyMeasurement) handleObject(oldObj, newObj interface{}) {
   247  	var oldService *corev1.Service
   248  	var newService *corev1.Service
   249  	var ok bool
   250  	oldService, ok = oldObj.(*corev1.Service)
   251  	if oldObj != nil && !ok {
   252  		klog.Errorf("%s: uncastable old object: %v", s, oldObj)
   253  		return
   254  	}
   255  	newService, ok = newObj.(*corev1.Service)
   256  	if newObj != nil && !ok {
   257  		klog.Errorf("%s: uncastable new object: %v", s, newObj)
   258  		return
   259  	}
   260  	if isEqual := oldService != nil &&
   261  		newService != nil &&
   262  		equality.Semantic.DeepEqual(oldService.Spec, newService.Spec) &&
   263  		equality.Semantic.DeepEqual(oldService.Status, newService.Status); isEqual {
   264  		return
   265  	}
   266  
   267  	// TODO(#680): Make it thread-safe.
   268  	if !s.isRunning {
   269  		return
   270  	}
   271  	if newObj == nil {
   272  		if err := s.deleteObject(oldService); err != nil {
   273  			klog.Errorf("%s: delete checker error: %v", s, err)
   274  		}
   275  		return
   276  	}
   277  	if err := s.updateObject(newService); err != nil {
   278  		klog.Errorf("%s: create checker error: %v", s, err)
   279  	}
   280  }
   281  
   282  func (s *serviceCreationLatencyMeasurement) deleteObject(svc *corev1.Service) error {
   283  	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(svc)
   284  	if err != nil {
   285  		return fmt.Errorf("meta key created error: %v", err)
   286  	}
   287  	s.lock.Lock()
   288  	defer s.lock.Unlock()
   289  	s.creationTimes.Set(key, phaseName(deletingPhase, svc.Spec.Type), svc.ObjectMeta.DeletionTimestamp.Time)
   290  	s.creationTimes.Set(key, phaseName(deletedPhase, svc.Spec.Type), time.Now())
   291  	s.pingCheckers.DeleteAndStop(key)
   292  	return nil
   293  }
   294  
   295  func (s *serviceCreationLatencyMeasurement) updateObject(svc *corev1.Service) error {
   296  	// This measurement only works for services with ClusterIP, NodePort and LoadBalancer type.
   297  	if svc.Spec.Type != corev1.ServiceTypeClusterIP && svc.Spec.Type != corev1.ServiceTypeNodePort && svc.Spec.Type != corev1.ServiceTypeLoadBalancer {
   298  		return nil
   299  	}
   300  	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(svc)
   301  	if err != nil {
   302  		return fmt.Errorf("meta key created error: %v", err)
   303  	}
   304  	if _, exists := s.creationTimes.Get(key, phaseName(creatingPhase, svc.Spec.Type)); !exists {
   305  		s.creationTimes.Set(key, phaseName(creatingPhase, svc.Spec.Type), svc.CreationTimestamp.Time)
   306  	}
   307  	if svc.Spec.Type == corev1.ServiceTypeLoadBalancer && len(svc.Status.LoadBalancer.Ingress) < 1 {
   308  		return nil
   309  	}
   310  	// NOTE: For ClusterIP or NodePort type of service, the cluster ip or node port is assigned as part of service creation API call, so the ipAssigning phase is no sense.
   311  	if svc.Spec.Type == corev1.ServiceTypeLoadBalancer {
   312  		if _, exists := s.creationTimes.Get(key, phaseName(ipAssigningPhase, svc.Spec.Type)); exists {
   313  			return nil
   314  		}
   315  		s.creationTimes.Set(key, phaseName(ipAssigningPhase, svc.Spec.Type), time.Now())
   316  	}
   317  	pc := &pingChecker{
   318  		callerName:    s.String(),
   319  		svc:           svc,
   320  		creationTimes: s.creationTimes,
   321  		stopCh:        make(chan struct{}),
   322  	}
   323  	pc.run()
   324  	s.lock.Lock()
   325  	defer s.lock.Unlock()
   326  	s.pingCheckers.Add(key, pc)
   327  
   328  	return nil
   329  }
   330  
   331  func phaseName(phase string, serviceType corev1.ServiceType) string {
   332  	return fmt.Sprintf("%s_%s", phase, serviceType)
   333  }
   334  
   335  type pingChecker struct {
   336  	callerName    string
   337  	svc           *corev1.Service
   338  	creationTimes *measurementutil.ObjectTransitionTimes
   339  	stopCh        chan struct{}
   340  }
   341  
   342  func (p *pingChecker) run() {
   343  	key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(p.svc)
   344  	if err != nil {
   345  		klog.Errorf("%s: meta key created error: %v", p.callerName, err)
   346  		return
   347  	}
   348  	success := 0
   349  	for {
   350  		select {
   351  		case <-p.stopCh:
   352  			return
   353  		default:
   354  			// TODO(#685): Make ping checks less communication heavy.
   355  			pod, err := execservice.GetPod()
   356  			if err != nil {
   357  				klog.Warningf("call to execservice.GetPod() ended with error: %v", err)
   358  				success = 0
   359  				time.Sleep(pingBackoff)
   360  				continue
   361  			}
   362  			var ips []string
   363  			var port int32
   364  			switch p.svc.Spec.Type {
   365  			case corev1.ServiceTypeClusterIP:
   366  				ips = p.svc.Spec.ClusterIPs
   367  				port = p.svc.Spec.Ports[0].Port
   368  			case corev1.ServiceTypeNodePort:
   369  				ips = []string{pod.Status.HostIP}
   370  				port = p.svc.Spec.Ports[0].NodePort
   371  			case corev1.ServiceTypeLoadBalancer:
   372  				for _, ingress := range p.svc.Status.LoadBalancer.Ingress {
   373  					ips = append(ips, ingress.IP)
   374  				}
   375  				port = p.svc.Spec.Ports[0].Port
   376  			}
   377  			for _, ip := range ips {
   378  				address := net.JoinHostPort(ip, fmt.Sprint(port))
   379  				command := fmt.Sprintf("curl %s", address)
   380  				_, err = execservice.RunCommand(context.TODO(), pod, command)
   381  				if err != nil {
   382  					break
   383  				}
   384  			}
   385  			if err != nil {
   386  				success = 0
   387  				time.Sleep(pingBackoff)
   388  				continue
   389  			}
   390  			success++
   391  			if success == pingChecks {
   392  				p.creationTimes.Set(key, phaseName(reachabilityPhase, p.svc.Spec.Type), time.Now())
   393  				return
   394  			}
   395  		}
   396  	}
   397  }
   398  
   399  func (p *pingChecker) Stop() {
   400  	close(p.stopCh)
   401  }