k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/api_availability_measurement.go (about)

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package common
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"net/http"
    23  	"strconv"
    24  	"sync"
    25  	"time"
    26  
    27  	clientset "k8s.io/client-go/kubernetes"
    28  	"k8s.io/klog/v2"
    29  	"k8s.io/perf-tests/clusterloader2/pkg/errors"
    30  	"k8s.io/perf-tests/clusterloader2/pkg/execservice"
    31  	"k8s.io/perf-tests/clusterloader2/pkg/measurement"
    32  	"k8s.io/perf-tests/clusterloader2/pkg/provider"
    33  	"k8s.io/perf-tests/clusterloader2/pkg/util"
    34  )
    35  
    36  const (
    37  	apiAvailabilityMeasurementName = "APIAvailability"
    38  )
    39  
    40  func init() {
    41  	if err := measurement.Register(apiAvailabilityMeasurementName, createAPIAvailabilityMeasurement); err != nil {
    42  		klog.Fatalf("Cannot register %s: %v", apiAvailabilityMeasurementName, err)
    43  	}
    44  }
    45  
    46  func createAPIAvailabilityMeasurement() measurement.Measurement {
    47  	return &apiAvailabilityMeasurement{}
    48  }
    49  
    50  type apiAvailabilityMeasurement struct {
    51  	isRunning           bool
    52  	isPaused            bool
    53  	pauseCh             chan struct{}
    54  	unpauseCh           chan struct{}
    55  	stopCh              chan struct{}
    56  	pollFrequency       time.Duration
    57  	hostIPs             []string
    58  	summaries           []measurement.Summary
    59  	clusterLevelMetrics *apiAvailabilityMetrics
    60  	threshold           float64
    61  	// Should we check public IPs of the host VMs
    62  	useHostPublicIPs bool
    63  	// Should we check internal IPs of the host VMs
    64  	useHostInternalIPs bool
    65  	// Metrics per host internal IP.
    66  	hostLevelMetrics           map[string]*apiAvailabilityMetrics
    67  	hostPollTimeoutSeconds     int
    68  	hostPollExecTimeoutSeconds int
    69  	wg                         sync.WaitGroup
    70  	lock                       sync.Mutex
    71  }
    72  
    73  func (a *apiAvailabilityMeasurement) updateHostAvailabilityMetrics(c clientset.Interface, provider provider.Provider) {
    74  	wg := sync.WaitGroup{}
    75  	wg.Add(len(a.hostIPs))
    76  	mu := sync.Mutex{}
    77  	for _, ip := range a.hostIPs {
    78  		ip := ip
    79  		go func() {
    80  			defer wg.Done()
    81  			statusCode, err := a.pollHost(ip)
    82  			availability := statusCode == strconv.Itoa(http.StatusOK)
    83  			if err != nil {
    84  				klog.Warningf("execservice issue: %s", err.Error())
    85  			}
    86  			if !availability {
    87  				klog.Warningf("host %s not available; HTTP status code: %s", ip, statusCode)
    88  			}
    89  			mu.Lock()
    90  			defer mu.Unlock()
    91  			a.hostLevelMetrics[ip].update(availability)
    92  		}()
    93  	}
    94  	wg.Wait()
    95  }
    96  
    97  func (a *apiAvailabilityMeasurement) pollHost(hostIP string) (string, error) {
    98  	pod, err := execservice.GetPod()
    99  	if err != nil {
   100  		return "", fmt.Errorf("problem with GetPod(): %w", err)
   101  	}
   102  	cmd := fmt.Sprintf("curl --connect-timeout %d -s -k -w \"%%{http_code}\" -o /dev/null https://%s:443/readyz", a.hostPollTimeoutSeconds, hostIP)
   103  	ctx, cancel := context.WithTimeout(context.Background(), time.Duration(a.hostPollExecTimeoutSeconds)*time.Second)
   104  	defer cancel()
   105  	output, err := execservice.RunCommand(ctx, pod, cmd)
   106  	if err != nil {
   107  		return "", fmt.Errorf("problem with RunCommand(): output=%q, err=%w", output, err)
   108  	}
   109  	return output, nil
   110  }
   111  
   112  func (a *apiAvailabilityMeasurement) updateClusterAvailabilityMetrics(c clientset.Interface) {
   113  	result := c.CoreV1().RESTClient().Get().AbsPath("/readyz").Do(context.Background())
   114  	status := 0
   115  	result.StatusCode(&status)
   116  	availability := status == http.StatusOK
   117  	if !availability {
   118  		klog.Warningf("cluster not available; HTTP status code: %d", status)
   119  	}
   120  	a.clusterLevelMetrics.update(availability)
   121  }
   122  
   123  func (a *apiAvailabilityMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) {
   124  	action, err := util.GetString(config.Params, "action")
   125  	if err != nil {
   126  		return nil, err
   127  	}
   128  
   129  	a.lock.Lock()
   130  	defer a.lock.Unlock()
   131  
   132  	switch action {
   133  	case "start":
   134  		return nil, a.start(config)
   135  	case "pause":
   136  		a.pause()
   137  		return nil, nil
   138  	case "unpause":
   139  		a.unpause()
   140  		return nil, nil
   141  	case "gather":
   142  		return a.gather()
   143  	default:
   144  		return nil, fmt.Errorf("unknown action %v", action)
   145  	}
   146  }
   147  
   148  func (a *apiAvailabilityMeasurement) start(config *measurement.Config) error {
   149  	if a.isRunning {
   150  		klog.V(2).Infof("%s: measurement already running", a)
   151  		return nil
   152  	}
   153  	if err := a.initFields(config); err != nil {
   154  		return err
   155  	}
   156  	k8sClient := config.ClusterFramework.GetClientSets().GetClient()
   157  	provider := config.ClusterFramework.GetClusterConfig().Provider
   158  	a.wg.Add(1)
   159  
   160  	go func() {
   161  		defer a.wg.Done()
   162  		for {
   163  			if a.isPaused {
   164  				select {
   165  				case <-a.unpauseCh:
   166  					a.isPaused = false
   167  				case <-a.stopCh:
   168  					return
   169  				}
   170  			}
   171  			select {
   172  			case <-a.pauseCh:
   173  				a.isPaused = true
   174  			case <-time.After(a.pollFrequency):
   175  				a.updateClusterAvailabilityMetrics(k8sClient)
   176  				if a.hostLevelAvailabilityEnabled() {
   177  					a.updateHostAvailabilityMetrics(k8sClient, provider)
   178  				}
   179  			case <-a.stopCh:
   180  				return
   181  			}
   182  		}
   183  	}()
   184  	return nil
   185  }
   186  
   187  func (a *apiAvailabilityMeasurement) initFields(config *measurement.Config) (err error) {
   188  	a.isRunning = true
   189  	a.stopCh = make(chan struct{})
   190  	a.pauseCh = make(chan struct{})
   191  	a.unpauseCh = make(chan struct{})
   192  	frequency, err := util.GetDuration(config.Params, "pollFrequency")
   193  	if err != nil {
   194  		return err
   195  	}
   196  	a.pollFrequency = frequency
   197  
   198  	threshold, err := util.GetFloat64OrDefault(config.Params, "threshold", 0.0)
   199  	if err != nil {
   200  		return err
   201  	}
   202  	a.threshold = threshold
   203  
   204  	a.clusterLevelMetrics = &apiAvailabilityMetrics{}
   205  
   206  	a.useHostInternalIPs, err = util.GetBoolOrDefault(config.Params, "useHostInternalIPs", config.ClusterLoaderConfig.ExecServiceConfig.Enable && len(config.ClusterFramework.GetClusterConfig().MasterInternalIPs) != 0)
   207  	if err != nil {
   208  		return err
   209  	}
   210  	a.useHostPublicIPs, err = util.GetBoolOrDefault(config.Params, "useHostPublicIPs", false)
   211  	if err != nil {
   212  		return err
   213  	}
   214  	if a.useHostInternalIPs || a.useHostPublicIPs {
   215  		err = a.addHostIPs(config)
   216  		if err != nil {
   217  			return err
   218  		}
   219  	}
   220  	return nil
   221  }
   222  
   223  func (a *apiAvailabilityMeasurement) addHostIPs(config *measurement.Config) error {
   224  	if config.ClusterLoaderConfig.ExecServiceConfig.Enable {
   225  		if a.useHostInternalIPs {
   226  			if len(config.ClusterFramework.GetClusterConfig().MasterInternalIPs) == 0 {
   227  				return fmt.Errorf("%s: host internal IP(s) are not provided, cannot measure availability through internal IPs", a)
   228  			}
   229  			a.hostIPs = config.ClusterFramework.GetClusterConfig().MasterInternalIPs
   230  		}
   231  		if a.useHostPublicIPs {
   232  			if len(config.ClusterFramework.GetClusterConfig().MasterIPs) == 0 {
   233  				return fmt.Errorf("%s: host public IP(s) are not provided, cannot measure availability through public IPs", a)
   234  			}
   235  			a.hostIPs = append(a.hostIPs, config.ClusterFramework.GetClusterConfig().MasterIPs...)
   236  		}
   237  		a.hostLevelMetrics = map[string]*apiAvailabilityMetrics{}
   238  		for _, ip := range a.hostIPs {
   239  			a.hostLevelMetrics[ip] = &apiAvailabilityMetrics{}
   240  		}
   241  		hostPollTimeoutSeconds, err := util.GetIntOrDefault(config.Params, "hostPollTimeoutSeconds", 5)
   242  		if err != nil {
   243  			return err
   244  		}
   245  		a.hostPollTimeoutSeconds = hostPollTimeoutSeconds
   246  		hostPollExecTimeoutSeconds, err := util.GetIntOrDefault(config.Params, "hostPollExecTimeoutSeconds", 10)
   247  		if err != nil {
   248  			return err
   249  		}
   250  		a.hostPollExecTimeoutSeconds = hostPollExecTimeoutSeconds
   251  	} else {
   252  		return fmt.Errorf("%s: exec service is not enabled, cannot measure availability through host IPs", a)
   253  	}
   254  	return nil
   255  }
   256  
   257  func (a *apiAvailabilityMeasurement) hostLevelAvailabilityEnabled() bool {
   258  	return len(a.hostLevelMetrics) > 0
   259  }
   260  
   261  func (a *apiAvailabilityMeasurement) pause() {
   262  	if !a.isRunning {
   263  		klog.V(2).Infof("%s: measurement is not running", a)
   264  		return
   265  	}
   266  	if a.isPaused {
   267  		klog.Warningf("%s: measurement already paused", a)
   268  		return
   269  	}
   270  	a.pauseCh <- struct{}{}
   271  	klog.V(2).Infof("%s: pausing the measurement (stopping checking the availability)", a)
   272  }
   273  
   274  func (a *apiAvailabilityMeasurement) unpause() {
   275  	if !a.isRunning {
   276  		klog.V(2).Infof("%s: measurement is not running", a)
   277  		return
   278  	}
   279  	if !a.isPaused {
   280  		klog.Warningf("%s: measurement already unpaused", a)
   281  		return
   282  	}
   283  	a.unpauseCh <- struct{}{}
   284  	klog.V(2).Infof("%s: unpausing the measurement", a)
   285  }
   286  
   287  func (a *apiAvailabilityMeasurement) gather() ([]measurement.Summary, error) {
   288  	if !a.isRunning {
   289  		return nil, nil
   290  	}
   291  	close(a.stopCh)
   292  	a.wg.Wait()
   293  	a.isRunning = false
   294  	klog.V(2).Infof("%s: gathering summaries", apiAvailabilityMeasurementName)
   295  
   296  	output := apiAvailabilityOutput{
   297  		ClusterSummary: createClusterSummary(a.clusterLevelMetrics, a.pollFrequency),
   298  	}
   299  	if a.hostLevelAvailabilityEnabled() {
   300  		output.HostSummaries = createHostSummary(a.hostLevelMetrics, a.hostIPs, a.pollFrequency)
   301  	}
   302  
   303  	content, err := util.PrettyPrintJSON(output)
   304  	if err != nil {
   305  		return nil, err
   306  	}
   307  	summary := measurement.CreateSummary(apiAvailabilityMeasurementName, "json", content)
   308  	a.summaries = append(a.summaries, summary)
   309  	if sli := output.ClusterSummary.AvailabilityPercentage; sli < a.threshold {
   310  		err = errors.NewMetricViolationError("API availability", fmt.Sprintf("SLO not fulfilled (expected >= %.2f, got: %.2f)", a.threshold, sli))
   311  	}
   312  	return a.summaries, err
   313  }
   314  
   315  func (a *apiAvailabilityMeasurement) Dispose() {}
   316  
   317  func (a *apiAvailabilityMeasurement) String() string {
   318  	return apiAvailabilityMeasurementName
   319  }