k8s.io/perf-tests/clusterloader2@v0.0.0-20240304094227-64bdb12da87e/pkg/measurement/common/api_availability_measurement.go (about) 1 /* 2 Copyright 2020 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package common 18 19 import ( 20 "context" 21 "fmt" 22 "net/http" 23 "strconv" 24 "sync" 25 "time" 26 27 clientset "k8s.io/client-go/kubernetes" 28 "k8s.io/klog/v2" 29 "k8s.io/perf-tests/clusterloader2/pkg/errors" 30 "k8s.io/perf-tests/clusterloader2/pkg/execservice" 31 "k8s.io/perf-tests/clusterloader2/pkg/measurement" 32 "k8s.io/perf-tests/clusterloader2/pkg/provider" 33 "k8s.io/perf-tests/clusterloader2/pkg/util" 34 ) 35 36 const ( 37 apiAvailabilityMeasurementName = "APIAvailability" 38 ) 39 40 func init() { 41 if err := measurement.Register(apiAvailabilityMeasurementName, createAPIAvailabilityMeasurement); err != nil { 42 klog.Fatalf("Cannot register %s: %v", apiAvailabilityMeasurementName, err) 43 } 44 } 45 46 func createAPIAvailabilityMeasurement() measurement.Measurement { 47 return &apiAvailabilityMeasurement{} 48 } 49 50 type apiAvailabilityMeasurement struct { 51 isRunning bool 52 isPaused bool 53 pauseCh chan struct{} 54 unpauseCh chan struct{} 55 stopCh chan struct{} 56 pollFrequency time.Duration 57 hostIPs []string 58 summaries []measurement.Summary 59 clusterLevelMetrics *apiAvailabilityMetrics 60 threshold float64 61 // Should we check public IPs of the host VMs 62 useHostPublicIPs bool 63 // Should we check internal IPs of the host VMs 64 useHostInternalIPs bool 65 // Metrics per host internal IP. 66 hostLevelMetrics map[string]*apiAvailabilityMetrics 67 hostPollTimeoutSeconds int 68 hostPollExecTimeoutSeconds int 69 wg sync.WaitGroup 70 lock sync.Mutex 71 } 72 73 func (a *apiAvailabilityMeasurement) updateHostAvailabilityMetrics(c clientset.Interface, provider provider.Provider) { 74 wg := sync.WaitGroup{} 75 wg.Add(len(a.hostIPs)) 76 mu := sync.Mutex{} 77 for _, ip := range a.hostIPs { 78 ip := ip 79 go func() { 80 defer wg.Done() 81 statusCode, err := a.pollHost(ip) 82 availability := statusCode == strconv.Itoa(http.StatusOK) 83 if err != nil { 84 klog.Warningf("execservice issue: %s", err.Error()) 85 } 86 if !availability { 87 klog.Warningf("host %s not available; HTTP status code: %s", ip, statusCode) 88 } 89 mu.Lock() 90 defer mu.Unlock() 91 a.hostLevelMetrics[ip].update(availability) 92 }() 93 } 94 wg.Wait() 95 } 96 97 func (a *apiAvailabilityMeasurement) pollHost(hostIP string) (string, error) { 98 pod, err := execservice.GetPod() 99 if err != nil { 100 return "", fmt.Errorf("problem with GetPod(): %w", err) 101 } 102 cmd := fmt.Sprintf("curl --connect-timeout %d -s -k -w \"%%{http_code}\" -o /dev/null https://%s:443/readyz", a.hostPollTimeoutSeconds, hostIP) 103 ctx, cancel := context.WithTimeout(context.Background(), time.Duration(a.hostPollExecTimeoutSeconds)*time.Second) 104 defer cancel() 105 output, err := execservice.RunCommand(ctx, pod, cmd) 106 if err != nil { 107 return "", fmt.Errorf("problem with RunCommand(): output=%q, err=%w", output, err) 108 } 109 return output, nil 110 } 111 112 func (a *apiAvailabilityMeasurement) updateClusterAvailabilityMetrics(c clientset.Interface) { 113 result := c.CoreV1().RESTClient().Get().AbsPath("/readyz").Do(context.Background()) 114 status := 0 115 result.StatusCode(&status) 116 availability := status == http.StatusOK 117 if !availability { 118 klog.Warningf("cluster not available; HTTP status code: %d", status) 119 } 120 a.clusterLevelMetrics.update(availability) 121 } 122 123 func (a *apiAvailabilityMeasurement) Execute(config *measurement.Config) ([]measurement.Summary, error) { 124 action, err := util.GetString(config.Params, "action") 125 if err != nil { 126 return nil, err 127 } 128 129 a.lock.Lock() 130 defer a.lock.Unlock() 131 132 switch action { 133 case "start": 134 return nil, a.start(config) 135 case "pause": 136 a.pause() 137 return nil, nil 138 case "unpause": 139 a.unpause() 140 return nil, nil 141 case "gather": 142 return a.gather() 143 default: 144 return nil, fmt.Errorf("unknown action %v", action) 145 } 146 } 147 148 func (a *apiAvailabilityMeasurement) start(config *measurement.Config) error { 149 if a.isRunning { 150 klog.V(2).Infof("%s: measurement already running", a) 151 return nil 152 } 153 if err := a.initFields(config); err != nil { 154 return err 155 } 156 k8sClient := config.ClusterFramework.GetClientSets().GetClient() 157 provider := config.ClusterFramework.GetClusterConfig().Provider 158 a.wg.Add(1) 159 160 go func() { 161 defer a.wg.Done() 162 for { 163 if a.isPaused { 164 select { 165 case <-a.unpauseCh: 166 a.isPaused = false 167 case <-a.stopCh: 168 return 169 } 170 } 171 select { 172 case <-a.pauseCh: 173 a.isPaused = true 174 case <-time.After(a.pollFrequency): 175 a.updateClusterAvailabilityMetrics(k8sClient) 176 if a.hostLevelAvailabilityEnabled() { 177 a.updateHostAvailabilityMetrics(k8sClient, provider) 178 } 179 case <-a.stopCh: 180 return 181 } 182 } 183 }() 184 return nil 185 } 186 187 func (a *apiAvailabilityMeasurement) initFields(config *measurement.Config) (err error) { 188 a.isRunning = true 189 a.stopCh = make(chan struct{}) 190 a.pauseCh = make(chan struct{}) 191 a.unpauseCh = make(chan struct{}) 192 frequency, err := util.GetDuration(config.Params, "pollFrequency") 193 if err != nil { 194 return err 195 } 196 a.pollFrequency = frequency 197 198 threshold, err := util.GetFloat64OrDefault(config.Params, "threshold", 0.0) 199 if err != nil { 200 return err 201 } 202 a.threshold = threshold 203 204 a.clusterLevelMetrics = &apiAvailabilityMetrics{} 205 206 a.useHostInternalIPs, err = util.GetBoolOrDefault(config.Params, "useHostInternalIPs", config.ClusterLoaderConfig.ExecServiceConfig.Enable && len(config.ClusterFramework.GetClusterConfig().MasterInternalIPs) != 0) 207 if err != nil { 208 return err 209 } 210 a.useHostPublicIPs, err = util.GetBoolOrDefault(config.Params, "useHostPublicIPs", false) 211 if err != nil { 212 return err 213 } 214 if a.useHostInternalIPs || a.useHostPublicIPs { 215 err = a.addHostIPs(config) 216 if err != nil { 217 return err 218 } 219 } 220 return nil 221 } 222 223 func (a *apiAvailabilityMeasurement) addHostIPs(config *measurement.Config) error { 224 if config.ClusterLoaderConfig.ExecServiceConfig.Enable { 225 if a.useHostInternalIPs { 226 if len(config.ClusterFramework.GetClusterConfig().MasterInternalIPs) == 0 { 227 return fmt.Errorf("%s: host internal IP(s) are not provided, cannot measure availability through internal IPs", a) 228 } 229 a.hostIPs = config.ClusterFramework.GetClusterConfig().MasterInternalIPs 230 } 231 if a.useHostPublicIPs { 232 if len(config.ClusterFramework.GetClusterConfig().MasterIPs) == 0 { 233 return fmt.Errorf("%s: host public IP(s) are not provided, cannot measure availability through public IPs", a) 234 } 235 a.hostIPs = append(a.hostIPs, config.ClusterFramework.GetClusterConfig().MasterIPs...) 236 } 237 a.hostLevelMetrics = map[string]*apiAvailabilityMetrics{} 238 for _, ip := range a.hostIPs { 239 a.hostLevelMetrics[ip] = &apiAvailabilityMetrics{} 240 } 241 hostPollTimeoutSeconds, err := util.GetIntOrDefault(config.Params, "hostPollTimeoutSeconds", 5) 242 if err != nil { 243 return err 244 } 245 a.hostPollTimeoutSeconds = hostPollTimeoutSeconds 246 hostPollExecTimeoutSeconds, err := util.GetIntOrDefault(config.Params, "hostPollExecTimeoutSeconds", 10) 247 if err != nil { 248 return err 249 } 250 a.hostPollExecTimeoutSeconds = hostPollExecTimeoutSeconds 251 } else { 252 return fmt.Errorf("%s: exec service is not enabled, cannot measure availability through host IPs", a) 253 } 254 return nil 255 } 256 257 func (a *apiAvailabilityMeasurement) hostLevelAvailabilityEnabled() bool { 258 return len(a.hostLevelMetrics) > 0 259 } 260 261 func (a *apiAvailabilityMeasurement) pause() { 262 if !a.isRunning { 263 klog.V(2).Infof("%s: measurement is not running", a) 264 return 265 } 266 if a.isPaused { 267 klog.Warningf("%s: measurement already paused", a) 268 return 269 } 270 a.pauseCh <- struct{}{} 271 klog.V(2).Infof("%s: pausing the measurement (stopping checking the availability)", a) 272 } 273 274 func (a *apiAvailabilityMeasurement) unpause() { 275 if !a.isRunning { 276 klog.V(2).Infof("%s: measurement is not running", a) 277 return 278 } 279 if !a.isPaused { 280 klog.Warningf("%s: measurement already unpaused", a) 281 return 282 } 283 a.unpauseCh <- struct{}{} 284 klog.V(2).Infof("%s: unpausing the measurement", a) 285 } 286 287 func (a *apiAvailabilityMeasurement) gather() ([]measurement.Summary, error) { 288 if !a.isRunning { 289 return nil, nil 290 } 291 close(a.stopCh) 292 a.wg.Wait() 293 a.isRunning = false 294 klog.V(2).Infof("%s: gathering summaries", apiAvailabilityMeasurementName) 295 296 output := apiAvailabilityOutput{ 297 ClusterSummary: createClusterSummary(a.clusterLevelMetrics, a.pollFrequency), 298 } 299 if a.hostLevelAvailabilityEnabled() { 300 output.HostSummaries = createHostSummary(a.hostLevelMetrics, a.hostIPs, a.pollFrequency) 301 } 302 303 content, err := util.PrettyPrintJSON(output) 304 if err != nil { 305 return nil, err 306 } 307 summary := measurement.CreateSummary(apiAvailabilityMeasurementName, "json", content) 308 a.summaries = append(a.summaries, summary) 309 if sli := output.ClusterSummary.AvailabilityPercentage; sli < a.threshold { 310 err = errors.NewMetricViolationError("API availability", fmt.Sprintf("SLO not fulfilled (expected >= %.2f, got: %.2f)", a.threshold, sli)) 311 } 312 return a.summaries, err 313 } 314 315 func (a *apiAvailabilityMeasurement) Dispose() {} 316 317 func (a *apiAvailabilityMeasurement) String() string { 318 return apiAvailabilityMeasurementName 319 }