agones.dev/agones@v1.54.0/pkg/fleetautoscalers/fleetautoscalers.go (about) 1 /* 2 * Copyright 2018 Google LLC All Rights Reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package fleetautoscalers 18 19 import ( 20 "context" 21 "crypto/tls" 22 "crypto/x509" 23 "encoding/json" 24 "fmt" 25 "io" 26 "math" 27 "net/http" 28 "net/url" 29 "strings" 30 "time" 31 32 extism "github.com/extism/go-sdk" 33 "github.com/pkg/errors" 34 "github.com/robfig/cron/v3" 35 corev1 "k8s.io/api/core/v1" 36 "k8s.io/apimachinery/pkg/util/intstr" 37 "k8s.io/apimachinery/pkg/util/uuid" 38 39 agonesv1 "agones.dev/agones/pkg/apis/agones/v1" 40 autoscalingv1 "agones.dev/agones/pkg/apis/autoscaling/v1" 41 listeragonesv1 "agones.dev/agones/pkg/client/listers/agones/v1" 42 "agones.dev/agones/pkg/fleets" 43 "agones.dev/agones/pkg/gameservers" 44 gssets "agones.dev/agones/pkg/gameserversets" 45 "agones.dev/agones/pkg/util/runtime" 46 ) 47 48 const ( 49 maxDuration = "2540400h" // 290 Years 50 ) 51 52 // InactiveScheduleError denotes an error for schedules that are not currently active. 53 type InactiveScheduleError struct{} 54 55 func (InactiveScheduleError) Error() string { 56 return "inactive schedule, policy not applicable" 57 } 58 59 // computeDesiredFleetSize computes the new desired size of the given fleet 60 func computeDesiredFleetSize(ctx context.Context, state *fasState, pol autoscalingv1.FleetAutoscalerPolicy, f *agonesv1.Fleet, 61 gameServerNamespacedLister listeragonesv1.GameServerNamespaceLister, nodeCounts map[string]gameservers.NodeCount, fasLog *FasLogger) (int32, bool, error) { 62 63 var ( 64 replicas int32 65 limited bool 66 err error 67 ) 68 69 switch pol.Type { 70 case autoscalingv1.BufferPolicyType: 71 replicas, limited, err = applyBufferPolicy(state, pol.Buffer, f, fasLog) 72 case autoscalingv1.WebhookPolicyType: 73 replicas, limited, err = applyWebhookPolicy(state, pol.Webhook, f, fasLog) 74 case autoscalingv1.CounterPolicyType: 75 replicas, limited, err = applyCounterOrListPolicyWrapper(state, pol.Counter, nil, f, gameServerNamespacedLister, nodeCounts, fasLog) 76 case autoscalingv1.ListPolicyType: 77 replicas, limited, err = applyCounterOrListPolicyWrapper(state, nil, pol.List, f, gameServerNamespacedLister, nodeCounts, fasLog) 78 case autoscalingv1.SchedulePolicyType: 79 replicas, limited, err = applySchedulePolicy(ctx, state, pol.Schedule, f, gameServerNamespacedLister, nodeCounts, time.Now(), fasLog) 80 case autoscalingv1.ChainPolicyType: 81 replicas, limited, err = applyChainPolicy(ctx, state, pol.Chain, f, gameServerNamespacedLister, nodeCounts, time.Now(), fasLog) 82 case autoscalingv1.WasmPolicyType: 83 replicas, limited, err = applyWasmPolicy(ctx, state, pol.Wasm, f, fasLog) 84 85 default: 86 err = errors.New("wrong policy type, should be one of: Buffer, Webhook, Counter, List, Schedule, Chain") 87 } 88 89 if err != nil && !errors.Is(err, InactiveScheduleError{}) { 90 loggerForFleetAutoscalerKey(fasLog.fas.ObjectMeta.Name, fasLog.baseLogger). 91 Debugf("Failed to apply policy type %q: %v", pol.Type, err) 92 } 93 94 return replicas, limited, err 95 } 96 97 func applyWasmPolicy(ctx context.Context, state *fasState, wp *autoscalingv1.WasmPolicy, f *agonesv1.Fleet, log *FasLogger) (int32, bool, error) { 98 if !runtime.FeatureEnabled(runtime.FeatureWasmAutoscaler) { 99 return 0, false, errors.Errorf("cannot apply WasmPolicy unless feature flag %s is enabled", runtime.FeatureWasmAutoscaler) 100 } 101 102 if wp == nil { 103 return 0, false, errors.New("wasmPolicy parameter must not be nil") 104 } 105 106 if f == nil { 107 return 0, false, errors.New("fleet parameter must not be nil") 108 } 109 110 if state.wasmPlugin == nil { 111 // Build URL from the WasmPolicy 112 u, err := buildURLFromConfiguration(state, wp.From.URL) 113 if err != nil { 114 return 0, false, err 115 } 116 117 if state.httpClient == nil { 118 return 0, false, errors.New("http client not set") 119 } 120 121 res, err := state.httpClient.Get(u.String()) 122 if err != nil { 123 return 0, false, errors.Wrapf(err, "failed to fetch Wasm module from %s", u.String()) 124 } 125 defer res.Body.Close() //nolint:errcheck 126 127 if res.StatusCode != http.StatusOK { 128 return 0, false, fmt.Errorf("bad status code %d from the server: %s", res.StatusCode, u.String()) 129 } 130 131 b, err := io.ReadAll(res.Body) 132 if err != nil { 133 return 0, false, errors.Wrapf(err, "failed to read Wasm module from %s", u.String()) 134 } 135 136 data := extism.WasmData{Data: b} 137 if len(wp.Hash) > 0 { 138 data.Hash = wp.Hash 139 } 140 manifest := extism.Manifest{ 141 Wasm: []extism.Wasm{ 142 data, 143 }, 144 Config: wp.Config, 145 } 146 147 config := extism.PluginConfig{ 148 EnableWasi: true, 149 } 150 plugin, err := extism.NewPlugin(ctx, manifest, config, []extism.HostFunction{}) 151 if err != nil { 152 return 0, false, errors.Wrapf(err, "failed to create Wasm plugin from %s", u.String()) 153 } 154 state.wasmPlugin = plugin // Store the plugin in the state map 155 } 156 157 // Create FleetAutoscaleReview 158 review := autoscalingv1.FleetAutoscaleReview{ 159 Request: &autoscalingv1.FleetAutoscaleRequest{ 160 UID: uuid.NewUUID(), 161 Name: f.Name, 162 Namespace: f.Namespace, 163 Status: f.Status, 164 }, 165 Response: nil, 166 } 167 168 if runtime.FeatureEnabled(runtime.FeatureFleetAutoscaleRequestMetaData) { 169 review.Request.Annotations = f.ObjectMeta.Annotations 170 review.Request.Labels = f.ObjectMeta.Labels 171 } 172 173 b, err := json.Marshal(review) 174 if err != nil { 175 return 0, false, errors.Wrap(err, "failed to marshal autoscaling request") 176 } 177 178 _, b, err = state.wasmPlugin.CallWithContext(ctx, wp.Function, b) 179 if err != nil { 180 return 0, false, errors.Wrapf(err, "failed to call Wasm plugin function %s", wp.Function) 181 } 182 183 if err := json.Unmarshal(b, &review); err != nil { 184 return 0, false, errors.Wrap(err, "failed to unmarshal autoscaling response") 185 } 186 187 loggerForFleetAutoscalerKey(log.fas.ObjectMeta.Name, log.baseLogger).Debugf( 188 "Fleet Autoscaler operation completed for fleet: %s, with was function: %s", f.ObjectMeta.Name, wp.Function) 189 190 if review.Response.Scale { 191 return review.Response.Replicas, false, nil 192 } 193 194 return f.Status.Replicas, false, nil 195 } 196 197 // buildURLFromConfiguration - build URL for Webhook and set CARoot for client Transport 198 func buildURLFromConfiguration(state *fasState, w *autoscalingv1.URLConfiguration) (u *url.URL, err error) { 199 if w.URL != nil && w.Service != nil { 200 return nil, errors.New("service and URL cannot be used simultaneously") 201 } 202 203 // if we haven't created the http state yet, let's create the http client, with appropriate tls configuration. 204 if state.httpClient == nil { 205 config := &tls.Config{} 206 state.httpClient = &http.Client{ 207 Timeout: 15 * time.Second, 208 Transport: &http.Transport{ 209 TLSClientConfig: config, 210 }, 211 } 212 213 if w.CABundle != nil { 214 if err := setCABundle(config, w.CABundle); err != nil { 215 return nil, err 216 } 217 } 218 } 219 220 scheme := "http" 221 if w.CABundle != nil { 222 scheme = "https" 223 } 224 225 if w.URL != nil { 226 if *w.URL == "" { 227 return nil, errors.New("URL was not provided") 228 } 229 230 return url.ParseRequestURI(*w.URL) 231 } 232 233 if w.Service == nil { 234 return nil, errors.New("service was not provided, either URL or Service must be provided") 235 } 236 237 if w.Service.Name == "" { 238 return nil, errors.New("service name was not provided") 239 } 240 241 if w.Service.Path == nil { 242 empty := "" 243 w.Service.Path = &empty 244 } 245 246 if w.Service.Namespace == "" { 247 w.Service.Namespace = "default" 248 } 249 250 return createURL(scheme, w.Service.Name, w.Service.Namespace, *w.Service.Path, w.Service.Port), nil 251 } 252 253 // moved to a separate method to cover it with unit tests and check that URL corresponds to a proper pattern 254 func createURL(scheme, name, namespace, path string, port *int32) *url.URL { 255 var hostPort int32 = 8000 256 if port != nil { 257 hostPort = *port 258 } 259 260 return &url.URL{ 261 Scheme: scheme, 262 Host: fmt.Sprintf("%s.%s.svc:%d", name, namespace, hostPort), 263 Path: path, 264 } 265 } 266 267 func setCABundle(tls *tls.Config, caBundle []byte) error { 268 rootCAs := x509.NewCertPool() 269 if ok := rootCAs.AppendCertsFromPEM(caBundle); !ok { 270 return errors.New("no certs were appended from caBundle") 271 } 272 tls.RootCAs = rootCAs 273 return nil 274 } 275 276 func applyWebhookPolicy(state *fasState, w *autoscalingv1.URLConfiguration, f *agonesv1.Fleet, fasLog *FasLogger) (replicas int32, limited bool, err error) { 277 if w == nil { 278 return 0, false, errors.New("webhookPolicy parameter must not be nil") 279 } 280 281 if f == nil { 282 return 0, false, errors.New("fleet parameter must not be nil") 283 } 284 285 u, err := buildURLFromConfiguration(state, w) 286 if err != nil { 287 return 0, false, err 288 } 289 if state.httpClient == nil { 290 return 0, false, errors.New("http client not set") 291 } 292 293 faReq := autoscalingv1.FleetAutoscaleReview{ 294 Request: &autoscalingv1.FleetAutoscaleRequest{ 295 UID: uuid.NewUUID(), 296 Name: f.Name, 297 Namespace: f.Namespace, 298 Status: f.Status, 299 }, 300 Response: nil, 301 } 302 303 if runtime.FeatureEnabled(runtime.FeatureFleetAutoscaleRequestMetaData) { 304 faReq.Request.Annotations = f.ObjectMeta.Annotations 305 faReq.Request.Labels = f.ObjectMeta.Labels 306 } 307 308 b, err := json.Marshal(faReq) 309 if err != nil { 310 return 0, false, err 311 } 312 313 res, err := state.httpClient.Post( 314 u.String(), 315 "application/json", 316 strings.NewReader(string(b)), 317 ) 318 if err != nil { 319 return 0, false, err 320 } 321 defer func() { 322 if cerr := res.Body.Close(); cerr != nil { 323 if err != nil { 324 err = errors.Wrap(err, cerr.Error()) 325 } else { 326 err = cerr 327 } 328 } 329 }() 330 331 if res.StatusCode != http.StatusOK { 332 return 0, false, fmt.Errorf("bad status code %d from the server: %s", res.StatusCode, u.String()) 333 } 334 result, err := io.ReadAll(res.Body) 335 if err != nil { 336 return 0, false, err 337 } 338 339 var faResp autoscalingv1.FleetAutoscaleReview 340 err = json.Unmarshal(result, &faResp) 341 if err != nil { 342 return 0, false, err 343 } 344 345 // Log Fleet Autoscaler operation, handling nil or empty Name in one line 346 webhookPolicyName := "<nil>" 347 if w.Service != nil && w.Service.Name != "" { 348 webhookPolicyName = w.Service.Name 349 } 350 loggerForFleetAutoscalerKey(fasLog.fas.ObjectMeta.Name, fasLog.baseLogger).Debugf( 351 "Fleet Autoscaler operation completed for fleet: %s, with WebhookPolicy: %s", f.ObjectMeta.Name, webhookPolicyName) 352 353 if faResp.Response.Scale { 354 return faResp.Response.Replicas, false, nil 355 } 356 357 return f.Status.Replicas, false, nil 358 } 359 360 func applyBufferPolicy(_ *fasState, b *autoscalingv1.BufferPolicy, f *agonesv1.Fleet, fasLog *FasLogger) (int32, bool, error) { 361 var replicas int32 362 363 if b.BufferSize.Type == intstr.Int { 364 replicas = f.Status.AllocatedReplicas + int32(b.BufferSize.IntValue()) 365 } else { 366 // the percentage value is a little more complex, as we can't apply 367 // the desired percentage to any current value, but to the future one 368 // Example: we have 8 allocated replicas, 10 total replicas and bufferSize set to 30%. 369 // 30% means that we must have 30% ready instances in the fleet 370 // Right now there are 20%, so we must increase the fleet until we reach 30% 371 // To compute the new size, we start from the other end: if ready must be 30% 372 // it means that allocated must be 70% and adjust the fleet size to make that true. 373 bufferPercent, err := intstr.GetValueFromIntOrPercent(&b.BufferSize, 100, true) 374 if err != nil { 375 return 0, false, err 376 } 377 // use Math.Ceil to round the result up 378 replicas = int32(math.Ceil(float64(f.Status.AllocatedReplicas*100) / float64(100-bufferPercent))) 379 } 380 381 scalingInLimited := false 382 scalingOutLimited := false 383 384 if replicas < b.MinReplicas { 385 replicas = b.MinReplicas 386 scalingInLimited = true 387 } 388 if replicas > b.MaxReplicas { 389 replicas = b.MaxReplicas 390 scalingOutLimited = true 391 } 392 393 loggerForFleetAutoscalerKey(fasLog.fas.ObjectMeta.Name, fasLog.baseLogger).Debugf( 394 "Fleet Autoscaler operation completed for fleet: %s, with BufferPolicy: %v", f.ObjectMeta.Name, b.BufferSize) 395 396 return replicas, scalingInLimited || scalingOutLimited, nil 397 } 398 399 // New function to call applyCounterOrListPolicy 400 func applyCounterOrListPolicyWrapper(_ *fasState, c *autoscalingv1.CounterPolicy, l *autoscalingv1.ListPolicy, 401 f *agonesv1.Fleet, gameServerNamespacedLister listeragonesv1.GameServerNamespaceLister, 402 nodeCounts map[string]gameservers.NodeCount, fasLog *FasLogger) (int32, bool, error) { 403 404 // Call applyCounterOrListPolicy inside the wrapper 405 desiredReplicas, scalingLimited, err := applyCounterOrListPolicy(c, l, f, gameServerNamespacedLister, nodeCounts) 406 407 if err == nil { 408 // Log directly based on which policy is used, with a description of the key 409 if c != nil { 410 // Log the Key from CounterPolicy with a description 411 loggerForFleetAutoscalerKey(fasLog.fas.ObjectMeta.Name, fasLog.baseLogger).Debugf( 412 "Fleet Autoscaler operation completed for fleet: %s, with CounterPolicy - Key: %v", f.ObjectMeta.Name, c.Key) 413 } else if l != nil { 414 // Log the Key from ListPolicy with a description 415 loggerForFleetAutoscalerKey(fasLog.fas.ObjectMeta.Name, fasLog.baseLogger).Debugf( 416 "Fleet Autoscaler operation completed for fleet: %s, with ListPolicy - Key: %v", f.ObjectMeta.Name, l.Key) 417 } 418 } 419 420 return desiredReplicas, scalingLimited, err 421 } 422 423 func applyCounterOrListPolicy(c *autoscalingv1.CounterPolicy, l *autoscalingv1.ListPolicy, 424 f *agonesv1.Fleet, gameServerNamespacedLister listeragonesv1.GameServerNamespaceLister, 425 nodeCounts map[string]gameservers.NodeCount) (int32, bool, error) { 426 427 if !runtime.FeatureEnabled(runtime.FeatureCountsAndLists) { 428 return 0, false, errors.Errorf("cannot apply CounterPolicy unless feature flag %s is enabled", runtime.FeatureCountsAndLists) 429 } 430 431 var isCounter bool // True if a CounterPolicy False if a ListPolicy 432 var key string // The specified Counter or List 433 var count int64 // The Count or number of Values in the template Game Server 434 var capacity int64 // The Capacity in the template Game Server 435 var aggCount int64 // The Aggregate Count of the specified Counter or List of all GameServers across the GameServerSet in the Fleet 436 var aggCapacity int64 // The Aggregate Capacity of the specified Counter or List of all GameServers across the GameServerSet in the Fleet 437 var aggAllocatedCount int64 // The Aggregate Count of the specified Counter or List of GameServers in an Allocated state across the GameServerSet in the Fleet 438 var minCapacity int64 // The Minimum Aggregate Capacity 439 var maxCapacity int64 // The Maximum Aggregate Capacity 440 var bufferSize intstr.IntOrString 441 442 if c != nil { 443 isCounter = true 444 counter, ok := f.Spec.Template.Spec.Counters[c.Key] 445 if !ok { 446 return 0, false, errors.Errorf("cannot apply CounterPolicy as Counter key %s does not exist in the Fleet Spec", c.Key) 447 } 448 449 aggCounter, ok := f.Status.Counters[c.Key] 450 if !ok { 451 return 0, false, errors.Errorf("cannot apply CounterPolicy as Counter key %s does not exist in the Fleet Status", c.Key) 452 } 453 454 key = c.Key 455 count = counter.Count 456 capacity = counter.Capacity 457 aggCount = aggCounter.Count 458 aggCapacity = aggCounter.Capacity 459 aggAllocatedCount = aggCounter.AllocatedCount 460 minCapacity = c.MinCapacity 461 maxCapacity = c.MaxCapacity 462 bufferSize = c.BufferSize 463 464 } else { 465 isCounter = false 466 list, ok := f.Spec.Template.Spec.Lists[l.Key] 467 if !ok { 468 return 0, false, errors.Errorf("cannot apply ListPolicy as List key %s does not exist in the Fleet Spec", l.Key) 469 } 470 471 aggList, ok := f.Status.Lists[l.Key] 472 if !ok { 473 return 0, false, errors.Errorf("cannot apply ListPolicy as List key %s does not exist in the Fleet Status", l.Key) 474 } 475 476 key = l.Key 477 count = int64(len(list.Values)) 478 capacity = list.Capacity 479 aggCount = aggList.Count 480 aggCapacity = aggList.Capacity 481 aggAllocatedCount = aggList.AllocatedCount 482 minCapacity = l.MinCapacity 483 maxCapacity = l.MaxCapacity 484 bufferSize = l.BufferSize 485 } 486 487 // Checks if we've limited by TOTAL capacity 488 limited, scale := isLimited(aggCapacity, minCapacity, maxCapacity) 489 490 // Total current number of Replicas 491 replicas := f.Status.Replicas 492 493 // The buffer is the desired available capacity 494 var buffer int64 495 496 switch { 497 // Desired replicas based on BufferSize specified as an absolute value (i.e. 5) 498 case bufferSize.Type == intstr.Int: 499 buffer = int64(bufferSize.IntValue()) 500 // Desired replicas based on BufferSize specified as a percent (i.e. 5%) 501 case bufferSize.Type == intstr.String: 502 bufferPercent, err := intstr.GetValueFromIntOrPercent(&bufferSize, 100, isCounter) 503 if err != nil { 504 return 0, false, err 505 } 506 // If the Aggregated Allocated Counts is 0 then desired capacity gets calculated as 0. If the 507 // capacity of 1 replica is equal to or greater than minimum capacity we can exit early. 508 if aggAllocatedCount <= 0 && capacity >= minCapacity { 509 return 1, true, nil 510 } 511 512 // The desired TOTAL capacity based on the Aggregated Allocated Counts (see applyBufferPolicy for explanation) 513 desiredCapacity := int64(math.Ceil(float64(aggAllocatedCount*100) / float64(100-bufferPercent))) 514 // Convert into a desired AVAILABLE capacity aka the buffer 515 buffer = desiredCapacity - aggAllocatedCount 516 } 517 518 // Current available capacity across the TOTAL fleet 519 switch availableCapacity := aggCapacity - aggCount; { 520 case availableCapacity == buffer: 521 if limited { 522 return scaleLimited(scale, f, gameServerNamespacedLister, nodeCounts, key, isCounter, replicas, 523 capacity, aggCapacity, minCapacity, maxCapacity) 524 } 525 return replicas, false, nil 526 case availableCapacity < buffer: // Scale Up 527 if limited { // Case where we want to scale up, but we're already limited by MaxCapacity. 528 return scaleLimited(scale, f, gameServerNamespacedLister, nodeCounts, key, isCounter, replicas, 529 capacity, aggCapacity, minCapacity, maxCapacity) 530 } 531 return scaleUp(replicas, capacity, count, aggCapacity, availableCapacity, maxCapacity, 532 minCapacity, buffer) 533 case availableCapacity > buffer: // Scale Down 534 if limited && scale == 1 { // Case where we want to scale down but we're already limited by MinCapacity 535 return scaleLimited(scale, f, gameServerNamespacedLister, nodeCounts, key, isCounter, replicas, 536 capacity, aggCapacity, minCapacity, maxCapacity) 537 } 538 return scaleDown(f, gameServerNamespacedLister, nodeCounts, key, isCounter, replicas, aggCount, 539 aggCapacity, minCapacity, buffer) 540 } 541 542 if isCounter { 543 return 0, false, errors.Errorf("unable to apply CounterPolicy %v", c) 544 } 545 return 0, false, errors.Errorf("unable to apply ListPolicy %v", l) 546 } 547 548 func applySchedulePolicy(ctx context.Context, state *fasState, s *autoscalingv1.SchedulePolicy, f *agonesv1.Fleet, gameServerNamespacedLister listeragonesv1.GameServerNamespaceLister, nodeCounts map[string]gameservers.NodeCount, currentTime time.Time, fasLog *FasLogger) (int32, bool, error) { 549 // Ensure the scheduled autoscaler feature gate is enabled 550 if !runtime.FeatureEnabled(runtime.FeatureScheduledAutoscaler) { 551 return 0, false, errors.Errorf("cannot apply SchedulePolicy unless feature flag %s is enabled", runtime.FeatureScheduledAutoscaler) 552 } 553 554 if isScheduleActive(s, currentTime) { 555 return computeDesiredFleetSize(ctx, state, s.Policy, f, gameServerNamespacedLister, nodeCounts, fasLog) 556 } 557 558 loggerForFleetAutoscalerKey(fasLog.fas.ObjectMeta.Name, fasLog.baseLogger).Debugf( 559 "Fleet autoscaler check: Schedule not active for fleet %s", f.ObjectMeta.Name) 560 561 // If the schedule wasn't active then return the current replica amount of the fleet 562 return f.Status.Replicas, false, &InactiveScheduleError{} 563 } 564 565 func applyChainPolicy(ctx context.Context, state *fasState, c autoscalingv1.ChainPolicy, f *agonesv1.Fleet, gameServerNamespacedLister listeragonesv1.GameServerNamespaceLister, nodeCounts map[string]gameservers.NodeCount, currentTime time.Time, fasLog *FasLogger) (int32, bool, error) { 566 // Ensure the scheduled autoscaler feature gate is enabled 567 if !runtime.FeatureEnabled(runtime.FeatureScheduledAutoscaler) { 568 return 0, false, errors.Errorf("cannot apply ChainPolicy unless feature flag %s is enabled", runtime.FeatureScheduledAutoscaler) 569 } 570 571 replicas := f.Status.Replicas 572 var limited bool 573 var err error 574 var chainEntry autoscalingv1.FleetAutoscalerPolicyType 575 576 // Loop over all entries in the chain 577 for _, entry := range c { 578 switch entry.Type { 579 case autoscalingv1.SchedulePolicyType: 580 replicas, limited, err = applySchedulePolicy(ctx, state, entry.Schedule, f, gameServerNamespacedLister, nodeCounts, currentTime, fasLog) 581 582 if err != nil { 583 loggerForFleetAutoscalerKey(fasLog.fas.ObjectMeta.Name, fasLog.baseLogger).Debugf( 584 "Failed to apply SchedulePolicy ID=%s in ChainPolicy: %v", entry.ID, err) 585 } 586 case autoscalingv1.WebhookPolicyType: 587 replicas, limited, err = applyWebhookPolicy(state, entry.Webhook, f, fasLog) 588 589 if err != nil { 590 loggerForFleetAutoscalerKey(fasLog.fas.ObjectMeta.Name, fasLog.baseLogger).Debugf( 591 "Failed to apply WebhookPolicy ID=%s in ChainPolicy: %v", entry.ID, err) 592 } 593 default: 594 // Every other policy type we just want to compute the desired fleet and return it 595 replicas, limited, err = computeDesiredFleetSize(ctx, state, entry.FleetAutoscalerPolicy, f, gameServerNamespacedLister, nodeCounts, fasLog) 596 597 if err != nil && !errors.Is(err, InactiveScheduleError{}) { 598 loggerForFleetAutoscalerKey(fasLog.fas.ObjectMeta.Name, fasLog.baseLogger).Debugf( 599 "Failed to apply %s ID=%s in ChainPolicy: %v", entry.Type, entry.ID, err) 600 } 601 } 602 603 // If no error occurred, exit the loop early 604 if err == nil { 605 chainEntry = autoscalingv1.FleetAutoscalerPolicyType(fmt.Sprintf("%s:%s:%s", autoscalingv1.ChainPolicyType, entry.ID, entry.Type)) 606 break 607 } 608 } 609 610 if err != nil && !errors.Is(err, InactiveScheduleError{}) { 611 emitChainPolicyEvent(fasLog, "Unknown", "") 612 loggerForFleetAutoscalerKey(fasLog.fas.ObjectMeta.Name, fasLog.baseLogger).Debug("Failed to apply ChainPolicy: no valid policy applied") 613 return replicas, limited, err 614 } 615 616 currChainEntry := strings.Split(string(chainEntry), ":") 617 618 // Handle the final state of the chain and update status if necessary 619 if lastAppliedPolicy := fasLog.fas.Status.LastAppliedPolicy; strings.Contains(string(lastAppliedPolicy), string(autoscalingv1.ChainPolicyType)) { 620 prevChainEntry := strings.Split(string(lastAppliedPolicy), ":") 621 622 // Only log if there is a change in the policy 623 if len(prevChainEntry) > 2 && (currChainEntry[1] != prevChainEntry[1] || currChainEntry[2] != prevChainEntry[2]) { 624 fasLog.currChainEntry = &chainEntry 625 emitChainPolicyEvent(fasLog, currChainEntry[1], currChainEntry[2]) 626 } 627 } else { 628 fasLog.currChainEntry = &chainEntry 629 emitChainPolicyEvent(fasLog, currChainEntry[1], currChainEntry[2]) 630 } 631 632 return replicas, limited, nil 633 } 634 635 // isScheduleActive checks if a chain entry's is active and returns a boolean, true if active, false otherwise 636 func isScheduleActive(s *autoscalingv1.SchedulePolicy, currentTime time.Time) bool { 637 // Used for checking ahead of the schedule for daylight savings purposes 638 cronDelta := (time.Minute * -1) + (time.Second * -30) 639 640 // If the current time is before the start time, the schedule is inactive so return false 641 startTime := s.Between.Start.Time 642 if currentTime.Before(startTime) { 643 return false 644 } 645 646 // If an end time is present and the current time is after the end time, the schedule is inactive so return false 647 endTime := s.Between.End.Time 648 if !endTime.IsZero() && currentTime.After(endTime) { 649 return false 650 } 651 652 // If no startCron field is specified, then it's automatically true (duration is no longer relevant since we're always running) 653 if s.ActivePeriod.StartCron == "" { 654 return true 655 } 656 657 // Ignore the error as validation is already done within the validateChainPolicy after being unmarshalled 658 location, _ := time.LoadLocation(s.ActivePeriod.Timezone) 659 660 // Ignore the error as validation is already done within the validateChainPolicy after being unmarshalled 661 startCron, _ := cron.ParseStandard(s.ActivePeriod.StartCron) 662 663 // Ignore the error as validation is already done within the validateChainPolicy after being unmarshalled. 664 // If the duration is empty set it to the largest duration possible (290 years) 665 duration, _ := time.ParseDuration(s.ActivePeriod.Duration) 666 if s.ActivePeriod.Duration == "" { 667 duration, _ = time.ParseDuration(maxDuration) 668 } 669 670 // Get the current time - duration 671 currentTimeMinusDuration := currentTime.Add(duration * -1) 672 // Take (current time - duration) to get the first available start time 673 cronStartTime := startCron.Next(currentTimeMinusDuration.In(location)) 674 // Take the (cronStartTime + duration) to get the end time 675 cronEndTime := cronStartTime.Add(duration) 676 677 // If the current time is after the cronStartTime - 90 seconds (for daylight saving purposes) AND the current time before the cronEndTime 678 // then return true 679 // Example: startCron = 0 14 * * * // 2:00 PM Everyday | duration = 1 hr | cronDelta = 90 seconds | currentTime = 2024-08-01T14:30:00Z | currentTimeMinusDuration = 2024-08-01T13:30:00Z 680 // then cronStartTime = 2024-08-01T14:00:00Z and cronEndTime = 2024-08-01T15:00:00Z 681 // and since currentTime > cronStartTime + cronDelta AND currentTime < cronEndTime, we return true 682 if currentTime.After(cronStartTime.Add(cronDelta)) && currentTime.Before(cronEndTime) { 683 return true 684 } 685 686 return false 687 } 688 689 // getSortedGameServers returns the list of Game Servers for the Fleet in the order in which the 690 // Game Servers would be deleted. 691 func getSortedGameServers(f *agonesv1.Fleet, gameServerNamespacedLister listeragonesv1.GameServerNamespaceLister, 692 nodeCounts map[string]gameservers.NodeCount) ([]*agonesv1.GameServer, error) { 693 gsList, err := fleets.ListGameServersByFleetOwner(gameServerNamespacedLister, f) 694 if err != nil { 695 return nil, err 696 } 697 698 gameServers := gssets.SortGameServersByStrategy(f.Spec.Scheduling, gsList, nodeCounts, f.Spec.Priorities) 699 return gameServers, nil 700 } 701 702 // isLimited indicates that the calculated scale would be above or below the range defined by 703 // MinCapacity and MaxCapacity in the ListPolicy or CounterPolicy. 704 // Return 1 if the fleet needs to scale up, -1 if the fleets need to scale down, 0 if the fleet does 705 // not need to scale, or if the fleet is not limited. 706 func isLimited(aggCapacity, minCapacity, maxCapacity int64) (bool, int) { 707 if aggCapacity < minCapacity { // Scale up 708 return true, 1 709 } 710 if aggCapacity > maxCapacity { // Scale down 711 return true, -1 712 } 713 return false, 0 714 } 715 716 // scaleUpLimited scales up the fleet to meet the MinCapacity 717 func scaleUpLimited(replicas int32, capacity, aggCapacity, minCapacity int64) (int32, bool, error) { 718 if capacity == 0 { 719 return 0, false, errors.Errorf("cannot scale up as Capacity is equal to 0") 720 } 721 for aggCapacity < minCapacity { 722 aggCapacity += capacity 723 replicas++ 724 } 725 return replicas, true, nil 726 } 727 728 // scaleDownLimited scales down the fleet to meet the MaxCapacity 729 func scaleDownLimited(f *agonesv1.Fleet, gameServerNamespacedLister listeragonesv1.GameServerNamespaceLister, 730 nodeCounts map[string]gameservers.NodeCount, key string, isCounter bool, replicas int32, 731 aggCapacity, maxCapacity int64) (int32, bool, error) { 732 // Game Servers in order of deletion on scale down 733 gameServers, err := getSortedGameServers(f, gameServerNamespacedLister, nodeCounts) 734 if err != nil { 735 return 0, false, err 736 } 737 for _, gs := range gameServers { 738 if aggCapacity <= maxCapacity { 739 break 740 } 741 switch isCounter { 742 case true: 743 if counter, ok := gs.Status.Counters[key]; ok { 744 aggCapacity -= counter.Capacity 745 } 746 case false: 747 if list, ok := gs.Status.Lists[key]; ok { 748 aggCapacity -= list.Capacity 749 } 750 } 751 replicas-- 752 } 753 754 // We are not currently able to scale down to zero replicas, so one replica is the minimum allowed 755 if replicas < 1 { 756 replicas = 1 757 } 758 759 return replicas, true, nil 760 } 761 762 func scaleLimited(scale int, f *agonesv1.Fleet, gameServerNamespacedLister listeragonesv1.GameServerNamespaceLister, 763 nodeCounts map[string]gameservers.NodeCount, key string, isCounter bool, replicas int32, 764 capacity, aggCapacity, minCapacity, maxCapacity int64) (int32, bool, error) { 765 766 switch scale { 767 case 1: // scale up 768 return scaleUpLimited(replicas, capacity, aggCapacity, minCapacity) 769 case -1: // scale down 770 return scaleDownLimited(f, gameServerNamespacedLister, nodeCounts, key, isCounter, replicas, 771 aggCapacity, maxCapacity) 772 case 0: 773 return replicas, false, nil 774 } 775 776 return 0, false, errors.Errorf("cannot scale due to error in scaleLimited function") 777 } 778 779 // scaleUp scales up for either Integer or Percentage Buffer. 780 func scaleUp(replicas int32, capacity, count, aggCapacity, availableCapacity, maxCapacity, 781 minCapacity, buffer int64) (int32, bool, error) { 782 783 // How much capacity is gained by adding one more replica to the fleet. 784 replicaCapacity := capacity - count 785 if replicaCapacity <= 0 { 786 return 0, false, errors.Errorf("cannot scale up as adding additional replicas does not increase available Capacity") 787 } 788 789 additionalReplicas := int32(math.Ceil((float64(buffer) - float64(availableCapacity)) / float64(replicaCapacity))) 790 791 // Check to make sure we're not limited (over Max Capacity) 792 limited, _ := isLimited(aggCapacity+(int64(additionalReplicas)*capacity), minCapacity, maxCapacity) 793 if limited { 794 additionalReplicas = int32((maxCapacity - aggCapacity) / capacity) 795 } 796 797 return replicas + additionalReplicas, limited, nil 798 } 799 800 // scaleDown scales down for either Integer or Percentage Buffer. 801 func scaleDown(f *agonesv1.Fleet, gameServerNamespacedLister listeragonesv1.GameServerNamespaceLister, 802 nodeCounts map[string]gameservers.NodeCount, key string, isCounter bool, replicas int32, 803 aggCount, aggCapacity, minCapacity, buffer int64) (int32, bool, error) { 804 // Exit early if we're already at MinCapacity to avoid calling getSortedGameServers if unnecessary 805 if aggCapacity == minCapacity { 806 return replicas, true, nil 807 } 808 809 // We first need to get the individual game servers in order of deletion on scale down, as any 810 // game server may have a unique value for counts and / or capacity. 811 gameServers, err := getSortedGameServers(f, gameServerNamespacedLister, nodeCounts) 812 if err != nil { 813 return 0, false, err 814 } 815 816 var availableCapacity int64 817 818 // "Remove" one game server at a time in order of potential deletion. (Not actually removed here, 819 // that's done later, if possible, by the fleetautoscaler controller.) 820 for _, gs := range gameServers { 821 replicas-- 822 switch isCounter { 823 case true: 824 if counter, ok := gs.Status.Counters[key]; ok { 825 aggCount -= counter.Count 826 aggCapacity -= counter.Capacity 827 } else { 828 continue 829 } 830 case false: 831 if list, ok := gs.Status.Lists[key]; ok { 832 aggCount -= int64(len(list.Values)) 833 aggCapacity -= list.Capacity 834 } else { 835 continue 836 } 837 } 838 availableCapacity = aggCapacity - aggCount 839 // Check if we've overshot our buffer 840 if availableCapacity < buffer { 841 return replicas + 1, false, nil 842 } 843 // Check if we're Limited (Below MinCapacity) 844 if aggCapacity < minCapacity { 845 return replicas + 1, true, nil 846 } 847 // Check if we're at our desired Buffer 848 if availableCapacity == buffer { 849 return replicas, false, nil 850 } 851 // Check if we're at Limited 852 if aggCapacity == minCapacity { 853 return replicas, true, nil 854 } 855 } 856 857 // We are not currently able to scale down to zero replicas, so one replica is the minimum allowed. 858 if replicas < 1 { 859 replicas = 1 860 } 861 862 return replicas, false, nil 863 } 864 865 func emitChainPolicyEvent(fasLog *FasLogger, chainID string, chainType string) { 866 if fasLog.recorder == nil { 867 return 868 } 869 870 var eventMessage string 871 var eventType string 872 873 if chainID == "Unknown" { 874 eventMessage = fmt.Sprintf("FleetAutoscaler '%s' failed to apply ChainPolicy | ID: %s | Type: %s", 875 fasLog.fas.ObjectMeta.Name, chainID, chainType) 876 eventType = corev1.EventTypeWarning // Use Warning for failure 877 } else { 878 eventMessage = fmt.Sprintf("FleetAutoscaler '%s' successfully applied ChainPolicy | ID: %s | Type: %s", 879 fasLog.fas.ObjectMeta.Name, chainID, chainType) 880 eventType = corev1.EventTypeNormal // Use Normal for success 881 } 882 883 // Emit the event 884 fasLog.recorder.Eventf(fasLog.fas, eventType, "ChainPolicy", eventMessage) 885 }