agones.dev/agones@v1.54.0/pkg/metrics/controller.go (about) 1 // Copyright 2018 Google LLC All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package metrics 16 17 import ( 18 "context" 19 "fmt" 20 "strconv" 21 "strings" 22 "sync" 23 "time" 24 25 agonesv1 "agones.dev/agones/pkg/apis/agones/v1" 26 autoscalingv1 "agones.dev/agones/pkg/apis/autoscaling/v1" 27 "agones.dev/agones/pkg/client/clientset/versioned" 28 "agones.dev/agones/pkg/client/informers/externalversions" 29 listerv1 "agones.dev/agones/pkg/client/listers/agones/v1" 30 autoscalinglisterv1 "agones.dev/agones/pkg/client/listers/autoscaling/v1" 31 fleetsv1 "agones.dev/agones/pkg/fleets" 32 "agones.dev/agones/pkg/util/runtime" 33 lru "github.com/hashicorp/golang-lru" 34 "github.com/pkg/errors" 35 "github.com/sirupsen/logrus" 36 "go.opencensus.io/stats" 37 "go.opencensus.io/tag" 38 corev1 "k8s.io/api/core/v1" 39 apiequality "k8s.io/apimachinery/pkg/api/equality" 40 "k8s.io/apimachinery/pkg/labels" 41 "k8s.io/apimachinery/pkg/util/intstr" 42 "k8s.io/apimachinery/pkg/util/wait" 43 "k8s.io/client-go/informers" 44 "k8s.io/client-go/kubernetes" 45 v1 "k8s.io/client-go/listers/core/v1" 46 "k8s.io/client-go/tools/cache" 47 ) 48 49 const ( 50 noneValue = "none" 51 52 // GameServersStateCount is the size of LRU cache and should contain all gameservers state changes 53 // Upper bound could be estimated as 10_000 of gameservers in total each moment, 10 state changes per each gameserver 54 // and about 10 minutes for a game session, and 6 gameservers per hour. 55 // For one hour 600k capacity would be enough, even if no records would be deleted. 56 // And calcDuration algorithm is removing those records, which already has been changed (old statuses). 57 // Key is Namespace, fleetName, GameServerName, State and float64 as value. 58 // Roughly 256 + 63 + 63 + 16 + 4 = 400 bytes per every record. 59 // In total we would have 229 MiB of space required to store GameServer State durations. 60 GameServersStateCount = 600_000 61 ) 62 63 var ( 64 // MetricResyncPeriod is the interval to re-synchronize metrics based on indexed cache. 65 MetricResyncPeriod = time.Second * 15 66 ) 67 68 func init() { 69 registerViews() 70 } 71 72 // Controller is a metrics controller collecting Agones state metrics 73 // 74 //nolint:govet // ignore fieldalignment, singleton 75 type Controller struct { 76 logger *logrus.Entry 77 gameServerLister listerv1.GameServerLister 78 nodeLister v1.NodeLister 79 gameServerSynced cache.InformerSynced 80 fleetSynced cache.InformerSynced 81 fleetLister listerv1.FleetLister 82 gameServerSetLister listerv1.GameServerSetLister 83 fasSynced cache.InformerSynced 84 fasLister autoscalinglisterv1.FleetAutoscalerLister 85 lock sync.Mutex 86 stateLock sync.Mutex 87 gsCount GameServerCount 88 faCount map[string]int64 89 gameServerStateLastChange *lru.Cache 90 now func() time.Time 91 } 92 93 // NewController returns a new metrics controller 94 func NewController( 95 _ kubernetes.Interface, 96 _ versioned.Interface, 97 kubeInformerFactory informers.SharedInformerFactory, 98 agonesInformerFactory externalversions.SharedInformerFactory) *Controller { 99 100 gameServer := agonesInformerFactory.Agones().V1().GameServers() 101 gsInformer := gameServer.Informer() 102 103 fleets := agonesInformerFactory.Agones().V1().Fleets() 104 fInformer := fleets.Informer() 105 fas := agonesInformerFactory.Autoscaling().V1().FleetAutoscalers() 106 fasInformer := fas.Informer() 107 node := kubeInformerFactory.Core().V1().Nodes() 108 109 gameServerSets := agonesInformerFactory.Agones().V1().GameServerSets() 110 111 // GameServerStateLastChange Contains the time when the GameServer 112 // changed its state last time 113 // on delete and state change remove GameServerName key 114 lruCache, err := lru.New(GameServersStateCount) 115 if err != nil { 116 logger.WithError(err).Fatal("Unable to create LRU cache") 117 } 118 119 c := &Controller{ 120 gameServerLister: gameServer.Lister(), 121 nodeLister: node.Lister(), 122 gameServerSynced: gsInformer.HasSynced, 123 fleetSynced: fInformer.HasSynced, 124 fleetLister: fleets.Lister(), 125 gameServerSetLister: gameServerSets.Lister(), 126 fasSynced: fasInformer.HasSynced, 127 fasLister: fas.Lister(), 128 gsCount: GameServerCount{}, 129 faCount: map[string]int64{}, 130 gameServerStateLastChange: lruCache, 131 now: time.Now, 132 } 133 134 c.logger = runtime.NewLoggerWithType(c) 135 136 _, _ = fInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ 137 AddFunc: c.recordFleetChanges, 138 UpdateFunc: func(_, next interface{}) { 139 c.recordFleetChanges(next) 140 }, 141 DeleteFunc: c.recordFleetDeletion, 142 }) 143 144 _, _ = fasInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ 145 AddFunc: func(added interface{}) { 146 c.recordFleetAutoScalerChanges(nil, added) 147 }, 148 UpdateFunc: c.recordFleetAutoScalerChanges, 149 DeleteFunc: c.recordFleetAutoScalerDeletion, 150 }) 151 152 _, _ = gsInformer.AddEventHandlerWithResyncPeriod(cache.ResourceEventHandlerFuncs{ 153 UpdateFunc: c.recordGameServerStatusChanges, 154 }, 0) 155 156 return c 157 } 158 159 func (c *Controller) recordFleetAutoScalerChanges(old, next interface{}) { 160 161 fas, ok := next.(*autoscalingv1.FleetAutoscaler) 162 if !ok { 163 return 164 } 165 166 // we looking for fleet name changes if that happens we need to reset 167 // metrics for the old fas. 168 if old != nil { 169 if oldFas, ok := old.(*autoscalingv1.FleetAutoscaler); ok && 170 oldFas.Spec.FleetName != fas.Spec.FleetName { 171 c.recordFleetAutoScalerDeletion(old) 172 } 173 } 174 175 // do not record fleetautoscaler, delete event will do this. 176 if fas.DeletionTimestamp != nil { 177 return 178 } 179 180 ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fas.Name), 181 tag.Upsert(keyFleetName, fas.Spec.FleetName), tag.Upsert(keyNamespace, fas.Namespace)) 182 183 ableToScale := 0 184 limited := 0 185 if fas.Status.AbleToScale { 186 ableToScale = 1 187 } 188 if fas.Status.ScalingLimited { 189 limited = 1 190 } 191 // recording status 192 stats.Record(ctx, 193 fasCurrentReplicasStats.M(int64(fas.Status.CurrentReplicas)), 194 fasDesiredReplicasStats.M(int64(fas.Status.DesiredReplicas)), 195 fasAbleToScaleStats.M(int64(ableToScale)), 196 fasLimitedStats.M(int64(limited))) 197 198 // recording buffer policy 199 if fas.Spec.Policy.Buffer != nil { 200 // recording limits 201 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "max")}, 202 fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MaxReplicas))) 203 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "min")}, 204 fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MinReplicas))) 205 206 // recording size 207 if fas.Spec.Policy.Buffer.BufferSize.Type == intstr.String { 208 // as percentage 209 sizeString := fas.Spec.Policy.Buffer.BufferSize.StrVal 210 if sizeString != "" { 211 if size, err := strconv.Atoi(sizeString[:len(sizeString)-1]); err == nil { 212 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "percentage")}, 213 fasBufferSizeStats.M(int64(size))) 214 } 215 } 216 } else { 217 // as count 218 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "count")}, 219 fasBufferSizeStats.M(int64(fas.Spec.Policy.Buffer.BufferSize.IntVal))) 220 } 221 } 222 } 223 224 func (c *Controller) recordFleetAutoScalerDeletion(obj interface{}) { 225 _, ok := obj.(*autoscalingv1.FleetAutoscaler) 226 if !ok { 227 return 228 } 229 230 if err := c.resyncFleetAutoScaler(); err != nil { 231 c.logger.WithError(err).Warn("Could not resync Fleet Autoscaler metrics") 232 } 233 } 234 235 func (c *Controller) recordFleetChanges(obj interface{}) { 236 f, ok := obj.(*agonesv1.Fleet) 237 if !ok { 238 return 239 } 240 241 // do not record fleet, delete event will do this. 242 if f.DeletionTimestamp != nil { 243 return 244 } 245 246 c.recordFleetReplicas(f.Name, f.Namespace, f.Status.Replicas, f.Status.AllocatedReplicas, 247 f.Status.ReadyReplicas, f.Spec.Replicas, f.Status.ReservedReplicas) 248 249 c.recordFleetRolloutPercentage(f) 250 251 if runtime.FeatureEnabled(runtime.FeatureCountsAndLists) { 252 if f.Status.Counters != nil { 253 c.recordCounters(f.Name, f.Namespace, f.Status.Counters) 254 } 255 if f.Status.Lists != nil { 256 c.recordLists(f.Name, f.Namespace, f.Status.Lists) 257 } 258 } 259 } 260 261 func (c *Controller) recordFleetRolloutPercentage(fleet *agonesv1.Fleet) { 262 gameServerSetNamespacedLister := c.gameServerSetLister.GameServerSets(fleet.ObjectMeta.Namespace) 263 list, err := fleetsv1.ListGameServerSetsByFleetOwner(gameServerSetNamespacedLister, fleet) 264 if err != nil { 265 c.logger.Errorf("Error listing GameServerSets for fleet %s in namespace %s: %v", fleet.Name, fleet.Namespace, err.Error()) 266 return 267 } 268 269 active, _ := c.filterGameServerSetByActive(fleet, list) 270 271 if active == nil { 272 fleetName := fleet.ObjectMeta.Namespace + "/" + fleet.ObjectMeta.Name 273 c.logger.Debugf("Could not find active GameServerSet %s", fleetName) 274 active = fleet.GameServerSet() 275 } 276 277 currentReplicas := active.Status.Replicas 278 desiredReplicas := fleet.Spec.Replicas 279 280 ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleet.Name), tag.Upsert(keyNamespace, fleet.GetNamespace())) 281 282 // Record current replicas count 283 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "current_replicas")}, 284 fleetRolloutPercentStats.M(int64(currentReplicas))) 285 286 // Record desired replicas count 287 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired_replicas")}, 288 fleetRolloutPercentStats.M(int64(desiredReplicas))) 289 } 290 291 // filterGameServerSetByActive returns the active GameServerSet (or nil if it 292 // doesn't exist) and then the rest of the GameServerSets that are controlled 293 // by this Fleet 294 func (c *Controller) filterGameServerSetByActive(fleet *agonesv1.Fleet, list []*agonesv1.GameServerSet) (*agonesv1.GameServerSet, []*agonesv1.GameServerSet) { 295 var active *agonesv1.GameServerSet 296 var rest []*agonesv1.GameServerSet 297 298 for _, gsSet := range list { 299 if apiequality.Semantic.DeepEqual(gsSet.Spec.Template, fleet.Spec.Template) { 300 active = gsSet 301 } else { 302 rest = append(rest, gsSet) 303 } 304 } 305 306 return active, rest 307 } 308 309 func (c *Controller) recordFleetDeletion(obj interface{}) { 310 _, ok := obj.(*agonesv1.Fleet) 311 if !ok { 312 return 313 } 314 315 if err := c.resyncFleets(); err != nil { 316 // If for some reason resync fails, the entire metric state for fleets 317 // will be reset whenever the next Fleet gets deleted, in which case 318 // we end up back in a healthy state - so we aren't going to actively retry. 319 c.logger.WithError(err).Warn("Could not resync Fleet Metrics") 320 } 321 } 322 323 // resyncFleets resets all views associated with a Fleet, and recalculates all totals. 324 func (c *Controller) resyncFleets() error { 325 c.lock.Lock() 326 defer c.lock.Unlock() 327 fleets, err := c.fleetLister.List(labels.Everything()) 328 if err != nil { 329 return errors.Wrap(err, "could not resync Fleets") 330 } 331 332 fasList, err := c.fasLister.List(labels.Everything()) 333 if err != nil { 334 return errors.Wrap(err, "could not resync Fleets") 335 } 336 337 resetViews(fleetViews) 338 for _, f := range fleets { 339 c.recordFleetChanges(f) 340 } 341 for _, fas := range fasList { 342 c.recordFleetAutoScalerChanges(nil, fas) 343 } 344 c.collectGameServerCounts() 345 346 return nil 347 } 348 349 // resyncFleetAutoScaler resets all views associated with FleetAutoscalers, and recalculates metric totals. 350 func (c *Controller) resyncFleetAutoScaler() error { 351 c.lock.Lock() 352 defer c.lock.Unlock() 353 354 fasList, err := c.fasLister.List(labels.Everything()) 355 if err != nil { 356 return errors.Wrap(err, "could not resync FleetAutoScalers") 357 } 358 359 resetViews(fleetAutoscalerViews) 360 for _, fas := range fasList { 361 c.recordFleetAutoScalerChanges(nil, fas) 362 } 363 364 return nil 365 } 366 367 func (c *Controller) recordFleetReplicas(fleetName, fleetNamespace string, total, allocated, ready, desired, reserved int32) { 368 369 ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace)) 370 371 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total")}, 372 fleetsReplicasCountStats.M(int64(total))) 373 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated")}, 374 fleetsReplicasCountStats.M(int64(allocated))) 375 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "ready")}, 376 fleetsReplicasCountStats.M(int64(ready))) 377 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired")}, 378 fleetsReplicasCountStats.M(int64(desired))) 379 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "reserved")}, 380 fleetsReplicasCountStats.M(int64(reserved))) 381 } 382 383 // nolint:dupl // Linter errors on lines are duplicate of recordLists 384 func (c *Controller) recordCounters(fleetName, fleetNamespace string, counters map[string]agonesv1.AggregatedCounterStatus) { 385 386 ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace)) 387 388 for counter, counterStatus := range counters { 389 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyCounter, counter)}, 390 fleetCountersStats.M(counterStatus.AllocatedCount)) 391 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyCounter, counter)}, 392 fleetCountersStats.M(counterStatus.AllocatedCapacity)) 393 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyCounter, counter)}, 394 fleetCountersStats.M(counterStatus.Count)) 395 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyCounter, counter)}, 396 fleetCountersStats.M(counterStatus.Capacity)) 397 } 398 } 399 400 // nolint:dupl // Linter errors on lines are duplicate of recordCounters 401 func (c *Controller) recordLists(fleetName, fleetNamespace string, lists map[string]agonesv1.AggregatedListStatus) { 402 403 ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace)) 404 405 for list, listStatus := range lists { 406 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyList, list)}, 407 fleetListsStats.M(listStatus.AllocatedCount)) 408 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyList, list)}, 409 fleetListsStats.M(listStatus.AllocatedCapacity)) 410 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyList, list)}, 411 fleetListsStats.M(listStatus.Count)) 412 RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyList, list)}, 413 fleetListsStats.M(listStatus.Capacity)) 414 } 415 } 416 417 // recordGameServerStatusChanged records gameserver status changes, however since it's based 418 // on cache events some events might collapsed and not appear, for example transition state 419 // like creating, port allocation, could be skipped. 420 // This is still very useful for final state, like READY, ERROR and since this is a counter 421 // (as opposed to gauge) you can aggregate using a rate, let's say how many gameserver are failing 422 // per second. 423 // Addition to the cache are not handled, otherwise resync would make metrics inaccurate by doubling 424 // current gameservers states. 425 func (c *Controller) recordGameServerStatusChanges(old, next interface{}) { 426 newGs, ok := next.(*agonesv1.GameServer) 427 if !ok { 428 return 429 } 430 oldGs, ok := old.(*agonesv1.GameServer) 431 if !ok { 432 return 433 } 434 435 fleetName := newGs.Labels[agonesv1.FleetNameLabel] 436 if fleetName == "" { 437 fleetName = noneValue 438 } 439 440 if runtime.FeatureEnabled(runtime.FeaturePlayerTracking) && 441 newGs.Status.Players != nil && 442 oldGs.Status.Players != nil { 443 444 if newGs.Status.Players.Count != oldGs.Status.Players.Count { 445 RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName), 446 tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerPlayerConnectedTotal.M(newGs.Status.Players.Count)) 447 } 448 449 if newGs.Status.Players.Capacity-newGs.Status.Players.Count != oldGs.Status.Players.Capacity-oldGs.Status.Players.Count { 450 RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName), 451 tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerPlayerCapacityTotal.M(newGs.Status.Players.Capacity-newGs.Status.Players.Count)) 452 } 453 454 } 455 456 if newGs.Status.State != oldGs.Status.State { 457 RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(newGs.Status.State)), 458 tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerTotalStats.M(1)) 459 460 // Calculate the duration of the current state 461 duration, err := c.calcDuration(oldGs, newGs) 462 if err != nil { 463 c.logger.Warn(err.Error()) 464 } else { 465 RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(oldGs.Status.State)), 466 tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gsStateDurationSec.M(duration)) 467 } 468 } 469 } 470 471 // calcDuration calculates the duration between state changes 472 // store current time from creationTimestamp for each update received 473 // Assumptions: there is a possibility that one of the previous state change timestamps would be evicted, 474 // this measurement would be skipped. This is a trade off between accuracy of distribution calculation and the performance. 475 // Presumably occasional miss would not change the statistics too much. 476 func (c *Controller) calcDuration(oldGs, newGs *agonesv1.GameServer) (duration float64, err error) { 477 // currentTime - GameServer time from its start 478 currentTime := c.now().UTC().Sub(newGs.ObjectMeta.CreationTimestamp.Local().UTC()).Seconds() 479 480 fleetName := newGs.Labels[agonesv1.FleetNameLabel] 481 if fleetName == "" { 482 fleetName = defaultFleetTag 483 } 484 485 newGSKey := fmt.Sprintf("%s/%s/%s/%s", newGs.ObjectMeta.Namespace, fleetName, newGs.ObjectMeta.Name, newGs.Status.State) 486 oldGSKey := fmt.Sprintf("%s/%s/%s/%s", oldGs.ObjectMeta.Namespace, fleetName, oldGs.ObjectMeta.Name, oldGs.Status.State) 487 488 c.stateLock.Lock() 489 defer c.stateLock.Unlock() 490 switch { 491 case newGs.Status.State == agonesv1.GameServerStateCreating || newGs.Status.State == agonesv1.GameServerStatePortAllocation: 492 duration = currentTime 493 case !c.gameServerStateLastChange.Contains(oldGSKey): 494 err = fmt.Errorf("unable to calculate '%s' state duration of '%s' GameServer", oldGs.Status.State, oldGs.ObjectMeta.Name) 495 return 0, err 496 default: 497 val, ok := c.gameServerStateLastChange.Get(oldGSKey) 498 if !ok { 499 err = fmt.Errorf("could not find expected key %s", oldGSKey) 500 return 0, err 501 } 502 c.gameServerStateLastChange.Remove(oldGSKey) 503 duration = currentTime - val.(float64) 504 } 505 506 // Assuming that no State changes would occur after Shutdown 507 if newGs.Status.State != agonesv1.GameServerStateShutdown { 508 c.gameServerStateLastChange.Add(newGSKey, currentTime) 509 c.logger.Debugf("Adding new key %s, relative time: %f", newGSKey, currentTime) 510 } 511 if duration < 0. { 512 duration = 0 513 err = fmt.Errorf("negative duration for '%s' state of '%s' GameServer", oldGs.Status.State, oldGs.ObjectMeta.Name) 514 } 515 return duration, err 516 } 517 518 // Run the Metrics controller. Will block until stop is closed. 519 // Collect metrics via cache changes and parse the cache periodically to record resource counts. 520 func (c *Controller) Run(ctx context.Context, _ int) error { 521 c.logger.Debug("Wait for cache sync") 522 if !cache.WaitForCacheSync(ctx.Done(), c.gameServerSynced, c.fleetSynced, c.fasSynced) { 523 return errors.New("failed to wait for caches to sync") 524 } 525 wait.Until(c.collect, MetricResyncPeriod, ctx.Done()) 526 return nil 527 } 528 529 // collect all metrics that are not event-based. 530 // this is fired periodically. 531 func (c *Controller) collect() { 532 c.lock.Lock() 533 defer c.lock.Unlock() 534 c.collectGameServerCounts() 535 c.collectNodeCounts() 536 } 537 538 // collects gameservers count by going through our informer cache 539 // this not meant to be called concurrently 540 func (c *Controller) collectGameServerCounts() { 541 542 gameservers, err := c.gameServerLister.List(labels.Everything()) 543 if err != nil { 544 c.logger.WithError(err).Warn("failed listing gameservers") 545 return 546 } 547 548 if err := c.gsCount.record(gameservers); err != nil { 549 c.logger.WithError(err).Warn("error while recoding stats") 550 } 551 } 552 553 // collectNodeCounts count gameservers per node using informer cache. 554 func (c *Controller) collectNodeCounts() { 555 gsPerNodes := map[string]int32{} 556 557 gameservers, err := c.gameServerLister.List(labels.Everything()) 558 if err != nil { 559 c.logger.WithError(err).Warn("failed listing gameservers") 560 return 561 } 562 for _, gs := range gameservers { 563 if gs.Status.NodeName != "" { 564 gsPerNodes[gs.Status.NodeName]++ 565 } 566 } 567 568 nodes, err := c.nodeLister.List(labels.Everything()) 569 if err != nil { 570 c.logger.WithError(err).Warn("failed listing gameservers") 571 return 572 } 573 574 nodes = removeSystemNodes(nodes) 575 RecordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "true")}, 576 nodesCountStats.M(int64(len(nodes)-len(gsPerNodes)))) 577 RecordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "false")}, 578 nodesCountStats.M(int64(len(gsPerNodes)))) 579 580 for _, node := range nodes { 581 stats.Record(context.Background(), gsPerNodesCountStats.M(int64(gsPerNodes[node.Name]))) 582 } 583 } 584 585 func removeSystemNodes(nodes []*corev1.Node) []*corev1.Node { 586 var result []*corev1.Node 587 588 for _, n := range nodes { 589 if !isSystemNode(n) { 590 result = append(result, n) 591 } 592 } 593 594 return result 595 } 596 597 // isSystemNode determines if a node is a system node, by checking if it has any taints starting with "agones.dev/" 598 func isSystemNode(n *corev1.Node) bool { 599 for _, t := range n.Spec.Taints { 600 if strings.HasPrefix(t.Key, "agones.dev/") { 601 return true 602 } 603 } 604 605 return false 606 }