agones.dev/agones@v1.54.0/pkg/metrics/controller.go (about)

     1  // Copyright 2018 Google LLC All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package metrics
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"strconv"
    21  	"strings"
    22  	"sync"
    23  	"time"
    24  
    25  	agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
    26  	autoscalingv1 "agones.dev/agones/pkg/apis/autoscaling/v1"
    27  	"agones.dev/agones/pkg/client/clientset/versioned"
    28  	"agones.dev/agones/pkg/client/informers/externalversions"
    29  	listerv1 "agones.dev/agones/pkg/client/listers/agones/v1"
    30  	autoscalinglisterv1 "agones.dev/agones/pkg/client/listers/autoscaling/v1"
    31  	fleetsv1 "agones.dev/agones/pkg/fleets"
    32  	"agones.dev/agones/pkg/util/runtime"
    33  	lru "github.com/hashicorp/golang-lru"
    34  	"github.com/pkg/errors"
    35  	"github.com/sirupsen/logrus"
    36  	"go.opencensus.io/stats"
    37  	"go.opencensus.io/tag"
    38  	corev1 "k8s.io/api/core/v1"
    39  	apiequality "k8s.io/apimachinery/pkg/api/equality"
    40  	"k8s.io/apimachinery/pkg/labels"
    41  	"k8s.io/apimachinery/pkg/util/intstr"
    42  	"k8s.io/apimachinery/pkg/util/wait"
    43  	"k8s.io/client-go/informers"
    44  	"k8s.io/client-go/kubernetes"
    45  	v1 "k8s.io/client-go/listers/core/v1"
    46  	"k8s.io/client-go/tools/cache"
    47  )
    48  
const (
	// noneValue is the fleet-name tag value recorded for GameServers that carry no fleet label.
	noneValue = "none"

	// GameServersStateCount is the size of the LRU cache; it should be large enough to hold all
	// GameServer state changes.
	// An upper bound can be estimated from 10_000 GameServers in total at any moment, 10 state changes
	// per GameServer, and about 10 minutes per game session, i.e. 6 GameServers per hour.
	// For one hour a capacity of 600k would be enough, even if no records were ever deleted.
	// In addition, the calcDuration algorithm removes records whose state has already changed (old statuses).
	// The key is made of Namespace, fleetName, GameServerName and State; the value is a float64.
	// That is roughly 256 + 63 + 63 + 16 + 4 = 400 bytes per record.
	// In total about 229 MiB of space is required to store GameServer State durations.
	GameServersStateCount = 600_000
)
    62  
    63  var (
    64  	// MetricResyncPeriod is the interval to re-synchronize metrics based on indexed cache.
    65  	MetricResyncPeriod = time.Second * 15
    66  )
    67  
// init calls registerViews when the package is loaded, so views are set up
// before any Controller is created.
func init() {
	registerViews()
}
    71  
// Controller is a metrics controller collecting Agones state metrics.
//
// Field roles, as used in this file:
//   - logger: typed logger created in NewController.
//   - gameServerLister, nodeLister, fleetLister, gameServerSetLister, fasLister:
//     read-only views over the informer caches.
//   - gameServerSynced, fleetSynced, fasSynced: informer sync checks awaited by Run.
//   - lock: held by collect, resyncFleets and resyncFleetAutoScaler while they
//     rebuild cache-derived metrics.
//   - stateLock: held by calcDuration while it reads and updates
//     gameServerStateLastChange.
//   - gsCount: records GameServer counts in collectGameServerCounts.
//   - faCount: initialised to an empty map in NewController; not otherwise
//     referenced in this part of the file.
//   - gameServerStateLastChange: LRU cache keyed by
//     "namespace/fleet/gameserver/state", holding the number of seconds after
//     GameServer creation at which that state was entered.
//   - now: clock used by calcDuration; NewController sets it to time.Now.
//
//nolint:govet // ignore fieldalignment, singleton
type Controller struct {
	logger                    *logrus.Entry
	gameServerLister          listerv1.GameServerLister
	nodeLister                v1.NodeLister
	gameServerSynced          cache.InformerSynced
	fleetSynced               cache.InformerSynced
	fleetLister               listerv1.FleetLister
	gameServerSetLister       listerv1.GameServerSetLister
	fasSynced                 cache.InformerSynced
	fasLister                 autoscalinglisterv1.FleetAutoscalerLister
	lock                      sync.Mutex
	stateLock                 sync.Mutex
	gsCount                   GameServerCount
	faCount                   map[string]int64
	gameServerStateLastChange *lru.Cache
	now                       func() time.Time
}
    92  
    93  // NewController returns a new metrics controller
    94  func NewController(
    95  	_ kubernetes.Interface,
    96  	_ versioned.Interface,
    97  	kubeInformerFactory informers.SharedInformerFactory,
    98  	agonesInformerFactory externalversions.SharedInformerFactory) *Controller {
    99  
   100  	gameServer := agonesInformerFactory.Agones().V1().GameServers()
   101  	gsInformer := gameServer.Informer()
   102  
   103  	fleets := agonesInformerFactory.Agones().V1().Fleets()
   104  	fInformer := fleets.Informer()
   105  	fas := agonesInformerFactory.Autoscaling().V1().FleetAutoscalers()
   106  	fasInformer := fas.Informer()
   107  	node := kubeInformerFactory.Core().V1().Nodes()
   108  
   109  	gameServerSets := agonesInformerFactory.Agones().V1().GameServerSets()
   110  
   111  	// GameServerStateLastChange Contains the time when the GameServer
   112  	// changed its state last time
   113  	// on delete and state change remove GameServerName key
   114  	lruCache, err := lru.New(GameServersStateCount)
   115  	if err != nil {
   116  		logger.WithError(err).Fatal("Unable to create LRU cache")
   117  	}
   118  
   119  	c := &Controller{
   120  		gameServerLister:          gameServer.Lister(),
   121  		nodeLister:                node.Lister(),
   122  		gameServerSynced:          gsInformer.HasSynced,
   123  		fleetSynced:               fInformer.HasSynced,
   124  		fleetLister:               fleets.Lister(),
   125  		gameServerSetLister:       gameServerSets.Lister(),
   126  		fasSynced:                 fasInformer.HasSynced,
   127  		fasLister:                 fas.Lister(),
   128  		gsCount:                   GameServerCount{},
   129  		faCount:                   map[string]int64{},
   130  		gameServerStateLastChange: lruCache,
   131  		now:                       time.Now,
   132  	}
   133  
   134  	c.logger = runtime.NewLoggerWithType(c)
   135  
   136  	_, _ = fInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
   137  		AddFunc: c.recordFleetChanges,
   138  		UpdateFunc: func(_, next interface{}) {
   139  			c.recordFleetChanges(next)
   140  		},
   141  		DeleteFunc: c.recordFleetDeletion,
   142  	})
   143  
   144  	_, _ = fasInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
   145  		AddFunc: func(added interface{}) {
   146  			c.recordFleetAutoScalerChanges(nil, added)
   147  		},
   148  		UpdateFunc: c.recordFleetAutoScalerChanges,
   149  		DeleteFunc: c.recordFleetAutoScalerDeletion,
   150  	})
   151  
   152  	_, _ = gsInformer.AddEventHandlerWithResyncPeriod(cache.ResourceEventHandlerFuncs{
   153  		UpdateFunc: c.recordGameServerStatusChanges,
   154  	}, 0)
   155  
   156  	return c
   157  }
   158  
   159  func (c *Controller) recordFleetAutoScalerChanges(old, next interface{}) {
   160  
   161  	fas, ok := next.(*autoscalingv1.FleetAutoscaler)
   162  	if !ok {
   163  		return
   164  	}
   165  
   166  	// we looking for fleet name changes if that happens we need to reset
   167  	// metrics for the old fas.
   168  	if old != nil {
   169  		if oldFas, ok := old.(*autoscalingv1.FleetAutoscaler); ok &&
   170  			oldFas.Spec.FleetName != fas.Spec.FleetName {
   171  			c.recordFleetAutoScalerDeletion(old)
   172  		}
   173  	}
   174  
   175  	// do not record fleetautoscaler, delete event will do this.
   176  	if fas.DeletionTimestamp != nil {
   177  		return
   178  	}
   179  
   180  	ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fas.Name),
   181  		tag.Upsert(keyFleetName, fas.Spec.FleetName), tag.Upsert(keyNamespace, fas.Namespace))
   182  
   183  	ableToScale := 0
   184  	limited := 0
   185  	if fas.Status.AbleToScale {
   186  		ableToScale = 1
   187  	}
   188  	if fas.Status.ScalingLimited {
   189  		limited = 1
   190  	}
   191  	// recording status
   192  	stats.Record(ctx,
   193  		fasCurrentReplicasStats.M(int64(fas.Status.CurrentReplicas)),
   194  		fasDesiredReplicasStats.M(int64(fas.Status.DesiredReplicas)),
   195  		fasAbleToScaleStats.M(int64(ableToScale)),
   196  		fasLimitedStats.M(int64(limited)))
   197  
   198  	// recording buffer policy
   199  	if fas.Spec.Policy.Buffer != nil {
   200  		// recording limits
   201  		RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "max")},
   202  			fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MaxReplicas)))
   203  		RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "min")},
   204  			fasBufferLimitsCountStats.M(int64(fas.Spec.Policy.Buffer.MinReplicas)))
   205  
   206  		// recording size
   207  		if fas.Spec.Policy.Buffer.BufferSize.Type == intstr.String {
   208  			// as percentage
   209  			sizeString := fas.Spec.Policy.Buffer.BufferSize.StrVal
   210  			if sizeString != "" {
   211  				if size, err := strconv.Atoi(sizeString[:len(sizeString)-1]); err == nil {
   212  					RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "percentage")},
   213  						fasBufferSizeStats.M(int64(size)))
   214  				}
   215  			}
   216  		} else {
   217  			// as count
   218  			RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "count")},
   219  				fasBufferSizeStats.M(int64(fas.Spec.Policy.Buffer.BufferSize.IntVal)))
   220  		}
   221  	}
   222  }
   223  
   224  func (c *Controller) recordFleetAutoScalerDeletion(obj interface{}) {
   225  	_, ok := obj.(*autoscalingv1.FleetAutoscaler)
   226  	if !ok {
   227  		return
   228  	}
   229  
   230  	if err := c.resyncFleetAutoScaler(); err != nil {
   231  		c.logger.WithError(err).Warn("Could not resync Fleet Autoscaler metrics")
   232  	}
   233  }
   234  
   235  func (c *Controller) recordFleetChanges(obj interface{}) {
   236  	f, ok := obj.(*agonesv1.Fleet)
   237  	if !ok {
   238  		return
   239  	}
   240  
   241  	// do not record fleet, delete event will do this.
   242  	if f.DeletionTimestamp != nil {
   243  		return
   244  	}
   245  
   246  	c.recordFleetReplicas(f.Name, f.Namespace, f.Status.Replicas, f.Status.AllocatedReplicas,
   247  		f.Status.ReadyReplicas, f.Spec.Replicas, f.Status.ReservedReplicas)
   248  
   249  	c.recordFleetRolloutPercentage(f)
   250  
   251  	if runtime.FeatureEnabled(runtime.FeatureCountsAndLists) {
   252  		if f.Status.Counters != nil {
   253  			c.recordCounters(f.Name, f.Namespace, f.Status.Counters)
   254  		}
   255  		if f.Status.Lists != nil {
   256  			c.recordLists(f.Name, f.Namespace, f.Status.Lists)
   257  		}
   258  	}
   259  }
   260  
   261  func (c *Controller) recordFleetRolloutPercentage(fleet *agonesv1.Fleet) {
   262  	gameServerSetNamespacedLister := c.gameServerSetLister.GameServerSets(fleet.ObjectMeta.Namespace)
   263  	list, err := fleetsv1.ListGameServerSetsByFleetOwner(gameServerSetNamespacedLister, fleet)
   264  	if err != nil {
   265  		c.logger.Errorf("Error listing GameServerSets for fleet %s in namespace %s: %v", fleet.Name, fleet.Namespace, err.Error())
   266  		return
   267  	}
   268  
   269  	active, _ := c.filterGameServerSetByActive(fleet, list)
   270  
   271  	if active == nil {
   272  		fleetName := fleet.ObjectMeta.Namespace + "/" + fleet.ObjectMeta.Name
   273  		c.logger.Debugf("Could not find active GameServerSet %s", fleetName)
   274  		active = fleet.GameServerSet()
   275  	}
   276  
   277  	currentReplicas := active.Status.Replicas
   278  	desiredReplicas := fleet.Spec.Replicas
   279  
   280  	ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleet.Name), tag.Upsert(keyNamespace, fleet.GetNamespace()))
   281  
   282  	// Record current replicas count
   283  	RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "current_replicas")},
   284  		fleetRolloutPercentStats.M(int64(currentReplicas)))
   285  
   286  	// Record desired replicas count
   287  	RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired_replicas")},
   288  		fleetRolloutPercentStats.M(int64(desiredReplicas)))
   289  }
   290  
   291  // filterGameServerSetByActive returns the active GameServerSet (or nil if it
   292  // doesn't exist) and then the rest of the GameServerSets that are controlled
   293  // by this Fleet
   294  func (c *Controller) filterGameServerSetByActive(fleet *agonesv1.Fleet, list []*agonesv1.GameServerSet) (*agonesv1.GameServerSet, []*agonesv1.GameServerSet) {
   295  	var active *agonesv1.GameServerSet
   296  	var rest []*agonesv1.GameServerSet
   297  
   298  	for _, gsSet := range list {
   299  		if apiequality.Semantic.DeepEqual(gsSet.Spec.Template, fleet.Spec.Template) {
   300  			active = gsSet
   301  		} else {
   302  			rest = append(rest, gsSet)
   303  		}
   304  	}
   305  
   306  	return active, rest
   307  }
   308  
   309  func (c *Controller) recordFleetDeletion(obj interface{}) {
   310  	_, ok := obj.(*agonesv1.Fleet)
   311  	if !ok {
   312  		return
   313  	}
   314  
   315  	if err := c.resyncFleets(); err != nil {
   316  		// If for some reason resync fails, the entire metric state for fleets
   317  		// will be reset whenever the next Fleet gets deleted, in which case
   318  		// we end up back in a healthy state - so we aren't going to actively retry.
   319  		c.logger.WithError(err).Warn("Could not resync Fleet Metrics")
   320  	}
   321  }
   322  
   323  // resyncFleets resets all views associated with a Fleet, and recalculates all totals.
   324  func (c *Controller) resyncFleets() error {
   325  	c.lock.Lock()
   326  	defer c.lock.Unlock()
   327  	fleets, err := c.fleetLister.List(labels.Everything())
   328  	if err != nil {
   329  		return errors.Wrap(err, "could not resync Fleets")
   330  	}
   331  
   332  	fasList, err := c.fasLister.List(labels.Everything())
   333  	if err != nil {
   334  		return errors.Wrap(err, "could not resync Fleets")
   335  	}
   336  
   337  	resetViews(fleetViews)
   338  	for _, f := range fleets {
   339  		c.recordFleetChanges(f)
   340  	}
   341  	for _, fas := range fasList {
   342  		c.recordFleetAutoScalerChanges(nil, fas)
   343  	}
   344  	c.collectGameServerCounts()
   345  
   346  	return nil
   347  }
   348  
   349  // resyncFleetAutoScaler resets all views associated with FleetAutoscalers, and recalculates metric totals.
   350  func (c *Controller) resyncFleetAutoScaler() error {
   351  	c.lock.Lock()
   352  	defer c.lock.Unlock()
   353  
   354  	fasList, err := c.fasLister.List(labels.Everything())
   355  	if err != nil {
   356  		return errors.Wrap(err, "could not resync FleetAutoScalers")
   357  	}
   358  
   359  	resetViews(fleetAutoscalerViews)
   360  	for _, fas := range fasList {
   361  		c.recordFleetAutoScalerChanges(nil, fas)
   362  	}
   363  
   364  	return nil
   365  }
   366  
   367  func (c *Controller) recordFleetReplicas(fleetName, fleetNamespace string, total, allocated, ready, desired, reserved int32) {
   368  
   369  	ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace))
   370  
   371  	RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total")},
   372  		fleetsReplicasCountStats.M(int64(total)))
   373  	RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated")},
   374  		fleetsReplicasCountStats.M(int64(allocated)))
   375  	RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "ready")},
   376  		fleetsReplicasCountStats.M(int64(ready)))
   377  	RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "desired")},
   378  		fleetsReplicasCountStats.M(int64(desired)))
   379  	RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "reserved")},
   380  		fleetsReplicasCountStats.M(int64(reserved)))
   381  }
   382  
   383  // nolint:dupl // Linter errors on lines are duplicate of recordLists
   384  func (c *Controller) recordCounters(fleetName, fleetNamespace string, counters map[string]agonesv1.AggregatedCounterStatus) {
   385  
   386  	ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace))
   387  
   388  	for counter, counterStatus := range counters {
   389  		RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyCounter, counter)},
   390  			fleetCountersStats.M(counterStatus.AllocatedCount))
   391  		RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyCounter, counter)},
   392  			fleetCountersStats.M(counterStatus.AllocatedCapacity))
   393  		RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyCounter, counter)},
   394  			fleetCountersStats.M(counterStatus.Count))
   395  		RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyCounter, counter)},
   396  			fleetCountersStats.M(counterStatus.Capacity))
   397  	}
   398  }
   399  
   400  // nolint:dupl // Linter errors on lines are duplicate of recordCounters
   401  func (c *Controller) recordLists(fleetName, fleetNamespace string, lists map[string]agonesv1.AggregatedListStatus) {
   402  
   403  	ctx, _ := tag.New(context.Background(), tag.Upsert(keyName, fleetName), tag.Upsert(keyNamespace, fleetNamespace))
   404  
   405  	for list, listStatus := range lists {
   406  		RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_count"), tag.Upsert(keyList, list)},
   407  			fleetListsStats.M(listStatus.AllocatedCount))
   408  		RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "allocated_capacity"), tag.Upsert(keyList, list)},
   409  			fleetListsStats.M(listStatus.AllocatedCapacity))
   410  		RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_count"), tag.Upsert(keyList, list)},
   411  			fleetListsStats.M(listStatus.Count))
   412  		RecordWithTags(ctx, []tag.Mutator{tag.Upsert(keyType, "total_capacity"), tag.Upsert(keyList, list)},
   413  			fleetListsStats.M(listStatus.Capacity))
   414  	}
   415  }
   416  
   417  // recordGameServerStatusChanged records gameserver status changes, however since it's based
   418  // on cache events some events might collapsed and not appear, for example transition state
   419  // like creating, port allocation, could be skipped.
   420  // This is still very useful for final state, like READY, ERROR and since this is a counter
   421  // (as opposed to gauge) you can aggregate using a rate, let's say how many gameserver are failing
   422  // per second.
   423  // Addition to the cache are not handled, otherwise resync would make metrics inaccurate by doubling
   424  // current gameservers states.
   425  func (c *Controller) recordGameServerStatusChanges(old, next interface{}) {
   426  	newGs, ok := next.(*agonesv1.GameServer)
   427  	if !ok {
   428  		return
   429  	}
   430  	oldGs, ok := old.(*agonesv1.GameServer)
   431  	if !ok {
   432  		return
   433  	}
   434  
   435  	fleetName := newGs.Labels[agonesv1.FleetNameLabel]
   436  	if fleetName == "" {
   437  		fleetName = noneValue
   438  	}
   439  
   440  	if runtime.FeatureEnabled(runtime.FeaturePlayerTracking) &&
   441  		newGs.Status.Players != nil &&
   442  		oldGs.Status.Players != nil {
   443  
   444  		if newGs.Status.Players.Count != oldGs.Status.Players.Count {
   445  			RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName),
   446  				tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerPlayerConnectedTotal.M(newGs.Status.Players.Count))
   447  		}
   448  
   449  		if newGs.Status.Players.Capacity-newGs.Status.Players.Count != oldGs.Status.Players.Capacity-oldGs.Status.Players.Count {
   450  			RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyFleetName, fleetName),
   451  				tag.Upsert(keyName, newGs.GetName()), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerPlayerCapacityTotal.M(newGs.Status.Players.Capacity-newGs.Status.Players.Count))
   452  		}
   453  
   454  	}
   455  
   456  	if newGs.Status.State != oldGs.Status.State {
   457  		RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(newGs.Status.State)),
   458  			tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gameServerTotalStats.M(1))
   459  
   460  		// Calculate the duration of the current state
   461  		duration, err := c.calcDuration(oldGs, newGs)
   462  		if err != nil {
   463  			c.logger.Warn(err.Error())
   464  		} else {
   465  			RecordWithTags(context.Background(), []tag.Mutator{tag.Upsert(keyType, string(oldGs.Status.State)),
   466  				tag.Upsert(keyFleetName, fleetName), tag.Upsert(keyNamespace, newGs.GetNamespace())}, gsStateDurationSec.M(duration))
   467  		}
   468  	}
   469  }
   470  
   471  // calcDuration calculates the duration between state changes
   472  // store current time from creationTimestamp for each update received
   473  // Assumptions: there is a possibility that one of the previous state change timestamps would be evicted,
   474  // this measurement would be skipped. This is a trade off between accuracy of distribution calculation and the performance.
   475  // Presumably occasional miss would not change the statistics too much.
   476  func (c *Controller) calcDuration(oldGs, newGs *agonesv1.GameServer) (duration float64, err error) {
   477  	// currentTime - GameServer time from its start
   478  	currentTime := c.now().UTC().Sub(newGs.ObjectMeta.CreationTimestamp.Local().UTC()).Seconds()
   479  
   480  	fleetName := newGs.Labels[agonesv1.FleetNameLabel]
   481  	if fleetName == "" {
   482  		fleetName = defaultFleetTag
   483  	}
   484  
   485  	newGSKey := fmt.Sprintf("%s/%s/%s/%s", newGs.ObjectMeta.Namespace, fleetName, newGs.ObjectMeta.Name, newGs.Status.State)
   486  	oldGSKey := fmt.Sprintf("%s/%s/%s/%s", oldGs.ObjectMeta.Namespace, fleetName, oldGs.ObjectMeta.Name, oldGs.Status.State)
   487  
   488  	c.stateLock.Lock()
   489  	defer c.stateLock.Unlock()
   490  	switch {
   491  	case newGs.Status.State == agonesv1.GameServerStateCreating || newGs.Status.State == agonesv1.GameServerStatePortAllocation:
   492  		duration = currentTime
   493  	case !c.gameServerStateLastChange.Contains(oldGSKey):
   494  		err = fmt.Errorf("unable to calculate '%s' state duration of '%s' GameServer", oldGs.Status.State, oldGs.ObjectMeta.Name)
   495  		return 0, err
   496  	default:
   497  		val, ok := c.gameServerStateLastChange.Get(oldGSKey)
   498  		if !ok {
   499  			err = fmt.Errorf("could not find expected key %s", oldGSKey)
   500  			return 0, err
   501  		}
   502  		c.gameServerStateLastChange.Remove(oldGSKey)
   503  		duration = currentTime - val.(float64)
   504  	}
   505  
   506  	// Assuming that no State changes would occur after Shutdown
   507  	if newGs.Status.State != agonesv1.GameServerStateShutdown {
   508  		c.gameServerStateLastChange.Add(newGSKey, currentTime)
   509  		c.logger.Debugf("Adding new key %s, relative time: %f", newGSKey, currentTime)
   510  	}
   511  	if duration < 0. {
   512  		duration = 0
   513  		err = fmt.Errorf("negative duration for '%s' state of '%s' GameServer", oldGs.Status.State, oldGs.ObjectMeta.Name)
   514  	}
   515  	return duration, err
   516  }
   517  
   518  // Run the Metrics controller. Will block until stop is closed.
   519  // Collect metrics via cache changes and parse the cache periodically to record resource counts.
   520  func (c *Controller) Run(ctx context.Context, _ int) error {
   521  	c.logger.Debug("Wait for cache sync")
   522  	if !cache.WaitForCacheSync(ctx.Done(), c.gameServerSynced, c.fleetSynced, c.fasSynced) {
   523  		return errors.New("failed to wait for caches to sync")
   524  	}
   525  	wait.Until(c.collect, MetricResyncPeriod, ctx.Done())
   526  	return nil
   527  }
   528  
   529  // collect all metrics that are not event-based.
   530  // this is fired periodically.
   531  func (c *Controller) collect() {
   532  	c.lock.Lock()
   533  	defer c.lock.Unlock()
   534  	c.collectGameServerCounts()
   535  	c.collectNodeCounts()
   536  }
   537  
   538  // collects gameservers count by going through our informer cache
   539  // this not meant to be called concurrently
   540  func (c *Controller) collectGameServerCounts() {
   541  
   542  	gameservers, err := c.gameServerLister.List(labels.Everything())
   543  	if err != nil {
   544  		c.logger.WithError(err).Warn("failed listing gameservers")
   545  		return
   546  	}
   547  
   548  	if err := c.gsCount.record(gameservers); err != nil {
   549  		c.logger.WithError(err).Warn("error while recoding stats")
   550  	}
   551  }
   552  
   553  // collectNodeCounts count gameservers per node using informer cache.
   554  func (c *Controller) collectNodeCounts() {
   555  	gsPerNodes := map[string]int32{}
   556  
   557  	gameservers, err := c.gameServerLister.List(labels.Everything())
   558  	if err != nil {
   559  		c.logger.WithError(err).Warn("failed listing gameservers")
   560  		return
   561  	}
   562  	for _, gs := range gameservers {
   563  		if gs.Status.NodeName != "" {
   564  			gsPerNodes[gs.Status.NodeName]++
   565  		}
   566  	}
   567  
   568  	nodes, err := c.nodeLister.List(labels.Everything())
   569  	if err != nil {
   570  		c.logger.WithError(err).Warn("failed listing gameservers")
   571  		return
   572  	}
   573  
   574  	nodes = removeSystemNodes(nodes)
   575  	RecordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "true")},
   576  		nodesCountStats.M(int64(len(nodes)-len(gsPerNodes))))
   577  	RecordWithTags(context.Background(), []tag.Mutator{tag.Insert(keyEmpty, "false")},
   578  		nodesCountStats.M(int64(len(gsPerNodes))))
   579  
   580  	for _, node := range nodes {
   581  		stats.Record(context.Background(), gsPerNodesCountStats.M(int64(gsPerNodes[node.Name])))
   582  	}
   583  }
   584  
   585  func removeSystemNodes(nodes []*corev1.Node) []*corev1.Node {
   586  	var result []*corev1.Node
   587  
   588  	for _, n := range nodes {
   589  		if !isSystemNode(n) {
   590  			result = append(result, n)
   591  		}
   592  	}
   593  
   594  	return result
   595  }
   596  
   597  // isSystemNode determines if a node is a system node, by checking if it has any taints starting with "agones.dev/"
   598  func isSystemNode(n *corev1.Node) bool {
   599  	for _, t := range n.Spec.Taints {
   600  		if strings.HasPrefix(t.Key, "agones.dev/") {
   601  			return true
   602  		}
   603  	}
   604  
   605  	return false
   606  }