agones.dev/agones@v1.54.0/pkg/gameserverallocations/allocator.go (about)

     1  // Copyright 2019 Google LLC All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package gameserverallocations
    16  
    17  import (
    18  	"context"
    19  	"crypto/tls"
    20  	"crypto/x509"
    21  	goErrors "errors"
    22  	"fmt"
    23  	"strings"
    24  	"time"
    25  
    26  	"agones.dev/agones/pkg/allocation/converters"
    27  	pb "agones.dev/agones/pkg/allocation/go"
    28  	"agones.dev/agones/pkg/apis"
    29  	agonesv1 "agones.dev/agones/pkg/apis/agones/v1"
    30  	allocationv1 "agones.dev/agones/pkg/apis/allocation/v1"
    31  	multiclusterv1 "agones.dev/agones/pkg/apis/multicluster/v1"
    32  	getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1"
    33  	multiclusterinformerv1 "agones.dev/agones/pkg/client/informers/externalversions/multicluster/v1"
    34  	multiclusterlisterv1 "agones.dev/agones/pkg/client/listers/multicluster/v1"
    35  	"agones.dev/agones/pkg/util/apiserver"
    36  	"agones.dev/agones/pkg/util/logfields"
    37  	"agones.dev/agones/pkg/util/runtime"
    38  	"github.com/pkg/errors"
    39  	"github.com/sirupsen/logrus"
    40  	"go.opencensus.io/tag"
    41  	"google.golang.org/grpc"
    42  	"google.golang.org/grpc/codes"
    43  	"google.golang.org/grpc/credentials"
    44  	"google.golang.org/grpc/status"
    45  	corev1 "k8s.io/api/core/v1"
    46  	k8serrors "k8s.io/apimachinery/pkg/api/errors"
    47  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    48  	"k8s.io/apimachinery/pkg/labels"
    49  	k8sruntime "k8s.io/apimachinery/pkg/runtime"
    50  	"k8s.io/apimachinery/pkg/runtime/schema"
    51  	runtimeschema "k8s.io/apimachinery/pkg/runtime/schema"
    52  	"k8s.io/apimachinery/pkg/util/wait"
    53  	informercorev1 "k8s.io/client-go/informers/core/v1"
    54  	"k8s.io/client-go/kubernetes"
    55  	"k8s.io/client-go/kubernetes/scheme"
    56  	typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1"
    57  	corev1lister "k8s.io/client-go/listers/core/v1"
    58  	"k8s.io/client-go/tools/cache"
    59  	"k8s.io/client-go/tools/record"
    60  )
    61  
var (
	// ErrNoGameServer is returned when there are no Allocatable GameServers
	// available
	ErrNoGameServer = errors.New("Could not find an Allocatable GameServer")
	// ErrConflictInGameServerSelection is returned when the candidate gameserver already allocated
	ErrConflictInGameServerSelection = errors.New("The Gameserver was already allocated")
	// ErrTotalTimeoutExceeded is used to signal that total retry timeout has been exceeded and no additional retries should be made.
	// It carries gRPC code DeadlineExceeded so remote callers receive a meaningful status.
	ErrTotalTimeoutExceeded = status.Errorf(codes.DeadlineExceeded, "remote allocation total timeout exceeded")
	// ErrGameServerUpdateConflict is returned when the game server selected for applying the allocation cannot be updated
	ErrGameServerUpdateConflict = errors.New("could not update the selected GameServer")
)
    73  
const (
	// LastAllocatedAnnotationKey is a GameServer annotation containing an RFC 3339 formatted
	// timestamp of the most recent allocation.
	LastAllocatedAnnotationKey = "agones.dev/last-allocated"

	// secretClientCertName/secretClientKeyName/secretCACertName are the keys
	// looked up in a multi-cluster client Secret's Data map; allocatorPort is
	// the default port appended to remote endpoints that specify none;
	// maxBatchQueue sizes the pendingRequests channel (and update worker
	// count); the batch's sorted GameServer list is refreshed after
	// maxBatchBeforeRefresh allocations; localAllocationSource is the
	// Status.Source value for allocations served by this cluster.
	secretClientCertName  = "tls.crt"
	secretClientKeyName   = "tls.key"
	secretCACertName      = "ca.crt"
	allocatorPort         = "443"
	maxBatchQueue         = 100
	maxBatchBeforeRefresh = 100
	localAllocationSource = "local"
)
    87  
// allocationRetry is the backoff used when retrying local allocations
// (e.g. after a GameServer selection conflict): 5 attempts, ~10ms apart
// (Factor 1.0 keeps the interval constant), with 10% jitter.
var allocationRetry = wait.Backoff{
	Steps:    5,
	Duration: 10 * time.Millisecond,
	Factor:   1.0,
	Jitter:   0.1,
}

// remoteAllocationRetry is the backoff used when retrying remote-cluster
// allocation calls: 7 attempts, doubling from 100ms.
var remoteAllocationRetry = wait.Backoff{
	Steps:    7,
	Duration: 100 * time.Millisecond,
	Factor:   2.0,
}
   100  
// Allocator handles game server allocation
//
// Allocation requests are funneled through the buffered pendingRequests
// channel into the batching loop (ListenAndAllocate), which matches requests
// against the AllocationCache's pre-sorted list of allocatable GameServers.
// Multi-cluster allocations are forwarded over gRPC via
// remoteAllocationCallback (overridable for testing), authenticated with
// certificates read through secretLister.
type Allocator struct {
	baseLogger                   *logrus.Entry
	allocationPolicyLister       multiclusterlisterv1.GameServerAllocationPolicyLister
	allocationPolicySynced       cache.InformerSynced
	secretLister                 corev1lister.SecretLister
	secretSynced                 cache.InformerSynced
	gameServerGetter             getterv1.GameServersGetter
	recorder                     record.EventRecorder
	pendingRequests              chan request
	allocationCache              *AllocationCache
	remoteAllocationCallback     func(context.Context, string, grpc.DialOption, *pb.AllocationRequest) (*pb.AllocationResponse, error)
	remoteAllocationTimeout      time.Duration
	totalRemoteAllocationTimeout time.Duration
	batchWaitTime                time.Duration
}
   117  
// request is an async request for allocation; exactly one result is
// delivered on the response channel by the batching loop or a worker.
type request struct {
	gsa      *allocationv1.GameServerAllocation
	response chan response
}

// response is an async response for a matching request: either the
// GameServer that was (or is being) allocated, or the error that
// prevented allocation.
type response struct {
	request request
	gs      *agonesv1.GameServer
	err     error
}
   130  
// NewAllocator creates an instance of Allocator
//
// The returned Allocator is not yet running: call Run to wait for the
// informer caches and start the batching loop.
func NewAllocator(policyInformer multiclusterinformerv1.GameServerAllocationPolicyInformer, secretInformer informercorev1.SecretInformer, gameServerGetter getterv1.GameServersGetter,
	kubeClient kubernetes.Interface, allocationCache *AllocationCache, remoteAllocationTimeout time.Duration, totalRemoteAllocationTimeout time.Duration, batchWaitTime time.Duration) *Allocator {
	ah := &Allocator{
		pendingRequests:              make(chan request, maxBatchQueue),
		allocationPolicyLister:       policyInformer.Lister(),
		allocationPolicySynced:       policyInformer.Informer().HasSynced,
		secretLister:                 secretInformer.Lister(),
		secretSynced:                 secretInformer.Informer().HasSynced,
		gameServerGetter:             gameServerGetter,
		allocationCache:              allocationCache,
		batchWaitTime:                batchWaitTime,
		remoteAllocationTimeout:      remoteAllocationTimeout,
		totalRemoteAllocationTimeout: totalRemoteAllocationTimeout,
		// default remote allocation path: dial the endpoint over gRPC, bound
		// the single call by remoteAllocationTimeout, close the connection
		// when done. Overridable in tests via the struct field.
		remoteAllocationCallback: func(ctx context.Context, endpoint string, dialOpts grpc.DialOption, request *pb.AllocationRequest) (*pb.AllocationResponse, error) {
			conn, err := grpc.NewClient(endpoint, dialOpts)
			if err != nil {
				return nil, err
			}
			defer conn.Close() // nolint: errcheck

			allocationCtx, cancel := context.WithTimeout(ctx, remoteAllocationTimeout)
			defer cancel() // nolint: errcheck
			grpcClient := pb.NewAllocationServiceClient(conn)
			return grpcClient.Allocate(allocationCtx, request)
		},
	}

	// wire up event broadcasting so allocation outcomes surface both in the
	// debug log and as Kubernetes Events on the affected GameServers
	ah.baseLogger = runtime.NewLoggerWithType(ah)
	eventBroadcaster := record.NewBroadcaster()
	eventBroadcaster.StartLogging(ah.baseLogger.Debugf)
	eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")})
	ah.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "GameServerAllocation-Allocator"})

	return ah
}
   167  
   168  // Run initiates the listeners.
   169  func (c *Allocator) Run(ctx context.Context) error {
   170  	if err := c.Sync(ctx); err != nil {
   171  		return err
   172  	}
   173  
   174  	if err := c.allocationCache.Run(ctx); err != nil {
   175  		return err
   176  	}
   177  
   178  	// workers and logic for batching allocations
   179  	go c.ListenAndAllocate(ctx, maxBatchQueue)
   180  
   181  	return nil
   182  }
   183  
   184  // Sync waits for cache to sync
   185  func (c *Allocator) Sync(ctx context.Context) error {
   186  	c.baseLogger.Debug("Wait for Allocator cache sync")
   187  	if !cache.WaitForCacheSync(ctx.Done(), c.secretSynced, c.allocationPolicySynced) {
   188  		return errors.New("failed to wait for caches to sync")
   189  	}
   190  	return nil
   191  }
   192  
// Allocate CRDHandler for allocating a gameserver.
//
// Returns either the mutated GameServerAllocation (with Status populated) or,
// on validation failure, a Kubernetes Status object describing the Invalid
// error — note that validation failures are returned as (Status, nil), not
// as a Go error.
func (c *Allocator) Allocate(ctx context.Context, gsa *allocationv1.GameServerAllocation) (out k8sruntime.Object, err error) {
	// record latency and outcome metrics regardless of how we exit
	latency := c.newMetrics(ctx)
	defer func() {
		if err != nil {
			latency.setError()
		}
		latency.record()
	}()
	latency.setRequest(gsa)

	// server side validation
	if errs := gsa.Validate(); len(errs) > 0 {
		kind := runtimeschema.GroupKind{
			Group: allocationv1.SchemeGroupVersion.Group,
			Kind:  "GameServerAllocation",
		}
		// wrap the field errors in a structured Invalid Status so API callers
		// get a proper machine-readable response body
		statusErr := k8serrors.NewInvalid(kind, gsa.Name, errs)
		s := &statusErr.ErrStatus
		var gvks []schema.GroupVersionKind
		gvks, _, err := apiserver.Scheme.ObjectKinds(s)
		if err != nil {
			return nil, errors.Wrap(err, "could not find objectkinds for status")
		}

		c.loggerForGameServerAllocation(gsa).Debug("GameServerAllocation is invalid")
		// assumes ObjectKinds returned at least one kind for Status — it is
		// registered in the scheme, so gvks is non-empty here
		s.TypeMeta = metav1.TypeMeta{Kind: gvks[0].Kind, APIVersion: gvks[0].Version}
		return s, nil
	}

	// Convert gsa required and preferred fields to selectors field
	gsa.Converter()

	// If multi-cluster setting is enabled, allocate base on the multicluster allocation policy.
	if gsa.Spec.MultiClusterSetting.Enabled {
		out, err = c.applyMultiClusterAllocation(ctx, gsa)
	} else {
		out, err = c.allocateFromLocalCluster(ctx, gsa)
	}

	if err != nil {
		c.loggerForGameServerAllocation(gsa).WithError(err).Error("allocation failed")
		return nil, err
	}
	latency.setResponse(out)

	return out, nil
}
   241  
// loggerForGameServerAllocationKey returns the base logger augmented with
// the given GameServerAllocation key (namespace/name) as a structured field.
func (c *Allocator) loggerForGameServerAllocationKey(key string) *logrus.Entry {
	return logfields.AugmentLogEntry(c.baseLogger, logfields.GameServerAllocationKey, key)
}
   245  
   246  func (c *Allocator) loggerForGameServerAllocation(gsa *allocationv1.GameServerAllocation) *logrus.Entry {
   247  	gsaName := "NilGameServerAllocation"
   248  	if gsa != nil {
   249  		gsaName = gsa.Namespace + "/" + gsa.Name
   250  	}
   251  	return c.loggerForGameServerAllocationKey(gsaName).WithField("gsa", gsa)
   252  }
   253  
   254  // allocateFromLocalCluster allocates gameservers from the local cluster.
   255  // Registers number of times we retried before getting a success allocation
   256  func (c *Allocator) allocateFromLocalCluster(ctx context.Context, gsa *allocationv1.GameServerAllocation) (*allocationv1.GameServerAllocation, error) {
   257  	var gs *agonesv1.GameServer
   258  	retry := c.newMetrics(ctx)
   259  	retryCount := 0
   260  	err := Retry(allocationRetry, func() error {
   261  		var err error
   262  		gs, err = c.allocate(ctx, gsa)
   263  		retryCount++
   264  
   265  		if err != nil {
   266  			c.loggerForGameServerAllocation(gsa).WithError(err).Warn("Failed to Allocated. Retrying...")
   267  		} else {
   268  			retry.recordAllocationRetrySuccess(ctx, retryCount)
   269  		}
   270  		return err
   271  	})
   272  
   273  	if err != nil && err != ErrNoGameServer && err != ErrConflictInGameServerSelection {
   274  		c.allocationCache.Resync()
   275  		return nil, err
   276  	}
   277  
   278  	switch err {
   279  	case ErrNoGameServer, ErrGameServerUpdateConflict:
   280  		gsa.Status.State = allocationv1.GameServerAllocationUnAllocated
   281  	case ErrConflictInGameServerSelection:
   282  		gsa.Status.State = allocationv1.GameServerAllocationContention
   283  	default:
   284  		gsa.ObjectMeta.Name = gs.ObjectMeta.Name
   285  		gsa.Status.State = allocationv1.GameServerAllocationAllocated
   286  		gsa.Status.GameServerName = gs.ObjectMeta.Name
   287  		gsa.Status.Ports = gs.Status.Ports
   288  		gsa.Status.Address = gs.Status.Address
   289  		gsa.Status.Addresses = append(gsa.Status.Addresses, gs.Status.Addresses...)
   290  		gsa.Status.NodeName = gs.Status.NodeName
   291  		gsa.Status.Source = localAllocationSource
   292  		gsa.Status.Metadata = &allocationv1.GameServerMetadata{
   293  			Labels:      gs.ObjectMeta.Labels,
   294  			Annotations: gs.ObjectMeta.Annotations,
   295  		}
   296  		if runtime.FeatureEnabled(runtime.FeatureCountsAndLists) {
   297  			gsa.Status.Counters = gs.Status.Counters
   298  			gsa.Status.Lists = gs.Status.Lists
   299  		}
   300  	}
   301  
   302  	c.loggerForGameServerAllocation(gsa).Debug("Game server allocation")
   303  	return gsa, nil
   304  }
   305  
// applyMultiClusterAllocation retrieves allocation policies and iterate on policies.
// Then allocate gameservers from local or remote cluster accordingly.
//
// The first cluster whose allocation reaches state Allocated wins; otherwise
// the last (result, err) pair from the iteration is returned.
func (c *Allocator) applyMultiClusterAllocation(ctx context.Context, gsa *allocationv1.GameServerAllocation) (result *allocationv1.GameServerAllocation, err error) {
	// an empty policy selector matches every policy in the namespace
	selector := labels.Everything()
	if len(gsa.Spec.MultiClusterSetting.PolicySelector.MatchLabels)+len(gsa.Spec.MultiClusterSetting.PolicySelector.MatchExpressions) != 0 {
		selector, err = metav1.LabelSelectorAsSelector(&gsa.Spec.MultiClusterSetting.PolicySelector)
		if err != nil {
			return nil, err
		}
	}

	policies, err := c.allocationPolicyLister.GameServerAllocationPolicies(gsa.ObjectMeta.Namespace).List(selector)
	if err != nil {
		return nil, err
	} else if len(policies) == 0 {
		return nil, errors.New("no multi-cluster allocation policy is specified")
	}

	// walk connection info from the policies in iterator order, trying each
	// cluster until one yields a successful allocation
	it := multiclusterv1.NewConnectionInfoIterator(policies)
	for {
		connectionInfo := it.Next()
		if connectionInfo == nil {
			break
		}
		// no endpoints means the policy targets this (local) cluster
		if len(connectionInfo.AllocationEndpoints) == 0 {
			// Change the namespace to the policy namespace and allocate locally
			gsaCopy := gsa
			if gsa.Namespace != connectionInfo.Namespace {
				// deep copy so the caller's gsa keeps its original namespace
				gsaCopy = gsa.DeepCopy()
				gsaCopy.Namespace = connectionInfo.Namespace
			}
			result, err = c.allocateFromLocalCluster(ctx, gsaCopy)
			if err != nil {
				c.loggerForGameServerAllocation(gsaCopy).WithError(err).Error("self-allocation failed")
			}
		} else {
			result, err = c.allocateFromRemoteCluster(gsa, connectionInfo, gsa.ObjectMeta.Namespace)
			if err != nil {
				c.loggerForGameServerAllocation(gsa).WithField("allocConnInfo", connectionInfo).WithError(err).Error("remote-allocation failed")
			}
		}
		// stop at the first cluster that actually allocated a GameServer;
		// otherwise fall through and try the next connection
		if result != nil && result.Status.State == allocationv1.GameServerAllocationAllocated {
			return result, nil
		}
	}
	return result, err
}
   353  
// allocateFromRemoteCluster allocates gameservers from a remote cluster by making
// an http call to allocation service in that cluster.
//
// All retries across all of the cluster's endpoints share a single deadline of
// c.totalRemoteAllocationTimeout.
func (c *Allocator) allocateFromRemoteCluster(gsa *allocationv1.GameServerAllocation, connectionInfo *multiclusterv1.ClusterConnectionInfo, namespace string) (*allocationv1.GameServerAllocation, error) {
	var allocationResponse *pb.AllocationResponse

	// TODO: cache the client
	// build mTLS dial options from the client-certificate Secret for this cluster
	dialOpts, err := c.createRemoteClusterDialOption(namespace, connectionInfo)
	if err != nil {
		return nil, err
	}

	// Forward the game server allocation request to another cluster,
	// and disable multicluster settings to avoid the target cluster
	// forward the allocation request again.
	request := converters.ConvertGSAToAllocationRequest(gsa)
	request.MultiClusterSetting.Enabled = false
	request.Namespace = connectionInfo.Namespace

	ctx, cancel := context.WithTimeout(context.Background(), c.totalRemoteAllocationTimeout)
	defer cancel() // nolint: errcheck
	// Retry on remote call failures.
	var endpoint string
	err = Retry(remoteAllocationRetry, func() error {
		for i, ip := range connectionInfo.AllocationEndpoints {
			// stop retrying once the total timeout has elapsed
			select {
			case <-ctx.Done():
				return ErrTotalTimeoutExceeded
			default:
			}
			endpoint = addPort(ip)
			c.loggerForGameServerAllocationKey("remote-allocation").WithField("request", request).WithField("endpoint", endpoint).Debug("forwarding allocation request")
			// note: deliberately assigns the enclosing allocationResponse/err
			// variables so the result survives the closure
			allocationResponse, err = c.remoteAllocationCallback(ctx, endpoint, dialOpts, request)
			if err != nil {
				c.baseLogger.WithError(err).Error("remote allocation failed")
				// If there are multiple endpoints for the allocator connection and the current one is
				// failing, try the next endpoint. Otherwise, return the error response.
				if (i + 1) < len(connectionInfo.AllocationEndpoints) {
					// If there is a server error try a different endpoint
					c.loggerForGameServerAllocationKey("remote-allocation").WithField("request", request).WithError(err).WithField("endpoint", endpoint).Warn("The request failed. Trying next endpoint")
					continue
				}
				return err
			}
			break
		}

		return nil
	})

	// converted even on error — presumably the converter tolerates a nil
	// response in that case; verify before changing this ordering
	return converters.ConvertAllocationResponseToGSA(allocationResponse, endpoint), err
}
   405  
// createRemoteClusterDialOption creates a grpc client dial option with proper certs to make a remote call.
//
// The client certificate/key come from the Secret named by connectionInfo;
// server verification trusts the policy's ServerCA and/or the Secret's ca.crt
// when either is present, otherwise the system root CAs apply.
func (c *Allocator) createRemoteClusterDialOption(namespace string, connectionInfo *multiclusterv1.ClusterConnectionInfo) (grpc.DialOption, error) {
	// TODO: disableMTLS works for a single cluster; still need to address how the flag interacts with multi-cluster authentication.
	clientCert, clientKey, caCert, err := c.getClientCertificates(namespace, connectionInfo.SecretName)
	if err != nil {
		return nil, err
	}
	if clientCert == nil || clientKey == nil {
		return nil, fmt.Errorf("missing client certificate key pair in secret %s", connectionInfo.SecretName)
	}

	// Load client cert
	cert, err := tls.X509KeyPair(clientCert, clientKey)
	if err != nil {
		return nil, err
	}

	tlsConfig := &tls.Config{Certificates: []tls.Certificate{cert}}
	if len(connectionInfo.ServerCA) != 0 || len(caCert) != 0 {
		// Load CA cert, if provided and trust the server certificate.
		// This is required for self-signed certs.
		tlsConfig.RootCAs = x509.NewCertPool()
		if len(connectionInfo.ServerCA) != 0 && !tlsConfig.RootCAs.AppendCertsFromPEM(connectionInfo.ServerCA) {
			return nil, errors.New("only PEM format is accepted for server CA")
		}
		// Add client CA cert, which can be used instead of / as well as the specified ServerCA cert
		// (append failure here is tolerated deliberately — ServerCA may already suffice)
		if len(caCert) != 0 {
			_ = tlsConfig.RootCAs.AppendCertsFromPEM(caCert)
		}
	}

	return grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)), nil
}
   439  
   440  // getClientCertificates returns the client certificates and CA cert for remote allocation cluster call
   441  func (c *Allocator) getClientCertificates(namespace, secretName string) (clientCert, clientKey, caCert []byte, err error) {
   442  	secret, err := c.secretLister.Secrets(namespace).Get(secretName)
   443  	if err != nil {
   444  		return nil, nil, nil, err
   445  	}
   446  	if secret == nil || len(secret.Data) == 0 {
   447  		return nil, nil, nil, fmt.Errorf("secret %s does not have data", secretName)
   448  	}
   449  
   450  	// Create http client using cert
   451  	clientCert = secret.Data[secretClientCertName]
   452  	clientKey = secret.Data[secretClientKeyName]
   453  	caCert = secret.Data[secretCACertName]
   454  	return clientCert, clientKey, caCert, nil
   455  }
   456  
   457  // allocate allocated a GameServer from a given GameServerAllocation
   458  // this sets up allocation through a batch process.
   459  func (c *Allocator) allocate(ctx context.Context, gsa *allocationv1.GameServerAllocation) (*agonesv1.GameServer, error) {
   460  	// creates an allocation request. This contains the requested GameServerAllocation, as well as the
   461  	// channel we expect the return values to come back for this GameServerAllocation
   462  	req := request{gsa: gsa, response: make(chan response)}
   463  
   464  	// this pushes the request into the batching process
   465  	c.pendingRequests <- req
   466  
   467  	select {
   468  	case res := <-req.response: // wait for the batch to be completed
   469  		return res.gs, res.err
   470  	case <-ctx.Done():
   471  		return nil, ErrTotalTimeoutExceeded
   472  	}
   473  }
   474  
   475  // ListenAndAllocate is a blocking function that runs in a loop
   476  // looking at c.requestBatches for batches of requests that are coming through.
   477  func (c *Allocator) ListenAndAllocate(ctx context.Context, updateWorkerCount int) {
   478  	// setup workers for allocation updates. Push response values into
   479  	// this queue for concurrent updating of GameServers to Allocated
   480  	updateQueue := c.allocationUpdateWorkers(ctx, updateWorkerCount)
   481  
   482  	// Batch processing strategy:
   483  	// We constantly loop around the below for loop. If nothing is found in c.pendingRequests, we move to
   484  	// default: which will wait for half a second, to allow for some requests to backup in c.pendingRequests,
   485  	// providing us with a batch of Allocation requests in that channel
   486  
   487  	// Once we have 1 or more requests in c.pendingRequests (which is buffered to 100), we can start the batch process.
   488  
   489  	// Assuming this is the first run (either entirely, or for a while), list will be nil, and therefore the first
   490  	// thing that will be done is retrieving the Ready GameServers and sorting them for this batch via
   491  	// c.listSortedReadyGameServers(). This list is maintained as we flow through the batch.
   492  
   493  	// We then use findGameServerForAllocation to loop around the sorted list of Ready GameServers to look for matches
   494  	// against the preferred and required selectors of the GameServerAllocation. If there is an error, we immediately
   495  	// pass that straight back to the response channel for this GameServerAllocation.
   496  
   497  	// Assuming we find a matching GameServer to our GameServerAllocation, we remove it from the list and the backing
   498  	// Ready GameServer cache.
   499  
   500  	// We then pass the found GameServers into the updateQueue, where there are updateWorkerCount number of goroutines
   501  	// waiting to concurrently attempt to move the GameServer into an Allocated state, and return the result to
   502  	// GameServerAllocation request's response channel
   503  
   504  	// Then we get the next item off the batch (c.pendingRequests), and do this all over again, but this time, we have
   505  	// an already sorted list of GameServers, so we only need to find one that matches our GameServerAllocation
   506  	// selectors, and put it into updateQueue
   507  
   508  	// The tracking of requestCount >= maxBatchBeforeRefresh is necessary, because without it, at high enough load
   509  	// the list of GameServers that we are using to allocate would never get refreshed (list = nil) with an updated
   510  	// list of Ready GameServers, and you would eventually never be able to Allocate anything as long as the load
   511  	// continued.
   512  
   513  	var list []*agonesv1.GameServer
   514  	var sortKey uint64
   515  	requestCount := 0
   516  
   517  	for {
   518  		select {
   519  		case req := <-c.pendingRequests:
   520  			// refresh the list after every 100 allocations made in a single batch
   521  			if requestCount >= maxBatchBeforeRefresh {
   522  				list = nil
   523  				requestCount = 0
   524  			}
   525  
   526  			if runtime.FeatureEnabled(runtime.FeatureCountsAndLists) {
   527  				// SortKey returns the sorting values (list of Priorities) as a determinstic key.
   528  				// In case gsa.Spec.Priorities is nil this will still return a sortKey.
   529  				// In case of error this will return 0 for the sortKey.
   530  				newSortKey, err := req.gsa.SortKey()
   531  				if err != nil {
   532  					c.baseLogger.WithError(err).Warn("error getting sortKey for GameServerAllocationSpec", err)
   533  				}
   534  				// Set sortKey if this is the first request, or the previous request errored on creating a sortKey.
   535  				if sortKey == uint64(0) {
   536  					sortKey = newSortKey
   537  				}
   538  
   539  				if newSortKey != sortKey {
   540  					sortKey = newSortKey
   541  					list = nil
   542  					requestCount = 0
   543  				}
   544  			}
   545  
   546  			requestCount++
   547  
   548  			if list == nil {
   549  				if !runtime.FeatureEnabled(runtime.FeatureCountsAndLists) || req.gsa.Spec.Scheduling == apis.Packed {
   550  					list = c.allocationCache.ListSortedGameServers(req.gsa)
   551  				} else {
   552  					// If FeatureCountsAndLists and Scheduling == Distributed, sort game servers by Priorities
   553  					list = c.allocationCache.ListSortedGameServersPriorities(req.gsa)
   554  				}
   555  			}
   556  
   557  			gs, index, err := findGameServerForAllocation(req.gsa, list)
   558  			if err != nil {
   559  				req.response <- response{request: req, gs: nil, err: err}
   560  				continue
   561  			}
   562  			// remove the game server that has been allocated
   563  			list = append(list[:index], list[index+1:]...)
   564  
   565  			if err := c.allocationCache.RemoveGameServer(gs); err != nil {
   566  				// this seems unlikely, but lets handle it just in case
   567  				req.response <- response{request: req, gs: nil, err: err}
   568  				continue
   569  			}
   570  
   571  			updateQueue <- response{request: req, gs: gs.DeepCopy(), err: nil}
   572  
   573  		case <-ctx.Done():
   574  			return
   575  		default:
   576  			list = nil
   577  			requestCount = 0
   578  			// slow down cpu churn, and allow items to batch
   579  			time.Sleep(c.batchWaitTime)
   580  		}
   581  	}
   582  }
   583  
// allocationUpdateWorkers runs workerCount number of goroutines as workers to
// process each GameServer passed into the returned updateQueue
// Each worker will concurrently attempt to move the GameServer to an Allocated
// state and then respond to the initial request's response channel with the
// details of that update
//
// Workers exit when ctx is cancelled.
func (c *Allocator) allocationUpdateWorkers(ctx context.Context, workerCount int) chan<- response {
	updateQueue := make(chan response)

	for i := 0; i < workerCount; i++ {
		go func() {
			for {
				select {
				case res := <-updateQueue:
					gs, err := c.applyAllocationToGameServer(ctx, res.request.gsa.Spec.MetaPatch, res.gs, res.request.gsa)
					if err != nil {
						if !k8serrors.IsConflict(errors.Cause(err)) {
							// since we could not allocate, we should put it back
							// but not if it's a conflict, as the cache is no longer up to date, and
							// we should wait for it to get updated with fresh info.
							c.allocationCache.AddGameServer(gs)
						}
						// NOTE(review): every update failure is surfaced to the
						// requester as ErrGameServerUpdateConflict, even
						// non-conflict errors — the original cause is dropped.
						res.err = ErrGameServerUpdateConflict
					} else {
						// put the GameServer back into the cache, so it's immediately around for re-allocation
						c.allocationCache.AddGameServer(gs)
						res.gs = gs
					}

					res.request.response <- res
				case <-ctx.Done():
					return
				}
			}
		}()
	}

	return updateQueue
}
   622  
// applyAllocationToGameServer patches the inputted GameServer with the allocation metadata changes, and updates it to the Allocated State.
// Returns the updated GameServer.
//
// Counter/List action failures do not abort the update; they are recorded as
// warning Events on the GameServer after a successful update.
func (c *Allocator) applyAllocationToGameServer(ctx context.Context, mp allocationv1.MetaPatch, gs *agonesv1.GameServer, gsa *allocationv1.GameServerAllocation) (*agonesv1.GameServer, error) {
	// patch ObjectMeta labels
	if mp.Labels != nil {
		if gs.ObjectMeta.Labels == nil {
			gs.ObjectMeta.Labels = make(map[string]string, len(mp.Labels))
		}
		for key, value := range mp.Labels {
			gs.ObjectMeta.Labels[key] = value
		}
	}

	// annotations map is always needed, since LastAllocatedAnnotationKey is set below
	if gs.ObjectMeta.Annotations == nil {
		gs.ObjectMeta.Annotations = make(map[string]string, len(mp.Annotations))
	}
	// apply annotations patch
	for key, value := range mp.Annotations {
		gs.ObjectMeta.Annotations[key] = value
	}

	// add last allocated, so it always gets updated, even if it is already Allocated
	ts, err := time.Now().MarshalText()
	if err != nil {
		return nil, err
	}
	gs.ObjectMeta.Annotations[LastAllocatedAnnotationKey] = string(ts)
	gs.Status.State = agonesv1.GameServerStateAllocated

	// perform any Counter or List actions
	var counterErrors error
	var listErrors error
	if runtime.FeatureEnabled(runtime.FeatureCountsAndLists) {
		if gsa.Spec.Counters != nil {
			for counter, ca := range gsa.Spec.Counters {
				counterErrors = goErrors.Join(counterErrors, ca.CounterActions(counter, gs))
			}
		}
		if gsa.Spec.Lists != nil {
			for list, la := range gsa.Spec.Lists {
				listErrors = goErrors.Join(listErrors, la.ListActions(list, gs))
			}
		}
	}

	gsUpdate, updateErr := c.gameServerGetter.GameServers(gs.ObjectMeta.Namespace).Update(ctx, gs, metav1.UpdateOptions{})
	if updateErr != nil {
		return gsUpdate, updateErr
	}

	// If successful Update record any Counter or List action errors as a warning
	if counterErrors != nil {
		c.recorder.Event(gsUpdate, corev1.EventTypeWarning, "CounterActionError", counterErrors.Error())
	}
	if listErrors != nil {
		c.recorder.Event(gsUpdate, corev1.EventTypeWarning, "ListActionError", listErrors.Error())
	}
	c.recorder.Event(gsUpdate, corev1.EventTypeNormal, string(gsUpdate.Status.State), "Allocated")

	return gsUpdate, updateErr
}
   684  
   685  // Retry retries fn based on backoff provided.
   686  func Retry(backoff wait.Backoff, fn func() error) error {
   687  	var lastConflictErr error
   688  	err := wait.ExponentialBackoff(backoff, func() (bool, error) {
   689  		err := fn()
   690  
   691  		st, ok := status.FromError(err)
   692  		if ok {
   693  			if st.Code() == codes.ResourceExhausted {
   694  				return true, err
   695  			}
   696  		}
   697  
   698  		switch {
   699  		case err == nil:
   700  			return true, nil
   701  		case err == ErrNoGameServer:
   702  			return true, err
   703  		case err == ErrTotalTimeoutExceeded:
   704  			return true, err
   705  		default:
   706  			lastConflictErr = err
   707  			return false, nil
   708  		}
   709  	})
   710  	if wait.Interrupted(err) {
   711  		err = lastConflictErr
   712  	}
   713  	return err
   714  }
   715  
   716  // newMetrics creates a new gsa latency recorder.
   717  func (c *Allocator) newMetrics(ctx context.Context) *metrics {
   718  	ctx, err := tag.New(ctx, latencyTags...)
   719  	if err != nil {
   720  		c.baseLogger.WithError(err).Warn("failed to tag latency recorder.")
   721  	}
   722  	return &metrics{
   723  		ctx:              ctx,
   724  		gameServerLister: c.allocationCache.gameServerLister,
   725  		logger:           c.baseLogger,
   726  		start:            time.Now(),
   727  	}
   728  }
   729  
   730  func addPort(ip string) string {
   731  	if strings.Contains(ip, ":") {
   732  		return ip
   733  	}
   734  	return fmt.Sprintf("%s:%s", ip, allocatorPort)
   735  }