agones.dev/agones@v1.53.0/pkg/gameserverallocations/allocator.go (about) 1 // Copyright 2019 Google LLC All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package gameserverallocations 16 17 import ( 18 "context" 19 "crypto/tls" 20 "crypto/x509" 21 goErrors "errors" 22 "fmt" 23 "strings" 24 "time" 25 26 "agones.dev/agones/pkg/allocation/converters" 27 pb "agones.dev/agones/pkg/allocation/go" 28 "agones.dev/agones/pkg/apis" 29 agonesv1 "agones.dev/agones/pkg/apis/agones/v1" 30 allocationv1 "agones.dev/agones/pkg/apis/allocation/v1" 31 multiclusterv1 "agones.dev/agones/pkg/apis/multicluster/v1" 32 getterv1 "agones.dev/agones/pkg/client/clientset/versioned/typed/agones/v1" 33 multiclusterinformerv1 "agones.dev/agones/pkg/client/informers/externalversions/multicluster/v1" 34 multiclusterlisterv1 "agones.dev/agones/pkg/client/listers/multicluster/v1" 35 "agones.dev/agones/pkg/util/apiserver" 36 "agones.dev/agones/pkg/util/logfields" 37 "agones.dev/agones/pkg/util/runtime" 38 "github.com/pkg/errors" 39 "github.com/sirupsen/logrus" 40 "go.opencensus.io/tag" 41 "google.golang.org/grpc" 42 "google.golang.org/grpc/codes" 43 "google.golang.org/grpc/credentials" 44 "google.golang.org/grpc/status" 45 corev1 "k8s.io/api/core/v1" 46 k8serrors "k8s.io/apimachinery/pkg/api/errors" 47 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 48 "k8s.io/apimachinery/pkg/labels" 49 k8sruntime "k8s.io/apimachinery/pkg/runtime" 50 "k8s.io/apimachinery/pkg/runtime/schema" 51 runtimeschema "k8s.io/apimachinery/pkg/runtime/schema" 52 "k8s.io/apimachinery/pkg/util/wait" 53 informercorev1 "k8s.io/client-go/informers/core/v1" 54 "k8s.io/client-go/kubernetes" 55 "k8s.io/client-go/kubernetes/scheme" 56 typedcorev1 "k8s.io/client-go/kubernetes/typed/core/v1" 57 corev1lister "k8s.io/client-go/listers/core/v1" 58 "k8s.io/client-go/tools/cache" 59 "k8s.io/client-go/tools/record" 60 ) 61 62 var ( 63 // ErrNoGameServer is returned when there are no Allocatable GameServers 64 // available 65 ErrNoGameServer = errors.New("Could not find an Allocatable GameServer") 66 // ErrConflictInGameServerSelection is returned when the candidate gameserver already allocated 67 ErrConflictInGameServerSelection = errors.New("The Gameserver was already allocated") 68 // ErrTotalTimeoutExceeded is used to signal that total retry timeout has been exceeded and no additional retries should be made 69 ErrTotalTimeoutExceeded = status.Errorf(codes.DeadlineExceeded, "remote allocation total timeout exceeded") 70 ) 71 72 const ( 73 // LastAllocatedAnnotationKey is a GameServer annotation containing an RFC 3339 formatted 74 // timestamp of the most recent allocation. 75 LastAllocatedAnnotationKey = "agones.dev/last-allocated" 76 77 secretClientCertName = "tls.crt" 78 secretClientKeyName = "tls.key" 79 secretCACertName = "ca.crt" 80 allocatorPort = "443" 81 maxBatchQueue = 100 82 maxBatchBeforeRefresh = 100 83 localAllocationSource = "local" 84 ) 85 86 var allocationRetry = wait.Backoff{ 87 Steps: 5, 88 Duration: 10 * time.Millisecond, 89 Factor: 1.0, 90 Jitter: 0.1, 91 } 92 93 var remoteAllocationRetry = wait.Backoff{ 94 Steps: 7, 95 Duration: 100 * time.Millisecond, 96 Factor: 2.0, 97 } 98 99 // Allocator handles game server allocation 100 type Allocator struct { 101 baseLogger *logrus.Entry 102 allocationPolicyLister multiclusterlisterv1.GameServerAllocationPolicyLister 103 allocationPolicySynced cache.InformerSynced 104 secretLister corev1lister.SecretLister 105 secretSynced cache.InformerSynced 106 gameServerGetter getterv1.GameServersGetter 107 recorder record.EventRecorder 108 pendingRequests chan request 109 allocationCache *AllocationCache 110 remoteAllocationCallback func(context.Context, string, grpc.DialOption, *pb.AllocationRequest) (*pb.AllocationResponse, error) 111 remoteAllocationTimeout time.Duration 112 totalRemoteAllocationTimeout time.Duration 113 batchWaitTime time.Duration 114 } 115 116 // request is an async request for allocation 117 type request struct { 118 gsa *allocationv1.GameServerAllocation 119 response chan response 120 } 121 122 // response is an async response for a matching request 123 type response struct { 124 request request 125 gs *agonesv1.GameServer 126 err error 127 } 128 129 // NewAllocator creates an instance of Allocator 130 func NewAllocator(policyInformer multiclusterinformerv1.GameServerAllocationPolicyInformer, secretInformer informercorev1.SecretInformer, gameServerGetter getterv1.GameServersGetter, 131 kubeClient kubernetes.Interface, allocationCache *AllocationCache, remoteAllocationTimeout time.Duration, totalRemoteAllocationTimeout time.Duration, batchWaitTime time.Duration) *Allocator { 132 ah := &Allocator{ 133 pendingRequests: make(chan request, maxBatchQueue), 134 allocationPolicyLister: policyInformer.Lister(), 135 allocationPolicySynced: policyInformer.Informer().HasSynced, 136 secretLister: secretInformer.Lister(), 137 secretSynced: secretInformer.Informer().HasSynced, 138 gameServerGetter: gameServerGetter, 139 allocationCache: allocationCache, 140 batchWaitTime: batchWaitTime, 141 remoteAllocationTimeout: remoteAllocationTimeout, 142 totalRemoteAllocationTimeout: totalRemoteAllocationTimeout, 143 remoteAllocationCallback: func(ctx context.Context, endpoint string, dialOpts grpc.DialOption, request *pb.AllocationRequest) (*pb.AllocationResponse, error) { 144 conn, err := grpc.NewClient(endpoint, dialOpts) 145 if err != nil { 146 return nil, err 147 } 148 defer conn.Close() // nolint: errcheck 149 150 allocationCtx, cancel := context.WithTimeout(ctx, remoteAllocationTimeout) 151 defer cancel() // nolint: errcheck 152 grpcClient := pb.NewAllocationServiceClient(conn) 153 return grpcClient.Allocate(allocationCtx, request) 154 }, 155 } 156 157 ah.baseLogger = runtime.NewLoggerWithType(ah) 158 eventBroadcaster := record.NewBroadcaster() 159 eventBroadcaster.StartLogging(ah.baseLogger.Debugf) 160 eventBroadcaster.StartRecordingToSink(&typedcorev1.EventSinkImpl{Interface: kubeClient.CoreV1().Events("")}) 161 ah.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, corev1.EventSource{Component: "GameServerAllocation-Allocator"}) 162 163 return ah 164 } 165 166 // Run initiates the listeners. 167 func (c *Allocator) Run(ctx context.Context) error { 168 if err := c.Sync(ctx); err != nil { 169 return err 170 } 171 172 if err := c.allocationCache.Run(ctx); err != nil { 173 return err 174 } 175 176 // workers and logic for batching allocations 177 go c.ListenAndAllocate(ctx, maxBatchQueue) 178 179 return nil 180 } 181 182 // Sync waits for cache to sync 183 func (c *Allocator) Sync(ctx context.Context) error { 184 c.baseLogger.Debug("Wait for Allocator cache sync") 185 if !cache.WaitForCacheSync(ctx.Done(), c.secretSynced, c.allocationPolicySynced) { 186 return errors.New("failed to wait for caches to sync") 187 } 188 return nil 189 } 190 191 // Allocate CRDHandler for allocating a gameserver. 192 func (c *Allocator) Allocate(ctx context.Context, gsa *allocationv1.GameServerAllocation) (out k8sruntime.Object, err error) { 193 latency := c.newMetrics(ctx) 194 defer func() { 195 if err != nil { 196 latency.setError() 197 } 198 latency.record() 199 }() 200 latency.setRequest(gsa) 201 202 // server side validation 203 if errs := gsa.Validate(); len(errs) > 0 { 204 kind := runtimeschema.GroupKind{ 205 Group: allocationv1.SchemeGroupVersion.Group, 206 Kind: "GameServerAllocation", 207 } 208 statusErr := k8serrors.NewInvalid(kind, gsa.Name, errs) 209 s := &statusErr.ErrStatus 210 var gvks []schema.GroupVersionKind 211 gvks, _, err := apiserver.Scheme.ObjectKinds(s) 212 if err != nil { 213 return nil, errors.Wrap(err, "could not find objectkinds for status") 214 } 215 216 c.loggerForGameServerAllocation(gsa).Debug("GameServerAllocation is invalid") 217 s.TypeMeta = metav1.TypeMeta{Kind: gvks[0].Kind, APIVersion: gvks[0].Version} 218 return s, nil 219 } 220 221 // Convert gsa required and preferred fields to selectors field 222 gsa.Converter() 223 224 // If multi-cluster setting is enabled, allocate base on the multicluster allocation policy. 225 if gsa.Spec.MultiClusterSetting.Enabled { 226 out, err = c.applyMultiClusterAllocation(ctx, gsa) 227 } else { 228 out, err = c.allocateFromLocalCluster(ctx, gsa) 229 } 230 231 if err != nil { 232 c.loggerForGameServerAllocation(gsa).WithError(err).Error("allocation failed") 233 return nil, err 234 } 235 latency.setResponse(out) 236 237 return out, nil 238 } 239 240 func (c *Allocator) loggerForGameServerAllocationKey(key string) *logrus.Entry { 241 return logfields.AugmentLogEntry(c.baseLogger, logfields.GameServerAllocationKey, key) 242 } 243 244 func (c *Allocator) loggerForGameServerAllocation(gsa *allocationv1.GameServerAllocation) *logrus.Entry { 245 gsaName := "NilGameServerAllocation" 246 if gsa != nil { 247 gsaName = gsa.Namespace + "/" + gsa.Name 248 } 249 return c.loggerForGameServerAllocationKey(gsaName).WithField("gsa", gsa) 250 } 251 252 // allocateFromLocalCluster allocates gameservers from the local cluster. 253 // Registers number of times we retried before getting a success allocation 254 func (c *Allocator) allocateFromLocalCluster(ctx context.Context, gsa *allocationv1.GameServerAllocation) (*allocationv1.GameServerAllocation, error) { 255 var gs *agonesv1.GameServer 256 retry := c.newMetrics(ctx) 257 retryCount := 0 258 err := Retry(allocationRetry, func() error { 259 var err error 260 gs, err = c.allocate(ctx, gsa) 261 retryCount++ 262 263 if err != nil { 264 c.loggerForGameServerAllocation(gsa).WithError(err).Warn("Failed to Allocated. Retrying...") 265 } else { 266 retry.recordAllocationRetrySuccess(ctx, retryCount) 267 } 268 return err 269 }) 270 271 if err != nil && err != ErrNoGameServer && err != ErrConflictInGameServerSelection { 272 c.allocationCache.Resync() 273 return nil, err 274 } 275 276 switch err { 277 case ErrNoGameServer: 278 gsa.Status.State = allocationv1.GameServerAllocationUnAllocated 279 case ErrConflictInGameServerSelection: 280 gsa.Status.State = allocationv1.GameServerAllocationContention 281 default: 282 gsa.ObjectMeta.Name = gs.ObjectMeta.Name 283 gsa.Status.State = allocationv1.GameServerAllocationAllocated 284 gsa.Status.GameServerName = gs.ObjectMeta.Name 285 gsa.Status.Ports = gs.Status.Ports 286 gsa.Status.Address = gs.Status.Address 287 gsa.Status.Addresses = append(gsa.Status.Addresses, gs.Status.Addresses...) 288 gsa.Status.NodeName = gs.Status.NodeName 289 gsa.Status.Source = localAllocationSource 290 gsa.Status.Metadata = &allocationv1.GameServerMetadata{ 291 Labels: gs.ObjectMeta.Labels, 292 Annotations: gs.ObjectMeta.Annotations, 293 } 294 if runtime.FeatureEnabled(runtime.FeatureCountsAndLists) { 295 gsa.Status.Counters = gs.Status.Counters 296 gsa.Status.Lists = gs.Status.Lists 297 } 298 } 299 300 c.loggerForGameServerAllocation(gsa).Debug("Game server allocation") 301 return gsa, nil 302 } 303 304 // applyMultiClusterAllocation retrieves allocation policies and iterate on policies. 305 // Then allocate gameservers from local or remote cluster accordingly. 306 func (c *Allocator) applyMultiClusterAllocation(ctx context.Context, gsa *allocationv1.GameServerAllocation) (result *allocationv1.GameServerAllocation, err error) { 307 selector := labels.Everything() 308 if len(gsa.Spec.MultiClusterSetting.PolicySelector.MatchLabels)+len(gsa.Spec.MultiClusterSetting.PolicySelector.MatchExpressions) != 0 { 309 selector, err = metav1.LabelSelectorAsSelector(&gsa.Spec.MultiClusterSetting.PolicySelector) 310 if err != nil { 311 return nil, err 312 } 313 } 314 315 policies, err := c.allocationPolicyLister.GameServerAllocationPolicies(gsa.ObjectMeta.Namespace).List(selector) 316 if err != nil { 317 return nil, err 318 } else if len(policies) == 0 { 319 return nil, errors.New("no multi-cluster allocation policy is specified") 320 } 321 322 it := multiclusterv1.NewConnectionInfoIterator(policies) 323 for { 324 connectionInfo := it.Next() 325 if connectionInfo == nil { 326 break 327 } 328 if len(connectionInfo.AllocationEndpoints) == 0 { 329 // Change the namespace to the policy namespace and allocate locally 330 gsaCopy := gsa 331 if gsa.Namespace != connectionInfo.Namespace { 332 gsaCopy = gsa.DeepCopy() 333 gsaCopy.Namespace = connectionInfo.Namespace 334 } 335 result, err = c.allocateFromLocalCluster(ctx, gsaCopy) 336 if err != nil { 337 c.loggerForGameServerAllocation(gsaCopy).WithError(err).Error("self-allocation failed") 338 } 339 } else { 340 result, err = c.allocateFromRemoteCluster(gsa, connectionInfo, gsa.ObjectMeta.Namespace) 341 if err != nil { 342 c.loggerForGameServerAllocation(gsa).WithField("allocConnInfo", connectionInfo).WithError(err).Error("remote-allocation failed") 343 } 344 } 345 if result != nil && result.Status.State == allocationv1.GameServerAllocationAllocated { 346 return result, nil 347 } 348 } 349 return result, err 350 } 351 352 // allocateFromRemoteCluster allocates gameservers from a remote cluster by making 353 // an http call to allocation service in that cluster. 354 func (c *Allocator) allocateFromRemoteCluster(gsa *allocationv1.GameServerAllocation, connectionInfo *multiclusterv1.ClusterConnectionInfo, namespace string) (*allocationv1.GameServerAllocation, error) { 355 var allocationResponse *pb.AllocationResponse 356 357 // TODO: cache the client 358 dialOpts, err := c.createRemoteClusterDialOption(namespace, connectionInfo) 359 if err != nil { 360 return nil, err 361 } 362 363 // Forward the game server allocation request to another cluster, 364 // and disable multicluster settings to avoid the target cluster 365 // forward the allocation request again. 366 request := converters.ConvertGSAToAllocationRequest(gsa) 367 request.MultiClusterSetting.Enabled = false 368 request.Namespace = connectionInfo.Namespace 369 370 ctx, cancel := context.WithTimeout(context.Background(), c.totalRemoteAllocationTimeout) 371 defer cancel() // nolint: errcheck 372 // Retry on remote call failures. 373 var endpoint string 374 err = Retry(remoteAllocationRetry, func() error { 375 for i, ip := range connectionInfo.AllocationEndpoints { 376 select { 377 case <-ctx.Done(): 378 return ErrTotalTimeoutExceeded 379 default: 380 } 381 endpoint = addPort(ip) 382 c.loggerForGameServerAllocationKey("remote-allocation").WithField("request", request).WithField("endpoint", endpoint).Debug("forwarding allocation request") 383 allocationResponse, err = c.remoteAllocationCallback(ctx, endpoint, dialOpts, request) 384 if err != nil { 385 c.baseLogger.WithError(err).Error("remote allocation failed") 386 // If there are multiple endpoints for the allocator connection and the current one is 387 // failing, try the next endpoint. Otherwise, return the error response. 388 if (i + 1) < len(connectionInfo.AllocationEndpoints) { 389 // If there is a server error try a different endpoint 390 c.loggerForGameServerAllocationKey("remote-allocation").WithField("request", request).WithError(err).WithField("endpoint", endpoint).Warn("The request failed. Trying next endpoint") 391 continue 392 } 393 return err 394 } 395 break 396 } 397 398 return nil 399 }) 400 401 return converters.ConvertAllocationResponseToGSA(allocationResponse, endpoint), err 402 } 403 404 // createRemoteClusterDialOption creates a grpc client dial option with proper certs to make a remote call. 405 func (c *Allocator) createRemoteClusterDialOption(namespace string, connectionInfo *multiclusterv1.ClusterConnectionInfo) (grpc.DialOption, error) { 406 // TODO: disableMTLS works for a single cluster; still need to address how the flag interacts with multi-cluster authentication. 407 clientCert, clientKey, caCert, err := c.getClientCertificates(namespace, connectionInfo.SecretName) 408 if err != nil { 409 return nil, err 410 } 411 if clientCert == nil || clientKey == nil { 412 return nil, fmt.Errorf("missing client certificate key pair in secret %s", connectionInfo.SecretName) 413 } 414 415 // Load client cert 416 cert, err := tls.X509KeyPair(clientCert, clientKey) 417 if err != nil { 418 return nil, err 419 } 420 421 tlsConfig := &tls.Config{Certificates: []tls.Certificate{cert}} 422 if len(connectionInfo.ServerCA) != 0 || len(caCert) != 0 { 423 // Load CA cert, if provided and trust the server certificate. 424 // This is required for self-signed certs. 425 tlsConfig.RootCAs = x509.NewCertPool() 426 if len(connectionInfo.ServerCA) != 0 && !tlsConfig.RootCAs.AppendCertsFromPEM(connectionInfo.ServerCA) { 427 return nil, errors.New("only PEM format is accepted for server CA") 428 } 429 // Add client CA cert, which can be used instead of / as well as the specified ServerCA cert 430 if len(caCert) != 0 { 431 _ = tlsConfig.RootCAs.AppendCertsFromPEM(caCert) 432 } 433 } 434 435 return grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)), nil 436 } 437 438 // getClientCertificates returns the client certificates and CA cert for remote allocation cluster call 439 func (c *Allocator) getClientCertificates(namespace, secretName string) (clientCert, clientKey, caCert []byte, err error) { 440 secret, err := c.secretLister.Secrets(namespace).Get(secretName) 441 if err != nil { 442 return nil, nil, nil, err 443 } 444 if secret == nil || len(secret.Data) == 0 { 445 return nil, nil, nil, fmt.Errorf("secret %s does not have data", secretName) 446 } 447 448 // Create http client using cert 449 clientCert = secret.Data[secretClientCertName] 450 clientKey = secret.Data[secretClientKeyName] 451 caCert = secret.Data[secretCACertName] 452 return clientCert, clientKey, caCert, nil 453 } 454 455 // allocate allocated a GameServer from a given GameServerAllocation 456 // this sets up allocation through a batch process. 457 func (c *Allocator) allocate(ctx context.Context, gsa *allocationv1.GameServerAllocation) (*agonesv1.GameServer, error) { 458 // creates an allocation request. This contains the requested GameServerAllocation, as well as the 459 // channel we expect the return values to come back for this GameServerAllocation 460 req := request{gsa: gsa, response: make(chan response)} 461 462 // this pushes the request into the batching process 463 c.pendingRequests <- req 464 465 select { 466 case res := <-req.response: // wait for the batch to be completed 467 return res.gs, res.err 468 case <-ctx.Done(): 469 return nil, ErrTotalTimeoutExceeded 470 } 471 } 472 473 // ListenAndAllocate is a blocking function that runs in a loop 474 // looking at c.requestBatches for batches of requests that are coming through. 475 func (c *Allocator) ListenAndAllocate(ctx context.Context, updateWorkerCount int) { 476 // setup workers for allocation updates. Push response values into 477 // this queue for concurrent updating of GameServers to Allocated 478 updateQueue := c.allocationUpdateWorkers(ctx, updateWorkerCount) 479 480 // Batch processing strategy: 481 // We constantly loop around the below for loop. If nothing is found in c.pendingRequests, we move to 482 // default: which will wait for half a second, to allow for some requests to backup in c.pendingRequests, 483 // providing us with a batch of Allocation requests in that channel 484 485 // Once we have 1 or more requests in c.pendingRequests (which is buffered to 100), we can start the batch process. 486 487 // Assuming this is the first run (either entirely, or for a while), list will be nil, and therefore the first 488 // thing that will be done is retrieving the Ready GameServers and sorting them for this batch via 489 // c.listSortedReadyGameServers(). This list is maintained as we flow through the batch. 490 491 // We then use findGameServerForAllocation to loop around the sorted list of Ready GameServers to look for matches 492 // against the preferred and required selectors of the GameServerAllocation. If there is an error, we immediately 493 // pass that straight back to the response channel for this GameServerAllocation. 494 495 // Assuming we find a matching GameServer to our GameServerAllocation, we remove it from the list and the backing 496 // Ready GameServer cache. 497 498 // We then pass the found GameServers into the updateQueue, where there are updateWorkerCount number of goroutines 499 // waiting to concurrently attempt to move the GameServer into an Allocated state, and return the result to 500 // GameServerAllocation request's response channel 501 502 // Then we get the next item off the batch (c.pendingRequests), and do this all over again, but this time, we have 503 // an already sorted list of GameServers, so we only need to find one that matches our GameServerAllocation 504 // selectors, and put it into updateQueue 505 506 // The tracking of requestCount >= maxBatchBeforeRefresh is necessary, because without it, at high enough load 507 // the list of GameServers that we are using to allocate would never get refreshed (list = nil) with an updated 508 // list of Ready GameServers, and you would eventually never be able to Allocate anything as long as the load 509 // continued. 510 511 var list []*agonesv1.GameServer 512 var sortKey uint64 513 requestCount := 0 514 515 for { 516 select { 517 case req := <-c.pendingRequests: 518 // refresh the list after every 100 allocations made in a single batch 519 if requestCount >= maxBatchBeforeRefresh { 520 list = nil 521 requestCount = 0 522 } 523 524 if runtime.FeatureEnabled(runtime.FeatureCountsAndLists) { 525 // SortKey returns the sorting values (list of Priorities) as a determinstic key. 526 // In case gsa.Spec.Priorities is nil this will still return a sortKey. 527 // In case of error this will return 0 for the sortKey. 528 newSortKey, err := req.gsa.SortKey() 529 if err != nil { 530 c.baseLogger.WithError(err).Warn("error getting sortKey for GameServerAllocationSpec", err) 531 } 532 // Set sortKey if this is the first request, or the previous request errored on creating a sortKey. 533 if sortKey == uint64(0) { 534 sortKey = newSortKey 535 } 536 537 if newSortKey != sortKey { 538 sortKey = newSortKey 539 list = nil 540 requestCount = 0 541 } 542 } 543 544 requestCount++ 545 546 if list == nil { 547 if !runtime.FeatureEnabled(runtime.FeatureCountsAndLists) || req.gsa.Spec.Scheduling == apis.Packed { 548 list = c.allocationCache.ListSortedGameServers(req.gsa) 549 } else { 550 // If FeatureCountsAndLists and Scheduling == Distributed, sort game servers by Priorities 551 list = c.allocationCache.ListSortedGameServersPriorities(req.gsa) 552 } 553 } 554 555 gs, index, err := findGameServerForAllocation(req.gsa, list) 556 if err != nil { 557 req.response <- response{request: req, gs: nil, err: err} 558 continue 559 } 560 // remove the game server that has been allocated 561 list = append(list[:index], list[index+1:]...) 562 563 if err := c.allocationCache.RemoveGameServer(gs); err != nil { 564 // this seems unlikely, but lets handle it just in case 565 req.response <- response{request: req, gs: nil, err: err} 566 continue 567 } 568 569 updateQueue <- response{request: req, gs: gs.DeepCopy(), err: nil} 570 571 case <-ctx.Done(): 572 return 573 default: 574 list = nil 575 requestCount = 0 576 // slow down cpu churn, and allow items to batch 577 time.Sleep(c.batchWaitTime) 578 } 579 } 580 } 581 582 // allocationUpdateWorkers runs workerCount number of goroutines as workers to 583 // process each GameServer passed into the returned updateQueue 584 // Each worker will concurrently attempt to move the GameServer to an Allocated 585 // state and then respond to the initial request's response channel with the 586 // details of that update 587 func (c *Allocator) allocationUpdateWorkers(ctx context.Context, workerCount int) chan<- response { 588 updateQueue := make(chan response) 589 590 for i := 0; i < workerCount; i++ { 591 go func() { 592 for { 593 select { 594 case res := <-updateQueue: 595 gs, err := c.applyAllocationToGameServer(ctx, res.request.gsa.Spec.MetaPatch, res.gs, res.request.gsa) 596 if err != nil { 597 if !k8serrors.IsConflict(errors.Cause(err)) { 598 // since we could not allocate, we should put it back 599 // but not if it's a conflict, as the cache is no longer up to date, and 600 // we should wait for it to get updated with fresh info. 601 c.allocationCache.AddGameServer(gs) 602 } 603 res.err = errors.Wrap(err, "error updating allocated gameserver") 604 } else { 605 // put the GameServer back into the cache, so it's immediately around for re-allocation 606 c.allocationCache.AddGameServer(gs) 607 res.gs = gs 608 } 609 610 res.request.response <- res 611 case <-ctx.Done(): 612 return 613 } 614 } 615 }() 616 } 617 618 return updateQueue 619 } 620 621 // applyAllocationToGameServer patches the inputted GameServer with the allocation metadata changes, and updates it to the Allocated State. 622 // Returns the updated GameServer. 623 func (c *Allocator) applyAllocationToGameServer(ctx context.Context, mp allocationv1.MetaPatch, gs *agonesv1.GameServer, gsa *allocationv1.GameServerAllocation) (*agonesv1.GameServer, error) { 624 // patch ObjectMeta labels 625 if mp.Labels != nil { 626 if gs.ObjectMeta.Labels == nil { 627 gs.ObjectMeta.Labels = make(map[string]string, len(mp.Labels)) 628 } 629 for key, value := range mp.Labels { 630 gs.ObjectMeta.Labels[key] = value 631 } 632 } 633 634 if gs.ObjectMeta.Annotations == nil { 635 gs.ObjectMeta.Annotations = make(map[string]string, len(mp.Annotations)) 636 } 637 // apply annotations patch 638 for key, value := range mp.Annotations { 639 gs.ObjectMeta.Annotations[key] = value 640 } 641 642 // add last allocated, so it always gets updated, even if it is already Allocated 643 ts, err := time.Now().MarshalText() 644 if err != nil { 645 return nil, err 646 } 647 gs.ObjectMeta.Annotations[LastAllocatedAnnotationKey] = string(ts) 648 gs.Status.State = agonesv1.GameServerStateAllocated 649 650 // perfom any Counter or List actions 651 var counterErrors error 652 var listErrors error 653 if runtime.FeatureEnabled(runtime.FeatureCountsAndLists) { 654 if gsa.Spec.Counters != nil { 655 for counter, ca := range gsa.Spec.Counters { 656 counterErrors = goErrors.Join(counterErrors, ca.CounterActions(counter, gs)) 657 } 658 } 659 if gsa.Spec.Lists != nil { 660 for list, la := range gsa.Spec.Lists { 661 listErrors = goErrors.Join(listErrors, la.ListActions(list, gs)) 662 } 663 } 664 } 665 666 gsUpdate, updateErr := c.gameServerGetter.GameServers(gs.ObjectMeta.Namespace).Update(ctx, gs, metav1.UpdateOptions{}) 667 if updateErr != nil { 668 return gsUpdate, updateErr 669 } 670 671 // If successful Update record any Counter or List action errors as a warning 672 if counterErrors != nil { 673 c.recorder.Event(gsUpdate, corev1.EventTypeWarning, "CounterActionError", counterErrors.Error()) 674 } 675 if listErrors != nil { 676 c.recorder.Event(gsUpdate, corev1.EventTypeWarning, "ListActionError", listErrors.Error()) 677 } 678 c.recorder.Event(gsUpdate, corev1.EventTypeNormal, string(gsUpdate.Status.State), "Allocated") 679 680 return gsUpdate, updateErr 681 } 682 683 // Retry retries fn based on backoff provided. 684 func Retry(backoff wait.Backoff, fn func() error) error { 685 var lastConflictErr error 686 err := wait.ExponentialBackoff(backoff, func() (bool, error) { 687 err := fn() 688 689 st, ok := status.FromError(err) 690 if ok { 691 if st.Code() == codes.ResourceExhausted { 692 return true, err 693 } 694 } 695 696 switch { 697 case err == nil: 698 return true, nil 699 case err == ErrNoGameServer: 700 return true, err 701 case err == ErrTotalTimeoutExceeded: 702 return true, err 703 default: 704 lastConflictErr = err 705 return false, nil 706 } 707 }) 708 if wait.Interrupted(err) { 709 err = lastConflictErr 710 } 711 return err 712 } 713 714 // newMetrics creates a new gsa latency recorder. 715 func (c *Allocator) newMetrics(ctx context.Context) *metrics { 716 ctx, err := tag.New(ctx, latencyTags...) 717 if err != nil { 718 c.baseLogger.WithError(err).Warn("failed to tag latency recorder.") 719 } 720 return &metrics{ 721 ctx: ctx, 722 gameServerLister: c.allocationCache.gameServerLister, 723 logger: c.baseLogger, 724 start: time.Now(), 725 } 726 } 727 728 func addPort(ip string) string { 729 if strings.Contains(ip, ":") { 730 return ip 731 } 732 return fmt.Sprintf("%s:%s", ip, allocatorPort) 733 }