agones.dev/agones@v1.54.0/pkg/processor/client.go (about) 1 // Copyright 2025 Google LLC All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package processor 16 17 import ( 18 "context" 19 "fmt" 20 "sync" 21 "time" 22 23 "github.com/pkg/errors" 24 "github.com/sirupsen/logrus" 25 "google.golang.org/grpc" 26 "google.golang.org/grpc/codes" 27 "google.golang.org/grpc/credentials/insecure" 28 "google.golang.org/grpc/health/grpc_health_v1" 29 "google.golang.org/grpc/status" 30 "k8s.io/apimachinery/pkg/util/uuid" 31 32 allocationpb "agones.dev/agones/pkg/allocation/go" 33 ) 34 35 // Config holds the processor client configuration 36 type Config struct { 37 // ClientID is a unique identifier for this processor client instance 38 ClientID string 39 40 // ProcessorAddress specifies the address of the processor service to connect to 41 ProcessorAddress string 42 43 // MaxBatchSize determines the maximum number of allocation requests to batch together 44 MaxBatchSize int 45 46 // AllocationTimeout is the maximum duration to wait for an allocation response 47 AllocationTimeout time.Duration 48 49 // ReconnectInterval is the time to wait before retrying a failed connection 50 ReconnectInterval time.Duration 51 } 52 53 // client implements client interface 54 // 55 //nolint:govet // fieldalignment: struct alignment is not critical for our use case 56 type client struct { 57 // hotBatch holds the current batch of allocation requests that have been converted to protobuf format 58 // It accumulates requests until a pull request is received to be directly sent to the processor 59 // After sending, hotBatch is reset and starts collecting new requests for the next batch 60 hotBatch *allocationpb.BatchRequest 61 // pendingRequests holds the list of allocation requests currently in the batch 62 // Each pendingRequest tracks the original request, its unique ID, and channels for response and error 63 // This slice is used to correlate responses from the processor back to the original caller 64 pendingRequests []*pendingRequest 65 logger logrus.FieldLogger 66 config Config 67 68 batchMutex sync.RWMutex 69 // requestIDMapping is a map to correlate request IDs to pendingRequest objects for response handling 70 requestIDMapping map[string]*pendingRequest 71 } 72 73 // pendingRequest represents a request waiting for processing 74 type pendingRequest struct { 75 // request is the original allocation request data 76 request *allocationpb.AllocationRequest 77 78 // response is the channel to receive the allocation response 79 response chan *allocationpb.AllocationResponse 80 81 // error is the channel to receive an error if processing fails 82 error chan error 83 84 // id is the unique identifier for this request 85 id string 86 } 87 88 // Client interface for allocation operations 89 // Provides methods to run the processor client and perform allocation requests 90 type Client interface { 91 // Run starts the processor client 92 Run(ctx context.Context) error 93 94 // Allocate performs a batch allocation request 95 Allocate(ctx context.Context, req *allocationpb.AllocationRequest) (*allocationpb.AllocationResponse, error) 96 } 97 98 // NewClient creates a new processor client 99 func NewClient(config Config, logger logrus.FieldLogger) Client { 100 if config.ClientID == "" { 101 config.ClientID = string(uuid.NewUUID()) 102 } 103 104 return &client{ 105 config: config, 106 logger: logger, 107 hotBatch: &allocationpb.BatchRequest{ 108 Requests: make([]*allocationpb.RequestWrapper, 0, config.MaxBatchSize), 109 }, 110 pendingRequests: make([]*pendingRequest, 0, config.MaxBatchSize), 111 requestIDMapping: make(map[string]*pendingRequest), 112 } 113 } 114 115 // Run starts the processor client and manages the connection lifecycle 116 // It will retry connecting to the processor service until the context is cancelled 117 func (p *client) Run(ctx context.Context) error { 118 p.logger.Info("Starting processor client") 119 120 // Main connection loop with retry 121 for { 122 select { 123 case <-ctx.Done(): 124 p.logger.Info("Processor client stopping") 125 return ctx.Err() 126 default: 127 if err := p.connectAndRun(ctx); err != nil { 128 if ctx.Err() != nil { 129 return ctx.Err() 130 } 131 p.logger.WithError(err).Error("Connection failed, retrying") 132 133 // Wait before retrying connection 134 select { 135 case <-ctx.Done(): 136 return ctx.Err() 137 case <-time.After(p.config.ReconnectInterval): 138 } 139 } 140 } 141 } 142 } 143 144 // Allocate performs an allocation request by batching it and waiting for a response or error 145 func (p *client) Allocate(ctx context.Context, req *allocationpb.AllocationRequest) (*allocationpb.AllocationResponse, error) { 146 requestID := string(uuid.NewUUID()) 147 148 // Create a pendingRequest to track this allocation request and its response/error 149 pendingReq := &pendingRequest{ 150 id: requestID, 151 request: req, 152 response: make(chan *allocationpb.AllocationResponse, 1), 153 error: make(chan error, 1), 154 } 155 156 // Wrap the request for batching. 157 wrapper := &allocationpb.RequestWrapper{ 158 RequestId: requestID, 159 Request: req, 160 } 161 162 p.batchMutex.Lock() 163 p.requestIDMapping[requestID] = pendingReq 164 p.hotBatch.Requests = append(p.hotBatch.Requests, wrapper) 165 p.pendingRequests = append(p.pendingRequests, pendingReq) 166 p.batchMutex.Unlock() 167 168 // Wait for response, error, cancellation, or timeout 169 timeout := p.config.AllocationTimeout 170 171 select { 172 case response := <-pendingReq.response: 173 p.logger.WithField("requestID", requestID).Debug("Received successful response") 174 return response, nil 175 176 case err := <-pendingReq.error: 177 p.logger.WithField("requestID", requestID).WithError(err).Debug("Received error response") 178 return nil, err 179 180 case <-ctx.Done(): 181 p.batchMutex.Lock() 182 delete(p.requestIDMapping, requestID) 183 p.batchMutex.Unlock() 184 p.logger.WithField("requestID", requestID).Debug("Request cancelled by context") 185 return nil, ctx.Err() 186 187 case <-time.After(timeout): 188 p.batchMutex.Lock() 189 delete(p.requestIDMapping, requestID) 190 p.batchMutex.Unlock() 191 p.logger.WithField("requestID", requestID).Error("Timeout waiting for processor response") 192 return nil, status.Errorf(codes.DeadlineExceeded, "allocation timeout after %v", timeout) 193 } 194 } 195 196 // handleStream processes incoming messages from the processor stream 197 // It listens for pull requests and batch responses, dispatching them to appropriate handlers 198 func (p *client) handleStream(ctx context.Context, stream allocationpb.Processor_StreamBatchesClient) error { 199 p.logger.Info("Starting stream message handling") 200 201 // Channel to handle pull requests asynchronously 202 pullRequestChan := make(chan struct{}, 20) 203 204 // Start goroutine to handle pull requests without blocking 205 go p.pullRequestHandler(ctx, stream, pullRequestChan) 206 207 for { 208 select { 209 case <-ctx.Done(): 210 p.logger.Info("Stream handling stopping due to context cancellation") 211 return ctx.Err() 212 default: 213 // Receive message from processor 214 msg, err := stream.Recv() 215 if err != nil { 216 p.logger.WithError(err).Error("Failed to receive message from processor") 217 return errors.Wrap(err, "stream recv error") 218 } 219 220 // Handle message based on its payload type 221 switch payload := msg.GetPayload().(type) { 222 case *allocationpb.ProcessorMessage_Pull: 223 // Pull request: queue for async handling 224 select { 225 case pullRequestChan <- struct{}{}: 226 p.logger.Debug("Pull request queued successfully") 227 default: 228 p.logger.Warn("Pull request queue full - dropping request") 229 } 230 231 case *allocationpb.ProcessorMessage_BatchResponse: 232 // Batch response: handle immediately 233 p.handleBatchResponse(payload.BatchResponse) 234 235 default: 236 // Unknown message type 237 p.logger.WithField("messageType", fmt.Sprintf("%T", payload)).Warn("Received unknown message type from processor") 238 } 239 } 240 } 241 } 242 243 // pullRequestHandler handles pull requests asynchronously without blocking the main stream 244 // It waits for pull requests on the channel and processes them as they arrive 245 func (p *client) pullRequestHandler(ctx context.Context, stream allocationpb.Processor_StreamBatchesClient, pullRequestChan <-chan struct{}) { 246 p.logger.Debug("Starting async pull request handler") 247 248 for { 249 select { 250 case <-ctx.Done(): 251 p.logger.Info("Pull request handler stopping") 252 return 253 254 case <-pullRequestChan: 255 p.handlePullRequest(stream) 256 } 257 } 258 } 259 260 // handlePullRequest responds to pull requests by sending the current batch of allocation requests 261 // It swaps out the hot batch, resets it for new requests, and sends the ready batch to the processor 262 func (p *client) handlePullRequest(stream allocationpb.Processor_StreamBatchesClient) { 263 // Swap out the hot batch and pending requests 264 p.batchMutex.Lock() 265 readyBatch := p.hotBatch 266 readyRequests := p.pendingRequests 267 268 // Reset hot batch for next requests 269 p.hotBatch = &allocationpb.BatchRequest{ 270 Requests: make([]*allocationpb.RequestWrapper, 0, p.config.MaxBatchSize), 271 } 272 p.pendingRequests = make([]*pendingRequest, 0, p.config.MaxBatchSize) 273 p.batchMutex.Unlock() 274 275 if len(readyRequests) == 0 { 276 p.logger.Debug("No requests to send in batch") 277 return 278 } 279 280 // Send batch to processor 281 p.sendBatch(stream, readyBatch, readyRequests) 282 } 283 284 // sendBatch sends a batch of allocation requests to the processor 285 func (p *client) sendBatch(stream allocationpb.Processor_StreamBatchesClient, batch *allocationpb.BatchRequest, requests []*pendingRequest) { 286 batch.BatchId = string(uuid.NewUUID()) 287 288 // Prepare batch message 289 batchMsg := &allocationpb.ProcessorMessage{ 290 ClientId: p.config.ClientID, 291 Payload: &allocationpb.ProcessorMessage_BatchRequest{ 292 BatchRequest: batch, 293 }, 294 } 295 296 sendStart := time.Now() 297 if err := stream.Send(batchMsg); err != nil { 298 p.logger.WithError(err).Error("Failed to send batch") 299 300 // Re-add the request to the hot batch and pendingRequests for the next pull 301 p.batchMutex.Lock() 302 for _, req := range requests { 303 p.hotBatch.Requests = append(p.hotBatch.Requests, &allocationpb.RequestWrapper{ 304 RequestId: req.id, 305 Request: req.request, 306 }) 307 p.pendingRequests = append(p.pendingRequests, req) 308 } 309 p.batchMutex.Unlock() 310 return 311 } 312 313 sendDuration := time.Since(sendStart) 314 p.logger.WithFields(logrus.Fields{ 315 "batchID": batch.BatchId, 316 "requestCount": len(requests), 317 "sendDuration": sendDuration, 318 }).Debug("Batch sent successfully") 319 } 320 321 // handleBatchResponse processes responses from the processor for a batch of requests 322 // It matches responses to pending requests, sends results/errors, and cleans up processed requests 323 func (p *client) handleBatchResponse(batchResp *allocationpb.BatchResponse) { 324 p.logger.WithFields(logrus.Fields{ 325 "component": "processor-client", 326 "batchID": batchResp.BatchId, 327 "responseCount": len(batchResp.Responses), 328 }).Debug("Processing batch response") 329 330 successCount := 0 331 errorCount := 0 332 notFoundCount := 0 333 334 for _, respWrapper := range batchResp.Responses { 335 requestID := respWrapper.RequestId 336 337 // Try to load the pending request for this response 338 p.batchMutex.RLock() 339 req, exists := p.requestIDMapping[requestID] 340 p.batchMutex.RUnlock() 341 342 if exists { 343 // Track if response was processed successfully 344 responseProcessed := false 345 346 switch result := respWrapper.Result.(type) { 347 case *allocationpb.ResponseWrapper_Response: 348 // Success case: send response to caller 349 successCount++ 350 responseProcessed = true 351 352 select { 353 case req.response <- result.Response: 354 p.logger.WithField("requestID", requestID).Debug("Response sent successfully") 355 default: 356 p.logger.WithField("requestID", requestID).Warn("Failed to send response - channel full") 357 responseProcessed = false 358 } 359 360 case *allocationpb.ResponseWrapper_Error: 361 // Error case: send error to caller 362 errorCount++ 363 responseProcessed = true 364 365 code := codes.Code(result.Error.Code) 366 msg := result.Error.Message 367 368 p.logger.WithFields(logrus.Fields{ 369 "component": "processor-client", 370 "requestID": requestID, 371 "batchID": batchResp.BatchId, 372 "errorCode": code, 373 "errorMsg": msg, 374 }).Error("Request failed with error from processor") 375 376 select { 377 case req.error <- status.Error(code, msg): 378 p.logger.WithField("requestID", requestID).Debug("Error sent successfully") 379 default: 380 p.logger.WithField("requestID", requestID).Warn("Failed to send error - channel full") 381 responseProcessed = false 382 } 383 384 default: 385 // Missing result: treat as internal error 386 errorCount++ 387 responseProcessed = true 388 389 p.logger.WithFields(logrus.Fields{ 390 "component": "processor-client", 391 "requestID": requestID, 392 "batchID": batchResp.BatchId, 393 }).Error("Response wrapper has no result") 394 395 select { 396 case req.error <- status.Errorf(codes.Internal, "empty response from processor"): 397 p.logger.WithField("requestID", requestID).Debug("Error sent successfully") 398 default: 399 p.logger.WithField("requestID", requestID).Warn("Failed to send error - channel full") 400 responseProcessed = false 401 } 402 } 403 404 // Only delete if response was processed successfully 405 if responseProcessed { 406 p.batchMutex.Lock() 407 delete(p.requestIDMapping, requestID) 408 p.batchMutex.Unlock() 409 p.logger.WithField("requestID", requestID).Debug("Request cleaned up successfully") 410 } else { 411 p.logger.WithField("requestID", requestID).Warn("Keeping request in map due to failed processing") 412 } 413 } else { 414 // No pending request found for this response 415 notFoundCount++ 416 p.logger.WithFields(logrus.Fields{ 417 "component": "processor-client", 418 "requestID": requestID, 419 "batchID": batchResp.BatchId, 420 }).Warn("No pending request found for response - may have timed out") 421 } 422 } 423 424 // Log summary of batch response processing 425 p.logger.WithFields(logrus.Fields{ 426 "component": "processor-client", 427 "batchID": batchResp.BatchId, 428 "successCount": successCount, 429 "errorCount": errorCount, 430 "notFoundCount": notFoundCount, 431 "totalCount": len(batchResp.Responses), 432 }).Debug("Batch response processing completed") 433 } 434 435 // connectAndRun handles the full connection lifecycle to the processor service 436 // It establishes a connection, creates a stream, registers the client, and then 437 // delegates to handleStream to process messages until an error or cancellation 438 func (p *client) connectAndRun(ctx context.Context) error { 439 // Connect to the processor 440 conn, err := p.connect(ctx) 441 if err != nil { 442 return errors.Wrap(err, "failed to connect") 443 } 444 defer func() { _ = conn.Close() }() 445 446 // Create a new processor client from the connection 447 client := allocationpb.NewProcessorClient(conn) 448 449 // Open a streaming RPC to the processor 450 stream, err := client.StreamBatches(ctx) 451 if err != nil { 452 return errors.Wrap(err, "failed to create stream") 453 } 454 455 // Register this client instance with the processor 456 if err := p.registerClient(stream); err != nil { 457 return errors.Wrap(err, "failed to register") 458 } 459 460 p.logger.Info("Connected to processor") 461 462 // Handle the stream until an error occurs or the context is cancelled 463 return p.handleStream(ctx, stream) 464 } 465 466 // connect attempts to connect to the processor service with health checks 467 // Returns a healthy gRPC connection or an error 468 func (p *client) connect(ctx context.Context) (*grpc.ClientConn, error) { 469 p.logger.Info("Attempting connection") 470 471 conn, err := grpc.NewClient(p.config.ProcessorAddress, 472 grpc.WithTransportCredentials(insecure.NewCredentials())) 473 474 if err != nil { 475 p.logger.WithError(err).Error("connection failed") 476 return nil, err 477 } 478 479 // Perform a health check on the connection 480 if err := p.healthCheck(ctx, conn); err != nil { 481 p.logger.WithError(err).Error("health check failed") 482 _ = conn.Close() 483 return nil, err 484 } 485 486 p.logger.Info("Successfully connected to processor") 487 return conn, nil 488 } 489 490 // healthCheck verifies that the processor service is healthy and serving requests 491 // Returns an error if the health check fails or the service is not in SERVING state 492 func (p *client) healthCheck(ctx context.Context, conn *grpc.ClientConn) error { 493 healthClient := grpc_health_v1.NewHealthClient(conn) 494 495 // Set a timeout for the health check RPC 496 healthCtx, cancel := context.WithTimeout(ctx, 5*time.Second) 497 defer cancel() 498 499 resp, err := healthClient.Check(healthCtx, &grpc_health_v1.HealthCheckRequest{ 500 Service: "processor", 501 }) 502 if err != nil { 503 return err 504 } 505 506 if resp.Status != grpc_health_v1.HealthCheckResponse_SERVING { 507 return errors.Errorf("processor not serving: %v", resp.Status) 508 } 509 510 return nil 511 } 512 513 // registerClient sends a registration message to the processor over the stream 514 // This identifies the client instance to the processor service 515 func (p *client) registerClient(stream allocationpb.Processor_StreamBatchesClient) error { 516 p.logger.WithField("clientID", p.config.ClientID).Info("Registering client with processor") 517 518 registerMsg := &allocationpb.ProcessorMessage{ 519 ClientId: p.config.ClientID, 520 } 521 522 // Send the registration message 523 err := stream.Send(registerMsg) 524 if err != nil { 525 p.logger.WithField("clientID", p.config.ClientID).WithError(err).Error("Failed to register client") 526 return err 527 } 528 529 return nil 530 }