agones.dev/agones@v1.54.0/pkg/processor/client.go (about)

     1  // Copyright 2025 Google LLC All Rights Reserved.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package processor
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"sync"
    21  	"time"
    22  
    23  	"github.com/pkg/errors"
    24  	"github.com/sirupsen/logrus"
    25  	"google.golang.org/grpc"
    26  	"google.golang.org/grpc/codes"
    27  	"google.golang.org/grpc/credentials/insecure"
    28  	"google.golang.org/grpc/health/grpc_health_v1"
    29  	"google.golang.org/grpc/status"
    30  	"k8s.io/apimachinery/pkg/util/uuid"
    31  
    32  	allocationpb "agones.dev/agones/pkg/allocation/go"
    33  )
    34  
// Config holds the processor client configuration.
type Config struct {
	// ClientID is a unique identifier for this processor client instance.
	// When left empty, NewClient generates a random UUID for it.
	ClientID string

	// ProcessorAddress specifies the address of the processor service to connect to
	ProcessorAddress string

	// MaxBatchSize determines the maximum number of allocation requests to batch together.
	// It is used as the initial capacity of the hot batch and pending-request slices.
	MaxBatchSize int

	// AllocationTimeout is the maximum duration an Allocate call waits for a response
	// before failing with codes.DeadlineExceeded.
	AllocationTimeout time.Duration

	// ReconnectInterval is the time to wait before retrying a failed connection
	ReconnectInterval time.Duration
}
    52  
// client implements the Client interface.
//
// All mutable batching state (hotBatch, pendingRequests, requestIDMapping)
// is guarded by batchMutex.
//
//nolint:govet // fieldalignment: struct alignment is not critical for our use case
type client struct {
	// hotBatch holds the current batch of allocation requests that have been converted to protobuf format.
	// It accumulates requests until a pull request is received to be directly sent to the processor.
	// After sending, hotBatch is reset and starts collecting new requests for the next batch.
	hotBatch *allocationpb.BatchRequest
	// pendingRequests holds the list of allocation requests currently in the batch.
	// Each pendingRequest tracks the original request, its unique ID, and channels for response and error.
	// This slice is used to correlate responses from the processor back to the original caller.
	pendingRequests []*pendingRequest
	logger          logrus.FieldLogger
	config          Config

	// batchMutex guards hotBatch, pendingRequests and requestIDMapping.
	batchMutex sync.RWMutex
	// requestIDMapping is a map to correlate request IDs to pendingRequest objects for response handling.
	// Entries are removed when a response is delivered, or by Allocate on timeout/cancellation.
	requestIDMapping map[string]*pendingRequest
}
    72  
// pendingRequest represents a request waiting for processing.
type pendingRequest struct {
	// request is the original allocation request data
	request *allocationpb.AllocationRequest

	// response is the channel to receive the allocation response.
	// Buffered with capacity 1 so delivery does not block the stream handler.
	response chan *allocationpb.AllocationResponse

	// error is the channel to receive an error if processing fails.
	// Buffered with capacity 1 so delivery does not block the stream handler.
	error chan error

	// id is the unique identifier for this request
	id string
}
    87  
// Client interface for allocation operations.
// Provides methods to run the processor client and perform allocation requests.
type Client interface {
	// Run starts the processor client and blocks until ctx is cancelled,
	// reconnecting to the processor on failures.
	Run(ctx context.Context) error

	// Allocate performs a batch allocation request and blocks until a
	// response or error arrives, ctx is cancelled, or the configured
	// allocation timeout elapses.
	Allocate(ctx context.Context, req *allocationpb.AllocationRequest) (*allocationpb.AllocationResponse, error)
}
    97  
    98  // NewClient creates a new processor client
    99  func NewClient(config Config, logger logrus.FieldLogger) Client {
   100  	if config.ClientID == "" {
   101  		config.ClientID = string(uuid.NewUUID())
   102  	}
   103  
   104  	return &client{
   105  		config: config,
   106  		logger: logger,
   107  		hotBatch: &allocationpb.BatchRequest{
   108  			Requests: make([]*allocationpb.RequestWrapper, 0, config.MaxBatchSize),
   109  		},
   110  		pendingRequests:  make([]*pendingRequest, 0, config.MaxBatchSize),
   111  		requestIDMapping: make(map[string]*pendingRequest),
   112  	}
   113  }
   114  
   115  // Run starts the processor client and manages the connection lifecycle
   116  // It will retry connecting to the processor service until the context is cancelled
   117  func (p *client) Run(ctx context.Context) error {
   118  	p.logger.Info("Starting processor client")
   119  
   120  	// Main connection loop with retry
   121  	for {
   122  		select {
   123  		case <-ctx.Done():
   124  			p.logger.Info("Processor client stopping")
   125  			return ctx.Err()
   126  		default:
   127  			if err := p.connectAndRun(ctx); err != nil {
   128  				if ctx.Err() != nil {
   129  					return ctx.Err()
   130  				}
   131  				p.logger.WithError(err).Error("Connection failed, retrying")
   132  
   133  				// Wait before retrying connection
   134  				select {
   135  				case <-ctx.Done():
   136  					return ctx.Err()
   137  				case <-time.After(p.config.ReconnectInterval):
   138  				}
   139  			}
   140  		}
   141  	}
   142  }
   143  
   144  // Allocate performs an allocation request by batching it and waiting for a response or error
   145  func (p *client) Allocate(ctx context.Context, req *allocationpb.AllocationRequest) (*allocationpb.AllocationResponse, error) {
   146  	requestID := string(uuid.NewUUID())
   147  
   148  	// Create a pendingRequest to track this allocation request and its response/error
   149  	pendingReq := &pendingRequest{
   150  		id:       requestID,
   151  		request:  req,
   152  		response: make(chan *allocationpb.AllocationResponse, 1),
   153  		error:    make(chan error, 1),
   154  	}
   155  
   156  	// Wrap the request for batching.
   157  	wrapper := &allocationpb.RequestWrapper{
   158  		RequestId: requestID,
   159  		Request:   req,
   160  	}
   161  
   162  	p.batchMutex.Lock()
   163  	p.requestIDMapping[requestID] = pendingReq
   164  	p.hotBatch.Requests = append(p.hotBatch.Requests, wrapper)
   165  	p.pendingRequests = append(p.pendingRequests, pendingReq)
   166  	p.batchMutex.Unlock()
   167  
   168  	// Wait for response, error, cancellation, or timeout
   169  	timeout := p.config.AllocationTimeout
   170  
   171  	select {
   172  	case response := <-pendingReq.response:
   173  		p.logger.WithField("requestID", requestID).Debug("Received successful response")
   174  		return response, nil
   175  
   176  	case err := <-pendingReq.error:
   177  		p.logger.WithField("requestID", requestID).WithError(err).Debug("Received error response")
   178  		return nil, err
   179  
   180  	case <-ctx.Done():
   181  		p.batchMutex.Lock()
   182  		delete(p.requestIDMapping, requestID)
   183  		p.batchMutex.Unlock()
   184  		p.logger.WithField("requestID", requestID).Debug("Request cancelled by context")
   185  		return nil, ctx.Err()
   186  
   187  	case <-time.After(timeout):
   188  		p.batchMutex.Lock()
   189  		delete(p.requestIDMapping, requestID)
   190  		p.batchMutex.Unlock()
   191  		p.logger.WithField("requestID", requestID).Error("Timeout waiting for processor response")
   192  		return nil, status.Errorf(codes.DeadlineExceeded, "allocation timeout after %v", timeout)
   193  	}
   194  }
   195  
   196  // handleStream processes incoming messages from the processor stream
   197  // It listens for pull requests and batch responses, dispatching them to appropriate handlers
   198  func (p *client) handleStream(ctx context.Context, stream allocationpb.Processor_StreamBatchesClient) error {
   199  	p.logger.Info("Starting stream message handling")
   200  
   201  	// Channel to handle pull requests asynchronously
   202  	pullRequestChan := make(chan struct{}, 20)
   203  
   204  	// Start goroutine to handle pull requests without blocking
   205  	go p.pullRequestHandler(ctx, stream, pullRequestChan)
   206  
   207  	for {
   208  		select {
   209  		case <-ctx.Done():
   210  			p.logger.Info("Stream handling stopping due to context cancellation")
   211  			return ctx.Err()
   212  		default:
   213  			// Receive message from processor
   214  			msg, err := stream.Recv()
   215  			if err != nil {
   216  				p.logger.WithError(err).Error("Failed to receive message from processor")
   217  				return errors.Wrap(err, "stream recv error")
   218  			}
   219  
   220  			// Handle message based on its payload type
   221  			switch payload := msg.GetPayload().(type) {
   222  			case *allocationpb.ProcessorMessage_Pull:
   223  				// Pull request: queue for async handling
   224  				select {
   225  				case pullRequestChan <- struct{}{}:
   226  					p.logger.Debug("Pull request queued successfully")
   227  				default:
   228  					p.logger.Warn("Pull request queue full - dropping request")
   229  				}
   230  
   231  			case *allocationpb.ProcessorMessage_BatchResponse:
   232  				// Batch response: handle immediately
   233  				p.handleBatchResponse(payload.BatchResponse)
   234  
   235  			default:
   236  				// Unknown message type
   237  				p.logger.WithField("messageType", fmt.Sprintf("%T", payload)).Warn("Received unknown message type from processor")
   238  			}
   239  		}
   240  	}
   241  }
   242  
   243  // pullRequestHandler handles pull requests asynchronously without blocking the main stream
   244  // It waits for pull requests on the channel and processes them as they arrive
   245  func (p *client) pullRequestHandler(ctx context.Context, stream allocationpb.Processor_StreamBatchesClient, pullRequestChan <-chan struct{}) {
   246  	p.logger.Debug("Starting async pull request handler")
   247  
   248  	for {
   249  		select {
   250  		case <-ctx.Done():
   251  			p.logger.Info("Pull request handler stopping")
   252  			return
   253  
   254  		case <-pullRequestChan:
   255  			p.handlePullRequest(stream)
   256  		}
   257  	}
   258  }
   259  
   260  // handlePullRequest responds to pull requests by sending the current batch of allocation requests
   261  // It swaps out the hot batch, resets it for new requests, and sends the ready batch to the processor
   262  func (p *client) handlePullRequest(stream allocationpb.Processor_StreamBatchesClient) {
   263  	// Swap out the hot batch and pending requests
   264  	p.batchMutex.Lock()
   265  	readyBatch := p.hotBatch
   266  	readyRequests := p.pendingRequests
   267  
   268  	// Reset hot batch for next requests
   269  	p.hotBatch = &allocationpb.BatchRequest{
   270  		Requests: make([]*allocationpb.RequestWrapper, 0, p.config.MaxBatchSize),
   271  	}
   272  	p.pendingRequests = make([]*pendingRequest, 0, p.config.MaxBatchSize)
   273  	p.batchMutex.Unlock()
   274  
   275  	if len(readyRequests) == 0 {
   276  		p.logger.Debug("No requests to send in batch")
   277  		return
   278  	}
   279  
   280  	// Send batch to processor
   281  	p.sendBatch(stream, readyBatch, readyRequests)
   282  }
   283  
   284  // sendBatch sends a batch of allocation requests to the processor
   285  func (p *client) sendBatch(stream allocationpb.Processor_StreamBatchesClient, batch *allocationpb.BatchRequest, requests []*pendingRequest) {
   286  	batch.BatchId = string(uuid.NewUUID())
   287  
   288  	// Prepare batch message
   289  	batchMsg := &allocationpb.ProcessorMessage{
   290  		ClientId: p.config.ClientID,
   291  		Payload: &allocationpb.ProcessorMessage_BatchRequest{
   292  			BatchRequest: batch,
   293  		},
   294  	}
   295  
   296  	sendStart := time.Now()
   297  	if err := stream.Send(batchMsg); err != nil {
   298  		p.logger.WithError(err).Error("Failed to send batch")
   299  
   300  		// Re-add the request to the hot batch and pendingRequests for the next pull
   301  		p.batchMutex.Lock()
   302  		for _, req := range requests {
   303  			p.hotBatch.Requests = append(p.hotBatch.Requests, &allocationpb.RequestWrapper{
   304  				RequestId: req.id,
   305  				Request:   req.request,
   306  			})
   307  			p.pendingRequests = append(p.pendingRequests, req)
   308  		}
   309  		p.batchMutex.Unlock()
   310  		return
   311  	}
   312  
   313  	sendDuration := time.Since(sendStart)
   314  	p.logger.WithFields(logrus.Fields{
   315  		"batchID":      batch.BatchId,
   316  		"requestCount": len(requests),
   317  		"sendDuration": sendDuration,
   318  	}).Debug("Batch sent successfully")
   319  }
   320  
   321  // handleBatchResponse processes responses from the processor for a batch of requests
   322  // It matches responses to pending requests, sends results/errors, and cleans up processed requests
   323  func (p *client) handleBatchResponse(batchResp *allocationpb.BatchResponse) {
   324  	p.logger.WithFields(logrus.Fields{
   325  		"component":     "processor-client",
   326  		"batchID":       batchResp.BatchId,
   327  		"responseCount": len(batchResp.Responses),
   328  	}).Debug("Processing batch response")
   329  
   330  	successCount := 0
   331  	errorCount := 0
   332  	notFoundCount := 0
   333  
   334  	for _, respWrapper := range batchResp.Responses {
   335  		requestID := respWrapper.RequestId
   336  
   337  		// Try to load the pending request for this response
   338  		p.batchMutex.RLock()
   339  		req, exists := p.requestIDMapping[requestID]
   340  		p.batchMutex.RUnlock()
   341  
   342  		if exists {
   343  			// Track if response was processed successfully
   344  			responseProcessed := false
   345  
   346  			switch result := respWrapper.Result.(type) {
   347  			case *allocationpb.ResponseWrapper_Response:
   348  				// Success case: send response to caller
   349  				successCount++
   350  				responseProcessed = true
   351  
   352  				select {
   353  				case req.response <- result.Response:
   354  					p.logger.WithField("requestID", requestID).Debug("Response sent successfully")
   355  				default:
   356  					p.logger.WithField("requestID", requestID).Warn("Failed to send response - channel full")
   357  					responseProcessed = false
   358  				}
   359  
   360  			case *allocationpb.ResponseWrapper_Error:
   361  				// Error case: send error to caller
   362  				errorCount++
   363  				responseProcessed = true
   364  
   365  				code := codes.Code(result.Error.Code)
   366  				msg := result.Error.Message
   367  
   368  				p.logger.WithFields(logrus.Fields{
   369  					"component": "processor-client",
   370  					"requestID": requestID,
   371  					"batchID":   batchResp.BatchId,
   372  					"errorCode": code,
   373  					"errorMsg":  msg,
   374  				}).Error("Request failed with error from processor")
   375  
   376  				select {
   377  				case req.error <- status.Error(code, msg):
   378  					p.logger.WithField("requestID", requestID).Debug("Error sent successfully")
   379  				default:
   380  					p.logger.WithField("requestID", requestID).Warn("Failed to send error - channel full")
   381  					responseProcessed = false
   382  				}
   383  
   384  			default:
   385  				// Missing result: treat as internal error
   386  				errorCount++
   387  				responseProcessed = true
   388  
   389  				p.logger.WithFields(logrus.Fields{
   390  					"component": "processor-client",
   391  					"requestID": requestID,
   392  					"batchID":   batchResp.BatchId,
   393  				}).Error("Response wrapper has no result")
   394  
   395  				select {
   396  				case req.error <- status.Errorf(codes.Internal, "empty response from processor"):
   397  					p.logger.WithField("requestID", requestID).Debug("Error sent successfully")
   398  				default:
   399  					p.logger.WithField("requestID", requestID).Warn("Failed to send error - channel full")
   400  					responseProcessed = false
   401  				}
   402  			}
   403  
   404  			// Only delete if response was processed successfully
   405  			if responseProcessed {
   406  				p.batchMutex.Lock()
   407  				delete(p.requestIDMapping, requestID)
   408  				p.batchMutex.Unlock()
   409  				p.logger.WithField("requestID", requestID).Debug("Request cleaned up successfully")
   410  			} else {
   411  				p.logger.WithField("requestID", requestID).Warn("Keeping request in map due to failed processing")
   412  			}
   413  		} else {
   414  			// No pending request found for this response
   415  			notFoundCount++
   416  			p.logger.WithFields(logrus.Fields{
   417  				"component": "processor-client",
   418  				"requestID": requestID,
   419  				"batchID":   batchResp.BatchId,
   420  			}).Warn("No pending request found for response - may have timed out")
   421  		}
   422  	}
   423  
   424  	// Log summary of batch response processing
   425  	p.logger.WithFields(logrus.Fields{
   426  		"component":     "processor-client",
   427  		"batchID":       batchResp.BatchId,
   428  		"successCount":  successCount,
   429  		"errorCount":    errorCount,
   430  		"notFoundCount": notFoundCount,
   431  		"totalCount":    len(batchResp.Responses),
   432  	}).Debug("Batch response processing completed")
   433  }
   434  
   435  // connectAndRun handles the full connection lifecycle to the processor service
   436  // It establishes a connection, creates a stream, registers the client, and then
   437  // delegates to handleStream to process messages until an error or cancellation
   438  func (p *client) connectAndRun(ctx context.Context) error {
   439  	// Connect to the processor
   440  	conn, err := p.connect(ctx)
   441  	if err != nil {
   442  		return errors.Wrap(err, "failed to connect")
   443  	}
   444  	defer func() { _ = conn.Close() }()
   445  
   446  	// Create a new processor client from the connection
   447  	client := allocationpb.NewProcessorClient(conn)
   448  
   449  	// Open a streaming RPC to the processor
   450  	stream, err := client.StreamBatches(ctx)
   451  	if err != nil {
   452  		return errors.Wrap(err, "failed to create stream")
   453  	}
   454  
   455  	// Register this client instance with the processor
   456  	if err := p.registerClient(stream); err != nil {
   457  		return errors.Wrap(err, "failed to register")
   458  	}
   459  
   460  	p.logger.Info("Connected to processor")
   461  
   462  	// Handle the stream until an error occurs or the context is cancelled
   463  	return p.handleStream(ctx, stream)
   464  }
   465  
   466  // connect attempts to connect to the processor service with health checks
   467  // Returns a healthy gRPC connection or an error
   468  func (p *client) connect(ctx context.Context) (*grpc.ClientConn, error) {
   469  	p.logger.Info("Attempting connection")
   470  
   471  	conn, err := grpc.NewClient(p.config.ProcessorAddress,
   472  		grpc.WithTransportCredentials(insecure.NewCredentials()))
   473  
   474  	if err != nil {
   475  		p.logger.WithError(err).Error("connection failed")
   476  		return nil, err
   477  	}
   478  
   479  	// Perform a health check on the connection
   480  	if err := p.healthCheck(ctx, conn); err != nil {
   481  		p.logger.WithError(err).Error("health check failed")
   482  		_ = conn.Close()
   483  		return nil, err
   484  	}
   485  
   486  	p.logger.Info("Successfully connected to processor")
   487  	return conn, nil
   488  }
   489  
   490  // healthCheck verifies that the processor service is healthy and serving requests
   491  // Returns an error if the health check fails or the service is not in SERVING state
   492  func (p *client) healthCheck(ctx context.Context, conn *grpc.ClientConn) error {
   493  	healthClient := grpc_health_v1.NewHealthClient(conn)
   494  
   495  	// Set a timeout for the health check RPC
   496  	healthCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
   497  	defer cancel()
   498  
   499  	resp, err := healthClient.Check(healthCtx, &grpc_health_v1.HealthCheckRequest{
   500  		Service: "processor",
   501  	})
   502  	if err != nil {
   503  		return err
   504  	}
   505  
   506  	if resp.Status != grpc_health_v1.HealthCheckResponse_SERVING {
   507  		return errors.Errorf("processor not serving: %v", resp.Status)
   508  	}
   509  
   510  	return nil
   511  }
   512  
   513  // registerClient sends a registration message to the processor over the stream
   514  // This identifies the client instance to the processor service
   515  func (p *client) registerClient(stream allocationpb.Processor_StreamBatchesClient) error {
   516  	p.logger.WithField("clientID", p.config.ClientID).Info("Registering client with processor")
   517  
   518  	registerMsg := &allocationpb.ProcessorMessage{
   519  		ClientId: p.config.ClientID,
   520  	}
   521  
   522  	// Send the registration message
   523  	err := stream.Send(registerMsg)
   524  	if err != nil {
   525  		p.logger.WithField("clientID", p.config.ClientID).WithError(err).Error("Failed to register client")
   526  		return err
   527  	}
   528  
   529  	return nil
   530  }