github.com/kubeshop/testkube@v1.17.23/pkg/agent/logs.go (about)

     1  package agent
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"math"
     7  	"time"
     8  
     9  	"github.com/kubeshop/testkube/pkg/cloud"
    10  
    11  	"github.com/pkg/errors"
    12  	"golang.org/x/sync/errgroup"
    13  	"google.golang.org/grpc"
    14  	"google.golang.org/grpc/encoding/gzip"
    15  )
    16  
    17  const logStreamRetryCount = 10
    18  
    19  func (ag *Agent) runLogStreamLoop(ctx context.Context) error {
    20  	ctx = AddAPIKeyMeta(ctx, ag.apiKey)
    21  
    22  	ag.logger.Infow("initiating log streaming connection with Cloud API")
    23  	// creates a new Stream from the client side. ctx is used for the lifetime of the stream.
    24  	opts := []grpc.CallOption{grpc.UseCompressor(gzip.Name), grpc.MaxCallRecvMsgSize(math.MaxInt32)}
    25  	stream, err := ag.client.GetLogsStream(ctx, opts...)
    26  	if err != nil {
    27  		ag.logger.Errorf("failed to execute: %w", err)
    28  		return errors.Wrap(err, "failed to setup stream")
    29  	}
    30  
    31  	// GRPC stream have special requirements for concurrency on SendMsg, and RecvMsg calls.
    32  	// Please check https://github.com/grpc/grpc-go/blob/master/Documentation/concurrency.md
    33  	g, groupCtx := errgroup.WithContext(ctx)
    34  	g.Go(func() error {
    35  		for {
    36  			cmd, err := ag.receiveLogStreamRequest(groupCtx, stream)
    37  			if err != nil {
    38  				return err
    39  			}
    40  
    41  			ag.logStreamRequestBuffer <- cmd
    42  		}
    43  	})
    44  
    45  	g.Go(func() error {
    46  		for {
    47  			select {
    48  			case resp := <-ag.logStreamResponseBuffer:
    49  				err := ag.sendLogStreamResponse(groupCtx, stream, resp)
    50  				if err != nil {
    51  					return err
    52  				}
    53  			case <-groupCtx.Done():
    54  				return groupCtx.Err()
    55  			}
    56  		}
    57  	})
    58  
    59  	err = g.Wait()
    60  
    61  	return err
    62  }
    63  
    64  func (ag *Agent) runLogStreamWorker(ctx context.Context, numWorkers int) error {
    65  	g, groupCtx := errgroup.WithContext(ctx)
    66  	for i := 0; i < numWorkers; i++ {
    67  		g.Go(func() error {
    68  			for {
    69  				select {
    70  				case req := <-ag.logStreamRequestBuffer:
    71  
    72  					if req.RequestType == cloud.LogsStreamRequestType_STREAM_HEALTH_CHECK {
    73  						ag.logStreamResponseBuffer <- &cloud.LogsStreamResponse{
    74  							StreamId: req.StreamId,
    75  							SeqNo:    0,
    76  						}
    77  						break
    78  					}
    79  
    80  					err := ag.executeLogStreamRequest(groupCtx, req)
    81  					if err != nil {
    82  						ag.logger.Errorf("error executing log stream request: %s", err.Error())
    83  					}
    84  				case <-groupCtx.Done():
    85  					return groupCtx.Err()
    86  				}
    87  			}
    88  		})
    89  	}
    90  	return g.Wait()
    91  }
    92  
    93  func (ag *Agent) executeLogStreamRequest(ctx context.Context, req *cloud.LogsStreamRequest) error {
    94  	ag.logger.Info("start sending logs stream")
    95  	logCh, err := ag.logStreamFunc(ctx, req.ExecutionId)
    96  	ag.logger.Info("got channel")
    97  
    98  	for i := 0; i < logStreamRetryCount; i++ {
    99  		if err != nil {
   100  			// We have a race condition here
   101  			// Cloud sometimes slow to insert execution or test
   102  			// while LogStream request from websockets comes in faster
   103  			// so we retry up to logStreamRetryCount times.
   104  			time.Sleep(100 * time.Millisecond)
   105  			logCh, err = ag.logStreamFunc(ctx, req.ExecutionId)
   106  			if err != nil {
   107  				ag.logger.Warnw("retrying log stream error", "retry", i, "error", err.Error())
   108  			} else {
   109  				ag.logger.Debugw("retrying log stream", "retry", i)
   110  			}
   111  		}
   112  	}
   113  	if err != nil {
   114  		ag.logStreamResponseBuffer <- &cloud.LogsStreamResponse{
   115  			StreamId:   req.StreamId,
   116  			SeqNo:      0,
   117  			LogMessage: fmt.Sprintf("error when calling logStreamFunc: %s", err.Error()),
   118  			IsError:    true,
   119  		}
   120  		ag.logger.Errorw("error when calling logStreamFunc", "error", err.Error())
   121  		return nil
   122  	}
   123  
   124  	var i int64
   125  	for {
   126  		select {
   127  		case logOutput, ok := <-logCh:
   128  			if !ok {
   129  				ag.logger.Debugw("channel closed")
   130  				return nil
   131  			}
   132  			ag.logger.Debugw("start sending log output", "content", logOutput.Content)
   133  
   134  			msg := &cloud.LogsStreamResponse{
   135  				StreamId:   req.StreamId,
   136  				SeqNo:      i,
   137  				LogMessage: logOutput.String(),
   138  			}
   139  			i++
   140  
   141  			select {
   142  			case ag.logStreamResponseBuffer <- msg:
   143  			case <-ctx.Done():
   144  				return ctx.Err()
   145  			}
   146  			ag.logger.Debugw("log output sent", "content", logOutput.Content)
   147  
   148  		case <-ctx.Done():
   149  			return ctx.Err()
   150  		}
   151  	}
   152  }
   153  
   154  func (ag *Agent) receiveLogStreamRequest(ctx context.Context, stream cloud.TestKubeCloudAPI_GetLogsStreamClient) (*cloud.LogsStreamRequest, error) {
   155  	respChan := make(chan logsStreamRequest, 1)
   156  	go func() {
   157  		cmd, err := stream.Recv()
   158  		respChan <- logsStreamRequest{resp: cmd, err: err}
   159  	}()
   160  
   161  	var cmd *cloud.LogsStreamRequest
   162  	select {
   163  	case resp := <-respChan:
   164  		cmd = resp.resp
   165  		err := resp.err
   166  
   167  		if err != nil {
   168  			ag.logger.Errorf("agent stream receive: %v", err)
   169  			return nil, err
   170  		}
   171  	case <-ctx.Done():
   172  		return nil, ctx.Err()
   173  	}
   174  
   175  	return cmd, nil
   176  }
   177  
   178  type logsStreamRequest struct {
   179  	resp *cloud.LogsStreamRequest
   180  	err  error
   181  }
   182  
   183  func (ag *Agent) sendLogStreamResponse(ctx context.Context, stream cloud.TestKubeCloudAPI_GetLogsStreamClient, resp *cloud.LogsStreamResponse) error {
   184  	errChan := make(chan error, 1)
   185  	go func() {
   186  		errChan <- stream.Send(resp)
   187  		close(errChan)
   188  	}()
   189  
   190  	t := time.NewTimer(ag.sendTimeout)
   191  	select {
   192  	case err := <-errChan:
   193  		if !t.Stop() {
   194  			<-t.C
   195  		}
   196  		return err
   197  	case <-ctx.Done():
   198  		if !t.Stop() {
   199  			<-t.C
   200  		}
   201  
   202  		return ctx.Err()
   203  	case <-t.C:
   204  		return errors.New("send response too slow")
   205  	}
   206  }