github.com/kubeshop/testkube@v1.17.23/pkg/agent/logs.go (about) 1 package agent 2 3 import ( 4 "context" 5 "fmt" 6 "math" 7 "time" 8 9 "github.com/kubeshop/testkube/pkg/cloud" 10 11 "github.com/pkg/errors" 12 "golang.org/x/sync/errgroup" 13 "google.golang.org/grpc" 14 "google.golang.org/grpc/encoding/gzip" 15 ) 16 17 const logStreamRetryCount = 10 18 19 func (ag *Agent) runLogStreamLoop(ctx context.Context) error { 20 ctx = AddAPIKeyMeta(ctx, ag.apiKey) 21 22 ag.logger.Infow("initiating log streaming connection with Cloud API") 23 // creates a new Stream from the client side. ctx is used for the lifetime of the stream. 24 opts := []grpc.CallOption{grpc.UseCompressor(gzip.Name), grpc.MaxCallRecvMsgSize(math.MaxInt32)} 25 stream, err := ag.client.GetLogsStream(ctx, opts...) 26 if err != nil { 27 ag.logger.Errorf("failed to execute: %w", err) 28 return errors.Wrap(err, "failed to setup stream") 29 } 30 31 // GRPC stream have special requirements for concurrency on SendMsg, and RecvMsg calls. 32 // Please check https://github.com/grpc/grpc-go/blob/master/Documentation/concurrency.md 33 g, groupCtx := errgroup.WithContext(ctx) 34 g.Go(func() error { 35 for { 36 cmd, err := ag.receiveLogStreamRequest(groupCtx, stream) 37 if err != nil { 38 return err 39 } 40 41 ag.logStreamRequestBuffer <- cmd 42 } 43 }) 44 45 g.Go(func() error { 46 for { 47 select { 48 case resp := <-ag.logStreamResponseBuffer: 49 err := ag.sendLogStreamResponse(groupCtx, stream, resp) 50 if err != nil { 51 return err 52 } 53 case <-groupCtx.Done(): 54 return groupCtx.Err() 55 } 56 } 57 }) 58 59 err = g.Wait() 60 61 return err 62 } 63 64 func (ag *Agent) runLogStreamWorker(ctx context.Context, numWorkers int) error { 65 g, groupCtx := errgroup.WithContext(ctx) 66 for i := 0; i < numWorkers; i++ { 67 g.Go(func() error { 68 for { 69 select { 70 case req := <-ag.logStreamRequestBuffer: 71 72 if req.RequestType == cloud.LogsStreamRequestType_STREAM_HEALTH_CHECK { 73 ag.logStreamResponseBuffer <- &cloud.LogsStreamResponse{ 74 StreamId: req.StreamId, 75 SeqNo: 0, 76 } 77 break 78 } 79 80 err := ag.executeLogStreamRequest(groupCtx, req) 81 if err != nil { 82 ag.logger.Errorf("error executing log stream request: %s", err.Error()) 83 } 84 case <-groupCtx.Done(): 85 return groupCtx.Err() 86 } 87 } 88 }) 89 } 90 return g.Wait() 91 } 92 93 func (ag *Agent) executeLogStreamRequest(ctx context.Context, req *cloud.LogsStreamRequest) error { 94 ag.logger.Info("start sending logs stream") 95 logCh, err := ag.logStreamFunc(ctx, req.ExecutionId) 96 ag.logger.Info("got channel") 97 98 for i := 0; i < logStreamRetryCount; i++ { 99 if err != nil { 100 // We have a race condition here 101 // Cloud sometimes slow to insert execution or test 102 // while LogStream request from websockets comes in faster 103 // so we retry up to logStreamRetryCount times. 104 time.Sleep(100 * time.Millisecond) 105 logCh, err = ag.logStreamFunc(ctx, req.ExecutionId) 106 if err != nil { 107 ag.logger.Warnw("retrying log stream error", "retry", i, "error", err.Error()) 108 } else { 109 ag.logger.Debugw("retrying log stream", "retry", i) 110 } 111 } 112 } 113 if err != nil { 114 ag.logStreamResponseBuffer <- &cloud.LogsStreamResponse{ 115 StreamId: req.StreamId, 116 SeqNo: 0, 117 LogMessage: fmt.Sprintf("error when calling logStreamFunc: %s", err.Error()), 118 IsError: true, 119 } 120 ag.logger.Errorw("error when calling logStreamFunc", "error", err.Error()) 121 return nil 122 } 123 124 var i int64 125 for { 126 select { 127 case logOutput, ok := <-logCh: 128 if !ok { 129 ag.logger.Debugw("channel closed") 130 return nil 131 } 132 ag.logger.Debugw("start sending log output", "content", logOutput.Content) 133 134 msg := &cloud.LogsStreamResponse{ 135 StreamId: req.StreamId, 136 SeqNo: i, 137 LogMessage: logOutput.String(), 138 } 139 i++ 140 141 select { 142 case ag.logStreamResponseBuffer <- msg: 143 case <-ctx.Done(): 144 return ctx.Err() 145 } 146 ag.logger.Debugw("log output sent", "content", logOutput.Content) 147 148 case <-ctx.Done(): 149 return ctx.Err() 150 } 151 } 152 } 153 154 func (ag *Agent) receiveLogStreamRequest(ctx context.Context, stream cloud.TestKubeCloudAPI_GetLogsStreamClient) (*cloud.LogsStreamRequest, error) { 155 respChan := make(chan logsStreamRequest, 1) 156 go func() { 157 cmd, err := stream.Recv() 158 respChan <- logsStreamRequest{resp: cmd, err: err} 159 }() 160 161 var cmd *cloud.LogsStreamRequest 162 select { 163 case resp := <-respChan: 164 cmd = resp.resp 165 err := resp.err 166 167 if err != nil { 168 ag.logger.Errorf("agent stream receive: %v", err) 169 return nil, err 170 } 171 case <-ctx.Done(): 172 return nil, ctx.Err() 173 } 174 175 return cmd, nil 176 } 177 178 type logsStreamRequest struct { 179 resp *cloud.LogsStreamRequest 180 err error 181 } 182 183 func (ag *Agent) sendLogStreamResponse(ctx context.Context, stream cloud.TestKubeCloudAPI_GetLogsStreamClient, resp *cloud.LogsStreamResponse) error { 184 errChan := make(chan error, 1) 185 go func() { 186 errChan <- stream.Send(resp) 187 close(errChan) 188 }() 189 190 t := time.NewTimer(ag.sendTimeout) 191 select { 192 case err := <-errChan: 193 if !t.Stop() { 194 <-t.C 195 } 196 return err 197 case <-ctx.Done(): 198 if !t.Stop() { 199 <-t.C 200 } 201 202 return ctx.Err() 203 case <-t.C: 204 return errors.New("send response too slow") 205 } 206 }