github.com/kubeshop/testkube@v1.17.23/pkg/logs/events.go (about)

     1  package logs
     2  
     3  import (
     4  	"context"
     5  	"encoding/json"
     6  	"fmt"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/nats-io/nats.go"
    11  	"github.com/nats-io/nats.go/jetstream"
    12  	"github.com/pkg/errors"
    13  
    14  	"github.com/kubeshop/testkube/pkg/api/v1/testkube"
    15  	"github.com/kubeshop/testkube/pkg/logs/adapter"
    16  	"github.com/kubeshop/testkube/pkg/logs/events"
    17  	"github.com/kubeshop/testkube/pkg/logs/state"
    18  )
    19  
    20  const (
    21  	StatusPending  byte = 1
    22  	StatusFinished byte = 2
    23  
    24  	StreamPrefix = "log"
    25  
    26  	StartQueue = "logsstart"
    27  	StopQueue  = "logsstop"
    28  
    29  	LogStartSubject = "events.logs.start"
    30  	LogStopSubject  = "events.logs.stop"
    31  )
    32  
    33  var (
    34  	StartSubjects = map[string]string{
    35  		"test":    testkube.TestStartSubject,
    36  		"generic": LogStartSubject,
    37  	}
    38  
    39  	StopSubjects = map[string]string{
    40  		"test":    testkube.TestStopSubject,
    41  		"generic": LogStopSubject,
    42  	}
    43  )
    44  
    45  type Consumer struct {
    46  	// Name of the consumer
    47  	Name string
    48  	// Context is a consumer context you can call Stop() method on it when no more messages are expected
    49  	Context jetstream.ConsumeContext
    50  	// Instance is a NATS consumer instance
    51  	Instance jetstream.Consumer
    52  }
    53  
    54  func (ls *LogsService) initConsumer(ctx context.Context, a adapter.Adapter, streamName, id string, i int) (jetstream.Consumer, error) {
    55  	name := fmt.Sprintf("lc%s%s%d", id, a.Name(), i)
    56  
    57  	err := a.Init(ctx, id)
    58  	if err != nil {
    59  		return nil, errors.Wrap(err, "can't init adapter")
    60  	}
    61  
    62  	return ls.js.CreateOrUpdateConsumer(ctx, streamName, jetstream.ConsumerConfig{
    63  		Name: name,
    64  		// Durable: name,
    65  		// FilterSubject: streamName,
    66  		DeliverPolicy: jetstream.DeliverAllPolicy,
    67  	})
    68  }
    69  
    70  // handleMessage will handle incoming message from logs stream and proxy it to given adapter
    71  func (ls *LogsService) handleMessage(ctx context.Context, a adapter.Adapter, id string) func(msg jetstream.Msg) {
    72  	log := ls.log.With("id", id, "adapter", a.Name())
    73  
    74  	return func(msg jetstream.Msg) {
    75  		if ls.traceMessages {
    76  			log.Debugw("got message", "data", string(msg.Data()))
    77  		}
    78  
    79  		// deliver to subscriber
    80  		logChunk := events.Log{}
    81  		err := json.Unmarshal(msg.Data(), &logChunk)
    82  		if err != nil {
    83  			if err := msg.Nak(); err != nil {
    84  				log.Errorw("error nacking message", "error", err)
    85  				return
    86  			}
    87  			return
    88  		}
    89  
    90  		err = a.Notify(ctx, id, logChunk)
    91  		if err != nil {
    92  			if err := msg.Nak(); err != nil {
    93  				log.Errorw("error nacking message", "error", err)
    94  				return
    95  			}
    96  			log.Errorw("error notifying adapter", "error", err)
    97  			return
    98  		}
    99  
   100  		if err := msg.Ack(); err != nil {
   101  			log.Errorw("error acking message", "error", err)
   102  		}
   103  	}
   104  }
   105  
   106  // handleStart will handle start event and create logs consumers, also manage state of given (execution) id
   107  func (ls *LogsService) handleStart(ctx context.Context, subject string) func(msg *nats.Msg) {
   108  	return func(msg *nats.Msg) {
   109  		event := events.Trigger{}
   110  		err := json.Unmarshal(msg.Data, &event)
   111  		if err != nil {
   112  			ls.log.Errorw("can't handle start event", "error", err)
   113  			return
   114  		}
   115  		id := event.ResourceId
   116  		log := ls.log.With("id", id, "event", "start")
   117  
   118  		ls.state.Put(ctx, id, state.LogStatePending)
   119  
   120  		s, err := ls.logStream.Init(ctx, id)
   121  		if err != nil {
   122  			ls.log.Errorw("error creating stream", "error", err, "id", id)
   123  			return
   124  		}
   125  
   126  		log.Infow("stream created", "stream", s)
   127  
   128  		streamName := StreamPrefix + id
   129  
   130  		// for each adapter create NATS consumer and consume stream from it e.g. cloud s3 or others
   131  		for i, adapter := range ls.adapters {
   132  			l := log.With("adapter", adapter.Name())
   133  			c, err := ls.initConsumer(ctx, adapter, streamName, id, i)
   134  			if err != nil {
   135  				log.Errorw("error creating consumer", "error", err)
   136  				return
   137  			}
   138  
   139  			// handle message per each adapter
   140  			l.Infow("consumer created", "consumer", c.CachedInfo(), "stream", streamName)
   141  			cons, err := c.Consume(ls.handleMessage(ctx, adapter, id))
   142  			if err != nil {
   143  				log.Errorw("error creating consumer", "error", err, "consumer", c.CachedInfo())
   144  				continue
   145  			}
   146  
   147  			consumerName := id + "_" + adapter.Name() + "_" + subject
   148  			// store consumer instance so we can stop it later in StopSubject handler
   149  			ls.consumerInstances.Store(consumerName, Consumer{
   150  				Name:     consumerName,
   151  				Context:  cons,
   152  				Instance: c,
   153  			})
   154  
   155  			l.Infow("consumer started", "adapter", adapter.Name(), "id", id, "stream", streamName)
   156  		}
   157  
   158  		// confirm when reply is set
   159  		if msg.Reply != "" {
   160  			// reply to start event that everything was initialized correctly
   161  			err = msg.Respond([]byte("ok"))
   162  			if err != nil {
   163  				log.Errorw("error responding to start event", "error", err)
   164  				return
   165  			}
   166  		}
   167  	}
   168  
   169  }
   170  
   171  // handleStop will handle stop event and stop logs consumers, also clean consumers state
   172  func (ls *LogsService) handleStop(ctx context.Context, group string) func(msg *nats.Msg) {
   173  	return func(msg *nats.Msg) {
   174  		var (
   175  			wg      sync.WaitGroup
   176  			stopped = 0
   177  			event   = events.Trigger{}
   178  		)
   179  
   180  		if ls.traceMessages {
   181  			ls.log.Debugw("got stop event", "data", string(msg.Data))
   182  		}
   183  
   184  		err := json.Unmarshal(msg.Data, &event)
   185  		if err != nil {
   186  			ls.log.Errorw("can't handle stop event", "error", err)
   187  			return
   188  		}
   189  
   190  		id := event.ResourceId
   191  
   192  		l := ls.log.With("id", id, "event", "stop")
   193  
   194  		if msg.Reply != "" {
   195  			err = msg.Respond([]byte("stop-queued"))
   196  			if err != nil {
   197  				l.Errorw("error responding to stop event", "error", err)
   198  			}
   199  		}
   200  
   201  		for _, adapter := range ls.adapters {
   202  			consumerName := id + "_" + adapter.Name() + "_" + group
   203  
   204  			// locate consumer on this pod
   205  			c, found := ls.consumerInstances.Load(consumerName)
   206  			if !found {
   207  				l.Debugw("consumer not found on this pod", "found", found, "name", consumerName)
   208  				continue
   209  			}
   210  			l.Debugw("consumer instance found", "c", c, "found", found, "name", consumerName)
   211  
   212  			// stop consumer
   213  			wg.Add(1)
   214  			stopped++
   215  			consumer := c.(Consumer)
   216  
   217  			go ls.stopConsumer(ctx, &wg, consumer, adapter, id)
   218  		}
   219  
   220  		wg.Wait()
   221  		l.Debugw("wait completed")
   222  
   223  		if stopped > 0 {
   224  			ls.state.Put(ctx, event.ResourceId, state.LogStateFinished)
   225  			l.Infow("execution logs consumers stopped", "id", event.ResourceId, "stopped", stopped)
   226  		} else {
   227  			l.Debugw("no consumers found on this pod to stop")
   228  		}
   229  	}
   230  }
   231  
   232  func (ls *LogsService) stopConsumer(ctx context.Context, wg *sync.WaitGroup, consumer Consumer, adapter adapter.Adapter, id string) {
   233  	defer wg.Done()
   234  
   235  	var (
   236  		info       *jetstream.ConsumerInfo
   237  		err        error
   238  		l          = ls.log
   239  		retries    = 0
   240  		maxRetries = 50
   241  	)
   242  
   243  	defer func() {
   244  		// send log finish message as consumer listening for logs needs to be closed
   245  		err = ls.logStream.Finish(ctx, id)
   246  		if err != nil {
   247  			ls.log.Errorw("log stream finish error")
   248  		}
   249  	}()
   250  
   251  	l.Debugw("stopping consumer", "name", consumer.Name)
   252  
   253  	// stop nats consumer
   254  	defer consumer.Context.Stop()
   255  
   256  	for {
   257  		info, err = consumer.Instance.Info(ctx)
   258  		if err != nil {
   259  			l.Errorw("error getting consumer info", "error", err, "name", consumer.Name)
   260  			return
   261  		}
   262  
   263  		nothingToProcess := info.NumAckPending == 0 && info.NumPending == 0
   264  		messagesDelivered := info.Delivered.Consumer > 0 && info.Delivered.Stream > 0
   265  
   266  		l.Debugw("consumer info", "nothingToProcess", nothingToProcess, "messagesDelivered", messagesDelivered, "info", info)
   267  
   268  		// check if there was some messages processed
   269  		if nothingToProcess && messagesDelivered {
   270  
   271  			// delete nats consumer instance from memory
   272  			ls.consumerInstances.Delete(consumer.Name)
   273  			l.Infow("stopping and removing consumer", "name", consumer.Name, "consumerSeq", info.Delivered.Consumer, "streamSeq", info.Delivered.Stream, "last", info.Delivered.Last)
   274  
   275  			// call adapter stop to handle given id
   276  			err := adapter.Stop(ctx, id)
   277  			if err != nil {
   278  				l.Errorw("stop error", "adapter", adapter.Name(), "error", err)
   279  			}
   280  			return
   281  		}
   282  
   283  		// retry if there is no messages processed as there could be slower logs
   284  		retries++
   285  		if retries >= maxRetries {
   286  			l.Errorw("error stopping consumer", "error", err, "name", consumer.Name, "consumerSeq", info.Delivered.Consumer, "streamSeq", info.Delivered.Stream, "last", info.Delivered.Last)
   287  			return
   288  		}
   289  
   290  		// pause a little bit
   291  		l.Debugw("waiting for consumer to finish", "name", consumer.Name, "retries", retries, "consumerSeq", info.Delivered.Consumer, "streamSeq", info.Delivered.Stream, "last", info.Delivered.Last)
   292  		time.Sleep(ls.stopPauseInterval)
   293  	}
   294  }
   295  
   296  type ConsumerStats struct {
   297  	Count int
   298  	Names []string
   299  }
   300  
   301  func (ls *LogsService) GetConsumersStats(ctx context.Context) (stats ConsumerStats) {
   302  
   303  	ls.consumerInstances.Range(func(key, value interface{}) bool {
   304  		stats.Count++
   305  		stats.Names = append(stats.Names, key.(string))
   306  		return true
   307  	})
   308  
   309  	return
   310  }