github.com/kubeshop/testkube@v1.17.23/pkg/logs/events.go (about) 1 package logs 2 3 import ( 4 "context" 5 "encoding/json" 6 "fmt" 7 "sync" 8 "time" 9 10 "github.com/nats-io/nats.go" 11 "github.com/nats-io/nats.go/jetstream" 12 "github.com/pkg/errors" 13 14 "github.com/kubeshop/testkube/pkg/api/v1/testkube" 15 "github.com/kubeshop/testkube/pkg/logs/adapter" 16 "github.com/kubeshop/testkube/pkg/logs/events" 17 "github.com/kubeshop/testkube/pkg/logs/state" 18 ) 19 20 const ( 21 StatusPending byte = 1 22 StatusFinished byte = 2 23 24 StreamPrefix = "log" 25 26 StartQueue = "logsstart" 27 StopQueue = "logsstop" 28 29 LogStartSubject = "events.logs.start" 30 LogStopSubject = "events.logs.stop" 31 ) 32 33 var ( 34 StartSubjects = map[string]string{ 35 "test": testkube.TestStartSubject, 36 "generic": LogStartSubject, 37 } 38 39 StopSubjects = map[string]string{ 40 "test": testkube.TestStopSubject, 41 "generic": LogStopSubject, 42 } 43 ) 44 45 type Consumer struct { 46 // Name of the consumer 47 Name string 48 // Context is a consumer context you can call Stop() method on it when no more messages are expected 49 Context jetstream.ConsumeContext 50 // Instance is a NATS consumer instance 51 Instance jetstream.Consumer 52 } 53 54 func (ls *LogsService) initConsumer(ctx context.Context, a adapter.Adapter, streamName, id string, i int) (jetstream.Consumer, error) { 55 name := fmt.Sprintf("lc%s%s%d", id, a.Name(), i) 56 57 err := a.Init(ctx, id) 58 if err != nil { 59 return nil, errors.Wrap(err, "can't init adapter") 60 } 61 62 return ls.js.CreateOrUpdateConsumer(ctx, streamName, jetstream.ConsumerConfig{ 63 Name: name, 64 // Durable: name, 65 // FilterSubject: streamName, 66 DeliverPolicy: jetstream.DeliverAllPolicy, 67 }) 68 } 69 70 // handleMessage will handle incoming message from logs stream and proxy it to given adapter 71 func (ls *LogsService) handleMessage(ctx context.Context, a adapter.Adapter, id string) func(msg jetstream.Msg) { 72 log := ls.log.With("id", id, "adapter", a.Name()) 73 74 return func(msg jetstream.Msg) { 75 if ls.traceMessages { 76 log.Debugw("got message", "data", string(msg.Data())) 77 } 78 79 // deliver to subscriber 80 logChunk := events.Log{} 81 err := json.Unmarshal(msg.Data(), &logChunk) 82 if err != nil { 83 if err := msg.Nak(); err != nil { 84 log.Errorw("error nacking message", "error", err) 85 return 86 } 87 return 88 } 89 90 err = a.Notify(ctx, id, logChunk) 91 if err != nil { 92 if err := msg.Nak(); err != nil { 93 log.Errorw("error nacking message", "error", err) 94 return 95 } 96 log.Errorw("error notifying adapter", "error", err) 97 return 98 } 99 100 if err := msg.Ack(); err != nil { 101 log.Errorw("error acking message", "error", err) 102 } 103 } 104 } 105 106 // handleStart will handle start event and create logs consumers, also manage state of given (execution) id 107 func (ls *LogsService) handleStart(ctx context.Context, subject string) func(msg *nats.Msg) { 108 return func(msg *nats.Msg) { 109 event := events.Trigger{} 110 err := json.Unmarshal(msg.Data, &event) 111 if err != nil { 112 ls.log.Errorw("can't handle start event", "error", err) 113 return 114 } 115 id := event.ResourceId 116 log := ls.log.With("id", id, "event", "start") 117 118 ls.state.Put(ctx, id, state.LogStatePending) 119 120 s, err := ls.logStream.Init(ctx, id) 121 if err != nil { 122 ls.log.Errorw("error creating stream", "error", err, "id", id) 123 return 124 } 125 126 log.Infow("stream created", "stream", s) 127 128 streamName := StreamPrefix + id 129 130 // for each adapter create NATS consumer and consume stream from it e.g. cloud s3 or others 131 for i, adapter := range ls.adapters { 132 l := log.With("adapter", adapter.Name()) 133 c, err := ls.initConsumer(ctx, adapter, streamName, id, i) 134 if err != nil { 135 log.Errorw("error creating consumer", "error", err) 136 return 137 } 138 139 // handle message per each adapter 140 l.Infow("consumer created", "consumer", c.CachedInfo(), "stream", streamName) 141 cons, err := c.Consume(ls.handleMessage(ctx, adapter, id)) 142 if err != nil { 143 log.Errorw("error creating consumer", "error", err, "consumer", c.CachedInfo()) 144 continue 145 } 146 147 consumerName := id + "_" + adapter.Name() + "_" + subject 148 // store consumer instance so we can stop it later in StopSubject handler 149 ls.consumerInstances.Store(consumerName, Consumer{ 150 Name: consumerName, 151 Context: cons, 152 Instance: c, 153 }) 154 155 l.Infow("consumer started", "adapter", adapter.Name(), "id", id, "stream", streamName) 156 } 157 158 // confirm when reply is set 159 if msg.Reply != "" { 160 // reply to start event that everything was initialized correctly 161 err = msg.Respond([]byte("ok")) 162 if err != nil { 163 log.Errorw("error responding to start event", "error", err) 164 return 165 } 166 } 167 } 168 169 } 170 171 // handleStop will handle stop event and stop logs consumers, also clean consumers state 172 func (ls *LogsService) handleStop(ctx context.Context, group string) func(msg *nats.Msg) { 173 return func(msg *nats.Msg) { 174 var ( 175 wg sync.WaitGroup 176 stopped = 0 177 event = events.Trigger{} 178 ) 179 180 if ls.traceMessages { 181 ls.log.Debugw("got stop event", "data", string(msg.Data)) 182 } 183 184 err := json.Unmarshal(msg.Data, &event) 185 if err != nil { 186 ls.log.Errorw("can't handle stop event", "error", err) 187 return 188 } 189 190 id := event.ResourceId 191 192 l := ls.log.With("id", id, "event", "stop") 193 194 if msg.Reply != "" { 195 err = msg.Respond([]byte("stop-queued")) 196 if err != nil { 197 l.Errorw("error responding to stop event", "error", err) 198 } 199 } 200 201 for _, adapter := range ls.adapters { 202 consumerName := id + "_" + adapter.Name() + "_" + group 203 204 // locate consumer on this pod 205 c, found := ls.consumerInstances.Load(consumerName) 206 if !found { 207 l.Debugw("consumer not found on this pod", "found", found, "name", consumerName) 208 continue 209 } 210 l.Debugw("consumer instance found", "c", c, "found", found, "name", consumerName) 211 212 // stop consumer 213 wg.Add(1) 214 stopped++ 215 consumer := c.(Consumer) 216 217 go ls.stopConsumer(ctx, &wg, consumer, adapter, id) 218 } 219 220 wg.Wait() 221 l.Debugw("wait completed") 222 223 if stopped > 0 { 224 ls.state.Put(ctx, event.ResourceId, state.LogStateFinished) 225 l.Infow("execution logs consumers stopped", "id", event.ResourceId, "stopped", stopped) 226 } else { 227 l.Debugw("no consumers found on this pod to stop") 228 } 229 } 230 } 231 232 func (ls *LogsService) stopConsumer(ctx context.Context, wg *sync.WaitGroup, consumer Consumer, adapter adapter.Adapter, id string) { 233 defer wg.Done() 234 235 var ( 236 info *jetstream.ConsumerInfo 237 err error 238 l = ls.log 239 retries = 0 240 maxRetries = 50 241 ) 242 243 defer func() { 244 // send log finish message as consumer listening for logs needs to be closed 245 err = ls.logStream.Finish(ctx, id) 246 if err != nil { 247 ls.log.Errorw("log stream finish error") 248 } 249 }() 250 251 l.Debugw("stopping consumer", "name", consumer.Name) 252 253 // stop nats consumer 254 defer consumer.Context.Stop() 255 256 for { 257 info, err = consumer.Instance.Info(ctx) 258 if err != nil { 259 l.Errorw("error getting consumer info", "error", err, "name", consumer.Name) 260 return 261 } 262 263 nothingToProcess := info.NumAckPending == 0 && info.NumPending == 0 264 messagesDelivered := info.Delivered.Consumer > 0 && info.Delivered.Stream > 0 265 266 l.Debugw("consumer info", "nothingToProcess", nothingToProcess, "messagesDelivered", messagesDelivered, "info", info) 267 268 // check if there was some messages processed 269 if nothingToProcess && messagesDelivered { 270 271 // delete nats consumer instance from memory 272 ls.consumerInstances.Delete(consumer.Name) 273 l.Infow("stopping and removing consumer", "name", consumer.Name, "consumerSeq", info.Delivered.Consumer, "streamSeq", info.Delivered.Stream, "last", info.Delivered.Last) 274 275 // call adapter stop to handle given id 276 err := adapter.Stop(ctx, id) 277 if err != nil { 278 l.Errorw("stop error", "adapter", adapter.Name(), "error", err) 279 } 280 return 281 } 282 283 // retry if there is no messages processed as there could be slower logs 284 retries++ 285 if retries >= maxRetries { 286 l.Errorw("error stopping consumer", "error", err, "name", consumer.Name, "consumerSeq", info.Delivered.Consumer, "streamSeq", info.Delivered.Stream, "last", info.Delivered.Last) 287 return 288 } 289 290 // pause a little bit 291 l.Debugw("waiting for consumer to finish", "name", consumer.Name, "retries", retries, "consumerSeq", info.Delivered.Consumer, "streamSeq", info.Delivered.Stream, "last", info.Delivered.Last) 292 time.Sleep(ls.stopPauseInterval) 293 } 294 } 295 296 type ConsumerStats struct { 297 Count int 298 Names []string 299 } 300 301 func (ls *LogsService) GetConsumersStats(ctx context.Context) (stats ConsumerStats) { 302 303 ls.consumerInstances.Range(func(key, value interface{}) bool { 304 stats.Count++ 305 stats.Names = append(stats.Names, key.(string)) 306 return true 307 }) 308 309 return 310 }