github.com/kubeshop/testkube@v1.17.23/pkg/tcl/testworkflowstcl/testworkflowcontroller/controller.go (about) 1 // Copyright 2024 Testkube. 2 // 3 // Licensed as a Testkube Pro file under the Testkube Community 4 // License (the "License"); you may not use this file except in compliance with 5 // the License. You may obtain a copy of the License at 6 // 7 // https://github.com/kubeshop/testkube/blob/main/licenses/TCL.txt 8 9 package testworkflowcontroller 10 11 import ( 12 "context" 13 "fmt" 14 "time" 15 16 "github.com/pkg/errors" 17 batchv1 "k8s.io/api/batch/v1" 18 corev1 "k8s.io/api/core/v1" 19 "k8s.io/client-go/kubernetes" 20 21 "github.com/kubeshop/testkube/internal/common" 22 "github.com/kubeshop/testkube/pkg/api/v1/testkube" 23 "github.com/kubeshop/testkube/pkg/tcl/testworkflowstcl/testworkflowprocessor" 24 "github.com/kubeshop/testkube/pkg/tcl/testworkflowstcl/testworkflowprocessor/constants" 25 ) 26 27 const ( 28 JobRetrievalTimeout = 1 * time.Second 29 JobEventRetrievalTimeout = 1 * time.Second 30 ) 31 32 var ( 33 ErrJobAborted = errors.New("job was aborted") 34 ErrJobTimeout = errors.New("timeout retrieving job") 35 ) 36 37 type Controller interface { 38 Abort(ctx context.Context) error 39 Cleanup(ctx context.Context) error 40 Watch(ctx context.Context) Watcher[Notification] 41 } 42 43 func New(parentCtx context.Context, clientSet kubernetes.Interface, namespace, id string, scheduledAt time.Time) (Controller, error) { 44 // Create local context for stopping all the processes 45 ctx, ctxCancel := context.WithCancel(parentCtx) 46 47 // Optimistically, start watching all the resources 48 job := WatchJob(ctx, clientSet, namespace, id, 1) 49 jobEvents := WatchJobEvents(ctx, clientSet, namespace, id, -1) 50 pod := WatchMainPod(ctx, clientSet, namespace, id, 1) 51 podEvents := WatchPodEventsByPodWatcher(ctx, clientSet, namespace, pod, -1) 52 53 // Ensure the main Job exists in the cluster, 54 // and obtain the signature 55 var sig []testworkflowprocessor.Signature 56 var err error 57 select { 58 case j := <-job.Any(ctx): 59 if j.Error != nil { 60 ctxCancel() 61 return nil, j.Error 62 } 63 sig, err = testworkflowprocessor.GetSignatureFromJSON([]byte(j.Value.Annotations[constants.SignatureAnnotationName])) 64 if err != nil { 65 ctxCancel() 66 return nil, errors.Wrap(err, "invalid job signature") 67 } 68 case <-time.After(JobRetrievalTimeout): 69 select { 70 case ev := <-jobEvents.Any(context.Background()): 71 if ev.Value != nil { 72 // Job was there, so it was aborted 73 err = ErrJobAborted 74 } 75 case <-time.After(JobEventRetrievalTimeout): 76 // The job is actually not found 77 err = ErrJobTimeout 78 } 79 ctxCancel() 80 return nil, err 81 } 82 83 // Build accessible controller 84 return &controller{ 85 id: id, 86 namespace: namespace, 87 scheduledAt: scheduledAt, 88 signature: sig, 89 clientSet: clientSet, 90 ctx: ctx, 91 ctxCancel: ctxCancel, 92 job: job, 93 jobEvents: jobEvents, 94 pod: pod, 95 podEvents: podEvents, 96 }, nil 97 } 98 99 type controller struct { 100 id string 101 namespace string 102 scheduledAt time.Time 103 signature []testworkflowprocessor.Signature 104 clientSet kubernetes.Interface 105 ctx context.Context 106 ctxCancel context.CancelFunc 107 job Watcher[*batchv1.Job] 108 jobEvents Watcher[*corev1.Event] 109 pod Watcher[*corev1.Pod] 110 podEvents Watcher[*corev1.Event] 111 } 112 113 func (c *controller) Abort(ctx context.Context) error { 114 return c.Cleanup(ctx) 115 } 116 117 func (c *controller) Cleanup(ctx context.Context) error { 118 return Cleanup(ctx, c.clientSet, c.namespace, c.id) 119 } 120 121 func (c *controller) Watch(parentCtx context.Context) Watcher[Notification] { 122 ctx, ctxCancel := context.WithCancel(parentCtx) 123 w := newWatcher[Notification](ctx, 0) 124 125 go func() { 126 defer w.Close() 127 defer ctxCancel() 128 129 sig := make([]testkube.TestWorkflowSignature, len(c.signature)) 130 for i, s := range c.signature { 131 sig[i] = s.ToInternal() 132 } 133 134 // Build initial result 135 result := testkube.TestWorkflowResult{ 136 Status: common.Ptr(testkube.QUEUED_TestWorkflowStatus), 137 PredictedStatus: common.Ptr(testkube.PASSED_TestWorkflowStatus), 138 Initialization: &testkube.TestWorkflowStepResult{ 139 Status: common.Ptr(testkube.QUEUED_TestWorkflowStepStatus), 140 }, 141 Steps: testworkflowprocessor.MapSignatureListToStepResults(c.signature), 142 } 143 144 // Emit initial empty result 145 w.SendValue(Notification{Result: result.Clone()}) 146 147 // Wait for the pod creation 148 for v := range WatchJobPreEvents(ctx, c.jobEvents, 0).Stream(ctx).Channel() { 149 if v.Error != nil { 150 w.SendError(v.Error) 151 continue 152 } 153 if v.Value.Reason == "SuccessfulCreate" { 154 result.QueuedAt = v.Value.CreationTimestamp.Time 155 } 156 if v.Value.Type == "Normal" { 157 continue 158 } 159 w.SendValue(Notification{ 160 Timestamp: v.Value.CreationTimestamp.Time, 161 Log: fmt.Sprintf("%s (%s) %s\n", v.Value.CreationTimestamp.Time.Format(KubernetesLogTimeFormat), v.Value.Reason, v.Value.Message), 162 }) 163 } 164 165 // Emit the result with queue time 166 if result.QueuedAt.IsZero() { 167 w.SendError(errors.New("job is in unknown state")) 168 return 169 } 170 w.SendValue(Notification{Result: result.Clone()}) 171 172 // Wait for the pod initialization 173 for v := range WatchPodPreEvents(ctx, c.podEvents, 0).Stream(ctx).Channel() { 174 if v.Error != nil { 175 w.SendError(v.Error) 176 continue 177 } 178 if v.Value.Reason == "Scheduled" { 179 result.StartedAt = v.Value.CreationTimestamp.Time 180 result.Status = common.Ptr(testkube.RUNNING_TestWorkflowStatus) 181 } 182 if v.Value.Type == "Normal" { 183 continue 184 } 185 w.SendValue(Notification{ 186 Timestamp: v.Value.CreationTimestamp.Time, 187 Log: fmt.Sprintf("%s (%s) %s\n", v.Value.CreationTimestamp.Time.Format(KubernetesLogTimeFormat), v.Value.Reason, v.Value.Message), 188 }) 189 } 190 191 // Emit the result with start time 192 if result.StartedAt.IsZero() { 193 w.SendError(errors.New("pod is in unknown state")) 194 return 195 } 196 w.SendValue(Notification{Result: result.Clone()}) 197 198 // Wait for the initialization container 199 for v := range WatchContainerPreEvents(ctx, c.podEvents, "tktw-init", 0, true).Stream(ctx).Channel() { 200 if v.Error != nil { 201 w.SendError(v.Error) 202 continue 203 } 204 if v.Value.Reason == "Created" { 205 result.Initialization.QueuedAt = v.Value.CreationTimestamp.Time 206 } else if v.Value.Reason == "Started" { 207 result.Initialization.StartedAt = v.Value.CreationTimestamp.Time 208 result.Initialization.Status = common.Ptr(testkube.RUNNING_TestWorkflowStepStatus) 209 } 210 if v.Value.Type == "Normal" { 211 continue 212 } 213 w.SendValue(Notification{ 214 Timestamp: v.Value.CreationTimestamp.Time, 215 Log: fmt.Sprintf("%s (%s) %s\n", v.Value.CreationTimestamp.Time.Format(KubernetesLogTimeFormat), v.Value.Reason, v.Value.Message), 216 }) 217 } 218 219 // Emit the result with start time 220 if result.Initialization.StartedAt.IsZero() { 221 w.SendError(errors.New("init container is in unknown state")) 222 return 223 } 224 w.SendValue(Notification{Result: result.Clone()}) 225 226 // Watch the initialization container logs 227 lastTs := result.Initialization.StartedAt 228 pod := (<-c.pod.Any(ctx)).Value 229 for v := range WatchContainerLogs(ctx, c.clientSet, c.podEvents, c.namespace, pod.Name, "tktw-init").Stream(ctx).Channel() { 230 if v.Error != nil { 231 w.SendError(v.Error) 232 continue 233 } 234 if v.Value.Time.After(lastTs) { 235 lastTs = v.Value.Time 236 } 237 // TODO: Calibrate clock with v.Value.Hint or just first/last timestamp here 238 w.SendValue(Notification{ 239 Timestamp: v.Value.Time, 240 Log: string(v.Value.Log), 241 }) 242 } 243 244 // Update the initialization container status 245 status, err := GetFinalContainerResult(ctx, c.pod, "tktw-init") 246 if err != nil { 247 w.SendError(err) 248 return 249 } 250 result.Initialization.FinishedAt = status.FinishedAt 251 if lastTs.After(result.Initialization.FinishedAt) { 252 result.Initialization.FinishedAt = lastTs 253 } 254 result.Initialization.Status = common.Ptr(status.Status) 255 if status.Status != testkube.PASSED_TestWorkflowStepStatus { 256 result.Status = common.Ptr(testkube.FAILED_TestWorkflowStatus) 257 result.PredictedStatus = result.Status 258 } 259 w.SendValue(Notification{Result: result.Clone()}) 260 261 // Cancel when the initialization has failed 262 if status.Status != testkube.PASSED_TestWorkflowStepStatus { 263 return 264 } 265 266 // Watch each of the containers 267 lastTs = result.Initialization.FinishedAt 268 for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) { 269 // Ignore not-standard TestWorkflow containers 270 if _, ok := result.Steps[container.Name]; !ok { 271 continue 272 } 273 274 // Send the step queued time 275 stepResult := result.Steps[container.Name] 276 stepResult = result.UpdateStepResult(sig, container.Name, testkube.TestWorkflowStepResult{ 277 QueuedAt: lastTs.UTC(), 278 }, c.scheduledAt) 279 w.SendValue(Notification{Result: result.Clone()}) 280 281 // Watch for the container events 282 lastEvTs := time.Time{} 283 for v := range WatchContainerPreEvents(ctx, c.podEvents, container.Name, 0, false).Stream(ctx).Channel() { 284 if v.Error != nil { 285 w.SendError(v.Error) 286 continue 287 } 288 if lastEvTs.Before(v.Value.CreationTimestamp.Time) { 289 lastEvTs = v.Value.CreationTimestamp.Time 290 } 291 if v.Value.Reason == "Created" { 292 stepResult = result.UpdateStepResult(sig, container.Name, testkube.TestWorkflowStepResult{ 293 QueuedAt: v.Value.CreationTimestamp.Time.UTC(), 294 }, c.scheduledAt) 295 } else if v.Value.Reason == "Started" { 296 stepResult = result.UpdateStepResult(sig, container.Name, testkube.TestWorkflowStepResult{ 297 StartedAt: v.Value.CreationTimestamp.Time.UTC(), 298 Status: common.Ptr(testkube.RUNNING_TestWorkflowStepStatus), 299 }, c.scheduledAt) 300 } 301 if v.Value.Type == "Normal" { 302 continue 303 } 304 w.SendValue(Notification{ 305 Timestamp: v.Value.CreationTimestamp.Time, 306 Ref: container.Name, 307 Log: fmt.Sprintf("%s (%s) %s\n", v.Value.CreationTimestamp.Time.Format(KubernetesLogTimeFormat), v.Value.Reason, v.Value.Message), 308 }) 309 } 310 311 // Emit the next result 312 if stepResult.StartedAt.IsZero() { 313 w.SendError(errors.New("step container is in unknown state")) 314 break 315 } 316 w.SendValue(Notification{Result: result.Clone()}) 317 318 // Watch for the container logs, outputs and statuses 319 for v := range WatchContainerLogs(ctx, c.clientSet, c.podEvents, c.namespace, pod.Name, container.Name).Stream(ctx).Channel() { 320 if v.Error != nil { 321 w.SendError(v.Error) 322 continue 323 } 324 if v.Value.Hint != nil { 325 if v.Value.Hint.Name == "start" && v.Value.Hint.Ref == container.Name { 326 if v.Value.Time.After(stepResult.StartedAt) { 327 stepResult = result.UpdateStepResult(sig, container.Name, testkube.TestWorkflowStepResult{ 328 StartedAt: v.Value.Time.UTC(), 329 }, c.scheduledAt) 330 } 331 } else if v.Value.Hint.Name == "status" { 332 status := testkube.TestWorkflowStepStatus(v.Value.Hint.Value.(string)) 333 if status == "" { 334 status = testkube.PASSED_TestWorkflowStepStatus 335 } 336 if _, ok := result.Steps[v.Value.Hint.Ref]; ok { 337 stepResult = result.UpdateStepResult(sig, v.Value.Hint.Ref, testkube.TestWorkflowStepResult{ 338 Status: &status, 339 }, c.scheduledAt) 340 } 341 } 342 continue 343 } 344 if v.Value.Output != nil { 345 if _, ok := result.Steps[v.Value.Output.Ref]; ok { 346 w.SendValue(Notification{ 347 Timestamp: v.Value.Time, 348 Ref: v.Value.Output.Ref, 349 Output: v.Value.Output, 350 }) 351 } 352 continue 353 } 354 w.SendValue(Notification{Timestamp: v.Value.Time, Ref: container.Name, Log: string(v.Value.Log)}) 355 } 356 357 // Watch container status 358 status, err := GetFinalContainerResult(ctx, c.pod, container.Name) 359 if err != nil { 360 w.SendError(err) 361 break 362 } 363 finishedAt := status.FinishedAt.UTC() 364 if !finishedAt.IsZero() && lastTs.After(finishedAt) { 365 finishedAt = lastTs.UTC() 366 } 367 stepResult = result.UpdateStepResult(sig, container.Name, testkube.TestWorkflowStepResult{ 368 FinishedAt: finishedAt, 369 ExitCode: float64(status.ExitCode), 370 Status: common.Ptr(status.Status), 371 }, c.scheduledAt) 372 w.SendValue(Notification{Result: result.Clone()}) 373 374 // Update the last timestamp 375 lastTs = finishedAt 376 377 // Break the function if the step has been aborted. 378 // Breaking only to the loop is not enough, 379 // because due to GKE bug, the Job is still pending, 380 // so it will get stuck there. 381 if status.Status == testkube.ABORTED_TestWorkflowStepStatus { 382 result.Recompute(sig, c.scheduledAt) 383 abortTs := result.Steps[container.Name].FinishedAt 384 if status.Details == "" { 385 status.Details = "Manual" 386 } 387 388 w.SendValue(Notification{ 389 Timestamp: abortTs, 390 Ref: container.Name, 391 Log: fmt.Sprintf("\n%s Aborted (%s)", abortTs.Format(KubernetesLogTimeFormat), status.Details), 392 }) 393 w.SendValue(Notification{Result: result.Clone()}) 394 return 395 } 396 } 397 398 // Read the pod finish time 399 for v := range c.job.Stream(ctx).Channel() { 400 if v.Value != nil && v.Value.Status.CompletionTime != nil { 401 result.FinishedAt = v.Value.Status.CompletionTime.Time 402 } 403 } 404 if result.FinishedAt.IsZero() { 405 for v := range c.pod.Stream(ctx).Channel() { 406 if v.Value != nil && v.Value.ObjectMeta.DeletionTimestamp != nil { 407 result.FinishedAt = v.Value.ObjectMeta.DeletionTimestamp.Time 408 } 409 } 410 } 411 412 // Compute the TestWorkflow status and dates 413 result.Recompute(sig, c.scheduledAt) 414 w.SendValue(Notification{Result: result.Clone()}) 415 }() 416 417 return w 418 }