github.com/kubeshop/testkube@v1.17.23/pkg/tcl/testworkflowstcl/testworkflowcontroller/controller.go (about)

     1  // Copyright 2024 Testkube.
     2  //
     3  // Licensed as a Testkube Pro file under the Testkube Community
     4  // License (the "License"); you may not use this file except in compliance with
     5  // the License. You may obtain a copy of the License at
     6  //
     7  //	https://github.com/kubeshop/testkube/blob/main/licenses/TCL.txt
     8  
     9  package testworkflowcontroller
    10  
    11  import (
    12  	"context"
    13  	"fmt"
    14  	"time"
    15  
    16  	"github.com/pkg/errors"
    17  	batchv1 "k8s.io/api/batch/v1"
    18  	corev1 "k8s.io/api/core/v1"
    19  	"k8s.io/client-go/kubernetes"
    20  
    21  	"github.com/kubeshop/testkube/internal/common"
    22  	"github.com/kubeshop/testkube/pkg/api/v1/testkube"
    23  	"github.com/kubeshop/testkube/pkg/tcl/testworkflowstcl/testworkflowprocessor"
    24  	"github.com/kubeshop/testkube/pkg/tcl/testworkflowstcl/testworkflowprocessor/constants"
    25  )
    26  
    27  const (
    28  	JobRetrievalTimeout      = 1 * time.Second
    29  	JobEventRetrievalTimeout = 1 * time.Second
    30  )
    31  
    32  var (
    33  	ErrJobAborted = errors.New("job was aborted")
    34  	ErrJobTimeout = errors.New("timeout retrieving job")
    35  )
    36  
    37  type Controller interface {
    38  	Abort(ctx context.Context) error
    39  	Cleanup(ctx context.Context) error
    40  	Watch(ctx context.Context) Watcher[Notification]
    41  }
    42  
    43  func New(parentCtx context.Context, clientSet kubernetes.Interface, namespace, id string, scheduledAt time.Time) (Controller, error) {
    44  	// Create local context for stopping all the processes
    45  	ctx, ctxCancel := context.WithCancel(parentCtx)
    46  
    47  	// Optimistically, start watching all the resources
    48  	job := WatchJob(ctx, clientSet, namespace, id, 1)
    49  	jobEvents := WatchJobEvents(ctx, clientSet, namespace, id, -1)
    50  	pod := WatchMainPod(ctx, clientSet, namespace, id, 1)
    51  	podEvents := WatchPodEventsByPodWatcher(ctx, clientSet, namespace, pod, -1)
    52  
    53  	// Ensure the main Job exists in the cluster,
    54  	// and obtain the signature
    55  	var sig []testworkflowprocessor.Signature
    56  	var err error
    57  	select {
    58  	case j := <-job.Any(ctx):
    59  		if j.Error != nil {
    60  			ctxCancel()
    61  			return nil, j.Error
    62  		}
    63  		sig, err = testworkflowprocessor.GetSignatureFromJSON([]byte(j.Value.Annotations[constants.SignatureAnnotationName]))
    64  		if err != nil {
    65  			ctxCancel()
    66  			return nil, errors.Wrap(err, "invalid job signature")
    67  		}
    68  	case <-time.After(JobRetrievalTimeout):
    69  		select {
    70  		case ev := <-jobEvents.Any(context.Background()):
    71  			if ev.Value != nil {
    72  				// Job was there, so it was aborted
    73  				err = ErrJobAborted
    74  			}
    75  		case <-time.After(JobEventRetrievalTimeout):
    76  			// The job is actually not found
    77  			err = ErrJobTimeout
    78  		}
    79  		ctxCancel()
    80  		return nil, err
    81  	}
    82  
    83  	// Build accessible controller
    84  	return &controller{
    85  		id:          id,
    86  		namespace:   namespace,
    87  		scheduledAt: scheduledAt,
    88  		signature:   sig,
    89  		clientSet:   clientSet,
    90  		ctx:         ctx,
    91  		ctxCancel:   ctxCancel,
    92  		job:         job,
    93  		jobEvents:   jobEvents,
    94  		pod:         pod,
    95  		podEvents:   podEvents,
    96  	}, nil
    97  }
    98  
    99  type controller struct {
   100  	id          string
   101  	namespace   string
   102  	scheduledAt time.Time
   103  	signature   []testworkflowprocessor.Signature
   104  	clientSet   kubernetes.Interface
   105  	ctx         context.Context
   106  	ctxCancel   context.CancelFunc
   107  	job         Watcher[*batchv1.Job]
   108  	jobEvents   Watcher[*corev1.Event]
   109  	pod         Watcher[*corev1.Pod]
   110  	podEvents   Watcher[*corev1.Event]
   111  }
   112  
   113  func (c *controller) Abort(ctx context.Context) error {
   114  	return c.Cleanup(ctx)
   115  }
   116  
   117  func (c *controller) Cleanup(ctx context.Context) error {
   118  	return Cleanup(ctx, c.clientSet, c.namespace, c.id)
   119  }
   120  
   121  func (c *controller) Watch(parentCtx context.Context) Watcher[Notification] {
   122  	ctx, ctxCancel := context.WithCancel(parentCtx)
   123  	w := newWatcher[Notification](ctx, 0)
   124  
   125  	go func() {
   126  		defer w.Close()
   127  		defer ctxCancel()
   128  
   129  		sig := make([]testkube.TestWorkflowSignature, len(c.signature))
   130  		for i, s := range c.signature {
   131  			sig[i] = s.ToInternal()
   132  		}
   133  
   134  		// Build initial result
   135  		result := testkube.TestWorkflowResult{
   136  			Status:          common.Ptr(testkube.QUEUED_TestWorkflowStatus),
   137  			PredictedStatus: common.Ptr(testkube.PASSED_TestWorkflowStatus),
   138  			Initialization: &testkube.TestWorkflowStepResult{
   139  				Status: common.Ptr(testkube.QUEUED_TestWorkflowStepStatus),
   140  			},
   141  			Steps: testworkflowprocessor.MapSignatureListToStepResults(c.signature),
   142  		}
   143  
   144  		// Emit initial empty result
   145  		w.SendValue(Notification{Result: result.Clone()})
   146  
   147  		// Wait for the pod creation
   148  		for v := range WatchJobPreEvents(ctx, c.jobEvents, 0).Stream(ctx).Channel() {
   149  			if v.Error != nil {
   150  				w.SendError(v.Error)
   151  				continue
   152  			}
   153  			if v.Value.Reason == "SuccessfulCreate" {
   154  				result.QueuedAt = v.Value.CreationTimestamp.Time
   155  			}
   156  			if v.Value.Type == "Normal" {
   157  				continue
   158  			}
   159  			w.SendValue(Notification{
   160  				Timestamp: v.Value.CreationTimestamp.Time,
   161  				Log:       fmt.Sprintf("%s (%s) %s\n", v.Value.CreationTimestamp.Time.Format(KubernetesLogTimeFormat), v.Value.Reason, v.Value.Message),
   162  			})
   163  		}
   164  
   165  		// Emit the result with queue time
   166  		if result.QueuedAt.IsZero() {
   167  			w.SendError(errors.New("job is in unknown state"))
   168  			return
   169  		}
   170  		w.SendValue(Notification{Result: result.Clone()})
   171  
   172  		// Wait for the pod initialization
   173  		for v := range WatchPodPreEvents(ctx, c.podEvents, 0).Stream(ctx).Channel() {
   174  			if v.Error != nil {
   175  				w.SendError(v.Error)
   176  				continue
   177  			}
   178  			if v.Value.Reason == "Scheduled" {
   179  				result.StartedAt = v.Value.CreationTimestamp.Time
   180  				result.Status = common.Ptr(testkube.RUNNING_TestWorkflowStatus)
   181  			}
   182  			if v.Value.Type == "Normal" {
   183  				continue
   184  			}
   185  			w.SendValue(Notification{
   186  				Timestamp: v.Value.CreationTimestamp.Time,
   187  				Log:       fmt.Sprintf("%s (%s) %s\n", v.Value.CreationTimestamp.Time.Format(KubernetesLogTimeFormat), v.Value.Reason, v.Value.Message),
   188  			})
   189  		}
   190  
   191  		// Emit the result with start time
   192  		if result.StartedAt.IsZero() {
   193  			w.SendError(errors.New("pod is in unknown state"))
   194  			return
   195  		}
   196  		w.SendValue(Notification{Result: result.Clone()})
   197  
   198  		// Wait for the initialization container
   199  		for v := range WatchContainerPreEvents(ctx, c.podEvents, "tktw-init", 0, true).Stream(ctx).Channel() {
   200  			if v.Error != nil {
   201  				w.SendError(v.Error)
   202  				continue
   203  			}
   204  			if v.Value.Reason == "Created" {
   205  				result.Initialization.QueuedAt = v.Value.CreationTimestamp.Time
   206  			} else if v.Value.Reason == "Started" {
   207  				result.Initialization.StartedAt = v.Value.CreationTimestamp.Time
   208  				result.Initialization.Status = common.Ptr(testkube.RUNNING_TestWorkflowStepStatus)
   209  			}
   210  			if v.Value.Type == "Normal" {
   211  				continue
   212  			}
   213  			w.SendValue(Notification{
   214  				Timestamp: v.Value.CreationTimestamp.Time,
   215  				Log:       fmt.Sprintf("%s (%s) %s\n", v.Value.CreationTimestamp.Time.Format(KubernetesLogTimeFormat), v.Value.Reason, v.Value.Message),
   216  			})
   217  		}
   218  
   219  		// Emit the result with start time
   220  		if result.Initialization.StartedAt.IsZero() {
   221  			w.SendError(errors.New("init container is in unknown state"))
   222  			return
   223  		}
   224  		w.SendValue(Notification{Result: result.Clone()})
   225  
   226  		// Watch the initialization container logs
   227  		lastTs := result.Initialization.StartedAt
   228  		pod := (<-c.pod.Any(ctx)).Value
   229  		for v := range WatchContainerLogs(ctx, c.clientSet, c.podEvents, c.namespace, pod.Name, "tktw-init").Stream(ctx).Channel() {
   230  			if v.Error != nil {
   231  				w.SendError(v.Error)
   232  				continue
   233  			}
   234  			if v.Value.Time.After(lastTs) {
   235  				lastTs = v.Value.Time
   236  			}
   237  			// TODO: Calibrate clock with v.Value.Hint or just first/last timestamp here
   238  			w.SendValue(Notification{
   239  				Timestamp: v.Value.Time,
   240  				Log:       string(v.Value.Log),
   241  			})
   242  		}
   243  
   244  		// Update the initialization container status
   245  		status, err := GetFinalContainerResult(ctx, c.pod, "tktw-init")
   246  		if err != nil {
   247  			w.SendError(err)
   248  			return
   249  		}
   250  		result.Initialization.FinishedAt = status.FinishedAt
   251  		if lastTs.After(result.Initialization.FinishedAt) {
   252  			result.Initialization.FinishedAt = lastTs
   253  		}
   254  		result.Initialization.Status = common.Ptr(status.Status)
   255  		if status.Status != testkube.PASSED_TestWorkflowStepStatus {
   256  			result.Status = common.Ptr(testkube.FAILED_TestWorkflowStatus)
   257  			result.PredictedStatus = result.Status
   258  		}
   259  		w.SendValue(Notification{Result: result.Clone()})
   260  
   261  		// Cancel when the initialization has failed
   262  		if status.Status != testkube.PASSED_TestWorkflowStepStatus {
   263  			return
   264  		}
   265  
   266  		// Watch each of the containers
   267  		lastTs = result.Initialization.FinishedAt
   268  		for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
   269  			// Ignore not-standard TestWorkflow containers
   270  			if _, ok := result.Steps[container.Name]; !ok {
   271  				continue
   272  			}
   273  
   274  			// Send the step queued time
   275  			stepResult := result.Steps[container.Name]
   276  			stepResult = result.UpdateStepResult(sig, container.Name, testkube.TestWorkflowStepResult{
   277  				QueuedAt: lastTs.UTC(),
   278  			}, c.scheduledAt)
   279  			w.SendValue(Notification{Result: result.Clone()})
   280  
   281  			// Watch for the container events
   282  			lastEvTs := time.Time{}
   283  			for v := range WatchContainerPreEvents(ctx, c.podEvents, container.Name, 0, false).Stream(ctx).Channel() {
   284  				if v.Error != nil {
   285  					w.SendError(v.Error)
   286  					continue
   287  				}
   288  				if lastEvTs.Before(v.Value.CreationTimestamp.Time) {
   289  					lastEvTs = v.Value.CreationTimestamp.Time
   290  				}
   291  				if v.Value.Reason == "Created" {
   292  					stepResult = result.UpdateStepResult(sig, container.Name, testkube.TestWorkflowStepResult{
   293  						QueuedAt: v.Value.CreationTimestamp.Time.UTC(),
   294  					}, c.scheduledAt)
   295  				} else if v.Value.Reason == "Started" {
   296  					stepResult = result.UpdateStepResult(sig, container.Name, testkube.TestWorkflowStepResult{
   297  						StartedAt: v.Value.CreationTimestamp.Time.UTC(),
   298  						Status:    common.Ptr(testkube.RUNNING_TestWorkflowStepStatus),
   299  					}, c.scheduledAt)
   300  				}
   301  				if v.Value.Type == "Normal" {
   302  					continue
   303  				}
   304  				w.SendValue(Notification{
   305  					Timestamp: v.Value.CreationTimestamp.Time,
   306  					Ref:       container.Name,
   307  					Log:       fmt.Sprintf("%s (%s) %s\n", v.Value.CreationTimestamp.Time.Format(KubernetesLogTimeFormat), v.Value.Reason, v.Value.Message),
   308  				})
   309  			}
   310  
   311  			// Emit the next result
   312  			if stepResult.StartedAt.IsZero() {
   313  				w.SendError(errors.New("step container is in unknown state"))
   314  				break
   315  			}
   316  			w.SendValue(Notification{Result: result.Clone()})
   317  
   318  			// Watch for the container logs, outputs and statuses
   319  			for v := range WatchContainerLogs(ctx, c.clientSet, c.podEvents, c.namespace, pod.Name, container.Name).Stream(ctx).Channel() {
   320  				if v.Error != nil {
   321  					w.SendError(v.Error)
   322  					continue
   323  				}
   324  				if v.Value.Hint != nil {
   325  					if v.Value.Hint.Name == "start" && v.Value.Hint.Ref == container.Name {
   326  						if v.Value.Time.After(stepResult.StartedAt) {
   327  							stepResult = result.UpdateStepResult(sig, container.Name, testkube.TestWorkflowStepResult{
   328  								StartedAt: v.Value.Time.UTC(),
   329  							}, c.scheduledAt)
   330  						}
   331  					} else if v.Value.Hint.Name == "status" {
   332  						status := testkube.TestWorkflowStepStatus(v.Value.Hint.Value.(string))
   333  						if status == "" {
   334  							status = testkube.PASSED_TestWorkflowStepStatus
   335  						}
   336  						if _, ok := result.Steps[v.Value.Hint.Ref]; ok {
   337  							stepResult = result.UpdateStepResult(sig, v.Value.Hint.Ref, testkube.TestWorkflowStepResult{
   338  								Status: &status,
   339  							}, c.scheduledAt)
   340  						}
   341  					}
   342  					continue
   343  				}
   344  				if v.Value.Output != nil {
   345  					if _, ok := result.Steps[v.Value.Output.Ref]; ok {
   346  						w.SendValue(Notification{
   347  							Timestamp: v.Value.Time,
   348  							Ref:       v.Value.Output.Ref,
   349  							Output:    v.Value.Output,
   350  						})
   351  					}
   352  					continue
   353  				}
   354  				w.SendValue(Notification{Timestamp: v.Value.Time, Ref: container.Name, Log: string(v.Value.Log)})
   355  			}
   356  
   357  			// Watch container status
   358  			status, err := GetFinalContainerResult(ctx, c.pod, container.Name)
   359  			if err != nil {
   360  				w.SendError(err)
   361  				break
   362  			}
   363  			finishedAt := status.FinishedAt.UTC()
   364  			if !finishedAt.IsZero() && lastTs.After(finishedAt) {
   365  				finishedAt = lastTs.UTC()
   366  			}
   367  			stepResult = result.UpdateStepResult(sig, container.Name, testkube.TestWorkflowStepResult{
   368  				FinishedAt: finishedAt,
   369  				ExitCode:   float64(status.ExitCode),
   370  				Status:     common.Ptr(status.Status),
   371  			}, c.scheduledAt)
   372  			w.SendValue(Notification{Result: result.Clone()})
   373  
   374  			// Update the last timestamp
   375  			lastTs = finishedAt
   376  
   377  			// Break the function if the step has been aborted.
   378  			// Breaking only to the loop is not enough,
   379  			// because due to GKE bug, the Job is still pending,
   380  			// so it will get stuck there.
   381  			if status.Status == testkube.ABORTED_TestWorkflowStepStatus {
   382  				result.Recompute(sig, c.scheduledAt)
   383  				abortTs := result.Steps[container.Name].FinishedAt
   384  				if status.Details == "" {
   385  					status.Details = "Manual"
   386  				}
   387  
   388  				w.SendValue(Notification{
   389  					Timestamp: abortTs,
   390  					Ref:       container.Name,
   391  					Log:       fmt.Sprintf("\n%s Aborted (%s)", abortTs.Format(KubernetesLogTimeFormat), status.Details),
   392  				})
   393  				w.SendValue(Notification{Result: result.Clone()})
   394  				return
   395  			}
   396  		}
   397  
   398  		// Read the pod finish time
   399  		for v := range c.job.Stream(ctx).Channel() {
   400  			if v.Value != nil && v.Value.Status.CompletionTime != nil {
   401  				result.FinishedAt = v.Value.Status.CompletionTime.Time
   402  			}
   403  		}
   404  		if result.FinishedAt.IsZero() {
   405  			for v := range c.pod.Stream(ctx).Channel() {
   406  				if v.Value != nil && v.Value.ObjectMeta.DeletionTimestamp != nil {
   407  					result.FinishedAt = v.Value.ObjectMeta.DeletionTimestamp.Time
   408  				}
   409  			}
   410  		}
   411  
   412  		// Compute the TestWorkflow status and dates
   413  		result.Recompute(sig, c.scheduledAt)
   414  		w.SendValue(Notification{Result: result.Clone()})
   415  	}()
   416  
   417  	return w
   418  }