github.com/kubevela/workflow@v0.6.0/pkg/executor/workflow.go

/*
Copyright 2022 The KubeVela Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package executor

import (
	"context"
	"fmt"
	"math"
	"sync"
	"time"

	"github.com/pkg/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apiserver/pkg/endpoints/request"
	"k8s.io/apiserver/pkg/util/feature"
	"sigs.k8s.io/controller-runtime/pkg/client"

	monitorContext "github.com/kubevela/pkg/monitor/context"

	"github.com/kubevela/workflow/api/v1alpha1"
	wfContext "github.com/kubevela/workflow/pkg/context"
	"github.com/kubevela/workflow/pkg/cue/model/value"
	"github.com/kubevela/workflow/pkg/debug"
	"github.com/kubevela/workflow/pkg/features"
	"github.com/kubevela/workflow/pkg/hooks"
	"github.com/kubevela/workflow/pkg/monitor/metrics"
	"github.com/kubevela/workflow/pkg/providers/workspace"
	"github.com/kubevela/workflow/pkg/tasks/custom"
	"github.com/kubevela/workflow/pkg/types"
)

var (
	// DisableRecorder optimizes the workflow by disabling the recorder
	DisableRecorder = false
	// StepStatusCache caches the step statuses across reconciles
	StepStatusCache sync.Map
)

const (
	// minWorkflowBackoffWaitTime is the minimum time, in seconds, to wait before reconciling the workflow again
	minWorkflowBackoffWaitTime = 1
	// backoffTimeCoefficient is the coefficient used to compute the backoff time before reconciling the workflow again
	backoffTimeCoefficient = 0.05
)
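
// Illustrative backoff arithmetic (see getBackoffWaitTime below): with
// backoffTimeCoefficient = 0.05, a step that has backed off 8 times yields an
// interval of int(math.Pow(2, 8) * 0.05) = int(12.8) = 12 seconds, while 4
// backoffs yield int(math.Pow(2, 4) * 0.05) = int(0.8) = 0, which is clamped
// up to minWorkflowBackoffWaitTime.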

type workflowExecutor struct {
	instance *types.WorkflowInstance
	cli      client.Client
	wfCtx    wfContext.Context
	patcher  types.StatusPatcher
}

// New returns a Workflow Executor implementation.
func New(instance *types.WorkflowInstance, cli client.Client, patcher types.StatusPatcher) WorkflowExecutor {
	return &workflowExecutor{
		instance: instance,
		cli:      cli,
		patcher:  patcher,
	}
}

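// A minimal usage sketch (illustrative only; the surrounding controller
// wiring is assumed and not part of this file):
//
//	executor := New(instance, cli, patcher)
//	phase, err := executor.ExecuteRunners(ctx, taskRunners)
//	if err == nil && phase == v1alpha1.WorkflowStateExecuting {
//		requeueAfter := executor.GetBackoffWaitTime()
//		_ = requeueAfter // requeue the reconcile after this duration
//	}
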
// InitializeWorkflowInstance initializes the status of the workflow instance if the workflow has not started yet
func InitializeWorkflowInstance(instance *types.WorkflowInstance) {
	if instance.Status.StartTime.IsZero() && len(instance.Status.Steps) == 0 {
		metrics.WorkflowRunInitializedCounter.WithLabelValues().Inc()
		mode := v1alpha1.WorkflowExecuteMode{
			Steps:    v1alpha1.WorkflowModeStep,
			SubSteps: v1alpha1.WorkflowModeDAG,
		}
		if instance.Mode != nil {
			if instance.Mode.Steps != "" {
				mode.Steps = instance.Mode.Steps
			}
			if instance.Mode.SubSteps != "" {
				mode.SubSteps = instance.Mode.SubSteps
			}
		}
		instance.Status = v1alpha1.WorkflowRunStatus{
			Mode:      mode,
			StartTime: metav1.Now(),
		}
		StepStatusCache.Delete(fmt.Sprintf("%s-%s", instance.Name, instance.Namespace))
		wfContext.CleanupMemoryStore(instance.Name, instance.Namespace)
	}
}

// ExecuteRunners executes the workflow task runners in order.
func (w *workflowExecutor) ExecuteRunners(ctx monitorContext.Context, taskRunners []types.TaskRunner) (v1alpha1.WorkflowRunPhase, error) {
	InitializeWorkflowInstance(w.instance)
	status := &w.instance.Status
	dagMode := status.Mode.Steps == v1alpha1.WorkflowModeDAG
	cacheKey := fmt.Sprintf("%s-%s", w.instance.Name, w.instance.Namespace)

	allRunnersDone, allRunnersSucceeded := checkRunners(taskRunners, w.instance.Status)
	if status.Finished {
		StepStatusCache.Delete(cacheKey)
	}
	if checkWorkflowTerminated(status, allRunnersDone) {
		if isTerminatedManually(status) {
			return v1alpha1.WorkflowStateTerminated, nil
		}
		return v1alpha1.WorkflowStateFailed, nil
	}
	if checkWorkflowSuspended(status) {
		return v1alpha1.WorkflowStateSuspending, nil
	}
	if allRunnersSucceeded {
		return v1alpha1.WorkflowStateSucceeded, nil
	}

	wfCtx, err := w.makeContext(ctx, w.instance.Name)
	if err != nil {
		ctx.Error(err, "make context")
		return v1alpha1.WorkflowStateExecuting, err
	}
	w.wfCtx = wfCtx

	if cacheValue, ok := StepStatusCache.Load(cacheKey); ok {
		// if the status records fewer steps than the cached count, the status is stale and this reconcile can be skipped
		if len(status.Steps) < cacheValue.(int) {
			return v1alpha1.WorkflowStateSkipped, nil
		}
	}

	e := newEngine(ctx, wfCtx, w, status, taskRunners)

	err = e.Run(ctx, taskRunners, dagMode)
	if err != nil {
		ctx.Error(err, "run steps")
		StepStatusCache.Store(cacheKey, len(status.Steps))
		return v1alpha1.WorkflowStateExecuting, err
	}

	StepStatusCache.Store(cacheKey, len(status.Steps))
	if feature.DefaultMutableFeatureGate.Enabled(features.EnablePatchStatusAtOnce) {
		return e.status.Phase, nil
	}
	return e.checkWorkflowPhase(), nil
}

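// isTerminatedManually reports whether the workflow was terminated manually:
// it returns true only when at least one step failed with the Terminate
// reason and no step failed for any other reason.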
func isTerminatedManually(status *v1alpha1.WorkflowRunStatus) bool {
	manually := false
	for _, step := range status.Steps {
		if step.Phase == v1alpha1.WorkflowStepPhaseFailed {
			if step.Reason == types.StatusReasonTerminate {
				manually = true
			} else {
				return false
			}
		}
	}
	return manually
}

func checkWorkflowTerminated(status *v1alpha1.WorkflowRunStatus, allTasksDone bool) bool {
	// the workflow is terminated only when all tasks are done and the status is marked as terminated
	return status.Terminated && allTasksDone
}

func checkWorkflowSuspended(status *v1alpha1.WorkflowRunStatus) bool {
	// if the workflow is suspended but a step is still in the suspending phase,
	// return false so that the suspending step can continue to run
	if status.Suspend {
		for _, step := range status.Steps {
			if step.Phase == v1alpha1.WorkflowStepPhaseSuspending {
				return false
			}
			for _, sub := range step.SubStepsStatus {
				if sub.Phase == v1alpha1.WorkflowStepPhaseSuspending {
					return false
				}
			}
		}
	}
	return status.Suspend
}

func newEngine(ctx monitorContext.Context, wfCtx wfContext.Context, w *workflowExecutor, wfStatus *v1alpha1.WorkflowRunStatus, taskRunners []types.TaskRunner) *engine {
	stepStatus := make(map[string]v1alpha1.StepStatus)
	setStepStatus(stepStatus, wfStatus.Steps)
	stepDependsOn := make(map[string][]string)
	for _, step := range w.instance.Steps {
		hooks.SetAdditionalNameInStatus(stepStatus, step.Name, step.Properties, stepStatus[step.Name])
		stepDependsOn[step.Name] = append(stepDependsOn[step.Name], step.DependsOn...)
		for _, sub := range step.SubSteps {
			hooks.SetAdditionalNameInStatus(stepStatus, sub.Name, sub.Properties, stepStatus[sub.Name])
			stepDependsOn[sub.Name] = append(stepDependsOn[sub.Name], sub.DependsOn...)
		}
	}
	return &engine{
		status:        wfStatus,
		instance:      w.instance,
		wfCtx:         wfCtx,
		cli:           w.cli,
		debug:         w.instance.Debug,
		stepStatus:    stepStatus,
		stepDependsOn: stepDependsOn,
		stepTimeout:   make(map[string]time.Time),
		taskRunners:   taskRunners,
		statusPatcher: w.patcher,
	}
}

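// setStepStatus flattens the given workflow step statuses, including their
// sub-step statuses, into a map keyed by step name.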
func setStepStatus(statusMap map[string]v1alpha1.StepStatus, status []v1alpha1.WorkflowStepStatus) {
	for _, ss := range status {
		statusMap[ss.Name] = ss.StepStatus
		for _, sss := range ss.SubStepsStatus {
			statusMap[sss.Name] = sss
		}
	}
}

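// GetSuspendBackoffWaitTime returns the shortest duration to wait among all
// suspending steps before the workflow should be reconciled again, or 0 if
// no suspending step imposes a deadline.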
func (w *workflowExecutor) GetSuspendBackoffWaitTime() time.Duration {
	if len(w.instance.Steps) == 0 {
		return 0
	}
	stepStatus := make(map[string]v1alpha1.StepStatus)
	setStepStatus(stepStatus, w.instance.Status.Steps)
	max := time.Duration(1<<63 - 1)
	min := max
	for _, step := range w.instance.Steps {
		min = handleSuspendBackoffTime(w.wfCtx, step, stepStatus[step.Name], min)
		for _, sub := range step.SubSteps {
			min = handleSuspendBackoffTime(w.wfCtx, v1alpha1.WorkflowStep{
				WorkflowStepBase: v1alpha1.WorkflowStepBase{
					Name:       sub.Name,
					Type:       sub.Type,
					Timeout:    sub.Timeout,
					Properties: sub.Properties,
				},
			}, stepStatus[sub.Name], min)
		}
	}
	if min == max {
		return 0
	}
	return min
}

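// handleSuspendBackoffTime returns the smaller of min and the time remaining
// until the suspending step's timeout or its scheduled resume timestamp.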
func handleSuspendBackoffTime(wfCtx wfContext.Context, step v1alpha1.WorkflowStep, status v1alpha1.StepStatus, min time.Duration) time.Duration {
	if status.Phase != v1alpha1.WorkflowStepPhaseSuspending {
		return min
	}
	if step.Timeout != "" {
		duration, err := time.ParseDuration(step.Timeout)
		if err != nil {
			return min
		}
		timeout := status.FirstExecuteTime.Add(duration)
		if time.Now().Before(timeout) {
			d := time.Until(timeout)
			if d < min {
				min = d
			}
		}
	}

	if ts := wfCtx.GetMutableValue(status.ID, workspace.ResumeTimeStamp); ts != "" {
		t, err := time.Parse(time.RFC3339, ts)
		if err != nil {
			return min
		}
		d := time.Until(t)
		if d < min {
			min = d
		}
	}
	return min
}

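// GetBackoffWaitTime returns how long to wait before the next reconcile,
// based on the next execute time recorded in the workflow context.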
func (w *workflowExecutor) GetBackoffWaitTime() time.Duration {
	nextTime, ok := w.wfCtx.GetValueInMemory(types.ContextKeyNextExecuteTime)
	if !ok {
		if w.instance.Status.Suspend {
			return 0
		}
		return time.Second
	}
	unix, ok := nextTime.(int64)
	if !ok {
		return time.Second
	}
	next := time.Unix(unix, 0)
	if next.After(time.Now()) {
		return time.Until(next)
	}

	return time.Second
}

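// checkRunners reports whether all task runners are done and whether all of
// them finished in the Succeeded or Skipped phase.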
func checkRunners(taskRunners []types.TaskRunner, status v1alpha1.WorkflowRunStatus) (bool, bool) {
	success := true
	for _, t := range taskRunners {
		done := false
		for _, ss := range status.Steps {
			if ss.Name == t.Name() {
				done = types.IsStepFinish(ss.Phase, ss.Reason)
				success = success && done && (ss.Phase == v1alpha1.WorkflowStepPhaseSucceeded || ss.Phase == v1alpha1.WorkflowStepPhaseSkipped)
				break
			}
		}
		if !done {
			return false, false
		}
	}
	return true, success
}

func (w *workflowExecutor) makeContext(ctx context.Context, name string) (wfContext.Context, error) {
	// clear the user info in context
	ctx = request.WithUser(ctx, nil)
	status := &w.instance.Status
	if status.ContextBackend != nil {
		wfCtx, err := wfContext.LoadContext(w.cli, w.instance.Namespace, w.instance.Name, w.instance.Status.ContextBackend.Name)
		if err != nil {
			return nil, errors.WithMessage(err, "load context")
		}
		return wfCtx, nil
	}

	wfCtx, err := wfContext.NewContext(ctx, w.cli, w.instance.Namespace, name, w.instance.ChildOwnerReferences)
	if err != nil {
		return nil, errors.WithMessage(err, "new context")
	}

	status.ContextBackend = wfCtx.StoreRef()
	return wfCtx, nil
}

func (e *engine) getBackoffTimes(stepID string) int {
	if v, ok := e.wfCtx.GetValueInMemory(types.ContextPrefixBackoffTimes, stepID); ok {
		times, ok := v.(int)
		if ok {
			return times
		}
	}
	return -1
}

func (e *engine) getBackoffWaitTime() int {
	// minTimes defaults to 15 so that, when every step has backed off at least that
	// many times, the computed interval is capped at the max workflow backoff wait time
	minTimes := 15
	found := false
	for _, step := range e.status.Steps {
		if backoffTimes := e.getBackoffTimes(step.ID); backoffTimes > 0 {
			found = true
			if backoffTimes < minTimes {
				minTimes = backoffTimes
			}
		}
		if step.SubStepsStatus != nil {
			for _, subStep := range step.SubStepsStatus {
				if backoffTimes := e.getBackoffTimes(subStep.ID); backoffTimes > 0 {
					found = true
					if backoffTimes < minTimes {
						minTimes = backoffTimes
					}
				}
			}
		}
	}

	if !found {
		return minWorkflowBackoffWaitTime
	}

	interval := int(math.Pow(2, float64(minTimes)) * backoffTimeCoefficient)
	if interval < minWorkflowBackoffWaitTime {
		return minWorkflowBackoffWaitTime
	}
	maxWorkflowBackoffWaitTime := e.getMaxBackoffWaitTime()
	if interval > maxWorkflowBackoffWaitTime {
		return maxWorkflowBackoffWaitTime
	}
	return interval
}

func (e *engine) getMaxBackoffWaitTime() int {
	for _, step := range e.status.Steps {
		if step.Phase == v1alpha1.WorkflowStepPhaseFailed {
			return types.MaxWorkflowFailedBackoffTime
		}
	}
	return types.MaxWorkflowWaitBackoffTime
}

func (e *engine) getNextTimeout() int64 {
	max := time.Duration(1<<63 - 1)
	min := time.Duration(1<<63 - 1)
	now := time.Now()
	for _, step := range e.status.Steps {
		if step.Phase == v1alpha1.WorkflowStepPhaseRunning {
			if timeout, ok := e.stepTimeout[step.Name]; ok {
				duration := timeout.Sub(now)
				if duration < min {
					min = duration
				}
			}
		}
	}
	if min == max {
		return -1
	}
	if min.Seconds() < 1 {
		return minWorkflowBackoffWaitTime
	}
	return int64(math.Ceil(min.Seconds()))
}

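// setNextExecuteTime records the next reconcile time in the workflow context:
// the last execute time plus the backoff interval, capped by the earliest
// timeout among the running steps.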
func (e *engine) setNextExecuteTime(ctx monitorContext.Context) {
	backoff := e.getBackoffWaitTime()
	lastExecuteTime, ok := e.wfCtx.GetValueInMemory(types.ContextKeyLastExecuteTime)
	if !ok {
		ctx.Error(fmt.Errorf("failed to get last execute time"), "workflow run", e.instance.Name)
	}

	last, ok := lastExecuteTime.(int64)
	if !ok {
		ctx.Error(fmt.Errorf("failed to parse last execute time to int64"), "lastExecuteTime", lastExecuteTime)
	}
	interval := int64(backoff)
	if timeout := e.getNextTimeout(); timeout > 0 && timeout < interval {
		interval = timeout
	}

	next := last + interval
	e.wfCtx.SetValueInMemory(next, types.ContextKeyNextExecuteTime)
}

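// runAsDAG executes the given task runners in DAG mode: finished runners are
// skipped, runners whose dependencies are not yet satisfied stay pending, and
// the pending set is retried recursively after the runnable tasks execute.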
func (e *engine) runAsDAG(ctx monitorContext.Context, taskRunners []types.TaskRunner, pendingRunners bool) error {
	var (
		todoTasks    []types.TaskRunner
		pendingTasks []types.TaskRunner
	)
	wfCtx := e.wfCtx
	done := true
	for _, tRunner := range taskRunners {
		finish := false
		var stepID string
		if status, ok := e.stepStatus[tRunner.Name()]; ok {
			stepID = status.ID
			finish = types.IsStepFinish(status.Phase, status.Reason)
		}
		if !finish {
			done = false
			if pending, status := tRunner.Pending(ctx, wfCtx, e.stepStatus); pending {
				if pendingRunners {
					wfCtx.IncreaseCountValueInMemory(types.ContextPrefixBackoffTimes, status.ID)
					if err := e.updateStepStatus(ctx, status); err != nil {
						return err
					}
				}
				pendingTasks = append(pendingTasks, tRunner)
				continue
			} else if status.Phase == v1alpha1.WorkflowStepPhasePending {
				wfCtx.DeleteValueInMemory(types.ContextPrefixBackoffTimes, stepID)
			}
			todoTasks = append(todoTasks, tRunner)
		} else {
			wfCtx.DeleteValueInMemory(types.ContextPrefixBackoffTimes, stepID)
		}
	}
	if done {
		return nil
	}

	if len(todoTasks) > 0 {
		err := e.steps(ctx, todoTasks, true)
		if err != nil {
			return err
		}

		if e.needStop() {
			return nil
		}

		if len(pendingTasks) > 0 {
			return e.runAsDAG(ctx, pendingTasks, true)
		}
	}
	return nil
}

func (e *engine) Run(ctx monitorContext.Context, taskRunners []types.TaskRunner, dag bool) error {
	var err error
	if dag {
		err = e.runAsDAG(ctx, taskRunners, false)
	} else {
		err = e.steps(ctx, taskRunners, dag)
	}

	e.checkFailedAfterRetries()
	e.setNextExecuteTime(ctx)
	return err
}

func (e *engine) checkWorkflowStatusMessage() {
	switch {
	case !e.waiting && e.failedAfterRetries && feature.DefaultMutableFeatureGate.Enabled(features.EnableSuspendOnFailure):
		e.status.Message = types.MessageSuspendFailedAfterRetries
	default:
		e.status.Message = ""
	}
}

func (e *engine) steps(ctx monitorContext.Context, taskRunners []types.TaskRunner, dag bool) error {
	wfCtx := e.wfCtx
	for index, runner := range taskRunners {
		if status, ok := e.stepStatus[runner.Name()]; ok {
			if types.IsStepFinish(status.Phase, status.Reason) {
				continue
			}
		}
		if pending, status := runner.Pending(ctx, wfCtx, e.stepStatus); pending {
			wfCtx.IncreaseCountValueInMemory(types.ContextPrefixBackoffTimes, status.ID)
			if err := e.updateStepStatus(ctx, status); err != nil {
				return err
			}
			if dag {
				continue
			}
			return nil
		}
		options := e.generateRunOptions(ctx, e.findDependPhase(taskRunners, index, dag))

		status, operation, err := runner.Run(wfCtx, options)
		if err != nil {
			return err
		}
		e.finishStep(operation)

		// for a suspend step with a duration, there is no need to increase the backoff time on reconcile while it is still running
		if !types.IsStepFinish(status.Phase, status.Reason) && status.Phase != v1alpha1.WorkflowStepPhaseSuspending {
			if err := e.updateStepStatus(ctx, status); err != nil {
				return err
			}
			if err := handleBackoffTimes(wfCtx, status, false); err != nil {
				return err
			}
			if dag {
				continue
			}
			return nil
		}
		// clear the backoff time when the step is finished
		if err := handleBackoffTimes(wfCtx, status, true); err != nil {
			return err
		}
		if err := e.updateStepStatus(ctx, status); err != nil {
			return err
		}

		if dag {
			continue
		}
		if e.needStop() {
			return nil
		}
	}
	return nil
}

func (e *engine) generateRunOptions(ctx monitorContext.Context, dependsOnPhase v1alpha1.WorkflowStepPhase) *types.TaskRunOptions {
	options := &types.TaskRunOptions{
		GetTracer: func(id string, stepStatus v1alpha1.WorkflowStep) monitorContext.Context {
			return ctx.Fork(id, monitorContext.DurationMetric(func(v float64) {
				metrics.WorkflowRunStepDurationHistogram.WithLabelValues("workflowrun", stepStatus.Type).Observe(v)
			}))
		},
		StepStatus: e.stepStatus,
		Engine:     e,
		PreCheckHooks: []types.TaskPreCheckHook{
			func(step v1alpha1.WorkflowStep, options *types.PreCheckOptions) (*types.PreCheckResult, error) {
				if feature.DefaultMutableFeatureGate.Enabled(features.EnableSuspendOnFailure) {
					return &types.PreCheckResult{Skip: false}, nil
				}
				if e.parentRunner != "" {
					if status, ok := e.stepStatus[e.parentRunner]; ok && status.Phase == v1alpha1.WorkflowStepPhaseSkipped {
						return &types.PreCheckResult{Skip: true}, nil
					}
				}
				switch step.If {
				case "always":
					return &types.PreCheckResult{Skip: false}, nil
				case "":
					return &types.PreCheckResult{Skip: skipExecutionOfNextStep(dependsOnPhase, len(step.DependsOn) > 0)}, nil
				default:
					ifValue, err := custom.ValidateIfValue(e.wfCtx, step, e.stepStatus, options)
					if err != nil {
						return &types.PreCheckResult{Skip: true}, err
					}
					return &types.PreCheckResult{Skip: !ifValue}, nil
				}
			},
			func(step v1alpha1.WorkflowStep, options *types.PreCheckOptions) (*types.PreCheckResult, error) {
				status := e.stepStatus[step.Name]
				if e.parentRunner != "" {
					if status, ok := e.stepStatus[e.parentRunner]; ok && status.Phase == v1alpha1.WorkflowStepPhaseFailed && status.Reason == types.StatusReasonTimeout {
						return &types.PreCheckResult{Timeout: true}, nil
					}
				}
				if !status.FirstExecuteTime.Time.IsZero() && step.Timeout != "" {
					duration, err := time.ParseDuration(step.Timeout)
					if err != nil {
						// if the timeout is an invalid duration, return {timeout: false}
						return &types.PreCheckResult{Timeout: false}, err
					}
					timeout := status.FirstExecuteTime.Add(duration)
					e.stepTimeout[step.Name] = timeout
					if time.Now().After(timeout) {
						return &types.PreCheckResult{Timeout: true}, nil
					}
				}
				return &types.PreCheckResult{Timeout: false}, nil
			},
		},
		PreStartHooks: []types.TaskPreStartHook{hooks.Input},
		PostStopHooks: []types.TaskPostStopHook{hooks.Output},
	}
	if e.debug {
		options.Debug = func(id string, v *value.Value) error {
			debugContext := debug.NewContext(e.cli, e.instance, id)
			return debugContext.Set(v)
		}
	}
	return options
}

type engine struct {
	failedAfterRetries bool
	waiting            bool
	suspending         bool
	debug              bool
	status             *v1alpha1.WorkflowRunStatus
	wfCtx              wfContext.Context
	instance           *types.WorkflowInstance
	cli                client.Client
	parentRunner       string
	stepStatus         map[string]v1alpha1.StepStatus
	stepTimeout        map[string]time.Time
	stepDependsOn      map[string][]string
	taskRunners        []types.TaskRunner
	statusPatcher      types.StatusPatcher
}

func (e *engine) finishStep(operation *types.Operation) {
	if operation != nil {
		e.status.Terminated = e.status.Terminated || operation.Terminated
		e.failedAfterRetries = e.failedAfterRetries || operation.FailedAfterRetries
		e.waiting = e.waiting || operation.Waiting
		e.suspending = e.suspending || operation.Suspend
	}
	e.status.Suspend = e.suspending
	if !e.waiting && e.failedAfterRetries && feature.DefaultMutableFeatureGate.Enabled(features.EnableSuspendOnFailure) {
		e.status.Suspend = true
	}
	if e.failedAfterRetries && !feature.DefaultMutableFeatureGate.Enabled(features.EnableSuspendOnFailure) {
		e.status.Terminated = true
	}
}

func (e *engine) updateStepStatus(ctx context.Context, status v1alpha1.StepStatus) error {
	var (
		conditionUpdated bool
		now              = metav1.NewTime(time.Now())
	)

	parentRunner := e.parentRunner
	stepName := status.Name
	if parentRunner != "" {
		stepName = parentRunner
	}
	e.wfCtx.SetValueInMemory(now.Unix(), types.ContextKeyLastExecuteTime)
	status.LastExecuteTime = now
	index := -1
	for i, ss := range e.status.Steps {
		if ss.Name == stepName {
			index = i
			if parentRunner != "" {
				// update the sub steps status
				for j, sub := range ss.SubStepsStatus {
					if sub.Name == status.Name {
						status.FirstExecuteTime = sub.FirstExecuteTime
						e.status.Steps[i].SubStepsStatus[j] = status
						conditionUpdated = true
						break
					}
				}
			} else {
				// update the parent steps status
				status.FirstExecuteTime = ss.FirstExecuteTime
				e.status.Steps[i].StepStatus = status
				conditionUpdated = true
				break
			}
		}
	}
	if !conditionUpdated {
		status.FirstExecuteTime = now
		if parentRunner != "" {
			if index < 0 {
				e.status.Steps = append(e.status.Steps, v1alpha1.WorkflowStepStatus{
					StepStatus: v1alpha1.StepStatus{
						Name:             parentRunner,
						FirstExecuteTime: now,
					}})
				index = len(e.status.Steps) - 1
			}
			e.status.Steps[index].SubStepsStatus = append(e.status.Steps[index].SubStepsStatus, status)
		} else {
			e.status.Steps = append(e.status.Steps, v1alpha1.WorkflowStepStatus{StepStatus: status})
		}
	}
	e.stepStatus[status.Name] = status
	if feature.DefaultMutableFeatureGate.Enabled(features.EnablePatchStatusAtOnce) {
		isUpdate := false
		orig := e.status.Message
		e.status.Phase = e.checkWorkflowPhase()
		if orig != "" && e.status.Message == "" {
			// a patch cannot reset the message to an empty string, so use an update instead
			isUpdate = true
		}
		return e.statusPatcher(ctx, e.status, isUpdate)
	}
	return nil
}

func (e *engine) checkWorkflowPhase() v1alpha1.WorkflowRunPhase {
	status := e.status
	e.checkWorkflowStatusMessage()
	allRunnersDone, allRunnersSucceeded := checkRunners(e.taskRunners, e.instance.Status)
	if status.Terminated {
		e.cleanBackoffTimesForTerminated()
		if checkWorkflowTerminated(status, allRunnersDone) {
			wfContext.CleanupMemoryStore(e.instance.Name, e.instance.Namespace)
			if isTerminatedManually(status) {
				return v1alpha1.WorkflowStateTerminated
			}
			return v1alpha1.WorkflowStateFailed
		}
	}
	if status.Suspend {
		wfContext.CleanupMemoryStore(e.instance.Name, e.instance.Namespace)
		return v1alpha1.WorkflowStateSuspending
	}
	if allRunnersSucceeded {
		return v1alpha1.WorkflowStateSucceeded
	}
	return v1alpha1.WorkflowStateExecuting
}

func (e *engine) checkFailedAfterRetries() {
	if !e.waiting && e.failedAfterRetries && feature.DefaultMutableFeatureGate.Enabled(features.EnableSuspendOnFailure) {
		e.status.Suspend = true
	}
	if e.failedAfterRetries && !feature.DefaultMutableFeatureGate.Enabled(features.EnableSuspendOnFailure) {
		e.status.Terminated = true
	}
}

func (e *engine) needStop() bool {
	// only a suspend stops execution early; even if the workflow is terminated,
	// all the remaining steps still need to be executed
	return e.status.Suspend
}

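// findDependPhase returns the phase that gates the execution of the runner at
// the given index: the aggregated phase of its dependsOn steps in DAG mode,
// otherwise the phase of the nearest preceding step that would cause the next
// step to be skipped.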
func (e *engine) findDependPhase(taskRunners []types.TaskRunner, index int, dag bool) v1alpha1.WorkflowStepPhase {
	dependsOn := len(e.stepDependsOn[taskRunners[index].Name()]) > 0
	if dag || dependsOn {
		return e.findDependsOnPhase(taskRunners[index].Name())
	}
	if index < 1 {
		return v1alpha1.WorkflowStepPhaseSucceeded
	}
	for i := index - 1; i >= 0; i-- {
		if skipExecutionOfNextStep(e.stepStatus[taskRunners[i].Name()].Phase, dependsOn) {
			return e.stepStatus[taskRunners[i].Name()].Phase
		}
	}
	return e.stepStatus[taskRunners[index-1].Name()].Phase
}

func (e *engine) findDependsOnPhase(name string) v1alpha1.WorkflowStepPhase {
	for _, dependsOn := range e.stepDependsOn[name] {
		if e.stepStatus[dependsOn].Phase != v1alpha1.WorkflowStepPhaseSucceeded {
			return e.stepStatus[dependsOn].Phase
		}
		if result := e.findDependsOnPhase(dependsOn); result != v1alpha1.WorkflowStepPhaseSucceeded {
			return result
		}
	}
	return v1alpha1.WorkflowStepPhaseSucceeded
}

// skipExecutionOfNextStep returns true if the next step should be skipped
func skipExecutionOfNextStep(phase v1alpha1.WorkflowStepPhase, dependsOn bool) bool {
	if dependsOn {
		return phase != v1alpha1.WorkflowStepPhaseSucceeded
	}
	return phase != v1alpha1.WorkflowStepPhaseSucceeded && phase != v1alpha1.WorkflowStepPhaseSkipped
}

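// handleBackoffTimes clears or increments the per-step backoff counter in the
// workflow context; the counter is reset whenever the recorded backoff reason
// no longer matches the step's current message.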
func handleBackoffTimes(wfCtx wfContext.Context, status v1alpha1.StepStatus, clear bool) error {
	if clear {
		wfCtx.DeleteValueInMemory(types.ContextPrefixBackoffTimes, status.ID)
		wfCtx.DeleteValueInMemory(types.ContextPrefixBackoffReason, status.ID)
	} else {
		if val, exists := wfCtx.GetValueInMemory(types.ContextPrefixBackoffReason, status.ID); !exists || val != status.Message {
			wfCtx.SetValueInMemory(status.Message, types.ContextPrefixBackoffReason, status.ID)
			wfCtx.DeleteValueInMemory(types.ContextPrefixBackoffTimes, status.ID)
		}
		wfCtx.IncreaseCountValueInMemory(types.ContextPrefixBackoffTimes, status.ID)
	}
	if err := wfCtx.Commit(); err != nil {
		return errors.WithMessage(err, "commit workflow context")
	}
	return nil
}

func (e *engine) cleanBackoffTimesForTerminated() {
	for _, ss := range e.status.Steps {
		for _, sub := range ss.SubStepsStatus {
			if sub.Reason == types.StatusReasonTerminate {
				e.wfCtx.DeleteValueInMemory(types.ContextPrefixBackoffTimes, sub.ID)
				e.wfCtx.DeleteValueInMemory(types.ContextPrefixBackoffReason, sub.ID)
			}
		}
		if ss.Reason == types.StatusReasonTerminate {
			e.wfCtx.DeleteValueInMemory(types.ContextPrefixBackoffTimes, ss.ID)
			e.wfCtx.DeleteValueInMemory(types.ContextPrefixBackoffReason, ss.ID)
		}
	}
}

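// GetStepStatus returns the full status, including sub-steps, recorded for
// the named step.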
func (e *engine) GetStepStatus(stepName string) v1alpha1.WorkflowStepStatus {
	// ss is step status
	for _, ss := range e.status.Steps {
		if ss.Name == stepName {
			return ss
		}
	}
	return v1alpha1.WorkflowStepStatus{}
}

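// GetCommonStepStatus returns the basic status recorded for the named step,
// or a zero value if the step is unknown.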
func (e *engine) GetCommonStepStatus(stepName string) v1alpha1.StepStatus {
	if status, ok := e.stepStatus[stepName]; ok {
		return status
	}
	return v1alpha1.StepStatus{}
}

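// SetParentRunner marks the runner whose sub-steps are currently being executed.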
func (e *engine) SetParentRunner(name string) {
	e.parentRunner = name
}

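// GetOperation returns the aggregated operation state of the current run.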
func (e *engine) GetOperation() *types.Operation {
	return &types.Operation{
		Suspend:            e.status.Suspend,
		Terminated:         e.status.Terminated,
		Waiting:            e.waiting,
		FailedAfterRetries: e.failedAfterRetries,
	}
}