github.com/kyma-project/kyma-environment-broker@v0.0.1/internal/process/staged_manager.go (about)

     1  package process
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/pkg/errors"
    11  
    12  	"github.com/kyma-project/kyma-environment-broker/internal"
    13  	kebError "github.com/kyma-project/kyma-environment-broker/internal/error"
    14  	"github.com/kyma-project/kyma-environment-broker/internal/event"
    15  	"github.com/kyma-project/kyma-environment-broker/internal/storage"
    16  
    17  	"github.com/pivotal-cf/brokerapi/v8/domain"
    18  	"github.com/sirupsen/logrus"
    19  )
    20  
    21  type StagedManager struct {
    22  	log              logrus.FieldLogger
    23  	operationStorage storage.Operations
    24  	publisher        event.Publisher
    25  
    26  	stages           []*stage
    27  	operationTimeout time.Duration
    28  
    29  	mu sync.RWMutex
    30  
    31  	speedFactor int64
    32  	cfg         StagedManagerConfiguration
    33  }
    34  
    35  type StagedManagerConfiguration struct {
    36  	// Max time of processing step by a worker without returning to the queue
    37  	MaxStepProcessingTime time.Duration `envconfig:"default=2m"`
    38  	WorkersAmount         int           `envconfig:"default=20"`
    39  }
    40  
    41  func (c StagedManagerConfiguration) String() string {
    42  	return fmt.Sprintf("(MaxStepProcessingTime=%s; WorkersAmount=%d)", c.MaxStepProcessingTime, c.WorkersAmount)
    43  }
    44  
    45  type Step interface {
    46  	Name() string
    47  	Run(operation internal.Operation, logger logrus.FieldLogger) (internal.Operation, time.Duration, error)
    48  }
    49  
    50  type StepCondition func(operation internal.Operation) bool
    51  
    52  type StepWithCondition struct {
    53  	Step
    54  	condition StepCondition
    55  }
    56  
    57  type stage struct {
    58  	name  string
    59  	steps []StepWithCondition
    60  }
    61  
    62  func (s *stage) AddStep(step Step, cnd StepCondition) {
    63  	s.steps = append(s.steps, StepWithCondition{
    64  		Step:      step,
    65  		condition: cnd,
    66  	})
    67  }
    68  
    69  func NewStagedManager(storage storage.Operations, pub event.Publisher, operationTimeout time.Duration, cfg StagedManagerConfiguration, logger logrus.FieldLogger) *StagedManager {
    70  	return &StagedManager{
    71  		log:              logger,
    72  		operationStorage: storage,
    73  		publisher:        pub,
    74  		operationTimeout: operationTimeout,
    75  		speedFactor:      1,
    76  		cfg:              cfg,
    77  	}
    78  }
    79  
    80  // SpeedUp changes speedFactor parameter to reduce the sleep time if a step needs a retry.
    81  // This method should only be used for testing purposes
    82  func (m *StagedManager) SpeedUp(speedFactor int64) {
    83  	m.speedFactor = speedFactor
    84  }
    85  
    86  func (m *StagedManager) DefineStages(names []string) {
    87  	m.stages = make([]*stage, len(names))
    88  	for i, n := range names {
    89  		m.stages[i] = &stage{name: n, steps: []StepWithCondition{}}
    90  	}
    91  }
    92  
    93  func (m *StagedManager) AddStep(stageName string, step Step, cnd StepCondition) error {
    94  	for _, s := range m.stages {
    95  		if s.name == stageName {
    96  			s.AddStep(step, cnd)
    97  			return nil
    98  		}
    99  	}
   100  	return fmt.Errorf("stage %s not defined", stageName)
   101  }
   102  
   103  func (m *StagedManager) GetAllStages() []string {
   104  	var all []string
   105  	for _, s := range m.stages {
   106  		all = append(all, s.name)
   107  	}
   108  	return all
   109  }
   110  
   111  func (m *StagedManager) Execute(operationID string) (time.Duration, error) {
   112  
   113  	operation, err := m.operationStorage.GetOperationByID(operationID)
   114  	if err != nil {
   115  		m.log.Errorf("Cannot fetch operation from storage: %s", err)
   116  		return 3 * time.Second, nil
   117  	}
   118  
   119  	logOperation := m.log.WithFields(logrus.Fields{"operation": operationID, "instanceID": operation.InstanceID, "planID": operation.ProvisioningParameters.PlanID})
   120  	logOperation.Infof("Start process operation steps for GlobalAccount=%s, ", operation.ProvisioningParameters.ErsContext.GlobalAccountID)
   121  	if time.Since(operation.CreatedAt) > m.operationTimeout {
   122  		timeoutErr := kebError.TimeoutError("operation has reached the time limit")
   123  		operation.LastError = timeoutErr
   124  		defer m.callPubSubOutsideSteps(operation, timeoutErr)
   125  
   126  		logOperation.Infof("operation has reached the time limit: operation was created at: %s", operation.CreatedAt)
   127  		operation.State = domain.Failed
   128  		_, err = m.operationStorage.UpdateOperation(*operation)
   129  		if err != nil {
   130  			logOperation.Infof("Unable to save operation with finished the provisioning process")
   131  			timeoutErr = timeoutErr.SetMessage(fmt.Sprintf("%s and %s", timeoutErr.Error(), err.Error()))
   132  			operation.LastError = timeoutErr
   133  			return time.Second, timeoutErr
   134  		}
   135  
   136  		return 0, timeoutErr
   137  	}
   138  
   139  	var when time.Duration
   140  	processedOperation := *operation
   141  
   142  	for _, stage := range m.stages {
   143  		if processedOperation.IsStageFinished(stage.name) {
   144  			continue
   145  		}
   146  
   147  		for _, step := range stage.steps {
   148  			logStep := logOperation.WithField("step", step.Name()).
   149  				WithField("stage", stage.name)
   150  			if step.condition != nil && !step.condition(processedOperation) {
   151  				logStep.Debugf("Skipping")
   152  				continue
   153  			}
   154  			operation.EventInfof("processing step: %v", step.Name())
   155  
   156  			processedOperation, when, err = m.runStep(step, processedOperation, logStep)
   157  			if err != nil {
   158  				logStep.Errorf("Process operation failed: %s", err)
   159  				operation.EventErrorf(err, "step %v processing returned error", step.Name())
   160  				return 0, err
   161  			}
   162  			if processedOperation.State == domain.Failed || processedOperation.State == domain.Succeeded {
   163  				logStep.Infof("Operation %q got status %s. Process finished.", operation.ID, processedOperation.State)
   164  				operation.EventInfof("operation processing %v", processedOperation.State)
   165  				return 0, nil
   166  			}
   167  
   168  			// the step needs a retry
   169  			if when > 0 {
   170  				logStep.Warnf("retrying step by restarting the operation in %d s", int64(when.Seconds()))
   171  				return when, nil
   172  			}
   173  		}
   174  
   175  		processedOperation, err = m.saveFinishedStage(processedOperation, stage, logOperation)
   176  		if err != nil {
   177  			return time.Second, nil
   178  		}
   179  	}
   180  
   181  	logOperation.Infof("Operation succeeded")
   182  
   183  	processedOperation.State = domain.Succeeded
   184  	processedOperation.Description = "Processing finished"
   185  	m.publisher.Publish(context.TODO(), OperationSucceeded{
   186  		Operation: processedOperation,
   187  	})
   188  
   189  	_, err = m.operationStorage.UpdateOperation(processedOperation)
   190  	if err != nil {
   191  		logOperation.Infof("Unable to save operation with finished the provisioning process")
   192  		return time.Second, err
   193  	}
   194  
   195  	return 0, nil
   196  }
   197  
   198  func (m *StagedManager) saveFinishedStage(operation internal.Operation, s *stage, log logrus.FieldLogger) (internal.Operation, error) {
   199  	operation.FinishStage(s.name)
   200  	op, err := m.operationStorage.UpdateOperation(operation)
   201  	if err != nil {
   202  		log.Infof("Unable to save operation with finished stage %s: %s", s.name, err.Error())
   203  		return operation, err
   204  	}
   205  	log.Infof("Finished stage %s", s.name)
   206  	return *op, nil
   207  }
   208  
   209  func (m *StagedManager) runStep(step Step, operation internal.Operation, logger logrus.FieldLogger) (processedOperation internal.Operation, backoff time.Duration, err error) {
   210  	var start time.Time
   211  	defer func() {
   212  		if pErr := recover(); pErr != nil {
   213  			log.Println("panic in RunStep in staged manager: ", pErr)
   214  			err = errors.New(fmt.Sprintf("%v", pErr))
   215  			om := NewOperationManager(m.operationStorage)
   216  			processedOperation, _, _ = om.OperationFailed(operation, "recovered from panic", err, m.log)
   217  		}
   218  	}()
   219  
   220  	processedOperation = operation
   221  	begin := time.Now()
   222  	for {
   223  		start = time.Now()
   224  		logger.Infof("Start step")
   225  		processedOperation, backoff, err = step.Run(processedOperation, logger)
   226  		if err != nil {
   227  			processedOperation.LastError = kebError.ReasonForError(err)
   228  			logOperation := m.log.WithFields(logrus.Fields{"operation": processedOperation.ID, "error_component": processedOperation.LastError.Component(), "error_reason": processedOperation.LastError.Reason()})
   229  			logOperation.Errorf("Last error from step %s: %s", step.Name(), processedOperation.LastError.Error())
   230  			// only save to storage, skip for alerting if error
   231  			_, err = m.operationStorage.UpdateOperation(processedOperation)
   232  			if err != nil {
   233  				logOperation.Errorf("Unable to save operation with resolved last error from step: %s", step.Name())
   234  			}
   235  		}
   236  
   237  		m.publisher.Publish(context.TODO(), OperationStepProcessed{
   238  			StepProcessed: StepProcessed{
   239  				StepName: step.Name(),
   240  				Duration: time.Since(start),
   241  				When:     backoff,
   242  				Error:    err,
   243  			},
   244  			Operation:    processedOperation,
   245  			OldOperation: operation,
   246  		})
   247  
   248  		// break the loop if:
   249  		// - the step does not need a retry
   250  		// - step returns an error
   251  		// - the loop takes too much time (to not block the worker too long)
   252  		if backoff == 0 || err != nil || time.Since(begin) > m.cfg.MaxStepProcessingTime {
   253  			return processedOperation, backoff, err
   254  		}
   255  		operation.EventInfof("step %v sleeping for %v", step.Name(), backoff)
   256  		time.Sleep(backoff / time.Duration(m.speedFactor))
   257  	}
   258  }
   259  
   260  func (m *StagedManager) callPubSubOutsideSteps(operation *internal.Operation, err error) {
   261  	logOperation := m.log.WithFields(logrus.Fields{"operation": operation.ID, "error_component": operation.LastError.Component(), "error_reason": operation.LastError.Reason()})
   262  	logOperation.Errorf("Last error: %s", operation.LastError.Error())
   263  
   264  	m.publisher.Publish(context.TODO(), OperationStepProcessed{
   265  		StepProcessed: StepProcessed{
   266  			Duration: time.Since(operation.CreatedAt),
   267  			Error:    err,
   268  		},
   269  		OldOperation: *operation,
   270  		Operation:    *operation,
   271  	})
   272  }