github.com/kyma-project/kyma-environment-broker@v0.0.1/internal/process/staged_manager.go (about) 1 package process 2 3 import ( 4 "context" 5 "fmt" 6 "log" 7 "sync" 8 "time" 9 10 "github.com/pkg/errors" 11 12 "github.com/kyma-project/kyma-environment-broker/internal" 13 kebError "github.com/kyma-project/kyma-environment-broker/internal/error" 14 "github.com/kyma-project/kyma-environment-broker/internal/event" 15 "github.com/kyma-project/kyma-environment-broker/internal/storage" 16 17 "github.com/pivotal-cf/brokerapi/v8/domain" 18 "github.com/sirupsen/logrus" 19 ) 20 21 type StagedManager struct { 22 log logrus.FieldLogger 23 operationStorage storage.Operations 24 publisher event.Publisher 25 26 stages []*stage 27 operationTimeout time.Duration 28 29 mu sync.RWMutex 30 31 speedFactor int64 32 cfg StagedManagerConfiguration 33 } 34 35 type StagedManagerConfiguration struct { 36 // Max time of processing step by a worker without returning to the queue 37 MaxStepProcessingTime time.Duration `envconfig:"default=2m"` 38 WorkersAmount int `envconfig:"default=20"` 39 } 40 41 func (c StagedManagerConfiguration) String() string { 42 return fmt.Sprintf("(MaxStepProcessingTime=%s; WorkersAmount=%d)", c.MaxStepProcessingTime, c.WorkersAmount) 43 } 44 45 type Step interface { 46 Name() string 47 Run(operation internal.Operation, logger logrus.FieldLogger) (internal.Operation, time.Duration, error) 48 } 49 50 type StepCondition func(operation internal.Operation) bool 51 52 type StepWithCondition struct { 53 Step 54 condition StepCondition 55 } 56 57 type stage struct { 58 name string 59 steps []StepWithCondition 60 } 61 62 func (s *stage) AddStep(step Step, cnd StepCondition) { 63 s.steps = append(s.steps, StepWithCondition{ 64 Step: step, 65 condition: cnd, 66 }) 67 } 68 69 func NewStagedManager(storage storage.Operations, pub event.Publisher, operationTimeout time.Duration, cfg StagedManagerConfiguration, logger logrus.FieldLogger) *StagedManager { 70 return &StagedManager{ 71 log: logger, 72 operationStorage: storage, 73 publisher: pub, 74 operationTimeout: operationTimeout, 75 speedFactor: 1, 76 cfg: cfg, 77 } 78 } 79 80 // SpeedUp changes speedFactor parameter to reduce the sleep time if a step needs a retry. 81 // This method should only be used for testing purposes 82 func (m *StagedManager) SpeedUp(speedFactor int64) { 83 m.speedFactor = speedFactor 84 } 85 86 func (m *StagedManager) DefineStages(names []string) { 87 m.stages = make([]*stage, len(names)) 88 for i, n := range names { 89 m.stages[i] = &stage{name: n, steps: []StepWithCondition{}} 90 } 91 } 92 93 func (m *StagedManager) AddStep(stageName string, step Step, cnd StepCondition) error { 94 for _, s := range m.stages { 95 if s.name == stageName { 96 s.AddStep(step, cnd) 97 return nil 98 } 99 } 100 return fmt.Errorf("stage %s not defined", stageName) 101 } 102 103 func (m *StagedManager) GetAllStages() []string { 104 var all []string 105 for _, s := range m.stages { 106 all = append(all, s.name) 107 } 108 return all 109 } 110 111 func (m *StagedManager) Execute(operationID string) (time.Duration, error) { 112 113 operation, err := m.operationStorage.GetOperationByID(operationID) 114 if err != nil { 115 m.log.Errorf("Cannot fetch operation from storage: %s", err) 116 return 3 * time.Second, nil 117 } 118 119 logOperation := m.log.WithFields(logrus.Fields{"operation": operationID, "instanceID": operation.InstanceID, "planID": operation.ProvisioningParameters.PlanID}) 120 logOperation.Infof("Start process operation steps for GlobalAccount=%s, ", operation.ProvisioningParameters.ErsContext.GlobalAccountID) 121 if time.Since(operation.CreatedAt) > m.operationTimeout { 122 timeoutErr := kebError.TimeoutError("operation has reached the time limit") 123 operation.LastError = timeoutErr 124 defer m.callPubSubOutsideSteps(operation, timeoutErr) 125 126 logOperation.Infof("operation has reached the time limit: operation was created at: %s", operation.CreatedAt) 127 operation.State = domain.Failed 128 _, err = m.operationStorage.UpdateOperation(*operation) 129 if err != nil { 130 logOperation.Infof("Unable to save operation with finished the provisioning process") 131 timeoutErr = timeoutErr.SetMessage(fmt.Sprintf("%s and %s", timeoutErr.Error(), err.Error())) 132 operation.LastError = timeoutErr 133 return time.Second, timeoutErr 134 } 135 136 return 0, timeoutErr 137 } 138 139 var when time.Duration 140 processedOperation := *operation 141 142 for _, stage := range m.stages { 143 if processedOperation.IsStageFinished(stage.name) { 144 continue 145 } 146 147 for _, step := range stage.steps { 148 logStep := logOperation.WithField("step", step.Name()). 149 WithField("stage", stage.name) 150 if step.condition != nil && !step.condition(processedOperation) { 151 logStep.Debugf("Skipping") 152 continue 153 } 154 operation.EventInfof("processing step: %v", step.Name()) 155 156 processedOperation, when, err = m.runStep(step, processedOperation, logStep) 157 if err != nil { 158 logStep.Errorf("Process operation failed: %s", err) 159 operation.EventErrorf(err, "step %v processing returned error", step.Name()) 160 return 0, err 161 } 162 if processedOperation.State == domain.Failed || processedOperation.State == domain.Succeeded { 163 logStep.Infof("Operation %q got status %s. Process finished.", operation.ID, processedOperation.State) 164 operation.EventInfof("operation processing %v", processedOperation.State) 165 return 0, nil 166 } 167 168 // the step needs a retry 169 if when > 0 { 170 logStep.Warnf("retrying step by restarting the operation in %d s", int64(when.Seconds())) 171 return when, nil 172 } 173 } 174 175 processedOperation, err = m.saveFinishedStage(processedOperation, stage, logOperation) 176 if err != nil { 177 return time.Second, nil 178 } 179 } 180 181 logOperation.Infof("Operation succeeded") 182 183 processedOperation.State = domain.Succeeded 184 processedOperation.Description = "Processing finished" 185 m.publisher.Publish(context.TODO(), OperationSucceeded{ 186 Operation: processedOperation, 187 }) 188 189 _, err = m.operationStorage.UpdateOperation(processedOperation) 190 if err != nil { 191 logOperation.Infof("Unable to save operation with finished the provisioning process") 192 return time.Second, err 193 } 194 195 return 0, nil 196 } 197 198 func (m *StagedManager) saveFinishedStage(operation internal.Operation, s *stage, log logrus.FieldLogger) (internal.Operation, error) { 199 operation.FinishStage(s.name) 200 op, err := m.operationStorage.UpdateOperation(operation) 201 if err != nil { 202 log.Infof("Unable to save operation with finished stage %s: %s", s.name, err.Error()) 203 return operation, err 204 } 205 log.Infof("Finished stage %s", s.name) 206 return *op, nil 207 } 208 209 func (m *StagedManager) runStep(step Step, operation internal.Operation, logger logrus.FieldLogger) (processedOperation internal.Operation, backoff time.Duration, err error) { 210 var start time.Time 211 defer func() { 212 if pErr := recover(); pErr != nil { 213 log.Println("panic in RunStep in staged manager: ", pErr) 214 err = errors.New(fmt.Sprintf("%v", pErr)) 215 om := NewOperationManager(m.operationStorage) 216 processedOperation, _, _ = om.OperationFailed(operation, "recovered from panic", err, m.log) 217 } 218 }() 219 220 processedOperation = operation 221 begin := time.Now() 222 for { 223 start = time.Now() 224 logger.Infof("Start step") 225 processedOperation, backoff, err = step.Run(processedOperation, logger) 226 if err != nil { 227 processedOperation.LastError = kebError.ReasonForError(err) 228 logOperation := m.log.WithFields(logrus.Fields{"operation": processedOperation.ID, "error_component": processedOperation.LastError.Component(), "error_reason": processedOperation.LastError.Reason()}) 229 logOperation.Errorf("Last error from step %s: %s", step.Name(), processedOperation.LastError.Error()) 230 // only save to storage, skip for alerting if error 231 _, err = m.operationStorage.UpdateOperation(processedOperation) 232 if err != nil { 233 logOperation.Errorf("Unable to save operation with resolved last error from step: %s", step.Name()) 234 } 235 } 236 237 m.publisher.Publish(context.TODO(), OperationStepProcessed{ 238 StepProcessed: StepProcessed{ 239 StepName: step.Name(), 240 Duration: time.Since(start), 241 When: backoff, 242 Error: err, 243 }, 244 Operation: processedOperation, 245 OldOperation: operation, 246 }) 247 248 // break the loop if: 249 // - the step does not need a retry 250 // - step returns an error 251 // - the loop takes too much time (to not block the worker too long) 252 if backoff == 0 || err != nil || time.Since(begin) > m.cfg.MaxStepProcessingTime { 253 return processedOperation, backoff, err 254 } 255 operation.EventInfof("step %v sleeping for %v", step.Name(), backoff) 256 time.Sleep(backoff / time.Duration(m.speedFactor)) 257 } 258 } 259 260 func (m *StagedManager) callPubSubOutsideSteps(operation *internal.Operation, err error) { 261 logOperation := m.log.WithFields(logrus.Fields{"operation": operation.ID, "error_component": operation.LastError.Component(), "error_reason": operation.LastError.Reason()}) 262 logOperation.Errorf("Last error: %s", operation.LastError.Error()) 263 264 m.publisher.Publish(context.TODO(), OperationStepProcessed{ 265 StepProcessed: StepProcessed{ 266 Duration: time.Since(operation.CreatedAt), 267 Error: err, 268 }, 269 OldOperation: *operation, 270 Operation: *operation, 271 }) 272 }