// github.com/kaisenlinux/docker@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/orchestrator/update/updater.go
package update

import (
    "context"
    "errors"
    "fmt"
    "reflect"
    "sync"
    "time"

    "github.com/docker/go-events"
    "github.com/docker/swarmkit/api"
    "github.com/docker/swarmkit/api/defaults"
    "github.com/docker/swarmkit/log"
    "github.com/docker/swarmkit/manager/orchestrator"
    "github.com/docker/swarmkit/manager/orchestrator/restart"
    "github.com/docker/swarmkit/manager/state"
    "github.com/docker/swarmkit/manager/state/store"
    "github.com/docker/swarmkit/protobuf/ptypes"
    "github.com/docker/swarmkit/watch"
    gogotypes "github.com/gogo/protobuf/types"
)

// Supervisor supervises a set of updates. It's responsible for keeping track of updates,
// shutting them down and replacing them.
type Supervisor struct {
    store    *store.MemoryStore
    restarts *restart.Supervisor
    updates  map[string]*Updater
    l        sync.Mutex
}

// NewSupervisor creates a new update Supervisor.
func NewSupervisor(store *store.MemoryStore, restartSupervisor *restart.Supervisor) *Supervisor {
    return &Supervisor{
        store:    store,
        updates:  make(map[string]*Updater),
        restarts: restartSupervisor,
    }
}

// Update starts an Update of `slots` belonging to `service` in the background
// and returns immediately. Each slot contains a group of one or more tasks
// occupying the same slot (replicated service) or node (global service). There
// may be more than one task per slot in cases where an update is in progress
// and the new task was started before the old one was shut down. If an update
// for that service was already in progress, it will be cancelled before the
// new one starts.
func (u *Supervisor) Update(ctx context.Context, cluster *api.Cluster, service *api.Service, slots []orchestrator.Slot) {
    u.l.Lock()
    defer u.l.Unlock()

    id := service.ID

    if update, ok := u.updates[id]; ok {
        if reflect.DeepEqual(service.Spec, update.newService.Spec) {
            // There's already an update working towards this goal.
            return
        }
        update.Cancel()
    }

    update := NewUpdater(u.store, u.restarts, cluster, service)
    u.updates[id] = update
    go func() {
        update.Run(ctx, slots)
        u.l.Lock()
        if u.updates[id] == update {
            delete(u.updates, id)
        }
        u.l.Unlock()
    }()
}

// CancelAll cancels all current updates.
func (u *Supervisor) CancelAll() {
    u.l.Lock()
    defer u.l.Unlock()

    for _, update := range u.updates {
        update.Cancel()
    }
}

// Updater updates a set of tasks to a new version.
type Updater struct {
    store      *store.MemoryStore
    watchQueue *watch.Queue
    restarts   *restart.Supervisor

    cluster    *api.Cluster
    newService *api.Service

    updatedTasks   map[string]time.Time // task ID to creation time
    updatedTasksMu sync.Mutex

    // stopChan signals to the state machine to stop running.
    stopChan chan struct{}
    // doneChan is closed when the state machine terminates.
    doneChan chan struct{}
}
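// A rough lifecycle sketch for the Updater defined above (hypothetical caller
// code, not part of the original file; the Supervisor drives it this way from
// a goroutine). memStore, restartSupervisor, cluster, service and slots are
// assumed to already exist:
//
//    updater := NewUpdater(memStore, restartSupervisor, cluster, service)
//    go updater.Run(ctx, slots) // returns once the update completes or is cancelled
//    // ...
//    updater.Cancel() // signals stop and blocks until Run has finished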
// NewUpdater creates a new Updater.
func NewUpdater(store *store.MemoryStore, restartSupervisor *restart.Supervisor, cluster *api.Cluster, newService *api.Service) *Updater {
    return &Updater{
        store:        store,
        watchQueue:   store.WatchQueue(),
        restarts:     restartSupervisor,
        cluster:      cluster.Copy(),
        newService:   newService.Copy(),
        updatedTasks: make(map[string]time.Time),
        stopChan:     make(chan struct{}),
        doneChan:     make(chan struct{}),
    }
}

// Cancel cancels the current update immediately. It blocks until the cancellation is confirmed.
func (u *Updater) Cancel() {
    close(u.stopChan)
    <-u.doneChan
}

// Run starts the update and returns only once it's complete or cancelled.
func (u *Updater) Run(ctx context.Context, slots []orchestrator.Slot) {
    defer close(u.doneChan)

    service := u.newService

    // If the update is in a PAUSED state, we should not do anything.
    if service.UpdateStatus != nil &&
        (service.UpdateStatus.State == api.UpdateStatus_PAUSED ||
            service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_PAUSED) {
        return
    }

    var dirtySlots []orchestrator.Slot
    for _, slot := range slots {
        if u.isSlotDirty(slot) {
            dirtySlots = append(dirtySlots, slot)
        }
    }
    // Abort immediately if all tasks are clean.
    if len(dirtySlots) == 0 {
        if service.UpdateStatus != nil &&
            (service.UpdateStatus.State == api.UpdateStatus_UPDATING ||
                service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED) {
            u.completeUpdate(ctx, service.ID)
        }
        return
    }

    // If there's no update in progress, we are starting one.
    if service.UpdateStatus == nil {
        u.startUpdate(ctx, service.ID)
    }

    var (
        monitoringPeriod time.Duration
        updateConfig     *api.UpdateConfig
    )

    if service.UpdateStatus != nil && service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
        monitoringPeriod, _ = gogotypes.DurationFromProto(defaults.Service.Rollback.Monitor)
        updateConfig = service.Spec.Rollback
        if updateConfig == nil {
            updateConfig = defaults.Service.Rollback
        }
    } else {
        monitoringPeriod, _ = gogotypes.DurationFromProto(defaults.Service.Update.Monitor)
        updateConfig = service.Spec.Update
        if updateConfig == nil {
            updateConfig = defaults.Service.Update
        }
    }

    parallelism := int(updateConfig.Parallelism)
    if updateConfig.Monitor != nil {
        newMonitoringPeriod, err := gogotypes.DurationFromProto(updateConfig.Monitor)
        if err == nil {
            monitoringPeriod = newMonitoringPeriod
        }
    }

    if parallelism == 0 {
        // TODO(aluzzardi): We could try to optimize unlimited parallelism by performing updates in a single
        // goroutine using a batch transaction.
        parallelism = len(dirtySlots)
    }
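    // Parallelism bounds how many slots are updated concurrently by the workers
    // below: for example, Parallelism == 2 allows at most two in-flight slot
    // updates, while Parallelism == 0 updates every dirty slot at once.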
    // Start the workers.
    slotQueue := make(chan orchestrator.Slot)
    wg := sync.WaitGroup{}
    wg.Add(parallelism)
    for i := 0; i < parallelism; i++ {
        go func() {
            u.worker(ctx, slotQueue, updateConfig)
            wg.Done()
        }()
    }

    var failedTaskWatch chan events.Event

    if updateConfig.FailureAction != api.UpdateConfig_CONTINUE {
        var cancelWatch func()
        failedTaskWatch, cancelWatch = state.Watch(
            u.store.WatchQueue(),
            api.EventUpdateTask{
                Task:   &api.Task{ServiceID: service.ID, Status: api.TaskStatus{State: api.TaskStateRunning}},
                Checks: []api.TaskCheckFunc{api.TaskCheckServiceID, state.TaskCheckStateGreaterThan},
            },
        )
        defer cancelWatch()
    }

    stopped := false
    failedTasks := make(map[string]struct{})
    totalFailures := 0

    failureTriggersAction := func(failedTask *api.Task) bool {
        // Ignore tasks we have already seen as failures.
        if _, found := failedTasks[failedTask.ID]; found {
            return false
        }

        // If this failed/completed task is one that we
        // created as part of this update, we should
        // follow the failure action.
        u.updatedTasksMu.Lock()
        startedAt, found := u.updatedTasks[failedTask.ID]
        u.updatedTasksMu.Unlock()

        if found && (startedAt.IsZero() || time.Since(startedAt) <= monitoringPeriod) {
            failedTasks[failedTask.ID] = struct{}{}
            totalFailures++
            if float32(totalFailures)/float32(len(dirtySlots)) > updateConfig.MaxFailureRatio {
                switch updateConfig.FailureAction {
                case api.UpdateConfig_PAUSE:
                    stopped = true
                    message := fmt.Sprintf("update paused due to failure or early termination of task %s", failedTask.ID)
                    u.pauseUpdate(ctx, service.ID, message)
                    return true
                case api.UpdateConfig_ROLLBACK:
                    // Never roll back a rollback
                    if service.UpdateStatus != nil && service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
                        message := fmt.Sprintf("rollback paused due to failure or early termination of task %s", failedTask.ID)
                        u.pauseUpdate(ctx, service.ID, message)
                        return true
                    }
                    stopped = true
                    message := fmt.Sprintf("update rolled back due to failure or early termination of task %s", failedTask.ID)
                    u.rollbackUpdate(ctx, service.ID, message)
                    return true
                }
            }
        }

        return false
    }

slotsLoop:
    for _, slot := range dirtySlots {
    retryLoop:
        for {
            // Wait for a worker to pick up the task or abort the update, whichever comes first.
            select {
            case <-u.stopChan:
                stopped = true
                break slotsLoop
            case ev := <-failedTaskWatch:
                if failureTriggersAction(ev.(api.EventUpdateTask).Task) {
                    break slotsLoop
                }
            case slotQueue <- slot:
                break retryLoop
            }
        }
    }

    close(slotQueue)
    wg.Wait()

    if !stopped {
        // If a delay is set, we need to monitor for a period longer than the delay;
        // otherwise we would leave the monitor loop before the last task is done delaying.
        if updateConfig.Delay >= monitoringPeriod {
            monitoringPeriod = updateConfig.Delay + 1*time.Second
        }
        // Keep watching for task failures for one more monitoringPeriod,
        // before declaring the update complete.
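        // For example, with Delay == 5*time.Second and a 3-second monitor
        // window, the adjustment above stretches monitoringPeriod to 6 seconds,
        // so failures in the last updated batch still fall inside the window.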
        doneMonitoring := time.After(monitoringPeriod)
    monitorLoop:
        for {
            select {
            case <-u.stopChan:
                stopped = true
                break monitorLoop
            case <-doneMonitoring:
                break monitorLoop
            case ev := <-failedTaskWatch:
                if failureTriggersAction(ev.(api.EventUpdateTask).Task) {
                    break monitorLoop
                }
            }
        }
    }

    // TODO(aaronl): Potentially roll back the service if not enough tasks
    // have reached RUNNING by this point.

    if !stopped {
        u.completeUpdate(ctx, service.ID)
    }
}

func (u *Updater) worker(ctx context.Context, queue <-chan orchestrator.Slot, updateConfig *api.UpdateConfig) {
    for slot := range queue {
        // Do we have a task with the new spec in desired state = RUNNING?
        // If so, all we have to do to complete the update is remove the
        // other tasks. Or if we have a task with the new spec that has
        // desired state < RUNNING, advance it to running and remove the
        // other tasks.
        var (
            runningTask *api.Task
            cleanTask   *api.Task
        )
        for _, t := range slot {
            if !u.isTaskDirty(t) {
                if t.DesiredState == api.TaskStateRunning {
                    runningTask = t
                    break
                }
                if t.DesiredState < api.TaskStateRunning {
                    cleanTask = t
                }
            }
        }
        if runningTask != nil {
            if err := u.useExistingTask(ctx, slot, runningTask); err != nil {
                log.G(ctx).WithError(err).Error("update failed")
            }
        } else if cleanTask != nil {
            if err := u.useExistingTask(ctx, slot, cleanTask); err != nil {
                log.G(ctx).WithError(err).Error("update failed")
            }
        } else {
            updated := orchestrator.NewTask(u.cluster, u.newService, slot[0].Slot, "")
            if orchestrator.IsGlobalService(u.newService) {
                updated = orchestrator.NewTask(u.cluster, u.newService, slot[0].Slot, slot[0].NodeID)
            }
            updated.DesiredState = api.TaskStateReady

            if err := u.updateTask(ctx, slot, updated, updateConfig.Order); err != nil {
                log.G(ctx).WithError(err).WithField("task.id", updated.ID).Error("update failed")
            }
        }

        if updateConfig.Delay != 0 {
            select {
            case <-time.After(updateConfig.Delay):
            case <-u.stopChan:
                return
            }
        }
    }
}

func (u *Updater) updateTask(ctx context.Context, slot orchestrator.Slot, updated *api.Task, order api.UpdateConfig_UpdateOrder) error {
    // Kick off the watch before even creating the updated task. This is in order to avoid missing any event.
    taskUpdates, cancel := state.Watch(u.watchQueue, api.EventUpdateTask{
        Task:   &api.Task{ID: updated.ID},
        Checks: []api.TaskCheckFunc{api.TaskCheckID},
    })
    defer cancel()

    // Create an empty entry for this task, so the updater knows a failure
    // should count towards the failure count. The timestamp is added
    // if/when the task reaches RUNNING.
    u.updatedTasksMu.Lock()
    u.updatedTasks[updated.ID] = time.Time{}
    u.updatedTasksMu.Unlock()

    startThenStop := false
    var delayStartCh <-chan struct{}
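    // Depending on updateConfig.Order, either the replacement task is created
    // and allowed to start before the old tasks are shut down (START_FIRST,
    // handled via startThenStop once the new task reports RUNNING), or the old
    // tasks are shut down first inside the batch below and the new task is
    // started afterwards.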
    // Atomically create the updated task and bring down the old one.
    err := u.store.Batch(func(batch *store.Batch) error {
        err := batch.Update(func(tx store.Tx) error {
            if store.GetService(tx, updated.ServiceID) == nil {
                return errors.New("service was deleted")
            }

            return store.CreateTask(tx, updated)
        })
        if err != nil {
            return err
        }

        if order == api.UpdateConfig_START_FIRST {
            delayStartCh = u.restarts.DelayStart(ctx, nil, nil, updated.ID, 0, false)
            startThenStop = true
        } else {
            oldTask, err := u.removeOldTasks(ctx, batch, slot)
            if err != nil {
                return err
            }
            delayStartCh = u.restarts.DelayStart(ctx, nil, oldTask, updated.ID, 0, true)
        }

        return nil
    })
    if err != nil {
        return err
    }

    if delayStartCh != nil {
        select {
        case <-delayStartCh:
        case <-u.stopChan:
            return nil
        }
    }

    // Wait for the new task to come up.
    // TODO(aluzzardi): Consider adding a timeout here.
    for {
        select {
        case e := <-taskUpdates:
            updated = e.(api.EventUpdateTask).Task
            if updated.Status.State >= api.TaskStateRunning {
                u.updatedTasksMu.Lock()
                u.updatedTasks[updated.ID] = time.Now()
                u.updatedTasksMu.Unlock()

                if startThenStop && updated.Status.State == api.TaskStateRunning {
                    err := u.store.Batch(func(batch *store.Batch) error {
                        _, err := u.removeOldTasks(ctx, batch, slot)
                        if err != nil {
                            log.G(ctx).WithError(err).WithField("task.id", updated.ID).Warning("failed to remove old task after starting replacement")
                        }
                        return nil
                    })
                    return err
                }
                return nil
            }
        case <-u.stopChan:
            return nil
        }
    }
}

func (u *Updater) useExistingTask(ctx context.Context, slot orchestrator.Slot, existing *api.Task) error {
    var removeTasks []*api.Task
    for _, t := range slot {
        if t != existing {
            removeTasks = append(removeTasks, t)
        }
    }
    if len(removeTasks) != 0 || existing.DesiredState != api.TaskStateRunning {
        var delayStartCh <-chan struct{}
        err := u.store.Batch(func(batch *store.Batch) error {
            var oldTask *api.Task
            if len(removeTasks) != 0 {
                var err error
                oldTask, err = u.removeOldTasks(ctx, batch, removeTasks)
                if err != nil {
                    return err
                }
            }

            if existing.DesiredState != api.TaskStateRunning {
                delayStartCh = u.restarts.DelayStart(ctx, nil, oldTask, existing.ID, 0, true)
            }
            return nil
        })
        if err != nil {
            return err
        }

        if delayStartCh != nil {
            select {
            case <-delayStartCh:
            case <-u.stopChan:
                return nil
            }
        }
    }

    return nil
}
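// Note: useExistingTask handles slots that already contain a task matching the
// target spec (for example after an interrupted update, or during a rollback);
// such a task is kept or promoted to RUNNING instead of being replaced.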
// removeOldTasks shuts down the given tasks and returns one of the tasks that
// was shut down, or an error.
func (u *Updater) removeOldTasks(ctx context.Context, batch *store.Batch, removeTasks []*api.Task) (*api.Task, error) {
    var (
        lastErr     error
        removedTask *api.Task
    )
    for _, original := range removeTasks {
        if original.DesiredState > api.TaskStateRunning {
            continue
        }
        err := batch.Update(func(tx store.Tx) error {
            t := store.GetTask(tx, original.ID)
            if t == nil {
                return fmt.Errorf("task %s not found while trying to shut it down", original.ID)
            }
            if t.DesiredState > api.TaskStateRunning {
                return fmt.Errorf(
                    "task %s was already shut down when reached by updater (state: %v)",
                    original.ID, t.DesiredState,
                )
            }
            t.DesiredState = api.TaskStateShutdown
            return store.UpdateTask(tx, t)
        })
        if err != nil {
            lastErr = err
        } else {
            removedTask = original
        }
    }

    if removedTask == nil {
        return nil, lastErr
    }
    return removedTask, nil
}

func (u *Updater) isTaskDirty(t *api.Task) bool {
    var n *api.Node
    u.store.View(func(tx store.ReadTx) {
        n = store.GetNode(tx, t.NodeID)
    })
    return orchestrator.IsTaskDirty(u.newService, t, n)
}

func (u *Updater) isSlotDirty(slot orchestrator.Slot) bool {
    return len(slot) > 1 || (len(slot) == 1 && u.isTaskDirty(slot[0]))
}

func (u *Updater) startUpdate(ctx context.Context, serviceID string) {
    err := u.store.Update(func(tx store.Tx) error {
        service := store.GetService(tx, serviceID)
        if service == nil {
            return nil
        }
        if service.UpdateStatus != nil {
            return nil
        }

        service.UpdateStatus = &api.UpdateStatus{
            State:     api.UpdateStatus_UPDATING,
            Message:   "update in progress",
            StartedAt: ptypes.MustTimestampProto(time.Now()),
        }

        return store.UpdateService(tx, service)
    })

    if err != nil {
        log.G(ctx).WithError(err).Errorf("failed to mark update of service %s in progress", serviceID)
    }
}

func (u *Updater) pauseUpdate(ctx context.Context, serviceID, message string) {
    log.G(ctx).Debugf("pausing update of service %s", serviceID)

    err := u.store.Update(func(tx store.Tx) error {
        service := store.GetService(tx, serviceID)
        if service == nil {
            return nil
        }
        if service.UpdateStatus == nil {
            // The service was updated since we started this update
            return nil
        }

        if service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
            service.UpdateStatus.State = api.UpdateStatus_ROLLBACK_PAUSED
        } else {
            service.UpdateStatus.State = api.UpdateStatus_PAUSED
        }
        service.UpdateStatus.Message = message

        return store.UpdateService(tx, service)
    })

    if err != nil {
        log.G(ctx).WithError(err).Errorf("failed to pause update of service %s", serviceID)
    }
}

func (u *Updater) rollbackUpdate(ctx context.Context, serviceID, message string) {
    log.G(ctx).Debugf("starting rollback of service %s", serviceID)

    err := u.store.Update(func(tx store.Tx) error {
        service := store.GetService(tx, serviceID)
        if service == nil {
            return nil
        }
        if service.UpdateStatus == nil {
            // The service was updated since we started this update
            return nil
        }

        service.UpdateStatus.State = api.UpdateStatus_ROLLBACK_STARTED
        service.UpdateStatus.Message = message
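        // Restore the previous spec, and clear it so that this rollback can
        // never itself be rolled back (a later rollback attempt would find no
        // previous spec and fail).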
        if service.PreviousSpec == nil {
            return errors.New("cannot roll back service because no previous spec is available")
        }
        service.Spec = *service.PreviousSpec
        service.SpecVersion = service.PreviousSpecVersion.Copy()
        service.PreviousSpec = nil
        service.PreviousSpecVersion = nil

        return store.UpdateService(tx, service)
    })

    if err != nil {
        log.G(ctx).WithError(err).Errorf("failed to start rollback of service %s", serviceID)
        return
    }
}

func (u *Updater) completeUpdate(ctx context.Context, serviceID string) {
    log.G(ctx).Debugf("update of service %s complete", serviceID)

    err := u.store.Update(func(tx store.Tx) error {
        service := store.GetService(tx, serviceID)
        if service == nil {
            return nil
        }
        if service.UpdateStatus == nil {
            // The service was changed since we started this update
            return nil
        }
        if service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
            service.UpdateStatus.State = api.UpdateStatus_ROLLBACK_COMPLETED
            service.UpdateStatus.Message = "rollback completed"
        } else {
            service.UpdateStatus.State = api.UpdateStatus_COMPLETED
            service.UpdateStatus.Message = "update completed"
        }
        service.UpdateStatus.CompletedAt = ptypes.MustTimestampProto(time.Now())

        return store.UpdateService(tx, service)
    })

    if err != nil {
        log.G(ctx).WithError(err).Errorf("failed to mark update of service %s complete", serviceID)
    }
}
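// A rough usage sketch at the Supervisor level (hypothetical caller code, not
// part of the original file): an orchestrator owning a MemoryStore and a
// restart.Supervisor would drive updates roughly like this, assuming cluster,
// service and slots come from its reconciliation loop:
//
//    updates := NewSupervisor(memStore, restartSupervisor)
//    updates.Update(ctx, cluster, service, slots) // returns immediately; runs in the background
//    // ...
//    updates.CancelAll() // e.g. when the orchestrator shuts down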