github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/orchestrator/restart/restart.go

package restart

import (
	"container/list"
	"context"
	"errors"
	"sync"
	"time"

	"github.com/docker/go-events"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/api/defaults"
	"github.com/docker/swarmkit/log"
	"github.com/docker/swarmkit/manager/orchestrator"
	"github.com/docker/swarmkit/manager/state"
	"github.com/docker/swarmkit/manager/state/store"
	gogotypes "github.com/gogo/protobuf/types"
)

const defaultOldTaskTimeout = time.Minute

type restartedInstance struct {
	timestamp time.Time
}

type instanceRestartInfo struct {
	// counter of restarts for this instance.
	totalRestarts uint64
	// Linked list of restartedInstance structs. Only used when
	// Restart.MaxAttempts and Restart.Window are both
	// nonzero.
	restartedInstances *list.List
	// Why is specVersion in this structure and not in the map key? While
	// putting it in the key would be a very simple solution, it wouldn't
	// be easy to clean up map entries corresponding to old specVersions.
	// Making the key version-agnostic and clearing the value whenever the
	// version changes avoids the issue of stale map entries for old
	// versions.
	specVersion api.Version
}

type delayedStart struct {
	// cancel is called to cancel the delayed start.
	cancel func()
	doneCh chan struct{}

	// waiter is set to true if the next restart is waiting for this delay
	// to complete.
	waiter bool
}

// SupervisorInterface is an interface implemented by the Supervisor. It exists
// to make testing easier, by allowing the restart supervisor to be mocked or
// faked where desired.
type SupervisorInterface interface {
	Restart(context.Context, store.Tx, *api.Cluster, *api.Service, api.Task) error
	UpdatableTasksInSlot(context.Context, orchestrator.Slot, *api.Service) orchestrator.Slot
	RecordRestartHistory(orchestrator.SlotTuple, *api.Task)
	DelayStart(context.Context, store.Tx, *api.Task, string, time.Duration, bool) <-chan struct{}
	StartNow(store.Tx, string) error
	Cancel(string)
	CancelAll()
	ClearServiceHistory(string)
}

// Supervisor initiates and manages restarts. It's responsible for
// delaying restarts when applicable.
type Supervisor struct {
	mu               sync.Mutex
	store            *store.MemoryStore
	delays           map[string]*delayedStart
	historyByService map[string]map[orchestrator.SlotTuple]*instanceRestartInfo
	TaskTimeout      time.Duration
}

// NewSupervisor creates a new restart Supervisor.
func NewSupervisor(store *store.MemoryStore) *Supervisor {
	return &Supervisor{
		store:            store,
		delays:           make(map[string]*delayedStart),
		historyByService: make(map[string]map[orchestrator.SlotTuple]*instanceRestartInfo),
		TaskTimeout:      defaultOldTaskTimeout,
	}
}
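// Usage sketch (illustrative only, not part of this package): callers such as
// an orchestrator typically create one Supervisor per manager and invoke
// Restart from inside a store update transaction when they observe a task
// that needs replacing. The names memoryStore, clusterObj, failedTask, and
// ctx below are hypothetical.
//
//	supervisor := NewSupervisor(memoryStore)
//
//	err := memoryStore.Update(func(tx store.Tx) error {
//		service := store.GetService(tx, failedTask.ServiceID)
//		if service == nil {
//			return nil
//		}
//		// Restart marks the old task for shutdown and, if the restart
//		// policy allows, creates a replacement task in READY state and
//		// schedules its delayed start.
//		return supervisor.Restart(ctx, tx, clusterObj, service, *failedTask)
//	})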
func (r *Supervisor) waitRestart(ctx context.Context, oldDelay *delayedStart, cluster *api.Cluster, taskID string) {
	// Wait for the last restart delay to elapse.
	select {
	case <-oldDelay.doneCh:
	case <-ctx.Done():
		return
	}

	// Start the next restart
	err := r.store.Update(func(tx store.Tx) error {
		t := store.GetTask(tx, taskID)
		if t == nil {
			return nil
		}
		if t.DesiredState > api.TaskStateRunning {
			return nil
		}
		service := store.GetService(tx, t.ServiceID)
		if service == nil {
			return nil
		}
		return r.Restart(ctx, tx, cluster, service, *t)
	})

	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to restart task after waiting for previous restart")
	}
}
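// For orientation, the restart path implemented below proceeds roughly as
// follows (a summary of this file's code, not additional behavior):
//
//	Restart(old task)
//	  -> set the old task's desired state to SHUTDOWN
//	  -> shouldRestart? (restart condition, MaxAttempts, Window)
//	  -> create a replacement task with desired state READY
//	  -> RecordRestartHistory(replacement)
//	  -> DelayStart: wait for the restart delay and for the old task to stop
//	  -> StartNow: move the replacement to RUNNING (or COMPLETED for job tasks)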
// Restart initiates a new task to replace t if appropriate under the service's
// restart policy.
func (r *Supervisor) Restart(ctx context.Context, tx store.Tx, cluster *api.Cluster, service *api.Service, t api.Task) error {
	// TODO(aluzzardi): This function should not depend on `service`.

	// Is the old task still in the process of restarting? If so, wait for
	// its restart delay to elapse, to avoid tight restart loops (for
	// example, when the image doesn't exist).
	r.mu.Lock()
	oldDelay, ok := r.delays[t.ID]
	if ok {
		if !oldDelay.waiter {
			oldDelay.waiter = true
			go r.waitRestart(ctx, oldDelay, cluster, t.ID)
		}
		r.mu.Unlock()
		return nil
	}
	r.mu.Unlock()

	// Sanity check: was the task shut down already by a separate call to
	// Restart? If so, we must avoid restarting it, because this will create
	// an extra task. This should never happen unless there is a bug.
	if t.DesiredState > api.TaskStateCompleted {
		return errors.New("Restart called on task that was already shut down")
	}

	t.DesiredState = api.TaskStateShutdown
	err := store.UpdateTask(tx, &t)
	if err != nil {
		log.G(ctx).WithError(err).Errorf("failed to set task desired state to dead")
		return err
	}

	if !r.shouldRestart(ctx, &t, service) {
		return nil
	}

	var restartTask *api.Task

	if orchestrator.IsReplicatedService(service) || orchestrator.IsReplicatedJob(service) {
		restartTask = orchestrator.NewTask(cluster, service, t.Slot, "")
	} else if orchestrator.IsGlobalService(service) || orchestrator.IsGlobalJob(service) {
		restartTask = orchestrator.NewTask(cluster, service, 0, t.NodeID)
	} else {
		log.G(ctx).Error("service not supported by restart supervisor")
		return nil
	}

	if orchestrator.IsReplicatedJob(service) || orchestrator.IsGlobalJob(service) {
		restartTask.JobIteration = &api.Version{
			Index: service.JobStatus.JobIteration.Index,
		}
	}

	n := store.GetNode(tx, t.NodeID)

	restartTask.DesiredState = api.TaskStateReady

	var restartDelay time.Duration
	// Restart delay is not applied to drained nodes
	if n == nil || n.Spec.Availability != api.NodeAvailabilityDrain {
		if t.Spec.Restart != nil && t.Spec.Restart.Delay != nil {
			var err error
			restartDelay, err = gogotypes.DurationFromProto(t.Spec.Restart.Delay)
			if err != nil {
				log.G(ctx).WithError(err).Error("invalid restart delay; using default")
				restartDelay, _ = gogotypes.DurationFromProto(defaults.Service.Task.Restart.Delay)
			}
		} else {
			restartDelay, _ = gogotypes.DurationFromProto(defaults.Service.Task.Restart.Delay)
		}
	}

	waitStop := true

	// Normally we wait for the old task to stop running, but we skip this
	// if the old task is already dead or the node it's assigned to is down.
	if (n != nil && n.Status.State == api.NodeStatus_DOWN) || t.Status.State > api.TaskStateRunning {
		waitStop = false
	}

	if err := store.CreateTask(tx, restartTask); err != nil {
		log.G(ctx).WithError(err).WithField("task.id", restartTask.ID).Error("task create failed")
		return err
	}

	tuple := orchestrator.SlotTuple{
		Slot:      restartTask.Slot,
		ServiceID: restartTask.ServiceID,
		NodeID:    restartTask.NodeID,
	}
	r.RecordRestartHistory(tuple, restartTask)

	r.DelayStart(ctx, tx, &t, restartTask.ID, restartDelay, waitStop)
	return nil
}

// shouldRestart returns true if a task should be restarted according to the
// restart policy.
func (r *Supervisor) shouldRestart(ctx context.Context, t *api.Task, service *api.Service) bool {
	// TODO(aluzzardi): This function should not depend on `service`.
	// There are 3 possible restart policies.
	switch orchestrator.RestartCondition(t) {
	case api.RestartOnAny:
		// we will be restarting; we just need to do a few more checks.
		// however, if the task belongs to a job, then we will treat
		// RestartOnAny the same as RestartOnFailure, as it would be
		// nonsensical to restart completed jobs.
		if orchestrator.IsReplicatedJob(service) || orchestrator.IsGlobalJob(service) {
			// it'd be nice to put a fallthrough here, but we can't fallthrough
			// from inside of an if statement.
			if t.Status.State == api.TaskStateCompleted {
				return false
			}
		}
	case api.RestartOnFailure:
		// we won't restart if the task is in TaskStateCompleted, as this is
		// not a failed state -- it indicates that the task exited with 0
		if t.Status.State == api.TaskStateCompleted {
			return false
		}
	case api.RestartOnNone:
		// RestartOnNone means we just don't restart, ever
		return false
	}

	if t.Spec.Restart == nil || t.Spec.Restart.MaxAttempts == 0 {
		return true
	}

	instanceTuple := orchestrator.SlotTuple{
		Slot:      t.Slot,
		ServiceID: t.ServiceID,
	}

	// Slot is not meaningful for "global" tasks, so they need to be
	// indexed by NodeID.
	if orchestrator.IsGlobalService(service) {
		instanceTuple.NodeID = t.NodeID
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	restartInfo := r.historyByService[t.ServiceID][instanceTuple]
	if restartInfo == nil || (t.SpecVersion != nil && *t.SpecVersion != restartInfo.specVersion) {
		return true
	}

	if t.Spec.Restart.Window == nil || (t.Spec.Restart.Window.Seconds == 0 && t.Spec.Restart.Window.Nanos == 0) {
		return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
	}

	if restartInfo.restartedInstances == nil {
		return true
	}

	window, err := gogotypes.DurationFromProto(t.Spec.Restart.Window)
	if err != nil {
		log.G(ctx).WithError(err).Error("invalid restart lookback window")
		return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
	}

	var timestamp time.Time
	// Prefer the manager's timestamp over the agent's, since manager
	// clocks are more trustworthy.
	if t.Status.AppliedAt != nil {
		timestamp, err = gogotypes.TimestampFromProto(t.Status.AppliedAt)
		if err != nil {
			log.G(ctx).WithError(err).Error("invalid task status AppliedAt timestamp")
			return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
		}
	} else {
		// It's safe to call TimestampFromProto with a nil timestamp
		timestamp, err = gogotypes.TimestampFromProto(t.Status.Timestamp)
		if t.Status.Timestamp == nil || err != nil {
			log.G(ctx).WithError(err).Error("invalid task completion timestamp")
			return restartInfo.totalRestarts < t.Spec.Restart.MaxAttempts
		}
	}
	lookback := timestamp.Add(-window)

	numRestarts := uint64(restartInfo.restartedInstances.Len())

	// Disregard any restarts that happened before the lookback window,
	// and remove them from the linked list since they will no longer
	// be relevant to figuring out if tasks should be restarted going
	// forward.
	var next *list.Element
	for e := restartInfo.restartedInstances.Front(); e != nil; e = next {
		next = e.Next()

		if e.Value.(restartedInstance).timestamp.After(lookback) {
			break
		}
		restartInfo.restartedInstances.Remove(e)
		numRestarts--
	}

	// Ignore restarts that didn't happen before the task we're looking at.
	for e2 := restartInfo.restartedInstances.Back(); e2 != nil; e2 = e2.Prev() {
		if e2.Value.(restartedInstance).timestamp.Before(timestamp) {
			break
		}
		numRestarts--
	}

	if restartInfo.restartedInstances.Len() == 0 {
		restartInfo.restartedInstances = nil
	}

	return numRestarts < t.Spec.Restart.MaxAttempts
}
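// A worked example of the window bookkeeping above, with hypothetical numbers:
// suppose MaxAttempts is 3, Window is 10 minutes, and the task under
// consideration stopped at time T. Restarts recorded at or before T-10m are
// pruned from restartedInstances, and restarts recorded at or after T are not
// counted against this failure. If three or more recorded restarts remain
// strictly between T-10m and T, shouldRestart returns false and no replacement
// task is created.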
// UpdatableTasksInSlot returns the set of tasks that should be passed to the
// updater from this slot, or an empty slice if none should be. An updatable
// slot has either at least one task with desired state <= RUNNING, or its
// most recent task has stopped running and should not be restarted. The latter
// case is for making sure that tasks that shouldn't normally be restarted will
// still be handled by rolling updates when they become outdated. There is a
// special case for rollbacks to make sure that a rollback always takes the
// service to a converged state, instead of ignoring tasks with the original
// spec that stopped running and shouldn't be restarted according to the
// restart policy.
func (r *Supervisor) UpdatableTasksInSlot(ctx context.Context, slot orchestrator.Slot, service *api.Service) orchestrator.Slot {
	if len(slot) < 1 {
		return nil
	}

	var updatable orchestrator.Slot
	for _, t := range slot {
		if t.DesiredState <= api.TaskStateRunning {
			updatable = append(updatable, t)
		}
	}
	if len(updatable) > 0 {
		return updatable
	}

	if service.UpdateStatus != nil && service.UpdateStatus.State == api.UpdateStatus_ROLLBACK_STARTED {
		return nil
	}

	// Find most recent task
	byTimestamp := orchestrator.TasksByTimestamp(slot)
	newestIndex := 0
	for i := 1; i != len(slot); i++ {
		if byTimestamp.Less(newestIndex, i) {
			newestIndex = i
		}
	}

	if !r.shouldRestart(ctx, slot[newestIndex], service) {
		return orchestrator.Slot{slot[newestIndex]}
	}
	return nil
}
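// Example of the behavior above (hypothetical slot contents): if any task in
// the slot is still desired to be running, those tasks are returned for the
// updater to act on. If every task has been shut down, the slot is returned
// only when its most recent task would not be restarted anyway (for instance,
// a COMPLETED task under the on-failure restart condition), so that a rolling
// update can still replace it; otherwise nil is returned and the restart path
// is left to handle the slot.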
// RecordRestartHistory updates the historyByService map to reflect the restart
// of replacementTask.
func (r *Supervisor) RecordRestartHistory(tuple orchestrator.SlotTuple, replacementTask *api.Task) {
	if replacementTask.Spec.Restart == nil || replacementTask.Spec.Restart.MaxAttempts == 0 {
		// No limit on the number of restarts, so no need to record
		// history.
		return
	}

	r.mu.Lock()
	defer r.mu.Unlock()

	serviceID := replacementTask.ServiceID
	if r.historyByService[serviceID] == nil {
		r.historyByService[serviceID] = make(map[orchestrator.SlotTuple]*instanceRestartInfo)
	}
	if r.historyByService[serviceID][tuple] == nil {
		r.historyByService[serviceID][tuple] = &instanceRestartInfo{}
	}

	restartInfo := r.historyByService[serviceID][tuple]

	if replacementTask.SpecVersion != nil && *replacementTask.SpecVersion != restartInfo.specVersion {
		// This task has a different SpecVersion from the one we're
		// tracking. Most likely, the service was updated. Past failures
		// shouldn't count against the new service definition, so clear
		// the history for this instance.
		*restartInfo = instanceRestartInfo{
			specVersion: *replacementTask.SpecVersion,
		}
	}

	restartInfo.totalRestarts++

	if replacementTask.Spec.Restart.Window != nil && (replacementTask.Spec.Restart.Window.Seconds != 0 || replacementTask.Spec.Restart.Window.Nanos != 0) {
		if restartInfo.restartedInstances == nil {
			restartInfo.restartedInstances = list.New()
		}

		// it's okay to call TimestampFromProto with a nil argument
		timestamp, err := gogotypes.TimestampFromProto(replacementTask.Meta.CreatedAt)
		if replacementTask.Meta.CreatedAt == nil || err != nil {
			timestamp = time.Now()
		}

		restartedInstance := restartedInstance{
			timestamp: timestamp,
		}

		restartInfo.restartedInstances.PushBack(restartedInstance)
	}
}

// DelayStart starts a timer that moves the task from READY to RUNNING once:
// - The restart delay has elapsed (if applicable)
// - The old task that it's replacing has stopped running (or this times out)
// It must be called during an Update transaction to ensure that it does not
// miss events. The purpose of the store.Tx argument is to avoid accidental
// calls outside an Update transaction.
func (r *Supervisor) DelayStart(ctx context.Context, _ store.Tx, oldTask *api.Task, newTaskID string, delay time.Duration, waitStop bool) <-chan struct{} {
	ctx, cancel := context.WithCancel(context.Background())
	doneCh := make(chan struct{})

	r.mu.Lock()
	for {
		oldDelay, ok := r.delays[newTaskID]
		if !ok {
			break
		}
		oldDelay.cancel()
		r.mu.Unlock()
		// Note that this channel read should only block for a very
		// short time, because we cancelled the existing delay and
		// that should cause it to stop immediately.
		<-oldDelay.doneCh
		r.mu.Lock()
	}
	r.delays[newTaskID] = &delayedStart{cancel: cancel, doneCh: doneCh}
	r.mu.Unlock()

	var watch chan events.Event
	cancelWatch := func() {}

	waitForTask := waitStop && oldTask != nil && oldTask.Status.State <= api.TaskStateRunning

	if waitForTask {
		// Wait for either the old task to complete, or the old task's
		// node to become unavailable.
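		// The three events watched below unblock the goroutine further
		// down: the old task moving past RUNNING, its node reporting
		// DOWN, or the node being deleted. If none of these arrives,
		// the wait is bounded by r.TaskTimeout.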
		watch, cancelWatch = state.Watch(
			r.store.WatchQueue(),
			api.EventUpdateTask{
				Task:   &api.Task{ID: oldTask.ID, Status: api.TaskStatus{State: api.TaskStateRunning}},
				Checks: []api.TaskCheckFunc{api.TaskCheckID, state.TaskCheckStateGreaterThan},
			},
			api.EventUpdateNode{
				Node:   &api.Node{ID: oldTask.NodeID, Status: api.NodeStatus{State: api.NodeStatus_DOWN}},
				Checks: []api.NodeCheckFunc{api.NodeCheckID, state.NodeCheckState},
			},
			api.EventDeleteNode{
				Node:   &api.Node{ID: oldTask.NodeID},
				Checks: []api.NodeCheckFunc{api.NodeCheckID},
			},
		)
	}

	go func() {
		defer func() {
			cancelWatch()
			r.mu.Lock()
			delete(r.delays, newTaskID)
			r.mu.Unlock()
			close(doneCh)
		}()

		oldTaskTimer := time.NewTimer(r.TaskTimeout)
		defer oldTaskTimer.Stop()

		// Wait for the delay to elapse, if one is specified.
		if delay != 0 {
			select {
			case <-time.After(delay):
			case <-ctx.Done():
				return
			}
		}

		if waitForTask {
			select {
			case <-watch:
			case <-oldTaskTimer.C:
			case <-ctx.Done():
				return
			}
		}

		err := r.store.Update(func(tx store.Tx) error {
			err := r.StartNow(tx, newTaskID)
			if err != nil {
				log.G(ctx).WithError(err).WithField("task.id", newTaskID).Error("moving task out of delayed state failed")
			}
			return nil
		})
		if err != nil {
			log.G(ctx).WithError(err).WithField("task.id", newTaskID).Error("task restart transaction failed")
		}
	}()

	return doneCh
}

// StartNow moves the task into the RUNNING state so it will proceed to start
// up.
func (r *Supervisor) StartNow(tx store.Tx, taskID string) error {
	t := store.GetTask(tx, taskID)
	if t == nil || t.DesiredState >= api.TaskStateRunning {
		return nil
	}

	// only tasks belonging to jobs will have a JobIteration, so this can be
	// used to distinguish whether this is a job task without looking at the
	// service.
	if t.JobIteration != nil {
		t.DesiredState = api.TaskStateCompleted
	} else {
		t.DesiredState = api.TaskStateRunning
	}
	return store.UpdateTask(tx, t)
}

// Cancel cancels a pending restart.
func (r *Supervisor) Cancel(taskID string) {
	r.mu.Lock()
	delay, ok := r.delays[taskID]
	r.mu.Unlock()

	if !ok {
		return
	}

	delay.cancel()
	<-delay.doneCh
}

// CancelAll aborts all pending restarts
func (r *Supervisor) CancelAll() {
	r.mu.Lock()
	for _, delay := range r.delays {
		delay.cancel()
	}
	r.mu.Unlock()
}

// ClearServiceHistory forgets restart history related to a given service ID.
func (r *Supervisor) ClearServiceHistory(serviceID string) {
	r.mu.Lock()
	delete(r.historyByService, serviceID)
	r.mu.Unlock()
}
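// Cancellation note (a summary of the methods above, not additional behavior):
// Cancel blocks until the corresponding DelayStart goroutine has exited,
// because it waits on the delay's done channel, whereas CancelAll only signals
// cancellation and returns without waiting. A hypothetical shutdown sequence:
//
//	supervisor.Cancel(newTaskID) // cancel one pending delayed start and wait for it
//	supervisor.CancelAll()       // signal every pending delayed start without waiting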