github.com/kaisenlinux/docker@v0.0.0-20230510090727-ea55db55fac7/swarmkit/agent/worker.go

package agent

import (
	"context"
	"sync"

	"github.com/docker/swarmkit/agent/exec"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/log"
	"github.com/docker/swarmkit/watch"
	"github.com/sirupsen/logrus"
	bolt "go.etcd.io/bbolt"
)

// Worker implements the core task management logic and persistence. It
// coordinates the set of assignments with the executor.
type Worker interface {
	// Init prepares the worker for task assignment.
	Init(ctx context.Context) error

	// Close performs worker cleanup when no longer needed.
	//
	// It is not safe to call any worker method after Close has been called.
	Close()

	// Assign assigns a complete set of tasks and configs/secrets to a
	// worker. Any items not included in this set will be removed.
	Assign(ctx context.Context, assignments []*api.AssignmentChange) error

	// Update updates an incremental set of tasks and configs/secrets for
	// the worker. Any items not included in either the added or removed
	// sets will remain untouched.
	Update(ctx context.Context, assignments []*api.AssignmentChange) error

	// Listen to updates about tasks controlled by the worker. When first
	// called, the reporter will receive all updates for all tasks controlled
	// by the worker.
	//
	// The listener will be removed if the context is cancelled.
	Listen(ctx context.Context, reporter StatusReporter)

	// Report resends the status of all tasks controlled by this worker.
	Report(ctx context.Context, reporter StatusReporter)

	// Subscribe to log messages matching the subscription.
	Subscribe(ctx context.Context, subscription *api.SubscriptionMessage) error

	// Wait blocks until all task managers have closed.
	Wait(ctx context.Context) error
}
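
// A rough sketch of how an agent might drive a Worker; fullSet, incremental,
// and reporter are placeholders for values normally produced by the agent's
// session logic:
//
//	w := newWorker(db, executor, publisherProvider)
//	if err := w.Init(ctx); err != nil {
//		// handle recovery failure
//	}
//	_ = w.Assign(ctx, fullSet)     // complete snapshot of assignments
//	_ = w.Update(ctx, incremental) // subsequent incremental changes
//	w.Listen(ctx, reporter)        // stream task status updates
//	defer w.Close()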

// statusReporterKey wraps a StatusReporter so it can be used as a map key
// without risking a panic from a non-comparable underlying type.
type statusReporterKey struct {
	StatusReporter
}

type worker struct {
	db                *bolt.DB
	executor          exec.Executor
	publisher         exec.LogPublisher
	listeners         map[*statusReporterKey]struct{}
	taskevents        *watch.Queue
	publisherProvider exec.LogPublisherProvider

	taskManagers map[string]*taskManager
	mu           sync.RWMutex

	closed  bool
	closers sync.WaitGroup // keeps track of active closers
}

func newWorker(db *bolt.DB, executor exec.Executor, publisherProvider exec.LogPublisherProvider) *worker {
	return &worker{
		db:                db,
		executor:          executor,
		publisherProvider: publisherProvider,
		taskevents:        watch.NewQueue(),
		listeners:         make(map[*statusReporterKey]struct{}),
		taskManagers:      make(map[string]*taskManager),
	}
}

// Init prepares the worker for assignments.
func (w *worker) Init(ctx context.Context) error {
	w.mu.Lock()
	defer w.mu.Unlock()

	ctx = log.WithModule(ctx, "worker")

	// TODO(stevvooe): Start task cleanup process.

	// read the tasks from the database and start any task managers that may be needed.
	return w.db.Update(func(tx *bolt.Tx) error {
		return WalkTasks(tx, func(task *api.Task) error {
			if !TaskAssigned(tx, task.ID) {
				// NOTE(stevvooe): If tasks can survive worker restart, we need
				// to start up the controller and ensure they are removed. For
				// now, we can simply remove them from the database.
				if err := DeleteTask(tx, task.ID); err != nil {
					log.G(ctx).WithError(err).Errorf("error removing task %v", task.ID)
				}
				return nil
			}

			status, err := GetTaskStatus(tx, task.ID)
			if err != nil {
				log.G(ctx).WithError(err).Error("unable to read task status")
				return nil
			}

			task.Status = *status // merges the status into the task, ensuring we start at the right point.
			return w.startTask(ctx, tx, task)
		})
	})
}

// Close performs worker cleanup when no longer needed.
func (w *worker) Close() {
	w.mu.Lock()
	w.closed = true
	w.mu.Unlock()

	w.taskevents.Close()
}

// Assign assigns a full set of tasks, configs, and secrets to the worker.
// Any tasks not previously known will be started. Any tasks that are in the task set
// and already running will be updated, if possible. Any tasks currently running on
// the worker outside the task set will be terminated.
// Anything not in the set of assignments will be removed.
func (w *worker) Assign(ctx context.Context, assignments []*api.AssignmentChange) error {
	w.mu.Lock()
	defer w.mu.Unlock()

	if w.closed {
		return ErrClosed
	}

	log.G(ctx).WithFields(logrus.Fields{
		"len(assignments)": len(assignments),
	}).Debug("(*worker).Assign")

	// Need to update dependencies before tasks

	err := reconcileSecrets(ctx, w, assignments, true)
	if err != nil {
		return err
	}

	err = reconcileConfigs(ctx, w, assignments, true)
	if err != nil {
		return err
	}

	return reconcileTaskState(ctx, w, assignments, true)
}

// Update updates the set of tasks, configs, and secrets for the worker.
// Tasks in the added set will be added to the worker, and tasks in the removed set
// will be removed from the worker.
// Secrets in the added set will be added to the worker, and secrets in the removed set
// will be removed from the worker.
// Configs in the added set will be added to the worker, and configs in the removed set
// will be removed from the worker.
func (w *worker) Update(ctx context.Context, assignments []*api.AssignmentChange) error {
	w.mu.Lock()
	defer w.mu.Unlock()

	if w.closed {
		return ErrClosed
	}

	log.G(ctx).WithFields(logrus.Fields{
		"len(assignments)": len(assignments),
	}).Debug("(*worker).Update")

	err := reconcileSecrets(ctx, w, assignments, false)
	if err != nil {
		return err
	}

	err = reconcileConfigs(ctx, w, assignments, false)
	if err != nil {
		return err
	}

	return reconcileTaskState(ctx, w, assignments, false)
}
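
// reconcileTaskState applies the task portion of the assignment changes:
// updated tasks are persisted and handed to their task managers (creating
// managers where needed), and removed tasks are deleted together with their
// managers. When fullSnapshot is true, any task not present in the assignment
// set is also removed and its manager shut down.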
func reconcileTaskState(ctx context.Context, w *worker, assignments []*api.AssignmentChange, fullSnapshot bool) error {
	var (
		updatedTasks []*api.Task
		removedTasks []*api.Task
	)
	for _, a := range assignments {
		if t := a.Assignment.GetTask(); t != nil {
			switch a.Action {
			case api.AssignmentChange_AssignmentActionUpdate:
				updatedTasks = append(updatedTasks, t)
			case api.AssignmentChange_AssignmentActionRemove:
				removedTasks = append(removedTasks, t)
			}
		}
	}

	log.G(ctx).WithFields(logrus.Fields{
		"len(updatedTasks)": len(updatedTasks),
		"len(removedTasks)": len(removedTasks),
	}).Debug("(*worker).reconcileTaskState")

	tx, err := w.db.Begin(true)
	if err != nil {
		log.G(ctx).WithError(err).Error("failed starting transaction against task database")
		return err
	}
	defer tx.Rollback()

	assigned := map[string]struct{}{}

	for _, task := range updatedTasks {
		log.G(ctx).WithFields(
			logrus.Fields{
				"task.id":           task.ID,
				"task.desiredstate": task.DesiredState}).Debug("assigned")
		if err := PutTask(tx, task); err != nil {
			return err
		}

		if err := SetTaskAssignment(tx, task.ID, true); err != nil {
			return err
		}

		if mgr, ok := w.taskManagers[task.ID]; ok {
			if err := mgr.Update(ctx, task); err != nil && err != ErrClosed {
				log.G(ctx).WithError(err).Error("failed updating assigned task")
			}
		} else {
			// we may have seen this task before; grab the status from
			// storage and, if present, use it in place of the provided status.
			status, err := GetTaskStatus(tx, task.ID)
			if err != nil {
				if err != errTaskUnknown {
					return err
				}

				// never seen before, register the provided status
				if err := PutTaskStatus(tx, task.ID, &task.Status); err != nil {
					return err
				}
			} else {
				task.Status = *status
			}
			w.startTask(ctx, tx, task)
		}

		assigned[task.ID] = struct{}{}
	}

	closeManager := func(tm *taskManager) {
		go func(tm *taskManager) {
			defer w.closers.Done()
			// when a task is no longer assigned, we shut down the task manager
			if err := tm.Close(); err != nil {
				log.G(ctx).WithError(err).Error("error closing task manager")
			}
		}(tm)

		// make an attempt at removing. this is best effort. any errors will be
		// retried by the reaper later.
		if err := tm.ctlr.Remove(ctx); err != nil {
			log.G(ctx).WithError(err).WithField("task.id", tm.task.ID).Error("remove task failed")
		}

		if err := tm.ctlr.Close(); err != nil {
			log.G(ctx).WithError(err).Error("error closing controller")
		}
	}

	removeTaskAssignment := func(taskID string) error {
		ctx := log.WithLogger(ctx, log.G(ctx).WithField("task.id", taskID))
		// if a task is no longer assigned, then we do not have to keep track
		// of it. a task will only be unassigned when it is deleted on the
		// manager. instead of setting the task assignment to false, we'll
		// just remove the task now.
		if err := DeleteTask(tx, taskID); err != nil {
			log.G(ctx).WithError(err).Error("error removing de-assigned task")
			return err
		}
		return nil
	}

	// If this was a complete set of assignments, we're going to remove all the remaining
	// tasks.
	if fullSnapshot {
		for id, tm := range w.taskManagers {
			if _, ok := assigned[id]; ok {
				continue
			}

			err := removeTaskAssignment(id)
			if err == nil {
				delete(w.taskManagers, id)
				go closeManager(tm)
			}
		}
	} else {
		// If this was an incremental set of assignments, we're going to remove only the tasks
		// in the removed set
		for _, task := range removedTasks {
			err := removeTaskAssignment(task.ID)
			if err != nil {
				continue
			}

			tm, ok := w.taskManagers[task.ID]
			if ok {
				delete(w.taskManagers, task.ID)
				go closeManager(tm)
			}
		}
	}

	return tx.Commit()
}
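
// reconcileSecrets applies the secret portion of the assignment changes to
// the executor's secret store, provided the executor implements
// exec.SecretsProvider. A full snapshot resets the store before adding the
// updated secrets; an incremental update only removes the secrets in the
// removed set.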
func reconcileSecrets(ctx context.Context, w *worker, assignments []*api.AssignmentChange, fullSnapshot bool) error {
	var (
		updatedSecrets []api.Secret
		removedSecrets []string
	)
	for _, a := range assignments {
		if s := a.Assignment.GetSecret(); s != nil {
			switch a.Action {
			case api.AssignmentChange_AssignmentActionUpdate:
				updatedSecrets = append(updatedSecrets, *s)
			case api.AssignmentChange_AssignmentActionRemove:
				removedSecrets = append(removedSecrets, s.ID)
			}
		}
	}

	secretsProvider, ok := w.executor.(exec.SecretsProvider)
	if !ok {
		if len(updatedSecrets) != 0 || len(removedSecrets) != 0 {
			log.G(ctx).Warn("secrets update ignored; executor does not support secrets")
		}
		return nil
	}

	secrets := secretsProvider.Secrets()

	log.G(ctx).WithFields(logrus.Fields{
		"len(updatedSecrets)": len(updatedSecrets),
		"len(removedSecrets)": len(removedSecrets),
	}).Debug("(*worker).reconcileSecrets")

	// If this was a complete set of secrets, we're going to clear the secrets map and add all of them
	if fullSnapshot {
		secrets.Reset()
	} else {
		secrets.Remove(removedSecrets)
	}
	secrets.Add(updatedSecrets...)

	return nil
}

func reconcileConfigs(ctx context.Context, w *worker, assignments []*api.AssignmentChange, fullSnapshot bool) error {
	var (
		updatedConfigs []api.Config
		removedConfigs []string
	)
	for _, a := range assignments {
		if r := a.Assignment.GetConfig(); r != nil {
			switch a.Action {
			case api.AssignmentChange_AssignmentActionUpdate:
				updatedConfigs = append(updatedConfigs, *r)
			case api.AssignmentChange_AssignmentActionRemove:
				removedConfigs = append(removedConfigs, r.ID)
			}
		}
	}

	configsProvider, ok := w.executor.(exec.ConfigsProvider)
	if !ok {
		if len(updatedConfigs) != 0 || len(removedConfigs) != 0 {
			log.G(ctx).Warn("configs update ignored; executor does not support configs")
		}
		return nil
	}

	configs := configsProvider.Configs()

	log.G(ctx).WithFields(logrus.Fields{
		"len(updatedConfigs)": len(updatedConfigs),
		"len(removedConfigs)": len(removedConfigs),
	}).Debug("(*worker).reconcileConfigs")

	// If this was a complete set of configs, we're going to clear the configs map and add all of them
	if fullSnapshot {
		configs.Reset()
	} else {
		configs.Remove(removedConfigs)
	}
	configs.Add(updatedConfigs...)

	return nil
}
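
// Listen registers reporter to receive status updates for the tasks
// controlled by this worker, starting with the statuses currently stored in
// the task database. The reporter is removed once ctx is cancelled.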
func (w *worker) Listen(ctx context.Context, reporter StatusReporter) {
	w.mu.Lock()
	defer w.mu.Unlock()

	key := &statusReporterKey{reporter}
	w.listeners[key] = struct{}{}

	go func() {
		<-ctx.Done()
		w.mu.Lock()
		defer w.mu.Unlock()
		delete(w.listeners, key) // remove the listener if the context is closed.
	}()

	// report the current statuses to the new listener
	w.reportAllStatuses(ctx, reporter)
}

func (w *worker) Report(ctx context.Context, reporter StatusReporter) {
	w.mu.Lock()
	defer w.mu.Unlock()

	w.reportAllStatuses(ctx, reporter)
}

func (w *worker) reportAllStatuses(ctx context.Context, reporter StatusReporter) {
	if err := w.db.View(func(tx *bolt.Tx) error {
		return WalkTaskStatus(tx, func(id string, status *api.TaskStatus) error {
			return reporter.UpdateTaskStatus(ctx, id, status)
		})
	}); err != nil {
		log.G(ctx).WithError(err).Errorf("failed reporting initial statuses")
	}
}

func (w *worker) startTask(ctx context.Context, tx *bolt.Tx, task *api.Task) error {
	_, err := w.taskManager(ctx, tx, task) // called for the side effect of creating the taskManager.
	if err != nil {
		log.G(ctx).WithError(err).Error("failed to start taskManager")
		// we ignore this error: it gets reported in the taskStatus within
		// `newTaskManager`. We log it here and move on. If there is an
		// attempted restart, the lack of taskManager will have this retry
		// again.
		return nil
	}

	// only publish if controller resolution was successful.
	w.taskevents.Publish(task.Copy())
	return nil
}

func (w *worker) taskManager(ctx context.Context, tx *bolt.Tx, task *api.Task) (*taskManager, error) {
	if tm, ok := w.taskManagers[task.ID]; ok {
		return tm, nil
	}

	tm, err := w.newTaskManager(ctx, tx, task)
	if err != nil {
		return nil, err
	}
	w.taskManagers[task.ID] = tm
	// keep track of active tasks
	w.closers.Add(1)
	return tm, nil
}

func (w *worker) newTaskManager(ctx context.Context, tx *bolt.Tx, task *api.Task) (*taskManager, error) {
	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(logrus.Fields{
		"task.id":    task.ID,
		"service.id": task.ServiceID,
	}))

	ctlr, status, err := exec.Resolve(ctx, task, w.executor)
	if err := w.updateTaskStatus(ctx, tx, task.ID, status); err != nil {
		log.G(ctx).WithError(err).Error("error updating task status after controller resolution")
	}

	if err != nil {
		log.G(ctx).WithError(err).Error("controller resolution failed")
		return nil, err
	}

	return newTaskManager(ctx, task, ctlr, statusReporterFunc(func(ctx context.Context, taskID string, status *api.TaskStatus) error {
		w.mu.RLock()
		defer w.mu.RUnlock()

		return w.db.Update(func(tx *bolt.Tx) error {
			return w.updateTaskStatus(ctx, tx, taskID, status)
		})
	})), nil
}

// updateTaskStatus reports statuses to listeners; the read lock must be held.
func (w *worker) updateTaskStatus(ctx context.Context, tx *bolt.Tx, taskID string, status *api.TaskStatus) error {
	if err := PutTaskStatus(tx, taskID, status); err != nil {
		// we shouldn't fail to put a task status. however, there exists the
		// possibility of a race in which we try to put a task status after the
		// task has been deleted. because this whole contraption is a careful
		// dance of too-tightly-coupled concurrent parts, fixing that race is
		// fraught with hazards. instead, we'll recognize that it can occur,
		// log the error, and then ignore it.
		if err == errTaskUnknown {
			// log at info level. debug logging in docker is already really
			// verbose, so many people disable it. the race that causes this
			// behavior should be very rare, but if it occurs, we should know
			// about it, because if there is some case where it is _not_ rare,
			// then knowing about it will go a long way toward debugging.
			log.G(ctx).Info("attempted to update status for a task that has been removed")
			return nil
		}
		log.G(ctx).WithError(err).Error("failed writing status to disk")
		return err
	}

	// broadcast the task status out.
	for key := range w.listeners {
		if err := key.StatusReporter.UpdateTaskStatus(ctx, taskID, status); err != nil {
			log.G(ctx).WithError(err).Errorf("failed updating status for reporter %v", key.StatusReporter)
		}
	}

	return nil
}

// Subscribe to log messages matching the subscription.
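// Without follow mode, logs for the currently matching tasks are published
// and the call returns once they have finished (or ctx is cancelled). In
// follow mode, the worker also watches task events and streams logs for newly
// matching tasks until ctx is cancelled.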
func (w *worker) Subscribe(ctx context.Context, subscription *api.SubscriptionMessage) error {
	log.G(ctx).Debugf("Received subscription %s (selector: %v)", subscription.ID, subscription.Selector)

	publisher, cancel, err := w.publisherProvider.Publisher(ctx, subscription.ID)
	if err != nil {
		return err
	}
	// Send a close once we're done
	defer cancel()

	match := func(t *api.Task) bool {
		// TODO(aluzzardi): Consider using maps to limit the iterations.
		for _, tid := range subscription.Selector.TaskIDs {
			if t.ID == tid {
				return true
			}
		}

		for _, sid := range subscription.Selector.ServiceIDs {
			if t.ServiceID == sid {
				return true
			}
		}

		for _, nid := range subscription.Selector.NodeIDs {
			if t.NodeID == nid {
				return true
			}
		}

		return false
	}

	wg := sync.WaitGroup{}
	w.mu.Lock()
	for _, tm := range w.taskManagers {
		if match(tm.task) {
			wg.Add(1)
			go func(tm *taskManager) {
				defer wg.Done()
				tm.Logs(ctx, *subscription.Options, publisher)
			}(tm)
		}
	}
	w.mu.Unlock()

	// If follow mode is disabled, wait for the current set of matched tasks
	// to finish publishing logs, then close the subscription by returning.
	if subscription.Options == nil || !subscription.Options.Follow {
		waitCh := make(chan struct{})
		go func() {
			defer close(waitCh)
			wg.Wait()
		}()

		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-waitCh:
			return nil
		}
	}

	// In follow mode, watch for new tasks. Don't close the subscription
	// until it's cancelled.
	ch, cancel := w.taskevents.Watch()
	defer cancel()
	for {
		select {
		case v := <-ch:
			task := v.(*api.Task)
			if match(task) {
				w.mu.RLock()
				tm, ok := w.taskManagers[task.ID]
				w.mu.RUnlock()
				if !ok {
					continue
				}

				go tm.Logs(ctx, *subscription.Options, publisher)
			}
		case <-ctx.Done():
			return ctx.Err()
		}
	}
}

func (w *worker) Wait(ctx context.Context) error {
	ch := make(chan struct{})
	go func() {
		w.closers.Wait()
		close(ch)
	}()

	select {
	case <-ch:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}