github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/agent/worker.go (about)

     1  package agent
     2  
     3  import (
     4  	"context"
     5  	"sync"
     6  
     7  	"github.com/docker/swarmkit/agent/exec"
     8  	"github.com/docker/swarmkit/api"
     9  	"github.com/docker/swarmkit/log"
    10  	"github.com/docker/swarmkit/watch"
    11  	"github.com/sirupsen/logrus"
    12  	bolt "go.etcd.io/bbolt"
    13  )
    14  
// Worker implements the core task management logic and persistence. It
// coordinates the set of assignments with the executor.
type Worker interface {
	// Init prepares the worker for task assignment.
	Init(ctx context.Context) error

	// Close performs worker cleanup when no longer needed.
	//
	// It is not safe to call any worker function after that.
	Close()

	// Assign assigns a complete set of tasks and configs/secrets to a
	// worker. Any items not included in this set will be removed.
	Assign(ctx context.Context, assignments []*api.AssignmentChange) error

	// Update applies an incremental set of changes to the tasks or
	// configs/secrets of the worker. Any items included neither in the
	// added nor in the removed set will remain untouched.
	Update(ctx context.Context, assignments []*api.AssignmentChange) error

	// Listen to updates about tasks controlled by the worker. When first
	// called, the reporter will receive all updates for all tasks controlled
	// by the worker.
	//
	// The listener will be removed if the context is cancelled.
	Listen(ctx context.Context, reporter StatusReporter)

	// Report resends the status of all tasks controlled by this worker.
	Report(ctx context.Context, reporter StatusReporter)

	// Subscribe to log messages matching the subscription.
	Subscribe(ctx context.Context, subscription *api.SubscriptionMessage) error

	// Wait blocks until all task managers have closed, or until ctx is
	// done, in which case the context's error is returned.
	Wait(ctx context.Context) error
}
    51  
// statusReporterKey wraps a StatusReporter so that the worker's listener set
// can be keyed by a pointer to the wrapper instead of by the reporter value
// itself. Using an interface value directly as a map key panics at run time
// when its dynamic type is not comparable (a func-based reporter, for
// example); a pointer key is always comparable, which protects insertion
// into and removal from the listener map from that panic.
type statusReporterKey struct {
	StatusReporter
}
    56  
// worker is the Worker implementation constructed by newWorker.
//
// Field usage, as far as this file shows:
//   - db: bolt database holding tasks, their assignment flag and statuses.
//   - executor: handed to exec.Resolve to obtain task controllers, and probed
//     for exec.SecretsProvider / exec.ConfigsProvider support.
//   - publisher: not referenced by the code in this file.
//   - listeners: set of registered status reporters, keyed by wrapper pointer.
//   - taskevents: queue on which started tasks are published; Subscribe
//     watches it in follow mode.
//   - publisherProvider: yields the log publisher for each subscription.
//   - taskManagers: live task managers indexed by task ID.
//   - mu: held by the methods below while they access taskManagers,
//     listeners and closed.
//   - closed: set by Close; Assign and Update return ErrClosed once it is set.
//   - closers: incremented when a task manager is created and decremented
//     once that manager has been closed; Wait blocks on it.
type worker struct {
	db                *bolt.DB
	executor          exec.Executor
	publisher         exec.LogPublisher
	listeners         map[*statusReporterKey]struct{}
	taskevents        *watch.Queue
	publisherProvider exec.LogPublisherProvider

	taskManagers map[string]*taskManager
	mu           sync.RWMutex

	closed  bool
	closers sync.WaitGroup // keeps track of active closers
}
    71  
    72  func newWorker(db *bolt.DB, executor exec.Executor, publisherProvider exec.LogPublisherProvider) *worker {
    73  	return &worker{
    74  		db:                db,
    75  		executor:          executor,
    76  		publisherProvider: publisherProvider,
    77  		taskevents:        watch.NewQueue(),
    78  		listeners:         make(map[*statusReporterKey]struct{}),
    79  		taskManagers:      make(map[string]*taskManager),
    80  	}
    81  }
    82  
    83  // Init prepares the worker for assignments.
    84  func (w *worker) Init(ctx context.Context) error {
    85  	w.mu.Lock()
    86  	defer w.mu.Unlock()
    87  
    88  	ctx = log.WithModule(ctx, "worker")
    89  
    90  	// TODO(stevvooe): Start task cleanup process.
    91  
    92  	// read the tasks from the database and start any task managers that may be needed.
    93  	return w.db.Update(func(tx *bolt.Tx) error {
    94  		return WalkTasks(tx, func(task *api.Task) error {
    95  			if !TaskAssigned(tx, task.ID) {
    96  				// NOTE(stevvooe): If tasks can survive worker restart, we need
    97  				// to startup the controller and ensure they are removed. For
    98  				// now, we can simply remove them from the database.
    99  				if err := DeleteTask(tx, task.ID); err != nil {
   100  					log.G(ctx).WithError(err).Errorf("error removing task %v", task.ID)
   101  				}
   102  				return nil
   103  			}
   104  
   105  			status, err := GetTaskStatus(tx, task.ID)
   106  			if err != nil {
   107  				log.G(ctx).WithError(err).Error("unable to read tasks status")
   108  				return nil
   109  			}
   110  
   111  			task.Status = *status // merges the status into the task, ensuring we start at the right point.
   112  			return w.startTask(ctx, tx, task)
   113  		})
   114  	})
   115  }
   116  
   117  // Close performs worker cleanup when no longer needed.
   118  func (w *worker) Close() {
   119  	w.mu.Lock()
   120  	w.closed = true
   121  	w.mu.Unlock()
   122  
   123  	w.taskevents.Close()
   124  }
   125  
   126  // Assign assigns a full set of tasks, configs, and secrets to the worker.
   127  // Any tasks not previously known will be started. Any tasks that are in the task set
   128  // and already running will be updated, if possible. Any tasks currently running on
   129  // the worker outside the task set will be terminated.
   130  // Anything not in the set of assignments will be removed.
   131  func (w *worker) Assign(ctx context.Context, assignments []*api.AssignmentChange) error {
   132  	w.mu.Lock()
   133  	defer w.mu.Unlock()
   134  
   135  	if w.closed {
   136  		return ErrClosed
   137  	}
   138  
   139  	log.G(ctx).WithFields(logrus.Fields{
   140  		"len(assignments)": len(assignments),
   141  	}).Debug("(*worker).Assign")
   142  
   143  	// Need to update dependencies before tasks
   144  
   145  	err := reconcileSecrets(ctx, w, assignments, true)
   146  	if err != nil {
   147  		return err
   148  	}
   149  
   150  	err = reconcileConfigs(ctx, w, assignments, true)
   151  	if err != nil {
   152  		return err
   153  	}
   154  
   155  	return reconcileTaskState(ctx, w, assignments, true)
   156  }
   157  
   158  // Update updates the set of tasks, configs, and secrets for the worker.
   159  // Tasks in the added set will be added to the worker, and tasks in the removed set
   160  // will be removed from the worker
   161  // Secrets in the added set will be added to the worker, and secrets in the removed set
   162  // will be removed from the worker.
   163  // Configs in the added set will be added to the worker, and configs in the removed set
   164  // will be removed from the worker.
   165  func (w *worker) Update(ctx context.Context, assignments []*api.AssignmentChange) error {
   166  	w.mu.Lock()
   167  	defer w.mu.Unlock()
   168  
   169  	if w.closed {
   170  		return ErrClosed
   171  	}
   172  
   173  	log.G(ctx).WithFields(logrus.Fields{
   174  		"len(assignments)": len(assignments),
   175  	}).Debug("(*worker).Update")
   176  
   177  	err := reconcileSecrets(ctx, w, assignments, false)
   178  	if err != nil {
   179  		return err
   180  	}
   181  
   182  	err = reconcileConfigs(ctx, w, assignments, false)
   183  	if err != nil {
   184  		return err
   185  	}
   186  
   187  	return reconcileTaskState(ctx, w, assignments, false)
   188  }
   189  
// reconcileTaskState applies the task entries of assignments to the worker's
// database and to its in-memory set of task managers. Assign and Update call
// it with w.mu held.
//
// Tasks carried with an update action are persisted, flagged as assigned and
// either forwarded to their existing task manager or started. When
// fullSnapshot is true, every task manager whose ID was not among the
// updated tasks is removed; otherwise only the tasks carried with a remove
// action are removed.
//
// All database writes happen in one read-write transaction that is committed
// at the end; the returned error is either the first storage error
// encountered or the result of the commit.
//
// NOTE(review): changes to w.taskManagers (managers started or dropped) are
// made before the commit and are not undone if the function returns early or
// the commit fails.
func reconcileTaskState(ctx context.Context, w *worker, assignments []*api.AssignmentChange, fullSnapshot bool) error {
	var (
		updatedTasks []*api.Task
		removedTasks []*api.Task
	)
	// Split the task assignments by action; non-task assignments and other
	// actions are ignored here.
	for _, a := range assignments {
		if t := a.Assignment.GetTask(); t != nil {
			switch a.Action {
			case api.AssignmentChange_AssignmentActionUpdate:
				updatedTasks = append(updatedTasks, t)
			case api.AssignmentChange_AssignmentActionRemove:
				removedTasks = append(removedTasks, t)
			}
		}
	}

	log.G(ctx).WithFields(logrus.Fields{
		"len(updatedTasks)": len(updatedTasks),
		"len(removedTasks)": len(removedTasks),
	}).Debug("(*worker).reconcileTaskState")

	tx, err := w.db.Begin(true)
	if err != nil {
		log.G(ctx).WithError(err).Error("failed starting transaction against task database")
		return err
	}
	// Runs on every return path; its result is deliberately not inspected.
	defer tx.Rollback()

	// assigned collects the IDs of all tasks processed as updates, so the
	// full-snapshot pass below can tell which task managers to keep.
	assigned := map[string]struct{}{}

	for _, task := range updatedTasks {
		log.G(ctx).WithFields(
			logrus.Fields{
				"task.id":           task.ID,
				"task.desiredstate": task.DesiredState}).Debug("assigned")
		if err := PutTask(tx, task); err != nil {
			return err
		}

		if err := SetTaskAssignment(tx, task.ID, true); err != nil {
			return err
		}

		if mgr, ok := w.taskManagers[task.ID]; ok {
			// Already managed: forward the new task definition. Failures
			// other than ErrClosed are logged; none abort the reconcile.
			if err := mgr.Update(ctx, task); err != nil && err != ErrClosed {
				log.G(ctx).WithError(err).Error("failed updating assigned task")
			}
		} else {
			// we may have still seen the task, let's grab the status from
			// storage and replace it with our status, if we have it.
			status, err := GetTaskStatus(tx, task.ID)
			if err != nil {
				if err != errTaskUnknown {
					return err
				}

				// never seen before, register the provided status
				if err := PutTaskStatus(tx, task.ID, &task.Status); err != nil {
					return err
				}
			} else {
				task.Status = *status
			}
			// startTask logs its own failures and always returns nil, so
			// the result is not checked.
			w.startTask(ctx, tx, task)
		}

		assigned[task.ID] = struct{}{}
	}

	// closeManager shuts down a task manager that is no longer assigned:
	// tm.Close runs in its own goroutine (which also marks the manager as
	// done on w.closers), while the controller is removed and closed on the
	// calling goroutine.
	closeManager := func(tm *taskManager) {
		go func(tm *taskManager) {
			defer w.closers.Done()
			// when a task is no longer assigned, we shutdown the task manager
			if err := tm.Close(); err != nil {
				log.G(ctx).WithError(err).Error("error closing task manager")
			}
		}(tm)

		// make an attempt at removing. this is best effort. any errors will be
		// retried by the reaper later.
		if err := tm.ctlr.Remove(ctx); err != nil {
			log.G(ctx).WithError(err).WithField("task.id", tm.task.ID).Error("remove task failed")
		}

		if err := tm.ctlr.Close(); err != nil {
			log.G(ctx).WithError(err).Error("error closing controller")
		}
	}

	// removeTaskAssignment deletes the task from the database within the
	// current transaction, logging and returning any error.
	removeTaskAssignment := func(taskID string) error {
		ctx := log.WithLogger(ctx, log.G(ctx).WithField("task.id", taskID))
		// if a task is no longer assigned, then we do not have to keep track
		// of it. a task will only be unassigned when it is deleted on the
		// manager. instead of calling SetTaskAssignment to mark it as
		// unassigned, we'll just remove the task now.
		if err := DeleteTask(tx, taskID); err != nil {
			log.G(ctx).WithError(err).Error("error removing de-assigned task")
			return err
		}
		return nil
	}

	// If this was a complete set of assignments, we're going to remove all the remaining
	// tasks.
	if fullSnapshot {
		for id, tm := range w.taskManagers {
			if _, ok := assigned[id]; ok {
				continue
			}

			// A failed delete leaves the manager in place; the error has
			// already been logged by removeTaskAssignment.
			err := removeTaskAssignment(id)
			if err == nil {
				delete(w.taskManagers, id)
				// Run the controller teardown off the caller's goroutine.
				go closeManager(tm)
			}
		}
	} else {
		// If this was an incremental set of assignments, we're going to remove only the tasks
		// in the removed set
		for _, task := range removedTasks {
			err := removeTaskAssignment(task.ID)
			if err != nil {
				continue
			}

			// A removed task may not have a manager, in which case the
			// database delete above is all there is to do.
			tm, ok := w.taskManagers[task.ID]
			if ok {
				delete(w.taskManagers, task.ID)
				go closeManager(tm)
			}
		}
	}

	return tx.Commit()
}
   325  
   326  func reconcileSecrets(ctx context.Context, w *worker, assignments []*api.AssignmentChange, fullSnapshot bool) error {
   327  	var (
   328  		updatedSecrets []api.Secret
   329  		removedSecrets []string
   330  	)
   331  	for _, a := range assignments {
   332  		if s := a.Assignment.GetSecret(); s != nil {
   333  			switch a.Action {
   334  			case api.AssignmentChange_AssignmentActionUpdate:
   335  				updatedSecrets = append(updatedSecrets, *s)
   336  			case api.AssignmentChange_AssignmentActionRemove:
   337  				removedSecrets = append(removedSecrets, s.ID)
   338  			}
   339  
   340  		}
   341  	}
   342  
   343  	secretsProvider, ok := w.executor.(exec.SecretsProvider)
   344  	if !ok {
   345  		if len(updatedSecrets) != 0 || len(removedSecrets) != 0 {
   346  			log.G(ctx).Warn("secrets update ignored; executor does not support secrets")
   347  		}
   348  		return nil
   349  	}
   350  
   351  	secrets := secretsProvider.Secrets()
   352  
   353  	log.G(ctx).WithFields(logrus.Fields{
   354  		"len(updatedSecrets)": len(updatedSecrets),
   355  		"len(removedSecrets)": len(removedSecrets),
   356  	}).Debug("(*worker).reconcileSecrets")
   357  
   358  	// If this was a complete set of secrets, we're going to clear the secrets map and add all of them
   359  	if fullSnapshot {
   360  		secrets.Reset()
   361  	} else {
   362  		secrets.Remove(removedSecrets)
   363  	}
   364  	secrets.Add(updatedSecrets...)
   365  
   366  	return nil
   367  }
   368  
   369  func reconcileConfigs(ctx context.Context, w *worker, assignments []*api.AssignmentChange, fullSnapshot bool) error {
   370  	var (
   371  		updatedConfigs []api.Config
   372  		removedConfigs []string
   373  	)
   374  	for _, a := range assignments {
   375  		if r := a.Assignment.GetConfig(); r != nil {
   376  			switch a.Action {
   377  			case api.AssignmentChange_AssignmentActionUpdate:
   378  				updatedConfigs = append(updatedConfigs, *r)
   379  			case api.AssignmentChange_AssignmentActionRemove:
   380  				removedConfigs = append(removedConfigs, r.ID)
   381  			}
   382  
   383  		}
   384  	}
   385  
   386  	configsProvider, ok := w.executor.(exec.ConfigsProvider)
   387  	if !ok {
   388  		if len(updatedConfigs) != 0 || len(removedConfigs) != 0 {
   389  			log.G(ctx).Warn("configs update ignored; executor does not support configs")
   390  		}
   391  		return nil
   392  	}
   393  
   394  	configs := configsProvider.Configs()
   395  
   396  	log.G(ctx).WithFields(logrus.Fields{
   397  		"len(updatedConfigs)": len(updatedConfigs),
   398  		"len(removedConfigs)": len(removedConfigs),
   399  	}).Debug("(*worker).reconcileConfigs")
   400  
   401  	// If this was a complete set of configs, we're going to clear the configs map and add all of them
   402  	if fullSnapshot {
   403  		configs.Reset()
   404  	} else {
   405  		configs.Remove(removedConfigs)
   406  	}
   407  	configs.Add(updatedConfigs...)
   408  
   409  	return nil
   410  }
   411  
   412  func (w *worker) Listen(ctx context.Context, reporter StatusReporter) {
   413  	w.mu.Lock()
   414  	defer w.mu.Unlock()
   415  
   416  	key := &statusReporterKey{reporter}
   417  	w.listeners[key] = struct{}{}
   418  
   419  	go func() {
   420  		<-ctx.Done()
   421  		w.mu.Lock()
   422  		defer w.mu.Unlock()
   423  		delete(w.listeners, key) // remove the listener if the context is closed.
   424  	}()
   425  
   426  	// report the current statuses to the new listener
   427  	w.reportAllStatuses(ctx, reporter)
   428  }
   429  
// Report resends every task status stored in the worker's database to
// reporter. The worker lock is held for the duration of the report.
func (w *worker) Report(ctx context.Context, reporter StatusReporter) {
	w.mu.Lock()
	defer w.mu.Unlock()

	w.reportAllStatuses(ctx, reporter)
}
   436  
   437  func (w *worker) reportAllStatuses(ctx context.Context, reporter StatusReporter) {
   438  	if err := w.db.View(func(tx *bolt.Tx) error {
   439  		return WalkTaskStatus(tx, func(id string, status *api.TaskStatus) error {
   440  			return reporter.UpdateTaskStatus(ctx, id, status)
   441  		})
   442  	}); err != nil {
   443  		log.G(ctx).WithError(err).Errorf("failed reporting initial statuses")
   444  	}
   445  }
   446  
   447  func (w *worker) startTask(ctx context.Context, tx *bolt.Tx, task *api.Task) error {
   448  	_, err := w.taskManager(ctx, tx, task) // side-effect taskManager creation.
   449  
   450  	if err != nil {
   451  		log.G(ctx).WithError(err).Error("failed to start taskManager")
   452  		// we ignore this error: it gets reported in the taskStatus within
   453  		// `newTaskManager`. We log it here and move on. If their is an
   454  		// attempted restart, the lack of taskManager will have this retry
   455  		// again.
   456  		return nil
   457  	}
   458  
   459  	// only publish if controller resolution was successful.
   460  	w.taskevents.Publish(task.Copy())
   461  	return nil
   462  }
   463  
   464  func (w *worker) taskManager(ctx context.Context, tx *bolt.Tx, task *api.Task) (*taskManager, error) {
   465  	if tm, ok := w.taskManagers[task.ID]; ok {
   466  		return tm, nil
   467  	}
   468  
   469  	tm, err := w.newTaskManager(ctx, tx, task)
   470  	if err != nil {
   471  		return nil, err
   472  	}
   473  	w.taskManagers[task.ID] = tm
   474  	// keep track of active tasks
   475  	w.closers.Add(1)
   476  	return tm, nil
   477  }
   478  
   479  func (w *worker) newTaskManager(ctx context.Context, tx *bolt.Tx, task *api.Task) (*taskManager, error) {
   480  	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(logrus.Fields{
   481  		"task.id":    task.ID,
   482  		"service.id": task.ServiceID,
   483  	}))
   484  
   485  	ctlr, status, err := exec.Resolve(ctx, task, w.executor)
   486  	if err := w.updateTaskStatus(ctx, tx, task.ID, status); err != nil {
   487  		log.G(ctx).WithError(err).Error("error updating task status after controller resolution")
   488  	}
   489  
   490  	if err != nil {
   491  		log.G(ctx).WithError(err).Error("controller resolution failed")
   492  		return nil, err
   493  	}
   494  
   495  	return newTaskManager(ctx, task, ctlr, statusReporterFunc(func(ctx context.Context, taskID string, status *api.TaskStatus) error {
   496  		w.mu.RLock()
   497  		defer w.mu.RUnlock()
   498  
   499  		return w.db.Update(func(tx *bolt.Tx) error {
   500  			return w.updateTaskStatus(ctx, tx, taskID, status)
   501  		})
   502  	})), nil
   503  }
   504  
   505  // updateTaskStatus reports statuses to listeners, read lock must be held.
   506  func (w *worker) updateTaskStatus(ctx context.Context, tx *bolt.Tx, taskID string, status *api.TaskStatus) error {
   507  	if err := PutTaskStatus(tx, taskID, status); err != nil {
   508  		// we shouldn't fail to put a task status. however, there exists the
   509  		// possibility of a race in which we try to put a task status after the
   510  		// task has been deleted. because this whole contraption is a careful
   511  		// dance of too-tightly-coupled concurrent parts, fixing tht race is
   512  		// fraught with hazards. instead, we'll recognize that it can occur,
   513  		// log the error, and then ignore it.
   514  		if err == errTaskUnknown {
   515  			// log at info level. debug logging in docker is already really
   516  			// verbose, so many people disable it. the race that causes this
   517  			// behavior should be very rare, but if it occurs, we should know
   518  			// about it, because if there is some case where it is _not_ rare,
   519  			// then knowing about it will go a long way toward debugging.
   520  			log.G(ctx).Info("attempted to update status for a task that has been removed")
   521  			return nil
   522  		}
   523  		log.G(ctx).WithError(err).Error("failed writing status to disk")
   524  		return err
   525  	}
   526  
   527  	// broadcast the task status out.
   528  	for key := range w.listeners {
   529  		if err := key.StatusReporter.UpdateTaskStatus(ctx, taskID, status); err != nil {
   530  			log.G(ctx).WithError(err).Errorf("failed updating status for reporter %v", key.StatusReporter)
   531  		}
   532  	}
   533  
   534  	return nil
   535  }
   536  
   537  // Subscribe to log messages matching the subscription.
   538  func (w *worker) Subscribe(ctx context.Context, subscription *api.SubscriptionMessage) error {
   539  	log.G(ctx).Debugf("Received subscription %s (selector: %v)", subscription.ID, subscription.Selector)
   540  
   541  	publisher, cancel, err := w.publisherProvider.Publisher(ctx, subscription.ID)
   542  	if err != nil {
   543  		return err
   544  	}
   545  	// Send a close once we're done
   546  	defer cancel()
   547  
   548  	match := func(t *api.Task) bool {
   549  		// TODO(aluzzardi): Consider using maps to limit the iterations.
   550  		for _, tid := range subscription.Selector.TaskIDs {
   551  			if t.ID == tid {
   552  				return true
   553  			}
   554  		}
   555  
   556  		for _, sid := range subscription.Selector.ServiceIDs {
   557  			if t.ServiceID == sid {
   558  				return true
   559  			}
   560  		}
   561  
   562  		for _, nid := range subscription.Selector.NodeIDs {
   563  			if t.NodeID == nid {
   564  				return true
   565  			}
   566  		}
   567  
   568  		return false
   569  	}
   570  
   571  	wg := sync.WaitGroup{}
   572  	w.mu.Lock()
   573  	for _, tm := range w.taskManagers {
   574  		if match(tm.task) {
   575  			wg.Add(1)
   576  			go func(tm *taskManager) {
   577  				defer wg.Done()
   578  				tm.Logs(ctx, *subscription.Options, publisher)
   579  			}(tm)
   580  		}
   581  	}
   582  	w.mu.Unlock()
   583  
   584  	// If follow mode is disabled, wait for the current set of matched tasks
   585  	// to finish publishing logs, then close the subscription by returning.
   586  	if subscription.Options == nil || !subscription.Options.Follow {
   587  		waitCh := make(chan struct{})
   588  		go func() {
   589  			defer close(waitCh)
   590  			wg.Wait()
   591  		}()
   592  
   593  		select {
   594  		case <-ctx.Done():
   595  			return ctx.Err()
   596  		case <-waitCh:
   597  			return nil
   598  		}
   599  	}
   600  
   601  	// In follow mode, watch for new tasks. Don't close the subscription
   602  	// until it's cancelled.
   603  	ch, cancel := w.taskevents.Watch()
   604  	defer cancel()
   605  	for {
   606  		select {
   607  		case v := <-ch:
   608  			task := v.(*api.Task)
   609  			if match(task) {
   610  				w.mu.RLock()
   611  				tm, ok := w.taskManagers[task.ID]
   612  				w.mu.RUnlock()
   613  				if !ok {
   614  					continue
   615  				}
   616  
   617  				go tm.Logs(ctx, *subscription.Options, publisher)
   618  			}
   619  		case <-ctx.Done():
   620  			return ctx.Err()
   621  		}
   622  	}
   623  }
   624  
   625  func (w *worker) Wait(ctx context.Context) error {
   626  	ch := make(chan struct{})
   627  	go func() {
   628  		w.closers.Wait()
   629  		close(ch)
   630  	}()
   631  
   632  	select {
   633  	case <-ch:
   634  		return nil
   635  	case <-ctx.Done():
   636  		return ctx.Err()
   637  	}
   638  }