github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/agent/agent.go

package agent

import (
	"bytes"
	"context"
	"math/rand"
	"reflect"
	"sync"
	"time"

	"github.com/docker/swarmkit/agent/exec"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/log"
	"github.com/pkg/errors"
)

const (
	initialSessionFailureBackoff = 100 * time.Millisecond
	maxSessionFailureBackoff     = 8 * time.Second
	nodeUpdatePeriod             = 20 * time.Second
)

// Agent implements the primary node functionality for a member of a swarm
// cluster. The primary functionality is to run and report on the status of
// tasks assigned to the node.
type Agent struct {
	config *Config

	// The latest node object state from the manager
	// for this node, as known to the agent.
	node *api.Node

	keys []*api.EncryptionKey

	sessionq chan sessionOperation
	worker   Worker

	started   chan struct{}
	startOnce sync.Once // start only once
	ready     chan struct{}
	leaving   chan struct{}
	leaveOnce sync.Once
	left      chan struct{} // closed after "run" processes "leaving" and will no longer accept new assignments
	stopped   chan struct{} // requests shutdown
	stopOnce  sync.Once     // only allow stop to be called once
	closed    chan struct{} // only closed in run
	err       error         // read only after closed is closed

	nodeUpdatePeriod time.Duration
}
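
// Lifecycle summary (derived from the code below): "started" is closed when
// Start is first called, "ready" once the first session registers, "left"
// after the run loop has processed "leaving" and dropped all assignments, and
// "closed" when the run loop exits.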

// New returns a new agent, ready for task dispatch.
func New(config *Config) (*Agent, error) {
	if err := config.validate(); err != nil {
		return nil, err
	}

	a := &Agent{
		config:           config,
		sessionq:         make(chan sessionOperation),
		started:          make(chan struct{}),
		leaving:          make(chan struct{}),
		left:             make(chan struct{}),
		stopped:          make(chan struct{}),
		closed:           make(chan struct{}),
		ready:            make(chan struct{}),
		nodeUpdatePeriod: nodeUpdatePeriod,
	}

	a.worker = newWorker(config.DB, config.Executor, a)
	return a, nil
}
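
// A minimal usage sketch (illustrative only; constructing a valid Config with
// an executor, DB, and connection broker is out of scope here):
//
//	agent, err := New(config)
//	if err != nil {
//		return err
//	}
//	if err := agent.Start(ctx); err != nil {
//		return err
//	}
//	defer agent.Stop(ctx)
//
//	// wait for the first session to register before relying on the agent
//	<-agent.Ready()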

// Start begins execution of the agent in the provided context, if not already
// started.
//
// Start returns an error if the agent has already started.
func (a *Agent) Start(ctx context.Context) error {
	err := errAgentStarted

	a.startOnce.Do(func() {
		close(a.started)
		go a.run(ctx)
		err = nil // clear error above, only once.
	})

	return err
}

// Leave instructs the agent to leave the cluster. This method will shut down
// assignment processing and remove all assignments from the node.
// Leave blocks until the worker has finished closing all task managers or the
// agent is closed.
func (a *Agent) Leave(ctx context.Context) error {
	select {
	case <-a.started:
	default:
		return errAgentNotStarted
	}

	a.leaveOnce.Do(func() {
		close(a.leaving)
	})

	// Do not call Wait until we have confirmed that the agent is no longer
	// accepting assignments. Starting a worker might race with Wait.
	select {
	case <-a.left:
	case <-a.closed:
		return ErrClosed
	case <-ctx.Done():
		return ctx.Err()
	}

	// agent could be closed while Leave is in progress
	var err error
	ch := make(chan struct{})
	go func() {
		err = a.worker.Wait(ctx)
		close(ch)
	}()

	select {
	case <-ch:
		return err
	case <-a.closed:
		return ErrClosed
	}
}

// Stop shuts down the agent, blocking until full shutdown. If the agent has
// not been started, Stop returns errAgentNotStarted immediately.
func (a *Agent) Stop(ctx context.Context) error {
	select {
	case <-a.started:
	default:
		return errAgentNotStarted
	}

	a.stop()

	// wait till closed or context cancelled
	select {
	case <-a.closed:
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

// stop signals the agent shutdown process, returning true if this call was the
// first to actually shut down the agent.
func (a *Agent) stop() bool {
	var stopped bool
	a.stopOnce.Do(func() {
		close(a.stopped)
		stopped = true
	})

	return stopped
}

// Err returns the error that caused the agent to shut down, or nil. Err blocks
// until the agent is fully shut down.
func (a *Agent) Err(ctx context.Context) error {
	select {
	case <-a.closed:
		return a.err
	case <-ctx.Done():
		return ctx.Err()
	}
}

// Ready returns a channel that will be closed when the agent first becomes ready.
func (a *Agent) Ready() <-chan struct{} {
	return a.ready
}

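// run is the agent's main event loop. It owns the current session and the
// state derived from it (registration, backoff, subscriptions), so all session
// mutations happen on this single goroutine; other goroutines interact with it
// only through channels such as sessionq, leaving, and stopped.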
func (a *Agent) run(ctx context.Context) {
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	defer close(a.closed) // full shutdown.

	ctx = log.WithModule(ctx, "agent")

	log.G(ctx).Debug("(*Agent).run")
	defer log.G(ctx).Debug("(*Agent).run exited")

	nodeTLSInfo := a.config.NodeTLSInfo

	// get the node description
	nodeDescription, err := a.nodeDescriptionWithHostname(ctx, nodeTLSInfo)
	if err != nil {
		log.G(ctx).WithError(err).WithField("agent", a.config.Executor).Error("agent: node description unavailable")
	}
	// nodeUpdateTicker is used to periodically check for updates to the node description
	nodeUpdateTicker := time.NewTicker(a.nodeUpdatePeriod)
	defer nodeUpdateTicker.Stop()

	var (
		backoff       time.Duration
		session       = newSession(ctx, a, backoff, "", nodeDescription) // start the initial session
		registered    = session.registered
		ready         = a.ready // first session ready
		sessionq      chan sessionOperation
		leaving       = a.leaving
		subscriptions = map[string]context.CancelFunc{}
		// subscriptionDone is a channel that allows us to notify ourselves
		// that a log subscription should be finished. this channel is
		// unbuffered, because it is only written to in a goroutine, and
		// therefore cannot block the main execution path.
		subscriptionDone = make(chan string)
	)
	defer func() {
		session.close()
	}()

	if err := a.worker.Init(ctx); err != nil {
		log.G(ctx).WithError(err).Error("worker initialization failed")
		a.err = err
		return // fatal?
	}
	defer a.worker.Close()

	// set up a reliable reporter to call back to us.
	reporter := newStatusReporter(ctx, a)
	defer reporter.Close()

	a.worker.Listen(ctx, reporter)

	updateNode := func() {
		// skip updating if the registration isn't finished
		if registered != nil {
			return
		}
		// get the current node description
		newNodeDescription, err := a.nodeDescriptionWithHostname(ctx, nodeTLSInfo)
		if err != nil {
			log.G(ctx).WithError(err).WithField("agent", a.config.Executor).Error("agent: updated node description unavailable")
		}

		// if newNodeDescription is nil, it will cause a panic when
		// trying to create a session. Typically this can happen
		// if the engine goes down.
		if newNodeDescription == nil {
			return
		}

		// if the node description has changed, update it to the new one
		// and close the session. The old session will be stopped and a
		// new one will be created with the updated description
		if !reflect.DeepEqual(nodeDescription, newNodeDescription) {
			nodeDescription = newNodeDescription
			// close the session
			log.G(ctx).Info("agent: found node update")

			if err := session.close(); err != nil {
				log.G(ctx).WithError(err).Error("agent: closing session failed")
			}
			sessionq = nil
			registered = nil
		}
	}

	for {
		select {
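		// sessionq is non-nil only while a session is registered (see the
		// <-registered case below); operations submitted through withSession
		// are executed here, on the run goroutine, against the current session.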
		case operation := <-sessionq:
			operation.response <- operation.fn(session)
		case <-leaving:
			leaving = nil

			// TODO(stevvooe): Signal to the manager that the node is leaving.

			// when leaving we remove all assignments.
			if err := a.worker.Assign(ctx, nil); err != nil {
				log.G(ctx).WithError(err).Error("failed removing all assignments")
			}

			close(a.left)
		case msg := <-session.assignments:
			// if we have left, accept no more assignments
			if leaving == nil {
				continue
			}

			switch msg.Type {
			case api.AssignmentsMessage_COMPLETE:
				// Need to assign secrets and configs before tasks,
				// because tasks might depend on new secrets or configs
				if err := a.worker.Assign(ctx, msg.Changes); err != nil {
					log.G(ctx).WithError(err).Error("failed to synchronize worker assignments")
				}
			case api.AssignmentsMessage_INCREMENTAL:
				if err := a.worker.Update(ctx, msg.Changes); err != nil {
					log.G(ctx).WithError(err).Error("failed to update worker assignments")
				}
			}
		case msg := <-session.messages:
			if err := a.handleSessionMessage(ctx, msg, nodeTLSInfo); err != nil {
				log.G(ctx).WithError(err).Error("session message handler failed")
			}
		case sub := <-session.subscriptions:
			if sub.Close {
				if cancel, ok := subscriptions[sub.ID]; ok {
					cancel()
				}
				delete(subscriptions, sub.ID)
				continue
			}

			if _, ok := subscriptions[sub.ID]; ok {
				// Duplicate subscription
				continue
			}

			subCtx, subCancel := context.WithCancel(ctx)
			subscriptions[sub.ID] = subCancel
			// NOTE(dperny): for like 3 years, there has been a to do saying
			// "we're tossing the error here, that seems wrong". this is not a
			// to do anymore. 9/10 of these errors are going to be "context
			// deadline exceeded", and the remaining 1/10 obviously doesn't
			// matter or we'd have missed it by now.
			go func() {
				a.worker.Subscribe(subCtx, sub)
				// when the worker finishes the subscription, we should notify
				// ourselves that this has occurred. We cannot rely on getting
				// a Close message from the manager, as any number of things
				// could go wrong (see github.com/moby/moby/issues/39916).
				subscriptionDone <- sub.ID
			}()
		case subID := <-subscriptionDone:
			// subscription may already have been removed. If so, no need to
			// take any action.
			if cancel, ok := subscriptions[subID]; ok {
				cancel()
				delete(subscriptions, subID)
			}
		case <-registered:
			log.G(ctx).Debugln("agent: registered")
			if ready != nil {
				close(ready)
			}
			if a.config.SessionTracker != nil {
				a.config.SessionTracker.SessionEstablished()
			}
			ready = nil
			registered = nil // we only care about this once per session
			backoff = 0      // reset backoff
			sessionq = a.sessionq
			// re-report all task statuses when re-establishing a session
			go a.worker.Report(ctx, reporter)
		case err := <-session.errs:
			// TODO(stevvooe): This may actually block if a session is closed
			// but no error was sent. This must be the only place
			// session.close is called in response to errors, for this to work.
			if err != nil {
				if a.config.SessionTracker != nil {
					a.config.SessionTracker.SessionError(err)
				}

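				// exponential backoff: each consecutive session failure
				// roughly doubles the delay, capped at maxSessionFailureBackoff.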
				backoff = initialSessionFailureBackoff + 2*backoff
				if backoff > maxSessionFailureBackoff {
					backoff = maxSessionFailureBackoff
				}
				log.G(ctx).WithError(err).WithField("backoff", backoff).Errorf("agent: session failed")
			}

			if err := session.close(); err != nil {
				log.G(ctx).WithError(err).Error("agent: closing session failed")
			}
			sessionq = nil
			// if we're here before <-registered, do nothing for that event
			registered = nil
		case <-session.closed:
			if a.config.SessionTracker != nil {
				if err := a.config.SessionTracker.SessionClosed(); err != nil {
					log.G(ctx).WithError(err).Error("agent: exiting")
					a.err = err
					return
				}
			}

			log.G(ctx).Debugf("agent: rebuild session")

			// select a session registration delay from the backoff range.
			delay := time.Duration(0)
			if backoff > 0 {
				delay = time.Duration(rand.Int63n(int64(backoff)))
			}
			session = newSession(ctx, a, delay, session.sessionID, nodeDescription)
			registered = session.registered
		case ev := <-a.config.NotifyTLSChange:
			// the TLS info has changed, so force a check to see if we need to restart the session
			if tlsInfo, ok := ev.(*api.NodeTLSInfo); ok {
				nodeTLSInfo = tlsInfo
				updateNode()
				nodeUpdateTicker.Stop()
				nodeUpdateTicker = time.NewTicker(a.nodeUpdatePeriod)
			}
		case <-nodeUpdateTicker.C:
			// periodically check to see whether the node information has changed, and if so, restart the session
			updateNode()
		case <-a.stopped:
			// TODO(stevvooe): Wait on shutdown and cleanup. May need to pump
			// this loop a few times.
			return
		case <-ctx.Done():
			if a.err == nil {
				a.err = ctx.Err()
			}
			return
		}
	}
}

func (a *Agent) handleSessionMessage(ctx context.Context, message *api.SessionMessage, nti *api.NodeTLSInfo) error {
	seen := map[api.Peer]struct{}{}
	for _, manager := range message.Managers {
		if manager.Peer.Addr == "" {
			continue
		}

		a.config.ConnBroker.Remotes().Observe(*manager.Peer, int(manager.Weight))
		seen[*manager.Peer] = struct{}{}
	}

	var changes *NodeChanges
	if message.Node != nil && (a.node == nil || !nodesEqual(a.node, message.Node)) {
		if a.config.NotifyNodeChange != nil {
			changes = &NodeChanges{Node: message.Node.Copy()}
		}
		a.node = message.Node.Copy()
		if err := a.config.Executor.Configure(ctx, a.node); err != nil {
			log.G(ctx).WithError(err).Error("node configure failed")
		}
	}
	if len(message.RootCA) > 0 && !bytes.Equal(message.RootCA, nti.TrustRoot) {
		if changes == nil {
			changes = &NodeChanges{RootCert: message.RootCA}
		} else {
			changes.RootCert = message.RootCA
		}
	}

	if changes != nil {
		a.config.NotifyNodeChange <- changes
	}

	// prune managers not in the list.
	for peer := range a.config.ConnBroker.Remotes().Weights() {
		if _, ok := seen[peer]; !ok {
			a.config.ConnBroker.Remotes().Remove(peer)
		}
	}

	if message.NetworkBootstrapKeys == nil {
		return nil
	}

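	// If any incoming key has a Lamport time we have not seen before, adopt
	// the manager's full key set and push it to the executor.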
	for _, key := range message.NetworkBootstrapKeys {
		same := false
		for _, agentKey := range a.keys {
			if agentKey.LamportTime == key.LamportTime {
				same = true
			}
		}
		if !same {
			a.keys = message.NetworkBootstrapKeys
			if err := a.config.Executor.SetNetworkBootstrapKeys(a.keys); err != nil {
				return errors.Wrap(err, "configuring network key failed")
			}
		}
	}

	return nil
}

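// sessionOperation pairs a function to run against the current session with a
// channel that receives its result. Operations are funneled through
// Agent.sessionq so that the run loop remains the only goroutine that touches
// the live session; withSession (below) is the helper callers use to submit one.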
type sessionOperation struct {
	fn       func(session *session) error
	response chan error
}

// withSession runs fn with the current session.
func (a *Agent) withSession(ctx context.Context, fn func(session *session) error) error {
	response := make(chan error, 1)
	select {
	case a.sessionq <- sessionOperation{
		fn:       fn,
		response: response,
	}:
		select {
		case err := <-response:
			return err
		case <-a.closed:
			return ErrClosed
		case <-ctx.Done():
			return ctx.Err()
		}
	case <-a.closed:
		return ErrClosed
	case <-ctx.Done():
		return ctx.Err()
	}
}

// UpdateTaskStatus attempts to send a task status update over the current session,
// blocking until the operation is completed.
//
// If an error is returned, the operation should be retried.
func (a *Agent) UpdateTaskStatus(ctx context.Context, taskID string, status *api.TaskStatus) error {
	log.G(ctx).WithField("task.id", taskID).Debug("(*Agent).UpdateTaskStatus")
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	errs := make(chan error, 1)
	if err := a.withSession(ctx, func(session *session) error {
		go func() {
			err := session.sendTaskStatus(ctx, taskID, status)
			if err != nil {
				if err == errTaskUnknown {
					err = nil // dispatcher no longer cares about this task.
				} else {
					log.G(ctx).WithError(err).Error("closing session after fatal error")
					session.sendError(err)
				}
			} else {
				log.G(ctx).Debug("task status reported")
			}

			errs <- err
		}()

		return nil
	}); err != nil {
		return err
	}

	select {
	case err := <-errs:
		return err
	case <-ctx.Done():
		return ctx.Err()
	}
}

// Publisher returns a LogPublisher for the given subscription
// as well as a cancel function that should be called when the log stream
// is completed.
func (a *Agent) Publisher(ctx context.Context, subscriptionID string) (exec.LogPublisher, func(), error) {
	// TODO(stevvooe): The level of coordination here is WAY too much for logs.
	// These should only be best effort and really just buffer until a session is
	// ready. Ideally, they would use a separate connection completely.

	var (
		err       error
		publisher api.LogBroker_PublishLogsClient
	)

	err = a.withSession(ctx, func(session *session) error {
		publisher, err = api.NewLogBrokerClient(session.conn.ClientConn).PublishLogs(ctx)
		return err
	})
	if err != nil {
		return nil, nil, err
	}

	// make a little closure for ending the log stream
	sendCloseMsg := func() {
		// send a close message, to tell the manager our logs are done
		publisher.Send(&api.PublishLogsMessage{
			SubscriptionID: subscriptionID,
			Close:          true,
		})
		// close the stream for real. ignore the return value and the error,
		// because we don't care.
		publisher.CloseAndRecv()
	}

	return exec.LogPublisherFunc(func(ctx context.Context, message api.LogMessage) error {
			select {
			case <-ctx.Done():
				sendCloseMsg()
				return ctx.Err()
			default:
			}

			return publisher.Send(&api.PublishLogsMessage{
				SubscriptionID: subscriptionID,
				Messages:       []api.LogMessage{message},
			})
		}), func() {
			sendCloseMsg()
		}, nil
}
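
// A minimal usage sketch for Publisher (illustrative only; it assumes the
// subscription ID comes from a subscription message delivered by the manager,
// and that exec.LogPublisher exposes a Publish method matching the
// LogPublisherFunc signature used above):
//
//	pub, cancel, err := agent.Publisher(ctx, subscriptionID)
//	if err != nil {
//		return err
//	}
//	defer cancel() // tells the manager the log stream is complete
//	if err := pub.Publish(ctx, logMessage); err != nil {
//		return err
//	}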

// nodeDescriptionWithHostname retrieves the node description from the executor,
// overriding the hostname and TLS info when available.
func (a *Agent) nodeDescriptionWithHostname(ctx context.Context, tlsInfo *api.NodeTLSInfo) (*api.NodeDescription, error) {
	desc, err := a.config.Executor.Describe(ctx)

	// Override hostname and TLS info
	if desc != nil {
		if a.config.Hostname != "" {
			desc.Hostname = a.config.Hostname
		}
		desc.TLSInfo = tlsInfo
		desc.FIPS = a.config.FIPS
	}
	return desc, err
}

// nodesEqual returns true if the node states are functionally equal, ignoring status,
// version and other superfluous fields.
//
// This is used to decide whether or not to propagate a node update to the executor.
func nodesEqual(a, b *api.Node) bool {
	a, b = a.Copy(), b.Copy()

	a.Status, b.Status = api.NodeStatus{}, api.NodeStatus{}
	a.Meta, b.Meta = api.Meta{}, api.Meta{}

	return reflect.DeepEqual(a, b)
}