github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/uniter/resolver/loop.go

// Copyright 2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package resolver

import (
	"time"

	jujucharm "github.com/juju/charm/v12"
	"github.com/juju/charm/v12/hooks"
	"github.com/juju/errors"
	"github.com/juju/mutex/v2"

	"github.com/juju/juju/core/lxdprofile"
	"github.com/juju/juju/worker/fortress"
	"github.com/juju/juju/worker/uniter/operation"
	"github.com/juju/juju/worker/uniter/remotestate"
)

// ErrLoopAborted is used to signal that the loop is exiting because it
// received a value on its config's Abort chan.
var ErrLoopAborted = errors.New("resolver loop aborted")

// ErrDoNotProceed is used to distinguish behaviour from
// resolver.ErrNoOperation, i.e. "do not run any operations" versus
// "this resolver has no operations to run".
var ErrDoNotProceed = errors.New("do not proceed")

// logger is here to stop the desire to create a package level Logger.
// Don't do this; instead use the one passed into the LoopConfig.
type logger interface{}

var _ logger = struct{}{}

// Logger represents the logging methods used in this package.
type Logger interface {
	Errorf(string, ...interface{})
	Debugf(string, ...interface{})
	Tracef(string, ...interface{})
	Warningf(string, ...interface{})
}

// LoopConfig contains configuration parameters for the resolver loop.
type LoopConfig struct {
	Resolver      Resolver
	Watcher       remotestate.Watcher
	Executor      operation.Executor
	Factory       operation.Factory
	Abort         <-chan struct{}
	OnIdle        func() error
	CharmDirGuard fortress.Guard
	CharmDir      string
	Logger        Logger
}

// Loop repeatedly waits for remote state changes, feeding the local and
// remote state to the provided Resolver to generate Operations which are
// then run with the provided Executor.
//
// The provided "onIdle" function will be called when the loop is waiting
// for remote state changes due to a lack of work to perform. It will not
// be called when a change is anticipated (i.e. due to ErrWaiting).
//
// The resolver loop can be controlled in the following ways:
//   - if the "abort" channel is signalled, then the loop will
//     exit with ErrLoopAborted
//   - if the resolver returns ErrWaiting, then no operations
//     will be executed until the remote state has changed
//     again
//   - if the resolver returns ErrNoOperation, then "onIdle"
//     will be invoked and the loop will wait until the remote
//     state has changed again
//   - if the resolver, onIdle, or executor return some other
//     error, the loop will exit immediately
func Loop(cfg LoopConfig, localState *LocalState) error {
	rf := &resolverOpFactory{Factory: cfg.Factory, LocalState: localState}

	// Initialize charmdir availability before entering the loop in case we're recovering from a restart.
	err := updateCharmDir(cfg.Executor.State(), cfg.CharmDirGuard, cfg.Abort, cfg.Logger)
	if err != nil {
		return errors.Trace(err)
	}

	// If we're restarting the loop, ensure any pending charm upgrade is run
	// before continuing.
	err = checkCharmInstallUpgrade(cfg.Logger, cfg.CharmDir, cfg.Watcher.Snapshot(), rf, cfg.Executor)
	if err != nil {
		return errors.Trace(err)
	}

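	// fire is buffered with capacity one: the snapshot-forwarding
	// goroutine below does a non-blocking send on it whenever it consumes
	// a RemoteStateChanged event, so the select at the bottom of the loop
	// still wakes up even though the event was consumed by that goroutine.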
	fire := make(chan struct{}, 1)
	for {
		rf.RemoteState = cfg.Watcher.Snapshot()
		rf.LocalState.State = cfg.Executor.State()

		if localState.HookWasShutdown && rf.RemoteState.ContainerRunningStatus != nil {
			agentShutdown := rf.RemoteState.Shutdown
			if !agentShutdown {
				agentShutdown = maybeAgentShutdown(cfg)
			}
			if !agentShutdown {
				cfg.Logger.Warningf("last %q hook was killed, but agent still alive", localState.Hook.Kind)
			}
		}

		op, err := cfg.Resolver.NextOp(*rf.LocalState, rf.RemoteState, rf)
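		// Run operations until the resolver reports that it is waiting
		// (ErrWaiting), has nothing to do (ErrNoOperation), or fails;
		// those cases are handled by the switch after this loop.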
		for err == nil {
			// Send remote state changes to running operations.
			remoteStateChanged := make(chan remotestate.Snapshot)
			done := make(chan struct{})
			go func() {
				var rs chan remotestate.Snapshot
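				// rs stays nil until a remote state change event has been
				// seen; sending on a nil channel blocks forever, so the
				// snapshot send case below is disabled until a change
				// actually arrives.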
				for {
					select {
					case <-cfg.Watcher.RemoteStateChanged():
						// We consumed a remote state change event
						// so we need a way to trigger the select below
						// in case it was a new operation.
						select {
						case fire <- struct{}{}:
						default:
						}
						rs = remoteStateChanged
					case rs <- cfg.Watcher.Snapshot():
						rs = nil
					case <-done:
						return
					}
				}
			}()

			cfg.Logger.Tracef("running op: %v", op)
			if err := cfg.Executor.Run(op, remoteStateChanged); err != nil {
				close(done)

				if errors.Cause(err) == mutex.ErrCancelled {
					// If the lock acquisition was cancelled (such as when the
					// migration-inactive flag drops) we do not want the
					// resolver to surface that error. This puts the agent into
					// the "failed" state, which causes the initial migration
					// validation phase to fail.
					// The safest thing to do is to bounce the loop and
					// reevaluate our state, which is what happens upon a
					// fortress error anyway (uniter.TranslateFortressErrors).
					cfg.Logger.Warningf("executor lock acquisition cancelled")
					return ErrRestart
				}
				return errors.Trace(err)
			}
			close(done)

			// Refresh snapshot, in case remote state
			// changed between operations.
			rf.RemoteState = cfg.Watcher.Snapshot()
			rf.LocalState.State = cfg.Executor.State()

			err = updateCharmDir(rf.LocalState.State, cfg.CharmDirGuard, cfg.Abort, cfg.Logger)
			if err != nil {
				return errors.Trace(err)
			}

			op, err = cfg.Resolver.NextOp(*rf.LocalState, rf.RemoteState, rf)
		}

		switch errors.Cause(err) {
		case nil:
		case ErrWaiting:
			// If a resolver is waiting for events to
			// complete, the agent is not idle.
		case ErrNoOperation:
			if cfg.OnIdle != nil {
				if err := cfg.OnIdle(); err != nil {
					return errors.Trace(err)
				}
			}
		default:
			return err
		}

		select {
		case <-cfg.Abort:
			return ErrLoopAborted
		case <-cfg.Watcher.RemoteStateChanged():
		case <-fire:
		}
	}
}

// maybeAgentShutdown returns true if the agent was killed by a
// SIGTERM. If the remote state does not yet report a shutdown at the
// time of calling, it waits a short time for the status to be updated.
func maybeAgentShutdown(cfg LoopConfig) bool {
	fire := make(chan struct{}, 1)
	remoteStateChanged := make(chan remotestate.Snapshot)
	done := make(chan struct{})
	defer close(done)
	go func() {
		var rs chan remotestate.Snapshot
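		// As in Loop above, rs stays nil (disabling the snapshot send
		// case) until a remote state change event has been consumed.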
		for {
			select {
			case <-cfg.Watcher.RemoteStateChanged():
				// We consumed a remote state change event
				// so we need a way to trigger the select below
				// in case it was a new operation.
				select {
				case fire <- struct{}{}:
				default:
				}
				rs = remoteStateChanged
			case rs <- cfg.Watcher.Snapshot():
				rs = nil
			case <-done:
				return
			}
		}
	}()
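	// Wait for a snapshot that reports a shutdown, giving up if no
	// change arrives within three seconds.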
	for {
		select {
		case rs := <-remoteStateChanged:
			if rs.Shutdown {
				return true
			}
		case <-time.After(3 * time.Second):
			return false
		}
	}
}

// updateCharmDir sets charm directory availability for sharing among
// concurrent workers according to local operation state.
func updateCharmDir(opState operation.State, guard fortress.Guard, abort fortress.Abort, logger Logger) error {
	var changing bool

	// Determine if the charm content is changing.
	if opState.Kind == operation.Install || opState.Kind == operation.Upgrade {
		changing = true
	} else if opState.Kind == operation.RunHook && opState.Hook != nil && opState.Hook.Kind == hooks.UpgradeCharm {
		changing = true
	}

	available := opState.Started && !opState.Stopped && !changing
	logger.Tracef("charmdir: available=%v opState: started=%v stopped=%v changing=%v",
		available, opState.Started, opState.Stopped, changing)
	if available {
		return guard.Unlock()
	}
	return guard.Lockdown(abort)
}

func checkCharmInstallUpgrade(logger Logger, charmDir string, remote remotestate.Snapshot, rf *resolverOpFactory, ex operation.Executor) error {
	// If we restarted due to an error while a charm upgrade was pending,
	// do the upgrade now. There are cases (lp:1895040) where the error was
	// caused because not all units were upgraded before relation-created
	// hooks were attempted for peer relations. Do this before the remote
	// state watcher is started: it will not trigger an upgrade until the
	// next applicationChanged event, so the unit could otherwise get stuck
	// in an error loop.

	local := rf.LocalState
	local.State = ex.State()

	opFunc := rf.NewUpgrade
	if !local.Installed && local.Hook != nil && local.Hook.Kind == hooks.Install && local.Step != operation.Done {
		// We must have failed to run the install hook and then restarted
		// (possibly in a sidecar charm), so we need to re-run the install op.
		opFunc = rf.NewInstall
	} else if !local.Installed || remote.CharmURL == "" {
		// If the unit isn't installed, there is no need to start an upgrade.
		return nil
	}

	_, err := jujucharm.ReadCharmDir(charmDir)
	haveCharmDir := err == nil
	if haveCharmDir {
		// If the unit is installed and already upgrading, and the charm
		// dir exists, there is no need to start an upgrade.
		if local.Kind == operation.Upgrade || (local.Hook != nil && local.Hook.Kind == hooks.UpgradeCharm) {
			return nil
		}
	}

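	// When a charm profile is required, defer the upgrade until the
	// revision parsed from the LXD profile name matches the revision of
	// the target charm URL.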
	if local.Started && remote.CharmProfileRequired {
		if remote.LXDProfileName == "" {
			return nil
		}
		rev, err := lxdprofile.ProfileRevision(remote.LXDProfileName)
		if err != nil {
			return errors.Trace(err)
		}
		curl, err := jujucharm.ParseURL(remote.CharmURL)
		if err != nil {
			return errors.Trace(err)
		}
		if rev != curl.Revision {
			logger.Tracef("Charm profile required: current revision %d does not match new revision %d", rev, curl.Revision)
			return nil
		}
	}

	sameCharm := local.CharmURL == remote.CharmURL
	if haveCharmDir && (!local.Started || sameCharm) {
		return nil
	}
	if !haveCharmDir {
		logger.Debugf("starting to re-download charm %v because charm dir %q has gone, which is usually caused by operator pod re-scheduling", remote.CharmURL, charmDir)
	}
	if !sameCharm {
		logger.Debugf("execute pending upgrade from %s to %s after uniter loop restart", local.CharmURL, remote.CharmURL)
	}

	op, err := opFunc(remote.CharmURL)
	if err != nil {
		return errors.Trace(err)
	}
	if err = ex.Run(op, nil); err != nil {
		return errors.Trace(err)
	}
	if local.Restart {
		return ErrRestart
	}
	return nil
}