github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/worker/migrationminion/worker.go (about)

     1  // Copyright 2016 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package migrationminion
     5  
     6  import (
     7  	"time"
     8  
     9  	"github.com/juju/clock"
    10  	"github.com/juju/errors"
    11  	"github.com/juju/worker/v3"
    12  	"github.com/juju/worker/v3/catacomb"
    13  	"gopkg.in/retry.v1"
    14  
    15  	"github.com/juju/juju/agent"
    16  	"github.com/juju/juju/api"
    17  	"github.com/juju/juju/api/base"
    18  	apiservererrors "github.com/juju/juju/apiserver/errors"
    19  	"github.com/juju/juju/core/migration"
    20  	"github.com/juju/juju/core/network"
    21  	"github.com/juju/juju/core/watcher"
    22  	"github.com/juju/juju/rpc/params"
    23  	"github.com/juju/juju/worker/fortress"
    24  )
    25  
const (
	// maxRetries is the number of times we'll attempt validation
	// before giving up.
	maxRetries = 10

	// initialRetryDelay is the starting delay - this will be
	// increased exponentially, for up to maxRetries attempts.
	initialRetryDelay = 100 * time.Millisecond

	// retryBackoffFactor is how much longer we wait after a failing
	// retry. Retrying 10 times starting at 100ms and backing off 1.6x
	// gives us a total delay time of about 45s.
	// NOTE(review): a plain geometric sum of nine delays,
	// 0.1s * (1.6^9 - 1) / 0.6, is roughly 11s, and doVALIDATION also
	// enables jitter - confirm whether the 45s figure is still accurate.
	retryBackoffFactor = 1.6
)
    40  
// Facade exposes controller functionality to a Worker.
type Facade interface {
	// Watch returns a watcher whose Changes channel delivers the
	// migration status updates that the Worker reacts to.
	Watch() (watcher.MigrationStatusWatcher, error)

	// Report is called by the Worker with the id and phase taken from
	// the migration status it is handling, plus whether this agent
	// succeeded in that phase.
	Report(migrationId string, phase migration.Phase, success bool) error
}
    46  
// Config defines the operation of a Worker. Every field is required;
// see Validate.
type Config struct {
	// Agent supplies the current API connection details and is updated
	// to point at the target controller in the SUCCESS phase.
	Agent             agent.Agent
	// Facade is used to watch migration status and report progress.
	Facade            Facade
	// Guard is locked down while a migration phase is running and
	// unlocked otherwise.
	Guard             fortress.Guard
	// Clock drives the retry delays used during validation.
	Clock             clock.Clock
	// APIOpen opens the API connection to the target controller
	// during validation.
	APIOpen           func(*api.Info, api.DialOpts) (api.Connection, error)
	// ValidateMigration is called with the target controller
	// connection; a nil result means validation passed.
	ValidateMigration func(base.APICaller) error
	// Logger receives the worker's log output.
	Logger            Logger
}
    57  
    58  // Validate returns an error if config cannot drive a Worker.
    59  func (config Config) Validate() error {
    60  	if config.Agent == nil {
    61  		return errors.NotValidf("nil Agent")
    62  	}
    63  	if config.Facade == nil {
    64  		return errors.NotValidf("nil Facade")
    65  	}
    66  	if config.Guard == nil {
    67  		return errors.NotValidf("nil Guard")
    68  	}
    69  	if config.Clock == nil {
    70  		return errors.NotValidf("nil Clock")
    71  	}
    72  	if config.APIOpen == nil {
    73  		return errors.NotValidf("nil APIOpen")
    74  	}
    75  	if config.ValidateMigration == nil {
    76  		return errors.NotValidf("nil ValidateMigration")
    77  	}
    78  	if config.Logger == nil {
    79  		return errors.NotValidf("nil Logger")
    80  	}
    81  	return nil
    82  }
    83  
    84  // New returns a Worker backed by config, or an error.
    85  func New(config Config) (worker.Worker, error) {
    86  	if err := config.Validate(); err != nil {
    87  		return nil, errors.Trace(err)
    88  	}
    89  	w := &Worker{config: config}
    90  	err := catacomb.Invoke(catacomb.Plan{
    91  		Site: &w.catacomb,
    92  		Work: w.loop,
    93  	})
    94  	if err != nil {
    95  		return nil, errors.Trace(err)
    96  	}
    97  	return w, nil
    98  }
    99  
// Worker waits for a model migration to be active, then locks down the
// configured fortress and implements the migration.
type Worker struct {
	// catacomb runs the main loop and owns the migration status
	// watcher added to it by loop.
	catacomb catacomb.Catacomb
	// config holds the dependencies passed to New.
	config   Config
}
   106  
// Kill implements worker.Worker. It kills the worker's catacomb with a
// nil error.
func (w *Worker) Kill() {
	w.catacomb.Kill(nil)
}
   111  
// Wait implements worker.Worker. It returns whatever the worker's
// catacomb returns from its own Wait.
func (w *Worker) Wait() error {
	return w.catacomb.Wait()
}
   116  
   117  func (w *Worker) loop() error {
   118  	watch, err := w.config.Facade.Watch()
   119  	if err != nil {
   120  		return errors.Annotate(err, "setting up watcher")
   121  	}
   122  	if err := w.catacomb.Add(watch); err != nil {
   123  		return errors.Trace(err)
   124  	}
   125  
   126  	for {
   127  		select {
   128  		case <-w.catacomb.Dying():
   129  			return w.catacomb.ErrDying()
   130  		case status, ok := <-watch.Changes():
   131  			if !ok {
   132  				return errors.New("watcher channel closed")
   133  			}
   134  			if err := w.handle(status); err != nil {
   135  				return errors.Trace(err)
   136  			}
   137  		}
   138  	}
   139  }
   140  
   141  func (w *Worker) handle(status watcher.MigrationStatus) error {
   142  	w.config.Logger.Infof("migration phase is now: %s", status.Phase)
   143  
   144  	if !status.Phase.IsRunning() {
   145  		return w.config.Guard.Unlock()
   146  	}
   147  
   148  	// Ensure that all workers related to migration fortress have
   149  	// stopped and aren't allowed to restart.
   150  	err := w.config.Guard.Lockdown(w.catacomb.Dying())
   151  	if errors.Cause(err) == fortress.ErrAborted {
   152  		return w.catacomb.ErrDying()
   153  	} else if err != nil {
   154  		return errors.Trace(err)
   155  	}
   156  
   157  	switch status.Phase {
   158  	case migration.QUIESCE:
   159  		err = w.doQUIESCE(status)
   160  	case migration.VALIDATION:
   161  		err = w.doVALIDATION(status)
   162  	case migration.SUCCESS:
   163  		err = w.doSUCCESS(status)
   164  	default:
   165  		// The minion doesn't need to do anything for other
   166  		// migration phases.
   167  	}
   168  	return errors.Trace(err)
   169  }
   170  
// doQUIESCE handles the QUIESCE phase. handle has already locked down
// the guard before calling it, so all that is left is to report
// success for the phase.
func (w *Worker) doQUIESCE(status watcher.MigrationStatus) error {
	// Report that the minion is ready and that all workers that
	// should be shut down have done so.
	return w.report(status, true)
}
   176  
   177  func (w *Worker) doVALIDATION(status watcher.MigrationStatus) error {
   178  	attempt := retry.StartWithCancel(
   179  		retry.LimitCount(maxRetries, retry.Exponential{
   180  			Initial: initialRetryDelay,
   181  			Factor:  retryBackoffFactor,
   182  			Jitter:  true,
   183  		}),
   184  		w.config.Clock,
   185  		w.catacomb.Dying(),
   186  	)
   187  	var err error
   188  	for attempt.Next() {
   189  		err = w.validate(status)
   190  		if err == nil {
   191  			break
   192  		}
   193  		if attempt.More() {
   194  			w.config.Logger.Debugf("validation failed (retrying): %v", err)
   195  		}
   196  	}
   197  	if errors.Is(err, apiservererrors.ErrTryAgain) || params.IsCodeTryAgain(err) {
   198  		// We treat TryAgainError as a retriable error,
   199  		// so ingore it and don't report to the migration master.
   200  		w.config.Logger.Errorf("validation failed due to rate limit reached: %v", err)
   201  		return nil
   202  	}
   203  	if err != nil {
   204  		// Don't return this error just log it and report to the
   205  		// migrationmaster that things didn't work out.
   206  		w.config.Logger.Errorf("validation failed: %v", err)
   207  	}
   208  	return w.report(status, err == nil)
   209  }
   210  
   211  func (w *Worker) validate(status watcher.MigrationStatus) error {
   212  	agentConf := w.config.Agent.CurrentConfig()
   213  	apiInfo, ok := agentConf.APIInfo()
   214  	if !ok {
   215  		return errors.New("no API connection details")
   216  	}
   217  	apiInfo.Addrs = status.TargetAPIAddrs
   218  	apiInfo.CACert = status.TargetCACert
   219  	// Application agents (k8s) use old password.
   220  	if apiInfo.Password == "" {
   221  		apiInfo.Password = agentConf.OldPassword()
   222  	}
   223  
   224  	// Use zero DialOpts (no retries) because the worker must stay
   225  	// responsive to Kill requests. We don't want it to be blocked by
   226  	// a long set of retry attempts.
   227  	conn, err := w.config.APIOpen(apiInfo, api.DialOpts{})
   228  	if err != nil {
   229  		return errors.Annotate(err, "failed to open API to target controller")
   230  	}
   231  	defer func() { _ = conn.Close() }()
   232  
   233  	// Ask the agent to confirm that things look ok.
   234  	err = w.config.ValidateMigration(conn)
   235  	return errors.Trace(err)
   236  }
   237  
   238  func (w *Worker) doSUCCESS(status watcher.MigrationStatus) error {
   239  	hps, err := network.ParseProviderHostPorts(status.TargetAPIAddrs...)
   240  	if err != nil {
   241  		return errors.Annotate(err, "converting API addresses")
   242  	}
   243  
   244  	// Report first because the config update that's about to happen
   245  	// will cause the API connection to drop. The SUCCESS phase is the
   246  	// point of no return anyway.
   247  	if err := w.report(status, true); err != nil {
   248  		return errors.Trace(err)
   249  	}
   250  
   251  	err = w.config.Agent.ChangeConfig(func(conf agent.ConfigSetter) error {
   252  		err := conf.SetAPIHostPorts([]network.HostPorts{hps.HostPorts()})
   253  		if err != nil {
   254  			return errors.Trace(err)
   255  		}
   256  		conf.SetCACert(status.TargetCACert)
   257  		return nil
   258  	})
   259  	return errors.Annotate(err, "setting agent config")
   260  }
   261  
   262  func (w *Worker) report(status watcher.MigrationStatus, success bool) error {
   263  	w.config.Logger.Debugf("reporting back for phase %s: %v", status.Phase, success)
   264  	err := w.config.Facade.Report(status.MigrationId, status.Phase, success)
   265  	return errors.Annotate(err, "failed to report phase progress")
   266  }