github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/worker/undertaker/undertaker.go (about)

     1  // Copyright 2015-2016 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package undertaker
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"time"
    10  
    11  	"github.com/juju/clock"
    12  	"github.com/juju/errors"
    13  	"github.com/juju/worker/v3/catacomb"
    14  	"gopkg.in/retry.v1"
    15  
    16  	"github.com/juju/juju/core/life"
    17  	"github.com/juju/juju/core/status"
    18  	"github.com/juju/juju/core/watcher"
    19  	"github.com/juju/juju/environs"
    20  	environscontext "github.com/juju/juju/environs/context"
    21  	"github.com/juju/juju/rpc/params"
    22  	"github.com/juju/juju/worker"
    23  	"github.com/juju/juju/worker/common"
    24  )
    25  
    26  //go:generate go run go.uber.org/mock/mockgen -package undertaker_test -destination facade_mock_test.go github.com/juju/juju/worker/undertaker Facade
    27  //go:generate go run go.uber.org/mock/mockgen -package undertaker_test -destination credentialapi_mock_test.go github.com/juju/juju/worker/common CredentialAPI
    28  
    29  // Facade covers the parts of the api/undertaker.UndertakerClient that we
    30  // need for the worker. It's more than a little raw, but we'll survive.
    31  type Facade interface {
    32  	environs.EnvironConfigGetter
    33  	ModelInfo() (params.UndertakerModelInfoResult, error)
    34  	WatchModelResources() (watcher.NotifyWatcher, error)
    35  	WatchModel() (watcher.NotifyWatcher, error)
    36  	ProcessDyingModel() error
    37  	RemoveModel() error
    38  	SetStatus(status status.Status, message string, data map[string]interface{}) error
    39  }
    40  
    41  // Logger defines a way to report non-fatal errors.
    42  type Logger interface {
    43  	Errorf(string, ...interface{})
    44  	Infof(string, ...interface{})
    45  	Debugf(string, ...interface{})
    46  	Tracef(string, ...interface{})
    47  	Warningf(string, ...interface{})
    48  }
    49  
// Config holds the resources and configuration necessary to run an
// undertaker worker. Every field is required; see Validate.
type Config struct {
	// Facade is the API the worker uses to inspect, process and finally
	// remove the model.
	Facade                Facade
	// CredentialAPI is handed to common.NewCloudCallContextFunc to build the
	// provider call context used when destroying the environ.
	CredentialAPI         common.CredentialAPI
	// Logger receives progress and error messages.
	Logger                Logger
	// Clock drives the destroy timeouts and the environ-destroy retry loop.
	Clock                 clock.Clock
	// NewCloudDestroyerFunc opens the CloudDestroyer for the model from its
	// cloud spec and model config.
	NewCloudDestroyerFunc func(context.Context, environs.OpenParams) (environs.CloudDestroyer, error)
}
    59  
    60  // Validate returns an error if the config cannot be expected to drive
    61  // a functional undertaker worker.
    62  func (config Config) Validate() error {
    63  	if config.Facade == nil {
    64  		return errors.NotValidf("nil Facade")
    65  	}
    66  	if config.CredentialAPI == nil {
    67  		return errors.NotValidf("nil CredentialAPI")
    68  	}
    69  	if config.Logger == nil {
    70  		return errors.NotValidf("nil Logger")
    71  	}
    72  	if config.Clock == nil {
    73  		return errors.NotValidf("nil Clock")
    74  	}
    75  	if config.NewCloudDestroyerFunc == nil {
    76  		return errors.NotValidf("nil NewCloudDestroyerFunc")
    77  	}
    78  	return nil
    79  }
    80  
    81  // NewUndertaker returns a worker which processes a dying model.
    82  func NewUndertaker(config Config) (*Undertaker, error) {
    83  	if err := config.Validate(); err != nil {
    84  		return nil, errors.Trace(err)
    85  	}
    86  
    87  	u := &Undertaker{
    88  		config: config,
    89  	}
    90  	err := catacomb.Invoke(catacomb.Plan{
    91  		Site: &u.catacomb,
    92  		Work: u.run,
    93  	})
    94  	if err != nil {
    95  		return nil, errors.Trace(err)
    96  	}
    97  	return u, nil
    98  }
    99  
// Undertaker is the worker that drives a dying model through resource
// cleanup, cloud environment teardown and final removal. Construct it with
// NewUndertaker.
type Undertaker struct {
	// catacomb manages the worker's lifetime and the watchers it starts.
	catacomb catacomb.Catacomb
	// config is the validated configuration supplied to NewUndertaker.
	config   Config
}
   104  
// Kill is part of the worker.Worker interface. It asks the catacomb to stop
// without recording an error of its own.
func (u *Undertaker) Kill() {
	u.catacomb.Kill(nil)
}
   109  
// Wait is part of the worker.Worker interface. It returns whatever the
// catacomb's Wait returns.
func (u *Undertaker) Wait() error {
	return u.catacomb.Wait()
}
   114  
   115  func (u *Undertaker) run() (errOut error) {
   116  	defer func() {
   117  		if errors.Is(errOut, context.Canceled) ||
   118  			errors.Is(errOut, context.DeadlineExceeded) {
   119  			select {
   120  			case <-u.catacomb.Dying():
   121  				errOut = u.catacomb.ErrDying()
   122  			default:
   123  			}
   124  		}
   125  	}()
   126  
   127  	modelWatcher, err := u.config.Facade.WatchModel()
   128  	if errors.Is(err, errors.NotFound) {
   129  		// If model already gone, exit early.
   130  		return nil
   131  	} else if err != nil {
   132  		return errors.Trace(err)
   133  	}
   134  	err = u.catacomb.Add(modelWatcher)
   135  	if err != nil {
   136  		return err
   137  	}
   138  
   139  	select {
   140  	case <-modelWatcher.Changes():
   141  	case <-u.catacomb.Dying():
   142  		return u.catacomb.ErrDying()
   143  	}
   144  
   145  	result, err := u.config.Facade.ModelInfo()
   146  	if errors.Is(err, errors.NotFound) {
   147  		// If model already gone, exit early.
   148  		return nil
   149  	} else if err != nil {
   150  		return errors.Trace(err)
   151  	} else if result.Error != nil {
   152  		return errors.Trace(result.Error)
   153  	}
   154  	info := result.Result
   155  
   156  	ctx, cancel := context.WithCancel(u.catacomb.Context(context.Background()))
   157  	defer cancel()
   158  
   159  	// Watch for changes to model destroy values, if so, cancel the context
   160  	// and restart the worker.
   161  	err = u.catacomb.Add(worker.NewSimpleWorker(func(stopCh <-chan struct{}) error {
   162  		for {
   163  			select {
   164  			case <-stopCh:
   165  				return nil
   166  			case <-modelWatcher.Changes():
   167  				result, err := u.config.Facade.ModelInfo()
   168  				if errors.Is(err, errors.NotFound) || err != nil || result.Error != nil {
   169  					continue
   170  				}
   171  				updated := result.Result
   172  				changed := false
   173  				switch {
   174  				case info.DestroyTimeout == nil && updated.DestroyTimeout != nil:
   175  					changed = true
   176  				case info.DestroyTimeout != nil && updated.DestroyTimeout == nil:
   177  					changed = true
   178  				case info.DestroyTimeout != nil && updated.DestroyTimeout != nil && *info.DestroyTimeout != *updated.DestroyTimeout:
   179  					changed = true
   180  				case info.ForceDestroyed != updated.ForceDestroyed:
   181  					changed = true
   182  				}
   183  				if changed {
   184  					u.config.Logger.Infof("model destroy parameters changed: restarting undertaker worker")
   185  					return errors.Errorf("model destroy parameters changed")
   186  				}
   187  			}
   188  		}
   189  	}))
   190  	if err != nil {
   191  		return err
   192  	}
   193  
   194  	if info.Life == life.Alive {
   195  		return errors.Errorf("model still alive")
   196  	}
   197  
   198  	if info.ForceDestroyed && info.DestroyTimeout != nil {
   199  		u.config.Logger.Infof("force destroying model %q with timeout %v", info.Name, info.DestroyTimeout)
   200  		return u.forceDestroy(ctx, info)
   201  	} else if info.DestroyTimeout != nil {
   202  		u.config.Logger.Warningf("timeout ignored for graceful model destroy")
   203  	}
   204  	// Even if ForceDestroyed is true, if we don't have a timeout, we treat them the same
   205  	// as a non-force destroyed model.
   206  	u.config.Logger.Infof("destroying model %q", info.Name)
   207  	return u.cleanDestroy(ctx, info)
   208  }
   209  
   210  func (u *Undertaker) cleanDestroy(ctx context.Context, info params.UndertakerModelInfo) error {
   211  	select {
   212  	case <-ctx.Done():
   213  		return ctx.Err()
   214  	default:
   215  	}
   216  
   217  	if info.Life == life.Dying {
   218  		// TODO(axw) 2016-04-14 #1570285
   219  		// We should update status with information
   220  		// about the remaining resources here, and
   221  		// also make the worker responsible for
   222  		// checking the emptiness criteria before
   223  		// attempting to remove the model.
   224  		if err := u.setStatus(
   225  			status.Destroying,
   226  			"cleaning up cloud resources",
   227  		); err != nil {
   228  			return errors.Trace(err)
   229  		}
   230  		// Wait for the model to become empty.
   231  		if err := u.processDyingModel(ctx, info); err != nil {
   232  			u.config.Logger.Errorf("destroy model failed: %v", err)
   233  			return fmt.Errorf("proccesing model death: %w", err)
   234  		}
   235  	} else {
   236  		u.config.Logger.Debugf("skipping processDyingModel as model is already dead")
   237  	}
   238  
   239  	if info.IsSystem {
   240  		// Nothing to do. We don't destroy environ resources or
   241  		// delete model docs for a controller model, because we're
   242  		// running inside that controller and can't safely clean up
   243  		// our own infrastructure. (That'll be the client's job in
   244  		// the end, once we've reported that we've tidied up what we
   245  		// can, by returning nil here, indicating that we've set it
   246  		// to Dead -- implied by processDyingModel succeeding.)
   247  		return nil
   248  	}
   249  
   250  	select {
   251  	case <-ctx.Done():
   252  		return ctx.Err()
   253  	default:
   254  	}
   255  
   256  	retryStrategy := retry.LimitCount(1, retry.Regular{})
   257  	// Destroy environ resources.
   258  	if err := u.destroyEnviron(ctx, info, retryStrategy); err != nil {
   259  		u.config.Logger.Errorf("destroy environ failed: %v", err)
   260  		return fmt.Errorf("cannot destroy cloud resources: %w", err)
   261  	}
   262  
   263  	select {
   264  	case <-ctx.Done():
   265  		return ctx.Err()
   266  	default:
   267  	}
   268  
   269  	// Finally, the model is going to be dead, and be removed.
   270  	if err := u.config.Facade.RemoveModel(); err != nil {
   271  		u.config.Logger.Errorf("remove model failed: %v", err)
   272  		return errors.Annotate(err, "cannot remove model")
   273  	}
   274  	return nil
   275  }
   276  
   277  func (u *Undertaker) forceDestroy(ctx context.Context, info params.UndertakerModelInfo) error {
   278  	if !info.ForceDestroyed || info.DestroyTimeout == nil {
   279  		return errors.Errorf("invalid force destroy")
   280  	}
   281  
   282  	select {
   283  	case <-ctx.Done():
   284  		return ctx.Err()
   285  	default:
   286  	}
   287  
   288  	if *info.DestroyTimeout == 0 {
   289  		u.config.Logger.Infof("skipping waiting for model to cleanly shutdown since timeout is 0")
   290  	} else if info.Life == life.Dying {
   291  		// TODO(axw) 2016-04-14 #1570285
   292  		// We should update status with information
   293  		// about the remaining resources here, and
   294  		// also make the worker responsible for
   295  		// checking the emptiness criteria before
   296  		// attempting to remove the model.
   297  		if err := u.setStatus(
   298  			status.Destroying,
   299  			"cleaning up cloud resources",
   300  		); err != nil {
   301  			return errors.Trace(err)
   302  		}
   303  		proccessCtx, proccessCancel := context.WithCancel(ctx)
   304  		processTimer := u.config.Clock.AfterFunc(*info.DestroyTimeout, func() {
   305  			proccessCancel()
   306  		})
   307  		defer processTimer.Stop()
   308  		if err := u.processDyingModel(proccessCtx, info); err != nil && !errors.Is(err, context.Canceled) {
   309  			proccessCancel()
   310  			u.config.Logger.Errorf("destroy model failed: %v", err)
   311  			return fmt.Errorf("proccesing model death: %w", err)
   312  		}
   313  		proccessCancel()
   314  	} else {
   315  		u.config.Logger.Debugf("skipping processDyingModel as model is already dead")
   316  	}
   317  
   318  	if info.IsSystem {
   319  		// Nothing to do. We don't destroy environ resources or
   320  		// delete model docs for a controller model, because we're
   321  		// running inside that controller and can't safely clean up
   322  		// our own infrastructure. (That'll be the client's job in
   323  		// the end, once we've reported that we've tidied up what we
   324  		// can, by returning nil here, indicating that we've set it
   325  		// to Dead -- implied by processDyingModel succeeding.)
   326  		return nil
   327  	}
   328  
   329  	select {
   330  	case <-ctx.Done():
   331  		return ctx.Err()
   332  	default:
   333  	}
   334  
   335  	if *info.DestroyTimeout == 0 {
   336  		u.config.Logger.Infof("skipping tearing down cloud environment since timeout is 0")
   337  	} else {
   338  		destroyCtx, destroyCancel := context.WithCancel(ctx)
   339  		destroyTimer := u.config.Clock.AfterFunc(*info.DestroyTimeout, func() {
   340  			destroyCancel()
   341  		})
   342  		defer destroyTimer.Stop()
   343  		retryStrategy := retry.Exponential{
   344  			Initial:  1 * time.Second,
   345  			Factor:   1.5,
   346  			MaxDelay: 5 * time.Second,
   347  		}
   348  		if err := u.destroyEnviron(destroyCtx, info, retryStrategy); err != nil && !errors.Is(err, context.Canceled) {
   349  			destroyCancel()
   350  			u.config.Logger.Errorf("destroy environ failed: %v", err)
   351  			return fmt.Errorf("tearing down cloud environment: %w", err)
   352  		}
   353  		destroyCancel()
   354  	}
   355  
   356  	select {
   357  	case <-ctx.Done():
   358  		return ctx.Err()
   359  	default:
   360  	}
   361  
   362  	// Finally, the model is going to be dead, and be removed.
   363  	if err := u.config.Facade.RemoveModel(); err != nil {
   364  		u.config.Logger.Errorf("remove model failed: %v", err)
   365  		return errors.Annotate(err, "cannot remove model")
   366  	}
   367  	return nil
   368  }
   369  
   370  func (u *Undertaker) environ() (environs.CloudDestroyer, error) {
   371  	modelConfig, err := u.config.Facade.ModelConfig()
   372  	if err != nil {
   373  		return nil, errors.Annotate(err, "retrieving model config")
   374  	}
   375  
   376  	cloudSpec, err := u.config.Facade.CloudSpec()
   377  	if err != nil {
   378  		return nil, errors.Annotatef(err, "retrieving cloud spec for model %q (%s)", modelConfig.Name(), modelConfig.UUID())
   379  	}
   380  
   381  	environ, err := u.config.NewCloudDestroyerFunc(context.TODO(), environs.OpenParams{
   382  		Cloud:  cloudSpec,
   383  		Config: modelConfig,
   384  	})
   385  	if err != nil {
   386  		return nil, errors.Annotatef(err, "creating environ for model %q (%s)", modelConfig.Name(), modelConfig.UUID())
   387  	}
   388  	return environ, nil
   389  }
   390  
   391  func (u *Undertaker) invokeDestroyEnviron(callCtx environscontext.ProviderCallContext) error {
   392  	environ, err := u.environ()
   393  	if err != nil {
   394  		return err
   395  	}
   396  	return environ.Destroy(callCtx)
   397  }
   398  
   399  func (u *Undertaker) destroyEnviron(ctx context.Context, info params.UndertakerModelInfo, retryStrategy retry.Strategy) error {
   400  	u.config.Logger.Debugf("destroying cloud resources for model %v", info.Name)
   401  	// Now the model is known to be hosted and dying, we can tidy up any
   402  	// provider resources it might have used.
   403  	if err := u.setStatus(
   404  		status.Destroying, "tearing down cloud environment",
   405  	); err != nil {
   406  		return errors.Trace(err)
   407  	}
   408  
   409  	callCtx := common.NewCloudCallContextFunc(u.config.CredentialAPI)(ctx)
   410  	errChan := make(chan error)
   411  	done := make(chan struct{})
   412  	defer close(done)
   413  
   414  	r := retry.Start(retryStrategy, u.config.Clock)
   415  	attempt := 1
   416  	var destroyErr error = errors.ConstError("exhausted retries")
   417  out:
   418  	for r.Next() {
   419  		select {
   420  		case <-ctx.Done():
   421  			destroyErr = ctx.Err()
   422  			break out
   423  		default:
   424  		}
   425  		go func() {
   426  			u.config.Logger.Tracef("environ destroy enter")
   427  			defer u.config.Logger.Tracef("environ destroy leave")
   428  			err := u.invokeDestroyEnviron(callCtx)
   429  			select {
   430  			case errChan <- err:
   431  			case <-done:
   432  				if err != nil {
   433  					u.config.Logger.Errorf("attempt %d to destroy environ failed (will not retry):  %v", attempt, err)
   434  				}
   435  			}
   436  		}()
   437  		select {
   438  		case <-ctx.Done():
   439  			destroyErr = ctx.Err()
   440  			break out
   441  		case destroyErr = <-errChan:
   442  			if destroyErr == nil {
   443  				break out
   444  			}
   445  			u.config.Logger.Errorf("attempt %d to destroy environ failed (will retry):  %v", attempt, destroyErr)
   446  		}
   447  	}
   448  	if destroyErr == nil {
   449  		return nil
   450  	}
   451  	return fmt.Errorf("process destroy environ: %w", destroyErr)
   452  }
   453  
// setStatus records the given status and message against the model via the
// facade, with no additional status data.
func (u *Undertaker) setStatus(modelStatus status.Status, message string) error {
	return u.config.Facade.SetStatus(modelStatus, message, nil)
}
   457  
   458  func (u *Undertaker) processDyingModel(ctx context.Context, info params.UndertakerModelInfo) error {
   459  	watch, err := u.config.Facade.WatchModelResources()
   460  	if err != nil {
   461  		return errors.Trace(err)
   462  	}
   463  	if err := u.catacomb.Add(watch); err != nil {
   464  		return errors.Trace(err)
   465  	}
   466  	defer watch.Kill()
   467  	attempt := 1
   468  	for {
   469  		select {
   470  		case <-ctx.Done():
   471  			u.config.Logger.Debugf("processDyingModel timed out")
   472  			return errors.Annotatef(ctx.Err(), "process dying model")
   473  		case <-watch.Changes():
   474  			err := u.config.Facade.ProcessDyingModel()
   475  			if err == nil {
   476  				u.config.Logger.Debugf("processDyingModel done")
   477  				// ProcessDyingModel succeeded. We're free to
   478  				// destroy any remaining environ resources.
   479  				return nil
   480  			}
   481  			if !params.IsCodeModelNotEmpty(err) && !params.IsCodeHasHostedModels(err) {
   482  				return errors.Trace(err)
   483  			}
   484  			// Retry once there are changes to the model's resources.
   485  			_ = u.setStatus(
   486  				status.Destroying,
   487  				fmt.Sprintf("attempt %d to destroy model failed (will retry):  %v", attempt, err),
   488  			)
   489  
   490  			u.config.Logger.Debugf("attempt %d to destroy model failed (will retry):  %v", attempt, err)
   491  		}
   492  		attempt++
   493  	}
   494  }