// Source: github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/worker/undertaker/undertaker.go

// Copyright 2015-2016 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package undertaker

import (
	"context"
	"fmt"
	"time"

	"github.com/juju/clock"
	"github.com/juju/errors"
	"github.com/juju/worker/v3/catacomb"
	"gopkg.in/retry.v1"

	"github.com/juju/juju/core/life"
	"github.com/juju/juju/core/status"
	"github.com/juju/juju/core/watcher"
	"github.com/juju/juju/environs"
	environscontext "github.com/juju/juju/environs/context"
	"github.com/juju/juju/rpc/params"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/worker/common"
)

//go:generate go run go.uber.org/mock/mockgen -package undertaker_test -destination facade_mock_test.go github.com/juju/juju/worker/undertaker Facade
//go:generate go run go.uber.org/mock/mockgen -package undertaker_test -destination credentialapi_mock_test.go github.com/juju/juju/worker/common CredentialAPI

// Facade covers the parts of the api/undertaker.UndertakerClient that we
// need for the worker. It's more than a little raw, but we'll survive.
type Facade interface {
	// EnvironConfigGetter supplies the ModelConfig and CloudSpec that the
	// worker reads when it opens a cloud destroyer for the model.
	environs.EnvironConfigGetter

	// ModelInfo reports the model's current state; the worker reads the
	// Name, Life, IsSystem, ForceDestroyed and DestroyTimeout fields of
	// the result.
	ModelInfo() (params.UndertakerModelInfoResult, error)

	// WatchModelResources returns a watcher; the worker makes one
	// ProcessDyingModel attempt per change it delivers.
	WatchModelResources() (watcher.NotifyWatcher, error)

	// WatchModel returns a watcher the worker uses both to wait for the
	// first model event and to notice later changes to the destroy
	// parameters.
	WatchModel() (watcher.NotifyWatcher, error)

	// ProcessDyingModel is retried by the worker for as long as it fails
	// with a "model not empty" or "has hosted models" error code; any
	// other error aborts the worker.
	// NOTE(review): success presumably transitions the model to Dead -
	// confirm against the facade implementation.
	ProcessDyingModel() error

	// RemoveModel is the final call the worker makes once cloud
	// resources have been dealt with.
	RemoveModel() error

	// SetStatus records the given status, message and optional data for
	// the model.
	SetStatus(status status.Status, message string, data map[string]interface{}) error
}

// Logger is the levelled logging interface the worker uses to report
// progress and non-fatal errors.
type Logger interface {
	Errorf(string, ...interface{})
	Infof(string, ...interface{})
	Debugf(string, ...interface{})
	Tracef(string, ...interface{})
	Warningf(string, ...interface{})
}

// Config holds the resources and configuration necessary to run an
// undertaker worker.
52 type Config struct { 53 Facade Facade 54 CredentialAPI common.CredentialAPI 55 Logger Logger 56 Clock clock.Clock 57 NewCloudDestroyerFunc func(context.Context, environs.OpenParams) (environs.CloudDestroyer, error) 58 } 59 60 // Validate returns an error if the config cannot be expected to drive 61 // a functional undertaker worker. 62 func (config Config) Validate() error { 63 if config.Facade == nil { 64 return errors.NotValidf("nil Facade") 65 } 66 if config.CredentialAPI == nil { 67 return errors.NotValidf("nil CredentialAPI") 68 } 69 if config.Logger == nil { 70 return errors.NotValidf("nil Logger") 71 } 72 if config.Clock == nil { 73 return errors.NotValidf("nil Clock") 74 } 75 if config.NewCloudDestroyerFunc == nil { 76 return errors.NotValidf("nil NewCloudDestroyerFunc") 77 } 78 return nil 79 } 80 81 // NewUndertaker returns a worker which processes a dying model. 82 func NewUndertaker(config Config) (*Undertaker, error) { 83 if err := config.Validate(); err != nil { 84 return nil, errors.Trace(err) 85 } 86 87 u := &Undertaker{ 88 config: config, 89 } 90 err := catacomb.Invoke(catacomb.Plan{ 91 Site: &u.catacomb, 92 Work: u.run, 93 }) 94 if err != nil { 95 return nil, errors.Trace(err) 96 } 97 return u, nil 98 } 99 100 type Undertaker struct { 101 catacomb catacomb.Catacomb 102 config Config 103 } 104 105 // Kill is part of the worker.Worker interface. 106 func (u *Undertaker) Kill() { 107 u.catacomb.Kill(nil) 108 } 109 110 // Wait is part of the worker.Worker interface. 
111 func (u *Undertaker) Wait() error { 112 return u.catacomb.Wait() 113 } 114 115 func (u *Undertaker) run() (errOut error) { 116 defer func() { 117 if errors.Is(errOut, context.Canceled) || 118 errors.Is(errOut, context.DeadlineExceeded) { 119 select { 120 case <-u.catacomb.Dying(): 121 errOut = u.catacomb.ErrDying() 122 default: 123 } 124 } 125 }() 126 127 modelWatcher, err := u.config.Facade.WatchModel() 128 if errors.Is(err, errors.NotFound) { 129 // If model already gone, exit early. 130 return nil 131 } else if err != nil { 132 return errors.Trace(err) 133 } 134 err = u.catacomb.Add(modelWatcher) 135 if err != nil { 136 return err 137 } 138 139 select { 140 case <-modelWatcher.Changes(): 141 case <-u.catacomb.Dying(): 142 return u.catacomb.ErrDying() 143 } 144 145 result, err := u.config.Facade.ModelInfo() 146 if errors.Is(err, errors.NotFound) { 147 // If model already gone, exit early. 148 return nil 149 } else if err != nil { 150 return errors.Trace(err) 151 } else if result.Error != nil { 152 return errors.Trace(result.Error) 153 } 154 info := result.Result 155 156 ctx, cancel := context.WithCancel(u.catacomb.Context(context.Background())) 157 defer cancel() 158 159 // Watch for changes to model destroy values, if so, cancel the context 160 // and restart the worker. 
161 err = u.catacomb.Add(worker.NewSimpleWorker(func(stopCh <-chan struct{}) error { 162 for { 163 select { 164 case <-stopCh: 165 return nil 166 case <-modelWatcher.Changes(): 167 result, err := u.config.Facade.ModelInfo() 168 if errors.Is(err, errors.NotFound) || err != nil || result.Error != nil { 169 continue 170 } 171 updated := result.Result 172 changed := false 173 switch { 174 case info.DestroyTimeout == nil && updated.DestroyTimeout != nil: 175 changed = true 176 case info.DestroyTimeout != nil && updated.DestroyTimeout == nil: 177 changed = true 178 case info.DestroyTimeout != nil && updated.DestroyTimeout != nil && *info.DestroyTimeout != *updated.DestroyTimeout: 179 changed = true 180 case info.ForceDestroyed != updated.ForceDestroyed: 181 changed = true 182 } 183 if changed { 184 u.config.Logger.Infof("model destroy parameters changed: restarting undertaker worker") 185 return errors.Errorf("model destroy parameters changed") 186 } 187 } 188 } 189 })) 190 if err != nil { 191 return err 192 } 193 194 if info.Life == life.Alive { 195 return errors.Errorf("model still alive") 196 } 197 198 if info.ForceDestroyed && info.DestroyTimeout != nil { 199 u.config.Logger.Infof("force destroying model %q with timeout %v", info.Name, info.DestroyTimeout) 200 return u.forceDestroy(ctx, info) 201 } else if info.DestroyTimeout != nil { 202 u.config.Logger.Warningf("timeout ignored for graceful model destroy") 203 } 204 // Even if ForceDestroyed is true, if we don't have a timeout, we treat them the same 205 // as a non-force destroyed model. 
206 u.config.Logger.Infof("destroying model %q", info.Name) 207 return u.cleanDestroy(ctx, info) 208 } 209 210 func (u *Undertaker) cleanDestroy(ctx context.Context, info params.UndertakerModelInfo) error { 211 select { 212 case <-ctx.Done(): 213 return ctx.Err() 214 default: 215 } 216 217 if info.Life == life.Dying { 218 // TODO(axw) 2016-04-14 #1570285 219 // We should update status with information 220 // about the remaining resources here, and 221 // also make the worker responsible for 222 // checking the emptiness criteria before 223 // attempting to remove the model. 224 if err := u.setStatus( 225 status.Destroying, 226 "cleaning up cloud resources", 227 ); err != nil { 228 return errors.Trace(err) 229 } 230 // Wait for the model to become empty. 231 if err := u.processDyingModel(ctx, info); err != nil { 232 u.config.Logger.Errorf("destroy model failed: %v", err) 233 return fmt.Errorf("proccesing model death: %w", err) 234 } 235 } else { 236 u.config.Logger.Debugf("skipping processDyingModel as model is already dead") 237 } 238 239 if info.IsSystem { 240 // Nothing to do. We don't destroy environ resources or 241 // delete model docs for a controller model, because we're 242 // running inside that controller and can't safely clean up 243 // our own infrastructure. (That'll be the client's job in 244 // the end, once we've reported that we've tidied up what we 245 // can, by returning nil here, indicating that we've set it 246 // to Dead -- implied by processDyingModel succeeding.) 247 return nil 248 } 249 250 select { 251 case <-ctx.Done(): 252 return ctx.Err() 253 default: 254 } 255 256 retryStrategy := retry.LimitCount(1, retry.Regular{}) 257 // Destroy environ resources. 
258 if err := u.destroyEnviron(ctx, info, retryStrategy); err != nil { 259 u.config.Logger.Errorf("destroy environ failed: %v", err) 260 return fmt.Errorf("cannot destroy cloud resources: %w", err) 261 } 262 263 select { 264 case <-ctx.Done(): 265 return ctx.Err() 266 default: 267 } 268 269 // Finally, the model is going to be dead, and be removed. 270 if err := u.config.Facade.RemoveModel(); err != nil { 271 u.config.Logger.Errorf("remove model failed: %v", err) 272 return errors.Annotate(err, "cannot remove model") 273 } 274 return nil 275 } 276 277 func (u *Undertaker) forceDestroy(ctx context.Context, info params.UndertakerModelInfo) error { 278 if !info.ForceDestroyed || info.DestroyTimeout == nil { 279 return errors.Errorf("invalid force destroy") 280 } 281 282 select { 283 case <-ctx.Done(): 284 return ctx.Err() 285 default: 286 } 287 288 if *info.DestroyTimeout == 0 { 289 u.config.Logger.Infof("skipping waiting for model to cleanly shutdown since timeout is 0") 290 } else if info.Life == life.Dying { 291 // TODO(axw) 2016-04-14 #1570285 292 // We should update status with information 293 // about the remaining resources here, and 294 // also make the worker responsible for 295 // checking the emptiness criteria before 296 // attempting to remove the model. 
297 if err := u.setStatus( 298 status.Destroying, 299 "cleaning up cloud resources", 300 ); err != nil { 301 return errors.Trace(err) 302 } 303 proccessCtx, proccessCancel := context.WithCancel(ctx) 304 processTimer := u.config.Clock.AfterFunc(*info.DestroyTimeout, func() { 305 proccessCancel() 306 }) 307 defer processTimer.Stop() 308 if err := u.processDyingModel(proccessCtx, info); err != nil && !errors.Is(err, context.Canceled) { 309 proccessCancel() 310 u.config.Logger.Errorf("destroy model failed: %v", err) 311 return fmt.Errorf("proccesing model death: %w", err) 312 } 313 proccessCancel() 314 } else { 315 u.config.Logger.Debugf("skipping processDyingModel as model is already dead") 316 } 317 318 if info.IsSystem { 319 // Nothing to do. We don't destroy environ resources or 320 // delete model docs for a controller model, because we're 321 // running inside that controller and can't safely clean up 322 // our own infrastructure. (That'll be the client's job in 323 // the end, once we've reported that we've tidied up what we 324 // can, by returning nil here, indicating that we've set it 325 // to Dead -- implied by processDyingModel succeeding.) 
326 return nil 327 } 328 329 select { 330 case <-ctx.Done(): 331 return ctx.Err() 332 default: 333 } 334 335 if *info.DestroyTimeout == 0 { 336 u.config.Logger.Infof("skipping tearing down cloud environment since timeout is 0") 337 } else { 338 destroyCtx, destroyCancel := context.WithCancel(ctx) 339 destroyTimer := u.config.Clock.AfterFunc(*info.DestroyTimeout, func() { 340 destroyCancel() 341 }) 342 defer destroyTimer.Stop() 343 retryStrategy := retry.Exponential{ 344 Initial: 1 * time.Second, 345 Factor: 1.5, 346 MaxDelay: 5 * time.Second, 347 } 348 if err := u.destroyEnviron(destroyCtx, info, retryStrategy); err != nil && !errors.Is(err, context.Canceled) { 349 destroyCancel() 350 u.config.Logger.Errorf("destroy environ failed: %v", err) 351 return fmt.Errorf("tearing down cloud environment: %w", err) 352 } 353 destroyCancel() 354 } 355 356 select { 357 case <-ctx.Done(): 358 return ctx.Err() 359 default: 360 } 361 362 // Finally, the model is going to be dead, and be removed. 363 if err := u.config.Facade.RemoveModel(); err != nil { 364 u.config.Logger.Errorf("remove model failed: %v", err) 365 return errors.Annotate(err, "cannot remove model") 366 } 367 return nil 368 } 369 370 func (u *Undertaker) environ() (environs.CloudDestroyer, error) { 371 modelConfig, err := u.config.Facade.ModelConfig() 372 if err != nil { 373 return nil, errors.Annotate(err, "retrieving model config") 374 } 375 376 cloudSpec, err := u.config.Facade.CloudSpec() 377 if err != nil { 378 return nil, errors.Annotatef(err, "retrieving cloud spec for model %q (%s)", modelConfig.Name(), modelConfig.UUID()) 379 } 380 381 environ, err := u.config.NewCloudDestroyerFunc(context.TODO(), environs.OpenParams{ 382 Cloud: cloudSpec, 383 Config: modelConfig, 384 }) 385 if err != nil { 386 return nil, errors.Annotatef(err, "creating environ for model %q (%s)", modelConfig.Name(), modelConfig.UUID()) 387 } 388 return environ, nil 389 } 390 391 func (u *Undertaker) invokeDestroyEnviron(callCtx 
environscontext.ProviderCallContext) error { 392 environ, err := u.environ() 393 if err != nil { 394 return err 395 } 396 return environ.Destroy(callCtx) 397 } 398 399 func (u *Undertaker) destroyEnviron(ctx context.Context, info params.UndertakerModelInfo, retryStrategy retry.Strategy) error { 400 u.config.Logger.Debugf("destroying cloud resources for model %v", info.Name) 401 // Now the model is known to be hosted and dying, we can tidy up any 402 // provider resources it might have used. 403 if err := u.setStatus( 404 status.Destroying, "tearing down cloud environment", 405 ); err != nil { 406 return errors.Trace(err) 407 } 408 409 callCtx := common.NewCloudCallContextFunc(u.config.CredentialAPI)(ctx) 410 errChan := make(chan error) 411 done := make(chan struct{}) 412 defer close(done) 413 414 r := retry.Start(retryStrategy, u.config.Clock) 415 attempt := 1 416 var destroyErr error = errors.ConstError("exhausted retries") 417 out: 418 for r.Next() { 419 select { 420 case <-ctx.Done(): 421 destroyErr = ctx.Err() 422 break out 423 default: 424 } 425 go func() { 426 u.config.Logger.Tracef("environ destroy enter") 427 defer u.config.Logger.Tracef("environ destroy leave") 428 err := u.invokeDestroyEnviron(callCtx) 429 select { 430 case errChan <- err: 431 case <-done: 432 if err != nil { 433 u.config.Logger.Errorf("attempt %d to destroy environ failed (will not retry): %v", attempt, err) 434 } 435 } 436 }() 437 select { 438 case <-ctx.Done(): 439 destroyErr = ctx.Err() 440 break out 441 case destroyErr = <-errChan: 442 if destroyErr == nil { 443 break out 444 } 445 u.config.Logger.Errorf("attempt %d to destroy environ failed (will retry): %v", attempt, destroyErr) 446 } 447 } 448 if destroyErr == nil { 449 return nil 450 } 451 return fmt.Errorf("process destroy environ: %w", destroyErr) 452 } 453 454 func (u *Undertaker) setStatus(modelStatus status.Status, message string) error { 455 return u.config.Facade.SetStatus(modelStatus, message, nil) 456 } 457 458 func (u 
*Undertaker) processDyingModel(ctx context.Context, info params.UndertakerModelInfo) error { 459 watch, err := u.config.Facade.WatchModelResources() 460 if err != nil { 461 return errors.Trace(err) 462 } 463 if err := u.catacomb.Add(watch); err != nil { 464 return errors.Trace(err) 465 } 466 defer watch.Kill() 467 attempt := 1 468 for { 469 select { 470 case <-ctx.Done(): 471 u.config.Logger.Debugf("processDyingModel timed out") 472 return errors.Annotatef(ctx.Err(), "process dying model") 473 case <-watch.Changes(): 474 err := u.config.Facade.ProcessDyingModel() 475 if err == nil { 476 u.config.Logger.Debugf("processDyingModel done") 477 // ProcessDyingModel succeeded. We're free to 478 // destroy any remaining environ resources. 479 return nil 480 } 481 if !params.IsCodeModelNotEmpty(err) && !params.IsCodeHasHostedModels(err) { 482 return errors.Trace(err) 483 } 484 // Retry once there are changes to the model's resources. 485 _ = u.setStatus( 486 status.Destroying, 487 fmt.Sprintf("attempt %d to destroy model failed (will retry): %v", attempt, err), 488 ) 489 490 u.config.Logger.Debugf("attempt %d to destroy model failed (will retry): %v", attempt, err) 491 } 492 attempt++ 493 } 494 }