github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/modelcache/worker.go

// Copyright 2018 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package modelcache

import (
	"sync"
	"time"

	"github.com/juju/clock"
	"github.com/juju/errors"
	"github.com/juju/worker/v3"
	"github.com/juju/worker/v3/catacomb"
	"github.com/kr/pretty"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/juju/juju/core/cache"
	"github.com/juju/juju/core/lxdprofile"
	"github.com/juju/juju/core/multiwatcher"
	"github.com/juju/juju/core/settings"
	"github.com/juju/juju/core/status"
	"github.com/juju/juju/pubsub/controller"
	"github.com/juju/juju/state"
)

// Unlocker is used to indicate that the model cache is ready to be used.
type Unlocker interface {
	Unlock()
}

// Clock provides an interface for dealing with clocks.
type Clock interface {
	// After waits for the duration to elapse and then sends the
	// current time on the returned channel.
	After(time.Duration) <-chan time.Time
}

// Hub defines the methods of the apiserver centralhub that the model
// cache worker uses.
type Hub interface {
	Subscribe(topic string, handler interface{}) (func(), error)
}

// Config describes the necessary fields for NewWorker.
type Config struct {
	StatePool            *state.StatePool
	Hub                  Hub
	InitializedGate      Unlocker
	Logger               Logger
	PrometheusRegisterer prometheus.Registerer
	Cleanup              func()

	// Notify is used primarily for testing, and is passed through
	// to the cache.Controller. It is called every time the controller
	// processes an event.
	Notify func(interface{})

	// WatcherFactory supplies the watcher that supplies deltas from state.
	// We use a factory because we do not allow the worker loop to be crashed
	// by a watcher that stops in an error state.
	// Watcher acquisition may occur multiple times during a worker's life cycle.
	WatcherFactory func() multiwatcher.Watcher

	// WatcherRestartDelayMin is the minimum duration of the worker pause
	// before instantiating a new all-watcher when the previous one returns
	// an error.
	// This is intended to prevent log flooding in the case of unrecoverable
	// watcher errors.
	WatcherRestartDelayMin time.Duration

	// WatcherRestartDelayMax is the maximum duration of the worker pause
	// before instantiating a new all-watcher when the previous one returns
	// an error.
	WatcherRestartDelayMax time.Duration

	// Clock is used to enforce watcher restart delays.
	Clock Clock
}

// WithDefaultRestartStrategy returns a new config with production-use settings
// for the all-watcher restart strategy.
func (c Config) WithDefaultRestartStrategy() Config {
	c.WatcherRestartDelayMin = 10 * time.Millisecond
	c.WatcherRestartDelayMax = time.Second
	c.Clock = clock.WallClock
	return c
}
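
// Under the default strategy, successive watcher failures back off
// exponentially: 10ms, 20ms, 40ms, ... doubling after each failed restart
// and capped at WatcherRestartDelayMax (one second). The delay is reset
// to WatcherRestartDelayMin once a batch of deltas is processed
// successfully (see handleWatcherErr and the main loop below).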

// Validate ensures all the necessary values are specified.
func (c *Config) Validate() error {
	if c.StatePool == nil {
		return errors.NotValidf("missing state pool")
	}
	if c.Hub == nil {
		return errors.NotValidf("missing hub")
	}
	if c.InitializedGate == nil {
		return errors.NotValidf("missing initialized gate")
	}
	if c.Logger == nil {
		return errors.NotValidf("missing logger")
	}
	if c.WatcherFactory == nil {
		return errors.NotValidf("missing watcher factory")
	}
	if c.PrometheusRegisterer == nil {
		return errors.NotValidf("missing prometheus registerer")
	}
	if c.Cleanup == nil {
		return errors.NotValidf("missing cleanup func")
	}
	if c.WatcherRestartDelayMin <= 0 {
		return errors.NotValidf("non-positive watcher min restart delay")
	}
	if c.WatcherRestartDelayMax <= 0 {
		return errors.NotValidf("non-positive watcher max restart delay")
	}
	if c.Clock == nil {
		return errors.NotValidf("missing clock")
	}
	return nil
}

type cacheWorker struct {
	config              Config
	catacomb            catacomb.Catacomb
	controller          *cache.Controller
	changes             chan interface{}
	watcher             multiwatcher.Watcher
	watcherRestartDelay time.Duration
	mu                  sync.Mutex
}

// NewWorker creates a new cacheWorker and starts an all-model watcher.
func NewWorker(config Config) (worker.Worker, error) {
	if err := config.Validate(); err != nil {
		return nil, errors.Trace(err)
	}
	w := &cacheWorker{
		config:              config,
		changes:             make(chan interface{}),
		watcherRestartDelay: config.WatcherRestartDelayMin,
	}
	controller, err := cache.NewController(
		cache.ControllerConfig{
			Changes: w.changes,
			Notify:  config.Notify,
		})
	if err != nil {
		return nil, errors.Trace(err)
	}
	w.controller = controller
	if err := catacomb.Invoke(catacomb.Plan{
		Site: &w.catacomb,
		Work: w.loop,
		Init: []worker.Worker{w.controller},
	}); err != nil {
		return nil, errors.Trace(err)
	}
	return w, nil
}
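
// A minimal wiring sketch for NewWorker (hypothetical; pool, hub, gate,
// logger, registerer and newAllWatcher are stand-ins for real
// dependencies and are not defined in this package):
//
//	w, err := modelcache.NewWorker(modelcache.Config{
//		StatePool:            pool,
//		Hub:                  hub,
//		InitializedGate:      gate,
//		Logger:               logger,
//		PrometheusRegisterer: registerer,
//		Cleanup:              func() {},
//		WatcherFactory: func() multiwatcher.Watcher {
//			return newAllWatcher()
//		},
//	}.WithDefaultRestartStrategy())
//	if err != nil {
//		return errors.Trace(err)
//	}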

// Report returns information that is used in the dependency engine report.
func (c *cacheWorker) Report() map[string]interface{} {
	if c.controller == nil {
		return nil
	}
	return c.controller.Report()
}

func (c *cacheWorker) init() error {
	// Initialize the cache controller with controller config.
	systemState, err := c.config.StatePool.SystemState()
	if err != nil {
		return errors.Trace(err)
	}
	controllerConfig, err := systemState.ControllerConfig()
	if err != nil {
		return errors.Annotate(err, "unable to get controller config")
	}
	cc := cache.ControllerConfigChange{
		Config: controllerConfig,
	}
	select {
	case c.changes <- cc:
	case <-c.catacomb.Dying():
	}
	return nil
}

func (c *cacheWorker) loop() error {
	defer c.config.Cleanup()

	allWatcherStarts := prometheus.NewCounter(prometheus.CounterOpts{
		Namespace: "juju_worker_modelcache",
		Name:      "watcher_starts",
		Help:      "The number of times the all model watcher has been started.",
	})

	collector := cache.NewMetricsCollector(c.controller)
	_ = c.config.PrometheusRegisterer.Register(collector)
	_ = c.config.PrometheusRegisterer.Register(allWatcherStarts)
	defer c.config.PrometheusRegisterer.Unregister(allWatcherStarts)
	defer c.config.PrometheusRegisterer.Unregister(collector)

	// Ensure that we are listening for updates before we send the initial
	// controller config update. In reality, no config-changed events will
	// be published until the initialized gate is unlocked, as the API
	// server won't yet be running. In tests, however, the test waits for
	// the initial event and then publishes a change to ensure that the
	// change results in another event. Without subscribing to the event
	// first, there is a race between the test and the worker. Subscribing
	// first ensures the worker is ready to process any changes.
	unsubscribe, err := c.config.Hub.Subscribe(controller.ConfigChanged, c.onConfigChanged)
	if err != nil {
		c.config.Logger.Criticalf("programming error in subscribe function: %v", err)
		return errors.Trace(err)
	}
	defer unsubscribe()

	if err := c.init(); err != nil {
		return errors.Trace(err)
	}

	watcherChanges := make(chan []multiwatcher.Delta)
	// This worker needs to be robust with respect to multiwatcher errors:
	// if we get an unexpected error we should acquire a new all-watcher.
	// We don't want a weird error in the multiwatcher taking down the
	// apiserver, which is what would happen if this worker errored out.
	// The cache controller takes care of invalidation via its own
	// mark/sweep logic.
	var wg sync.WaitGroup
	wg.Add(1)
	defer func() {
		c.mu.Lock()
		// If we have been stopped before we have properly been started
		// there may not be a watcher yet.
		if c.watcher != nil {
			_ = c.watcher.Stop()
		}
		c.mu.Unlock()
		wg.Wait()
	}()

	go func() {
		// Ensure we don't leave the main loop until the goroutine is done.
		defer wg.Done()
		for {
			c.mu.Lock()
			select {
			case <-c.catacomb.Dying():
				c.mu.Unlock()
				return
			default:
				// Continue through.
			}

			// Each time the watcher is restarted,
			// mark the cache residents as stale.
			c.controller.Mark()

			allWatcherStarts.Inc()
			c.watcher = c.config.WatcherFactory()
			c.mu.Unlock()

			// processWatcher only returns nil if we are dying.
			// That condition will be handled at the top of the loop.
			if err := c.processWatcher(watcherChanges); err != nil {
				c.handleWatcherErr(err)
			}
		}
	}()

	first := true
	for {
		select {
		case <-c.catacomb.Dying():
			return c.catacomb.ErrDying()
		case deltas := <-watcherChanges:
			// Translate multi-watcher deltas into cache changes
			// and supply them via the changes channel.
			for _, d := range deltas {
				if logger := c.config.Logger; logger.IsTraceEnabled() {
					logger.Tracef(pretty.Sprint(d))
				}
				value := c.translate(d)
				if value != nil {
					select {
					case c.changes <- value:
					case <-c.catacomb.Dying():
						return c.catacomb.ErrDying()
					}
				}
			}

			// Evict any stale residents.
			c.controller.Sweep()

			// If we successfully processed a batch of deltas, then the last
			// watcher restart is considered a success and we can reset our
			// restart delay duration.
			c.watcherRestartDelay = c.config.WatcherRestartDelayMin

			if first {
				// Indicate that the cache is now ready to be used.
				c.config.InitializedGate.Unlock()
				first = false
			}
		}
	}
}
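
// onConfigChanged is the hub subscriber callback for controller config
// changes. It forwards the new config to the cache controller via the
// worker's change channel, giving up if the worker is dying.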

func (c *cacheWorker) onConfigChanged(topic string, data controller.ConfigChangedMessage, err error) {
	if err != nil {
		c.config.Logger.Criticalf("programming error in %s message data: %v", topic, err)
		return
	}

	cc := cache.ControllerConfigChange{
		Config: data.Config,
	}
	select {
	case c.changes <- cc:
	case <-c.catacomb.Dying():
	}
}

func (c *cacheWorker) processWatcher(watcherChanges chan<- []multiwatcher.Delta) error {
	for {
		deltas, err := c.watcher.Next()
		if err != nil {
			return errors.Trace(err)
		}

		select {
		case <-c.catacomb.Dying():
			return nil
		case watcherChanges <- deltas:
		}
	}
}

func (c *cacheWorker) handleWatcherErr(err error) {
	// If the backing watcher has stopped and the watcher's tomb
	// error is nil, this means a legitimate clean stop. If we have
	// been told to die, then we exit cleanly. Otherwise die with an
	// error and let the dependency engine handle starting us up
	// again.
	if multiwatcher.IsErrStopped(err) {
		select {
		case <-c.catacomb.Dying():
			return
		default:
			c.catacomb.Kill(err)
			return
		}
	}

	// For any other errors close the watcher, which will cause us
	// to create a new one after the restart delay.
	select {
	case <-c.catacomb.Dying():
		return
	case <-c.config.Clock.After(c.watcherRestartDelay):
		// The restart delay increases exponentially until we hit the max.
		c.watcherRestartDelay = c.watcherRestartDelay * 2
		if c.watcherRestartDelay > c.config.WatcherRestartDelayMax {
			c.watcherRestartDelay = c.config.WatcherRestartDelayMax
		}

		c.config.Logger.Errorf("watcher error: %v, getting new watcher", err)
		_ = c.watcher.Stop()
	}
}

func (c *cacheWorker) translate(d multiwatcher.Delta) interface{} {
	id := d.Entity.EntityID()
	switch id.Kind {
	case multiwatcher.ModelKind:
		return c.translateModel(d)
	case multiwatcher.ApplicationKind:
		return c.translateApplication(d)
	case multiwatcher.MachineKind:
		return c.translateMachine(d)
	case multiwatcher.UnitKind:
		return c.translateUnit(d)
	case multiwatcher.RelationKind:
		return c.translateRelation(d)
	case multiwatcher.CharmKind:
		return c.translateCharm(d)
	case multiwatcher.BranchKind:
		// Generation deltas are processed as cache branch changes,
		// as only "in-flight" branches should ever be in the cache.
		return c.translateBranch(d)
	default:
		return nil
	}
}
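
// Note that translate (and each translator below) returns nil both for
// unknown entity kinds and for deltas whose payload has an unexpected
// type. The main loop skips nil values rather than treating them as
// errors, so such deltas are dropped after an error log at most.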

func (c *cacheWorker) translateModel(d multiwatcher.Delta) interface{} {
	e := d.Entity

	if d.Removed {
		return cache.RemoveModel{
			ModelUUID: e.EntityID().ModelUUID,
		}
	}

	value, ok := e.(*multiwatcher.ModelInfo)
	if !ok {
		c.config.Logger.Errorf("unexpected type %T", e)
		return nil
	}

	return cache.ModelChange{
		ModelUUID:       value.ModelUUID,
		Name:            value.Name,
		Type:            value.Type,
		Life:            value.Life,
		Owner:           value.Owner,
		IsController:    value.IsController,
		Cloud:           value.Cloud,
		CloudRegion:     value.CloudRegion,
		CloudCredential: value.CloudCredential,
		Annotations:     value.Annotations,
		Config:          value.Config,
		Status:          coreStatus(value.Status),
		// TODO: constraints, sla
		UserPermissions: value.UserPermissions,
	}
}

func (c *cacheWorker) translateApplication(d multiwatcher.Delta) interface{} {
	e := d.Entity
	id := e.EntityID()

	if d.Removed {
		return cache.RemoveApplication{
			ModelUUID: id.ModelUUID,
			Name:      id.ID,
		}
	}

	value, ok := e.(*multiwatcher.ApplicationInfo)
	if !ok {
		c.config.Logger.Errorf("unexpected type %T", e)
		return nil
	}

	var podSpec *cache.PodSpec
	if spec := value.PodSpec; spec != nil {
		podSpec = &cache.PodSpec{
			Spec:    spec.Spec,
			Raw:     spec.Raw,
			Counter: spec.Counter,
		}
	}

	return cache.ApplicationChange{
		ModelUUID:       value.ModelUUID,
		Name:            value.Name,
		Exposed:         value.Exposed,
		CharmURL:        value.CharmURL,
		Life:            value.Life,
		MinUnits:        value.MinUnits,
		Constraints:     value.Constraints,
		Annotations:     value.Annotations,
		Config:          value.Config,
		Subordinate:     value.Subordinate,
		Status:          coreStatus(value.Status),
		OperatorStatus:  coreStatus(value.OperatorStatus),
		WorkloadVersion: value.WorkloadVersion,
		PodSpec:         podSpec,
	}
}

func (c *cacheWorker) translateMachine(d multiwatcher.Delta) interface{} {
	e := d.Entity
	id := e.EntityID()

	if d.Removed {
		return cache.RemoveMachine{
			ModelUUID: id.ModelUUID,
			Id:        id.ID,
		}
	}

	value, ok := e.(*multiwatcher.MachineInfo)
	if !ok {
		c.config.Logger.Errorf("unexpected type %T", e)
		return nil
	}

	return cache.MachineChange{
		ModelUUID:                value.ModelUUID,
		Id:                       value.ID,
		InstanceId:               value.InstanceID,
		AgentStatus:              coreStatus(value.AgentStatus),
		Life:                     value.Life,
		Annotations:              value.Annotations,
		Config:                   value.Config,
		Base:                     value.Base,
		ContainerType:            value.ContainerType,
		IsManual:                 value.IsManual,
		SupportedContainers:      value.SupportedContainers,
		SupportedContainersKnown: value.SupportedContainersKnown,
		HardwareCharacteristics:  value.HardwareCharacteristics,
		CharmProfiles:            value.CharmProfiles,
		Addresses:                value.Addresses,
		HasVote:                  value.HasVote,
		WantsVote:                value.WantsVote,
	}
}

func (c *cacheWorker) translateUnit(d multiwatcher.Delta) interface{} {
	e := d.Entity
	id := e.EntityID()

	if d.Removed {
		return cache.RemoveUnit{
			ModelUUID: id.ModelUUID,
			Name:      id.ID,
		}
	}

	value, ok := e.(*multiwatcher.UnitInfo)
	if !ok {
		c.config.Logger.Errorf("unexpected type %T", e)
		return nil
	}

	return cache.UnitChange{
		ModelUUID:                value.ModelUUID,
		Name:                     value.Name,
		Application:              value.Application,
		Base:                     value.Base,
		CharmURL:                 value.CharmURL,
		Annotations:              value.Annotations,
		Life:                     value.Life,
		PublicAddress:            value.PublicAddress,
		PrivateAddress:           value.PrivateAddress,
		MachineId:                value.MachineID,
		OpenPortRangesByEndpoint: value.OpenPortRangesByEndpoint,
		Principal:                value.Principal,
		Subordinate:              value.Subordinate,

		WorkloadStatus:  coreStatus(value.WorkloadStatus),
		AgentStatus:     coreStatus(value.AgentStatus),
		ContainerStatus: coreStatus(value.ContainerStatus),
	}
}

func (c *cacheWorker) translateRelation(d multiwatcher.Delta) interface{} {
	e := d.Entity
	id := e.EntityID()

	if d.Removed {
		return cache.RemoveRelation{
			ModelUUID: id.ModelUUID,
			Key:       id.ID,
		}
	}

	value, ok := e.(*multiwatcher.RelationInfo)
	if !ok {
		c.config.Logger.Errorf("unexpected type %T", e)
		return nil
	}

	endpoints := make([]cache.Endpoint, len(value.Endpoints))
	for i, ep := range value.Endpoints {
		endpoints[i] = cache.Endpoint{
			Application: ep.ApplicationName,
			Name:        ep.Relation.Name,
			Role:        ep.Relation.Role,
			Interface:   ep.Relation.Interface,
			Optional:    ep.Relation.Optional,
			Limit:       ep.Relation.Limit,
			Scope:       ep.Relation.Scope,
		}
	}

	return cache.RelationChange{
		ModelUUID: value.ModelUUID,
		Key:       value.Key,
		Endpoints: endpoints,
	}
}

func (c *cacheWorker) translateCharm(d multiwatcher.Delta) interface{} {
	e := d.Entity
	id := e.EntityID()

	if d.Removed {
		return cache.RemoveCharm{
			ModelUUID: id.ModelUUID,
			CharmURL:  id.ID,
		}
	}

	value, ok := e.(*multiwatcher.CharmInfo)
	if !ok {
		c.config.Logger.Errorf("unexpected type %T", e)
		return nil
	}

	return cache.CharmChange{
		ModelUUID:     value.ModelUUID,
		CharmURL:      value.CharmURL,
		LXDProfile:    coreLXDProfile(value.LXDProfile),
		DefaultConfig: value.DefaultConfig,
	}
}

func (c *cacheWorker) translateBranch(d multiwatcher.Delta) interface{} {
	e := d.Entity
	id := e.EntityID()

	if d.Removed {
		return cache.RemoveBranch{
			ModelUUID: id.ModelUUID,
			Id:        id.ID,
		}
	}

	value, ok := e.(*multiwatcher.BranchInfo)
	if !ok {
		c.config.Logger.Errorf("unexpected type %T", e)
		return nil
	}

	// Branches differ slightly from other cached entities.
	// If a branch has been committed or aborted, it will have a non-zero
	// value for completion, indicating that it is no longer active and should
	// be removed from the cache.
	if value.Completed > 0 {
		return cache.RemoveBranch{
			ModelUUID: id.ModelUUID,
			Id:        id.ID,
		}
	}

	return cache.BranchChange{
		ModelUUID:     value.ModelUUID,
		Name:          value.Name,
		Id:            value.ID,
		AssignedUnits: value.AssignedUnits,
		Config:        coreItemChanges(value.Config),
		Created:       value.Created,
		CreatedBy:     value.CreatedBy,
		Completed:     value.Completed,
		CompletedBy:   value.CompletedBy,
		GenerationId:  value.GenerationID,
	}
}

// Kill is part of the worker.Worker interface.
func (c *cacheWorker) Kill() {
	c.catacomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (c *cacheWorker) Wait() error {
	return c.catacomb.Wait()
}
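
// The helpers below convert multiwatcher payload types into their
// core-package equivalents for storage in the cache.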

// coreStatus converts a multiwatcher status payload into a core
// status.StatusInfo.
func coreStatus(info multiwatcher.StatusInfo) status.StatusInfo {
	return status.StatusInfo{
		Status:  info.Current,
		Message: info.Message,
		Data:    info.Data,
		Since:   info.Since,
	}
}

// coreLXDProfile converts a multiwatcher profile into a core
// lxdprofile.Profile, returning the zero value for a nil input.
func coreLXDProfile(delta *multiwatcher.Profile) lxdprofile.Profile {
	if delta == nil {
		return lxdprofile.Profile{}
	}
	return lxdprofile.Profile{
		Config:      delta.Config,
		Description: delta.Description,
		Devices:     delta.Devices,
	}
}

// coreItemChanges converts multiwatcher settings deltas into core
// settings.ItemChanges, keyed as per the input.
func coreItemChanges(delta map[string][]multiwatcher.ItemChange) map[string]settings.ItemChanges {
	if delta == nil {
		return nil
	}

	cfg := make(map[string]settings.ItemChanges, len(delta))
	for k, v := range delta {
		changes := make(settings.ItemChanges, len(v))
		for i, ch := range v {
			changes[i] = settings.ItemChange{
				Type:     ch.Type,
				Key:      ch.Key,
				NewValue: ch.NewValue,
				OldValue: ch.OldValue,
			}
		}
		cfg[k] = changes
	}
	return cfg
}