github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/instancepoller/worker.go (about) 1 // Copyright 2013 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package instancepoller 5 6 import ( 7 stdcontext "context" 8 "time" 9 10 "github.com/juju/clock" 11 "github.com/juju/errors" 12 "github.com/juju/names/v5" 13 "github.com/juju/worker/v3" 14 "github.com/juju/worker/v3/catacomb" 15 16 "github.com/juju/juju/core/instance" 17 "github.com/juju/juju/core/life" 18 "github.com/juju/juju/core/network" 19 "github.com/juju/juju/core/status" 20 "github.com/juju/juju/core/watcher" 21 "github.com/juju/juju/environs" 22 "github.com/juju/juju/environs/context" 23 "github.com/juju/juju/environs/instances" 24 "github.com/juju/juju/rpc/params" 25 "github.com/juju/juju/worker/common" 26 ) 27 28 // ShortPoll and LongPoll hold the polling intervals for the instance 29 // updater. When a machine has no address or is not started, it will be 30 // polled at ShortPoll intervals until it does, exponentially backing off 31 // with an exponent of ShortPollBackoff until a maximum of ShortPollCap is 32 // reached. 33 // 34 // When a machine has an address and is started LongPoll will be used to 35 // check that the instance address or status has not changed. 36 var ( 37 ShortPoll = 3 * time.Second 38 ShortPollBackoff = 2.0 39 ShortPollCap = 1 * time.Minute 40 LongPoll = 15 * time.Minute 41 ) 42 43 // Environ specifies the provider-specific methods needed by the instance 44 // poller. 45 type Environ interface { 46 Instances(ctx context.ProviderCallContext, ids []instance.Id) ([]instances.Instance, error) 47 NetworkInterfaces(ctx context.ProviderCallContext, ids []instance.Id) ([]network.InterfaceInfos, error) 48 } 49 50 // Machine specifies an interface for machine instances processed by the 51 // instance poller. 52 type Machine interface { 53 Id() string 54 InstanceId() (instance.Id, error) 55 SetProviderNetworkConfig(network.InterfaceInfos) (network.ProviderAddresses, bool, error) 56 InstanceStatus() (params.StatusResult, error) 57 SetInstanceStatus(status.Status, string, map[string]interface{}) error 58 String() string 59 Refresh() error 60 Status() (params.StatusResult, error) 61 Life() life.Value 62 IsManual() (bool, error) 63 } 64 65 // FacadeAPI specifies the api-server methods needed by the instance 66 // poller. 67 type FacadeAPI interface { 68 WatchModelMachines() (watcher.StringsWatcher, error) 69 Machine(tag names.MachineTag) (Machine, error) 70 } 71 72 // Config encapsulates the configuration options for instantiating a new 73 // instance poller worker. 74 type Config struct { 75 Clock clock.Clock 76 Facade FacadeAPI 77 Environ Environ 78 Logger Logger 79 80 CredentialAPI common.CredentialAPI 81 } 82 83 // Validate checks whether the worker configuration settings are valid. 84 func (config Config) Validate() error { 85 if config.Clock == nil { 86 return errors.NotValidf("nil clock.Clock") 87 } 88 if config.Facade == nil { 89 return errors.NotValidf("nil Facade") 90 } 91 if config.Environ == nil { 92 return errors.NotValidf("nil Environ") 93 } 94 if config.Logger == nil { 95 return errors.NotValidf("nil Logger") 96 } 97 if config.CredentialAPI == nil { 98 return errors.NotValidf("nil CredentialAPI") 99 } 100 return nil 101 } 102 103 type pollGroupType uint8 104 105 const ( 106 shortPollGroup pollGroupType = iota 107 longPollGroup 108 invalidPollGroup 109 ) 110 111 type pollGroupEntry struct { 112 m Machine 113 tag names.MachineTag 114 instanceID instance.Id 115 116 shortPollInterval time.Duration 117 shortPollAt time.Time 118 } 119 120 func (e *pollGroupEntry) resetShortPollInterval(clk clock.Clock) { 121 e.shortPollInterval = ShortPoll 122 e.shortPollAt = clk.Now().Add(e.shortPollInterval) 123 } 124 125 func (e *pollGroupEntry) bumpShortPollInterval(clk clock.Clock) { 126 e.shortPollInterval = time.Duration(float64(e.shortPollInterval) * ShortPollBackoff) 127 if e.shortPollInterval > ShortPollCap { 128 e.shortPollInterval = ShortPollCap 129 } 130 e.shortPollAt = clk.Now().Add(e.shortPollInterval) 131 } 132 133 type updaterWorker struct { 134 config Config 135 catacomb catacomb.Catacomb 136 137 pollGroup [2]map[names.MachineTag]*pollGroupEntry 138 instanceIDToGroupEntry map[instance.Id]*pollGroupEntry 139 callContextFunc common.CloudCallContextFunc 140 141 // Hook function which tests can use to be notified when the worker 142 // has processed a full loop iteration. 143 loopCompletedHook func() 144 } 145 146 // NewWorker returns a worker that keeps track of 147 // the machines in the state and polls their instance 148 // addresses and status periodically to keep them up to date. 149 func NewWorker(config Config) (worker.Worker, error) { 150 if err := config.Validate(); err != nil { 151 return nil, errors.Trace(err) 152 } 153 u := &updaterWorker{ 154 config: config, 155 pollGroup: [2]map[names.MachineTag]*pollGroupEntry{ 156 make(map[names.MachineTag]*pollGroupEntry), 157 make(map[names.MachineTag]*pollGroupEntry), 158 }, 159 instanceIDToGroupEntry: make(map[instance.Id]*pollGroupEntry), 160 callContextFunc: common.NewCloudCallContextFunc(config.CredentialAPI), 161 } 162 err := catacomb.Invoke(catacomb.Plan{ 163 Site: &u.catacomb, 164 Work: u.loop, 165 }) 166 if err != nil { 167 return nil, errors.Trace(err) 168 } 169 return u, nil 170 } 171 172 // Kill is part of the worker.Worker interface. 173 func (u *updaterWorker) Kill() { 174 u.catacomb.Kill(nil) 175 } 176 177 // Wait is part of the worker.Worker interface. 178 func (u *updaterWorker) Wait() error { 179 return u.catacomb.Wait() 180 } 181 182 func (u *updaterWorker) loop() error { 183 watch, err := u.config.Facade.WatchModelMachines() 184 if err != nil { 185 return errors.Trace(err) 186 } 187 if err := u.catacomb.Add(watch); err != nil { 188 return errors.Trace(err) 189 } 190 191 shortPollTimer := u.config.Clock.NewTimer(ShortPoll) 192 longPollTimer := u.config.Clock.NewTimer(LongPoll) 193 defer func() { 194 _ = shortPollTimer.Stop() 195 _ = longPollTimer.Stop() 196 }() 197 198 for { 199 select { 200 case <-u.catacomb.Dying(): 201 return u.catacomb.ErrDying() 202 case ids, ok := <-watch.Changes(): 203 if !ok { 204 return errors.New("machines watcher closed") 205 } 206 207 for i := range ids { 208 tag := names.NewMachineTag(ids[i]) 209 if err := u.queueMachineForPolling(tag); err != nil { 210 return err 211 } 212 } 213 case <-shortPollTimer.Chan(): 214 if err := u.pollGroupMembers(shortPollGroup); err != nil { 215 return err 216 } 217 shortPollTimer.Reset(ShortPoll) 218 case <-longPollTimer.Chan(): 219 if err := u.pollGroupMembers(longPollGroup); err != nil { 220 return err 221 } 222 longPollTimer.Reset(LongPoll) 223 } 224 225 if u.loopCompletedHook != nil { 226 u.loopCompletedHook() 227 } 228 } 229 } 230 231 func (u *updaterWorker) queueMachineForPolling(tag names.MachineTag) error { 232 // If we are already polling this machine, check whether it is still alive 233 // and remove it from its poll group if it is now dead. 234 if entry, groupType := u.lookupPolledMachine(tag); entry != nil { 235 var isDead bool 236 if err := entry.m.Refresh(); err != nil { 237 // If the machine is not found, this probably means 238 // that it is dead and has been removed from the DB. 239 if !errors.IsNotFound(err) { 240 return errors.Trace(err) 241 } 242 isDead = true 243 } else if entry.m.Life() == life.Dead { 244 isDead = true 245 } 246 247 if isDead { 248 u.config.Logger.Debugf("removing dead machine %q (instance ID %q)", entry.m, entry.instanceID) 249 delete(u.pollGroup[groupType], tag) 250 delete(u.instanceIDToGroupEntry, entry.instanceID) 251 return nil 252 } 253 254 // Something has changed with the machine state. Reset short 255 // poll interval for the machine and move it to the short poll 256 // group (if not already there) so we immediately poll its 257 // status at the next interval. 258 u.moveEntryToPollGroup(shortPollGroup, entry) 259 if groupType == longPollGroup { 260 u.config.Logger.Debugf("moving machine %q (instance ID %q) to short poll group", entry.m, entry.instanceID) 261 } 262 return nil 263 } 264 265 // Get information about the machine 266 m, err := u.config.Facade.Machine(tag) 267 if err != nil { 268 return errors.Trace(err) 269 } 270 271 // We don't poll manual machines, instead we're setting the status to 'running' 272 // as we don't have any better information from the provider, see lp:1678981 273 isManual, err := m.IsManual() 274 if err != nil { 275 return errors.Trace(err) 276 } 277 278 if isManual { 279 machineStatus, err := m.InstanceStatus() 280 if err != nil { 281 return errors.Trace(err) 282 } 283 if status.Status(machineStatus.Status) != status.Running { 284 if err = m.SetInstanceStatus(status.Running, "Manually provisioned machine", nil); err != nil { 285 u.config.Logger.Errorf("cannot set instance status on %q: %v", m, err) 286 return err 287 } 288 } 289 return nil 290 } 291 292 // Add all new machines to the short poll group and arrange for them to 293 // be polled as soon as possible. 294 u.appendToShortPollGroup(tag, m) 295 return nil 296 } 297 298 func (u *updaterWorker) appendToShortPollGroup(tag names.MachineTag, m Machine) { 299 entry := &pollGroupEntry{ 300 tag: tag, 301 m: m, 302 } 303 entry.resetShortPollInterval(u.config.Clock) 304 u.pollGroup[shortPollGroup][tag] = entry 305 } 306 307 func (u *updaterWorker) moveEntryToPollGroup(toGroup pollGroupType, entry *pollGroupEntry) { 308 // Ensure that the entry is not present in the other group 309 delete(u.pollGroup[1-toGroup], entry.tag) 310 u.pollGroup[toGroup][entry.tag] = entry 311 312 // If moving to the short poll group reset the poll interval 313 if toGroup == shortPollGroup { 314 entry.resetShortPollInterval(u.config.Clock) 315 } 316 } 317 318 func (u *updaterWorker) lookupPolledMachine(tag names.MachineTag) (*pollGroupEntry, pollGroupType) { 319 for groupType, members := range u.pollGroup { 320 if found := members[tag]; found != nil { 321 return found, pollGroupType(groupType) 322 } 323 } 324 return nil, invalidPollGroup 325 } 326 327 func (u *updaterWorker) pollGroupMembers(groupType pollGroupType) error { 328 // Build a list of instance IDs to pass as a query to the provider. 329 var instList []instance.Id 330 now := u.config.Clock.Now() 331 for _, entry := range u.pollGroup[groupType] { 332 if groupType == shortPollGroup && now.Before(entry.shortPollAt) { 333 continue // we shouldn't poll this entry yet 334 } 335 336 if err := u.resolveInstanceID(entry); err != nil { 337 if params.IsCodeNotProvisioned(err) { 338 // machine not provisioned yet; bump its poll 339 // interval and re-try later (or as soon as we 340 // get a change for the machine) 341 entry.bumpShortPollInterval(u.config.Clock) 342 continue 343 } 344 return errors.Trace(err) 345 } 346 347 instList = append(instList, entry.instanceID) 348 } 349 350 if len(instList) == 0 { 351 return nil 352 } 353 354 ctx := stdcontext.Background() 355 infoList, err := u.config.Environ.Instances(u.callContextFunc(ctx), instList) 356 if err != nil { 357 switch errors.Cause(err) { 358 case environs.ErrPartialInstances: 359 // Proceed and process the ones we've found. 360 case environs.ErrNoInstances: 361 // If there were no instances recognised by the provider, we do not 362 // retrieve the network configuration, and will therefore have 363 // nothing to update. 364 // This can happen when machines do have instance IDs, but the 365 // instances themselves are shut down, such as we have seen for 366 // dying models. 367 // If we're in the short poll group bump all the poll intervals for 368 // entries with an instance ID. Any without an instance ID will 369 // already have had their intervals bumped above. 370 if groupType == shortPollGroup { 371 for _, id := range instList { 372 u.instanceIDToGroupEntry[id].bumpShortPollInterval(u.config.Clock) 373 } 374 } 375 376 return nil 377 default: 378 return errors.Trace(err) 379 } 380 } 381 382 netList, err := u.config.Environ.NetworkInterfaces(u.callContextFunc(ctx), instList) 383 if err != nil && !isPartialOrNoInstancesError(err) { 384 // NOTE(achilleasa): 2022-01-24: all existing providers (with the 385 // exception of "manual" which we don't care about in this context) 386 // implement the NetworkInterfaces method. 387 // 388 // This error is meant as a hint to folks working on new providers 389 // in the future to ensure that they implement this method. 390 if errors.IsNotSupported(errors.Cause(err)) { 391 return errors.Errorf("BUG: substrate does not implement required NetworkInterfaces method") 392 } 393 394 return errors.Annotate(err, "enumerating network interface list for instances") 395 } 396 397 for idx, info := range infoList { 398 var nics network.InterfaceInfos 399 if netList != nil { 400 nics = netList[idx] 401 } 402 403 if err := u.processOneInstance(instList[idx], info, nics, groupType); err != nil { 404 return errors.Trace(err) 405 } 406 } 407 408 return nil 409 } 410 411 func (u *updaterWorker) processOneInstance( 412 id instance.Id, info instances.Instance, nics network.InterfaceInfos, groupType pollGroupType, 413 ) error { 414 entry := u.instanceIDToGroupEntry[id] 415 416 // If we received ErrPartialInstances, and this ID is one of those not found, 417 // and we're in the short poll group, back off the poll interval. 418 // This will ensure that instances that have gone away do not cause excessive 419 // provider call volumes. 420 if info == nil { 421 u.config.Logger.Warningf("unable to retrieve instance information for instance: %q", id) 422 423 if groupType == shortPollGroup { 424 entry.bumpShortPollInterval(u.config.Clock) 425 } 426 return nil 427 } 428 429 providerStatus, providerAddrCount, err := u.processProviderInfo(entry, info, nics) 430 if err != nil { 431 return errors.Trace(err) 432 } 433 434 machineStatus, err := entry.m.Status() 435 if err != nil { 436 return errors.Trace(err) 437 } 438 439 u.maybeSwitchPollGroup(groupType, entry, providerStatus, status.Status(machineStatus.Status), providerAddrCount) 440 return nil 441 } 442 443 func (u *updaterWorker) resolveInstanceID(entry *pollGroupEntry) error { 444 if entry.instanceID != "" { 445 return nil // already resolved 446 } 447 448 instID, err := entry.m.InstanceId() 449 if err != nil { 450 return errors.Annotatef(err, "retrieving instance ID for machine %q", entry.m.Id()) 451 } 452 453 entry.instanceID = instID 454 u.instanceIDToGroupEntry[instID] = entry 455 return nil 456 } 457 458 // processProviderInfo updates an entry's machine status and set of provider 459 // addresses based on the information collected from the provider. It returns 460 // the *instance* status and the number of provider addresses currently 461 // known for the machine. 462 func (u *updaterWorker) processProviderInfo( 463 entry *pollGroupEntry, info instances.Instance, providerInterfaces network.InterfaceInfos, 464 ) (status.Status, int, error) { 465 curStatus, err := entry.m.InstanceStatus() 466 if err != nil { 467 // This should never occur since the machine is provisioned. If 468 // it does occur, report an unknown status to move the machine to 469 // the short poll group. 470 u.config.Logger.Warningf("cannot get current instance status for machine %v (instance ID %q): %v", 471 entry.m.Id(), entry.instanceID, err) 472 473 return status.Unknown, -1, nil 474 } 475 476 // Check for status changes 477 providerStatus := info.Status(u.callContextFunc(stdcontext.Background())) 478 curInstStatus := instance.Status{ 479 Status: status.Status(curStatus.Status), 480 Message: curStatus.Info, 481 } 482 483 if providerStatus != curInstStatus { 484 u.config.Logger.Infof("machine %q (instance ID %q) instance status changed from %q to %q", 485 entry.m.Id(), entry.instanceID, curInstStatus, providerStatus) 486 487 if err = entry.m.SetInstanceStatus(providerStatus.Status, providerStatus.Message, nil); err != nil { 488 u.config.Logger.Errorf("cannot set instance status on %q: %v", entry.m, err) 489 return status.Unknown, -1, errors.Trace(err) 490 } 491 492 // If the instance is now running, we should reset the poll 493 // interval to make sure we can capture machine status changes 494 // as early as possible. 495 if providerStatus.Status == status.Running { 496 entry.resetShortPollInterval(u.config.Clock) 497 } 498 } 499 500 // We don't care about dead machines; they will be cleaned up when we 501 // process the following machine watcher events. 502 if entry.m.Life() == life.Dead { 503 return status.Unknown, -1, nil 504 } 505 506 // Check whether the provider addresses for this machine need to be 507 // updated. 508 addrCount, err := u.syncProviderAddresses(entry, providerInterfaces) 509 if err != nil { 510 return status.Unknown, -1, err 511 } 512 513 return providerStatus.Status, addrCount, nil 514 } 515 516 // syncProviderAddresses updates the provider addresses for this entry's machine 517 // using either the provider sourced interface list. 518 // 519 // The call returns the count of provider addresses for the machine. 520 func (u *updaterWorker) syncProviderAddresses( 521 entry *pollGroupEntry, providerIfaceList network.InterfaceInfos, 522 ) (int, error) { 523 addrs, modified, err := entry.m.SetProviderNetworkConfig(providerIfaceList) 524 if err != nil { 525 return -1, errors.Trace(err) 526 } else if modified { 527 u.config.Logger.Infof("machine %q (instance ID %q) has new addresses: %v", 528 entry.m.Id(), entry.instanceID, addrs) 529 } 530 531 return len(addrs), nil 532 } 533 534 func (u *updaterWorker) maybeSwitchPollGroup( 535 curGroup pollGroupType, 536 entry *pollGroupEntry, 537 curProviderStatus, 538 curMachineStatus status.Status, 539 providerAddrCount int, 540 ) { 541 if curProviderStatus == status.Allocating || curProviderStatus == status.Pending { 542 // Keep the machine in the short poll group until it settles. 543 entry.bumpShortPollInterval(u.config.Clock) 544 return 545 } 546 547 // If the machine is currently in the long poll group and it has an 548 // unknown status or suddenly has no network addresses, move it back to 549 // the short poll group. 550 if curGroup == longPollGroup && (curProviderStatus == status.Unknown || providerAddrCount == 0) { 551 u.moveEntryToPollGroup(shortPollGroup, entry) 552 u.config.Logger.Debugf("moving machine %q (instance ID %q) back to short poll group", entry.m, entry.instanceID) 553 return 554 } 555 556 // The machine has started and we have at least one address; move to 557 // the long poll group 558 if providerAddrCount > 0 && curMachineStatus == status.Started { 559 u.moveEntryToPollGroup(longPollGroup, entry) 560 if curGroup != longPollGroup { 561 u.config.Logger.Debugf("moving machine %q (instance ID %q) to long poll group", entry.m, entry.instanceID) 562 } 563 return 564 } 565 566 // If we are in the short poll group apply exponential backoff to the 567 // poll frequency allow time for the machine to boot up. 568 if curGroup == shortPollGroup { 569 entry.bumpShortPollInterval(u.config.Clock) 570 } 571 } 572 573 func isPartialOrNoInstancesError(err error) bool { 574 cause := errors.Cause(err) 575 return cause == environs.ErrPartialInstances || cause == environs.ErrNoInstances 576 }