github.com/mhilton/juju-juju@v0.0.0-20150901100907-a94dd2c73455/worker/uniter/modes.go (about) 1 // Copyright 2012-2015 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package uniter 5 6 import ( 7 "fmt" 8 "time" 9 10 "github.com/juju/errors" 11 "gopkg.in/juju/charm.v5" 12 "gopkg.in/juju/charm.v5/hooks" 13 "launchpad.net/tomb" 14 15 "github.com/juju/juju/apiserver/params" 16 "github.com/juju/juju/state/watcher" 17 "github.com/juju/juju/worker" 18 "github.com/juju/juju/worker/uniter/hook" 19 "github.com/juju/juju/worker/uniter/operation" 20 ) 21 22 // setAgentStatus sets the unit's status if it has changed since last time this method was called. 23 func setAgentStatus(u *Uniter, status params.Status, info string, data map[string]interface{}) error { 24 u.setStatusMutex.Lock() 25 defer u.setStatusMutex.Unlock() 26 if u.lastReportedStatus == status && u.lastReportedMessage == info { 27 return nil 28 } 29 u.lastReportedStatus = status 30 u.lastReportedMessage = info 31 logger.Debugf("[AGENT-STATUS] %s: %s", status, info) 32 return u.unit.SetAgentStatus(status, info, data) 33 } 34 35 // reportAgentError reports if there was an error performing an agent operation. 36 func reportAgentError(u *Uniter, userMessage string, err error) { 37 // If a non-nil error is reported (e.g. due to an operation failing), 38 // set the agent status to Failed. 39 if err == nil { 40 return 41 } 42 err2 := setAgentStatus(u, params.StatusFailed, userMessage, nil) 43 if err2 != nil { 44 logger.Errorf("updating agent status: %v", err2) 45 } 46 } 47 48 // Mode defines the signature of the functions that implement the possible 49 // states of a running Uniter. 50 type Mode func(u *Uniter) (Mode, error) 51 52 // ModeContinue determines what action to take based on persistent uniter state. 53 func ModeContinue(u *Uniter) (next Mode, err error) { 54 defer modeContext("ModeContinue", &err)() 55 opState := u.operationState() 56 57 // Resume interrupted deployment operations. 58 if opState.Kind == operation.Install { 59 logger.Infof("resuming charm install") 60 return ModeInstalling(opState.CharmURL) 61 } else if opState.Kind == operation.Upgrade { 62 logger.Infof("resuming charm upgrade") 63 return ModeUpgrading(opState.CharmURL), nil 64 } 65 66 // If we got this far, we should have an installed charm, 67 // so initialize the metrics timers according to what's 68 // currently deployed. 69 if err := u.initializeMetricsTimers(); err != nil { 70 return nil, errors.Trace(err) 71 } 72 73 // Check for any leadership change, and enact it if possible. 74 logger.Infof("checking leadership status") 75 // If we've already accepted leadership, we don't need to do it again. 76 canAcceptLeader := !opState.Leader 77 select { 78 // If the unit's shutting down, we shouldn't accept it. 79 case <-u.f.UnitDying(): 80 canAcceptLeader = false 81 default: 82 // If we're in an unexpected mode (eg pending hook) we shouldn't try either. 83 if opState.Kind != operation.Continue { 84 canAcceptLeader = false 85 } 86 } 87 88 // NOTE: the Wait() looks scary, but a ClaimLeadership ticket should always 89 // complete quickly; worst-case is API latency time, but it's designed that 90 // it should be vanishingly rare to hit that code path. 91 isLeader := u.leadershipTracker.ClaimLeader().Wait() 92 var creator creator 93 switch { 94 case isLeader && canAcceptLeader: 95 creator = newAcceptLeadershipOp() 96 case opState.Leader && !isLeader: 97 creator = newResignLeadershipOp() 98 } 99 if creator != nil { 100 return continueAfter(u, creator) 101 } 102 logger.Infof("leadership status is up-to-date") 103 104 switch opState.Kind { 105 case operation.RunAction: 106 // TODO(fwereade): we *should* handle interrupted actions, and make sure 107 // they're marked as failed, but that's not for now. 108 if opState.Hook != nil { 109 logger.Infof("found incomplete action %q; ignoring", opState.ActionId) 110 logger.Infof("recommitting prior %q hook", opState.Hook.Kind) 111 creator = newSkipHookOp(*opState.Hook) 112 } else { 113 logger.Infof("%q hook is nil", operation.RunAction) 114 } 115 case operation.RunHook: 116 switch opState.Step { 117 case operation.Pending: 118 logger.Infof("awaiting error resolution for %q hook", opState.Hook.Kind) 119 return ModeHookError, nil 120 case operation.Queued: 121 logger.Infof("found queued %q hook", opState.Hook.Kind) 122 // Ensure storage-attached hooks are run before install 123 // or upgrade hooks. 124 switch opState.Hook.Kind { 125 case hooks.UpgradeCharm: 126 // Force a refresh of all storage attachments, 127 // so we find out about new ones introduced 128 // by the charm upgrade. 129 if err := u.storage.Refresh(); err != nil { 130 return nil, errors.Trace(err) 131 } 132 fallthrough 133 case hooks.Install: 134 if err := waitStorage(u); err != nil { 135 return nil, errors.Trace(err) 136 } 137 } 138 creator = newRunHookOp(*opState.Hook) 139 case operation.Done: 140 logger.Infof("committing %q hook", opState.Hook.Kind) 141 creator = newSkipHookOp(*opState.Hook) 142 } 143 case operation.Continue: 144 if opState.Stopped { 145 logger.Infof("opState.Stopped == true; transition to ModeTerminating") 146 return ModeTerminating, nil 147 } 148 logger.Infof("no operations in progress; waiting for changes") 149 return ModeAbide, nil 150 default: 151 return nil, errors.Errorf("unknown operation kind %v", opState.Kind) 152 } 153 return continueAfter(u, creator) 154 } 155 156 // ModeInstalling is responsible for the initial charm deployment. If an install 157 // operation were to set an appropriate status, it shouldn't be necessary; but see 158 // ModeUpgrading for discussion relevant to both. 159 func ModeInstalling(curl *charm.URL) (next Mode, err error) { 160 name := fmt.Sprintf("ModeInstalling %s", curl) 161 return func(u *Uniter) (next Mode, err error) { 162 defer modeContext(name, &err)() 163 return continueAfter(u, newInstallOp(curl)) 164 }, nil 165 } 166 167 // ModeUpgrading is responsible for upgrading the charm. It shouldn't really 168 // need to be a mode at all -- it's just running a single operation -- but 169 // it's not safe to call it inside arbitrary other modes, because failing to 170 // pass through ModeContinue on the way out could cause a queued hook to be 171 // accidentally skipped. 172 func ModeUpgrading(curl *charm.URL) Mode { 173 name := fmt.Sprintf("ModeUpgrading %s", curl) 174 return func(u *Uniter) (next Mode, err error) { 175 defer modeContext(name, &err)() 176 return continueAfter(u, newUpgradeOp(curl)) 177 } 178 } 179 180 // ModeTerminating marks the unit dead and returns ErrTerminateAgent. 181 func ModeTerminating(u *Uniter) (next Mode, err error) { 182 defer modeContext("ModeTerminating", &err)() 183 w, err := u.unit.Watch() 184 if err != nil { 185 return nil, errors.Trace(err) 186 } 187 defer watcher.Stop(w, &u.tomb) 188 189 // Upon unit termination we attempt to send any leftover metrics one last time. If we fail, there is nothing 190 // else we can do but log the error. 191 sendErr := u.runOperation(newSendMetricsOp()) 192 if sendErr != nil { 193 logger.Warningf("failed to send metrics: %v", sendErr) 194 } 195 196 for { 197 select { 198 case <-u.tomb.Dying(): 199 return nil, tomb.ErrDying 200 case actionId := <-u.f.ActionEvents(): 201 creator := newActionOp(actionId) 202 if err := u.runOperation(creator); err != nil { 203 return nil, errors.Trace(err) 204 } 205 case _, ok := <-w.Changes(): 206 if !ok { 207 return nil, watcher.EnsureErr(w) 208 } 209 if err := u.unit.Refresh(); err != nil { 210 return nil, errors.Trace(err) 211 } 212 if hasSubs, err := u.unit.HasSubordinates(); err != nil { 213 return nil, errors.Trace(err) 214 } else if hasSubs { 215 continue 216 } 217 // The unit is known to be Dying; so if it didn't have subordinates 218 // just above, it can't acquire new ones before this call. 219 if err := u.unit.EnsureDead(); err != nil { 220 return nil, errors.Trace(err) 221 } 222 return nil, worker.ErrTerminateAgent 223 } 224 } 225 } 226 227 // ModeAbide is the Uniter's usual steady state. It watches for and responds to: 228 // * service configuration changes 229 // * charm upgrade requests 230 // * relation changes 231 // * unit death 232 // * acquisition or loss of service leadership 233 func ModeAbide(u *Uniter) (next Mode, err error) { 234 defer modeContext("ModeAbide", &err)() 235 opState := u.operationState() 236 if opState.Kind != operation.Continue { 237 return nil, errors.Errorf("insane uniter state: %#v", opState) 238 } 239 if err := u.deployer.Fix(); err != nil { 240 return nil, errors.Trace(err) 241 } 242 243 if !opState.Leader && !u.ranLeaderSettingsChanged { 244 creator := newSimpleRunHookOp(hook.LeaderSettingsChanged) 245 if err := u.runOperation(creator); err != nil { 246 return nil, errors.Trace(err) 247 } 248 } 249 250 if !u.ranConfigChanged { 251 return continueAfter(u, newSimpleRunHookOp(hooks.ConfigChanged)) 252 } 253 if !opState.Started { 254 return continueAfter(u, newSimpleRunHookOp(hooks.Start)) 255 } 256 u.f.WantUpgradeEvent(false) 257 u.relations.StartHooks() 258 defer func() { 259 if e := u.relations.StopHooks(); e != nil { 260 if err == nil { 261 err = e 262 } else { 263 logger.Errorf("error while stopping hooks: %v", e) 264 } 265 } 266 }() 267 268 select { 269 case <-u.f.UnitDying(): 270 return modeAbideDyingLoop(u) 271 default: 272 } 273 return modeAbideAliveLoop(u) 274 } 275 276 // idleWaitTime is the time after which, if there are no uniter events, 277 // the agent state becomes idle. 278 var idleWaitTime = 2 * time.Second 279 280 // modeAbideAliveLoop handles all state changes for ModeAbide when the unit 281 // is in an Alive state. 282 func modeAbideAliveLoop(u *Uniter) (Mode, error) { 283 var leaderElected, leaderDeposed <-chan struct{} 284 for { 285 // We expect one or none of these vars to be non-nil; and if none 286 // are, we set the one that should trigger when our leadership state 287 // differs from what we have recorded locally. 288 if leaderElected == nil && leaderDeposed == nil { 289 if u.operationState().Leader { 290 logger.Infof("waiting to lose leadership") 291 leaderDeposed = u.leadershipTracker.WaitMinion().Ready() 292 } else { 293 logger.Infof("waiting to gain leadership") 294 leaderElected = u.leadershipTracker.WaitLeader().Ready() 295 } 296 } 297 298 // collect-metrics hook 299 lastCollectMetrics := time.Unix(u.operationState().CollectMetricsTime, 0) 300 collectMetricsSignal := u.collectMetricsAt( 301 time.Now(), lastCollectMetrics, metricsPollInterval, 302 ) 303 304 lastSentMetrics := time.Unix(u.operationState().SendMetricsTime, 0) 305 sendMetricsSignal := u.sendMetricsAt( 306 time.Now(), lastSentMetrics, metricsSendInterval, 307 ) 308 309 // update-status hook 310 lastUpdateStatus := time.Unix(u.operationState().UpdateStatusTime, 0) 311 updateStatusSignal := u.updateStatusAt( 312 time.Now(), lastUpdateStatus, statusPollInterval, 313 ) 314 315 var creator creator 316 select { 317 case <-time.After(idleWaitTime): 318 if err := setAgentStatus(u, params.StatusIdle, "", nil); err != nil { 319 return nil, errors.Trace(err) 320 } 321 continue 322 case <-u.tomb.Dying(): 323 return nil, tomb.ErrDying 324 case <-u.f.UnitDying(): 325 return modeAbideDyingLoop(u) 326 case curl := <-u.f.UpgradeEvents(): 327 return ModeUpgrading(curl), nil 328 case ids := <-u.f.RelationsEvents(): 329 creator = newUpdateRelationsOp(ids) 330 case actionId := <-u.f.ActionEvents(): 331 creator = newActionOp(actionId) 332 case tags := <-u.f.StorageEvents(): 333 creator = newUpdateStorageOp(tags) 334 case <-u.f.ConfigEvents(): 335 creator = newSimpleRunHookOp(hooks.ConfigChanged) 336 case <-u.f.MeterStatusEvents(): 337 creator = newSimpleRunHookOp(hooks.MeterStatusChanged) 338 case <-collectMetricsSignal: 339 creator = newSimpleRunHookOp(hooks.CollectMetrics) 340 case <-sendMetricsSignal: 341 creator = newSendMetricsOp() 342 case <-updateStatusSignal: 343 creator = newSimpleRunHookOp(hooks.UpdateStatus) 344 case hookInfo := <-u.relations.Hooks(): 345 creator = newRunHookOp(hookInfo) 346 case hookInfo := <-u.storage.Hooks(): 347 creator = newRunHookOp(hookInfo) 348 case <-leaderElected: 349 // This operation queues a hook, better to let ModeContinue pick up 350 // after it than to duplicate queued-hook handling here. 351 return continueAfter(u, newAcceptLeadershipOp()) 352 case <-leaderDeposed: 353 leaderDeposed = nil 354 creator = newResignLeadershipOp() 355 case <-u.f.LeaderSettingsEvents(): 356 creator = newSimpleRunHookOp(hook.LeaderSettingsChanged) 357 } 358 if err := u.runOperation(creator); err != nil { 359 return nil, errors.Trace(err) 360 } 361 } 362 } 363 364 // modeAbideDyingLoop handles the proper termination of all relations in 365 // response to a Dying unit. 366 func modeAbideDyingLoop(u *Uniter) (next Mode, err error) { 367 if err := u.unit.Refresh(); err != nil { 368 return nil, errors.Trace(err) 369 } 370 if err = u.unit.DestroyAllSubordinates(); err != nil { 371 return nil, errors.Trace(err) 372 } 373 if err := u.relations.SetDying(); err != nil { 374 return nil, errors.Trace(err) 375 } 376 if u.operationState().Leader { 377 if err := u.runOperation(newResignLeadershipOp()); err != nil { 378 return nil, errors.Trace(err) 379 } 380 // TODO(fwereade): we ought to inform the tracker that we're shutting down 381 // (and no longer wish to continue renewing our lease) so that the tracker 382 // can then report minionhood at all times, and thus prevent the is-leader 383 // and leader-set hook tools from acting in a correct but misleading way 384 // (ie continuing to act as though leader after leader-deposed has run). 385 } 386 if err := u.storage.SetDying(); err != nil { 387 return nil, errors.Trace(err) 388 } 389 for { 390 if len(u.relations.GetInfo()) == 0 && u.storage.Empty() { 391 return continueAfter(u, newSimpleRunHookOp(hooks.Stop)) 392 } 393 var creator creator 394 select { 395 case <-u.tomb.Dying(): 396 return nil, tomb.ErrDying 397 case actionId := <-u.f.ActionEvents(): 398 creator = newActionOp(actionId) 399 case <-u.f.ConfigEvents(): 400 creator = newSimpleRunHookOp(hooks.ConfigChanged) 401 case <-u.f.LeaderSettingsEvents(): 402 creator = newSimpleRunHookOp(hook.LeaderSettingsChanged) 403 case hookInfo := <-u.relations.Hooks(): 404 creator = newRunHookOp(hookInfo) 405 case hookInfo := <-u.storage.Hooks(): 406 creator = newRunHookOp(hookInfo) 407 } 408 if err := u.runOperation(creator); err != nil { 409 return nil, errors.Trace(err) 410 } 411 } 412 } 413 414 // waitStorage waits until all storage attachments are provisioned 415 // and their hooks processed. 416 func waitStorage(u *Uniter) error { 417 if u.storage.Pending() == 0 { 418 return nil 419 } 420 logger.Infof("waiting for storage attachments") 421 for u.storage.Pending() > 0 { 422 var creator creator 423 select { 424 case <-u.tomb.Dying(): 425 return tomb.ErrDying 426 case <-u.f.UnitDying(): 427 // Unit is shutting down; no need to handle any 428 // more storage-attached hooks. We will process 429 // required storage-detaching hooks in ModeAbideDying. 430 return nil 431 case tags := <-u.f.StorageEvents(): 432 creator = newUpdateStorageOp(tags) 433 case hookInfo := <-u.storage.Hooks(): 434 creator = newRunHookOp(hookInfo) 435 } 436 if err := u.runOperation(creator); err != nil { 437 return errors.Trace(err) 438 } 439 } 440 logger.Infof("storage attachments ready") 441 return nil 442 } 443 444 // ModeHookError is responsible for watching and responding to: 445 // * user resolution of hook errors 446 // * forced charm upgrade requests 447 // * loss of service leadership 448 func ModeHookError(u *Uniter) (next Mode, err error) { 449 defer modeContext("ModeHookError", &err)() 450 opState := u.operationState() 451 if opState.Kind != operation.RunHook || opState.Step != operation.Pending { 452 return nil, errors.Errorf("insane uniter state: %#v", u.operationState()) 453 } 454 455 // Create error information for status. 456 hookInfo := *opState.Hook 457 hookName := string(hookInfo.Kind) 458 statusData := map[string]interface{}{} 459 if hookInfo.Kind.IsRelation() { 460 statusData["relation-id"] = hookInfo.RelationId 461 if hookInfo.RemoteUnit != "" { 462 statusData["remote-unit"] = hookInfo.RemoteUnit 463 } 464 relationName, err := u.relations.Name(hookInfo.RelationId) 465 if err != nil { 466 return nil, errors.Trace(err) 467 } 468 hookName = fmt.Sprintf("%s-%s", relationName, hookInfo.Kind) 469 } 470 statusData["hook"] = hookName 471 statusMessage := fmt.Sprintf("hook failed: %q", hookName) 472 473 // Run the select loop. 474 u.f.WantResolvedEvent() 475 u.f.WantUpgradeEvent(true) 476 var leaderDeposed <-chan struct{} 477 if opState.Leader { 478 leaderDeposed = u.leadershipTracker.WaitMinion().Ready() 479 } 480 for { 481 // The spec says we should set the workload status to Error, but that's crazy talk. 482 // It's the agent itself that should be in Error state. So we'll ensure the model is 483 // correct and translate before the user sees the data. 484 // ie a charm hook error results in agent error status, but is presented as a workload error. 485 if err = setAgentStatus(u, params.StatusError, statusMessage, statusData); err != nil { 486 return nil, errors.Trace(err) 487 } 488 select { 489 case <-u.tomb.Dying(): 490 return nil, tomb.ErrDying 491 case curl := <-u.f.UpgradeEvents(): 492 return ModeUpgrading(curl), nil 493 case rm := <-u.f.ResolvedEvents(): 494 var creator creator 495 switch rm { 496 case params.ResolvedRetryHooks: 497 creator = newRetryHookOp(hookInfo) 498 case params.ResolvedNoHooks: 499 creator = newSkipHookOp(hookInfo) 500 default: 501 return nil, errors.Errorf("unknown resolved mode %q", rm) 502 } 503 err := u.runOperation(creator) 504 if errors.Cause(err) == operation.ErrHookFailed { 505 continue 506 } else if err != nil { 507 return nil, errors.Trace(err) 508 } 509 return ModeContinue, nil 510 case actionId := <-u.f.ActionEvents(): 511 if err := u.runOperation(newActionOp(actionId)); err != nil { 512 return nil, errors.Trace(err) 513 } 514 case <-leaderDeposed: 515 // This should trigger at most once -- we can't reaccept leadership while 516 // in an error state. 517 leaderDeposed = nil 518 if err := u.runOperation(newResignLeadershipOp()); err != nil { 519 return nil, errors.Trace(err) 520 } 521 } 522 } 523 } 524 525 // ModeConflicted is responsible for watching and responding to: 526 // * user resolution of charm upgrade conflicts 527 // * forced charm upgrade requests 528 func ModeConflicted(curl *charm.URL) Mode { 529 return func(u *Uniter) (next Mode, err error) { 530 defer modeContext("ModeConflicted", &err)() 531 // TODO(mue) Add helpful data here too in later CL. 532 // The spec says we should set the workload status to Error, but that's crazy talk. 533 // It's the agent itself that should be in Error state. So we'll ensure the model is 534 // correct and translate before the user sees the data. 535 // ie a charm upgrade error results in agent error status, but is presented as a workload error. 536 if err := setAgentStatus(u, params.StatusError, "upgrade failed", nil); err != nil { 537 return nil, errors.Trace(err) 538 } 539 u.f.WantResolvedEvent() 540 u.f.WantUpgradeEvent(true) 541 var creator creator 542 select { 543 case <-u.tomb.Dying(): 544 return nil, tomb.ErrDying 545 case curl = <-u.f.UpgradeEvents(): 546 creator = newRevertUpgradeOp(curl) 547 case <-u.f.ResolvedEvents(): 548 creator = newResolvedUpgradeOp(curl) 549 } 550 return continueAfter(u, creator) 551 } 552 } 553 554 // modeContext returns a function that implements logging and common error 555 // manipulation for Mode funcs. 556 func modeContext(name string, err *error) func() { 557 logger.Infof("%s starting", name) 558 return func() { 559 logger.Infof("%s exiting", name) 560 *err = errors.Annotatef(*err, name) 561 } 562 } 563 564 // continueAfter is commonly used at the end of a Mode func to execute the 565 // operation returned by creator and return ModeContinue (or any error). 566 func continueAfter(u *Uniter, creator creator) (Mode, error) { 567 if err := u.runOperation(creator); err != nil { 568 return nil, errors.Trace(err) 569 } 570 return ModeContinue, nil 571 }