// Source: github.com/mwhudson/juju@v0.0.0-20160512215208-90ff01f3497f/worker/uniter/uniter.go

// Copyright 2012-2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package uniter

import (
	"fmt"
	"os"
	"strings"
	"sync"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names"
	"github.com/juju/utils"
	"github.com/juju/utils/clock"
	"github.com/juju/utils/exec"
	"github.com/juju/utils/fslock"
	corecharm "gopkg.in/juju/charm.v6-unstable"

	"github.com/juju/juju/api/uniter"
	"github.com/juju/juju/apiserver/params"
	"github.com/juju/juju/core/leadership"
	"github.com/juju/juju/status"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/worker/catacomb"
	"github.com/juju/juju/worker/fortress"
	"github.com/juju/juju/worker/uniter/actions"
	"github.com/juju/juju/worker/uniter/charm"
	"github.com/juju/juju/worker/uniter/hook"
	uniterleadership "github.com/juju/juju/worker/uniter/leadership"
	"github.com/juju/juju/worker/uniter/operation"
	"github.com/juju/juju/worker/uniter/relation"
	"github.com/juju/juju/worker/uniter/remotestate"
	"github.com/juju/juju/worker/uniter/resolver"
	"github.com/juju/juju/worker/uniter/runcommands"
	"github.com/juju/juju/worker/uniter/runner"
	"github.com/juju/juju/worker/uniter/runner/context"
	"github.com/juju/juju/worker/uniter/runner/jujuc"
	"github.com/juju/juju/worker/uniter/storage"
	jujuos "github.com/juju/utils/os"
)

// logger is the package-level logger, registered under the name
// "juju.worker.uniter".
var logger = loggo.GetLogger("juju.worker.uniter")

// A UniterExecutionObserver gets the appropriate methods called when a hook
// is executed and either succeeds or fails. Missing hooks don't get reported
// in this way.
type UniterExecutionObserver interface {
	// HookCompleted receives the name of a hook that was executed; going by
	// the method name, this is the success notification.
	HookCompleted(hookName string)
	// HookFailed receives the name of a hook that was executed; going by
	// the method name, this is the failure notification.
	HookFailed(hookName string)
}

// Uniter implements the capabilities of the unit agent.
// It is not intended to implement the actual *behaviour* of the unit agent;
// that responsibility is delegated to Mode values, which are expected to
// react to events and direct the uniter's responses to them.
//
// NOTE(review): the loop method in this file drives behaviour through
// resolver.Loop and a uniter resolver rather than through "Mode" values, so
// the sentence above looks stale -- confirm and reword.
type Uniter struct {
	// catacomb manages the uniter's lifetime: NewUniter invokes loop on it,
	// and Kill and Wait delegate to it.
	catacomb catacomb.Catacomb
	// st is the uniter API facade supplied as UniterParams.UniterFacade.
	st *uniter.State
	// paths is built by NewPaths from UniterParams.DataDir and UnitTag.
	paths Paths
	// unit is fetched from st during init.
	unit *uniter.Unit
	// relations and storage are created during init and handed to the
	// relations and storage resolvers in loop.
	relations relation.Relations
	storage   *storage.Attachments
	// clock is passed to the hook retry backoff timer and to the context
	// factory.
	clock clock.Clock

	// Cache the last reported status information
	// so we don't make unnecessary api calls.
	setStatusMutex      sync.Mutex
	lastReportedStatus  status.Status
	lastReportedMessage string

	// deployer wraps the charm deployer created during init.
	deployer *deployerProxy
	// operationFactory and operationExecutor are created during init;
	// newOperationExecutor is the constructor used to build the latter.
	operationFactory     operation.Factory
	operationExecutor    operation.Executor
	newOperationExecutor NewExecutorFunc

	// leadershipTracker is passed to the remote state watcher and to the
	// context factory.
	leadershipTracker leadership.Tracker
	// charmDirGuard is passed to resolver.Loop.
	charmDirGuard fortress.Guard

	// hookLock is the machine-level execution lock (UniterParams.MachineLock)
	// taken by acquireExecutionLock.
	hookLock *fslock.Lock

	// TODO(axw) move the runListener and run-command code outside of the
	// uniter, and introduce a separate worker. Each worker would feed
	// operations to a single, synchronized runner to execute.
	runListener    *RunListener
	commands       runcommands.Commands
	commandChannel chan string

	// The execution observer is only used in tests at this stage. Should this
	// need to be extended, perhaps a list of observers would be needed.
	observer UniterExecutionObserver

	// updateStatusAt defines a function that will be used to generate signals for
	// the update-status hook
	updateStatusAt func() <-chan time.Time

	// hookRetryStrategy represents configuration for hook retries
	hookRetryStrategy params.RetryStrategy

	// downloader is the downloader that should be used to get the charm
	// archive.
	downloader charm.Downloader
}

// UniterParams hold all the necessary parameters for a new Uniter.
type UniterParams struct {
	// UniterFacade is the API facade the uniter talks to.
	UniterFacade *uniter.State
	// UnitTag identifies the unit this uniter runs on behalf of.
	UnitTag names.UnitTag
	// LeadershipTracker is handed to the remote state watcher and the
	// hook context factory.
	LeadershipTracker leadership.Tracker
	// DataDir is combined with UnitTag by NewPaths to derive the uniter's
	// filesystem paths.
	DataDir string
	// Downloader is used to fetch charm archives.
	Downloader charm.Downloader
	// MachineLock is the machine-level execution lock.
	MachineLock *fslock.Lock
	// CharmDirGuard is passed to the resolver loop.
	CharmDirGuard fortress.Guard
	// UpdateStatusSignal generates the signals for the update-status hook.
	UpdateStatusSignal func() <-chan time.Time
	// HookRetryStrategy configures whether and how failed hooks are retried.
	HookRetryStrategy params.RetryStrategy
	// NewOperationExecutor constructs the operation executor during init.
	NewOperationExecutor NewExecutorFunc
	// Clock is used by the hook retry timer and the context factory.
	Clock clock.Clock
	// TODO (mattyw, wallyworld, fwereade) Having the observer here makes this approach a bit more legitimate, but it isn't.
	// the observer is only a stop gap to be used in tests. A better approach would be to have the uniter tests start hooks
	// that write to files, and have the tests watch the output to know that hooks have finished.
	Observer UniterExecutionObserver
}

// NewExecutorFunc constructs the operation.Executor used by a Uniter. As
// called from Uniter.init, the arguments are, in order: the path of the
// operations state file, a func reporting the service's charm URL, and a func
// that acquires the execution lock with the given message and returns the
// func that releases it.
type NewExecutorFunc func(string, func() (*corecharm.URL, error), func(string) (func() error, error)) (operation.Executor, error)

// NewUniter creates a new Uniter which will install, run, and upgrade
// a charm on behalf of the unit identified by uniterParams.UnitTag, by
// executing hooks and operations provoked by changes observed through
// uniterParams.UniterFacade.
131 func NewUniter(uniterParams *UniterParams) (*Uniter, error) { 132 u := &Uniter{ 133 st: uniterParams.UniterFacade, 134 paths: NewPaths(uniterParams.DataDir, uniterParams.UnitTag), 135 hookLock: uniterParams.MachineLock, 136 leadershipTracker: uniterParams.LeadershipTracker, 137 charmDirGuard: uniterParams.CharmDirGuard, 138 updateStatusAt: uniterParams.UpdateStatusSignal, 139 hookRetryStrategy: uniterParams.HookRetryStrategy, 140 newOperationExecutor: uniterParams.NewOperationExecutor, 141 observer: uniterParams.Observer, 142 clock: uniterParams.Clock, 143 downloader: uniterParams.Downloader, 144 } 145 err := catacomb.Invoke(catacomb.Plan{ 146 Site: &u.catacomb, 147 Work: func() error { 148 return u.loop(uniterParams.UnitTag) 149 }, 150 }) 151 if err != nil { 152 return nil, errors.Trace(err) 153 } 154 return u, nil 155 } 156 157 func (u *Uniter) loop(unitTag names.UnitTag) (err error) { 158 if err := u.init(unitTag); err != nil { 159 if err == worker.ErrTerminateAgent { 160 return err 161 } 162 return fmt.Errorf("failed to initialize uniter for %q: %v", unitTag, err) 163 } 164 logger.Infof("unit %q started", u.unit) 165 166 // Install is a special case, as it must run before there 167 // is any remote state, and before the remote state watcher 168 // is started. 
169 var charmURL *corecharm.URL 170 var charmModifiedVersion int 171 opState := u.operationExecutor.State() 172 if opState.Kind == operation.Install { 173 logger.Infof("resuming charm install") 174 op, err := u.operationFactory.NewInstall(opState.CharmURL) 175 if err != nil { 176 return errors.Trace(err) 177 } 178 if err := u.operationExecutor.Run(op); err != nil { 179 return errors.Trace(err) 180 } 181 charmURL = opState.CharmURL 182 } else { 183 curl, err := u.unit.CharmURL() 184 if err != nil { 185 return errors.Trace(err) 186 } 187 charmURL = curl 188 svc, err := u.unit.Service() 189 if err != nil { 190 return errors.Trace(err) 191 } 192 charmModifiedVersion, err = svc.CharmModifiedVersion() 193 if err != nil { 194 return errors.Trace(err) 195 } 196 } 197 198 var ( 199 watcher *remotestate.RemoteStateWatcher 200 watcherMu sync.Mutex 201 ) 202 203 logger.Infof("hooks are retried %v", u.hookRetryStrategy.ShouldRetry) 204 retryHookChan := make(chan struct{}, 1) 205 retryHookTimer := utils.NewBackoffTimer(utils.BackoffTimerConfig{ 206 Min: u.hookRetryStrategy.MinRetryTime, 207 Max: u.hookRetryStrategy.MaxRetryTime, 208 Jitter: u.hookRetryStrategy.JitterRetryTime, 209 Factor: u.hookRetryStrategy.RetryTimeFactor, 210 Func: func() { 211 // Don't try to send on the channel if it's already full 212 // This can happen if the timer fires off before the event is consumed 213 // by the resolver loop 214 select { 215 case retryHookChan <- struct{}{}: 216 default: 217 } 218 }, 219 Clock: u.clock, 220 }) 221 defer func() { 222 // Whenever we exit the uniter we want to stop a potentially 223 // running timer so it doesn't trigger for nothing. 224 retryHookTimer.Reset() 225 }() 226 227 restartWatcher := func() error { 228 watcherMu.Lock() 229 defer watcherMu.Unlock() 230 231 if watcher != nil { 232 // watcher added to catacomb, will kill uniter if there's an error. 
233 worker.Stop(watcher) 234 } 235 var err error 236 watcher, err = remotestate.NewWatcher( 237 remotestate.WatcherConfig{ 238 State: remotestate.NewAPIState(u.st), 239 LeadershipTracker: u.leadershipTracker, 240 UnitTag: unitTag, 241 UpdateStatusChannel: u.updateStatusAt, 242 CommandChannel: u.commandChannel, 243 RetryHookChannel: retryHookChan, 244 }) 245 if err != nil { 246 return errors.Trace(err) 247 } 248 if err := u.catacomb.Add(watcher); err != nil { 249 return errors.Trace(err) 250 } 251 return nil 252 } 253 254 onIdle := func() error { 255 opState := u.operationExecutor.State() 256 if opState.Kind != operation.Continue { 257 // We should only set idle status if we're in 258 // the "Continue" state, which indicates that 259 // there is nothing to do and we're not in an 260 // error state. 261 return nil 262 } 263 return setAgentStatus(u, status.StatusIdle, "", nil) 264 } 265 266 clearResolved := func() error { 267 if err := u.unit.ClearResolved(); err != nil { 268 return errors.Trace(err) 269 } 270 watcher.ClearResolvedMode() 271 return nil 272 } 273 274 for { 275 if err = restartWatcher(); err != nil { 276 err = errors.Annotate(err, "(re)starting watcher") 277 break 278 } 279 280 uniterResolver := NewUniterResolver(ResolverConfig{ 281 ClearResolved: clearResolved, 282 ReportHookError: u.reportHookError, 283 FixDeployer: u.deployer.Fix, 284 ShouldRetryHooks: u.hookRetryStrategy.ShouldRetry, 285 StartRetryHookTimer: retryHookTimer.Start, 286 StopRetryHookTimer: retryHookTimer.Reset, 287 Actions: actions.NewResolver(), 288 Leadership: uniterleadership.NewResolver(), 289 Relations: relation.NewRelationsResolver(u.relations), 290 Storage: storage.NewResolver(u.storage), 291 Commands: runcommands.NewCommandsResolver( 292 u.commands, watcher.CommandCompleted, 293 ), 294 }) 295 296 // We should not do anything until there has been a change 297 // to the remote state. The watcher will trigger at least 298 // once initially. 
299 select { 300 case <-u.catacomb.Dying(): 301 return u.catacomb.ErrDying() 302 case <-watcher.RemoteStateChanged(): 303 } 304 305 localState := resolver.LocalState{ 306 CharmURL: charmURL, 307 CharmModifiedVersion: charmModifiedVersion, 308 } 309 for err == nil { 310 err = resolver.Loop(resolver.LoopConfig{ 311 Resolver: uniterResolver, 312 Watcher: watcher, 313 Executor: u.operationExecutor, 314 Factory: u.operationFactory, 315 Abort: u.catacomb.Dying(), 316 OnIdle: onIdle, 317 CharmDirGuard: u.charmDirGuard, 318 }, &localState) 319 switch cause := errors.Cause(err); cause { 320 case nil: 321 // Loop back around. 322 case resolver.ErrLoopAborted: 323 err = u.catacomb.ErrDying() 324 case operation.ErrNeedsReboot: 325 err = worker.ErrRebootMachine 326 case operation.ErrHookFailed: 327 // Loop back around. The resolver can tell that it is in 328 // an error state by inspecting the operation state. 329 err = nil 330 case resolver.ErrTerminate: 331 err = u.terminate() 332 case resolver.ErrRestart: 333 // make sure we update the two values used above in 334 // creating LocalState. 335 charmURL = localState.CharmURL 336 charmModifiedVersion = localState.CharmModifiedVersion 337 // leave err assigned, causing loop to break 338 default: 339 // We need to set conflicted from here, because error 340 // handling is outside of the resolver's control. 
341 if operation.IsDeployConflictError(cause) { 342 localState.Conflicted = true 343 err = setAgentStatus(u, status.StatusError, "upgrade failed", nil) 344 } else { 345 reportAgentError(u, "resolver loop error", err) 346 } 347 } 348 } 349 350 if errors.Cause(err) != resolver.ErrRestart { 351 break 352 } 353 } 354 355 logger.Infof("unit %q shutting down: %s", u.unit, err) 356 return err 357 } 358 359 func (u *Uniter) terminate() error { 360 unitWatcher, err := u.unit.Watch() 361 if err != nil { 362 return errors.Trace(err) 363 } 364 if err := u.catacomb.Add(unitWatcher); err != nil { 365 return errors.Trace(err) 366 } 367 for { 368 select { 369 case <-u.catacomb.Dying(): 370 return u.catacomb.ErrDying() 371 case _, ok := <-unitWatcher.Changes(): 372 if !ok { 373 return errors.New("unit watcher closed") 374 } 375 if err := u.unit.Refresh(); err != nil { 376 return errors.Trace(err) 377 } 378 if hasSubs, err := u.unit.HasSubordinates(); err != nil { 379 return errors.Trace(err) 380 } else if hasSubs { 381 continue 382 } 383 // The unit is known to be Dying; so if it didn't have subordinates 384 // just above, it can't acquire new ones before this call. 385 if err := u.unit.EnsureDead(); err != nil { 386 return errors.Trace(err) 387 } 388 return worker.ErrTerminateAgent 389 } 390 } 391 } 392 393 func (u *Uniter) setupLocks() (err error) { 394 if message := u.hookLock.Message(); u.hookLock.IsLocked() && message != "" { 395 // Look to see if it was us that held the lock before. If it was, we 396 // should be safe enough to break it, as it is likely that we died 397 // before unlocking, and have been restarted by the init system. 
398 parts := strings.SplitN(message, ":", 2) 399 if len(parts) > 1 && parts[0] == u.unit.Name() { 400 if err := u.hookLock.BreakLock(); err != nil { 401 return err 402 } 403 } 404 } 405 return nil 406 } 407 408 func (u *Uniter) init(unitTag names.UnitTag) (err error) { 409 u.unit, err = u.st.Unit(unitTag) 410 if err != nil { 411 return err 412 } 413 if u.unit.Life() == params.Dead { 414 // If we started up already dead, we should not progress further. If we 415 // become Dead immediately after starting up, we may well complete any 416 // operations in progress before detecting it; but that race is fundamental 417 // and inescapable, whereas this one is not. 418 return worker.ErrTerminateAgent 419 } 420 if err = u.setupLocks(); err != nil { 421 return err 422 } 423 if err := jujuc.EnsureSymlinks(u.paths.ToolsDir); err != nil { 424 return err 425 } 426 if err := os.MkdirAll(u.paths.State.RelationsDir, 0755); err != nil { 427 return errors.Trace(err) 428 } 429 relations, err := relation.NewRelations( 430 u.st, unitTag, u.paths.State.CharmDir, 431 u.paths.State.RelationsDir, u.catacomb.Dying(), 432 ) 433 if err != nil { 434 return errors.Annotatef(err, "cannot create relations") 435 } 436 u.relations = relations 437 storageAttachments, err := storage.NewAttachments( 438 u.st, unitTag, u.paths.State.StorageDir, u.catacomb.Dying(), 439 ) 440 if err != nil { 441 return errors.Annotatef(err, "cannot create storage hook source") 442 } 443 u.storage = storageAttachments 444 u.commands = runcommands.NewCommands() 445 u.commandChannel = make(chan string) 446 447 deployer, err := charm.NewDeployer( 448 u.paths.State.CharmDir, 449 u.paths.State.DeployerDir, 450 charm.NewBundlesDir(u.paths.State.BundlesDir, u.downloader), 451 ) 452 if err != nil { 453 return errors.Annotatef(err, "cannot create deployer") 454 } 455 u.deployer = &deployerProxy{deployer} 456 contextFactory, err := context.NewContextFactory( 457 u.st, unitTag, u.leadershipTracker, u.relations.GetInfo, u.storage, 
u.paths, u.clock, 458 ) 459 if err != nil { 460 return err 461 } 462 runnerFactory, err := runner.NewFactory( 463 u.st, u.paths, contextFactory, 464 ) 465 if err != nil { 466 return errors.Trace(err) 467 } 468 u.operationFactory = operation.NewFactory(operation.FactoryParams{ 469 Deployer: u.deployer, 470 RunnerFactory: runnerFactory, 471 Callbacks: &operationCallbacks{u}, 472 Abort: u.catacomb.Dying(), 473 MetricSpoolDir: u.paths.GetMetricsSpoolDir(), 474 }) 475 476 operationExecutor, err := u.newOperationExecutor(u.paths.State.OperationsFile, u.getServiceCharmURL, u.acquireExecutionLock) 477 if err != nil { 478 return errors.Trace(err) 479 } 480 u.operationExecutor = operationExecutor 481 482 logger.Debugf("starting juju-run listener on unix:%s", u.paths.Runtime.JujuRunSocket) 483 commandRunner, err := NewChannelCommandRunner(ChannelCommandRunnerConfig{ 484 Abort: u.catacomb.Dying(), 485 Commands: u.commands, 486 CommandChannel: u.commandChannel, 487 }) 488 if err != nil { 489 return errors.Annotate(err, "creating command runner") 490 } 491 u.runListener, err = NewRunListener(RunListenerConfig{ 492 SocketPath: u.paths.Runtime.JujuRunSocket, 493 CommandRunner: commandRunner, 494 }) 495 if err != nil { 496 return errors.Trace(err) 497 } 498 rlw := newRunListenerWrapper(u.runListener) 499 if err := u.catacomb.Add(rlw); err != nil { 500 return errors.Trace(err) 501 } 502 // The socket needs to have permissions 777 in order for other users to use it. 503 if jujuos.HostOS() != jujuos.Windows { 504 return os.Chmod(u.paths.Runtime.JujuRunSocket, 0777) 505 } 506 return nil 507 } 508 509 func (u *Uniter) Kill() { 510 u.catacomb.Kill(nil) 511 } 512 513 func (u *Uniter) Wait() error { 514 return u.catacomb.Wait() 515 } 516 517 func (u *Uniter) getServiceCharmURL() (*corecharm.URL, error) { 518 // TODO(fwereade): pretty sure there's no reason to make 2 API calls here. 
519 service, err := u.st.Service(u.unit.ServiceTag()) 520 if err != nil { 521 return nil, err 522 } 523 charmURL, _, err := service.CharmURL() 524 return charmURL, err 525 } 526 527 // RunCommands executes the supplied commands in a hook context. 528 func (u *Uniter) RunCommands(args RunCommandsArgs) (results *exec.ExecResponse, err error) { 529 // TODO(axw) drop this when we move the run-listener to an independent 530 // worker. This exists purely for the tests. 531 return u.runListener.RunCommands(args) 532 } 533 534 // acquireExecutionLock acquires the machine-level execution lock, and 535 // returns a func that must be called to unlock it. It's used by operation.Executor 536 // when running operations that execute external code. 537 func (u *Uniter) acquireExecutionLock(message string) (func() error, error) { 538 logger.Debugf("lock: %v", message) 539 // We want to make sure we don't block forever when locking, but take the 540 // Uniter's catacomb into account. 541 checkCatacomb := func() error { 542 select { 543 case <-u.catacomb.Dying(): 544 return u.catacomb.ErrDying() 545 default: 546 return nil 547 } 548 } 549 message = fmt.Sprintf("%s: %s", u.unit.Name(), message) 550 if err := u.hookLock.LockWithFunc(message, checkCatacomb); err != nil { 551 return nil, err 552 } 553 return func() error { 554 logger.Debugf("unlock: %v", message) 555 return u.hookLock.Unlock() 556 }, nil 557 } 558 559 func (u *Uniter) reportHookError(hookInfo hook.Info) error { 560 // Set the agent status to "error". We must do this here in case the 561 // hook is interrupted (e.g. unit agent crashes), rather than immediately 562 // after attempting a runHookOp. 
563 hookName := string(hookInfo.Kind) 564 statusData := map[string]interface{}{} 565 if hookInfo.Kind.IsRelation() { 566 statusData["relation-id"] = hookInfo.RelationId 567 if hookInfo.RemoteUnit != "" { 568 statusData["remote-unit"] = hookInfo.RemoteUnit 569 } 570 relationName, err := u.relations.Name(hookInfo.RelationId) 571 if err != nil { 572 return errors.Trace(err) 573 } 574 hookName = fmt.Sprintf("%s-%s", relationName, hookInfo.Kind) 575 } 576 statusData["hook"] = hookName 577 statusMessage := fmt.Sprintf("hook failed: %q", hookName) 578 return setAgentStatus(u, status.StatusError, statusMessage, statusData) 579 }