// Source: github.com/makyo/juju@v0.0.0-20160425123129-2608902037e9/worker/uniter/uniter.go

// Copyright 2012-2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package uniter

import (
	"fmt"
	"os"
	"strings"
	"sync"
	"time"

	"github.com/juju/errors"
	"github.com/juju/loggo"
	"github.com/juju/names"
	"github.com/juju/utils"
	"github.com/juju/utils/clock"
	"github.com/juju/utils/exec"
	"github.com/juju/utils/fslock"
	corecharm "gopkg.in/juju/charm.v6-unstable"

	"github.com/juju/juju/api/uniter"
	"github.com/juju/juju/apiserver/params"
	"github.com/juju/juju/core/leadership"
	"github.com/juju/juju/status"
	"github.com/juju/juju/worker"
	"github.com/juju/juju/worker/catacomb"
	"github.com/juju/juju/worker/fortress"
	"github.com/juju/juju/worker/uniter/actions"
	"github.com/juju/juju/worker/uniter/charm"
	"github.com/juju/juju/worker/uniter/hook"
	uniterleadership "github.com/juju/juju/worker/uniter/leadership"
	"github.com/juju/juju/worker/uniter/operation"
	"github.com/juju/juju/worker/uniter/relation"
	"github.com/juju/juju/worker/uniter/remotestate"
	"github.com/juju/juju/worker/uniter/resolver"
	"github.com/juju/juju/worker/uniter/runcommands"
	"github.com/juju/juju/worker/uniter/runner"
	"github.com/juju/juju/worker/uniter/runner/context"
	"github.com/juju/juju/worker/uniter/runner/jujuc"
	"github.com/juju/juju/worker/uniter/storage"
	jujuos "github.com/juju/utils/os"
)

// logger is the package-level logger, registered under the name
// "juju.worker.uniter".
var logger = loggo.GetLogger("juju.worker.uniter")

// A UniterExecutionObserver gets the appropriate methods called when a hook
// is executed and either succeeds or fails. Missing hooks don't get reported
// in this way.
type UniterExecutionObserver interface {
	// HookCompleted receives the name of the hook that was executed.
	// NOTE(review): presumably the success notification; the call sites
	// are not in this file - confirm.
	HookCompleted(hookName string)
	// HookFailed receives the name of the hook that was executed.
	// NOTE(review): presumably the failure notification; the call sites
	// are not in this file - confirm.
	HookFailed(hookName string)
}

// Uniter implements the capabilities of the unit agent.
// It is not intended to
// implement the actual *behaviour* of the unit agent; that responsibility is
// delegated to Mode values, which are expected to react to events and direct
// the uniter's responses to them.
type Uniter struct {
	// catacomb is the site NewUniter hands to catacomb.Invoke; Kill and
	// Wait delegate to it. st, paths and clock are filled in by NewUniter
	// from UniterParams (paths via NewPaths), while unit, relations and
	// storage are only populated later, by init.
	catacomb  catacomb.Catacomb
	st        *uniter.State
	paths     Paths
	unit      *uniter.Unit
	relations relation.Relations
	storage   *storage.Attachments
	clock     clock.Clock

	// Cache the last reported status information
	// so we don't make unnecessary api calls.
	setStatusMutex      sync.Mutex
	lastReportedStatus  status.Status
	lastReportedMessage string

	// deployer, operationFactory and operationExecutor are built in init;
	// newOperationExecutor is the constructor supplied through
	// UniterParams that init calls to create operationExecutor.
	deployer             *deployerProxy
	operationFactory     operation.Factory
	operationExecutor    operation.Executor
	newOperationExecutor NewExecutorFunc

	// Both values are copied from UniterParams by NewUniter.
	leadershipTracker leadership.Tracker
	charmDirGuard     fortress.Guard

	// hookLock is UniterParams.MachineLock; acquireExecutionLock takes it
	// and setupLocks may break it on startup.
	hookLock *fslock.Lock

	// TODO(axw) move the runListener and run-command code outside of the
	// uniter, and introduce a separate worker. Each worker would feed
	// operations to a single, synchronized runner to execute.
	runListener    *RunListener
	commands       runcommands.Commands
	commandChannel chan string

	// The execution observer is only used in tests at this stage. Should this
	// need to be extended, perhaps a list of observers would be needed.
	observer UniterExecutionObserver

	// updateStatusAt defines a function that will be used to generate signals for
	// the update-status hook
	updateStatusAt func() <-chan time.Time

	// hookRetryStrategy represents configuration for hook retries
	hookRetryStrategy params.RetryStrategy
}

// UniterParams hold all the necessary parameters for a new Uniter.
type UniterParams struct {
	// Each field below is copied into the matching Uniter field by
	// NewUniter, except DataDir and UnitTag: those are combined through
	// NewPaths, and UnitTag is additionally passed to the uniter's loop.
	UniterFacade         *uniter.State
	UnitTag              names.UnitTag
	LeadershipTracker    leadership.Tracker
	DataDir              string
	MachineLock          *fslock.Lock
	CharmDirGuard        fortress.Guard
	UpdateStatusSignal   func() <-chan time.Time
	HookRetryStrategy    params.RetryStrategy
	NewOperationExecutor NewExecutorFunc
	Clock                clock.Clock
	// TODO (mattyw, wallyworld, fwereade) Having the observer here make this approach a bit more legitimate, but it isn't.
	// the observer is only a stop gap to be used in tests. A better approach would be to have the uniter tests start hooks
	// that write to files, and have the tests watch the output to know that hooks have finished.
	Observer UniterExecutionObserver
}

// NewExecutorFunc is the signature of the constructor used to build the
// uniter's operation.Executor. Within this file it is called from
// Uniter.init with, in order: the path of the operations state file, a
// function returning the service's charm URL, and a function that acquires
// the execution lock for a given message and returns the matching unlock
// function.
type NewExecutorFunc func(string, func() (*corecharm.URL, error), func(string) (func() error, error)) (operation.Executor, error)

// NewUniter creates a new Uniter which will install, run, and upgrade
// a charm on behalf of the unit with the given unitTag, by executing
// hooks and operations provoked by changes in st.
126 func NewUniter(uniterParams *UniterParams) (*Uniter, error) { 127 u := &Uniter{ 128 st: uniterParams.UniterFacade, 129 paths: NewPaths(uniterParams.DataDir, uniterParams.UnitTag), 130 hookLock: uniterParams.MachineLock, 131 leadershipTracker: uniterParams.LeadershipTracker, 132 charmDirGuard: uniterParams.CharmDirGuard, 133 updateStatusAt: uniterParams.UpdateStatusSignal, 134 hookRetryStrategy: uniterParams.HookRetryStrategy, 135 newOperationExecutor: uniterParams.NewOperationExecutor, 136 observer: uniterParams.Observer, 137 clock: uniterParams.Clock, 138 } 139 err := catacomb.Invoke(catacomb.Plan{ 140 Site: &u.catacomb, 141 Work: func() error { 142 return u.loop(uniterParams.UnitTag) 143 }, 144 }) 145 if err != nil { 146 return nil, errors.Trace(err) 147 } 148 return u, nil 149 } 150 151 func (u *Uniter) loop(unitTag names.UnitTag) (err error) { 152 if err := u.init(unitTag); err != nil { 153 if err == worker.ErrTerminateAgent { 154 return err 155 } 156 return fmt.Errorf("failed to initialize uniter for %q: %v", unitTag, err) 157 } 158 logger.Infof("unit %q started", u.unit) 159 160 // Install is a special case, as it must run before there 161 // is any remote state, and before the remote state watcher 162 // is started. 
163 var charmURL *corecharm.URL 164 var charmModifiedVersion int 165 opState := u.operationExecutor.State() 166 if opState.Kind == operation.Install { 167 logger.Infof("resuming charm install") 168 op, err := u.operationFactory.NewInstall(opState.CharmURL) 169 if err != nil { 170 return errors.Trace(err) 171 } 172 if err := u.operationExecutor.Run(op); err != nil { 173 return errors.Trace(err) 174 } 175 charmURL = opState.CharmURL 176 } else { 177 curl, err := u.unit.CharmURL() 178 if err != nil { 179 return errors.Trace(err) 180 } 181 charmURL = curl 182 svc, err := u.unit.Service() 183 if err != nil { 184 return errors.Trace(err) 185 } 186 charmModifiedVersion, err = svc.CharmModifiedVersion() 187 if err != nil { 188 return errors.Trace(err) 189 } 190 } 191 192 var ( 193 watcher *remotestate.RemoteStateWatcher 194 watcherMu sync.Mutex 195 ) 196 197 logger.Infof("hooks are retried %v", u.hookRetryStrategy.ShouldRetry) 198 retryHookChan := make(chan struct{}, 1) 199 retryHookTimer := utils.NewBackoffTimer(utils.BackoffTimerConfig{ 200 Min: u.hookRetryStrategy.MinRetryTime, 201 Max: u.hookRetryStrategy.MaxRetryTime, 202 Jitter: u.hookRetryStrategy.JitterRetryTime, 203 Factor: u.hookRetryStrategy.RetryTimeFactor, 204 Func: func() { 205 // Don't try to send on the channel if it's already full 206 // This can happen if the timer fires off before the event is consumed 207 // by the resolver loop 208 select { 209 case retryHookChan <- struct{}{}: 210 default: 211 } 212 }, 213 Clock: u.clock, 214 }) 215 defer func() { 216 // Stop any send that might be pending 217 // before closing the channel 218 retryHookTimer.Reset() 219 close(retryHookChan) 220 }() 221 222 restartWatcher := func() error { 223 watcherMu.Lock() 224 defer watcherMu.Unlock() 225 226 if watcher != nil { 227 // watcher added to catacomb, will kill uniter if there's an error. 
228 worker.Stop(watcher) 229 } 230 var err error 231 watcher, err = remotestate.NewWatcher( 232 remotestate.WatcherConfig{ 233 State: remotestate.NewAPIState(u.st), 234 LeadershipTracker: u.leadershipTracker, 235 UnitTag: unitTag, 236 UpdateStatusChannel: u.updateStatusAt, 237 CommandChannel: u.commandChannel, 238 RetryHookChannel: retryHookChan, 239 }) 240 if err != nil { 241 return errors.Trace(err) 242 } 243 if err := u.catacomb.Add(watcher); err != nil { 244 return errors.Trace(err) 245 } 246 return nil 247 } 248 249 onIdle := func() error { 250 opState := u.operationExecutor.State() 251 if opState.Kind != operation.Continue { 252 // We should only set idle status if we're in 253 // the "Continue" state, which indicates that 254 // there is nothing to do and we're not in an 255 // error state. 256 return nil 257 } 258 return setAgentStatus(u, status.StatusIdle, "", nil) 259 } 260 261 clearResolved := func() error { 262 if err := u.unit.ClearResolved(); err != nil { 263 return errors.Trace(err) 264 } 265 watcher.ClearResolvedMode() 266 return nil 267 } 268 269 for { 270 if err = restartWatcher(); err != nil { 271 err = errors.Annotate(err, "(re)starting watcher") 272 break 273 } 274 275 uniterResolver := NewUniterResolver(ResolverConfig{ 276 ClearResolved: clearResolved, 277 ReportHookError: u.reportHookError, 278 FixDeployer: u.deployer.Fix, 279 ShouldRetryHooks: u.hookRetryStrategy.ShouldRetry, 280 StartRetryHookTimer: retryHookTimer.Start, 281 StopRetryHookTimer: retryHookTimer.Reset, 282 Actions: actions.NewResolver(), 283 Leadership: uniterleadership.NewResolver(), 284 Relations: relation.NewRelationsResolver(u.relations), 285 Storage: storage.NewResolver(u.storage), 286 Commands: runcommands.NewCommandsResolver( 287 u.commands, watcher.CommandCompleted, 288 ), 289 }) 290 291 // We should not do anything until there has been a change 292 // to the remote state. The watcher will trigger at least 293 // once initially. 
294 select { 295 case <-u.catacomb.Dying(): 296 return u.catacomb.ErrDying() 297 case <-watcher.RemoteStateChanged(): 298 } 299 300 localState := resolver.LocalState{ 301 CharmURL: charmURL, 302 CharmModifiedVersion: charmModifiedVersion, 303 } 304 for err == nil { 305 err = resolver.Loop(resolver.LoopConfig{ 306 Resolver: uniterResolver, 307 Watcher: watcher, 308 Executor: u.operationExecutor, 309 Factory: u.operationFactory, 310 Abort: u.catacomb.Dying(), 311 OnIdle: onIdle, 312 CharmDirGuard: u.charmDirGuard, 313 }, &localState) 314 switch cause := errors.Cause(err); cause { 315 case nil: 316 // Loop back around. 317 case resolver.ErrLoopAborted: 318 err = u.catacomb.ErrDying() 319 case operation.ErrNeedsReboot: 320 err = worker.ErrRebootMachine 321 case operation.ErrHookFailed: 322 // Loop back around. The resolver can tell that it is in 323 // an error state by inspecting the operation state. 324 err = nil 325 case resolver.ErrTerminate: 326 err = u.terminate() 327 case resolver.ErrRestart: 328 // make sure we update the two values used above in 329 // creating LocalState. 330 charmURL = localState.CharmURL 331 charmModifiedVersion = localState.CharmModifiedVersion 332 // leave err assigned, causing loop to break 333 default: 334 // We need to set conflicted from here, because error 335 // handling is outside of the resolver's control. 
336 if operation.IsDeployConflictError(cause) { 337 localState.Conflicted = true 338 err = setAgentStatus(u, status.StatusError, "upgrade failed", nil) 339 } else { 340 reportAgentError(u, "resolver loop error", err) 341 } 342 } 343 } 344 345 if errors.Cause(err) != resolver.ErrRestart { 346 break 347 } 348 } 349 350 logger.Infof("unit %q shutting down: %s", u.unit, err) 351 return err 352 } 353 354 func (u *Uniter) terminate() error { 355 unitWatcher, err := u.unit.Watch() 356 if err != nil { 357 return errors.Trace(err) 358 } 359 if err := u.catacomb.Add(unitWatcher); err != nil { 360 return errors.Trace(err) 361 } 362 for { 363 select { 364 case <-u.catacomb.Dying(): 365 return u.catacomb.ErrDying() 366 case _, ok := <-unitWatcher.Changes(): 367 if !ok { 368 return errors.New("unit watcher closed") 369 } 370 if err := u.unit.Refresh(); err != nil { 371 return errors.Trace(err) 372 } 373 if hasSubs, err := u.unit.HasSubordinates(); err != nil { 374 return errors.Trace(err) 375 } else if hasSubs { 376 continue 377 } 378 // The unit is known to be Dying; so if it didn't have subordinates 379 // just above, it can't acquire new ones before this call. 380 if err := u.unit.EnsureDead(); err != nil { 381 return errors.Trace(err) 382 } 383 return worker.ErrTerminateAgent 384 } 385 } 386 } 387 388 func (u *Uniter) setupLocks() (err error) { 389 if message := u.hookLock.Message(); u.hookLock.IsLocked() && message != "" { 390 // Look to see if it was us that held the lock before. If it was, we 391 // should be safe enough to break it, as it is likely that we died 392 // before unlocking, and have been restarted by the init system. 
393 parts := strings.SplitN(message, ":", 2) 394 if len(parts) > 1 && parts[0] == u.unit.Name() { 395 if err := u.hookLock.BreakLock(); err != nil { 396 return err 397 } 398 } 399 } 400 return nil 401 } 402 403 func (u *Uniter) init(unitTag names.UnitTag) (err error) { 404 u.unit, err = u.st.Unit(unitTag) 405 if err != nil { 406 return err 407 } 408 if u.unit.Life() == params.Dead { 409 // If we started up already dead, we should not progress further. If we 410 // become Dead immediately after starting up, we may well complete any 411 // operations in progress before detecting it; but that race is fundamental 412 // and inescapable, whereas this one is not. 413 return worker.ErrTerminateAgent 414 } 415 if err = u.setupLocks(); err != nil { 416 return err 417 } 418 if err := jujuc.EnsureSymlinks(u.paths.ToolsDir); err != nil { 419 return err 420 } 421 if err := os.MkdirAll(u.paths.State.RelationsDir, 0755); err != nil { 422 return errors.Trace(err) 423 } 424 relations, err := relation.NewRelations( 425 u.st, unitTag, u.paths.State.CharmDir, 426 u.paths.State.RelationsDir, u.catacomb.Dying(), 427 ) 428 if err != nil { 429 return errors.Annotatef(err, "cannot create relations") 430 } 431 u.relations = relations 432 storageAttachments, err := storage.NewAttachments( 433 u.st, unitTag, u.paths.State.StorageDir, u.catacomb.Dying(), 434 ) 435 if err != nil { 436 return errors.Annotatef(err, "cannot create storage hook source") 437 } 438 u.storage = storageAttachments 439 u.commands = runcommands.NewCommands() 440 u.commandChannel = make(chan string) 441 442 deployer, err := charm.NewDeployer( 443 u.paths.State.CharmDir, 444 u.paths.State.DeployerDir, 445 charm.NewBundlesDir(u.paths.State.BundlesDir), 446 ) 447 if err != nil { 448 return errors.Annotatef(err, "cannot create deployer") 449 } 450 u.deployer = &deployerProxy{deployer} 451 contextFactory, err := context.NewContextFactory( 452 u.st, unitTag, u.leadershipTracker, u.relations.GetInfo, u.storage, u.paths, u.clock, 
453 ) 454 if err != nil { 455 return err 456 } 457 runnerFactory, err := runner.NewFactory( 458 u.st, u.paths, contextFactory, 459 ) 460 if err != nil { 461 return errors.Trace(err) 462 } 463 u.operationFactory = operation.NewFactory(operation.FactoryParams{ 464 Deployer: u.deployer, 465 RunnerFactory: runnerFactory, 466 Callbacks: &operationCallbacks{u}, 467 Abort: u.catacomb.Dying(), 468 MetricSpoolDir: u.paths.GetMetricsSpoolDir(), 469 }) 470 471 operationExecutor, err := u.newOperationExecutor(u.paths.State.OperationsFile, u.getServiceCharmURL, u.acquireExecutionLock) 472 if err != nil { 473 return errors.Trace(err) 474 } 475 u.operationExecutor = operationExecutor 476 477 logger.Debugf("starting juju-run listener on unix:%s", u.paths.Runtime.JujuRunSocket) 478 commandRunner, err := NewChannelCommandRunner(ChannelCommandRunnerConfig{ 479 Abort: u.catacomb.Dying(), 480 Commands: u.commands, 481 CommandChannel: u.commandChannel, 482 }) 483 if err != nil { 484 return errors.Annotate(err, "creating command runner") 485 } 486 u.runListener, err = NewRunListener(RunListenerConfig{ 487 SocketPath: u.paths.Runtime.JujuRunSocket, 488 CommandRunner: commandRunner, 489 }) 490 if err != nil { 491 return errors.Trace(err) 492 } 493 rlw := newRunListenerWrapper(u.runListener) 494 if err := u.catacomb.Add(rlw); err != nil { 495 return errors.Trace(err) 496 } 497 // The socket needs to have permissions 777 in order for other users to use it. 498 if jujuos.HostOS() != jujuos.Windows { 499 return os.Chmod(u.paths.Runtime.JujuRunSocket, 0777) 500 } 501 return nil 502 } 503 504 func (u *Uniter) Kill() { 505 u.catacomb.Kill(nil) 506 } 507 508 func (u *Uniter) Wait() error { 509 return u.catacomb.Wait() 510 } 511 512 func (u *Uniter) getServiceCharmURL() (*corecharm.URL, error) { 513 // TODO(fwereade): pretty sure there's no reason to make 2 API calls here. 
514 service, err := u.st.Service(u.unit.ServiceTag()) 515 if err != nil { 516 return nil, err 517 } 518 charmURL, _, err := service.CharmURL() 519 return charmURL, err 520 } 521 522 // RunCommands executes the supplied commands in a hook context. 523 func (u *Uniter) RunCommands(args RunCommandsArgs) (results *exec.ExecResponse, err error) { 524 // TODO(axw) drop this when we move the run-listener to an independent 525 // worker. This exists purely for the tests. 526 return u.runListener.RunCommands(args) 527 } 528 529 // acquireExecutionLock acquires the machine-level execution lock, and 530 // returns a func that must be called to unlock it. It's used by operation.Executor 531 // when running operations that execute external code. 532 func (u *Uniter) acquireExecutionLock(message string) (func() error, error) { 533 logger.Debugf("lock: %v", message) 534 // We want to make sure we don't block forever when locking, but take the 535 // Uniter's catacomb into account. 536 checkCatacomb := func() error { 537 select { 538 case <-u.catacomb.Dying(): 539 return u.catacomb.ErrDying() 540 default: 541 return nil 542 } 543 } 544 message = fmt.Sprintf("%s: %s", u.unit.Name(), message) 545 if err := u.hookLock.LockWithFunc(message, checkCatacomb); err != nil { 546 return nil, err 547 } 548 return func() error { 549 logger.Debugf("unlock: %v", message) 550 return u.hookLock.Unlock() 551 }, nil 552 } 553 554 func (u *Uniter) reportHookError(hookInfo hook.Info) error { 555 // Set the agent status to "error". We must do this here in case the 556 // hook is interrupted (e.g. unit agent crashes), rather than immediately 557 // after attempting a runHookOp. 
558 hookName := string(hookInfo.Kind) 559 statusData := map[string]interface{}{} 560 if hookInfo.Kind.IsRelation() { 561 statusData["relation-id"] = hookInfo.RelationId 562 if hookInfo.RemoteUnit != "" { 563 statusData["remote-unit"] = hookInfo.RemoteUnit 564 } 565 relationName, err := u.relations.Name(hookInfo.RelationId) 566 if err != nil { 567 return errors.Trace(err) 568 } 569 hookName = fmt.Sprintf("%s-%s", relationName, hookInfo.Kind) 570 } 571 statusData["hook"] = hookName 572 statusMessage := fmt.Sprintf("hook failed: %q", hookName) 573 return setAgentStatus(u, status.StatusError, statusMessage, statusData) 574 }