github.com/mwhudson/juju@v0.0.0-20160512215208-90ff01f3497f/worker/uniter/uniter.go

     1  // Copyright 2012-2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package uniter
     5  
     6  import (
     7  	"fmt"
     8  	"os"
     9  	"strings"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/juju/errors"
    14  	"github.com/juju/loggo"
    15  	"github.com/juju/names"
    16  	"github.com/juju/utils"
    17  	"github.com/juju/utils/clock"
    18  	"github.com/juju/utils/exec"
    19  	"github.com/juju/utils/fslock"
    20  	corecharm "gopkg.in/juju/charm.v6-unstable"
    21  
    22  	"github.com/juju/juju/api/uniter"
    23  	"github.com/juju/juju/apiserver/params"
    24  	"github.com/juju/juju/core/leadership"
    25  	"github.com/juju/juju/status"
    26  	"github.com/juju/juju/worker"
    27  	"github.com/juju/juju/worker/catacomb"
    28  	"github.com/juju/juju/worker/fortress"
    29  	"github.com/juju/juju/worker/uniter/actions"
    30  	"github.com/juju/juju/worker/uniter/charm"
    31  	"github.com/juju/juju/worker/uniter/hook"
    32  	uniterleadership "github.com/juju/juju/worker/uniter/leadership"
    33  	"github.com/juju/juju/worker/uniter/operation"
    34  	"github.com/juju/juju/worker/uniter/relation"
    35  	"github.com/juju/juju/worker/uniter/remotestate"
    36  	"github.com/juju/juju/worker/uniter/resolver"
    37  	"github.com/juju/juju/worker/uniter/runcommands"
    38  	"github.com/juju/juju/worker/uniter/runner"
    39  	"github.com/juju/juju/worker/uniter/runner/context"
    40  	"github.com/juju/juju/worker/uniter/runner/jujuc"
    41  	"github.com/juju/juju/worker/uniter/storage"
    42  	jujuos "github.com/juju/utils/os"
    43  )
    44  
    45  var logger = loggo.GetLogger("juju.worker.uniter")
    46  
    47  // A UniterExecutionObserver gets the appropriate methods called when a hook
    48  // is executed and either succeeds or fails.  Missing hooks don't get reported
    49  // in this way.
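        //
        // For illustration only (this type is not part of the package), a minimal
        // observer of the kind used by tests might look like:
        //
        //	type hookRecorder struct{ completed, failed []string }
        //
        //	func (r *hookRecorder) HookCompleted(name string) { r.completed = append(r.completed, name) }
        //	func (r *hookRecorder) HookFailed(name string)    { r.failed = append(r.failed, name) }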
    50  type UniterExecutionObserver interface {
    51  	HookCompleted(hookName string)
    52  	HookFailed(hookName string)
    53  }
    54  
    55  // Uniter implements the capabilities of the unit agent. It is not intended to
    56  // implement the actual *behaviour* of the unit agent; that responsibility is
    57  // delegated to the resolvers and operations it is configured with, which react
    58  // to remote-state events and direct the uniter's responses to them.
    59  type Uniter struct {
    60  	catacomb  catacomb.Catacomb
    61  	st        *uniter.State
    62  	paths     Paths
    63  	unit      *uniter.Unit
    64  	relations relation.Relations
    65  	storage   *storage.Attachments
    66  	clock     clock.Clock
    67  
    68  	// Cache the last reported status information
    69  	// so we don't make unnecessary api calls.
    70  	setStatusMutex      sync.Mutex
    71  	lastReportedStatus  status.Status
    72  	lastReportedMessage string
    73  
    74  	deployer             *deployerProxy
    75  	operationFactory     operation.Factory
    76  	operationExecutor    operation.Executor
    77  	newOperationExecutor NewExecutorFunc
    78  
    79  	leadershipTracker leadership.Tracker
    80  	charmDirGuard     fortress.Guard
    81  
    82  	hookLock *fslock.Lock
    83  
    84  	// TODO(axw) move the runListener and run-command code outside of the
    85  	// uniter, and introduce a separate worker. Each worker would feed
    86  	// operations to a single, synchronized runner to execute.
    87  	runListener    *RunListener
    88  	commands       runcommands.Commands
    89  	commandChannel chan string
    90  
    91  	// The execution observer is only used in tests at this stage. Should this
    92  	// need to be extended, perhaps a list of observers would be needed.
    93  	observer UniterExecutionObserver
    94  
    95  	// updateStatusAt defines a function that will be used to generate signals
    96  	// for the update-status hook.
    97  	updateStatusAt func() <-chan time.Time
    98  
    99  	// hookRetryStrategy represents the configuration for hook retries.
   100  	hookRetryStrategy params.RetryStrategy
   101  
   102  	// downloader is the downloader that should be used to get the charm
   103  	// archive.
   104  	downloader charm.Downloader
   105  }
   106  
   107  // UniterParams holds all the necessary parameters for a new Uniter.
   108  type UniterParams struct {
   109  	UniterFacade         *uniter.State
   110  	UnitTag              names.UnitTag
   111  	LeadershipTracker    leadership.Tracker
   112  	DataDir              string
   113  	Downloader           charm.Downloader
   114  	MachineLock          *fslock.Lock
   115  	CharmDirGuard        fortress.Guard
   116  	UpdateStatusSignal   func() <-chan time.Time
   117  	HookRetryStrategy    params.RetryStrategy
   118  	NewOperationExecutor NewExecutorFunc
   119  	Clock                clock.Clock
   120  	// TODO (mattyw, wallyworld, fwereade) Having the observer here makes this approach look a bit more legitimate, but it isn't.
   121  	// The observer is only a stopgap to be used in tests. A better approach would be to have the uniter tests start hooks
   122  	// that write to files, and have the tests watch the output to know that hooks have finished.
   123  	Observer UniterExecutionObserver
   124  }
   125  
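        // NewExecutorFunc is the type of the function used to construct the uniter's
        // operation.Executor. It is called with the path of the operations state file,
        // a function returning the service's charm URL, and a function that acquires
        // the machine execution lock and returns the corresponding unlock function.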
   126  type NewExecutorFunc func(string, func() (*corecharm.URL, error), func(string) (func() error, error)) (operation.Executor, error)
   127  
   128  // NewUniter creates a new Uniter which will install, run, and upgrade
   129  // a charm on behalf of the unit with the given unitTag, by executing
   130  // hooks and operations provoked by changes in st.
   131  func NewUniter(uniterParams *UniterParams) (*Uniter, error) {
   132  	u := &Uniter{
   133  		st:                   uniterParams.UniterFacade,
   134  		paths:                NewPaths(uniterParams.DataDir, uniterParams.UnitTag),
   135  		hookLock:             uniterParams.MachineLock,
   136  		leadershipTracker:    uniterParams.LeadershipTracker,
   137  		charmDirGuard:        uniterParams.CharmDirGuard,
   138  		updateStatusAt:       uniterParams.UpdateStatusSignal,
   139  		hookRetryStrategy:    uniterParams.HookRetryStrategy,
   140  		newOperationExecutor: uniterParams.NewOperationExecutor,
   141  		observer:             uniterParams.Observer,
   142  		clock:                uniterParams.Clock,
   143  		downloader:           uniterParams.Downloader,
   144  	}
   145  	err := catacomb.Invoke(catacomb.Plan{
   146  		Site: &u.catacomb,
   147  		Work: func() error {
   148  			return u.loop(uniterParams.UnitTag)
   149  		},
   150  	})
   151  	if err != nil {
   152  		return nil, errors.Trace(err)
   153  	}
   154  	return u, nil
   155  }
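        // A rough sketch of how a caller wires this up (the identifiers on the
        // right-hand side are placeholders; in practice they are supplied by the
        // unit agent):
        //
        //	u, err := NewUniter(&UniterParams{
        //		UniterFacade:         uniterFacade,
        //		UnitTag:              unitTag,
        //		LeadershipTracker:    leadershipTracker,
        //		DataDir:              dataDir,
        //		Downloader:           downloader,
        //		MachineLock:          machineLock,
        //		CharmDirGuard:        charmDirGuard,
        //		UpdateStatusSignal:   updateStatusSignal,
        //		HookRetryStrategy:    hookRetryStrategy,
        //		NewOperationExecutor: newExecutor,
        //		Clock:                clock.WallClock,
        //	})
        //	if err != nil {
        //		return errors.Trace(err)
        //	}
        //	defer worker.Stop(u)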
   156  
   157  func (u *Uniter) loop(unitTag names.UnitTag) (err error) {
   158  	if err := u.init(unitTag); err != nil {
   159  		if err == worker.ErrTerminateAgent {
   160  			return err
   161  		}
   162  		return fmt.Errorf("failed to initialize uniter for %q: %v", unitTag, err)
   163  	}
   164  	logger.Infof("unit %q started", u.unit)
   165  
   166  	// Install is a special case, as it must run before there
   167  	// is any remote state, and before the remote state watcher
   168  	// is started.
   169  	var charmURL *corecharm.URL
   170  	var charmModifiedVersion int
   171  	opState := u.operationExecutor.State()
   172  	if opState.Kind == operation.Install {
   173  		logger.Infof("resuming charm install")
   174  		op, err := u.operationFactory.NewInstall(opState.CharmURL)
   175  		if err != nil {
   176  			return errors.Trace(err)
   177  		}
   178  		if err := u.operationExecutor.Run(op); err != nil {
   179  			return errors.Trace(err)
   180  		}
   181  		charmURL = opState.CharmURL
   182  	} else {
   183  		curl, err := u.unit.CharmURL()
   184  		if err != nil {
   185  			return errors.Trace(err)
   186  		}
   187  		charmURL = curl
   188  		svc, err := u.unit.Service()
   189  		if err != nil {
   190  			return errors.Trace(err)
   191  		}
   192  		charmModifiedVersion, err = svc.CharmModifiedVersion()
   193  		if err != nil {
   194  			return errors.Trace(err)
   195  		}
   196  	}
   197  
   198  	var (
   199  		watcher   *remotestate.RemoteStateWatcher
   200  		watcherMu sync.Mutex
   201  	)
   202  
   203  	logger.Infof("hooks are retried %v", u.hookRetryStrategy.ShouldRetry)
   204  	retryHookChan := make(chan struct{}, 1)
   205  	retryHookTimer := utils.NewBackoffTimer(utils.BackoffTimerConfig{
   206  		Min:    u.hookRetryStrategy.MinRetryTime,
   207  		Max:    u.hookRetryStrategy.MaxRetryTime,
   208  		Jitter: u.hookRetryStrategy.JitterRetryTime,
   209  		Factor: u.hookRetryStrategy.RetryTimeFactor,
   210  		Func: func() {
   211  			// Don't try to send on the channel if it's already full.
   212  			// This can happen if the timer fires before the event is
   213  			// consumed by the resolver loop.
   214  			select {
   215  			case retryHookChan <- struct{}{}:
   216  			default:
   217  			}
   218  		},
   219  		Clock: u.clock,
   220  	})
   221  	defer func() {
   222  		// Whenever we exit the uniter, stop any potentially running
   223  		// retry timer so it doesn't fire for nothing.
   224  		retryHookTimer.Reset()
   225  	}()
   226  
   227  	restartWatcher := func() error {
   228  		watcherMu.Lock()
   229  		defer watcherMu.Unlock()
   230  
   231  		if watcher != nil {
   232  			// The watcher was added to the catacomb, so an error from it will kill the uniter.
   233  			worker.Stop(watcher)
   234  		}
   235  		var err error
   236  		watcher, err = remotestate.NewWatcher(
   237  			remotestate.WatcherConfig{
   238  				State:               remotestate.NewAPIState(u.st),
   239  				LeadershipTracker:   u.leadershipTracker,
   240  				UnitTag:             unitTag,
   241  				UpdateStatusChannel: u.updateStatusAt,
   242  				CommandChannel:      u.commandChannel,
   243  				RetryHookChannel:    retryHookChan,
   244  			})
   245  		if err != nil {
   246  			return errors.Trace(err)
   247  		}
   248  		if err := u.catacomb.Add(watcher); err != nil {
   249  			return errors.Trace(err)
   250  		}
   251  		return nil
   252  	}
   253  
   254  	onIdle := func() error {
   255  		opState := u.operationExecutor.State()
   256  		if opState.Kind != operation.Continue {
   257  			// We should only set idle status if we're in
   258  			// the "Continue" state, which indicates that
   259  			// there is nothing to do and we're not in an
   260  			// error state.
   261  			return nil
   262  		}
   263  		return setAgentStatus(u, status.StatusIdle, "", nil)
   264  	}
   265  
   266  	clearResolved := func() error {
   267  		if err := u.unit.ClearResolved(); err != nil {
   268  			return errors.Trace(err)
   269  		}
   270  		watcher.ClearResolvedMode()
   271  		return nil
   272  	}
   273  
   274  	for {
   275  		if err = restartWatcher(); err != nil {
   276  			err = errors.Annotate(err, "(re)starting watcher")
   277  			break
   278  		}
   279  
   280  		uniterResolver := NewUniterResolver(ResolverConfig{
   281  			ClearResolved:       clearResolved,
   282  			ReportHookError:     u.reportHookError,
   283  			FixDeployer:         u.deployer.Fix,
   284  			ShouldRetryHooks:    u.hookRetryStrategy.ShouldRetry,
   285  			StartRetryHookTimer: retryHookTimer.Start,
   286  			StopRetryHookTimer:  retryHookTimer.Reset,
   287  			Actions:             actions.NewResolver(),
   288  			Leadership:          uniterleadership.NewResolver(),
   289  			Relations:           relation.NewRelationsResolver(u.relations),
   290  			Storage:             storage.NewResolver(u.storage),
   291  			Commands: runcommands.NewCommandsResolver(
   292  				u.commands, watcher.CommandCompleted,
   293  			),
   294  		})
   295  
   296  		// We should not do anything until there has been a change
   297  		// to the remote state. The watcher will trigger at least
   298  		// once initially.
   299  		select {
   300  		case <-u.catacomb.Dying():
   301  			return u.catacomb.ErrDying()
   302  		case <-watcher.RemoteStateChanged():
   303  		}
   304  
   305  		localState := resolver.LocalState{
   306  			CharmURL:             charmURL,
   307  			CharmModifiedVersion: charmModifiedVersion,
   308  		}
   309  		for err == nil {
   310  			err = resolver.Loop(resolver.LoopConfig{
   311  				Resolver:      uniterResolver,
   312  				Watcher:       watcher,
   313  				Executor:      u.operationExecutor,
   314  				Factory:       u.operationFactory,
   315  				Abort:         u.catacomb.Dying(),
   316  				OnIdle:        onIdle,
   317  				CharmDirGuard: u.charmDirGuard,
   318  			}, &localState)
   319  			switch cause := errors.Cause(err); cause {
   320  			case nil:
   321  				// Loop back around.
   322  			case resolver.ErrLoopAborted:
   323  				err = u.catacomb.ErrDying()
   324  			case operation.ErrNeedsReboot:
   325  				err = worker.ErrRebootMachine
   326  			case operation.ErrHookFailed:
   327  				// Loop back around. The resolver can tell that it is in
   328  				// an error state by inspecting the operation state.
   329  				err = nil
   330  			case resolver.ErrTerminate:
   331  				err = u.terminate()
   332  			case resolver.ErrRestart:
   333  				// Make sure we update the two values used above when
   334  				// creating LocalState.
   335  				charmURL = localState.CharmURL
   336  				charmModifiedVersion = localState.CharmModifiedVersion
   337  				// Leave err assigned, causing the loop to break.
   338  			default:
   339  				// We need to set conflicted from here, because error
   340  				// handling is outside of the resolver's control.
   341  				if operation.IsDeployConflictError(cause) {
   342  					localState.Conflicted = true
   343  					err = setAgentStatus(u, status.StatusError, "upgrade failed", nil)
   344  				} else {
   345  					reportAgentError(u, "resolver loop error", err)
   346  				}
   347  			}
   348  		}
   349  
   350  		if errors.Cause(err) != resolver.ErrRestart {
   351  			break
   352  		}
   353  	}
   354  
   355  	logger.Infof("unit %q shutting down: %s", u.unit, err)
   356  	return err
   357  }
   358  
   359  func (u *Uniter) terminate() error {
   360  	unitWatcher, err := u.unit.Watch()
   361  	if err != nil {
   362  		return errors.Trace(err)
   363  	}
   364  	if err := u.catacomb.Add(unitWatcher); err != nil {
   365  		return errors.Trace(err)
   366  	}
   367  	for {
   368  		select {
   369  		case <-u.catacomb.Dying():
   370  			return u.catacomb.ErrDying()
   371  		case _, ok := <-unitWatcher.Changes():
   372  			if !ok {
   373  				return errors.New("unit watcher closed")
   374  			}
   375  			if err := u.unit.Refresh(); err != nil {
   376  				return errors.Trace(err)
   377  			}
   378  			if hasSubs, err := u.unit.HasSubordinates(); err != nil {
   379  				return errors.Trace(err)
   380  			} else if hasSubs {
   381  				continue
   382  			}
   383  			// The unit is known to be Dying, so if it didn't have subordinates
   384  			// just above, it can't acquire new ones before this call.
   385  			if err := u.unit.EnsureDead(); err != nil {
   386  				return errors.Trace(err)
   387  			}
   388  			return worker.ErrTerminateAgent
   389  		}
   390  	}
   391  }
   392  
   393  func (u *Uniter) setupLocks() (err error) {
   394  	if message := u.hookLock.Message(); u.hookLock.IsLocked() && message != "" {
   395  		// Look to see if it was us that held the lock before.  If it was, we
   396  		// should be safe enough to break it, as it is likely that we died
   397  		// before unlocking, and have been restarted by the init system.
   398  		parts := strings.SplitN(message, ":", 2)
   399  		if len(parts) > 1 && parts[0] == u.unit.Name() {
   400  			if err := u.hookLock.BreakLock(); err != nil {
   401  				return err
   402  			}
   403  		}
   404  	}
   405  	return nil
   406  }
   407  
   408  func (u *Uniter) init(unitTag names.UnitTag) (err error) {
   409  	u.unit, err = u.st.Unit(unitTag)
   410  	if err != nil {
   411  		return err
   412  	}
   413  	if u.unit.Life() == params.Dead {
   414  		// If we started up already dead, we should not progress further. If we
   415  		// become Dead immediately after starting up, we may well complete any
   416  		// operations in progress before detecting it; but that race is fundamental
   417  		// and inescapable, whereas this one is not.
   418  		return worker.ErrTerminateAgent
   419  	}
   420  	if err = u.setupLocks(); err != nil {
   421  		return err
   422  	}
   423  	if err := jujuc.EnsureSymlinks(u.paths.ToolsDir); err != nil {
   424  		return err
   425  	}
   426  	if err := os.MkdirAll(u.paths.State.RelationsDir, 0755); err != nil {
   427  		return errors.Trace(err)
   428  	}
   429  	relations, err := relation.NewRelations(
   430  		u.st, unitTag, u.paths.State.CharmDir,
   431  		u.paths.State.RelationsDir, u.catacomb.Dying(),
   432  	)
   433  	if err != nil {
   434  		return errors.Annotatef(err, "cannot create relations")
   435  	}
   436  	u.relations = relations
   437  	storageAttachments, err := storage.NewAttachments(
   438  		u.st, unitTag, u.paths.State.StorageDir, u.catacomb.Dying(),
   439  	)
   440  	if err != nil {
   441  		return errors.Annotatef(err, "cannot create storage hook source")
   442  	}
   443  	u.storage = storageAttachments
   444  	u.commands = runcommands.NewCommands()
   445  	u.commandChannel = make(chan string)
   446  
   447  	deployer, err := charm.NewDeployer(
   448  		u.paths.State.CharmDir,
   449  		u.paths.State.DeployerDir,
   450  		charm.NewBundlesDir(u.paths.State.BundlesDir, u.downloader),
   451  	)
   452  	if err != nil {
   453  		return errors.Annotatef(err, "cannot create deployer")
   454  	}
   455  	u.deployer = &deployerProxy{deployer}
   456  	contextFactory, err := context.NewContextFactory(
   457  		u.st, unitTag, u.leadershipTracker, u.relations.GetInfo, u.storage, u.paths, u.clock,
   458  	)
   459  	if err != nil {
   460  		return err
   461  	}
   462  	runnerFactory, err := runner.NewFactory(
   463  		u.st, u.paths, contextFactory,
   464  	)
   465  	if err != nil {
   466  		return errors.Trace(err)
   467  	}
   468  	u.operationFactory = operation.NewFactory(operation.FactoryParams{
   469  		Deployer:       u.deployer,
   470  		RunnerFactory:  runnerFactory,
   471  		Callbacks:      &operationCallbacks{u},
   472  		Abort:          u.catacomb.Dying(),
   473  		MetricSpoolDir: u.paths.GetMetricsSpoolDir(),
   474  	})
   475  
   476  	operationExecutor, err := u.newOperationExecutor(u.paths.State.OperationsFile, u.getServiceCharmURL, u.acquireExecutionLock)
   477  	if err != nil {
   478  		return errors.Trace(err)
   479  	}
   480  	u.operationExecutor = operationExecutor
   481  
   482  	logger.Debugf("starting juju-run listener on unix:%s", u.paths.Runtime.JujuRunSocket)
   483  	commandRunner, err := NewChannelCommandRunner(ChannelCommandRunnerConfig{
   484  		Abort:          u.catacomb.Dying(),
   485  		Commands:       u.commands,
   486  		CommandChannel: u.commandChannel,
   487  	})
   488  	if err != nil {
   489  		return errors.Annotate(err, "creating command runner")
   490  	}
   491  	u.runListener, err = NewRunListener(RunListenerConfig{
   492  		SocketPath:    u.paths.Runtime.JujuRunSocket,
   493  		CommandRunner: commandRunner,
   494  	})
   495  	if err != nil {
   496  		return errors.Trace(err)
   497  	}
   498  	rlw := newRunListenerWrapper(u.runListener)
   499  	if err := u.catacomb.Add(rlw); err != nil {
   500  		return errors.Trace(err)
   501  	}
   502  	// The socket needs to have permissions 777 in order for other users to use it.
   503  	if jujuos.HostOS() != jujuos.Windows {
   504  		return os.Chmod(u.paths.Runtime.JujuRunSocket, 0777)
   505  	}
   506  	return nil
   507  }
   508  
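        // Kill is part of the worker.Worker interface.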
   509  func (u *Uniter) Kill() {
   510  	u.catacomb.Kill(nil)
   511  }
   512  
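        // Wait is part of the worker.Worker interface.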
   513  func (u *Uniter) Wait() error {
   514  	return u.catacomb.Wait()
   515  }
   516  
   517  func (u *Uniter) getServiceCharmURL() (*corecharm.URL, error) {
   518  	// TODO(fwereade): pretty sure there's no reason to make 2 API calls here.
   519  	service, err := u.st.Service(u.unit.ServiceTag())
   520  	if err != nil {
   521  		return nil, err
   522  	}
   523  	charmURL, _, err := service.CharmURL()
   524  	return charmURL, err
   525  }
   526  
   527  // RunCommands executes the supplied commands in a hook context.
   528  func (u *Uniter) RunCommands(args RunCommandsArgs) (results *exec.ExecResponse, err error) {
   529  	// TODO(axw) drop this when we move the run-listener to an independent
   530  	// worker. This exists purely for the tests.
   531  	return u.runListener.RunCommands(args)
   532  }
   533  
   534  // acquireExecutionLock acquires the machine-level execution lock, and
   535  // returns a func that must be called to unlock it. It's used by operation.Executor
   536  // when running operations that execute external code.
   537  func (u *Uniter) acquireExecutionLock(message string) (func() error, error) {
   538  	logger.Debugf("lock: %v", message)
   539  	// We want to make sure we don't block forever when locking, but we also
   540  	// need to take the uniter's catacomb into account while we wait.
   541  	checkCatacomb := func() error {
   542  		select {
   543  		case <-u.catacomb.Dying():
   544  			return u.catacomb.ErrDying()
   545  		default:
   546  			return nil
   547  		}
   548  	}
   549  	message = fmt.Sprintf("%s: %s", u.unit.Name(), message)
   550  	if err := u.hookLock.LockWithFunc(message, checkCatacomb); err != nil {
   551  		return nil, err
   552  	}
   553  	return func() error {
   554  		logger.Debugf("unlock: %v", message)
   555  		return u.hookLock.Unlock()
   556  	}, nil
   557  }
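        // A sketch of the expected calling pattern (the message shown here is
        // illustrative; the executor supplies its own):
        //
        //	unlock, err := u.acquireExecutionLock("running some operation")
        //	if err != nil {
        //		return errors.Trace(err)
        //	}
        //	defer unlock()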
   558  
   559  func (u *Uniter) reportHookError(hookInfo hook.Info) error {
   560  	// Set the agent status to "error". We must do this here in case the
   561  	// hook is interrupted (e.g. unit agent crashes), rather than immediately
   562  	// after attempting a runHookOp.
   563  	hookName := string(hookInfo.Kind)
   564  	statusData := map[string]interface{}{}
   565  	if hookInfo.Kind.IsRelation() {
   566  		statusData["relation-id"] = hookInfo.RelationId
   567  		if hookInfo.RemoteUnit != "" {
   568  			statusData["remote-unit"] = hookInfo.RemoteUnit
   569  		}
   570  		relationName, err := u.relations.Name(hookInfo.RelationId)
   571  		if err != nil {
   572  			return errors.Trace(err)
   573  		}
   574  		hookName = fmt.Sprintf("%s-%s", relationName, hookInfo.Kind)
   575  	}
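        	// For example (illustrative names), a failed "joined" hook on a relation
        	// named "db" is reported with hook name "db-relation-joined" and status
        	// message `hook failed: "db-relation-joined"`.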
   576  	statusData["hook"] = hookName
   577  	statusMessage := fmt.Sprintf("hook failed: %q", hookName)
   578  	return setAgentStatus(u, status.StatusError, statusMessage, statusData)
   579  }