github.com/makyo/juju@v0.0.0-20160425123129-2608902037e9/worker/uniter/uniter.go (about)

     1  // Copyright 2012-2015 Canonical Ltd.
     2  // Licensed under the AGPLv3, see LICENCE file for details.
     3  
     4  package uniter
     5  
     6  import (
     7  	"fmt"
     8  	"os"
     9  	"strings"
    10  	"sync"
    11  	"time"
    12  
    13  	"github.com/juju/errors"
    14  	"github.com/juju/loggo"
    15  	"github.com/juju/names"
    16  	"github.com/juju/utils"
    17  	"github.com/juju/utils/clock"
    18  	"github.com/juju/utils/exec"
    19  	"github.com/juju/utils/fslock"
    20  	corecharm "gopkg.in/juju/charm.v6-unstable"
    21  
    22  	"github.com/juju/juju/api/uniter"
    23  	"github.com/juju/juju/apiserver/params"
    24  	"github.com/juju/juju/core/leadership"
    25  	"github.com/juju/juju/status"
    26  	"github.com/juju/juju/worker"
    27  	"github.com/juju/juju/worker/catacomb"
    28  	"github.com/juju/juju/worker/fortress"
    29  	"github.com/juju/juju/worker/uniter/actions"
    30  	"github.com/juju/juju/worker/uniter/charm"
    31  	"github.com/juju/juju/worker/uniter/hook"
    32  	uniterleadership "github.com/juju/juju/worker/uniter/leadership"
    33  	"github.com/juju/juju/worker/uniter/operation"
    34  	"github.com/juju/juju/worker/uniter/relation"
    35  	"github.com/juju/juju/worker/uniter/remotestate"
    36  	"github.com/juju/juju/worker/uniter/resolver"
    37  	"github.com/juju/juju/worker/uniter/runcommands"
    38  	"github.com/juju/juju/worker/uniter/runner"
    39  	"github.com/juju/juju/worker/uniter/runner/context"
    40  	"github.com/juju/juju/worker/uniter/runner/jujuc"
    41  	"github.com/juju/juju/worker/uniter/storage"
    42  	jujuos "github.com/juju/utils/os"
    43  )
    44  
// logger is the package-level logger, registered with loggo under the
// name "juju.worker.uniter".
var logger = loggo.GetLogger("juju.worker.uniter")
    46  
// A UniterExecutionObserver gets the appropriate methods called when a hook
// is executed and either succeeds or fails.  Missing hooks don't get reported
// in this way.
type UniterExecutionObserver interface {
	// HookCompleted is called with the name of a hook that was executed
	// and succeeded.
	HookCompleted(hookName string)
	// HookFailed is called with the name of a hook that was executed
	// and failed.
	HookFailed(hookName string)
}
    54  
// Uniter implements the capabilities of the unit agent. It is not intended to
// implement the actual *behaviour* of the unit agent; that responsibility is
// delegated to Mode values, which are expected to react to events and direct
// the uniter's responses to them.
//
// Fields populated from UniterParams are set in NewUniter; the remainder
// (unit, relations, storage, deployer, the operation factory/executor, the
// run listener and the command plumbing) are set by init, which runs at the
// start of loop.
type Uniter struct {
	// catacomb is the site used by catacomb.Invoke in NewUniter; workers
	// started by the uniter (watchers, the run listener wrapper) are added
	// to it, and Kill/Wait delegate to it.
	catacomb  catacomb.Catacomb
	// st is the uniter API facade (UniterParams.UniterFacade).
	st        *uniter.State
	// paths is built by NewPaths from the data dir and unit tag.
	paths     Paths
	// unit is fetched from st in init.
	unit      *uniter.Unit
	// relations and storage are created in init and handed to the
	// corresponding resolvers in loop.
	relations relation.Relations
	storage   *storage.Attachments
	// clock is passed to the hook retry backoff timer and to the context
	// factory.
	clock     clock.Clock

	// Cache the last reported status information
	// so we don't make unnecessary api calls.
	setStatusMutex      sync.Mutex
	lastReportedStatus  status.Status
	lastReportedMessage string

	// deployer wraps the charm deployer created in init; its Fix method is
	// given to the resolver as FixDeployer.
	deployer             *deployerProxy
	operationFactory     operation.Factory
	operationExecutor    operation.Executor
	// newOperationExecutor is the constructor used in init to build
	// operationExecutor.
	newOperationExecutor NewExecutorFunc

	leadershipTracker leadership.Tracker
	charmDirGuard     fortress.Guard

	// hookLock is the machine lock (UniterParams.MachineLock) taken by
	// acquireExecutionLock and, if stale and ours, broken by setupLocks.
	hookLock *fslock.Lock

	// TODO(axw) move the runListener and run-command code outside of the
	// uniter, and introduce a separate worker. Each worker would feed
	// operations to a single, synchronized runner to execute.
	runListener    *RunListener
	commands       runcommands.Commands
	commandChannel chan string

	// The execution observer is only used in tests at this stage. Should this
	// need to be extended, perhaps a list of observers would be needed.
	observer UniterExecutionObserver

	// updateStatusAt defines a function that will be used to generate signals for
	// the update-status hook
	updateStatusAt func() <-chan time.Time

	// hookRetryStrategy represents configuration for hook retries
	hookRetryStrategy params.RetryStrategy
}
   102  
// UniterParams hold all the necessary parameters for a new Uniter.
// NewUniter copies each field into the corresponding Uniter field; DataDir
// and UnitTag are combined by NewPaths, and UnitTag is also passed to the
// uniter's main loop.
type UniterParams struct {
	UniterFacade         *uniter.State
	UnitTag              names.UnitTag
	LeadershipTracker    leadership.Tracker
	DataDir              string
	// MachineLock becomes the uniter's hook execution lock.
	MachineLock          *fslock.Lock
	CharmDirGuard        fortress.Guard
	// UpdateStatusSignal is handed to the remote state watcher as its
	// UpdateStatusChannel.
	UpdateStatusSignal   func() <-chan time.Time
	// HookRetryStrategy configures the backoff timer used to retry hooks.
	HookRetryStrategy    params.RetryStrategy
	// NewOperationExecutor constructs the operation.Executor during
	// uniter initialisation.
	NewOperationExecutor NewExecutorFunc
	Clock                clock.Clock
	// TODO (mattyw, wallyworld, fwereade) Having the observer here makes this approach look a bit more legitimate, but it isn't.
	// The observer is only a stopgap to be used in tests. A better approach would be to have the uniter tests start hooks
	// that write to files, and have the tests watch the output to know that hooks have finished.
	Observer UniterExecutionObserver
}
   120  
// NewExecutorFunc is the signature of the constructor the uniter uses to
// create its operation.Executor. During initialisation the uniter calls it
// with, in order: the path of the operations state file, a function that
// returns the service's charm URL, and a function that acquires the
// execution lock for a given message and returns the matching unlock func.
type NewExecutorFunc func(string, func() (*corecharm.URL, error), func(string) (func() error, error)) (operation.Executor, error)
   122  
   123  // NewUniter creates a new Uniter which will install, run, and upgrade
   124  // a charm on behalf of the unit with the given unitTag, by executing
   125  // hooks and operations provoked by changes in st.
   126  func NewUniter(uniterParams *UniterParams) (*Uniter, error) {
   127  	u := &Uniter{
   128  		st:                   uniterParams.UniterFacade,
   129  		paths:                NewPaths(uniterParams.DataDir, uniterParams.UnitTag),
   130  		hookLock:             uniterParams.MachineLock,
   131  		leadershipTracker:    uniterParams.LeadershipTracker,
   132  		charmDirGuard:        uniterParams.CharmDirGuard,
   133  		updateStatusAt:       uniterParams.UpdateStatusSignal,
   134  		hookRetryStrategy:    uniterParams.HookRetryStrategy,
   135  		newOperationExecutor: uniterParams.NewOperationExecutor,
   136  		observer:             uniterParams.Observer,
   137  		clock:                uniterParams.Clock,
   138  	}
   139  	err := catacomb.Invoke(catacomb.Plan{
   140  		Site: &u.catacomb,
   141  		Work: func() error {
   142  			return u.loop(uniterParams.UnitTag)
   143  		},
   144  	})
   145  	if err != nil {
   146  		return nil, errors.Trace(err)
   147  	}
   148  	return u, nil
   149  }
   150  
// loop is the body of the uniter worker; NewUniter installs it as the
// catacomb's Work function. It initialises the uniter for unitTag, resumes
// a pending charm install if the persisted operation state says one is in
// progress, and then repeatedly (re)starts the remote state watcher and
// runs the resolver loop. The outer loop only goes round again when the
// resolver loop ends with resolver.ErrRestart; any other error ends the
// worker and is returned.
func (u *Uniter) loop(unitTag names.UnitTag) (err error) {
	// NOTE: this err shadows the named result; it is only visible inside
	// the if statement.
	if err := u.init(unitTag); err != nil {
		if err == worker.ErrTerminateAgent {
			return err
		}
		return fmt.Errorf("failed to initialize uniter for %q: %v", unitTag, err)
	}
	logger.Infof("unit %q started", u.unit)

	// Install is a special case, as it must run before there
	// is any remote state, and before the remote state watcher
	// is started.
	var charmURL *corecharm.URL
	var charmModifiedVersion int
	opState := u.operationExecutor.State()
	if opState.Kind == operation.Install {
		logger.Infof("resuming charm install")
		op, err := u.operationFactory.NewInstall(opState.CharmURL)
		if err != nil {
			return errors.Trace(err)
		}
		if err := u.operationExecutor.Run(op); err != nil {
			return errors.Trace(err)
		}
		charmURL = opState.CharmURL
	} else {
		// Not installing: seed the local state from what the API reports
		// for the unit and its service.
		curl, err := u.unit.CharmURL()
		if err != nil {
			return errors.Trace(err)
		}
		charmURL = curl
		svc, err := u.unit.Service()
		if err != nil {
			return errors.Trace(err)
		}
		charmModifiedVersion, err = svc.CharmModifiedVersion()
		if err != nil {
			return errors.Trace(err)
		}
	}

	var (
		watcher   *remotestate.RemoteStateWatcher
		watcherMu sync.Mutex
	)

	logger.Infof("hooks are retried %v", u.hookRetryStrategy.ShouldRetry)
	// retryHookChan has capacity 1 so the timer callback below can leave a
	// single pending signal without blocking.
	retryHookChan := make(chan struct{}, 1)
	retryHookTimer := utils.NewBackoffTimer(utils.BackoffTimerConfig{
		Min:    u.hookRetryStrategy.MinRetryTime,
		Max:    u.hookRetryStrategy.MaxRetryTime,
		Jitter: u.hookRetryStrategy.JitterRetryTime,
		Factor: u.hookRetryStrategy.RetryTimeFactor,
		Func: func() {
			// Don't try to send on the channel if it's already full
			// This can happen if the timer fires off before the event is consumed
			// by the resolver loop
			select {
			case retryHookChan <- struct{}{}:
			default:
			}
		},
		Clock: u.clock,
	})
	defer func() {
		// Stop any send that might be pending
		// before closing the channel
		retryHookTimer.Reset()
		close(retryHookChan)
	}()

	// restartWatcher stops the current remote state watcher, if any, and
	// replaces it with a new one that is added to the catacomb.
	restartWatcher := func() error {
		watcherMu.Lock()
		defer watcherMu.Unlock()

		if watcher != nil {
			// watcher added to catacomb, will kill uniter if there's an error.
			worker.Stop(watcher)
		}
		var err error
		watcher, err = remotestate.NewWatcher(
			remotestate.WatcherConfig{
				State:               remotestate.NewAPIState(u.st),
				LeadershipTracker:   u.leadershipTracker,
				UnitTag:             unitTag,
				UpdateStatusChannel: u.updateStatusAt,
				CommandChannel:      u.commandChannel,
				RetryHookChannel:    retryHookChan,
			})
		if err != nil {
			return errors.Trace(err)
		}
		if err := u.catacomb.Add(watcher); err != nil {
			return errors.Trace(err)
		}
		return nil
	}

	// onIdle is passed to the resolver loop as its OnIdle callback.
	onIdle := func() error {
		opState := u.operationExecutor.State()
		if opState.Kind != operation.Continue {
			// We should only set idle status if we're in
			// the "Continue" state, which indicates that
			// there is nothing to do and we're not in an
			// error state.
			return nil
		}
		return setAgentStatus(u, status.StatusIdle, "", nil)
	}

	// clearResolved clears the resolved flag via the API and then on the
	// current watcher. It reads watcher without taking watcherMu.
	clearResolved := func() error {
		if err := u.unit.ClearResolved(); err != nil {
			return errors.Trace(err)
		}
		watcher.ClearResolvedMode()
		return nil
	}

	for {
		if err = restartWatcher(); err != nil {
			err = errors.Annotate(err, "(re)starting watcher")
			break
		}

		// The resolver is rebuilt on every pass because it captures
		// methods of the freshly created watcher.
		uniterResolver := NewUniterResolver(ResolverConfig{
			ClearResolved:       clearResolved,
			ReportHookError:     u.reportHookError,
			FixDeployer:         u.deployer.Fix,
			ShouldRetryHooks:    u.hookRetryStrategy.ShouldRetry,
			StartRetryHookTimer: retryHookTimer.Start,
			StopRetryHookTimer:  retryHookTimer.Reset,
			Actions:             actions.NewResolver(),
			Leadership:          uniterleadership.NewResolver(),
			Relations:           relation.NewRelationsResolver(u.relations),
			Storage:             storage.NewResolver(u.storage),
			Commands: runcommands.NewCommandsResolver(
				u.commands, watcher.CommandCompleted,
			),
		})

		// We should not do anything until there has been a change
		// to the remote state. The watcher will trigger at least
		// once initially.
		select {
		case <-u.catacomb.Dying():
			return u.catacomb.ErrDying()
		case <-watcher.RemoteStateChanged():
		}

		localState := resolver.LocalState{
			CharmURL:             charmURL,
			CharmModifiedVersion: charmModifiedVersion,
		}
		// Keep running the resolver loop for as long as its outcome is
		// translated to a nil error by the switch below.
		for err == nil {
			err = resolver.Loop(resolver.LoopConfig{
				Resolver:      uniterResolver,
				Watcher:       watcher,
				Executor:      u.operationExecutor,
				Factory:       u.operationFactory,
				Abort:         u.catacomb.Dying(),
				OnIdle:        onIdle,
				CharmDirGuard: u.charmDirGuard,
			}, &localState)
			switch cause := errors.Cause(err); cause {
			case nil:
				// Loop back around.
			case resolver.ErrLoopAborted:
				err = u.catacomb.ErrDying()
			case operation.ErrNeedsReboot:
				err = worker.ErrRebootMachine
			case operation.ErrHookFailed:
				// Loop back around. The resolver can tell that it is in
				// an error state by inspecting the operation state.
				err = nil
			case resolver.ErrTerminate:
				err = u.terminate()
			case resolver.ErrRestart:
				// make sure we update the two values used above in
				// creating LocalState.
				charmURL = localState.CharmURL
				charmModifiedVersion = localState.CharmModifiedVersion
				// leave err assigned, causing loop to break
			default:
				// We need to set conflicted from here, because error
				// handling is outside of the resolver's control.
				if operation.IsDeployConflictError(cause) {
					localState.Conflicted = true
					// err is replaced by the result of setting the
					// status; if that is nil the inner loop runs again.
					err = setAgentStatus(u, status.StatusError, "upgrade failed", nil)
				} else {
					// err stays set, so the inner loop exits.
					reportAgentError(u, "resolver loop error", err)
				}
			}
		}

		// Only a restart request sends us back to the top to create a new
		// watcher; anything else ends the worker.
		if errors.Cause(err) != resolver.ErrRestart {
			break
		}
	}

	logger.Infof("unit %q shutting down: %s", u.unit, err)
	return err
}
   353  
   354  func (u *Uniter) terminate() error {
   355  	unitWatcher, err := u.unit.Watch()
   356  	if err != nil {
   357  		return errors.Trace(err)
   358  	}
   359  	if err := u.catacomb.Add(unitWatcher); err != nil {
   360  		return errors.Trace(err)
   361  	}
   362  	for {
   363  		select {
   364  		case <-u.catacomb.Dying():
   365  			return u.catacomb.ErrDying()
   366  		case _, ok := <-unitWatcher.Changes():
   367  			if !ok {
   368  				return errors.New("unit watcher closed")
   369  			}
   370  			if err := u.unit.Refresh(); err != nil {
   371  				return errors.Trace(err)
   372  			}
   373  			if hasSubs, err := u.unit.HasSubordinates(); err != nil {
   374  				return errors.Trace(err)
   375  			} else if hasSubs {
   376  				continue
   377  			}
   378  			// The unit is known to be Dying; so if it didn't have subordinates
   379  			// just above, it can't acquire new ones before this call.
   380  			if err := u.unit.EnsureDead(); err != nil {
   381  				return errors.Trace(err)
   382  			}
   383  			return worker.ErrTerminateAgent
   384  		}
   385  	}
   386  }
   387  
   388  func (u *Uniter) setupLocks() (err error) {
   389  	if message := u.hookLock.Message(); u.hookLock.IsLocked() && message != "" {
   390  		// Look to see if it was us that held the lock before.  If it was, we
   391  		// should be safe enough to break it, as it is likely that we died
   392  		// before unlocking, and have been restarted by the init system.
   393  		parts := strings.SplitN(message, ":", 2)
   394  		if len(parts) > 1 && parts[0] == u.unit.Name() {
   395  			if err := u.hookLock.BreakLock(); err != nil {
   396  				return err
   397  			}
   398  		}
   399  	}
   400  	return nil
   401  }
   402  
// init prepares the uniter to run for unitTag. It is called once, at the
// start of loop. In order, it: fetches the unit from the API (returning
// worker.ErrTerminateAgent if the unit is already Dead), breaks a stale
// hook lock left by this unit, ensures the jujuc tool symlinks and the
// relations directory exist, creates the relations and storage state, the
// charm deployer, the operation factory and executor, and finally starts
// the juju-run listener and adds it to the catacomb.
func (u *Uniter) init(unitTag names.UnitTag) (err error) {
	u.unit, err = u.st.Unit(unitTag)
	if err != nil {
		return err
	}
	if u.unit.Life() == params.Dead {
		// If we started up already dead, we should not progress further. If we
		// become Dead immediately after starting up, we may well complete any
		// operations in progress before detecting it; but that race is fundamental
		// and inescapable, whereas this one is not.
		return worker.ErrTerminateAgent
	}
	if err = u.setupLocks(); err != nil {
		return err
	}
	if err := jujuc.EnsureSymlinks(u.paths.ToolsDir); err != nil {
		return err
	}
	if err := os.MkdirAll(u.paths.State.RelationsDir, 0755); err != nil {
		return errors.Trace(err)
	}
	relations, err := relation.NewRelations(
		u.st, unitTag, u.paths.State.CharmDir,
		u.paths.State.RelationsDir, u.catacomb.Dying(),
	)
	if err != nil {
		return errors.Annotatef(err, "cannot create relations")
	}
	u.relations = relations
	storageAttachments, err := storage.NewAttachments(
		u.st, unitTag, u.paths.State.StorageDir, u.catacomb.Dying(),
	)
	if err != nil {
		return errors.Annotatef(err, "cannot create storage hook source")
	}
	u.storage = storageAttachments
	// The command queue and channel are shared between the run listener's
	// command runner (below) and the remote state watcher created in loop.
	u.commands = runcommands.NewCommands()
	u.commandChannel = make(chan string)

	deployer, err := charm.NewDeployer(
		u.paths.State.CharmDir,
		u.paths.State.DeployerDir,
		charm.NewBundlesDir(u.paths.State.BundlesDir),
	)
	if err != nil {
		return errors.Annotatef(err, "cannot create deployer")
	}
	u.deployer = &deployerProxy{deployer}
	// The context factory needs u.relations and u.storage, so it must be
	// built after both have been assigned above.
	contextFactory, err := context.NewContextFactory(
		u.st, unitTag, u.leadershipTracker, u.relations.GetInfo, u.storage, u.paths, u.clock,
	)
	if err != nil {
		return err
	}
	runnerFactory, err := runner.NewFactory(
		u.st, u.paths, contextFactory,
	)
	if err != nil {
		return errors.Trace(err)
	}
	u.operationFactory = operation.NewFactory(operation.FactoryParams{
		Deployer:       u.deployer,
		RunnerFactory:  runnerFactory,
		Callbacks:      &operationCallbacks{u},
		Abort:          u.catacomb.Dying(),
		MetricSpoolDir: u.paths.GetMetricsSpoolDir(),
	})

	operationExecutor, err := u.newOperationExecutor(u.paths.State.OperationsFile, u.getServiceCharmURL, u.acquireExecutionLock)
	if err != nil {
		return errors.Trace(err)
	}
	u.operationExecutor = operationExecutor

	logger.Debugf("starting juju-run listener on unix:%s", u.paths.Runtime.JujuRunSocket)
	commandRunner, err := NewChannelCommandRunner(ChannelCommandRunnerConfig{
		Abort:          u.catacomb.Dying(),
		Commands:       u.commands,
		CommandChannel: u.commandChannel,
	})
	if err != nil {
		return errors.Annotate(err, "creating command runner")
	}
	u.runListener, err = NewRunListener(RunListenerConfig{
		SocketPath:    u.paths.Runtime.JujuRunSocket,
		CommandRunner: commandRunner,
	})
	if err != nil {
		return errors.Trace(err)
	}
	// Hand the listener to the catacomb so its lifetime is tied to the
	// uniter's.
	rlw := newRunListenerWrapper(u.runListener)
	if err := u.catacomb.Add(rlw); err != nil {
		return errors.Trace(err)
	}
	// The socket needs to have permissions 777 in order for other users to use it.
	if jujuos.HostOS() != jujuos.Windows {
		return os.Chmod(u.paths.Runtime.JujuRunSocket, 0777)
	}
	return nil
}
   503  
// Kill asks the uniter to stop by killing its catacomb with a nil error.
// It does not wait for the uniter to finish; use Wait for that.
func (u *Uniter) Kill() {
	u.catacomb.Kill(nil)
}
   507  
// Wait returns the result of waiting on the uniter's catacomb.
func (u *Uniter) Wait() error {
	return u.catacomb.Wait()
}
   511  
   512  func (u *Uniter) getServiceCharmURL() (*corecharm.URL, error) {
   513  	// TODO(fwereade): pretty sure there's no reason to make 2 API calls here.
   514  	service, err := u.st.Service(u.unit.ServiceTag())
   515  	if err != nil {
   516  		return nil, err
   517  	}
   518  	charmURL, _, err := service.CharmURL()
   519  	return charmURL, err
   520  }
   521  
// RunCommands executes the supplied commands in a hook context. It simply
// delegates to the uniter's run listener, so it must not be called before
// the listener has been created during initialisation.
func (u *Uniter) RunCommands(args RunCommandsArgs) (results *exec.ExecResponse, err error) {
	// TODO(axw) drop this when we move the run-listener to an independent
	// worker. This exists purely for the tests.
	return u.runListener.RunCommands(args)
}
   528  
   529  // acquireExecutionLock acquires the machine-level execution lock, and
   530  // returns a func that must be called to unlock it. It's used by operation.Executor
   531  // when running operations that execute external code.
   532  func (u *Uniter) acquireExecutionLock(message string) (func() error, error) {
   533  	logger.Debugf("lock: %v", message)
   534  	// We want to make sure we don't block forever when locking, but take the
   535  	// Uniter's catacomb into account.
   536  	checkCatacomb := func() error {
   537  		select {
   538  		case <-u.catacomb.Dying():
   539  			return u.catacomb.ErrDying()
   540  		default:
   541  			return nil
   542  		}
   543  	}
   544  	message = fmt.Sprintf("%s: %s", u.unit.Name(), message)
   545  	if err := u.hookLock.LockWithFunc(message, checkCatacomb); err != nil {
   546  		return nil, err
   547  	}
   548  	return func() error {
   549  		logger.Debugf("unlock: %v", message)
   550  		return u.hookLock.Unlock()
   551  	}, nil
   552  }
   553  
   554  func (u *Uniter) reportHookError(hookInfo hook.Info) error {
   555  	// Set the agent status to "error". We must do this here in case the
   556  	// hook is interrupted (e.g. unit agent crashes), rather than immediately
   557  	// after attempting a runHookOp.
   558  	hookName := string(hookInfo.Kind)
   559  	statusData := map[string]interface{}{}
   560  	if hookInfo.Kind.IsRelation() {
   561  		statusData["relation-id"] = hookInfo.RelationId
   562  		if hookInfo.RemoteUnit != "" {
   563  			statusData["remote-unit"] = hookInfo.RemoteUnit
   564  		}
   565  		relationName, err := u.relations.Name(hookInfo.RelationId)
   566  		if err != nil {
   567  			return errors.Trace(err)
   568  		}
   569  		hookName = fmt.Sprintf("%s-%s", relationName, hookInfo.Kind)
   570  	}
   571  	statusData["hook"] = hookName
   572  	statusMessage := fmt.Sprintf("hook failed: %q", hookName)
   573  	return setAgentStatus(u, status.StatusError, statusMessage, statusData)
   574  }