github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/worker/machineactions/worker.go (about)

     1  // Copyright 2016 Canonical Ltd.
     2  // Copyright 2016 Cloudbase Solutions
     3  // Licensed under the AGPLv3, see LICENCE file for details.
     4  
     5  package machineactions
     6  
     7  import (
     8  	"fmt"
     9  	"sync"
    10  	"sync/atomic"
    11  
    12  	"github.com/juju/errors"
    13  	"github.com/juju/loggo"
    14  	"github.com/juju/names/v5"
    15  	"github.com/juju/worker/v3"
    16  
    17  	"github.com/juju/juju/api/agent/machineactions"
    18  	"github.com/juju/juju/core/machinelock"
    19  	"github.com/juju/juju/core/watcher"
    20  	"github.com/juju/juju/rpc/params"
    21  )
    22  
// logger is the package-level logger, registered under the
// "juju.worker.machineactions" module name.
var logger = loggo.GetLogger("juju.worker.machineactions")
    24  
// Facade defines the capabilities required by the worker from the API.
type Facade interface {
	// WatchActionNotifications returns the strings watcher that SetUp hands
	// back to the strings worker for the given machine agent.
	WatchActionNotifications(agent names.MachineTag) (watcher.StringsWatcher, error)
	// RunningActions returns the action results that SetUp treats as still
	// running for the given machine agent and attempts to cancel.
	RunningActions(agent names.MachineTag) ([]params.ActionResult, error)

	// Action fetches the action identified by the given tag.
	Action(names.ActionTag) (*machineactions.Action, error)
	// ActionBegin is called by the worker just before it handles the action.
	ActionBegin(names.ActionTag) error
	// ActionFinish reports the final status of an action together with its
	// results and a message.
	ActionFinish(tag names.ActionTag, status string, results map[string]any, message string) error
}
    34  
// WorkerConfig defines the worker's dependencies.
type WorkerConfig struct {
	// Facade is the API used to watch, fetch and report on actions.
	Facade       Facade
	// MachineTag identifies the machine whose actions are watched; it is
	// passed to Facade.RunningActions and Facade.WatchActionNotifications.
	MachineTag   names.MachineTag
	// MachineLock is acquired before running any action that is not
	// parallel or that declares an execution group.
	// NOTE(review): Validate does not check this field.
	MachineLock  machinelock.Lock
	// HandleAction executes the named action with the given parameters; its
	// results and error are reported back through Facade.ActionFinish.
	HandleAction func(name string, params map[string]any) (results map[string]any, err error)
}
    42  
    43  // Validate returns an error if the configuration is not complete.
    44  func (c WorkerConfig) Validate() error {
    45  	if c.Facade == nil {
    46  		return errors.NotValidf("nil Facade")
    47  	}
    48  	if c.MachineTag == (names.MachineTag{}) {
    49  		return errors.NotValidf("unspecified MachineTag")
    50  	}
    51  	if c.HandleAction == nil {
    52  		return errors.NotValidf("nil HandleAction")
    53  	}
    54  	return nil
    55  }
    56  
    57  // NewMachineActionsWorker returns a worker.Worker that watches for actions
    58  // enqueued on this machine and tries to execute them.
    59  func NewMachineActionsWorker(config WorkerConfig) (worker.Worker, error) {
    60  	if err := config.Validate(); err != nil {
    61  		return nil, errors.Trace(err)
    62  	}
    63  	swConfig := watcher.StringsConfig{
    64  		Handler: &handler{config: config, limiter: make(chan struct{}, maxConcurrency)},
    65  	}
    66  	return watcher.NewStringsWorker(swConfig)
    67  }
    68  
// maxConcurrency is the capacity of the handler's limiter channel: at most
// 100 actions can run simultaneously.
const maxConcurrency = 100
    71  
    72  // handler implements watcher.StringsHandler
    73  type handler struct {
    74  	config   WorkerConfig
    75  	wait     sync.WaitGroup
    76  	limiter  chan struct{}
    77  	inflight int64
    78  }
    79  
    80  // SetUp is part of the watcher.StringsHandler interface.
    81  func (h *handler) SetUp() (watcher.StringsWatcher, error) {
    82  	actions, err := h.config.Facade.RunningActions(h.config.MachineTag)
    83  	if err != nil {
    84  		return nil, errors.Trace(err)
    85  	}
    86  	// We try to cancel any running action before starting up so actions don't linger around
    87  	// We *should* really have only one action coming up here if the execution is serial but
    88  	// this is best effort anyway.
    89  	for _, action := range actions {
    90  		tag, err := names.ParseActionTag(action.Action.Tag)
    91  		if err != nil {
    92  			logger.Infof("tried to cancel action %s but failed with error %v", action.Action.Tag, err)
    93  			continue
    94  		}
    95  		err = h.config.Facade.ActionFinish(tag, params.ActionFailed, nil, "action cancelled")
    96  		if err != nil {
    97  			logger.Infof("tried to cancel action %s but failed with error %v", action.Action.Tag, err)
    98  		}
    99  	}
   100  	return h.config.Facade.WatchActionNotifications(h.config.MachineTag)
   101  }
   102  
   103  // Handle is part of the watcher.StringsHandler interface.
   104  // It should give us any actions currently enqueued for this machine.
   105  // We try to execute every action before returning
   106  func (h *handler) Handle(abort <-chan struct{}, actionsSlice []string) error {
   107  	for _, actionId := range actionsSlice {
   108  		ok := names.IsValidAction(actionId)
   109  		if !ok {
   110  			return errors.Errorf("got invalid action id %s", actionId)
   111  		}
   112  
   113  		actionTag := names.NewActionTag(actionId)
   114  		action, err := h.config.Facade.Action(actionTag)
   115  		if err != nil {
   116  			// If there is an error attempting to get the action, then don't bounce
   117  			// the worker. We can't remove the action notification directly, as that
   118  			// requires the action to exist.
   119  			// TODO (stickupkid) As a follow up, we should have a new method that
   120  			// allows the removal of a action notification without an action present.
   121  			logger.Infof("unable to retrieve action %s: %v", actionId, err)
   122  			continue
   123  		}
   124  
   125  		// Acquire concurrency slot.
   126  		select {
   127  		case h.limiter <- struct{}{}:
   128  		case <-abort:
   129  			// The associated strings watcher has been aborted, so there isn't
   130  			// anything we can do here but give up.
   131  			logger.Debugf("action %q aborted waiting in queue", actionTag.ID)
   132  			return nil
   133  		}
   134  		h.wait.Add(1)
   135  		atomic.AddInt64(&h.inflight, 1)
   136  
   137  		// Run the action.
   138  		go h.runAction(actionTag, *action, abort)
   139  	}
   140  	return nil
   141  }
   142  
   143  // TearDown is part of the watcher.NotifyHandler interface.
   144  func (h *handler) TearDown() error {
   145  	// Wait for any running actions to finish.
   146  	// TODO (stickupkid): This wait group could wait for ever if any of actions hang.
   147  	// Instead we should be much more clever and wait for a limited time before marking
   148  	// any outstanding actions as failed.
   149  	inflight := atomic.LoadInt64(&h.inflight)
   150  	if inflight > 0 {
   151  		logger.Infof("Waiting for %d running actions...", inflight)
   152  	}
   153  	h.wait.Wait()
   154  	if inflight > 0 {
   155  		logger.Infof("Done waiting for actions.")
   156  	}
   157  	return nil
   158  }
   159  
   160  func (h *handler) runAction(actionTag names.ActionTag, action machineactions.Action, abort <-chan struct{}) {
   161  	var results map[string]any
   162  	var actionErr error
   163  	defer func() {
   164  		// The result returned from handling the action is sent through using ActionFinish.
   165  		var finishErr error
   166  		if actionErr != nil {
   167  			finishErr = h.config.Facade.ActionFinish(actionTag, params.ActionFailed, nil, actionErr.Error())
   168  		} else {
   169  			finishErr = h.config.Facade.ActionFinish(actionTag, params.ActionCompleted, results, "")
   170  		}
   171  		if finishErr != nil &&
   172  			!params.IsCodeAlreadyExists(finishErr) &&
   173  			!params.IsCodeNotFoundOrCodeUnauthorized(finishErr) {
   174  			logger.Errorf("could not finish action %s: %v", action.Name(), finishErr)
   175  		}
   176  
   177  		// Release concurrency slot.
   178  		select {
   179  		case <-h.limiter:
   180  		case <-abort:
   181  			logger.Debugf("action %q aborted waiting to enqueue", actionTag)
   182  		}
   183  		atomic.AddInt64(&h.inflight, -1)
   184  		h.wait.Done()
   185  	}()
   186  
   187  	if !action.Parallel() || action.ExecutionGroup() != "" {
   188  		group := "exec-command"
   189  		worker := "machine exec command runner"
   190  		if g := action.ExecutionGroup(); g != "" {
   191  			group = fmt.Sprintf("%s-%s", group, g)
   192  			worker = fmt.Sprintf("%s (exec group=%s)", worker, g)
   193  		}
   194  		spec := machinelock.Spec{
   195  			Cancel:  abort,
   196  			Worker:  worker,
   197  			Comment: fmt.Sprintf("action %s", action.ID()),
   198  			Group:   group,
   199  		}
   200  		releaser, err := h.config.MachineLock.Acquire(spec)
   201  		if err != nil {
   202  			actionErr = errors.Annotatef(err, "could not acquire machine execution lock for exec action %s", action.Name())
   203  			return
   204  		}
   205  		defer releaser()
   206  	}
   207  
   208  	if err := h.config.Facade.ActionBegin(actionTag); err != nil {
   209  		actionErr = errors.Annotatef(err, "could not begin action %s", action.Name())
   210  		return
   211  	}
   212  	results, actionErr = h.config.HandleAction(action.Name(), action.Params())
   213  }