github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/worker/machineactions/worker.go (about) 1 // Copyright 2016 Canonical Ltd. 2 // Copyright 2016 Cloudbase Solutions 3 // Licensed under the AGPLv3, see LICENCE file for details. 4 5 package machineactions 6 7 import ( 8 "fmt" 9 "sync" 10 "sync/atomic" 11 12 "github.com/juju/errors" 13 "github.com/juju/loggo" 14 "github.com/juju/names/v5" 15 "github.com/juju/worker/v3" 16 17 "github.com/juju/juju/api/agent/machineactions" 18 "github.com/juju/juju/core/machinelock" 19 "github.com/juju/juju/core/watcher" 20 "github.com/juju/juju/rpc/params" 21 ) 22 23 var logger = loggo.GetLogger("juju.worker.machineactions") 24 25 // Facade defines the capabilities required by the worker from the API. 26 type Facade interface { 27 WatchActionNotifications(agent names.MachineTag) (watcher.StringsWatcher, error) 28 RunningActions(agent names.MachineTag) ([]params.ActionResult, error) 29 30 Action(names.ActionTag) (*machineactions.Action, error) 31 ActionBegin(names.ActionTag) error 32 ActionFinish(tag names.ActionTag, status string, results map[string]any, message string) error 33 } 34 35 // WorkerConfig defines the worker's dependencies. 36 type WorkerConfig struct { 37 Facade Facade 38 MachineTag names.MachineTag 39 MachineLock machinelock.Lock 40 HandleAction func(name string, params map[string]any) (results map[string]any, err error) 41 } 42 43 // Validate returns an error if the configuration is not complete. 44 func (c WorkerConfig) Validate() error { 45 if c.Facade == nil { 46 return errors.NotValidf("nil Facade") 47 } 48 if c.MachineTag == (names.MachineTag{}) { 49 return errors.NotValidf("unspecified MachineTag") 50 } 51 if c.HandleAction == nil { 52 return errors.NotValidf("nil HandleAction") 53 } 54 return nil 55 } 56 57 // NewMachineActionsWorker returns a worker.Worker that watches for actions 58 // enqueued on this machine and tries to execute them. 59 func NewMachineActionsWorker(config WorkerConfig) (worker.Worker, error) { 60 if err := config.Validate(); err != nil { 61 return nil, errors.Trace(err) 62 } 63 swConfig := watcher.StringsConfig{ 64 Handler: &handler{config: config, limiter: make(chan struct{}, maxConcurrency)}, 65 } 66 return watcher.NewStringsWorker(swConfig) 67 } 68 69 // At most 100 actions can run simultaneously. 70 const maxConcurrency = 100 71 72 // handler implements watcher.StringsHandler 73 type handler struct { 74 config WorkerConfig 75 wait sync.WaitGroup 76 limiter chan struct{} 77 inflight int64 78 } 79 80 // SetUp is part of the watcher.StringsHandler interface. 81 func (h *handler) SetUp() (watcher.StringsWatcher, error) { 82 actions, err := h.config.Facade.RunningActions(h.config.MachineTag) 83 if err != nil { 84 return nil, errors.Trace(err) 85 } 86 // We try to cancel any running action before starting up so actions don't linger around 87 // We *should* really have only one action coming up here if the execution is serial but 88 // this is best effort anyway. 89 for _, action := range actions { 90 tag, err := names.ParseActionTag(action.Action.Tag) 91 if err != nil { 92 logger.Infof("tried to cancel action %s but failed with error %v", action.Action.Tag, err) 93 continue 94 } 95 err = h.config.Facade.ActionFinish(tag, params.ActionFailed, nil, "action cancelled") 96 if err != nil { 97 logger.Infof("tried to cancel action %s but failed with error %v", action.Action.Tag, err) 98 } 99 } 100 return h.config.Facade.WatchActionNotifications(h.config.MachineTag) 101 } 102 103 // Handle is part of the watcher.StringsHandler interface. 104 // It should give us any actions currently enqueued for this machine. 105 // We try to execute every action before returning 106 func (h *handler) Handle(abort <-chan struct{}, actionsSlice []string) error { 107 for _, actionId := range actionsSlice { 108 ok := names.IsValidAction(actionId) 109 if !ok { 110 return errors.Errorf("got invalid action id %s", actionId) 111 } 112 113 actionTag := names.NewActionTag(actionId) 114 action, err := h.config.Facade.Action(actionTag) 115 if err != nil { 116 // If there is an error attempting to get the action, then don't bounce 117 // the worker. We can't remove the action notification directly, as that 118 // requires the action to exist. 119 // TODO (stickupkid) As a follow up, we should have a new method that 120 // allows the removal of a action notification without an action present. 121 logger.Infof("unable to retrieve action %s: %v", actionId, err) 122 continue 123 } 124 125 // Acquire concurrency slot. 126 select { 127 case h.limiter <- struct{}{}: 128 case <-abort: 129 // The associated strings watcher has been aborted, so there isn't 130 // anything we can do here but give up. 131 logger.Debugf("action %q aborted waiting in queue", actionTag.ID) 132 return nil 133 } 134 h.wait.Add(1) 135 atomic.AddInt64(&h.inflight, 1) 136 137 // Run the action. 138 go h.runAction(actionTag, *action, abort) 139 } 140 return nil 141 } 142 143 // TearDown is part of the watcher.NotifyHandler interface. 144 func (h *handler) TearDown() error { 145 // Wait for any running actions to finish. 146 // TODO (stickupkid): This wait group could wait for ever if any of actions hang. 147 // Instead we should be much more clever and wait for a limited time before marking 148 // any outstanding actions as failed. 149 inflight := atomic.LoadInt64(&h.inflight) 150 if inflight > 0 { 151 logger.Infof("Waiting for %d running actions...", inflight) 152 } 153 h.wait.Wait() 154 if inflight > 0 { 155 logger.Infof("Done waiting for actions.") 156 } 157 return nil 158 } 159 160 func (h *handler) runAction(actionTag names.ActionTag, action machineactions.Action, abort <-chan struct{}) { 161 var results map[string]any 162 var actionErr error 163 defer func() { 164 // The result returned from handling the action is sent through using ActionFinish. 165 var finishErr error 166 if actionErr != nil { 167 finishErr = h.config.Facade.ActionFinish(actionTag, params.ActionFailed, nil, actionErr.Error()) 168 } else { 169 finishErr = h.config.Facade.ActionFinish(actionTag, params.ActionCompleted, results, "") 170 } 171 if finishErr != nil && 172 !params.IsCodeAlreadyExists(finishErr) && 173 !params.IsCodeNotFoundOrCodeUnauthorized(finishErr) { 174 logger.Errorf("could not finish action %s: %v", action.Name(), finishErr) 175 } 176 177 // Release concurrency slot. 178 select { 179 case <-h.limiter: 180 case <-abort: 181 logger.Debugf("action %q aborted waiting to enqueue", actionTag) 182 } 183 atomic.AddInt64(&h.inflight, -1) 184 h.wait.Done() 185 }() 186 187 if !action.Parallel() || action.ExecutionGroup() != "" { 188 group := "exec-command" 189 worker := "machine exec command runner" 190 if g := action.ExecutionGroup(); g != "" { 191 group = fmt.Sprintf("%s-%s", group, g) 192 worker = fmt.Sprintf("%s (exec group=%s)", worker, g) 193 } 194 spec := machinelock.Spec{ 195 Cancel: abort, 196 Worker: worker, 197 Comment: fmt.Sprintf("action %s", action.ID()), 198 Group: group, 199 } 200 releaser, err := h.config.MachineLock.Acquire(spec) 201 if err != nil { 202 actionErr = errors.Annotatef(err, "could not acquire machine execution lock for exec action %s", action.Name()) 203 return 204 } 205 defer releaser() 206 } 207 208 if err := h.config.Facade.ActionBegin(actionTag); err != nil { 209 actionErr = errors.Annotatef(err, "could not begin action %s", action.Name()) 210 return 211 } 212 results, actionErr = h.config.HandleAction(action.Name(), action.Params()) 213 }