github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/uniter/resolver/loop.go

// Copyright 2015 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package resolver

import (
    "time"

    jujucharm "github.com/juju/charm/v12"
    "github.com/juju/charm/v12/hooks"
    "github.com/juju/errors"
    "github.com/juju/mutex/v2"

    "github.com/juju/juju/core/lxdprofile"
    "github.com/juju/juju/worker/fortress"
    "github.com/juju/juju/worker/uniter/operation"
    "github.com/juju/juju/worker/uniter/remotestate"
)

// ErrLoopAborted is used to signal that the loop is exiting because it
// received a value on its config's Abort chan.
var ErrLoopAborted = errors.New("resolver loop aborted")

// ErrDoNotProceed is used to distinguish behaviour from
// resolver.ErrNoOperation, i.e. do not run any operations versus
// this resolver has no operations to run.
var ErrDoNotProceed = errors.New("do not proceed")

// logger is here to stop the desire to create a package-level Logger.
// Don't do this; instead use the one passed into the LoopConfig.
type logger interface{}

var _ logger = struct{}{}

// Logger represents the logging methods used in this package.
type Logger interface {
    Errorf(string, ...interface{})
    Debugf(string, ...interface{})
    Tracef(string, ...interface{})
    Warningf(string, ...interface{})
}

// LoopConfig contains configuration parameters for the resolver loop.
type LoopConfig struct {
    Resolver      Resolver
    Watcher       remotestate.Watcher
    Executor      operation.Executor
    Factory       operation.Factory
    Abort         <-chan struct{}
    OnIdle        func() error
    CharmDirGuard fortress.Guard
    CharmDir      string
    Logger        Logger
}
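// A minimal sketch of how a caller might assemble the loop. The concrete
// resolver, watcher, executor, factory, and guard values below are
// assumptions standing in for whatever the uniter constructs; only the
// field names come from LoopConfig above.
//
//	localState := &resolver.LocalState{CharmURL: curl}
//	err := resolver.Loop(resolver.LoopConfig{
//	    Resolver:      uniterResolver, // hypothetical Resolver implementation
//	    Watcher:       watcher,        // remotestate.Watcher
//	    Executor:      executor,       // operation.Executor
//	    Factory:       opFactory,      // operation.Factory
//	    Abort:         abort,
//	    OnIdle:        func() error { return nil },
//	    CharmDirGuard: charmDirGuard,  // fortress.Guard
//	    CharmDir:      charmDir,
//	    Logger:        logger,
//	}, localState)
//	if errors.Cause(err) == resolver.ErrLoopAborted {
//	    // Normal shutdown path: the Abort channel was signalled.
//	}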
// Loop repeatedly waits for remote state changes, feeding the local and
// remote state to the provided Resolver to generate Operations which are
// then run with the provided Executor.
//
// The provided "onIdle" function will be called when the loop is waiting
// for remote state changes due to a lack of work to perform. It will not
// be called when a change is anticipated (i.e. due to ErrWaiting).
//
// The resolver loop can be controlled in the following ways:
//   - if the "abort" channel is signalled, then the loop will
//     exit with ErrLoopAborted
//   - if the resolver returns ErrWaiting, then no operations
//     will be executed until the remote state has changed
//     again
//   - if the resolver returns ErrNoOperation, then "onIdle"
//     will be invoked and the loop will wait until the remote
//     state has changed again
//   - if the resolver, onIdle, or executor return some other
//     error, the loop will exit immediately
func Loop(cfg LoopConfig, localState *LocalState) error {
    rf := &resolverOpFactory{Factory: cfg.Factory, LocalState: localState}

    // Initialize charmdir availability before entering the loop in case
    // we're recovering from a restart.
    err := updateCharmDir(cfg.Executor.State(), cfg.CharmDirGuard, cfg.Abort, cfg.Logger)
    if err != nil {
        return errors.Trace(err)
    }

    // If we're restarting the loop, ensure any pending charm upgrade is run
    // before continuing.
    err = checkCharmInstallUpgrade(cfg.Logger, cfg.CharmDir, cfg.Watcher.Snapshot(), rf, cfg.Executor)
    if err != nil {
        return errors.Trace(err)
    }

    fire := make(chan struct{}, 1)
    for {
        rf.RemoteState = cfg.Watcher.Snapshot()
        rf.LocalState.State = cfg.Executor.State()

        if localState.HookWasShutdown && rf.RemoteState.ContainerRunningStatus != nil {
            agentShutdown := rf.RemoteState.Shutdown
            if !agentShutdown {
                agentShutdown = maybeAgentShutdown(cfg)
            }
            if !agentShutdown {
                cfg.Logger.Warningf("last %q hook was killed, but agent still alive", localState.Hook.Kind)
            }
        }

        op, err := cfg.Resolver.NextOp(*rf.LocalState, rf.RemoteState, rf)
        for err == nil {
            // Send remote state changes to running operations.
            remoteStateChanged := make(chan remotestate.Snapshot)
            done := make(chan struct{})
            go func() {
                var rs chan remotestate.Snapshot
                for {
                    select {
                    case <-cfg.Watcher.RemoteStateChanged():
                        // We consumed a remote state change event
                        // so we need a way to trigger the select below
                        // in case it was a new operation.
                        select {
                        case fire <- struct{}{}:
                        default:
                        }
                        rs = remoteStateChanged
                    case rs <- cfg.Watcher.Snapshot():
                        rs = nil
                    case <-done:
                        return
                    }
                }
            }()

            cfg.Logger.Tracef("running op: %v", op)
            if err := cfg.Executor.Run(op, remoteStateChanged); err != nil {
                close(done)

                if errors.Cause(err) == mutex.ErrCancelled {
                    // If the lock acquisition was cancelled (such as when the
                    // migration-inactive flag drops) we do not want the
                    // resolver to surface that error. This puts the agent into
                    // the "failed" state, which causes the initial migration
                    // validation phase to fail.
                    // The safest thing to do is to bounce the loop and
                    // reevaluate our state, which is what happens upon a
                    // fortress error anyway (uniter.TranslateFortressErrors).
                    cfg.Logger.Warningf("executor lock acquisition cancelled")
                    return ErrRestart
                }
                return errors.Trace(err)
            }
            close(done)

            // Refresh snapshot, in case remote state
            // changed between operations.
            rf.RemoteState = cfg.Watcher.Snapshot()
            rf.LocalState.State = cfg.Executor.State()

            err = updateCharmDir(rf.LocalState.State, cfg.CharmDirGuard, cfg.Abort, cfg.Logger)
            if err != nil {
                return errors.Trace(err)
            }

            op, err = cfg.Resolver.NextOp(*rf.LocalState, rf.RemoteState, rf)
        }

        switch errors.Cause(err) {
        case nil:
        case ErrWaiting:
            // If a resolver is waiting for events to
            // complete, the agent is not idle.
        case ErrNoOperation:
            if cfg.OnIdle != nil {
                if err := cfg.OnIdle(); err != nil {
                    return errors.Trace(err)
                }
            }
        default:
            return err
        }

        select {
        case <-cfg.Abort:
            return ErrLoopAborted
        case <-cfg.Watcher.RemoteStateChanged():
        case <-fire:
        }
    }
}
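// The forwarding goroutine inside Loop relies on a standard Go idiom: a
// send or receive on a nil channel never proceeds, so toggling rs between
// nil and remoteStateChanged arms and disarms the send case. That lets a
// single select both coalesce watcher events and hand the running
// operation only the latest snapshot. A stripped-down sketch of the same
// idiom (illustrative only; events, results, latest, and done are
// placeholders, not names from this package):
//
//	var out chan remotestate.Snapshot // nil: send case disarmed
//	for {
//	    select {
//	    case <-events:
//	        out = results // an event arrived; arm the send case
//	    case out <- latest():
//	        out = nil // delivered; disarm until the next event
//	    case <-done:
//	        return
//	    }
//	}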
// maybeAgentShutdown returns true if the agent was killed by a
// SIGTERM. If not true at the time of calling, it will wait a short
// time for the status to possibly be updated.
func maybeAgentShutdown(cfg LoopConfig) bool {
    fire := make(chan struct{}, 1)
    remoteStateChanged := make(chan remotestate.Snapshot)
    done := make(chan struct{})
    defer close(done)
    go func() {
        var rs chan remotestate.Snapshot
        for {
            select {
            case <-cfg.Watcher.RemoteStateChanged():
                // We consumed a remote state change event
                // so we need a way to trigger the select below
                // in case it was a new operation.
                select {
                case fire <- struct{}{}:
                default:
                }
                rs = remoteStateChanged
            case rs <- cfg.Watcher.Snapshot():
                rs = nil
            case <-done:
                return
            }
        }
    }()
    for {
        select {
        case rs := <-remoteStateChanged:
            if rs.Shutdown {
                return true
            }
        case <-time.After(3 * time.Second):
            return false
        }
    }
}

// updateCharmDir sets charm directory availability for sharing among
// concurrent workers according to local operation state.
func updateCharmDir(opState operation.State, guard fortress.Guard, abort fortress.Abort, logger Logger) error {
    var changing bool

    // Determine if the charm content is changing.
    if opState.Kind == operation.Install || opState.Kind == operation.Upgrade {
        changing = true
    } else if opState.Kind == operation.RunHook && opState.Hook != nil && opState.Hook.Kind == hooks.UpgradeCharm {
        changing = true
    }

    available := opState.Started && !opState.Stopped && !changing
    logger.Tracef("charmdir: available=%v opState: started=%v stopped=%v changing=%v",
        available, opState.Started, opState.Stopped, changing)
    if available {
        return guard.Unlock()
    }
    return guard.Lockdown(abort)
}
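// The consumer side of the guard, for context: workers that read the charm
// directory wrap their access in a fortress visit, which blocks while
// updateCharmDir holds a Lockdown. A sketch assuming the fortress package's
// Guest interface (readCharmFiles is a hypothetical helper):
//
//	err := charmDirGuest.Visit(func() error {
//	    // The charm directory is guaranteed stable for the
//	    // duration of this function.
//	    return readCharmFiles()
//	}, abort)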
func checkCharmInstallUpgrade(logger Logger, charmDir string, remote remotestate.Snapshot, rf *resolverOpFactory, ex operation.Executor) error {
    // If we restarted due to an error while a charm upgrade was pending,
    // do the upgrade now. There are cases (lp:1895040) where the error was
    // caused because not all units were upgraded before relation-created
    // hooks were attempted for peer relations. Do this before the remote
    // state watcher is started: the watcher will not trigger an upgrade
    // until the next applicationChanged event, so without this we could
    // get stuck in an error loop.

    local := rf.LocalState
    local.State = ex.State()

    opFunc := rf.NewUpgrade
    if !local.Installed && local.Hook != nil && local.Hook.Kind == hooks.Install && local.Step != operation.Done {
        // We must have failed to run the install hook and restarted
        // (possibly in a sidecar charm), so we need to re-run the install op.
        opFunc = rf.NewInstall
    } else if !local.Installed || remote.CharmURL == "" {
        // If the unit isn't installed, there is no need to start an upgrade.
        return nil
    }

    _, err := jujucharm.ReadCharmDir(charmDir)
    haveCharmDir := err == nil
    if haveCharmDir {
        // If the unit is installed, already upgrading, and the charm dir
        // exists, there is no need to start an upgrade.
        if local.Kind == operation.Upgrade || (local.Hook != nil && local.Hook.Kind == hooks.UpgradeCharm) {
            return nil
        }
    }

    if local.Started && remote.CharmProfileRequired {
        if remote.LXDProfileName == "" {
            return nil
        }
        rev, err := lxdprofile.ProfileRevision(remote.LXDProfileName)
        if err != nil {
            return errors.Trace(err)
        }
        curl, err := jujucharm.ParseURL(remote.CharmURL)
        if err != nil {
            return errors.Trace(err)
        }
        if rev != curl.Revision {
            logger.Tracef("Charm profile required: current revision %d does not match new revision %d", rev, curl.Revision)
            return nil
        }
    }

    sameCharm := local.CharmURL == remote.CharmURL
    if haveCharmDir && (!local.Started || sameCharm) {
        return nil
    }
    if !haveCharmDir {
        logger.Debugf("start to re-download charm %v because charm dir %q is gone, which is usually caused by operator pod re-scheduling", remote.CharmURL, charmDir)
    }
    if !sameCharm {
        logger.Debugf("execute pending upgrade from %s to %s after uniter loop restart", local.CharmURL, remote.CharmURL)
    }

    op, err := opFunc(remote.CharmURL)
    if err != nil {
        return errors.Trace(err)
    }
    if err = ex.Run(op, nil); err != nil {
        return errors.Trace(err)
    }
    if local.Restart {
        return ErrRestart
    }
    return nil
}
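// The LXD profile gate above compares the revision encoded in the profile
// name with the charm URL's revision, and defers the upgrade until they
// match. A small sketch, assuming a profile named with a trailing revision
// and a "ch:" style charm URL (the exact naming scheme is owned by the
// lxdprofile package; the URL and profile name here are made up):
//
//	rev, _ := lxdprofile.ProfileRevision("juju-mymodel-myapp-42") // 42
//	curl, _ := jujucharm.ParseURL("ch:myapp-42")
//	upToDate := rev == curl.Revision // profile matches the charm revision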