// Source: github.com/juju/juju@v0.0.0-20240430160146-1752b71fcf00/worker/migrationminion/worker.go

// Copyright 2016 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package migrationminion

import (
	"time"

	"github.com/juju/clock"
	"github.com/juju/errors"
	"github.com/juju/worker/v3"
	"github.com/juju/worker/v3/catacomb"
	"gopkg.in/retry.v1"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/api"
	"github.com/juju/juju/api/base"
	apiservererrors "github.com/juju/juju/apiserver/errors"
	"github.com/juju/juju/core/migration"
	"github.com/juju/juju/core/network"
	"github.com/juju/juju/core/watcher"
	"github.com/juju/juju/rpc/params"
	"github.com/juju/juju/worker/fortress"
)

const (
	// maxRetries is the number of times we'll attempt validation
	// before giving up.
	maxRetries = 10

	// initialRetryDelay is the starting delay between validation
	// attempts - it is increased exponentially (by retryBackoffFactor)
	// for up to maxRetries attempts.
	initialRetryDelay = 100 * time.Millisecond

	// retryBackoffFactor is how much longer we wait after a failing
	// retry. Retrying 10 times starting at 100ms and backing off 1.6x
	// gives us a total delay time of about 45s.
	//
	// NOTE(review): 100ms * (1 + 1.6 + ... + 1.6^9) is only about 18s,
	// so the 45s figure looks stale - confirm the intended total.
	retryBackoffFactor = 1.6
)

// Facade exposes controller functionality to a Worker.
type Facade interface {
	// Watch returns a watcher that delivers migration status changes.
	Watch() (watcher.MigrationStatusWatcher, error)

	// Report tells the controller whether this agent succeeded in the
	// given phase of the identified migration.
	Report(migrationId string, phase migration.Phase, success bool) error
}

// Config defines the operation of a Worker. Every field is required;
// see Validate.
type Config struct {
	// Agent supplies the current agent configuration (used to build the
	// connection details for the target controller) and is updated with
	// the target's API addresses and CA cert once a migration succeeds.
	Agent agent.Agent

	// Facade is used to watch migration status and report phase results.
	Facade Facade

	// Guard is locked down while a migration is running and unlocked
	// otherwise.
	Guard fortress.Guard

	// Clock drives the delays between validation retries.
	Clock clock.Clock

	// APIOpen opens an API connection; the Worker uses it to connect to
	// the migration's target controller during validation.
	APIOpen func(*api.Info, api.DialOpts) (api.Connection, error)

	// ValidateMigration is called with a connection to the target
	// controller and returns an error if the agent is not happy with it.
	ValidateMigration func(base.APICaller) error

	// Logger receives the Worker's log output.
	Logger Logger
}

// Validate returns an error if config cannot drive a Worker.
59 func (config Config) Validate() error { 60 if config.Agent == nil { 61 return errors.NotValidf("nil Agent") 62 } 63 if config.Facade == nil { 64 return errors.NotValidf("nil Facade") 65 } 66 if config.Guard == nil { 67 return errors.NotValidf("nil Guard") 68 } 69 if config.Clock == nil { 70 return errors.NotValidf("nil Clock") 71 } 72 if config.APIOpen == nil { 73 return errors.NotValidf("nil APIOpen") 74 } 75 if config.ValidateMigration == nil { 76 return errors.NotValidf("nil ValidateMigration") 77 } 78 if config.Logger == nil { 79 return errors.NotValidf("nil Logger") 80 } 81 return nil 82 } 83 84 // New returns a Worker backed by config, or an error. 85 func New(config Config) (worker.Worker, error) { 86 if err := config.Validate(); err != nil { 87 return nil, errors.Trace(err) 88 } 89 w := &Worker{config: config} 90 err := catacomb.Invoke(catacomb.Plan{ 91 Site: &w.catacomb, 92 Work: w.loop, 93 }) 94 if err != nil { 95 return nil, errors.Trace(err) 96 } 97 return w, nil 98 } 99 100 // Worker waits for a model migration to be active, then locks down the 101 // configured fortress and implements the migration. 102 type Worker struct { 103 catacomb catacomb.Catacomb 104 config Config 105 } 106 107 // Kill implements worker.Worker. 108 func (w *Worker) Kill() { 109 w.catacomb.Kill(nil) 110 } 111 112 // Wait implements worker.Worker. 
113 func (w *Worker) Wait() error { 114 return w.catacomb.Wait() 115 } 116 117 func (w *Worker) loop() error { 118 watch, err := w.config.Facade.Watch() 119 if err != nil { 120 return errors.Annotate(err, "setting up watcher") 121 } 122 if err := w.catacomb.Add(watch); err != nil { 123 return errors.Trace(err) 124 } 125 126 for { 127 select { 128 case <-w.catacomb.Dying(): 129 return w.catacomb.ErrDying() 130 case status, ok := <-watch.Changes(): 131 if !ok { 132 return errors.New("watcher channel closed") 133 } 134 if err := w.handle(status); err != nil { 135 return errors.Trace(err) 136 } 137 } 138 } 139 } 140 141 func (w *Worker) handle(status watcher.MigrationStatus) error { 142 w.config.Logger.Infof("migration phase is now: %s", status.Phase) 143 144 if !status.Phase.IsRunning() { 145 return w.config.Guard.Unlock() 146 } 147 148 // Ensure that all workers related to migration fortress have 149 // stopped and aren't allowed to restart. 150 err := w.config.Guard.Lockdown(w.catacomb.Dying()) 151 if errors.Cause(err) == fortress.ErrAborted { 152 return w.catacomb.ErrDying() 153 } else if err != nil { 154 return errors.Trace(err) 155 } 156 157 switch status.Phase { 158 case migration.QUIESCE: 159 err = w.doQUIESCE(status) 160 case migration.VALIDATION: 161 err = w.doVALIDATION(status) 162 case migration.SUCCESS: 163 err = w.doSUCCESS(status) 164 default: 165 // The minion doesn't need to do anything for other 166 // migration phases. 167 } 168 return errors.Trace(err) 169 } 170 171 func (w *Worker) doQUIESCE(status watcher.MigrationStatus) error { 172 // Report that the minion is ready and that all workers that 173 // should be shut down have done so. 
174 return w.report(status, true) 175 } 176 177 func (w *Worker) doVALIDATION(status watcher.MigrationStatus) error { 178 attempt := retry.StartWithCancel( 179 retry.LimitCount(maxRetries, retry.Exponential{ 180 Initial: initialRetryDelay, 181 Factor: retryBackoffFactor, 182 Jitter: true, 183 }), 184 w.config.Clock, 185 w.catacomb.Dying(), 186 ) 187 var err error 188 for attempt.Next() { 189 err = w.validate(status) 190 if err == nil { 191 break 192 } 193 if attempt.More() { 194 w.config.Logger.Debugf("validation failed (retrying): %v", err) 195 } 196 } 197 if errors.Is(err, apiservererrors.ErrTryAgain) || params.IsCodeTryAgain(err) { 198 // We treat TryAgainError as a retriable error, 199 // so ingore it and don't report to the migration master. 200 w.config.Logger.Errorf("validation failed due to rate limit reached: %v", err) 201 return nil 202 } 203 if err != nil { 204 // Don't return this error just log it and report to the 205 // migrationmaster that things didn't work out. 206 w.config.Logger.Errorf("validation failed: %v", err) 207 } 208 return w.report(status, err == nil) 209 } 210 211 func (w *Worker) validate(status watcher.MigrationStatus) error { 212 agentConf := w.config.Agent.CurrentConfig() 213 apiInfo, ok := agentConf.APIInfo() 214 if !ok { 215 return errors.New("no API connection details") 216 } 217 apiInfo.Addrs = status.TargetAPIAddrs 218 apiInfo.CACert = status.TargetCACert 219 // Application agents (k8s) use old password. 220 if apiInfo.Password == "" { 221 apiInfo.Password = agentConf.OldPassword() 222 } 223 224 // Use zero DialOpts (no retries) because the worker must stay 225 // responsive to Kill requests. We don't want it to be blocked by 226 // a long set of retry attempts. 227 conn, err := w.config.APIOpen(apiInfo, api.DialOpts{}) 228 if err != nil { 229 return errors.Annotate(err, "failed to open API to target controller") 230 } 231 defer func() { _ = conn.Close() }() 232 233 // Ask the agent to confirm that things look ok. 
234 err = w.config.ValidateMigration(conn) 235 return errors.Trace(err) 236 } 237 238 func (w *Worker) doSUCCESS(status watcher.MigrationStatus) error { 239 hps, err := network.ParseProviderHostPorts(status.TargetAPIAddrs...) 240 if err != nil { 241 return errors.Annotate(err, "converting API addresses") 242 } 243 244 // Report first because the config update that's about to happen 245 // will cause the API connection to drop. The SUCCESS phase is the 246 // point of no return anyway. 247 if err := w.report(status, true); err != nil { 248 return errors.Trace(err) 249 } 250 251 err = w.config.Agent.ChangeConfig(func(conf agent.ConfigSetter) error { 252 err := conf.SetAPIHostPorts([]network.HostPorts{hps.HostPorts()}) 253 if err != nil { 254 return errors.Trace(err) 255 } 256 conf.SetCACert(status.TargetCACert) 257 return nil 258 }) 259 return errors.Annotate(err, "setting agent config") 260 } 261 262 func (w *Worker) report(status watcher.MigrationStatus, success bool) error { 263 w.config.Logger.Debugf("reporting back for phase %s: %v", status.Phase, success) 264 err := w.config.Facade.Report(status.MigrationId, status.Phase, success) 265 return errors.Annotate(err, "failed to report phase progress") 266 }