github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/upgradedatabase/worker.go

// Copyright 2019 Canonical Ltd.
// Licensed under the AGPLv3, see LICENCE file for details.

package upgradedatabase

import (
	"fmt"
	"time"

	"github.com/juju/errors"
	"github.com/juju/names/v5"
	"github.com/juju/retry"
	"github.com/juju/version/v2"
	"github.com/juju/worker/v3"
	"gopkg.in/tomb.v2"

	"github.com/juju/juju/agent"
	"github.com/juju/juju/core/status"
	"github.com/juju/juju/state"
	"github.com/juju/juju/upgrades"
	jujuversion "github.com/juju/juju/version"
	"github.com/juju/juju/worker/gate"
	"github.com/juju/juju/wrench"
)

// NewLock creates a gate.Lock to be used to synchronise workers
// that need to start after database upgrades have completed.
// The returned Lock should be passed to NewWorker.
// If the agent has already upgraded to the current version,
// then the lock will be returned in the released state.
func NewLock(agentConfig agent.Config) gate.Lock {
	lock := gate.NewLock()

	// Build numbers are irrelevant to upgrade steps.
	upgradedToVersion := agentConfig.UpgradedToVersion().ToPatch()
	currentVersion := jujuversion.Current.ToPatch()

	if upgradedToVersion == currentVersion {
		lock.Unlock()
	}

	return lock
}
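// A minimal wiring sketch, illustrative only: the concrete wiring lives in
// this worker's manifold, and agentConfig stands in for the machine agent's
// configuration. The remaining Config fields are elided here:
//
//	lock := NewLock(agentConfig)
//	w, err := NewWorker(Config{
//		UpgradeComplete: lock,
//		// Tag, Agent, Logger, OpenState, PerformUpgrade,
//		// RetryStrategy and Clock must also be set; see Validate.
//	})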
// Config is the configuration needed to construct an upgradeDB worker.
type Config struct {
	// UpgradeComplete is a lock used to synchronise workers that must start
	// after database upgrades are verified as completed.
	UpgradeComplete gate.Lock

	// Tag is the current machine tag.
	Tag names.Tag

	// Agent is the running machine agent.
	Agent agent.Agent

	// Logger is the logger for this worker.
	Logger Logger

	// OpenState is a function pointer for returning a state pool indirection.
	OpenState func() (Pool, error)

	// PerformUpgrade is a function pointer for executing the DB upgrade steps.
	// Context retrieval is lazy because it requires a real state.StatePool
	// that we cast our Pool indirection back to.
	// We need the concrete type, because we are unable to indirect all the
	// state methods that upgrade steps might require.
	// This is OK for in-theatre operation, but is not suitable for testing.
	PerformUpgrade func(version.Number, []upgrades.Target, func() upgrades.Context) error

	// RetryStrategy is the strategy to use for re-attempting failed upgrades.
	RetryStrategy retry.CallArgs

	// Clock is used to enforce time-out logic for controllers waiting for the
	// master MongoDB upgrades to execute.
	Clock Clock
}

// Validate returns an error if the worker config is not valid.
func (cfg Config) Validate() error {
	if cfg.UpgradeComplete == nil {
		return errors.NotValidf("nil UpgradeComplete lock")
	}
	if cfg.Tag == nil {
		return errors.NotValidf("nil machine tag")
	}
	k := cfg.Tag.Kind()
	if k != names.MachineTagKind && k != names.ControllerAgentTagKind {
		return errors.NotValidf("%q tag kind", k)
	}
	if cfg.Agent == nil {
		return errors.NotValidf("nil Agent")
	}
	if cfg.Logger == nil {
		return errors.NotValidf("nil Logger")
	}
	if cfg.OpenState == nil {
		return errors.NotValidf("nil OpenState function")
	}
	if cfg.PerformUpgrade == nil {
		return errors.NotValidf("nil PerformUpgrade function")
	}
	if cfg.RetryStrategy.Clock == nil {
		return errors.NotValidf("nil RetryStrategy Clock")
	}
	if cfg.RetryStrategy.Delay == 0 {
		return errors.NotValidf("zero value for RetryStrategy Delay")
	}
	if cfg.RetryStrategy.Attempts == 0 && cfg.RetryStrategy.MaxDuration == 0 {
		return errors.NotValidf("zero value for RetryStrategy Attempts and MaxDuration")
	}
	if cfg.Clock == nil {
		return errors.NotValidf("nil Clock")
	}
	return nil
}
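// A retry strategy that passes Validate needs a clock, a non-zero delay, and
// either a bounded attempt count or a maximum duration. A sketch with
// illustrative values only (the values actually used are chosen elsewhere):
//
//	retry.CallArgs{
//		Clock:    clock.WallClock, // e.g. from github.com/juju/clock
//		Delay:    2 * time.Minute,
//		Attempts: 5,
//	}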
// upgradeDB is a worker that will run on a controller machine.
// It is responsible for running upgrade steps of type `DatabaseMaster` on the
// primary MongoDB instance.
type upgradeDB struct {
	tomb            tomb.Tomb
	upgradeComplete gate.Lock

	tag            names.Tag
	agent          agent.Agent
	logger         Logger
	pool           Pool
	performUpgrade func(version.Number, []upgrades.Target, func() upgrades.Context) error
	upgradeInfo    UpgradeInfo
	retryStrategy  retry.CallArgs
	clock          Clock

	fromVersion version.Number
	toVersion   version.Number
}

// NewWorker validates the input configuration, then uses it to create,
// start and return an upgradeDB worker.
func NewWorker(cfg Config) (worker.Worker, error) {
	var err error

	if err = cfg.Validate(); err != nil {
		return nil, errors.Trace(err)
	}

	w := &upgradeDB{
		upgradeComplete: cfg.UpgradeComplete,
		tag:             cfg.Tag,
		agent:           cfg.Agent,
		logger:          cfg.Logger,
		performUpgrade:  cfg.PerformUpgrade,
		retryStrategy:   cfg.RetryStrategy,
		clock:           cfg.Clock,
	}
	if w.pool, err = cfg.OpenState(); err != nil {
		return nil, err
	}

	w.tomb.Go(w.run)
	return w, nil
}

func (w *upgradeDB) run() error {
	defer func() {
		if err := w.pool.Close(); err != nil {
			w.logger.Errorf("failed closing state pool: %v", err)
		}
	}()

	if w.upgradeDone() {
		return nil
	}

	isPrimary, err := w.pool.IsPrimary(w.tag.Id())
	if err != nil {
		return errors.Trace(err)
	}

	// Ensure that an upgrade document exists in order to monitor this upgrade.
	// This is the same document that will be used by the `upgradesteps` worker
	// that will execute subsequently.
	// In this worker we use it as a distributed lock - once the status reports
	// `UpgradeDBComplete`, the `upgradeComplete` member is unlocked on each
	// controller running this worker.
	if w.upgradeInfo, err = w.pool.EnsureUpgradeInfo(w.tag.Id(), w.fromVersion, w.toVersion); err != nil {
		return errors.Annotate(err, "retrieving upgrade info")
	}

	// If we are the primary, we need to run the upgrade steps.
	// Otherwise we watch state and unlock once the primary has run the steps.
	if isPrimary {
		err = w.runUpgrade()
	} else {
		err = w.watchUpgrade()
	}
	return errors.Trace(err)
}

// upgradeDone returns true if this worker
// does not need to run any upgrade logic.
func (w *upgradeDB) upgradeDone() bool {
	// If we are already unlocked, there is nothing to do.
	if w.upgradeComplete.IsUnlocked() {
		return true
	}

	// If we are already on the current version, there is nothing to do.
	w.fromVersion = w.agent.CurrentConfig().UpgradedToVersion()
	w.toVersion = jujuversion.Current
	if w.fromVersion == w.toVersion {
		w.logger.Infof("database upgrade for %v already completed", w.toVersion)
		w.upgradeComplete.Unlock()
		return true
	}

	return false
}

func (w *upgradeDB) runUpgrade() error {
	w.logger.Infof("running database upgrade for %v on mongodb primary", w.toVersion)
	w.setStatus(status.Started, fmt.Sprintf("upgrading database for %v", w.toVersion))

	err := w.agent.ChangeConfig(w.runUpgradeSteps)
	if err != nil {
		w.setFailStatus()
		return errors.Trace(err)
	}
	// Update the upgrade status document to unlock the other controllers.
	err = w.upgradeInfo.SetStatus(state.UpgradeDBComplete)
	if err != nil {
		w.setFailStatus()
		return errors.Trace(err)
	}
	w.logger.Infof("database upgrade for %v completed successfully", w.toVersion)
	w.setStatus(status.Started, fmt.Sprintf("database upgrade for %v completed", w.toVersion))
	w.upgradeComplete.Unlock()
	return nil
}

// runUpgradeSteps runs the required database upgrade steps for the agent,
// retrying on failure.
func (w *upgradeDB) runUpgradeSteps(agentConfig agent.ConfigSetter) error {
	contextGetter := w.contextGetter(agentConfig)

	retryStrategy := w.retryStrategy
	retryStrategy.Func = func() error {
		return w.performUpgrade(w.fromVersion, []upgrades.Target{upgrades.DatabaseMaster}, contextGetter)
	}
	retryStrategy.NotifyFunc = func(lastError error, attempt int) {
		w.reportUpgradeFailure(lastError, attempt != retryStrategy.Attempts)
	}
	err := retry.Call(retryStrategy)
	if retry.IsAttemptsExceeded(err) || retry.IsDurationExceeded(err) {
		err = retry.LastError(err)
	}
	return errors.Trace(err)
}

// contextGetter returns a function that creates an upgrade context.
// Note that the performUpgrade method passed by the manifold calls
// upgrades.PerformStateUpgrade, which only uses the StateContext from this
// context. We can set the API connection to nil - it should never be used.
func (w *upgradeDB) contextGetter(agentConfig agent.ConfigSetter) func() upgrades.Context {
	return func() upgrades.Context {
		return upgrades.NewContext(agentConfig, nil, upgrades.NewStateBackend(w.pool.(*pool).StatePool))
	}
}

func (w *upgradeDB) watchUpgrade() error {
	w.logger.Infof("waiting for database upgrade on mongodb primary")
	w.setStatus(status.Started, fmt.Sprintf("waiting on primary database upgrade for %v", w.toVersion))

	if wrench.IsActive("upgrade-database", "watch-upgrade") {
		// Simulate an error causing the upgrade to fail.
		w.setFailStatus()
		return errors.New("unable to upgrade - wrench in works")
	}

	timeout := w.clock.After(10 * time.Minute)
	watcher := w.upgradeInfo.Watch()
	defer func() { _ = watcher.Stop() }()

	// Re-read the upgrade document after starting the watcher so that we are
	// operating on the latest information; otherwise there is a potential
	// race where we would not notice a change.
	if err := w.upgradeInfo.Refresh(); err != nil {
		w.logger.Errorf("unable to refresh upgrade info: %v", err)
		w.setFailStatus()
		return err
	}

	// To be here, this node previously reported false for isPrimary.
	// Sometimes the primary changes, or is reported as false when queried too
	// early. If the node's primary status changes while we are watching,
	// escalate an error, which will result in the worker being restarted.
	stateChanged := make(chan struct{})
	done := make(chan struct{})
	defer close(done)
	go func() {
		for {
			select {
			case <-done:
				return
			case <-w.clock.After(5 * time.Second):
				isPrimary, err := w.pool.IsPrimary(w.tag.Id())
				if isPrimary || err != nil {
					if err != nil {
						w.logger.Errorf("failed to check if this node is primary: %v", err)
					}
					close(stateChanged)
					return
				}
			}
		}
	}()

	for {
		// If the primary has already run the database steps then the status
		// will be "db-complete", however it may have progressed further on to
		// upgrade steps, so we check for that status too.
		switch w.upgradeInfo.Status() {
		case state.UpgradeDBComplete, state.UpgradeRunning:
			w.logger.Infof("finished waiting - database upgrade steps completed on mongodb primary")
			w.setStatus(status.Started, fmt.Sprintf("confirmed primary database upgrade for %v", w.toVersion))
			w.upgradeComplete.Unlock()
			return nil
		default:
			// Continue waiting for another change.
		}

		select {
		case <-watcher.Changes():
			if err := w.upgradeInfo.Refresh(); err != nil {
				w.setFailStatus()
				return errors.Trace(err)
			}
		case <-stateChanged:
			w.logger.Infof("primary changed mid-upgrade to this watching host; restarting upgrade")
			return errors.New("mongo primary state changed")
		case <-timeout:
			w.setFailStatus()
			return errors.New("timed out waiting for primary database upgrade")
		case <-w.tomb.Dying():
			return tomb.ErrDying
		}
	}
}

func (w *upgradeDB) reportUpgradeFailure(err error, willRetry bool) {
	retryText := "will retry"
	if !willRetry {
		retryText = "giving up"
	}

	w.logger.Errorf("database upgrade from %v to %v for %q failed (%s): %v",
		w.fromVersion, w.toVersion, w.tag, retryText, err)
	w.setFailStatus()
}

func (w *upgradeDB) setFailStatus() {
	w.setStatus(status.Error, fmt.Sprintf("upgrading database for %v", w.toVersion))
}

func (w *upgradeDB) setStatus(sts status.Status, msg string) {
	if err := w.pool.SetStatus(w.tag.Id(), sts, msg); err != nil {
		w.logger.Errorf("setting agent status: %v", err)
	}
}

// Kill is part of the worker.Worker interface.
func (w *upgradeDB) Kill() {
	w.tomb.Kill(nil)
}

// Wait is part of the worker.Worker interface.
func (w *upgradeDB) Wait() error {
	return w.tomb.Wait()
}