github.com/juju/juju@v0.0.0-20240327075706-a90865de2538/worker/upgradeseries/worker.go (about) 1 // Copyright 2018 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package upgradeseries 5 6 import ( 7 "strings" 8 "sync" 9 10 "github.com/juju/errors" 11 "github.com/juju/names/v5" 12 "github.com/juju/os/v2/series" 13 "github.com/juju/worker/v3" 14 "github.com/juju/worker/v3/catacomb" 15 16 "github.com/juju/juju/core/model" 17 "github.com/juju/juju/rpc/params" 18 ) 19 20 //go:generate go run go.uber.org/mock/mockgen -package mocks -destination mocks/package_mock.go github.com/juju/juju/worker/upgradeseries Facade,UnitDiscovery,Upgrader 21 22 var hostSeries = series.HostSeries 23 24 // Logger represents the methods required to emit log messages. 25 type Logger interface { 26 Debugf(message string, args ...interface{}) 27 Infof(message string, args ...interface{}) 28 Warningf(message string, args ...interface{}) 29 Errorf(message string, args ...interface{}) 30 } 31 32 // UnitDiscovery represents how the worker determines which units need 33 // to check in. 34 type UnitDiscovery interface { 35 Units() ([]names.UnitTag, error) 36 } 37 38 // Config is the configuration needed to construct an UpgradeSeries worker. 39 type Config struct { 40 // Facade is used to access back-end state. 41 Facade Facade 42 43 // Logger is the logger for this worker. 44 Logger Logger 45 46 // UnitDiscovery determines how the worker knows which units should 47 // be running on the machine. 48 UnitDiscovery UnitDiscovery 49 50 // UpgraderFactory is a factory method that will return an upgrader capable 51 // of handling service and agent binary manipulation for a 52 // runtime-determined current and target OS series. 53 UpgraderFactory func(string, string) (Upgrader, error) 54 } 55 56 // Validate validates the upgrade-series worker configuration. 57 func (config Config) Validate() error { 58 if config.Logger == nil { 59 return errors.NotValidf("nil Logger") 60 } 61 if config.Facade == nil { 62 return errors.NotValidf("nil Facade") 63 } 64 if config.UnitDiscovery == nil { 65 return errors.NotValidf("nil UnitDiscovery") 66 } 67 if config.UpgraderFactory == nil { 68 return errors.NotValidf("nil UpgraderFactory") 69 } 70 return nil 71 } 72 73 // upgradeSeriesWorker is responsible for machine and unit agent requirements 74 // during upgrade-series: 75 // 76 // copying the agent binary directory and renaming; 77 // rewriting the machine and unit(s) systemd files if necessary; 78 // ensuring unit agents are started post-upgrade; 79 // moving the status of the upgrade-series steps along. 80 type upgradeSeriesWorker struct { 81 Facade 82 83 catacomb catacomb.Catacomb 84 logger Logger 85 unitDiscovery UnitDiscovery 86 upgraderFactory func(string, string) (Upgrader, error) 87 88 // Some local state retained for reporting purposes. 89 mu sync.Mutex 90 machineStatus model.UpgradeSeriesStatus 91 units names.Set 92 preparedUnits []names.UnitTag 93 completedUnits []names.UnitTag 94 95 // Ensure that leaders are pinned only once if possible, 96 // on the first transition to UpgradeSeriesPrepareStarted. 97 // However repeated pin calls are not of too much concern, 98 // as the pin operations are idempotent. 99 leadersPinned bool 100 } 101 102 // NewWorker creates, starts and returns a new upgrade-series worker based on 103 // the input configuration. 104 func NewWorker(config Config) (worker.Worker, error) { 105 if err := config.Validate(); err != nil { 106 return nil, errors.Trace(err) 107 } 108 109 w := &upgradeSeriesWorker{ 110 Facade: config.Facade, 111 logger: config.Logger, 112 unitDiscovery: config.UnitDiscovery, 113 upgraderFactory: config.UpgraderFactory, 114 machineStatus: model.UpgradeSeriesNotStarted, 115 leadersPinned: false, 116 } 117 118 if err := catacomb.Invoke(catacomb.Plan{ 119 Site: &w.catacomb, 120 Work: w.loop, 121 }); err != nil { 122 return nil, errors.Trace(err) 123 } 124 125 return w, nil 126 } 127 128 func (w *upgradeSeriesWorker) loop() error { 129 uw, err := w.WatchUpgradeSeriesNotifications() 130 if err != nil { 131 return errors.Trace(err) 132 } 133 err = w.catacomb.Add(uw) 134 if err != nil { 135 return errors.Trace(err) 136 } 137 for { 138 select { 139 case <-w.catacomb.Dying(): 140 return w.catacomb.ErrDying() 141 case <-uw.Changes(): 142 if err := w.handleUpgradeSeriesChange(); err != nil { 143 return errors.Trace(err) 144 } 145 } 146 } 147 } 148 149 // handleUpgradeSeriesChange retrieves the current upgrade-series status for 150 // this machine and based on the status, calls methods that will progress 151 // the workflow accordingly. 152 func (w *upgradeSeriesWorker) handleUpgradeSeriesChange() error { 153 w.mu.Lock() 154 defer w.mu.Unlock() 155 156 var err error 157 if w.machineStatus, err = w.MachineStatus(); err != nil { 158 if errors.IsNotFound(err) { 159 // No upgrade-series lock. This can happen when: 160 // - The first watch call is made. 161 // - The lock is removed after a completed upgrade. 162 w.logger.Infof("no series upgrade lock present") 163 w.machineStatus = model.UpgradeSeriesNotStarted 164 w.preparedUnits = nil 165 w.completedUnits = nil 166 return nil 167 } 168 return errors.Trace(err) 169 } 170 w.logger.Infof("machine series upgrade status is %q", w.machineStatus) 171 172 // Determine the set of units that are on the machine. 173 if w.units == nil { 174 units, err := w.unitDiscovery.Units() 175 if err != nil { 176 return errors.Annotate(err, "unit discovery") 177 } 178 w.units = names.NewSet(asGenericTags(units)...) 179 } 180 181 switch w.machineStatus { 182 case model.UpgradeSeriesValidate: 183 err = w.handleValidate() 184 case model.UpgradeSeriesPrepareStarted: 185 err = w.handlePrepareStarted() 186 case model.UpgradeSeriesCompleteStarted: 187 err = w.handleCompleteStarted() 188 case model.UpgradeSeriesCompleted: 189 err = w.handleCompleted() 190 } 191 192 if err != nil { 193 if err := w.SetInstanceStatus(model.UpgradeSeriesError, err.Error()); err != nil { 194 w.logger.Errorf("failed to set series upgrade error status: %s", err.Error()) 195 } 196 } 197 return errors.Trace(err) 198 } 199 200 // handleValidate handles the workflow for the machine with validating the 201 // given set of machine applications and charms. 202 func (w *upgradeSeriesWorker) handleValidate() error { 203 if err := w.SetInstanceStatus(model.UpgradeSeriesValidate, "validating units"); err != nil { 204 return errors.Trace(err) 205 } 206 return nil 207 } 208 209 // handlePrepareStarted handles workflow for the machine with an upgrade-series 210 // lock status of "UpgradeSeriesPrepareStarted" 211 func (w *upgradeSeriesWorker) handlePrepareStarted() error { 212 var err error 213 if err = w.SetInstanceStatus(model.UpgradeSeriesPrepareStarted, "preparing units"); err != nil { 214 return errors.Trace(err) 215 } 216 217 if !w.leadersPinned { 218 if err = w.pinLeaders(); err != nil { 219 return errors.Trace(err) 220 } 221 } 222 223 if w.preparedUnits, err = w.UnitsPrepared(); err != nil { 224 return errors.Trace(err) 225 } 226 227 // If not all the units have checked in, we are still preparing. 228 prepared := names.NewSet(asGenericTags(w.preparedUnits)...) 229 if remaining := w.units.Difference(prepared); remaining.Size() > 0 { 230 // Not done yet. 231 var names []string 232 for _, tag := range remaining.SortedValues() { 233 names = append(names, tag.Id()) 234 } 235 w.logger.Debugf("waiting for units: %s", strings.Join(names, ",")) 236 return nil 237 } 238 239 return errors.Trace(w.transitionPrepareComplete()) 240 } 241 242 // transitionPrepareComplete rewrites service unit files for unit agents running 243 // on this machine so that they are compatible with the init system of the 244 // series upgrade target. 245 func (w *upgradeSeriesWorker) transitionPrepareComplete() error { 246 if err := w.SetInstanceStatus(model.UpgradeSeriesPrepareStarted, "completing preparation"); err != nil { 247 return errors.Trace(err) 248 } 249 250 w.logger.Infof("preparing service units for series upgrade") 251 currentSeries, err := w.CurrentSeries() 252 if err != nil { 253 return errors.Trace(err) 254 } 255 256 toSeries, err := w.TargetSeries() 257 if err != nil { 258 return errors.Trace(err) 259 } 260 261 upgrader, err := w.upgraderFactory(currentSeries, toSeries) 262 if err != nil { 263 return errors.Trace(err) 264 } 265 if err := upgrader.PerformUpgrade(); err != nil { 266 return errors.Trace(err) 267 } 268 269 if err := w.SetMachineStatus(model.UpgradeSeriesPrepareCompleted, "binaries and service files written"); err != nil { 270 return errors.Trace(err) 271 } 272 273 return errors.Trace(w.SetInstanceStatus(model.UpgradeSeriesPrepareCompleted, "waiting for completion command")) 274 } 275 276 func (w *upgradeSeriesWorker) handleCompleteStarted() error { 277 if err := w.SetInstanceStatus(model.UpgradeSeriesCompleteStarted, "waiting for units"); err != nil { 278 return errors.Trace(err) 279 } 280 281 var err error 282 if w.preparedUnits, err = w.UnitsPrepared(); err != nil { 283 return errors.Trace(err) 284 } 285 286 // If all the units are prepared, tell them to start. 287 prepared := names.NewSet(asGenericTags(w.preparedUnits)...) 288 if remaining := w.units.Difference(prepared); remaining.Size() == 0 && len(w.units) > 0 { 289 return errors.Trace(w.StartUnitCompletion("start units after series upgrade")) 290 } 291 292 // If the units have all completed their workflow, then we are done. 293 // Make the final update to the lock to say the machine is completed. 294 if w.completedUnits, err = w.UnitsCompleted(); err != nil { 295 return errors.Trace(err) 296 } 297 298 // If not all the units have checked in, we are still preparing. 299 completed := names.NewSet(asGenericTags(w.completedUnits)...) 300 if remaining := w.units.Difference(completed); remaining.Size() > 0 { 301 // Not done yet. 302 var names []string 303 for _, tag := range remaining.SortedValues() { 304 names = append(names, tag.Id()) 305 } 306 w.logger.Debugf("waiting for units: %s", strings.Join(names, ",")) 307 return nil 308 } 309 310 w.logger.Infof("series upgrade complete") 311 return errors.Trace(w.SetMachineStatus(model.UpgradeSeriesCompleted, "series upgrade complete")) 312 } 313 314 // handleCompleted notifies the server that it has completed the upgrade 315 // workflow, then unpins leadership for applications running on the machine. 316 func (w *upgradeSeriesWorker) handleCompleted() error { 317 if err := w.SetInstanceStatus(model.UpgradeSeriesCompleted, "finalising upgrade"); err != nil { 318 return errors.Trace(err) 319 } 320 321 s, err := hostSeries() 322 if err != nil { 323 return errors.Trace(err) 324 } 325 if err = w.FinishUpgradeSeries(s); err != nil { 326 return errors.Trace(err) 327 } 328 if err = w.unpinLeaders(); err != nil { 329 return errors.Trace(err) 330 } 331 332 return errors.Trace(w.SetInstanceStatus(model.UpgradeSeriesCompleted, "success")) 333 } 334 335 // pinLeaders pins leadership for applications 336 // represented by units running on this machine. 337 func (w *upgradeSeriesWorker) pinLeaders() (err error) { 338 // if we encounter an error, 339 // attempt to ensure that no application leaders remain pinned. 340 defer func() { 341 if err != nil { 342 if unpinErr := w.unpinLeaders(); unpinErr != nil { 343 err = errors.Wrap(err, unpinErr) 344 } 345 } 346 }() 347 348 results, err := w.PinMachineApplications() 349 if err != nil { 350 // If pin machine applications method return not implemented because it's 351 // utilising the legacy leases store, then we should display the warning 352 // in the log and return out. Unpinning leaders should be safe as that 353 // should be considered a no-op 354 if params.IsCodeNotImplemented(err) { 355 w.logger.Infof("failed to pin machine applications, with legacy lease manager leadership pinning is not implemented") 356 return nil 357 } 358 return errors.Trace(err) 359 } 360 361 var lastErr error 362 for app, err := range results { 363 if err == nil { 364 w.logger.Infof("unpin leader for application %q", app) 365 continue 366 } 367 w.logger.Errorf("failed to pin leader for application %q: %s", app, err.Error()) 368 lastErr = err 369 } 370 371 if lastErr == nil { 372 w.leadersPinned = true 373 return nil 374 } 375 return errors.Trace(lastErr) 376 } 377 378 // unpinLeaders unpins leadership for applications 379 // represented by units running on this machine. 380 func (w *upgradeSeriesWorker) unpinLeaders() error { 381 results, err := w.UnpinMachineApplications() 382 if err != nil { 383 return errors.Trace(err) 384 } 385 386 var lastErr error 387 for app, err := range results { 388 if err == nil { 389 w.logger.Infof("unpinned leader for application %q", app) 390 continue 391 } 392 w.logger.Errorf("failed to unpin leader for application %q: %s", app, err.Error()) 393 lastErr = err 394 } 395 396 if lastErr == nil { 397 w.leadersPinned = false 398 return nil 399 } 400 return errors.Trace(lastErr) 401 } 402 403 // Report (worker.Reporter) generates a report for the Juju engine. 404 func (w *upgradeSeriesWorker) Report() map[string]interface{} { 405 w.mu.Lock() 406 defer w.mu.Unlock() 407 408 report := map[string]interface{}{"machine status": w.machineStatus} 409 410 if len(w.preparedUnits) > 0 { 411 units := make([]string, len(w.preparedUnits)) 412 for i, u := range w.preparedUnits { 413 units[i] = u.Id() 414 } 415 report["prepared units"] = units 416 } 417 418 if len(w.completedUnits) > 0 { 419 units := make([]string, len(w.completedUnits)) 420 for i, u := range w.completedUnits { 421 units[i] = u.Id() 422 } 423 report["completed units"] = units 424 } 425 426 return report 427 } 428 429 // Kill implements worker.Worker.Kill. 430 func (w *upgradeSeriesWorker) Kill() { 431 w.catacomb.Kill(nil) 432 } 433 434 // Wait implements worker.Worker.Wait. 435 func (w *upgradeSeriesWorker) Wait() error { 436 return w.catacomb.Wait() 437 } 438 439 // Stop stops the upgrade-series worker and returns any 440 // error it encountered when running. 441 func (w *upgradeSeriesWorker) Stop() error { 442 w.Kill() 443 return w.Wait() 444 } 445 446 func asGenericTags(units []names.UnitTag) []names.Tag { 447 result := make([]names.Tag, len(units)) 448 for i, tag := range units { 449 result[i] = tag 450 } 451 return result 452 }