github.com/wallyworld/juju@v0.0.0-20161013125918-6cf1bc9d917a/worker/migrationmaster/worker.go (about) 1 // Copyright 2016 Canonical Ltd. 2 // Licensed under the AGPLv3, see LICENCE file for details. 3 4 package migrationmaster 5 6 import ( 7 "fmt" 8 "strings" 9 "time" 10 11 "github.com/juju/errors" 12 "github.com/juju/loggo" 13 "github.com/juju/utils/clock" 14 "gopkg.in/juju/names.v2" 15 16 "github.com/juju/juju/api" 17 "github.com/juju/juju/api/migrationtarget" 18 "github.com/juju/juju/apiserver/params" 19 coremigration "github.com/juju/juju/core/migration" 20 "github.com/juju/juju/migration" 21 "github.com/juju/juju/watcher" 22 "github.com/juju/juju/worker/catacomb" 23 "github.com/juju/juju/worker/fortress" 24 ) 25 26 var ( 27 // ErrInactive is returned when the migration is no longer active 28 // (probably aborted). In this case the migrationmaster should be 29 // restarted so that it can wait for the next migration attempt. 30 ErrInactive = errors.New("migration is no longer active") 31 32 // ErrMigrated is returned when the model has migrated to another 33 // server. The migrationmaster should not be restarted again in 34 // this case. 35 ErrMigrated = errors.New("model has migrated") 36 ) 37 38 const ( 39 // maxMinionWait is the maximum time that the migrationmaster will 40 // wait for minions to report back regarding a given migration 41 // phase. 42 maxMinionWait = 15 * time.Minute 43 44 // minionWaitLogInterval is the time between progress update 45 // messages, while the migrationmaster is waiting for reports from 46 // minions. 47 minionWaitLogInterval = 30 * time.Second 48 ) 49 50 // Facade exposes controller functionality to a Worker. 51 type Facade interface { 52 // Watch returns a watcher which reports when a migration is 53 // active for the model associated with the API connection. 54 Watch() (watcher.NotifyWatcher, error) 55 56 // MigrationStatus returns the details and progress of the latest 57 // model migration. 58 MigrationStatus() (coremigration.MigrationStatus, error) 59 60 // SetPhase updates the phase of the currently active model 61 // migration. 62 SetPhase(coremigration.Phase) error 63 64 // SetStatusMessage sets a human readable message regarding the 65 // progress of a migration. 66 SetStatusMessage(string) error 67 68 // Prechecks performs pre-migration checks on the model and 69 // (source) controller. 70 Prechecks() error 71 72 // ModelInfo return basic information about the model to migrated. 73 ModelInfo() (coremigration.ModelInfo, error) 74 75 // Export returns a serialized representation of the model 76 // associated with the API connection. 77 Export() (coremigration.SerializedModel, error) 78 79 // Reap removes all documents of the model associated with the API 80 // connection. 81 Reap() error 82 83 // WatchMinionReports returns a watcher which reports when a migration 84 // minion has made a report for the current migration phase. 85 WatchMinionReports() (watcher.NotifyWatcher, error) 86 87 // MinionReports returns details of the reports made by migration 88 // minions to the controller for the current migration phase. 89 MinionReports() (coremigration.MinionReports, error) 90 } 91 92 // Config defines the operation of a Worker. 93 type Config struct { 94 ModelUUID string 95 Facade Facade 96 Guard fortress.Guard 97 APIOpen func(*api.Info, api.DialOpts) (api.Connection, error) 98 UploadBinaries func(migration.UploadBinariesConfig) error 99 CharmDownloader migration.CharmDownloader 100 ToolsDownloader migration.ToolsDownloader 101 Clock clock.Clock 102 } 103 104 // Validate returns an error if config cannot drive a Worker. 105 func (config Config) Validate() error { 106 if config.ModelUUID == "" { 107 return errors.NotValidf("empty ModelUUID") 108 } 109 if config.Facade == nil { 110 return errors.NotValidf("nil Facade") 111 } 112 if config.Guard == nil { 113 return errors.NotValidf("nil Guard") 114 } 115 if config.APIOpen == nil { 116 return errors.NotValidf("nil APIOpen") 117 } 118 if config.UploadBinaries == nil { 119 return errors.NotValidf("nil UploadBinaries") 120 } 121 if config.CharmDownloader == nil { 122 return errors.NotValidf("nil CharmDownloader") 123 } 124 if config.ToolsDownloader == nil { 125 return errors.NotValidf("nil ToolsDownloader") 126 } 127 if config.Clock == nil { 128 return errors.NotValidf("nil Clock") 129 } 130 return nil 131 } 132 133 // New returns a Worker backed by config, or an error. 134 func New(config Config) (*Worker, error) { 135 if err := config.Validate(); err != nil { 136 return nil, errors.Trace(err) 137 } 138 139 // Soon we will get model specific logs generated in the 140 // controller logged against the model. Until then, distinguish 141 // the logs from different migrationmaster insteads using the 142 // model UUID suffix. 143 loggerName := "juju.worker.migrationmaster:" + config.ModelUUID[len(config.ModelUUID)-6:] 144 logger := loggo.GetLogger(loggerName) 145 146 w := &Worker{ 147 config: config, 148 logger: logger, 149 } 150 err := catacomb.Invoke(catacomb.Plan{ 151 Site: &w.catacomb, 152 Work: w.run, 153 }) 154 if err != nil { 155 return nil, errors.Trace(err) 156 } 157 return w, nil 158 } 159 160 // Worker waits until a migration is active and its configured 161 // Fortress is locked down, and then orchestrates a model migration. 162 type Worker struct { 163 catacomb catacomb.Catacomb 164 config Config 165 logger loggo.Logger 166 } 167 168 // Kill implements worker.Worker. 169 func (w *Worker) Kill() { 170 w.catacomb.Kill(nil) 171 } 172 173 // Wait implements worker.Worker. 174 func (w *Worker) Wait() error { 175 return w.catacomb.Wait() 176 } 177 178 func (w *Worker) run() error { 179 status, err := w.waitForActiveMigration() 180 if err != nil { 181 return errors.Trace(err) 182 } 183 184 err = w.config.Guard.Lockdown(w.catacomb.Dying()) 185 if errors.Cause(err) == fortress.ErrAborted { 186 return w.catacomb.ErrDying() 187 } else if err != nil { 188 return errors.Trace(err) 189 } 190 191 if status.ExternalControl { 192 err := w.waitForMigrationEnd() 193 return errors.Trace(err) 194 } 195 196 phase := status.Phase 197 198 for { 199 var err error 200 switch phase { 201 case coremigration.QUIESCE: 202 phase, err = w.doQUIESCE(status) 203 case coremigration.IMPORT: 204 phase, err = w.doIMPORT(status.TargetInfo, status.ModelUUID) 205 case coremigration.VALIDATION: 206 phase, err = w.doVALIDATION(status) 207 case coremigration.SUCCESS: 208 phase, err = w.doSUCCESS(status) 209 case coremigration.LOGTRANSFER: 210 phase, err = w.doLOGTRANSFER() 211 case coremigration.REAP: 212 phase, err = w.doREAP() 213 case coremigration.ABORT: 214 phase, err = w.doABORT(status.TargetInfo, status.ModelUUID) 215 default: 216 return errors.Errorf("unknown phase: %v [%d]", phase.String(), phase) 217 } 218 219 if err != nil { 220 // A phase handler should only return an error if the 221 // migration master should exit. In the face of other 222 // errors the handler should log the problem and then 223 // return the appropriate error phase to transition to - 224 // i.e. ABORT or REAPFAILED) 225 return errors.Trace(err) 226 } 227 228 if w.killed() { 229 return w.catacomb.ErrDying() 230 } 231 232 w.logger.Infof("setting migration phase to %s", phase) 233 if err := w.config.Facade.SetPhase(phase); err != nil { 234 return errors.Annotate(err, "failed to set phase") 235 } 236 status.Phase = phase 237 238 if modelHasMigrated(phase) { 239 return ErrMigrated 240 } else if phase.IsTerminal() { 241 // Some other terminal phase (aborted), exit and try 242 // again. 243 return ErrInactive 244 } 245 } 246 } 247 248 func (w *Worker) killed() bool { 249 select { 250 case <-w.catacomb.Dying(): 251 return true 252 default: 253 return false 254 } 255 } 256 257 func (w *Worker) setInfoStatus(s string, a ...interface{}) { 258 w.setStatusAndLog(w.logger.Infof, s, a...) 259 } 260 261 func (w *Worker) setWarningStatus(s string, a ...interface{}) { 262 w.setStatusAndLog(w.logger.Warningf, s, a...) 263 } 264 265 func (w *Worker) setErrorStatus(s string, a ...interface{}) { 266 w.setStatusAndLog(w.logger.Errorf, s, a...) 267 } 268 269 func (w *Worker) setStatusAndLog(log func(string, ...interface{}), s string, a ...interface{}) { 270 message := fmt.Sprintf(s, a...) 271 log(message) 272 if err := w.setStatus(message); err != nil { 273 // Setting status isn't critical. If it fails, just logging 274 // the problem here and not passing it upstream makes things a 275 // lot clearer in the caller. 276 w.logger.Errorf("%s", err) 277 } 278 } 279 280 func (w *Worker) setStatus(message string) error { 281 err := w.config.Facade.SetStatusMessage(message) 282 return errors.Annotate(err, "failed to set status message") 283 } 284 285 func (w *Worker) doQUIESCE(status coremigration.MigrationStatus) (coremigration.Phase, error) { 286 err := w.prechecks(status) 287 if err != nil { 288 w.setErrorStatus(err.Error()) 289 return coremigration.ABORT, nil 290 } 291 292 ok, err := w.waitForMinions(status, failFast, "quiescing") 293 if err != nil { 294 return coremigration.UNKNOWN, errors.Trace(err) 295 } 296 if !ok { 297 return coremigration.ABORT, nil 298 } 299 300 return coremigration.IMPORT, nil 301 } 302 303 func (w *Worker) prechecks(status coremigration.MigrationStatus) error { 304 w.setInfoStatus("performing source prechecks") 305 err := w.config.Facade.Prechecks() 306 if err != nil { 307 return errors.Annotate(err, "source prechecks failed") 308 } 309 310 w.setInfoStatus("performing target prechecks") 311 model, err := w.config.Facade.ModelInfo() 312 if err != nil { 313 return errors.Annotate(err, "failed to obtain model info during prechecks") 314 } 315 conn, err := w.openAPIConn(status.TargetInfo) 316 if err != nil { 317 return errors.Annotate(err, "failed to connect to target controller during prechecks") 318 } 319 defer conn.Close() 320 321 if conn.ControllerTag() != status.TargetInfo.ControllerTag { 322 return errors.Errorf("unexpected target controller UUID (got %s, expected %s)", 323 conn.ControllerTag(), status.TargetInfo.ControllerTag) 324 } 325 326 targetClient := migrationtarget.NewClient(conn) 327 err = targetClient.Prechecks(model) 328 return errors.Annotate(err, "target prechecks failed") 329 } 330 331 func (w *Worker) doIMPORT(targetInfo coremigration.TargetInfo, modelUUID string) (coremigration.Phase, error) { 332 err := w.transferModel(targetInfo, modelUUID) 333 if err != nil { 334 w.setErrorStatus("model data transfer failed, %v", err) 335 return coremigration.ABORT, nil 336 } 337 return coremigration.VALIDATION, nil 338 } 339 340 func (w *Worker) transferModel(targetInfo coremigration.TargetInfo, modelUUID string) error { 341 w.setInfoStatus("exporting model") 342 serialized, err := w.config.Facade.Export() 343 if err != nil { 344 return errors.Annotate(err, "model export failed") 345 } 346 347 w.setInfoStatus("importing model into target controller") 348 conn, err := w.openAPIConn(targetInfo) 349 if err != nil { 350 return errors.Annotate(err, "failed to connect to target controller") 351 } 352 defer conn.Close() 353 targetClient := migrationtarget.NewClient(conn) 354 err = targetClient.Import(serialized.Bytes) 355 if err != nil { 356 return errors.Annotate(err, "failed to import model into target controller") 357 } 358 359 w.setInfoStatus("uploading model binaries into target controller") 360 targetModelConn, err := w.openAPIConnForModel(targetInfo, modelUUID) 361 if err != nil { 362 return errors.Annotate(err, "failed to open connection to target model") 363 } 364 defer targetModelConn.Close() 365 targetModelClient := targetModelConn.Client() 366 err = w.config.UploadBinaries(migration.UploadBinariesConfig{ 367 Charms: serialized.Charms, 368 CharmDownloader: w.config.CharmDownloader, 369 CharmUploader: targetModelClient, 370 Tools: serialized.Tools, 371 ToolsDownloader: w.config.ToolsDownloader, 372 ToolsUploader: targetModelClient, 373 }) 374 return errors.Annotate(err, "failed migration binaries") 375 } 376 377 func (w *Worker) doVALIDATION(status coremigration.MigrationStatus) (coremigration.Phase, error) { 378 // Wait for agents to complete their validation checks. 379 ok, err := w.waitForMinions(status, failFast, "validating") 380 if err != nil { 381 return coremigration.UNKNOWN, errors.Trace(err) 382 } 383 if !ok { 384 return coremigration.ABORT, nil 385 } 386 387 // Once all agents have validated, activate the model in the 388 // target controller. 389 err = w.activateModel(status.TargetInfo, status.ModelUUID) 390 if err != nil { 391 w.setErrorStatus("model activation failed, %v", err) 392 return coremigration.ABORT, nil 393 } 394 return coremigration.SUCCESS, nil 395 } 396 397 func (w *Worker) activateModel(targetInfo coremigration.TargetInfo, modelUUID string) error { 398 w.setInfoStatus("activating model in target controller") 399 conn, err := w.openAPIConn(targetInfo) 400 if err != nil { 401 return errors.Trace(err) 402 } 403 defer conn.Close() 404 405 targetClient := migrationtarget.NewClient(conn) 406 err = targetClient.Activate(modelUUID) 407 return errors.Trace(err) 408 } 409 410 func (w *Worker) doSUCCESS(status coremigration.MigrationStatus) (coremigration.Phase, error) { 411 _, err := w.waitForMinions(status, waitForAll, "successful") 412 if err != nil { 413 return coremigration.UNKNOWN, errors.Trace(err) 414 } 415 // There's no turning back from SUCCESS - any problems should have 416 // been picked up in VALIDATION. After the minion wait in the 417 // SUCCESS phase, the migration can only proceed to LOGTRANSFER. 418 return coremigration.LOGTRANSFER, nil 419 } 420 421 func (w *Worker) doLOGTRANSFER() (coremigration.Phase, error) { 422 // TODO(mjs) - To be implemented. 423 // w.setInfoStatus("successful: transferring logs to target controller") 424 return coremigration.REAP, nil 425 } 426 427 func (w *Worker) doREAP() (coremigration.Phase, error) { 428 w.setInfoStatus("successful, removing model from source controller") 429 err := w.config.Facade.Reap() 430 if err != nil { 431 return coremigration.REAPFAILED, errors.Trace(err) 432 } 433 return coremigration.DONE, nil 434 } 435 436 func (w *Worker) doABORT(targetInfo coremigration.TargetInfo, modelUUID string) (coremigration.Phase, error) { 437 w.setInfoStatus("aborted, removing model from target controller") 438 if err := w.removeImportedModel(targetInfo, modelUUID); err != nil { 439 // This isn't fatal. Removing the imported model is a best 440 // efforts attempt so just report the error and proceed. 441 w.setWarningStatus("failed to remove model from target controller, %v", err) 442 } 443 return coremigration.ABORTDONE, nil 444 } 445 446 func (w *Worker) removeImportedModel(targetInfo coremigration.TargetInfo, modelUUID string) error { 447 conn, err := w.openAPIConn(targetInfo) 448 if err != nil { 449 return errors.Trace(err) 450 } 451 defer conn.Close() 452 453 targetClient := migrationtarget.NewClient(conn) 454 err = targetClient.Abort(modelUUID) 455 return errors.Trace(err) 456 } 457 458 func (w *Worker) waitForActiveMigration() (coremigration.MigrationStatus, error) { 459 var empty coremigration.MigrationStatus 460 461 watcher, err := w.config.Facade.Watch() 462 if err != nil { 463 return empty, errors.Annotate(err, "watching for migration") 464 } 465 if err := w.catacomb.Add(watcher); err != nil { 466 return empty, errors.Trace(err) 467 } 468 defer watcher.Kill() 469 470 for { 471 select { 472 case <-w.catacomb.Dying(): 473 return empty, w.catacomb.ErrDying() 474 case <-watcher.Changes(): 475 } 476 477 status, err := w.config.Facade.MigrationStatus() 478 switch { 479 case params.IsCodeNotFound(err): 480 // There's never been a migration. 481 case err == nil && status.Phase.IsTerminal(): 482 // No migration in progress. 483 if modelHasMigrated(status.Phase) { 484 return empty, ErrMigrated 485 } 486 case err != nil: 487 return empty, errors.Annotate(err, "retrieving migration status") 488 default: 489 // Migration is in progress. 490 return status, nil 491 } 492 493 // While waiting for a migration, ensure the fortress is open. 494 if err := w.config.Guard.Unlock(); err != nil { 495 return empty, errors.Trace(err) 496 } 497 } 498 } 499 500 func (w *Worker) waitForMigrationEnd() error { 501 w.logger.Infof("migration is externally managed. waiting for completion") 502 watcher, err := w.config.Facade.Watch() 503 if err != nil { 504 return errors.Annotate(err, "watching for migration") 505 } 506 if err := w.catacomb.Add(watcher); err != nil { 507 return errors.Trace(err) 508 } 509 defer watcher.Kill() 510 511 for { 512 select { 513 case <-w.catacomb.Dying(): 514 return w.catacomb.ErrDying() 515 case <-watcher.Changes(): 516 } 517 518 status, err := w.config.Facade.MigrationStatus() 519 if err != nil { 520 return errors.Annotate(err, "retrieving migration status") 521 } 522 w.logger.Infof("migration phase is now %v", status.Phase) 523 if status.Phase.IsTerminal() { 524 if modelHasMigrated(status.Phase) { 525 w.logger.Infof("migration is complete") 526 return ErrMigrated 527 } 528 w.logger.Infof("migration has aborted") 529 return ErrInactive 530 } 531 } 532 } 533 534 // Possible values for waitForMinion's waitPolicy argument. 535 const failFast = false // Stop waiting at first minion failure report 536 const waitForAll = true // Wait for all minion reports to arrive (or timeout) 537 538 func (w *Worker) waitForMinions( 539 status coremigration.MigrationStatus, 540 waitPolicy bool, 541 infoPrefix string, 542 ) (success bool, err error) { 543 clk := w.config.Clock 544 maxWait := maxMinionWait - clk.Now().Sub(status.PhaseChangedTime) 545 timeout := clk.After(maxWait) 546 547 w.setInfoStatus("%s, waiting for agents to report back", infoPrefix) 548 w.logger.Infof("waiting for agents to report back for migration phase %s (will wait up to %s)", 549 status.Phase, truncDuration(maxWait)) 550 551 watch, err := w.config.Facade.WatchMinionReports() 552 if err != nil { 553 return false, errors.Trace(err) 554 } 555 if err := w.catacomb.Add(watch); err != nil { 556 return false, errors.Trace(err) 557 } 558 559 logProgress := clk.After(minionWaitLogInterval) 560 561 var reports coremigration.MinionReports 562 for { 563 select { 564 case <-w.catacomb.Dying(): 565 return false, w.catacomb.ErrDying() 566 567 case <-timeout: 568 w.logger.Errorf(formatMinionTimeout(reports, status, infoPrefix)) 569 w.setErrorStatus("%s, timed out waiting for agents to report", infoPrefix) 570 return false, nil 571 572 case <-watch.Changes(): 573 var err error 574 reports, err = w.config.Facade.MinionReports() 575 if err != nil { 576 return false, errors.Trace(err) 577 } 578 if err := validateMinionReports(reports, status); err != nil { 579 return false, errors.Trace(err) 580 } 581 failures := len(reports.FailedMachines) + len(reports.FailedUnits) 582 if failures > 0 { 583 w.logger.Errorf(formatMinionFailure(reports, infoPrefix)) 584 w.setErrorStatus("%s, some agents reported failure", infoPrefix) 585 if waitPolicy == failFast { 586 return false, nil 587 } 588 } 589 if reports.UnknownCount == 0 { 590 msg := formatMinionWaitDone(reports, infoPrefix) 591 if failures > 0 { 592 w.logger.Errorf(msg) 593 w.setErrorStatus("%s, some agents reported failure", infoPrefix) 594 return false, nil 595 } 596 w.logger.Infof(msg) 597 w.setInfoStatus("%s, all agents reported success", infoPrefix) 598 return true, nil 599 } 600 601 case <-logProgress: 602 w.setInfoStatus("%s, %s", infoPrefix, formatMinionWaitUpdate(reports)) 603 logProgress = clk.After(minionWaitLogInterval) 604 } 605 } 606 } 607 608 func truncDuration(d time.Duration) time.Duration { 609 return (d / time.Second) * time.Second 610 } 611 612 func validateMinionReports(reports coremigration.MinionReports, status coremigration.MigrationStatus) error { 613 if reports.MigrationId != status.MigrationId { 614 return errors.Errorf("unexpected migration id in minion reports, got %v, expected %v", 615 reports.MigrationId, status.MigrationId) 616 } 617 if reports.Phase != status.Phase { 618 return errors.Errorf("minion reports phase (%s) does not match migration phase (%s)", 619 reports.Phase, status.Phase) 620 } 621 return nil 622 } 623 624 func formatMinionTimeout( 625 reports coremigration.MinionReports, 626 status coremigration.MigrationStatus, 627 infoPrefix string, 628 ) string { 629 if reports.IsZero() { 630 return fmt.Sprintf("no agents reported in time") 631 } 632 633 var fails []string 634 if len(reports.SomeUnknownMachines) > 0 { 635 fails = append(fails, fmt.Sprintf("machines: %s", strings.Join(reports.SomeUnknownMachines, ","))) 636 } 637 if len(reports.SomeUnknownUnits) > 0 { 638 fails = append(fails, fmt.Sprintf("units: %s", strings.Join(reports.SomeUnknownUnits, ","))) 639 } 640 return fmt.Sprintf("%d agents failed to report in time for %q phase (including %s)", 641 reports.UnknownCount, infoPrefix, strings.Join(fails, "; ")) 642 } 643 644 func formatMinionFailure(reports coremigration.MinionReports, infoPrefix string) string { 645 var fails []string 646 if len(reports.FailedMachines) > 0 { 647 fails = append(fails, fmt.Sprintf("machines: %s", strings.Join(reports.FailedMachines, ","))) 648 } 649 if len(reports.FailedUnits) > 0 { 650 fails = append(fails, fmt.Sprintf("units: %s", strings.Join(reports.FailedUnits, ","))) 651 } 652 return fmt.Sprintf("agents failed phase %q (%s)", infoPrefix, strings.Join(fails, "; ")) 653 } 654 655 func formatMinionWaitDone(reports coremigration.MinionReports, infoPrefix string) string { 656 return fmt.Sprintf("completed waiting for agents to report for %q, %d succeeded, %d failed", 657 infoPrefix, reports.SuccessCount, len(reports.FailedMachines)+len(reports.FailedUnits)) 658 } 659 660 func formatMinionWaitUpdate(reports coremigration.MinionReports) string { 661 if reports.IsZero() { 662 return fmt.Sprintf("no reports from agents yet") 663 } 664 665 msg := fmt.Sprintf("waiting for agents to report back: %d succeeded, %d still to report", 666 reports.SuccessCount, reports.UnknownCount) 667 failed := len(reports.FailedMachines) + len(reports.FailedUnits) 668 if failed > 0 { 669 msg += fmt.Sprintf(", %d failed", failed) 670 } 671 return msg 672 } 673 674 func (w *Worker) openAPIConn(targetInfo coremigration.TargetInfo) (api.Connection, error) { 675 return w.openAPIConnForModel(targetInfo, "") 676 } 677 678 func (w *Worker) openAPIConnForModel(targetInfo coremigration.TargetInfo, modelUUID string) (api.Connection, error) { 679 apiInfo := &api.Info{ 680 Addrs: targetInfo.Addrs, 681 CACert: targetInfo.CACert, 682 Tag: targetInfo.AuthTag, 683 Password: targetInfo.Password, 684 ModelTag: names.NewModelTag(modelUUID), 685 Macaroons: targetInfo.Macaroons, 686 } 687 return w.config.APIOpen(apiInfo, migration.ControllerDialOpts()) 688 } 689 690 func modelHasMigrated(phase coremigration.Phase) bool { 691 return phase == coremigration.DONE || phase == coremigration.REAPFAILED 692 }