// -*- Mode: Go; indent-tabs-mode: t -*-

/*
 * Copyright (C) 2016-2017 Canonical Ltd
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

// Package overlord implements the overall control of a snappy system.
package overlord

import (
	"fmt"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"sync"
	"sync/atomic"
	"time"

	"gopkg.in/tomb.v2"

	"github.com/snapcore/snapd/dirs"
	"github.com/snapcore/snapd/logger"
	"github.com/snapcore/snapd/osutil"
	"github.com/snapcore/snapd/overlord/assertstate"
	"github.com/snapcore/snapd/overlord/cmdstate"
	"github.com/snapcore/snapd/overlord/configstate"
	"github.com/snapcore/snapd/overlord/configstate/proxyconf"
	"github.com/snapcore/snapd/overlord/devicestate"
	"github.com/snapcore/snapd/overlord/healthstate"
	"github.com/snapcore/snapd/overlord/hookstate"
	"github.com/snapcore/snapd/overlord/ifacestate"
	"github.com/snapcore/snapd/overlord/patch"
	"github.com/snapcore/snapd/overlord/servicestate"
	"github.com/snapcore/snapd/overlord/snapshotstate"
	"github.com/snapcore/snapd/overlord/snapstate"
	_ "github.com/snapcore/snapd/overlord/snapstate/policy"
	"github.com/snapcore/snapd/overlord/state"
	"github.com/snapcore/snapd/overlord/storecontext"
	"github.com/snapcore/snapd/snapdenv"
	"github.com/snapcore/snapd/store"
	"github.com/snapcore/snapd/timings"
)

// Tunables of the overlord:
//   - ensureInterval is the period the ensure timer is armed with.
//   - pruneInterval is the period of the prune ticker.
//   - pruneWait, abortWait and pruneMaxChanges are handed to
//     State.Prune from the ensure loop.
//   - defaultCachedDownloads is handed to SetCacheDownloads on every
//     store the overlord creates.
//   - configstateInit is the configstate initializer called by New.
//
// NOTE(review): these are variables rather than constants, presumably
// so that tests can override them - confirm.
var (
	ensureInterval = 5 * time.Minute
	pruneInterval  = 10 * time.Minute
	pruneWait      = 24 * time.Hour * 1
	abortWait      = 24 * time.Hour * 7

	pruneMaxChanges = 500

	defaultCachedDownloads = 5

	configstateInit = configstate.Init
)

// pruneTickerC returns the channel the ensure loop waits on to decide
// when to prune. By default it is the channel of the given ticker.
var pruneTickerC = func(t *time.Ticker) <-chan time.Time {
	return t.C
}

// Overlord is the central manager of a snappy system, keeping
// track of all available state managers and related helpers.
//
// ensureLock guards ensureTimer and ensureNext. ensureRun is accessed
// atomically and becomes 1 once an Ensure pass of the loop has run.
// inited is true for overlords built with New and false for the ones
// built with Mock; startedUp is set by the first StartUp call.
type Overlord struct {
	stateEng *StateEngine
	// ensure loop
	loopTomb    *tomb.Tomb
	ensureLock  sync.Mutex
	ensureTimer *time.Timer
	ensureNext  time.Time
	ensureRun   int32
	pruneTicker *time.Ticker

	startOfOperationTime time.Time

	// restarts
	restartBehavior RestartBehavior
	// managers
	inited     bool
	startedUp  bool
	runner     *state.TaskRunner
	snapMgr    *snapstate.SnapManager
	serviceMgr *servicestate.ServiceManager
	assertMgr  *assertstate.AssertManager
	ifaceMgr   *ifacestate.InterfaceManager
	hookMgr    *hookstate.HookManager
	deviceMgr  *devicestate.DeviceManager
	cmdMgr     *cmdstate.CommandManager
	shotMgr    *snapshotstate.SnapshotManager
	// proxyConf mediates the http proxy config
	proxyConf func(req *http.Request) (*url.URL, error)
}

// RestartBehavior controls how to handle and carry forward restart requests
// via the state.
type RestartBehavior interface {
	// HandleRestart is called with the requested restart type whenever
	// a restart is requested through the overlord.
	HandleRestart(t state.RestartType)
	// RebootAsExpected is called early when either a reboot was
	// requested by snapd and happened or no reboot was expected at all.
	RebootAsExpected(st *state.State) error
	// RebootDidNotHappen is called early instead when a reboot was
	// requested by snapd but did not happen.
	RebootDidNotHappen(st *state.State) error
}

// storeNew is the constructor used to build store instances.
var storeNew = store.New

// New creates a new Overlord with all its state managers.
122 // It can be provided with an optional RestartBehavior. 123 func New(restartBehavior RestartBehavior) (*Overlord, error) { 124 o := &Overlord{ 125 loopTomb: new(tomb.Tomb), 126 inited: true, 127 restartBehavior: restartBehavior, 128 } 129 130 backend := &overlordStateBackend{ 131 path: dirs.SnapStateFile, 132 ensureBefore: o.ensureBefore, 133 requestRestart: o.requestRestart, 134 } 135 s, err := loadState(backend, restartBehavior) 136 if err != nil { 137 return nil, err 138 } 139 140 o.stateEng = NewStateEngine(s) 141 o.runner = state.NewTaskRunner(s) 142 143 // any unknown task should be ignored and succeed 144 matchAnyUnknownTask := func(_ *state.Task) bool { 145 return true 146 } 147 o.runner.AddOptionalHandler(matchAnyUnknownTask, handleUnknownTask, nil) 148 149 hookMgr, err := hookstate.Manager(s, o.runner) 150 if err != nil { 151 return nil, err 152 } 153 o.addManager(hookMgr) 154 155 snapMgr, err := snapstate.Manager(s, o.runner) 156 if err != nil { 157 return nil, err 158 } 159 o.addManager(snapMgr) 160 161 serviceMgr := servicestate.Manager(s, o.runner) 162 o.addManager(serviceMgr) 163 164 assertMgr, err := assertstate.Manager(s, o.runner) 165 if err != nil { 166 return nil, err 167 } 168 o.addManager(assertMgr) 169 170 ifaceMgr, err := ifacestate.Manager(s, hookMgr, o.runner, nil, nil) 171 if err != nil { 172 return nil, err 173 } 174 o.addManager(ifaceMgr) 175 176 deviceMgr, err := devicestate.Manager(s, hookMgr, o.runner, o.newStore) 177 if err != nil { 178 return nil, err 179 } 180 o.addManager(deviceMgr) 181 182 o.addManager(cmdstate.Manager(s, o.runner)) 183 o.addManager(snapshotstate.Manager(s, o.runner)) 184 185 if err := configstateInit(s, hookMgr); err != nil { 186 return nil, err 187 } 188 healthstate.Init(hookMgr) 189 190 // the shared task runner should be added last! 
191 o.stateEng.AddManager(o.runner) 192 193 s.Lock() 194 defer s.Unlock() 195 // setting up the store 196 o.proxyConf = proxyconf.New(s).Conf 197 storeCtx := storecontext.New(s, o.deviceMgr.StoreContextBackend()) 198 sto := o.newStoreWithContext(storeCtx) 199 200 snapstate.ReplaceStore(s, sto) 201 202 return o, nil 203 } 204 205 func (o *Overlord) addManager(mgr StateManager) { 206 switch x := mgr.(type) { 207 case *hookstate.HookManager: 208 o.hookMgr = x 209 case *snapstate.SnapManager: 210 o.snapMgr = x 211 case *servicestate.ServiceManager: 212 o.serviceMgr = x 213 case *assertstate.AssertManager: 214 o.assertMgr = x 215 case *ifacestate.InterfaceManager: 216 o.ifaceMgr = x 217 case *devicestate.DeviceManager: 218 o.deviceMgr = x 219 case *cmdstate.CommandManager: 220 o.cmdMgr = x 221 case *snapshotstate.SnapshotManager: 222 o.shotMgr = x 223 } 224 o.stateEng.AddManager(mgr) 225 } 226 227 func loadState(backend state.Backend, restartBehavior RestartBehavior) (*state.State, error) { 228 curBootID, err := osutil.BootID() 229 if err != nil { 230 return nil, fmt.Errorf("fatal: cannot find current boot id: %v", err) 231 } 232 233 perfTimings := timings.New(map[string]string{"startup": "load-state"}) 234 235 if !osutil.FileExists(dirs.SnapStateFile) { 236 // fail fast, mostly interesting for tests, this dir is setup 237 // by the snapd package 238 stateDir := filepath.Dir(dirs.SnapStateFile) 239 if !osutil.IsDirectory(stateDir) { 240 return nil, fmt.Errorf("fatal: directory %q must be present", stateDir) 241 } 242 s := state.New(backend) 243 s.Lock() 244 s.VerifyReboot(curBootID) 245 s.Unlock() 246 patch.Init(s) 247 return s, nil 248 } 249 250 r, err := os.Open(dirs.SnapStateFile) 251 if err != nil { 252 return nil, fmt.Errorf("cannot read the state file: %s", err) 253 } 254 defer r.Close() 255 256 var s *state.State 257 timings.Run(perfTimings, "read-state", "read snapd state from disk", func(tm timings.Measurer) { 258 s, err = state.ReadState(backend, r) 259 }) 260 
if err != nil { 261 return nil, err 262 } 263 s.Lock() 264 perfTimings.Save(s) 265 s.Unlock() 266 267 err = verifyReboot(s, curBootID, restartBehavior) 268 if err != nil { 269 return nil, err 270 } 271 272 // one-shot migrations 273 err = patch.Apply(s) 274 if err != nil { 275 return nil, err 276 } 277 return s, nil 278 } 279 280 func verifyReboot(s *state.State, curBootID string, restartBehavior RestartBehavior) error { 281 s.Lock() 282 defer s.Unlock() 283 err := s.VerifyReboot(curBootID) 284 if err != nil && err != state.ErrExpectedReboot { 285 return err 286 } 287 expectedRebootDidNotHappen := err == state.ErrExpectedReboot 288 if restartBehavior != nil { 289 if expectedRebootDidNotHappen { 290 return restartBehavior.RebootDidNotHappen(s) 291 } 292 return restartBehavior.RebootAsExpected(s) 293 } 294 if expectedRebootDidNotHappen { 295 logger.Noticef("expected system restart but it did not happen") 296 } 297 return nil 298 } 299 300 func (o *Overlord) newStoreWithContext(storeCtx store.DeviceAndAuthContext) snapstate.StoreService { 301 cfg := store.DefaultConfig() 302 cfg.Proxy = o.proxyConf 303 sto := storeNew(cfg, storeCtx) 304 sto.SetCacheDownloads(defaultCachedDownloads) 305 return sto 306 } 307 308 // newStore can make new stores for use during remodeling. 309 // The device backend will tie them to the remodeling device state. 310 func (o *Overlord) newStore(devBE storecontext.DeviceBackend) snapstate.StoreService { 311 scb := o.deviceMgr.StoreContextBackend() 312 stoCtx := storecontext.NewComposed(o.State(), devBE, scb, scb) 313 return o.newStoreWithContext(stoCtx) 314 } 315 316 // StartUp proceeds to run any expensive Overlord or managers initialization. After this is done once it is a noop. 317 func (o *Overlord) StartUp() error { 318 if o.startedUp { 319 return nil 320 } 321 o.startedUp = true 322 323 // account for deviceMgr == nil as it's not always present in 324 // the tests. 
325 if o.deviceMgr != nil && !snapdenv.Preseeding() { 326 var err error 327 st := o.State() 328 st.Lock() 329 o.startOfOperationTime, err = o.deviceMgr.StartOfOperationTime() 330 st.Unlock() 331 if err != nil { 332 return fmt.Errorf("cannot get start of operation time: %s", err) 333 } 334 } 335 336 // slow down for tests 337 if s := os.Getenv("SNAPD_SLOW_STARTUP"); s != "" { 338 if d, err := time.ParseDuration(s); err == nil { 339 logger.Noticef("slowing down startup by %v as requested", d) 340 341 time.Sleep(d) 342 } 343 } 344 345 return o.stateEng.StartUp() 346 } 347 348 // StartupTimeout computes a usable timeout for the startup 349 // initializations by using a pessimistic estimate. 350 func (o *Overlord) StartupTimeout() (timeout time.Duration, reasoning string, err error) { 351 // TODO: adjust based on real hardware measurements 352 st := o.State() 353 st.Lock() 354 defer st.Unlock() 355 n, err := snapstate.NumSnaps(st) 356 if err != nil { 357 return 0, "", err 358 } 359 // number of snaps (and connections) play a role 360 reasoning = "pessimistic estimate of 30s plus 5s per snap" 361 to := (30 * time.Second) + time.Duration(n)*(5*time.Second) 362 return to, reasoning, nil 363 } 364 365 func (o *Overlord) ensureTimerSetup() { 366 o.ensureLock.Lock() 367 defer o.ensureLock.Unlock() 368 o.ensureTimer = time.NewTimer(ensureInterval) 369 o.ensureNext = time.Now().Add(ensureInterval) 370 o.pruneTicker = time.NewTicker(pruneInterval) 371 } 372 373 func (o *Overlord) ensureTimerReset() time.Time { 374 o.ensureLock.Lock() 375 defer o.ensureLock.Unlock() 376 now := time.Now() 377 o.ensureTimer.Reset(ensureInterval) 378 o.ensureNext = now.Add(ensureInterval) 379 return o.ensureNext 380 } 381 382 func (o *Overlord) ensureBefore(d time.Duration) { 383 o.ensureLock.Lock() 384 defer o.ensureLock.Unlock() 385 if o.ensureTimer == nil { 386 panic("cannot use EnsureBefore before Overlord.Loop") 387 } 388 now := time.Now() 389 next := now.Add(d) 390 if next.Before(o.ensureNext) 
{ 391 o.ensureTimer.Reset(d) 392 o.ensureNext = next 393 return 394 } 395 396 if o.ensureNext.Before(now) { 397 // timer already expired, it will be reset in Loop() and 398 // next Ensure() will be called shortly. 399 if !o.ensureTimer.Stop() { 400 return 401 } 402 o.ensureTimer.Reset(0) 403 o.ensureNext = now 404 } 405 } 406 407 func (o *Overlord) requestRestart(t state.RestartType) { 408 if o.restartBehavior == nil { 409 logger.Noticef("restart requested but no behavior set") 410 } else { 411 o.restartBehavior.HandleRestart(t) 412 } 413 } 414 415 var preseedExitWithError = func(err error) { 416 fmt.Fprintf(os.Stderr, "cannot preseed: %v\n", err) 417 os.Exit(1) 418 } 419 420 // Loop runs a loop in a goroutine to ensure the current state regularly through StateEngine Ensure. 421 func (o *Overlord) Loop() { 422 o.ensureTimerSetup() 423 preseed := snapdenv.Preseeding() 424 if preseed { 425 o.runner.OnTaskError(preseedExitWithError) 426 } 427 o.loopTomb.Go(func() error { 428 for { 429 // TODO: pass a proper context into Ensure 430 o.ensureTimerReset() 431 // in case of errors engine logs them, 432 // continue to the next Ensure() try for now 433 err := o.stateEng.Ensure() 434 if err != nil && preseed { 435 st := o.State() 436 // acquire state lock to ensure nothing attempts to write state 437 // as we are exiting; there is no deferred unlock to avoid 438 // potential race on exit. 439 st.Lock() 440 preseedExitWithError(err) 441 } 442 o.ensureDidRun() 443 pruneC := pruneTickerC(o.pruneTicker) 444 select { 445 case <-o.loopTomb.Dying(): 446 return nil 447 case <-o.ensureTimer.C: 448 case <-pruneC: 449 if preseed { 450 // in preseed mode avoid setting StartOfOperationTime (it's 451 // an error), and don't Prune. 
452 continue 453 } 454 st := o.State() 455 st.Lock() 456 st.Prune(o.startOfOperationTime, pruneWait, abortWait, pruneMaxChanges) 457 st.Unlock() 458 } 459 } 460 }) 461 } 462 463 func (o *Overlord) ensureDidRun() { 464 atomic.StoreInt32(&o.ensureRun, 1) 465 } 466 467 func (o *Overlord) CanStandby() bool { 468 run := atomic.LoadInt32(&o.ensureRun) 469 return run != 0 470 } 471 472 // Stop stops the ensure loop and the managers under the StateEngine. 473 func (o *Overlord) Stop() error { 474 o.loopTomb.Kill(nil) 475 err := o.loopTomb.Wait() 476 o.stateEng.Stop() 477 return err 478 } 479 480 func (o *Overlord) settle(timeout time.Duration, beforeCleanups func()) error { 481 if err := o.StartUp(); err != nil { 482 return err 483 } 484 485 func() { 486 o.ensureLock.Lock() 487 defer o.ensureLock.Unlock() 488 if o.ensureTimer != nil { 489 panic("cannot use Settle concurrently with other Settle or Loop calls") 490 } 491 o.ensureTimer = time.NewTimer(0) 492 }() 493 494 defer func() { 495 o.ensureLock.Lock() 496 defer o.ensureLock.Unlock() 497 o.ensureTimer.Stop() 498 o.ensureTimer = nil 499 }() 500 501 t0 := time.Now() 502 done := false 503 var errs []error 504 for !done { 505 if timeout > 0 && time.Since(t0) > timeout { 506 err := fmt.Errorf("Settle is not converging") 507 if len(errs) != 0 { 508 return &ensureError{append(errs, err)} 509 } 510 return err 511 } 512 next := o.ensureTimerReset() 513 err := o.stateEng.Ensure() 514 switch ee := err.(type) { 515 case nil: 516 case *ensureError: 517 errs = append(errs, ee.errs...) 
518 default: 519 errs = append(errs, err) 520 } 521 o.stateEng.Wait() 522 o.ensureLock.Lock() 523 done = o.ensureNext.Equal(next) 524 o.ensureLock.Unlock() 525 if done { 526 if beforeCleanups != nil { 527 beforeCleanups() 528 beforeCleanups = nil 529 } 530 // we should wait also for cleanup handlers 531 st := o.State() 532 st.Lock() 533 for _, chg := range st.Changes() { 534 if chg.IsReady() && !chg.IsClean() { 535 done = false 536 break 537 } 538 } 539 st.Unlock() 540 } 541 } 542 if len(errs) != 0 { 543 return &ensureError{errs} 544 } 545 return nil 546 } 547 548 // Settle runs first a state engine Ensure and then wait for 549 // activities to settle. That's done by waiting for all managers' 550 // activities to settle while making sure no immediate further Ensure 551 // is scheduled. It then waits similarly for all ready changes to 552 // reach the clean state. Chiefly for tests. Cannot be used in 553 // conjunction with Loop. If timeout is non-zero and settling takes 554 // longer than timeout, returns an error. Calls StartUp as well. 555 func (o *Overlord) Settle(timeout time.Duration) error { 556 return o.settle(timeout, nil) 557 } 558 559 // SettleObserveBeforeCleanups runs first a state engine Ensure and 560 // then wait for activities to settle. That's done by waiting for all 561 // managers' activities to settle while making sure no immediate 562 // further Ensure is scheduled. It then waits similarly for all ready 563 // changes to reach the clean state, but calls once the provided 564 // callback before doing that. Chiefly for tests. Cannot be used in 565 // conjunction with Loop. If timeout is non-zero and settling takes 566 // longer than timeout, returns an error. Calls StartUp as well. 567 func (o *Overlord) SettleObserveBeforeCleanups(timeout time.Duration, beforeCleanups func()) error { 568 return o.settle(timeout, beforeCleanups) 569 } 570 571 // State returns the system state managed by the overlord. 
func (o *Overlord) State() *state.State {
	return o.stateEng.State()
}

// StateEngine returns the state engine used by overlord.
func (o *Overlord) StateEngine() *StateEngine {
	return o.stateEng
}

// TaskRunner returns the shared task runner responsible for running
// tasks for all managers under the overlord.
func (o *Overlord) TaskRunner() *state.TaskRunner {
	return o.runner
}

// SnapManager returns the snap manager responsible for snaps under
// the overlord.
func (o *Overlord) SnapManager() *snapstate.SnapManager {
	return o.snapMgr
}

// ServiceManager returns the manager responsible for services
// under the overlord.
func (o *Overlord) ServiceManager() *servicestate.ServiceManager {
	return o.serviceMgr
}

// AssertManager returns the assertion manager enforcing assertions
// under the overlord.
func (o *Overlord) AssertManager() *assertstate.AssertManager {
	return o.assertMgr
}

// InterfaceManager returns the interface manager maintaining
// interface connections under the overlord.
func (o *Overlord) InterfaceManager() *ifacestate.InterfaceManager {
	return o.ifaceMgr
}

// HookManager returns the hook manager responsible for running hooks
// under the overlord.
func (o *Overlord) HookManager() *hookstate.HookManager {
	return o.hookMgr
}

// DeviceManager returns the device manager responsible for the device
// identity and policies.
func (o *Overlord) DeviceManager() *devicestate.DeviceManager {
	return o.deviceMgr
}

// CommandManager returns the manager responsible for running odd
// jobs.
func (o *Overlord) CommandManager() *cmdstate.CommandManager {
	return o.cmdMgr
}

// SnapshotManager returns the manager responsible for snapshots.
630 func (o *Overlord) SnapshotManager() *snapshotstate.SnapshotManager { 631 return o.shotMgr 632 } 633 634 // Mock creates an Overlord without any managers and with a backend 635 // not using disk. Managers can be added with AddManager. For testing. 636 func Mock() *Overlord { 637 return MockWithStateAndRestartHandler(nil, nil) 638 } 639 640 // MockWithStateAndRestartHandler creates an Overlord with the given state 641 // unless it is nil in which case it uses a state backend not using 642 // disk. It will use the given handler on restart requests. Managers 643 // can be added with AddManager. For testing. 644 func MockWithStateAndRestartHandler(s *state.State, handleRestart func(state.RestartType)) *Overlord { 645 o := &Overlord{ 646 loopTomb: new(tomb.Tomb), 647 inited: false, 648 restartBehavior: mockRestartBehavior(handleRestart), 649 } 650 if s == nil { 651 s = state.New(mockBackend{o: o}) 652 } 653 o.stateEng = NewStateEngine(s) 654 o.runner = state.NewTaskRunner(s) 655 656 return o 657 } 658 659 // AddManager adds a manager to the overlord created with Mock. For 660 // testing. 
661 func (o *Overlord) AddManager(mgr StateManager) { 662 if o.inited { 663 panic("internal error: cannot add managers to a fully initialized Overlord") 664 } 665 o.addManager(mgr) 666 } 667 668 type mockRestartBehavior func(state.RestartType) 669 670 func (rb mockRestartBehavior) HandleRestart(t state.RestartType) { 671 if rb == nil { 672 return 673 } 674 rb(t) 675 } 676 677 func (rb mockRestartBehavior) RebootAsExpected(*state.State) error { 678 panic("internal error: overlord.Mock should not invoke RebootAsExpected") 679 } 680 681 func (rb mockRestartBehavior) RebootDidNotHappen(*state.State) error { 682 panic("internal error: overlord.Mock should not invoke RebootDidNotHappen") 683 } 684 685 type mockBackend struct { 686 o *Overlord 687 } 688 689 func (mb mockBackend) Checkpoint(data []byte) error { 690 return nil 691 } 692 693 func (mb mockBackend) EnsureBefore(d time.Duration) { 694 mb.o.ensureLock.Lock() 695 timer := mb.o.ensureTimer 696 mb.o.ensureLock.Unlock() 697 if timer == nil { 698 return 699 } 700 701 mb.o.ensureBefore(d) 702 } 703 704 func (mb mockBackend) RequestRestart(t state.RestartType) { 705 mb.o.requestRestart(t) 706 }