github.com/stolowski/snapd@v0.0.0-20210407085831-115137ce5a22/overlord/overlord.go (about) 1 // -*- Mode: Go; indent-tabs-mode: t -*- 2 3 /* 4 * Copyright (C) 2016-2017 Canonical Ltd 5 * 6 * This program is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License version 3 as 8 * published by the Free Software Foundation. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 * 18 */ 19 20 // Package overlord implements the overall control of a snappy system. 21 package overlord 22 23 import ( 24 "fmt" 25 "net/http" 26 "net/url" 27 "os" 28 "path/filepath" 29 "sync" 30 "sync/atomic" 31 "time" 32 33 "gopkg.in/tomb.v2" 34 35 "github.com/snapcore/snapd/dirs" 36 "github.com/snapcore/snapd/logger" 37 "github.com/snapcore/snapd/osutil" 38 39 "github.com/snapcore/snapd/overlord/assertstate" 40 "github.com/snapcore/snapd/overlord/cmdstate" 41 "github.com/snapcore/snapd/overlord/configstate" 42 "github.com/snapcore/snapd/overlord/configstate/proxyconf" 43 "github.com/snapcore/snapd/overlord/devicestate" 44 "github.com/snapcore/snapd/overlord/healthstate" 45 "github.com/snapcore/snapd/overlord/hookstate" 46 "github.com/snapcore/snapd/overlord/ifacestate" 47 "github.com/snapcore/snapd/overlord/patch" 48 "github.com/snapcore/snapd/overlord/servicestate" 49 "github.com/snapcore/snapd/overlord/snapshotstate" 50 "github.com/snapcore/snapd/overlord/snapstate" 51 _ "github.com/snapcore/snapd/overlord/snapstate/policy" 52 "github.com/snapcore/snapd/overlord/state" 53 "github.com/snapcore/snapd/overlord/storecontext" 54 "github.com/snapcore/snapd/snapdenv" 55 "github.com/snapcore/snapd/store" 56 "github.com/snapcore/snapd/timings" 57 ) 58 59 var ( 60 ensureInterval = 5 * time.Minute 61 pruneInterval = 10 * time.Minute 62 pruneWait = 24 * time.Hour * 1 63 abortWait = 24 * time.Hour * 7 64 65 pruneMaxChanges = 500 66 67 defaultCachedDownloads = 5 68 69 configstateInit = configstate.Init 70 ) 71 72 var pruneTickerC = func(t *time.Ticker) <-chan time.Time { 73 return t.C 74 } 75 76 // Overlord is the central manager of a snappy system, keeping 77 // track of all available state managers and related helpers. 78 type Overlord struct { 79 stateEng *StateEngine 80 // ensure loop 81 loopTomb *tomb.Tomb 82 ensureLock sync.Mutex 83 ensureTimer *time.Timer 84 ensureNext time.Time 85 ensureRun int32 86 pruneTicker *time.Ticker 87 88 startOfOperationTime time.Time 89 90 // restarts 91 restartBehavior RestartBehavior 92 // managers 93 inited bool 94 startedUp bool 95 runner *state.TaskRunner 96 snapMgr *snapstate.SnapManager 97 serviceMgr *servicestate.ServiceManager 98 assertMgr *assertstate.AssertManager 99 ifaceMgr *ifacestate.InterfaceManager 100 hookMgr *hookstate.HookManager 101 deviceMgr *devicestate.DeviceManager 102 cmdMgr *cmdstate.CommandManager 103 shotMgr *snapshotstate.SnapshotManager 104 // proxyConf mediates the http proxy config 105 proxyConf func(req *http.Request) (*url.URL, error) 106 } 107 108 // RestartBehavior controls how to hanndle and carry forward restart requests 109 // via the state. 110 type RestartBehavior interface { 111 HandleRestart(t state.RestartType) 112 // RebootAsExpected is called early when either a reboot was 113 // requested by snapd and happened or no reboot was expected at all. 114 RebootAsExpected(st *state.State) error 115 // RebootDidNotHappen is called early instead when a reboot was 116 // requested by snad but did not happen. 117 RebootDidNotHappen(st *state.State) error 118 } 119 120 var storeNew = store.New 121 122 // New creates a new Overlord with all its state managers. 123 // It can be provided with an optional RestartBehavior. 124 func New(restartBehavior RestartBehavior) (*Overlord, error) { 125 o := &Overlord{ 126 loopTomb: new(tomb.Tomb), 127 inited: true, 128 restartBehavior: restartBehavior, 129 } 130 131 backend := &overlordStateBackend{ 132 path: dirs.SnapStateFile, 133 ensureBefore: o.ensureBefore, 134 requestRestart: o.requestRestart, 135 } 136 s, err := loadState(backend, restartBehavior) 137 if err != nil { 138 return nil, err 139 } 140 141 o.stateEng = NewStateEngine(s) 142 o.runner = state.NewTaskRunner(s) 143 144 // any unknown task should be ignored and succeed 145 matchAnyUnknownTask := func(_ *state.Task) bool { 146 return true 147 } 148 o.runner.AddOptionalHandler(matchAnyUnknownTask, handleUnknownTask, nil) 149 150 hookMgr, err := hookstate.Manager(s, o.runner) 151 if err != nil { 152 return nil, err 153 } 154 o.addManager(hookMgr) 155 156 snapMgr, err := snapstate.Manager(s, o.runner) 157 if err != nil { 158 return nil, err 159 } 160 o.addManager(snapMgr) 161 162 serviceMgr := servicestate.Manager(s, o.runner) 163 o.addManager(serviceMgr) 164 165 assertMgr, err := assertstate.Manager(s, o.runner) 166 if err != nil { 167 return nil, err 168 } 169 o.addManager(assertMgr) 170 171 ifaceMgr, err := ifacestate.Manager(s, hookMgr, o.runner, nil, nil) 172 if err != nil { 173 return nil, err 174 } 175 o.addManager(ifaceMgr) 176 177 deviceMgr, err := devicestate.Manager(s, hookMgr, o.runner, o.newStore) 178 if err != nil { 179 return nil, err 180 } 181 o.addManager(deviceMgr) 182 183 o.addManager(cmdstate.Manager(s, o.runner)) 184 o.addManager(snapshotstate.Manager(s, o.runner)) 185 186 if err := configstateInit(s, hookMgr); err != nil { 187 return nil, err 188 } 189 healthstate.Init(hookMgr) 190 191 // the shared task runner should be added last! 192 o.stateEng.AddManager(o.runner) 193 194 s.Lock() 195 defer s.Unlock() 196 // setting up the store 197 o.proxyConf = proxyconf.New(s).Conf 198 storeCtx := storecontext.New(s, o.deviceMgr.StoreContextBackend()) 199 sto := o.newStoreWithContext(storeCtx) 200 201 snapstate.ReplaceStore(s, sto) 202 203 return o, nil 204 } 205 206 func (o *Overlord) addManager(mgr StateManager) { 207 switch x := mgr.(type) { 208 case *hookstate.HookManager: 209 o.hookMgr = x 210 case *snapstate.SnapManager: 211 o.snapMgr = x 212 case *servicestate.ServiceManager: 213 o.serviceMgr = x 214 case *assertstate.AssertManager: 215 o.assertMgr = x 216 case *ifacestate.InterfaceManager: 217 o.ifaceMgr = x 218 case *devicestate.DeviceManager: 219 o.deviceMgr = x 220 case *cmdstate.CommandManager: 221 o.cmdMgr = x 222 case *snapshotstate.SnapshotManager: 223 o.shotMgr = x 224 } 225 o.stateEng.AddManager(mgr) 226 } 227 228 func loadState(backend state.Backend, restartBehavior RestartBehavior) (*state.State, error) { 229 curBootID, err := osutil.BootID() 230 if err != nil { 231 return nil, fmt.Errorf("fatal: cannot find current boot id: %v", err) 232 } 233 234 perfTimings := timings.New(map[string]string{"startup": "load-state"}) 235 236 if !osutil.FileExists(dirs.SnapStateFile) { 237 // fail fast, mostly interesting for tests, this dir is setup 238 // by the snapd package 239 stateDir := filepath.Dir(dirs.SnapStateFile) 240 if !osutil.IsDirectory(stateDir) { 241 return nil, fmt.Errorf("fatal: directory %q must be present", stateDir) 242 } 243 s := state.New(backend) 244 s.Lock() 245 s.VerifyReboot(curBootID) 246 s.Unlock() 247 patch.Init(s) 248 return s, nil 249 } 250 251 r, err := os.Open(dirs.SnapStateFile) 252 if err != nil { 253 return nil, fmt.Errorf("cannot read the state file: %s", err) 254 } 255 defer r.Close() 256 257 var s *state.State 258 timings.Run(perfTimings, "read-state", "read snapd state from disk", func(tm timings.Measurer) { 259 s, err = state.ReadState(backend, r) 260 }) 261 if err != nil { 262 return nil, err 263 } 264 s.Lock() 265 perfTimings.Save(s) 266 s.Unlock() 267 268 err = verifyReboot(s, curBootID, restartBehavior) 269 if err != nil { 270 return nil, err 271 } 272 273 // one-shot migrations 274 err = patch.Apply(s) 275 if err != nil { 276 return nil, err 277 } 278 return s, nil 279 } 280 281 func verifyReboot(s *state.State, curBootID string, restartBehavior RestartBehavior) error { 282 s.Lock() 283 defer s.Unlock() 284 err := s.VerifyReboot(curBootID) 285 if err != nil && err != state.ErrExpectedReboot { 286 return err 287 } 288 expectedRebootDidNotHappen := err == state.ErrExpectedReboot 289 if restartBehavior != nil { 290 if expectedRebootDidNotHappen { 291 return restartBehavior.RebootDidNotHappen(s) 292 } 293 return restartBehavior.RebootAsExpected(s) 294 } 295 if expectedRebootDidNotHappen { 296 logger.Noticef("expected system restart but it did not happen") 297 } 298 return nil 299 } 300 301 func (o *Overlord) newStoreWithContext(storeCtx store.DeviceAndAuthContext) snapstate.StoreService { 302 cfg := store.DefaultConfig() 303 cfg.Proxy = o.proxyConf 304 sto := storeNew(cfg, storeCtx) 305 sto.SetCacheDownloads(defaultCachedDownloads) 306 return sto 307 } 308 309 // newStore can make new stores for use during remodeling. 310 // The device backend will tie them to the remodeling device state. 311 func (o *Overlord) newStore(devBE storecontext.DeviceBackend) snapstate.StoreService { 312 scb := o.deviceMgr.StoreContextBackend() 313 stoCtx := storecontext.NewComposed(o.State(), devBE, scb, scb) 314 return o.newStoreWithContext(stoCtx) 315 } 316 317 // StartUp proceeds to run any expensive Overlord or managers initialization. After this is done once it is a noop. 318 func (o *Overlord) StartUp() error { 319 if o.startedUp { 320 return nil 321 } 322 o.startedUp = true 323 324 // account for deviceMgr == nil as it's not always present in 325 // the tests. 326 if o.deviceMgr != nil && !snapdenv.Preseeding() { 327 var err error 328 st := o.State() 329 st.Lock() 330 o.startOfOperationTime, err = o.deviceMgr.StartOfOperationTime() 331 st.Unlock() 332 if err != nil { 333 return fmt.Errorf("cannot get start of operation time: %s", err) 334 } 335 } 336 337 // slow down for tests 338 if s := os.Getenv("SNAPD_SLOW_STARTUP"); s != "" { 339 if d, err := time.ParseDuration(s); err == nil { 340 logger.Noticef("slowing down startup by %v as requested", d) 341 342 time.Sleep(d) 343 } 344 } 345 346 return o.stateEng.StartUp() 347 } 348 349 // StartupTimeout computes a usable timeout for the startup 350 // initializations by using a pessimistic estimate. 351 func (o *Overlord) StartupTimeout() (timeout time.Duration, reasoning string, err error) { 352 // TODO: adjust based on real hardware measurements 353 st := o.State() 354 st.Lock() 355 defer st.Unlock() 356 n, err := snapstate.NumSnaps(st) 357 if err != nil { 358 return 0, "", err 359 } 360 // number of snaps (and connections) play a role 361 reasoning = "pessimistic estimate of 30s plus 5s per snap" 362 to := (30 * time.Second) + time.Duration(n)*(5*time.Second) 363 return to, reasoning, nil 364 } 365 366 func (o *Overlord) ensureTimerSetup() { 367 o.ensureLock.Lock() 368 defer o.ensureLock.Unlock() 369 o.ensureTimer = time.NewTimer(ensureInterval) 370 o.ensureNext = time.Now().Add(ensureInterval) 371 o.pruneTicker = time.NewTicker(pruneInterval) 372 } 373 374 func (o *Overlord) ensureTimerReset() time.Time { 375 o.ensureLock.Lock() 376 defer o.ensureLock.Unlock() 377 now := time.Now() 378 o.ensureTimer.Reset(ensureInterval) 379 o.ensureNext = now.Add(ensureInterval) 380 return o.ensureNext 381 } 382 383 func (o *Overlord) ensureBefore(d time.Duration) { 384 o.ensureLock.Lock() 385 defer o.ensureLock.Unlock() 386 if o.ensureTimer == nil { 387 panic("cannot use EnsureBefore before Overlord.Loop") 388 } 389 now := time.Now() 390 next := now.Add(d) 391 if next.Before(o.ensureNext) { 392 o.ensureTimer.Reset(d) 393 o.ensureNext = next 394 return 395 } 396 397 if o.ensureNext.Before(now) { 398 // timer already expired, it will be reset in Loop() and 399 // next Ensure() will be called shortly. 400 if !o.ensureTimer.Stop() { 401 return 402 } 403 o.ensureTimer.Reset(0) 404 o.ensureNext = now 405 } 406 } 407 408 func (o *Overlord) requestRestart(t state.RestartType) { 409 if o.restartBehavior == nil { 410 logger.Noticef("restart requested but no behavior set") 411 } else { 412 o.restartBehavior.HandleRestart(t) 413 } 414 } 415 416 var preseedExitWithError = func(err error) { 417 fmt.Fprintf(os.Stderr, "cannot preseed: %v\n", err) 418 os.Exit(1) 419 } 420 421 // Loop runs a loop in a goroutine to ensure the current state regularly through StateEngine Ensure. 422 func (o *Overlord) Loop() { 423 o.ensureTimerSetup() 424 preseed := snapdenv.Preseeding() 425 if preseed { 426 o.runner.OnTaskError(preseedExitWithError) 427 } 428 o.loopTomb.Go(func() error { 429 for { 430 // TODO: pass a proper context into Ensure 431 o.ensureTimerReset() 432 // in case of errors engine logs them, 433 // continue to the next Ensure() try for now 434 err := o.stateEng.Ensure() 435 if err != nil && preseed { 436 st := o.State() 437 // acquire state lock to ensure nothing attempts to write state 438 // as we are exiting; there is no deferred unlock to avoid 439 // potential race on exit. 440 st.Lock() 441 preseedExitWithError(err) 442 } 443 o.ensureDidRun() 444 pruneC := pruneTickerC(o.pruneTicker) 445 select { 446 case <-o.loopTomb.Dying(): 447 return nil 448 case <-o.ensureTimer.C: 449 case <-pruneC: 450 if preseed { 451 // in preseed mode avoid setting StartOfOperationTime (it's 452 // an error), and don't Prune. 453 continue 454 } 455 st := o.State() 456 st.Lock() 457 st.Prune(o.startOfOperationTime, pruneWait, abortWait, pruneMaxChanges) 458 st.Unlock() 459 } 460 } 461 }) 462 } 463 464 func (o *Overlord) ensureDidRun() { 465 atomic.StoreInt32(&o.ensureRun, 1) 466 } 467 468 func (o *Overlord) CanStandby() bool { 469 run := atomic.LoadInt32(&o.ensureRun) 470 return run != 0 471 } 472 473 // Stop stops the ensure loop and the managers under the StateEngine. 474 func (o *Overlord) Stop() error { 475 o.loopTomb.Kill(nil) 476 err := o.loopTomb.Wait() 477 o.stateEng.Stop() 478 return err 479 } 480 481 func (o *Overlord) settle(timeout time.Duration, beforeCleanups func()) error { 482 if err := o.StartUp(); err != nil { 483 return err 484 } 485 486 func() { 487 o.ensureLock.Lock() 488 defer o.ensureLock.Unlock() 489 if o.ensureTimer != nil { 490 panic("cannot use Settle concurrently with other Settle or Loop calls") 491 } 492 o.ensureTimer = time.NewTimer(0) 493 }() 494 495 defer func() { 496 o.ensureLock.Lock() 497 defer o.ensureLock.Unlock() 498 o.ensureTimer.Stop() 499 o.ensureTimer = nil 500 }() 501 502 t0 := time.Now() 503 done := false 504 var errs []error 505 for !done { 506 if timeout > 0 && time.Since(t0) > timeout { 507 err := fmt.Errorf("Settle is not converging") 508 if len(errs) != 0 { 509 return &ensureError{append(errs, err)} 510 } 511 return err 512 } 513 next := o.ensureTimerReset() 514 err := o.stateEng.Ensure() 515 switch ee := err.(type) { 516 case nil: 517 case *ensureError: 518 errs = append(errs, ee.errs...) 519 default: 520 errs = append(errs, err) 521 } 522 o.stateEng.Wait() 523 o.ensureLock.Lock() 524 done = o.ensureNext.Equal(next) 525 o.ensureLock.Unlock() 526 if done { 527 if beforeCleanups != nil { 528 beforeCleanups() 529 beforeCleanups = nil 530 } 531 // we should wait also for cleanup handlers 532 st := o.State() 533 st.Lock() 534 for _, chg := range st.Changes() { 535 if chg.IsReady() && !chg.IsClean() { 536 done = false 537 break 538 } 539 } 540 st.Unlock() 541 } 542 } 543 if len(errs) != 0 { 544 return &ensureError{errs} 545 } 546 return nil 547 } 548 549 // Settle runs first a state engine Ensure and then wait for 550 // activities to settle. That's done by waiting for all managers' 551 // activities to settle while making sure no immediate further Ensure 552 // is scheduled. It then waits similarly for all ready changes to 553 // reach the clean state. Chiefly for tests. Cannot be used in 554 // conjunction with Loop. If timeout is non-zero and settling takes 555 // longer than timeout, returns an error. Calls StartUp as well. 556 func (o *Overlord) Settle(timeout time.Duration) error { 557 return o.settle(timeout, nil) 558 } 559 560 // SettleObserveBeforeCleanups runs first a state engine Ensure and 561 // then wait for activities to settle. That's done by waiting for all 562 // managers' activities to settle while making sure no immediate 563 // further Ensure is scheduled. It then waits similarly for all ready 564 // changes to reach the clean state, but calls once the provided 565 // callback before doing that. Chiefly for tests. Cannot be used in 566 // conjunction with Loop. If timeout is non-zero and settling takes 567 // longer than timeout, returns an error. Calls StartUp as well. 568 func (o *Overlord) SettleObserveBeforeCleanups(timeout time.Duration, beforeCleanups func()) error { 569 return o.settle(timeout, beforeCleanups) 570 } 571 572 // State returns the system state managed by the overlord. 573 func (o *Overlord) State() *state.State { 574 return o.stateEng.State() 575 } 576 577 // StateEngine returns the stage engine used by overlord. 578 func (o *Overlord) StateEngine() *StateEngine { 579 return o.stateEng 580 } 581 582 // TaskRunner returns the shared task runner responsible for running 583 // tasks for all managers under the overlord. 584 func (o *Overlord) TaskRunner() *state.TaskRunner { 585 return o.runner 586 } 587 588 // SnapManager returns the snap manager responsible for snaps under 589 // the overlord. 590 func (o *Overlord) SnapManager() *snapstate.SnapManager { 591 return o.snapMgr 592 } 593 594 // AssertManager returns the assertion manager enforcing assertions 595 // under the overlord. 596 func (o *Overlord) AssertManager() *assertstate.AssertManager { 597 return o.assertMgr 598 } 599 600 // InterfaceManager returns the interface manager maintaining 601 // interface connections under the overlord. 602 func (o *Overlord) InterfaceManager() *ifacestate.InterfaceManager { 603 return o.ifaceMgr 604 } 605 606 // HookManager returns the hook manager responsible for running hooks 607 // under the overlord. 608 func (o *Overlord) HookManager() *hookstate.HookManager { 609 return o.hookMgr 610 } 611 612 // DeviceManager returns the device manager responsible for the device 613 // identity and policies. 614 func (o *Overlord) DeviceManager() *devicestate.DeviceManager { 615 return o.deviceMgr 616 } 617 618 // CommandManager returns the manager responsible for running odd 619 // jobs. 620 func (o *Overlord) CommandManager() *cmdstate.CommandManager { 621 return o.cmdMgr 622 } 623 624 // SnapshotManager returns the manager responsible for snapshots. 625 func (o *Overlord) SnapshotManager() *snapshotstate.SnapshotManager { 626 return o.shotMgr 627 } 628 629 // Mock creates an Overlord without any managers and with a backend 630 // not using disk. Managers can be added with AddManager. For testing. 631 func Mock() *Overlord { 632 return MockWithStateAndRestartHandler(nil, nil) 633 } 634 635 // MockWithStateAndRestartHandler creates an Overlord with the given state 636 // unless it is nil in which case it uses a state backend not using 637 // disk. It will use the given handler on restart requests. Managers 638 // can be added with AddManager. For testing. 639 func MockWithStateAndRestartHandler(s *state.State, handleRestart func(state.RestartType)) *Overlord { 640 o := &Overlord{ 641 loopTomb: new(tomb.Tomb), 642 inited: false, 643 restartBehavior: mockRestartBehavior(handleRestart), 644 } 645 if s == nil { 646 s = state.New(mockBackend{o: o}) 647 } 648 o.stateEng = NewStateEngine(s) 649 o.runner = state.NewTaskRunner(s) 650 651 return o 652 } 653 654 // AddManager adds a manager to the overlord created with Mock. For 655 // testing. 656 func (o *Overlord) AddManager(mgr StateManager) { 657 if o.inited { 658 panic("internal error: cannot add managers to a fully initialized Overlord") 659 } 660 o.addManager(mgr) 661 } 662 663 type mockRestartBehavior func(state.RestartType) 664 665 func (rb mockRestartBehavior) HandleRestart(t state.RestartType) { 666 if rb == nil { 667 return 668 } 669 rb(t) 670 } 671 672 func (rb mockRestartBehavior) RebootAsExpected(*state.State) error { 673 panic("internal error: overlord.Mock should not invoke RebootAsExpected") 674 } 675 676 func (rb mockRestartBehavior) RebootDidNotHappen(*state.State) error { 677 panic("internal error: overlord.Mock should not invoke RebootDidNotHappen") 678 } 679 680 type mockBackend struct { 681 o *Overlord 682 } 683 684 func (mb mockBackend) Checkpoint(data []byte) error { 685 return nil 686 } 687 688 func (mb mockBackend) EnsureBefore(d time.Duration) { 689 mb.o.ensureLock.Lock() 690 timer := mb.o.ensureTimer 691 mb.o.ensureLock.Unlock() 692 if timer == nil { 693 return 694 } 695 696 mb.o.ensureBefore(d) 697 } 698 699 func (mb mockBackend) RequestRestart(t state.RestartType) { 700 mb.o.requestRestart(t) 701 }