github.com/kubiko/snapd@v0.0.0-20201013125620-d4f3094d9ddf/overlord/overlord.go

// -*- Mode: Go; indent-tabs-mode: t -*-

/*
 * Copyright (C) 2016-2017 Canonical Ltd
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

// Package overlord implements the overall control of a snappy system.
package overlord

import (
	"fmt"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"sync"
	"sync/atomic"
	"time"

	"gopkg.in/tomb.v2"

	"github.com/snapcore/snapd/dirs"
	"github.com/snapcore/snapd/logger"
	"github.com/snapcore/snapd/osutil"

	"github.com/snapcore/snapd/overlord/assertstate"
	"github.com/snapcore/snapd/overlord/cmdstate"
	"github.com/snapcore/snapd/overlord/configstate"
	"github.com/snapcore/snapd/overlord/configstate/proxyconf"
	"github.com/snapcore/snapd/overlord/devicestate"
	"github.com/snapcore/snapd/overlord/healthstate"
	"github.com/snapcore/snapd/overlord/hookstate"
	"github.com/snapcore/snapd/overlord/ifacestate"
	"github.com/snapcore/snapd/overlord/patch"
	"github.com/snapcore/snapd/overlord/snapshotstate"
	"github.com/snapcore/snapd/overlord/snapstate"
	_ "github.com/snapcore/snapd/overlord/snapstate/policy"
	"github.com/snapcore/snapd/overlord/state"
	"github.com/snapcore/snapd/overlord/storecontext"
	"github.com/snapcore/snapd/snapdenv"
	"github.com/snapcore/snapd/store"
	"github.com/snapcore/snapd/timings"
)

var (
	ensureInterval = 5 * time.Minute
	pruneInterval  = 10 * time.Minute
	pruneWait      = 24 * time.Hour * 1
	abortWait      = 24 * time.Hour * 7

	pruneMaxChanges = 500

	defaultCachedDownloads = 5

	configstateInit = configstate.Init
)

var pruneTickerC = func(t *time.Ticker) <-chan time.Time {
	return t.C
}

// Overlord is the central manager of a snappy system, keeping
// track of all available state managers and related helpers.
type Overlord struct {
	stateEng *StateEngine
	// ensure loop
	loopTomb    *tomb.Tomb
	ensureLock  sync.Mutex
	ensureTimer *time.Timer
	ensureNext  time.Time
	ensureRun   int32
	pruneTicker *time.Ticker

	startOfOperationTime time.Time

	// restarts
	restartBehavior RestartBehavior
	// managers
	inited    bool
	startedUp bool
	runner    *state.TaskRunner
	snapMgr   *snapstate.SnapManager
	assertMgr *assertstate.AssertManager
	ifaceMgr  *ifacestate.InterfaceManager
	hookMgr   *hookstate.HookManager
	deviceMgr *devicestate.DeviceManager
	cmdMgr    *cmdstate.CommandManager
	shotMgr   *snapshotstate.SnapshotManager
	// proxyConf mediates the http proxy config
	proxyConf func(req *http.Request) (*url.URL, error)
}

// RestartBehavior controls how to handle and carry forward restart requests
// via the state.
type RestartBehavior interface {
	HandleRestart(t state.RestartType)
	// RebootAsExpected is called early when either a reboot was
	// requested by snapd and happened or no reboot was expected at all.
	RebootAsExpected(st *state.State) error
	// RebootDidNotHappen is called early instead when a reboot was
	// requested by snapd but did not happen.
	RebootDidNotHappen(st *state.State) error
}

var storeNew = store.New

// New creates a new Overlord with all its state managers.
// It can be provided with an optional RestartBehavior.
func New(restartBehavior RestartBehavior) (*Overlord, error) {
	o := &Overlord{
		loopTomb:        new(tomb.Tomb),
		inited:          true,
		restartBehavior: restartBehavior,
	}

	backend := &overlordStateBackend{
		path:           dirs.SnapStateFile,
		ensureBefore:   o.ensureBefore,
		requestRestart: o.requestRestart,
	}
	s, err := loadState(backend, restartBehavior)
	if err != nil {
		return nil, err
	}

	o.stateEng = NewStateEngine(s)
	o.runner = state.NewTaskRunner(s)

	// any unknown task should be ignored and succeed
	matchAnyUnknownTask := func(_ *state.Task) bool {
		return true
	}
	o.runner.AddOptionalHandler(matchAnyUnknownTask, handleUnknownTask, nil)

	hookMgr, err := hookstate.Manager(s, o.runner)
	if err != nil {
		return nil, err
	}
	o.addManager(hookMgr)

	snapMgr, err := snapstate.Manager(s, o.runner)
	if err != nil {
		return nil, err
	}
	o.addManager(snapMgr)

	assertMgr, err := assertstate.Manager(s, o.runner)
	if err != nil {
		return nil, err
	}
	o.addManager(assertMgr)

	ifaceMgr, err := ifacestate.Manager(s, hookMgr, o.runner, nil, nil)
	if err != nil {
		return nil, err
	}
	o.addManager(ifaceMgr)

	deviceMgr, err := devicestate.Manager(s, hookMgr, o.runner, o.newStore)
	if err != nil {
		return nil, err
	}
	o.addManager(deviceMgr)

	o.addManager(cmdstate.Manager(s, o.runner))
	o.addManager(snapshotstate.Manager(s, o.runner))

	if err := configstateInit(s, hookMgr); err != nil {
		return nil, err
	}
	healthstate.Init(hookMgr)

	// the shared task runner should be added last!
	o.stateEng.AddManager(o.runner)

	s.Lock()
	defer s.Unlock()
	// setting up the store
	o.proxyConf = proxyconf.New(s).Conf
	storeCtx := storecontext.New(s, o.deviceMgr.StoreContextBackend())
	sto := o.newStoreWithContext(storeCtx)

	snapstate.ReplaceStore(s, sto)

	return o, nil
}

func (o *Overlord) addManager(mgr StateManager) {
	switch x := mgr.(type) {
	case *hookstate.HookManager:
		o.hookMgr = x
	case *snapstate.SnapManager:
		o.snapMgr = x
	case *assertstate.AssertManager:
		o.assertMgr = x
	case *ifacestate.InterfaceManager:
		o.ifaceMgr = x
	case *devicestate.DeviceManager:
		o.deviceMgr = x
	case *cmdstate.CommandManager:
		o.cmdMgr = x
	case *snapshotstate.SnapshotManager:
		o.shotMgr = x
	}
	o.stateEng.AddManager(mgr)
}

func loadState(backend state.Backend, restartBehavior RestartBehavior) (*state.State, error) {
	curBootID, err := osutil.BootID()
	if err != nil {
		return nil, fmt.Errorf("fatal: cannot find current boot id: %v", err)
	}

	perfTimings := timings.New(map[string]string{"startup": "load-state"})

	if !osutil.FileExists(dirs.SnapStateFile) {
		// fail fast, mostly interesting for tests, this dir is set up
		// by the snapd package
		stateDir := filepath.Dir(dirs.SnapStateFile)
		if !osutil.IsDirectory(stateDir) {
			return nil, fmt.Errorf("fatal: directory %q must be present", stateDir)
		}
		s := state.New(backend)
		s.Lock()
		s.VerifyReboot(curBootID)
		s.Unlock()
		patch.Init(s)
		return s, nil
	}

	r, err := os.Open(dirs.SnapStateFile)
	if err != nil {
		return nil, fmt.Errorf("cannot read the state file: %s", err)
	}
	defer r.Close()

	var s *state.State
	timings.Run(perfTimings, "read-state", "read snapd state from disk", func(tm timings.Measurer) {
		s, err = state.ReadState(backend, r)
	})
	if err != nil {
		return nil, err
	}
	s.Lock()
	perfTimings.Save(s)
	s.Unlock()

	err = verifyReboot(s, curBootID, restartBehavior)
	if err != nil {
		return nil, err
	}

	// one-shot migrations
	err = patch.Apply(s)
	if err != nil {
		return nil, err
	}
	return s, nil
}

func verifyReboot(s *state.State, curBootID string, restartBehavior RestartBehavior) error {
	s.Lock()
	defer s.Unlock()
	err := s.VerifyReboot(curBootID)
	if err != nil && err != state.ErrExpectedReboot {
		return err
	}
	expectedRebootDidNotHappen := err == state.ErrExpectedReboot
	if restartBehavior != nil {
		if expectedRebootDidNotHappen {
			return restartBehavior.RebootDidNotHappen(s)
		}
		return restartBehavior.RebootAsExpected(s)
	}
	if expectedRebootDidNotHappen {
		logger.Noticef("expected system restart but it did not happen")
	}
	return nil
}

func (o *Overlord) newStoreWithContext(storeCtx store.DeviceAndAuthContext) snapstate.StoreService {
	cfg := store.DefaultConfig()
	cfg.Proxy = o.proxyConf
	sto := storeNew(cfg, storeCtx)
	sto.SetCacheDownloads(defaultCachedDownloads)
	return sto
}

// newStore can make new stores for use during remodeling.
// The device backend will tie them to the remodeling device state.
func (o *Overlord) newStore(devBE storecontext.DeviceBackend) snapstate.StoreService {
	scb := o.deviceMgr.StoreContextBackend()
	stoCtx := storecontext.NewComposed(o.State(), devBE, scb, scb)
	return o.newStoreWithContext(stoCtx)
}

// StartUp proceeds to run any expensive Overlord or manager initialization.
// After it is done once, it is a no-op.
func (o *Overlord) StartUp() error {
	if o.startedUp {
		return nil
	}
	o.startedUp = true

	// account for deviceMgr == nil as it's not always present in
	// the tests.
	if o.deviceMgr != nil && !snapdenv.Preseeding() {
		var err error
		st := o.State()
		st.Lock()
		o.startOfOperationTime, err = o.deviceMgr.StartOfOperationTime()
		st.Unlock()
		if err != nil {
			return fmt.Errorf("cannot get start of operation time: %s", err)
		}
	}

	// slow down for tests
	if s := os.Getenv("SNAPD_SLOW_STARTUP"); s != "" {
		if d, err := time.ParseDuration(s); err == nil {
			logger.Noticef("slowing down startup by %v as requested", d)

			time.Sleep(d)
		}
	}

	return o.stateEng.StartUp()
}

// StartupTimeout computes a usable timeout for the startup
// initializations by using a pessimistic estimate.
func (o *Overlord) StartupTimeout() (timeout time.Duration, reasoning string, err error) {
	// TODO: adjust based on real hardware measurements
	st := o.State()
	st.Lock()
	defer st.Unlock()
	n, err := snapstate.NumSnaps(st)
	if err != nil {
		return 0, "", err
	}
	// the number of snaps (and connections) plays a role
	reasoning = "pessimistic estimate of 30s plus 5s per snap"
	to := (30 * time.Second) + time.Duration(n)*(5*time.Second)
	return to, reasoning, nil
}

func (o *Overlord) ensureTimerSetup() {
	o.ensureLock.Lock()
	defer o.ensureLock.Unlock()
	o.ensureTimer = time.NewTimer(ensureInterval)
	o.ensureNext = time.Now().Add(ensureInterval)
	o.pruneTicker = time.NewTicker(pruneInterval)
}

func (o *Overlord) ensureTimerReset() time.Time {
	o.ensureLock.Lock()
	defer o.ensureLock.Unlock()
	now := time.Now()
	o.ensureTimer.Reset(ensureInterval)
	o.ensureNext = now.Add(ensureInterval)
	return o.ensureNext
}

func (o *Overlord) ensureBefore(d time.Duration) {
	o.ensureLock.Lock()
	defer o.ensureLock.Unlock()
	if o.ensureTimer == nil {
		panic("cannot use EnsureBefore before Overlord.Loop")
	}
	now := time.Now()
	next := now.Add(d)
	if next.Before(o.ensureNext) {
		o.ensureTimer.Reset(d)
		o.ensureNext = next
		return
	}

	if o.ensureNext.Before(now) {
		// timer already expired, it will be reset in Loop() and
		// next Ensure() will be called shortly.
		if !o.ensureTimer.Stop() {
			return
		}
		o.ensureTimer.Reset(0)
		o.ensureNext = now
	}
}

func (o *Overlord) requestRestart(t state.RestartType) {
	if o.restartBehavior == nil {
		logger.Noticef("restart requested but no behavior set")
	} else {
		o.restartBehavior.HandleRestart(t)
	}
}

var preseedExitWithError = func(err error) {
	fmt.Fprintf(os.Stderr, "cannot preseed: %v\n", err)
	os.Exit(1)
}

// Loop runs a loop in a goroutine to ensure the current state regularly
// through the StateEngine's Ensure.
func (o *Overlord) Loop() {
	o.ensureTimerSetup()
	preseed := snapdenv.Preseeding()
	if preseed {
		o.runner.OnTaskError(preseedExitWithError)
	}
	o.loopTomb.Go(func() error {
		for {
			// TODO: pass a proper context into Ensure
			o.ensureTimerReset()
			// in case of errors the engine logs them,
			// continue to the next Ensure() try for now
			err := o.stateEng.Ensure()
			if err != nil && preseed {
				st := o.State()
				// acquire state lock to ensure nothing attempts to write state
				// as we are exiting; there is no deferred unlock to avoid
				// potential race on exit.
				st.Lock()
				preseedExitWithError(err)
			}
			o.ensureDidRun()
			pruneC := pruneTickerC(o.pruneTicker)
			select {
			case <-o.loopTomb.Dying():
				return nil
			case <-o.ensureTimer.C:
			case <-pruneC:
				if preseed {
					// in preseed mode avoid setting StartOfOperationTime (it's
					// an error), and don't Prune.
					continue
				}
				st := o.State()
				st.Lock()
				st.Prune(o.startOfOperationTime, pruneWait, abortWait, pruneMaxChanges)
				st.Unlock()
			}
		}
	})
}

func (o *Overlord) ensureDidRun() {
	atomic.StoreInt32(&o.ensureRun, 1)
}

func (o *Overlord) CanStandby() bool {
	run := atomic.LoadInt32(&o.ensureRun)
	return run != 0
}

// Stop stops the ensure loop and the managers under the StateEngine.
func (o *Overlord) Stop() error {
	o.loopTomb.Kill(nil)
	err := o.loopTomb.Wait()
	o.stateEng.Stop()
	return err
}

func (o *Overlord) settle(timeout time.Duration, beforeCleanups func()) error {
	if err := o.StartUp(); err != nil {
		return err
	}

	func() {
		o.ensureLock.Lock()
		defer o.ensureLock.Unlock()
		if o.ensureTimer != nil {
			panic("cannot use Settle concurrently with other Settle or Loop calls")
		}
		o.ensureTimer = time.NewTimer(0)
	}()

	defer func() {
		o.ensureLock.Lock()
		defer o.ensureLock.Unlock()
		o.ensureTimer.Stop()
		o.ensureTimer = nil
	}()

	t0 := time.Now()
	done := false
	var errs []error
	for !done {
		if timeout > 0 && time.Since(t0) > timeout {
			err := fmt.Errorf("Settle is not converging")
			if len(errs) != 0 {
				return &ensureError{append(errs, err)}
			}
			return err
		}
		next := o.ensureTimerReset()
		err := o.stateEng.Ensure()
		switch ee := err.(type) {
		case nil:
		case *ensureError:
			errs = append(errs, ee.errs...)
		default:
			errs = append(errs, err)
		}
		o.stateEng.Wait()
		o.ensureLock.Lock()
		done = o.ensureNext.Equal(next)
		o.ensureLock.Unlock()
		if done {
			if beforeCleanups != nil {
				beforeCleanups()
				beforeCleanups = nil
			}
			// we should wait also for cleanup handlers
			st := o.State()
			st.Lock()
			for _, chg := range st.Changes() {
				if chg.IsReady() && !chg.IsClean() {
					done = false
					break
				}
			}
			st.Unlock()
		}
	}
	if len(errs) != 0 {
		return &ensureError{errs}
	}
	return nil
}

// Settle first runs a state engine Ensure and then waits for
// activities to settle. That's done by waiting for all managers'
// activities to settle while making sure no immediate further Ensure
// is scheduled. It then waits similarly for all ready changes to
// reach the clean state. Chiefly for tests. Cannot be used in
// conjunction with Loop. If timeout is non-zero and settling takes
// longer than timeout, returns an error. Calls StartUp as well.
func (o *Overlord) Settle(timeout time.Duration) error {
	return o.settle(timeout, nil)
}

// SettleObserveBeforeCleanups first runs a state engine Ensure and
// then waits for activities to settle. That's done by waiting for all
// managers' activities to settle while making sure no immediate
// further Ensure is scheduled. It then waits similarly for all ready
// changes to reach the clean state, but calls the provided callback
// once before doing that. Chiefly for tests. Cannot be used in
// conjunction with Loop. If timeout is non-zero and settling takes
// longer than timeout, returns an error. Calls StartUp as well.
func (o *Overlord) SettleObserveBeforeCleanups(timeout time.Duration, beforeCleanups func()) error {
	return o.settle(timeout, beforeCleanups)
}

// State returns the system state managed by the overlord.
func (o *Overlord) State() *state.State {
	return o.stateEng.State()
}

// StateEngine returns the state engine used by the overlord.
func (o *Overlord) StateEngine() *StateEngine {
	return o.stateEng
}

// TaskRunner returns the shared task runner responsible for running
// tasks for all managers under the overlord.
func (o *Overlord) TaskRunner() *state.TaskRunner {
	return o.runner
}

// SnapManager returns the snap manager responsible for snaps under
// the overlord.
func (o *Overlord) SnapManager() *snapstate.SnapManager {
	return o.snapMgr
}

// AssertManager returns the assertion manager enforcing assertions
// under the overlord.
func (o *Overlord) AssertManager() *assertstate.AssertManager {
	return o.assertMgr
}

// InterfaceManager returns the interface manager maintaining
// interface connections under the overlord.
func (o *Overlord) InterfaceManager() *ifacestate.InterfaceManager {
	return o.ifaceMgr
}

// HookManager returns the hook manager responsible for running hooks
// under the overlord.
func (o *Overlord) HookManager() *hookstate.HookManager {
	return o.hookMgr
}

// DeviceManager returns the device manager responsible for the device
// identity and policies.
func (o *Overlord) DeviceManager() *devicestate.DeviceManager {
	return o.deviceMgr
}

// CommandManager returns the manager responsible for running odd
// jobs.
func (o *Overlord) CommandManager() *cmdstate.CommandManager {
	return o.cmdMgr
}

// SnapshotManager returns the manager responsible for snapshots.
func (o *Overlord) SnapshotManager() *snapshotstate.SnapshotManager {
	return o.shotMgr
}

// Mock creates an Overlord without any managers and with a backend
// not using disk. Managers can be added with AddManager. For testing.
func Mock() *Overlord {
	return MockWithStateAndRestartHandler(nil, nil)
}

// MockWithStateAndRestartHandler creates an Overlord with the given state
// unless it is nil, in which case it uses a state backend not using
// disk. It will use the given handler on restart requests. Managers
// can be added with AddManager. For testing.
func MockWithStateAndRestartHandler(s *state.State, handleRestart func(state.RestartType)) *Overlord {
	o := &Overlord{
		loopTomb:        new(tomb.Tomb),
		inited:          false,
		restartBehavior: mockRestartBehavior(handleRestart),
	}
	if s == nil {
		s = state.New(mockBackend{o: o})
	}
	o.stateEng = NewStateEngine(s)
	o.runner = state.NewTaskRunner(s)

	return o
}

// AddManager adds a manager to the overlord created with Mock. For
// testing.
func (o *Overlord) AddManager(mgr StateManager) {
	if o.inited {
		panic("internal error: cannot add managers to a fully initialized Overlord")
	}
	o.addManager(mgr)
}

type mockRestartBehavior func(state.RestartType)

func (rb mockRestartBehavior) HandleRestart(t state.RestartType) {
	if rb == nil {
		return
	}
	rb(t)
}

func (rb mockRestartBehavior) RebootAsExpected(*state.State) error {
	panic("internal error: overlord.Mock should not invoke RebootAsExpected")
}

func (rb mockRestartBehavior) RebootDidNotHappen(*state.State) error {
	panic("internal error: overlord.Mock should not invoke RebootDidNotHappen")
}

type mockBackend struct {
	o *Overlord
}

func (mb mockBackend) Checkpoint(data []byte) error {
	return nil
}

func (mb mockBackend) EnsureBefore(d time.Duration) {
	mb.o.ensureLock.Lock()
	timer := mb.o.ensureTimer
	mb.o.ensureLock.Unlock()
	if timer == nil {
		return
	}

	mb.o.ensureBefore(d)
}

func (mb mockBackend) RequestRestart(t state.RestartType) {
	mb.o.requestRestart(t)
}
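
// Usage sketch (not part of the original file): a minimal illustration of how
// a caller might wire up this package, assuming a typical daemon-style setup;
// the error handling shown is hypothetical and only uses functions defined
// above (New, StartUp, Loop, Stop, Mock, AddManager, Settle).
//
//	// daemon-style use: construct, start up, run the ensure loop, stop
//	o, err := overlord.New(nil) // nil: no custom RestartBehavior
//	if err != nil {
//		// state could not be loaded; log and bail out
//	}
//	if err := o.StartUp(); err != nil {
//		// expensive manager initialization failed
//	}
//	o.Loop() // starts the ensure/prune loop in a goroutine
//	// ... serve requests until shutdown is requested ...
//	if err := o.Stop(); err != nil {
//		// the loop or a manager reported an error while stopping
//	}
//
//	// test-style use: a disk-less overlord with hand-picked managers,
//	// settled instead of looped
//	to := overlord.Mock()
//	to.AddManager(to.TaskRunner())
//	if err := to.Settle(5 * time.Second); err != nil {
//		// activities did not converge within the timeout
//	}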