github.com/rigado/snapd@v2.42.5-go-mod+incompatible/overlord/overlord.go

// -*- Mode: Go; indent-tabs-mode: t -*-

/*
 * Copyright (C) 2016-2017 Canonical Ltd
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

// Package overlord implements the overall control of a snappy system.
package overlord

import (
	"fmt"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"sync"
	"sync/atomic"
	"time"

	"gopkg.in/tomb.v2"

	"github.com/snapcore/snapd/dirs"
	"github.com/snapcore/snapd/logger"
	"github.com/snapcore/snapd/osutil"

	"github.com/snapcore/snapd/overlord/assertstate"
	"github.com/snapcore/snapd/overlord/cmdstate"
	"github.com/snapcore/snapd/overlord/configstate"
	"github.com/snapcore/snapd/overlord/configstate/proxyconf"
	"github.com/snapcore/snapd/overlord/devicestate"
	"github.com/snapcore/snapd/overlord/healthstate"
	"github.com/snapcore/snapd/overlord/hookstate"
	"github.com/snapcore/snapd/overlord/ifacestate"
	"github.com/snapcore/snapd/overlord/patch"
	"github.com/snapcore/snapd/overlord/snapshotstate"
	"github.com/snapcore/snapd/overlord/snapstate"
	"github.com/snapcore/snapd/overlord/state"
	"github.com/snapcore/snapd/overlord/storecontext"
	"github.com/snapcore/snapd/store"
	"github.com/snapcore/snapd/timings"
)

var (
	ensureInterval = 5 * time.Minute
	pruneInterval  = 10 * time.Minute
	pruneWait      = 24 * time.Hour * 1
	abortWait      = 24 * time.Hour * 7

	pruneMaxChanges = 500

	defaultCachedDownloads = 5

	configstateInit = configstate.Init
)

// Overlord is the central manager of a snappy system, keeping
// track of all available state managers and related helpers.
type Overlord struct {
	stateEng *StateEngine
	// ensure loop
	loopTomb    *tomb.Tomb
	ensureLock  sync.Mutex
	ensureTimer *time.Timer
	ensureNext  time.Time
	ensureRun   int32
	pruneTicker *time.Ticker
	// restarts
	restartBehavior RestartBehavior
	// managers
	inited    bool
	startedUp bool
	runner    *state.TaskRunner
	snapMgr   *snapstate.SnapManager
	assertMgr *assertstate.AssertManager
	ifaceMgr  *ifacestate.InterfaceManager
	hookMgr   *hookstate.HookManager
	deviceMgr *devicestate.DeviceManager
	cmdMgr    *cmdstate.CommandManager
	shotMgr   *snapshotstate.SnapshotManager
	// proxyConf mediates the http proxy config
	proxyConf func(req *http.Request) (*url.URL, error)
}

// RestartBehavior controls how to handle and carry forward restart
// requests via the state.
type RestartBehavior interface {
	HandleRestart(t state.RestartType)
	// RebootAsExpected is called early when either a reboot was
	// requested by snapd and happened or no reboot was expected at all.
	RebootAsExpected(st *state.State) error
	// RebootDidNotHappen is called early instead when a reboot was
	// requested by snapd but did not happen.
	RebootDidNotHappen(st *state.State) error
}
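// NOTE: illustrative only, not part of the original file. A minimal
// sketch of a RestartBehavior implementation that merely logs, as a
// test or standalone tool might use it; a real daemon would instead
// trigger the actual service restart or system reboot.
type loggingRestartBehavior struct{}

func (loggingRestartBehavior) HandleRestart(t state.RestartType) {
	// a real implementation would restart the daemon or reboot here
	logger.Noticef("restart requested: %v", t)
}

func (loggingRestartBehavior) RebootAsExpected(st *state.State) error {
	// nothing pending: either no reboot was expected or it happened
	return nil
}

func (loggingRestartBehavior) RebootDidNotHappen(st *state.State) error {
	logger.Noticef("expected system restart but it did not happen")
	return nil
}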
var storeNew = store.New

// New creates a new Overlord with all its state managers.
// It can be provided with an optional RestartBehavior.
func New(restartBehavior RestartBehavior) (*Overlord, error) {
	o := &Overlord{
		loopTomb:        new(tomb.Tomb),
		inited:          true,
		restartBehavior: restartBehavior,
	}

	backend := &overlordStateBackend{
		path:           dirs.SnapStateFile,
		ensureBefore:   o.ensureBefore,
		requestRestart: o.requestRestart,
	}
	s, err := loadState(backend, restartBehavior)
	if err != nil {
		return nil, err
	}

	o.stateEng = NewStateEngine(s)
	o.runner = state.NewTaskRunner(s)

	// any unknown task should be ignored and succeed
	matchAnyUnknownTask := func(_ *state.Task) bool {
		return true
	}
	o.runner.AddOptionalHandler(matchAnyUnknownTask, handleUnknownTask, nil)

	hookMgr, err := hookstate.Manager(s, o.runner)
	if err != nil {
		return nil, err
	}
	o.addManager(hookMgr)

	snapMgr, err := snapstate.Manager(s, o.runner)
	if err != nil {
		return nil, err
	}
	o.addManager(snapMgr)

	assertMgr, err := assertstate.Manager(s, o.runner)
	if err != nil {
		return nil, err
	}
	o.addManager(assertMgr)

	ifaceMgr, err := ifacestate.Manager(s, hookMgr, o.runner, nil, nil)
	if err != nil {
		return nil, err
	}
	o.addManager(ifaceMgr)

	deviceMgr, err := devicestate.Manager(s, hookMgr, o.runner, o.newStore)
	if err != nil {
		return nil, err
	}
	o.addManager(deviceMgr)

	o.addManager(cmdstate.Manager(s, o.runner))
	o.addManager(snapshotstate.Manager(s, o.runner))

	if err := configstateInit(s, hookMgr); err != nil {
		return nil, err
	}
	healthstate.Init(hookMgr)

	// the shared task runner should be added last!
	o.stateEng.AddManager(o.runner)

	s.Lock()
	defer s.Unlock()
	// setting up the store
	o.proxyConf = proxyconf.New(s).Conf
	storeCtx := storecontext.New(s, o.deviceMgr.StoreContextBackend())
	sto := o.newStoreWithContext(storeCtx)

	snapstate.ReplaceStore(s, sto)

	return o, nil
}
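// NOTE: illustrative only, not part of the original file. New accepts
// a nil RestartBehavior, in which case restart requests are merely
// logged (see requestRestart below):
//
//	o, err := New(nil)
//	if err != nil {
//		// handle the error
//	}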
func (o *Overlord) addManager(mgr StateManager) {
	switch x := mgr.(type) {
	case *hookstate.HookManager:
		o.hookMgr = x
	case *snapstate.SnapManager:
		o.snapMgr = x
	case *assertstate.AssertManager:
		o.assertMgr = x
	case *ifacestate.InterfaceManager:
		o.ifaceMgr = x
	case *devicestate.DeviceManager:
		o.deviceMgr = x
	case *cmdstate.CommandManager:
		o.cmdMgr = x
	case *snapshotstate.SnapshotManager:
		o.shotMgr = x
	}
	o.stateEng.AddManager(mgr)
}

func loadState(backend state.Backend, restartBehavior RestartBehavior) (*state.State, error) {
	curBootID, err := osutil.BootID()
	if err != nil {
		return nil, fmt.Errorf("fatal: cannot find current boot id: %v", err)
	}

	perfTimings := timings.New(map[string]string{"startup": "load-state"})

	if !osutil.FileExists(dirs.SnapStateFile) {
		// fail fast, mostly interesting for tests, this dir is set up
		// by the snapd package
		stateDir := filepath.Dir(dirs.SnapStateFile)
		if !osutil.IsDirectory(stateDir) {
			return nil, fmt.Errorf("fatal: directory %q must be present", stateDir)
		}
		s := state.New(backend)
		s.Lock()
		s.VerifyReboot(curBootID)
		s.Unlock()
		patch.Init(s)
		return s, nil
	}

	r, err := os.Open(dirs.SnapStateFile)
	if err != nil {
		return nil, fmt.Errorf("cannot read the state file: %s", err)
	}
	defer r.Close()

	var s *state.State
	timings.Run(perfTimings, "read-state", "read snapd state from disk", func(tm timings.Measurer) {
		s, err = state.ReadState(backend, r)
	})
	if err != nil {
		return nil, err
	}
	s.Lock()
	perfTimings.Save(s)
	s.Unlock()

	err = verifyReboot(s, curBootID, restartBehavior)
	if err != nil {
		return nil, err
	}

	// one-shot migrations
	err = patch.Apply(s)
	if err != nil {
		return nil, err
	}
	return s, nil
}

func verifyReboot(s *state.State, curBootID string, restartBehavior RestartBehavior) error {
	s.Lock()
	defer s.Unlock()
	err := s.VerifyReboot(curBootID)
	if err != nil && err != state.ErrExpectedReboot {
		return err
	}
	expectedRebootDidNotHappen := err == state.ErrExpectedReboot
	if restartBehavior != nil {
		if expectedRebootDidNotHappen {
			return restartBehavior.RebootDidNotHappen(s)
		}
		return restartBehavior.RebootAsExpected(s)
	}
	if expectedRebootDidNotHappen {
		logger.Noticef("expected system restart but it did not happen")
	}
	return nil
}

func (o *Overlord) newStoreWithContext(storeCtx store.DeviceAndAuthContext) snapstate.StoreService {
	cfg := store.DefaultConfig()
	cfg.Proxy = o.proxyConf
	sto := storeNew(cfg, storeCtx)
	sto.SetCacheDownloads(defaultCachedDownloads)
	return sto
}
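// NOTE: illustrative only, not part of the original file. The proxy
// function wired into the store configuration above has the standard
// net/http shape, so a fixed proxy could be supplied like this
// (hypothetical URL):
//
//	cfg.Proxy = func(*http.Request) (*url.URL, error) {
//		return url.Parse("http://proxy.internal:3128")
//	}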
// newStore can make new stores for use during remodeling.
// The device backend will tie them to the remodeling device state.
func (o *Overlord) newStore(devBE storecontext.DeviceBackend) snapstate.StoreService {
	scb := o.deviceMgr.StoreContextBackend()
	stoCtx := storecontext.NewComposed(o.State(), devBE, scb, scb)
	return o.newStoreWithContext(stoCtx)
}

// StartUp proceeds to run any expensive Overlord or manager
// initialization. After the first call it is a no-op.
func (o *Overlord) StartUp() error {
	if o.startedUp {
		return nil
	}
	o.startedUp = true

	// slow down for tests
	if s := os.Getenv("SNAPD_SLOW_STARTUP"); s != "" {
		if d, err := time.ParseDuration(s); err == nil {
			logger.Noticef("slowing down startup by %v as requested", d)
			time.Sleep(d)
		}
	}

	return o.stateEng.StartUp()
}

// StartupTimeout computes a usable timeout for the startup
// initializations by using a pessimistic estimate.
func (o *Overlord) StartupTimeout() (timeout time.Duration, reasoning string, err error) {
	// TODO: adjust based on real hardware measurements
	st := o.State()
	st.Lock()
	defer st.Unlock()
	n, err := snapstate.NumSnaps(st)
	if err != nil {
		return 0, "", err
	}
	// the number of snaps (and connections) plays a role
	reasoning = "pessimistic estimate of 30s plus 5s per snap"
	to := (30 * time.Second) + time.Duration(n)*(5*time.Second)
	return to, reasoning, nil
}

func (o *Overlord) ensureTimerSetup() {
	o.ensureLock.Lock()
	defer o.ensureLock.Unlock()
	o.ensureTimer = time.NewTimer(ensureInterval)
	o.ensureNext = time.Now().Add(ensureInterval)
	o.pruneTicker = time.NewTicker(pruneInterval)
}

func (o *Overlord) ensureTimerReset() time.Time {
	o.ensureLock.Lock()
	defer o.ensureLock.Unlock()
	now := time.Now()
	o.ensureTimer.Reset(ensureInterval)
	o.ensureNext = now.Add(ensureInterval)
	return o.ensureNext
}

func (o *Overlord) ensureBefore(d time.Duration) {
	o.ensureLock.Lock()
	defer o.ensureLock.Unlock()
	if o.ensureTimer == nil {
		panic("cannot use EnsureBefore before Overlord.Loop")
	}
	now := time.Now()
	next := now.Add(d)
	if next.Before(o.ensureNext) {
		o.ensureTimer.Reset(d)
		o.ensureNext = next
		return
	}

	if o.ensureNext.Before(now) {
		// timer already expired, it will be reset in Loop() and
		// next Ensure() will be called shortly.
		if !o.ensureTimer.Stop() {
			return
		}
		o.ensureTimer.Reset(0)
		o.ensureNext = now
	}
}

func (o *Overlord) requestRestart(t state.RestartType) {
	if o.restartBehavior == nil {
		logger.Noticef("restart requested but no behavior set")
	} else {
		o.restartBehavior.HandleRestart(t)
	}
}
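// NOTE: illustrative only, not part of the original file. With, say,
// 10 snaps installed, StartupTimeout above yields 30s + 10*5s = 80s.
// Managers do not call ensureBefore directly; requests funnel into it
// through the state via the overlordStateBackend wired up in New, for
// example:
//
//	st.Lock()
//	st.EnsureBefore(0) // request an immediate Ensure pass
//	st.Unlock()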
// Loop runs a loop in a goroutine to ensure the current state
// regularly through StateEngine Ensure.
func (o *Overlord) Loop() {
	o.ensureTimerSetup()
	o.loopTomb.Go(func() error {
		for {
			// TODO: pass a proper context into Ensure
			o.ensureTimerReset()
			// in case of errors the engine logs them,
			// continue to the next Ensure() try for now
			o.stateEng.Ensure()
			o.ensureDidRun()
			select {
			case <-o.loopTomb.Dying():
				return nil
			case <-o.ensureTimer.C:
			case <-o.pruneTicker.C:
				st := o.State()
				st.Lock()
				st.Prune(pruneWait, abortWait, pruneMaxChanges)
				st.Unlock()
			}
		}
	})
}

func (o *Overlord) ensureDidRun() {
	atomic.StoreInt32(&o.ensureRun, 1)
}

// CanStandby returns whether the overlord can safely go into standby,
// that is whether the ensure loop has run at least once.
func (o *Overlord) CanStandby() bool {
	run := atomic.LoadInt32(&o.ensureRun)
	return run != 0
}

// Stop stops the ensure loop and the managers under the StateEngine.
func (o *Overlord) Stop() error {
	o.loopTomb.Kill(nil)
	err := o.loopTomb.Wait()
	o.stateEng.Stop()
	return err
}

func (o *Overlord) settle(timeout time.Duration, beforeCleanups func()) error {
	if err := o.StartUp(); err != nil {
		return err
	}

	func() {
		o.ensureLock.Lock()
		defer o.ensureLock.Unlock()
		if o.ensureTimer != nil {
			panic("cannot use Settle concurrently with other Settle or Loop calls")
		}
		o.ensureTimer = time.NewTimer(0)
	}()

	defer func() {
		o.ensureLock.Lock()
		defer o.ensureLock.Unlock()
		o.ensureTimer.Stop()
		o.ensureTimer = nil
	}()

	t0 := time.Now()
	done := false
	var errs []error
	for !done {
		if timeout > 0 && time.Since(t0) > timeout {
			err := fmt.Errorf("Settle is not converging")
			if len(errs) != 0 {
				return &ensureError{append(errs, err)}
			}
			return err
		}
		next := o.ensureTimerReset()
		err := o.stateEng.Ensure()
		switch ee := err.(type) {
		case nil:
		case *ensureError:
			errs = append(errs, ee.errs...)
		default:
			errs = append(errs, err)
		}
		o.stateEng.Wait()
		o.ensureLock.Lock()
		done = o.ensureNext.Equal(next)
		o.ensureLock.Unlock()
		if done {
			if beforeCleanups != nil {
				beforeCleanups()
				beforeCleanups = nil
			}
			// we should also wait for cleanup handlers
			st := o.State()
			st.Lock()
			for _, chg := range st.Changes() {
				if chg.IsReady() && !chg.IsClean() {
					done = false
					break
				}
			}
			st.Unlock()
		}
	}
	if len(errs) != 0 {
		return &ensureError{errs}
	}
	return nil
}

// Settle runs first a state engine Ensure and then waits for
// activities to settle. That's done by waiting for all managers'
// activities to settle while making sure no immediate further Ensure
// is scheduled. It then waits similarly for all ready changes to
// reach the clean state. Chiefly for tests. Cannot be used in
// conjunction with Loop. If timeout is non-zero and settling takes
// longer than timeout, returns an error. Calls StartUp as well.
func (o *Overlord) Settle(timeout time.Duration) error {
	return o.settle(timeout, nil)
}
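// NOTE: illustrative only, not part of the original file. A test would
// typically drive the overlord to quiescence with Settle instead of
// running Loop, capping convergence at some timeout:
//
//	if err := o.Settle(30 * time.Second); err != nil {
//		t.Fatal(err)
//	}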
// SettleObserveBeforeCleanups runs first a state engine Ensure and
// then waits for activities to settle. That's done by waiting for all
// managers' activities to settle while making sure no immediate
// further Ensure is scheduled. It then waits similarly for all ready
// changes to reach the clean state, but calls the provided callback
// once before doing that. Chiefly for tests. Cannot be used in
// conjunction with Loop. If timeout is non-zero and settling takes
// longer than timeout, returns an error. Calls StartUp as well.
func (o *Overlord) SettleObserveBeforeCleanups(timeout time.Duration, beforeCleanups func()) error {
	return o.settle(timeout, beforeCleanups)
}

// State returns the system state managed by the overlord.
func (o *Overlord) State() *state.State {
	return o.stateEng.State()
}

// StateEngine returns the state engine used by the overlord.
func (o *Overlord) StateEngine() *StateEngine {
	return o.stateEng
}

// TaskRunner returns the shared task runner responsible for running
// tasks for all managers under the overlord.
func (o *Overlord) TaskRunner() *state.TaskRunner {
	return o.runner
}

// SnapManager returns the snap manager responsible for snaps under
// the overlord.
func (o *Overlord) SnapManager() *snapstate.SnapManager {
	return o.snapMgr
}

// AssertManager returns the assertion manager enforcing assertions
// under the overlord.
func (o *Overlord) AssertManager() *assertstate.AssertManager {
	return o.assertMgr
}

// InterfaceManager returns the interface manager maintaining
// interface connections under the overlord.
func (o *Overlord) InterfaceManager() *ifacestate.InterfaceManager {
	return o.ifaceMgr
}

// HookManager returns the hook manager responsible for running hooks
// under the overlord.
func (o *Overlord) HookManager() *hookstate.HookManager {
	return o.hookMgr
}

// DeviceManager returns the device manager responsible for the device
// identity and policies.
func (o *Overlord) DeviceManager() *devicestate.DeviceManager {
	return o.deviceMgr
}

// CommandManager returns the manager responsible for running odd
// jobs.
func (o *Overlord) CommandManager() *cmdstate.CommandManager {
	return o.cmdMgr
}

// SnapshotManager returns the manager responsible for snapshots.
func (o *Overlord) SnapshotManager() *snapshotstate.SnapshotManager {
	return o.shotMgr
}

// Mock creates an Overlord without any managers and with a backend
// not using disk. Managers can be added with AddManager. For testing.
func Mock() *Overlord {
	return MockWithRestartHandler(nil)
}

// MockWithRestartHandler creates an Overlord without any managers and
// with a backend not using disk. It will use the given handler on
// restart requests. Managers can be added with AddManager. For
// testing.
func MockWithRestartHandler(handleRestart func(state.RestartType)) *Overlord {
	o := &Overlord{
		loopTomb:        new(tomb.Tomb),
		inited:          false,
		restartBehavior: mockRestartBehavior(handleRestart),
	}
	s := state.New(mockBackend{o: o})
	o.stateEng = NewStateEngine(s)
	o.runner = state.NewTaskRunner(s)

	return o
}
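// NOTE: illustrative only, not part of the original file. A test might
// assemble a mocked overlord with a fake manager (fakeManager here is
// hypothetical, any StateManager implementation works):
//
//	o := Mock()
//	o.AddManager(&fakeManager{})
//	o.Loop()
//	defer o.Stop()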
// AddManager adds a manager to the overlord created with Mock. For
// testing.
func (o *Overlord) AddManager(mgr StateManager) {
	if o.inited {
		panic("internal error: cannot add managers to a fully initialized Overlord")
	}
	o.addManager(mgr)
}

type mockRestartBehavior func(state.RestartType)

func (rb mockRestartBehavior) HandleRestart(t state.RestartType) {
	if rb == nil {
		return
	}
	rb(t)
}

func (rb mockRestartBehavior) RebootAsExpected(*state.State) error {
	panic("internal error: overlord.Mock should not invoke RebootAsExpected")
}

func (rb mockRestartBehavior) RebootDidNotHappen(*state.State) error {
	panic("internal error: overlord.Mock should not invoke RebootDidNotHappen")
}

type mockBackend struct {
	o *Overlord
}

func (mb mockBackend) Checkpoint(data []byte) error {
	return nil
}

func (mb mockBackend) EnsureBefore(d time.Duration) {
	mb.o.ensureLock.Lock()
	timer := mb.o.ensureTimer
	mb.o.ensureLock.Unlock()
	if timer == nil {
		return
	}

	mb.o.ensureBefore(d)
}

func (mb mockBackend) RequestRestart(t state.RestartType) {
	mb.o.requestRestart(t)
}
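// NOTE: illustrative only, not part of the original file. A sketch of
// the lifecycle a daemon would drive, assuming rb is some
// RestartBehavior implementation (such as the logging sketch above).
func exampleLifecycle(rb RestartBehavior) error {
	o, err := New(rb)
	if err != nil {
		return err
	}
	if err := o.StartUp(); err != nil {
		return err
	}
	o.Loop()
	// ... the daemon would serve its API here ...
	return o.Stop()
}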