github.com/tompreston/snapd@v0.0.0-20210817193607-954edfcb9611/daemon/daemon.go (about) 1 // -*- Mode: Go; indent-tabs-mode: t -*- 2 3 /* 4 * Copyright (C) 2015-2021 Canonical Ltd 5 * 6 * This program is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License version 3 as 8 * published by the Free Software Foundation. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 * 18 */ 19 20 package daemon 21 22 import ( 23 "bytes" 24 "context" 25 "encoding/json" 26 "fmt" 27 "net" 28 "net/http" 29 "os" 30 "os/exec" 31 "os/signal" 32 "strings" 33 "sync" 34 "time" 35 36 "github.com/gorilla/mux" 37 "gopkg.in/tomb.v2" 38 39 "github.com/snapcore/snapd/dirs" 40 "github.com/snapcore/snapd/i18n" 41 "github.com/snapcore/snapd/logger" 42 "github.com/snapcore/snapd/netutil" 43 "github.com/snapcore/snapd/osutil" 44 "github.com/snapcore/snapd/overlord" 45 "github.com/snapcore/snapd/overlord/auth" 46 "github.com/snapcore/snapd/overlord/standby" 47 "github.com/snapcore/snapd/overlord/state" 48 "github.com/snapcore/snapd/snapdenv" 49 "github.com/snapcore/snapd/store" 50 "github.com/snapcore/snapd/systemd" 51 ) 52 53 var ErrRestartSocket = fmt.Errorf("daemon stop requested to wait for socket activation") 54 55 var systemdSdNotify = systemd.SdNotify 56 57 const ( 58 daemonRestartMsg = "daemon is restarting" 59 systemRestartMsg = "system is restarting" 60 systemHaltMsg = "system is halting" 61 systemPoweroffMsg = "system is powering off" 62 socketRestartMsg = "daemon is stopping to wait for socket activation" 63 ) 64 65 // A Daemon listens for requests and routes them to the right command 66 type Daemon struct { 67 Version string 68 overlord *overlord.Overlord 69 state *state.State 70 snapdListener net.Listener 71 snapListener net.Listener 72 connTracker *connTracker 73 serve *http.Server 74 tomb tomb.Tomb 75 router *mux.Router 76 standbyOpinions *standby.StandbyOpinions 77 78 // set to what kind of restart was requested if any 79 requestedRestart state.RestartType 80 // set to remember that we need to exit the daemon in a way that 81 // prevents systemd from restarting it 82 restartSocket bool 83 // degradedErr is set when the daemon is in degraded mode 84 degradedErr error 85 86 expectedRebootDidNotHappen bool 87 88 mu sync.Mutex 89 } 90 91 // A ResponseFunc handles one of the individual verbs for a method 92 type ResponseFunc func(*Command, *http.Request, *auth.UserState) Response 93 94 // A Command routes a request to an individual per-verb ResponseFUnc 95 type Command struct { 96 Path string 97 PathPrefix string 98 // 99 GET ResponseFunc 100 PUT ResponseFunc 101 POST ResponseFunc 102 103 // Access control. 104 ReadAccess accessChecker 105 WriteAccess accessChecker 106 107 d *Daemon 108 } 109 110 func (c *Command) ServeHTTP(w http.ResponseWriter, r *http.Request) { 111 st := c.d.state 112 st.Lock() 113 // TODO Look at the error and fail if there's an attempt to authenticate with invalid data. 114 user, _ := userFromRequest(st, r) 115 st.Unlock() 116 117 // check if we are in degradedMode 118 if c.d.degradedErr != nil && r.Method != "GET" { 119 InternalError(c.d.degradedErr.Error()).ServeHTTP(w, r) 120 return 121 } 122 123 ucred, err := ucrednetGet(r.RemoteAddr) 124 if err != nil && err != errNoID { 125 logger.Noticef("unexpected error when attempting to get UID: %s", err) 126 InternalError(err.Error()).ServeHTTP(w, r) 127 return 128 } 129 130 ctx := store.WithClientUserAgent(r.Context(), r) 131 r = r.WithContext(ctx) 132 133 var rspf ResponseFunc 134 var access accessChecker 135 136 switch r.Method { 137 case "GET": 138 rspf = c.GET 139 access = c.ReadAccess 140 case "PUT": 141 rspf = c.PUT 142 access = c.WriteAccess 143 case "POST": 144 rspf = c.POST 145 access = c.WriteAccess 146 } 147 148 if rspf == nil { 149 MethodNotAllowed("method %q not allowed", r.Method).ServeHTTP(w, r) 150 return 151 } 152 153 if rspe := access.CheckAccess(c.d, r, ucred, user); rspe != nil { 154 rspe.ServeHTTP(w, r) 155 return 156 } 157 158 rsp := rspf(c, r, user) 159 160 if srsp, ok := rsp.(StructuredResponse); ok { 161 rjson := srsp.JSON() 162 163 _, rst := st.Restarting() 164 rjson.addMaintenanceFromRestartType(rst) 165 166 if rjson.Type != ResponseTypeError { 167 st.Lock() 168 count, stamp := st.WarningsSummary() 169 st.Unlock() 170 rjson.addWarningCount(count, stamp) 171 } 172 173 // serve the updated serialisation 174 rsp = rjson 175 } 176 177 rsp.ServeHTTP(w, r) 178 } 179 180 type wrappedWriter struct { 181 w http.ResponseWriter 182 s int 183 } 184 185 func (w *wrappedWriter) Header() http.Header { 186 return w.w.Header() 187 } 188 189 func (w *wrappedWriter) Write(bs []byte) (int, error) { 190 return w.w.Write(bs) 191 } 192 193 func (w *wrappedWriter) WriteHeader(s int) { 194 w.w.WriteHeader(s) 195 w.s = s 196 } 197 198 func (w *wrappedWriter) Flush() { 199 if f, ok := w.w.(http.Flusher); ok { 200 f.Flush() 201 } 202 } 203 204 func logit(handler http.Handler) http.Handler { 205 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 206 ww := &wrappedWriter{w: w} 207 t0 := time.Now() 208 handler.ServeHTTP(ww, r) 209 t := time.Since(t0) 210 url := r.URL.String() 211 if !strings.Contains(url, "/changes/") { 212 logger.Debugf("%s %s %s %s %d", r.RemoteAddr, r.Method, r.URL, t, ww.s) 213 } 214 }) 215 } 216 217 // Init sets up the Daemon's internal workings. 218 // Don't call more than once. 219 func (d *Daemon) Init() error { 220 listenerMap, err := netutil.ActivationListeners() 221 if err != nil { 222 return err 223 } 224 225 // The SnapdSocket is required-- without it, die. 226 if listener, err := netutil.GetListener(dirs.SnapdSocket, listenerMap); err == nil { 227 d.snapdListener = &ucrednetListener{Listener: listener} 228 } else { 229 return fmt.Errorf("when trying to listen on %s: %v", dirs.SnapdSocket, err) 230 } 231 232 if listener, err := netutil.GetListener(dirs.SnapSocket, listenerMap); err == nil { 233 // This listener may also be nil if that socket wasn't among 234 // the listeners, so check it before using it. 235 d.snapListener = &ucrednetListener{Listener: listener} 236 } else { 237 logger.Debugf("cannot get listener for %q: %v", dirs.SnapSocket, err) 238 } 239 240 d.addRoutes() 241 242 logger.Noticef("started %v.", snapdenv.UserAgent()) 243 244 return nil 245 } 246 247 // SetDegradedMode puts the daemon into an degraded mode which will the 248 // error given in the "err" argument for commands that are not marked 249 // as readonlyOK. 250 // 251 // This is useful to report errors to the client when the daemon 252 // cannot work because e.g. a sanity check failed or the system is out 253 // of diskspace. 254 // 255 // When the system is fine again calling "DegradedMode(nil)" is enough 256 // to put the daemon into full operation again. 257 func (d *Daemon) SetDegradedMode(err error) { 258 d.degradedErr = err 259 } 260 261 func (d *Daemon) addRoutes() { 262 d.router = mux.NewRouter() 263 264 for _, c := range api { 265 c.d = d 266 if c.PathPrefix == "" { 267 d.router.Handle(c.Path, c).Name(c.Path) 268 } else { 269 d.router.PathPrefix(c.PathPrefix).Handler(c).Name(c.PathPrefix) 270 } 271 } 272 273 // also maybe add a /favicon.ico handler... 274 275 d.router.NotFoundHandler = NotFound("not found") 276 } 277 278 var ( 279 shutdownTimeout = 25 * time.Second 280 ) 281 282 type connTracker struct { 283 mu sync.Mutex 284 conns map[net.Conn]struct{} 285 } 286 287 func (ct *connTracker) CanStandby() bool { 288 ct.mu.Lock() 289 defer ct.mu.Unlock() 290 291 return len(ct.conns) == 0 292 } 293 294 func (ct *connTracker) trackConn(conn net.Conn, state http.ConnState) { 295 ct.mu.Lock() 296 defer ct.mu.Unlock() 297 // we ignore hijacked connections, if we do things with websockets 298 // we'll need custom shutdown handling for them 299 if state == http.StateNew || state == http.StateActive { 300 ct.conns[conn] = struct{}{} 301 } else { 302 delete(ct.conns, conn) 303 } 304 } 305 306 func (d *Daemon) initStandbyHandling() { 307 d.standbyOpinions = standby.New(d.state) 308 d.standbyOpinions.AddOpinion(d.connTracker) 309 d.standbyOpinions.AddOpinion(d.overlord) 310 d.standbyOpinions.AddOpinion(d.overlord.SnapManager()) 311 d.standbyOpinions.AddOpinion(d.overlord.DeviceManager()) 312 d.standbyOpinions.Start() 313 } 314 315 // Start the Daemon 316 func (d *Daemon) Start() error { 317 if d.expectedRebootDidNotHappen { 318 // we need to schedule and wait for a system restart 319 d.tomb.Kill(nil) 320 // avoid systemd killing us again while we wait 321 systemdSdNotify("READY=1") 322 return nil 323 } 324 if d.overlord == nil { 325 panic("internal error: no Overlord") 326 } 327 328 to, reasoning, err := d.overlord.StartupTimeout() 329 if err != nil { 330 return err 331 } 332 if to > 0 { 333 to = to.Round(time.Microsecond) 334 us := to.Nanoseconds() / 1000 335 logger.Noticef("adjusting startup timeout by %v (%s)", to, reasoning) 336 systemdSdNotify(fmt.Sprintf("EXTEND_TIMEOUT_USEC=%d", us)) 337 } 338 // now perform expensive overlord/manages initiliazation 339 if err := d.overlord.StartUp(); err != nil { 340 return err 341 } 342 343 d.connTracker = &connTracker{conns: make(map[net.Conn]struct{})} 344 d.serve = &http.Server{ 345 Handler: logit(d.router), 346 ConnState: d.connTracker.trackConn, 347 } 348 349 // enable standby handling 350 d.initStandbyHandling() 351 352 // before serving actual connections remove the maintenance.json file as we 353 // are no longer down for maintenance, this state most closely corresponds 354 // to state.RestartUnset 355 if err := d.updateMaintenanceFile(state.RestartUnset); err != nil { 356 return err 357 } 358 359 // the loop runs in its own goroutine 360 d.overlord.Loop() 361 362 d.tomb.Go(func() error { 363 if d.snapListener != nil { 364 d.tomb.Go(func() error { 365 if err := d.serve.Serve(d.snapListener); err != http.ErrServerClosed && d.tomb.Err() == tomb.ErrStillAlive { 366 return err 367 } 368 369 return nil 370 }) 371 } 372 373 if err := d.serve.Serve(d.snapdListener); err != http.ErrServerClosed && d.tomb.Err() == tomb.ErrStillAlive { 374 return err 375 } 376 377 return nil 378 }) 379 380 // notify systemd that we are ready 381 systemdSdNotify("READY=1") 382 return nil 383 } 384 385 // HandleRestart implements overlord.RestartBehavior. 386 func (d *Daemon) HandleRestart(t state.RestartType) { 387 d.mu.Lock() 388 defer d.mu.Unlock() 389 390 scheduleFallback := func(a rebootAction) { 391 if err := reboot(a, rebootWaitTimeout); err != nil { 392 logger.Noticef("%s", err) 393 } 394 } 395 396 // die when asked to restart (systemd should get us back up!) etc 397 switch t { 398 case state.RestartDaemon: 399 // save the restart kind to write out a maintenance.json in a bit 400 d.requestedRestart = t 401 case state.RestartSystem, state.RestartSystemNow: 402 // try to schedule a fallback slow reboot already here 403 // in case we get stuck shutting down 404 405 // save the restart kind to write out a maintenance.json in a bit 406 scheduleFallback(rebootReboot) 407 d.requestedRestart = t 408 case state.RestartSystemHaltNow: 409 scheduleFallback(rebootHalt) 410 d.requestedRestart = t 411 case state.RestartSystemPoweroffNow: 412 scheduleFallback(rebootPoweroff) 413 d.requestedRestart = t 414 case state.RestartSocket: 415 // save the restart kind to write out a maintenance.json in a bit 416 d.requestedRestart = t 417 d.restartSocket = true 418 case state.StopDaemon: 419 logger.Noticef("stopping snapd as requested") 420 default: 421 logger.Noticef("internal error: restart handler called with unknown restart type: %v", t) 422 } 423 424 d.tomb.Kill(nil) 425 } 426 427 var ( 428 rebootNoticeWait = 3 * time.Second 429 rebootWaitTimeout = 10 * time.Minute 430 rebootRetryWaitTimeout = 5 * time.Minute 431 rebootMaxTentatives = 3 432 ) 433 434 func (d *Daemon) updateMaintenanceFile(rst state.RestartType) error { 435 // for unset restart, just remove the maintenance.json file 436 if rst == state.RestartUnset { 437 err := os.Remove(dirs.SnapdMaintenanceFile) 438 // only return err if the error was something other than the file not 439 // existing 440 if err != nil && !os.IsNotExist(err) { 441 return err 442 } 443 return nil 444 } 445 446 // otherwise marshal and write it out appropriately 447 b, err := json.Marshal(maintenanceForRestartType(rst)) 448 if err != nil { 449 return err 450 } 451 452 return osutil.AtomicWrite(dirs.SnapdMaintenanceFile, bytes.NewBuffer(b), 0644, 0) 453 } 454 455 // Stop shuts down the Daemon 456 func (d *Daemon) Stop(sigCh chan<- os.Signal) error { 457 // we need to schedule/wait for a system restart again 458 if d.expectedRebootDidNotHappen { 459 // make the reboot retry immediate 460 immediateReboot := true 461 return d.doReboot(sigCh, state.RestartSystem, immediateReboot, rebootRetryWaitTimeout) 462 } 463 if d.overlord == nil { 464 return fmt.Errorf("internal error: no Overlord") 465 } 466 467 d.tomb.Kill(nil) 468 469 // check the state associated with a potential restart with the lock to 470 // prevent races 471 d.mu.Lock() 472 // needsFullShutdown is whether the entire system will 473 // shutdown or not as a consequence of this request 474 needsFullShutdown := false 475 switch d.requestedRestart { 476 case state.RestartSystem, state.RestartSystemNow, state.RestartSystemHaltNow, state.RestartSystemPoweroffNow: 477 needsFullShutdown = true 478 } 479 immediateShutdown := false 480 switch d.requestedRestart { 481 case state.RestartSystemNow, state.RestartSystemHaltNow, state.RestartSystemPoweroffNow: 482 immediateShutdown = true 483 } 484 restartSocket := d.restartSocket 485 d.mu.Unlock() 486 487 // before not accepting any new client connections we need to write the 488 // maintenance.json file for potential clients to see after the daemon stops 489 // responding so they can read it correctly and handle the maintenance 490 if err := d.updateMaintenanceFile(d.requestedRestart); err != nil { 491 logger.Noticef("error writing maintenance file: %v", err) 492 } 493 494 d.snapdListener.Close() 495 d.standbyOpinions.Stop() 496 497 if d.snapListener != nil { 498 // stop running hooks first 499 // and do it more gracefully if we are restarting 500 hookMgr := d.overlord.HookManager() 501 if ok, _ := d.state.Restarting(); ok { 502 logger.Noticef("gracefully waiting for running hooks") 503 hookMgr.GracefullyWaitRunningHooks() 504 logger.Noticef("done waiting for running hooks") 505 } 506 hookMgr.StopHooks() 507 d.snapListener.Close() 508 } 509 510 if needsFullShutdown { 511 // give time to polling clients to notice restart 512 time.Sleep(rebootNoticeWait) 513 } 514 515 // We're using the background context here because the tomb's 516 // context will likely already have been cancelled when we are 517 // called. 518 ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout) 519 d.tomb.Kill(d.serve.Shutdown(ctx)) 520 cancel() 521 522 if !needsFullShutdown { 523 // tell systemd that we are stopping 524 systemdSdNotify("STOPPING=1") 525 } 526 527 if restartSocket { 528 // At this point we processed all open requests (and 529 // stopped accepting new requests) - before going into 530 // socket activated mode we need to check if any of 531 // those open requests resulted in something that 532 // prevents us from going into socket activation mode. 533 // 534 // If this is the case we do a "normal" snapd restart 535 // to process the new changes. 536 if !d.standbyOpinions.CanStandby() { 537 d.restartSocket = false 538 } 539 } 540 d.overlord.Stop() 541 542 if err := d.tomb.Wait(); err != nil { 543 if err == context.DeadlineExceeded { 544 logger.Noticef("WARNING: cannot gracefully shut down in-flight snapd API activity within: %v", shutdownTimeout) 545 // the process is shutting down anyway, so we may just 546 // as well close the active connections right now 547 d.serve.Close() 548 } else { 549 // do not stop the shutdown even if the tomb errors 550 // because we already scheduled a slow shutdown and 551 // exiting here will just restart snapd (via systemd) 552 // which will lead to confusing results. 553 if needsFullShutdown { 554 logger.Noticef("WARNING: cannot stop daemon: %v", err) 555 } else { 556 return err 557 } 558 } 559 } 560 561 if needsFullShutdown { 562 return d.doReboot(sigCh, d.requestedRestart, immediateShutdown, rebootWaitTimeout) 563 } 564 565 if d.restartSocket { 566 return ErrRestartSocket 567 } 568 569 return nil 570 } 571 572 func (d *Daemon) rebootDelay(immediate bool) (time.Duration, error) { 573 d.state.Lock() 574 defer d.state.Unlock() 575 now := time.Now() 576 // see whether a reboot had already been scheduled 577 var rebootAt time.Time 578 err := d.state.Get("daemon-system-restart-at", &rebootAt) 579 if err != nil && err != state.ErrNoState { 580 return 0, err 581 } 582 rebootDelay := 1 * time.Minute 583 if immediate { 584 rebootDelay = 0 585 } 586 if err == nil { 587 rebootDelay = rebootAt.Sub(now) 588 } else { 589 ovr := os.Getenv("SNAPD_REBOOT_DELAY") // for tests 590 if ovr != "" && !immediate { 591 d, err := time.ParseDuration(ovr) 592 if err == nil { 593 rebootDelay = d 594 } 595 } 596 rebootAt = now.Add(rebootDelay) 597 d.state.Set("daemon-system-restart-at", rebootAt) 598 } 599 return rebootDelay, nil 600 } 601 602 func (d *Daemon) doReboot(sigCh chan<- os.Signal, rst state.RestartType, immediate bool, waitTimeout time.Duration) error { 603 rebootDelay, err := d.rebootDelay(immediate) 604 if err != nil { 605 return err 606 } 607 action := rebootReboot 608 switch rst { 609 case state.RestartSystemHaltNow: 610 action = rebootHalt 611 case state.RestartSystemPoweroffNow: 612 action = rebootPoweroff 613 } 614 // ask for shutdown and wait for it to happen. 615 // if we exit snapd will be restared by systemd 616 if err := reboot(action, rebootDelay); err != nil { 617 return err 618 } 619 // wait for reboot to happen 620 logger.Noticef("Waiting for %s", action) 621 if sigCh != nil { 622 signal.Stop(sigCh) 623 if len(sigCh) > 0 { 624 // a signal arrived in between 625 return nil 626 } 627 close(sigCh) 628 } 629 time.Sleep(waitTimeout) 630 return fmt.Errorf("expected %s did not happen", action) 631 } 632 633 var ( 634 shutdownMsg = i18n.G("reboot scheduled to update the system") 635 haltMsg = i18n.G("system halt scheduled") 636 poweroffMsg = i18n.G("system poweroff scheduled") 637 ) 638 639 type rebootAction int 640 641 func (a rebootAction) String() string { 642 switch a { 643 case rebootReboot: 644 return "system reboot" 645 case rebootHalt: 646 return "system halt" 647 case rebootPoweroff: 648 return "system poweroff" 649 default: 650 panic(fmt.Sprintf("unknown reboot action %d", a)) 651 } 652 } 653 654 const ( 655 rebootReboot rebootAction = iota 656 rebootHalt 657 rebootPoweroff 658 ) 659 660 func rebootImpl(action rebootAction, rebootDelay time.Duration) error { 661 if rebootDelay < 0 { 662 rebootDelay = 0 663 } 664 mins := int64(rebootDelay / time.Minute) 665 var arg, msg string 666 switch action { 667 case rebootReboot: 668 arg = "-r" 669 msg = shutdownMsg 670 case rebootHalt: 671 arg = "--halt" 672 msg = haltMsg 673 case rebootPoweroff: 674 arg = "--poweroff" 675 msg = poweroffMsg 676 default: 677 return fmt.Errorf("unknown reboot action: %v", action) 678 } 679 cmd := exec.Command("shutdown", arg, fmt.Sprintf("+%d", mins), msg) 680 if out, err := cmd.CombinedOutput(); err != nil { 681 return osutil.OutputErr(out, err) 682 } 683 return nil 684 } 685 686 var reboot = rebootImpl 687 688 // Dying is a tomb-ish thing 689 func (d *Daemon) Dying() <-chan struct{} { 690 return d.tomb.Dying() 691 } 692 693 func clearReboot(st *state.State) { 694 st.Set("daemon-system-restart-at", nil) 695 st.Set("daemon-system-restart-tentative", nil) 696 } 697 698 // RebootAsExpected implements part of overlord.RestartBehavior. 699 func (d *Daemon) RebootAsExpected(st *state.State) error { 700 clearReboot(st) 701 return nil 702 } 703 704 // RebootDidNotHappen implements part of overlord.RestartBehavior. 705 func (d *Daemon) RebootDidNotHappen(st *state.State) error { 706 var nTentative int 707 err := st.Get("daemon-system-restart-tentative", &nTentative) 708 if err != nil && err != state.ErrNoState { 709 return err 710 } 711 nTentative++ 712 if nTentative > rebootMaxTentatives { 713 // giving up, proceed normally, some in-progress refresh 714 // might get rolled back!! 715 st.ClearReboot() 716 clearReboot(st) 717 logger.Noticef("snapd was restarted while a system restart was expected, snapd retried to schedule and waited again for a system restart %d times and is giving up", rebootMaxTentatives) 718 return nil 719 } 720 st.Set("daemon-system-restart-tentative", nTentative) 721 d.state = st 722 logger.Noticef("snapd was restarted while a system restart was expected, snapd will try to schedule and wait for a system restart again (tenative %d/%d)", nTentative, rebootMaxTentatives) 723 return state.ErrExpectedReboot 724 } 725 726 // New Daemon 727 func New() (*Daemon, error) { 728 d := &Daemon{} 729 ovld, err := overlord.New(d) 730 if err == state.ErrExpectedReboot { 731 // we proceed without overlord until we reach Stop 732 // where we will schedule and wait again for a system restart. 733 // ATM we cannot do that in New because we need to satisfy 734 // systemd notify mechanisms. 735 d.expectedRebootDidNotHappen = true 736 return d, nil 737 } 738 if err != nil { 739 return nil, err 740 } 741 d.overlord = ovld 742 d.state = ovld.State() 743 return d, nil 744 }