github.com/stulluk/snapd@v0.0.0-20210611110309-f6d5d5bd24b0/daemon/daemon.go (about) 1 // -*- Mode: Go; indent-tabs-mode: t -*- 2 3 /* 4 * Copyright (C) 2015-2021 Canonical Ltd 5 * 6 * This program is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License version 3 as 8 * published by the Free Software Foundation. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program. If not, see <http://www.gnu.org/licenses/>. 17 * 18 */ 19 20 package daemon 21 22 import ( 23 "bytes" 24 "context" 25 "encoding/json" 26 "fmt" 27 "net" 28 "net/http" 29 "os" 30 "os/exec" 31 "os/signal" 32 "strings" 33 "sync" 34 "time" 35 36 "github.com/gorilla/mux" 37 "gopkg.in/tomb.v2" 38 39 "github.com/snapcore/snapd/dirs" 40 "github.com/snapcore/snapd/i18n" 41 "github.com/snapcore/snapd/logger" 42 "github.com/snapcore/snapd/netutil" 43 "github.com/snapcore/snapd/osutil" 44 "github.com/snapcore/snapd/overlord" 45 "github.com/snapcore/snapd/overlord/auth" 46 "github.com/snapcore/snapd/overlord/standby" 47 "github.com/snapcore/snapd/overlord/state" 48 "github.com/snapcore/snapd/snapdenv" 49 "github.com/snapcore/snapd/store" 50 "github.com/snapcore/snapd/systemd" 51 ) 52 53 var ErrRestartSocket = fmt.Errorf("daemon stop requested to wait for socket activation") 54 55 var systemdSdNotify = systemd.SdNotify 56 57 const ( 58 daemonRestartMsg = "daemon is restarting" 59 systemRestartMsg = "system is restarting" 60 systemHaltMsg = "system is halting" 61 systemPoweroffMsg = "system is powering off" 62 socketRestartMsg = "daemon is stopping to wait for socket activation" 63 ) 64 65 // A Daemon listens for requests and routes them to the right command 66 type Daemon struct { 67 Version string 68 overlord *overlord.Overlord 69 state *state.State 70 snapdListener net.Listener 71 snapListener net.Listener 72 connTracker *connTracker 73 serve *http.Server 74 tomb tomb.Tomb 75 router *mux.Router 76 standbyOpinions *standby.StandbyOpinions 77 78 // set to what kind of restart was requested if any 79 requestedRestart state.RestartType 80 // set to remember that we need to exit the daemon in a way that 81 // prevents systemd from restarting it 82 restartSocket bool 83 // degradedErr is set when the daemon is in degraded mode 84 degradedErr error 85 86 expectedRebootDidNotHappen bool 87 88 mu sync.Mutex 89 } 90 91 // A ResponseFunc handles one of the individual verbs for a method 92 type ResponseFunc func(*Command, *http.Request, *auth.UserState) Response 93 94 // A Command routes a request to an individual per-verb ResponseFUnc 95 type Command struct { 96 Path string 97 PathPrefix string 98 // 99 GET ResponseFunc 100 PUT ResponseFunc 101 POST ResponseFunc 102 103 // Access control. 104 ReadAccess accessChecker 105 WriteAccess accessChecker 106 107 d *Daemon 108 } 109 110 func (c *Command) ServeHTTP(w http.ResponseWriter, r *http.Request) { 111 st := c.d.state 112 st.Lock() 113 // TODO Look at the error and fail if there's an attempt to authenticate with invalid data. 114 user, _ := userFromRequest(st, r) 115 st.Unlock() 116 117 // check if we are in degradedMode 118 if c.d.degradedErr != nil && r.Method != "GET" { 119 InternalError(c.d.degradedErr.Error()).ServeHTTP(w, r) 120 return 121 } 122 123 ucred, err := ucrednetGet(r.RemoteAddr) 124 if err != nil && err != errNoID { 125 logger.Noticef("unexpected error when attempting to get UID: %s", err) 126 InternalError(err.Error()).ServeHTTP(w, r) 127 return 128 } 129 130 ctx := store.WithClientUserAgent(r.Context(), r) 131 r = r.WithContext(ctx) 132 133 var rspf ResponseFunc 134 var access accessChecker 135 136 switch r.Method { 137 case "GET": 138 rspf = c.GET 139 access = c.ReadAccess 140 case "PUT": 141 rspf = c.PUT 142 access = c.WriteAccess 143 case "POST": 144 rspf = c.POST 145 access = c.WriteAccess 146 } 147 148 if rspf == nil { 149 MethodNotAllowed("method %q not allowed", r.Method).ServeHTTP(w, r) 150 return 151 } 152 153 switch access.CheckAccess(r, ucred, user) { 154 case accessOK: 155 // nothing 156 case accessUnauthorized: 157 Unauthorized("access denied").ServeHTTP(w, r) 158 return 159 case accessForbidden: 160 Forbidden("access denied").ServeHTTP(w, r) 161 return 162 case accessCancelled: 163 AuthCancelled("cancelled").ServeHTTP(w, r) 164 return 165 } 166 167 rsp := rspf(c, r, user) 168 169 if srsp, ok := rsp.(StructuredResponse); ok { 170 rjson := srsp.JSON() 171 172 _, rst := st.Restarting() 173 rjson.addMaintenanceFromRestartType(rst) 174 175 if rjson.Type != ResponseTypeError { 176 st.Lock() 177 count, stamp := st.WarningsSummary() 178 st.Unlock() 179 rjson.addWarningCount(count, stamp) 180 } 181 182 // serve the updated serialisation 183 rsp = rjson 184 } 185 186 rsp.ServeHTTP(w, r) 187 } 188 189 type wrappedWriter struct { 190 w http.ResponseWriter 191 s int 192 } 193 194 func (w *wrappedWriter) Header() http.Header { 195 return w.w.Header() 196 } 197 198 func (w *wrappedWriter) Write(bs []byte) (int, error) { 199 return w.w.Write(bs) 200 } 201 202 func (w *wrappedWriter) WriteHeader(s int) { 203 w.w.WriteHeader(s) 204 w.s = s 205 } 206 207 func (w *wrappedWriter) Flush() { 208 if f, ok := w.w.(http.Flusher); ok { 209 f.Flush() 210 } 211 } 212 213 func logit(handler http.Handler) http.Handler { 214 return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 215 ww := &wrappedWriter{w: w} 216 t0 := time.Now() 217 handler.ServeHTTP(ww, r) 218 t := time.Since(t0) 219 url := r.URL.String() 220 if !strings.Contains(url, "/changes/") { 221 logger.Debugf("%s %s %s %s %d", r.RemoteAddr, r.Method, r.URL, t, ww.s) 222 } 223 }) 224 } 225 226 // Init sets up the Daemon's internal workings. 227 // Don't call more than once. 228 func (d *Daemon) Init() error { 229 listenerMap, err := netutil.ActivationListeners() 230 if err != nil { 231 return err 232 } 233 234 // The SnapdSocket is required-- without it, die. 235 if listener, err := netutil.GetListener(dirs.SnapdSocket, listenerMap); err == nil { 236 d.snapdListener = &ucrednetListener{Listener: listener} 237 } else { 238 return fmt.Errorf("when trying to listen on %s: %v", dirs.SnapdSocket, err) 239 } 240 241 if listener, err := netutil.GetListener(dirs.SnapSocket, listenerMap); err == nil { 242 // This listener may also be nil if that socket wasn't among 243 // the listeners, so check it before using it. 244 d.snapListener = &ucrednetListener{Listener: listener} 245 } else { 246 logger.Debugf("cannot get listener for %q: %v", dirs.SnapSocket, err) 247 } 248 249 d.addRoutes() 250 251 logger.Noticef("started %v.", snapdenv.UserAgent()) 252 253 return nil 254 } 255 256 // SetDegradedMode puts the daemon into an degraded mode which will the 257 // error given in the "err" argument for commands that are not marked 258 // as readonlyOK. 259 // 260 // This is useful to report errors to the client when the daemon 261 // cannot work because e.g. a sanity check failed or the system is out 262 // of diskspace. 263 // 264 // When the system is fine again calling "DegradedMode(nil)" is enough 265 // to put the daemon into full operation again. 266 func (d *Daemon) SetDegradedMode(err error) { 267 d.degradedErr = err 268 } 269 270 func (d *Daemon) addRoutes() { 271 d.router = mux.NewRouter() 272 273 for _, c := range api { 274 c.d = d 275 if c.PathPrefix == "" { 276 d.router.Handle(c.Path, c).Name(c.Path) 277 } else { 278 d.router.PathPrefix(c.PathPrefix).Handler(c).Name(c.PathPrefix) 279 } 280 } 281 282 // also maybe add a /favicon.ico handler... 283 284 d.router.NotFoundHandler = NotFound("not found") 285 } 286 287 var ( 288 shutdownTimeout = 25 * time.Second 289 ) 290 291 type connTracker struct { 292 mu sync.Mutex 293 conns map[net.Conn]struct{} 294 } 295 296 func (ct *connTracker) CanStandby() bool { 297 ct.mu.Lock() 298 defer ct.mu.Unlock() 299 300 return len(ct.conns) == 0 301 } 302 303 func (ct *connTracker) trackConn(conn net.Conn, state http.ConnState) { 304 ct.mu.Lock() 305 defer ct.mu.Unlock() 306 // we ignore hijacked connections, if we do things with websockets 307 // we'll need custom shutdown handling for them 308 if state == http.StateNew || state == http.StateActive { 309 ct.conns[conn] = struct{}{} 310 } else { 311 delete(ct.conns, conn) 312 } 313 } 314 315 func (d *Daemon) initStandbyHandling() { 316 d.standbyOpinions = standby.New(d.state) 317 d.standbyOpinions.AddOpinion(d.connTracker) 318 d.standbyOpinions.AddOpinion(d.overlord) 319 d.standbyOpinions.AddOpinion(d.overlord.SnapManager()) 320 d.standbyOpinions.AddOpinion(d.overlord.DeviceManager()) 321 d.standbyOpinions.Start() 322 } 323 324 // Start the Daemon 325 func (d *Daemon) Start() error { 326 if d.expectedRebootDidNotHappen { 327 // we need to schedule and wait for a system restart 328 d.tomb.Kill(nil) 329 // avoid systemd killing us again while we wait 330 systemdSdNotify("READY=1") 331 return nil 332 } 333 if d.overlord == nil { 334 panic("internal error: no Overlord") 335 } 336 337 to, reasoning, err := d.overlord.StartupTimeout() 338 if err != nil { 339 return err 340 } 341 if to > 0 { 342 to = to.Round(time.Microsecond) 343 us := to.Nanoseconds() / 1000 344 logger.Noticef("adjusting startup timeout by %v (%s)", to, reasoning) 345 systemdSdNotify(fmt.Sprintf("EXTEND_TIMEOUT_USEC=%d", us)) 346 } 347 // now perform expensive overlord/manages initiliazation 348 if err := d.overlord.StartUp(); err != nil { 349 return err 350 } 351 352 d.connTracker = &connTracker{conns: make(map[net.Conn]struct{})} 353 d.serve = &http.Server{ 354 Handler: logit(d.router), 355 ConnState: d.connTracker.trackConn, 356 } 357 358 // enable standby handling 359 d.initStandbyHandling() 360 361 // before serving actual connections remove the maintenance.json file as we 362 // are no longer down for maintenance, this state most closely corresponds 363 // to state.RestartUnset 364 if err := d.updateMaintenanceFile(state.RestartUnset); err != nil { 365 return err 366 } 367 368 // the loop runs in its own goroutine 369 d.overlord.Loop() 370 371 d.tomb.Go(func() error { 372 if d.snapListener != nil { 373 d.tomb.Go(func() error { 374 if err := d.serve.Serve(d.snapListener); err != http.ErrServerClosed && d.tomb.Err() == tomb.ErrStillAlive { 375 return err 376 } 377 378 return nil 379 }) 380 } 381 382 if err := d.serve.Serve(d.snapdListener); err != http.ErrServerClosed && d.tomb.Err() == tomb.ErrStillAlive { 383 return err 384 } 385 386 return nil 387 }) 388 389 // notify systemd that we are ready 390 systemdSdNotify("READY=1") 391 return nil 392 } 393 394 // HandleRestart implements overlord.RestartBehavior. 395 func (d *Daemon) HandleRestart(t state.RestartType) { 396 d.mu.Lock() 397 defer d.mu.Unlock() 398 399 scheduleFallback := func(a rebootAction) { 400 if err := reboot(a, rebootWaitTimeout); err != nil { 401 logger.Noticef("%s", err) 402 } 403 } 404 405 // die when asked to restart (systemd should get us back up!) etc 406 switch t { 407 case state.RestartDaemon: 408 // save the restart kind to write out a maintenance.json in a bit 409 d.requestedRestart = t 410 case state.RestartSystem, state.RestartSystemNow: 411 // try to schedule a fallback slow reboot already here 412 // in case we get stuck shutting down 413 414 // save the restart kind to write out a maintenance.json in a bit 415 scheduleFallback(rebootReboot) 416 d.requestedRestart = t 417 case state.RestartSystemHaltNow: 418 scheduleFallback(rebootHalt) 419 d.requestedRestart = t 420 case state.RestartSystemPoweroffNow: 421 scheduleFallback(rebootPoweroff) 422 d.requestedRestart = t 423 case state.RestartSocket: 424 // save the restart kind to write out a maintenance.json in a bit 425 d.requestedRestart = t 426 d.restartSocket = true 427 case state.StopDaemon: 428 logger.Noticef("stopping snapd as requested") 429 default: 430 logger.Noticef("internal error: restart handler called with unknown restart type: %v", t) 431 } 432 433 d.tomb.Kill(nil) 434 } 435 436 var ( 437 rebootNoticeWait = 3 * time.Second 438 rebootWaitTimeout = 10 * time.Minute 439 rebootRetryWaitTimeout = 5 * time.Minute 440 rebootMaxTentatives = 3 441 ) 442 443 func (d *Daemon) updateMaintenanceFile(rst state.RestartType) error { 444 // for unset restart, just remove the maintenance.json file 445 if rst == state.RestartUnset { 446 err := os.Remove(dirs.SnapdMaintenanceFile) 447 // only return err if the error was something other than the file not 448 // existing 449 if err != nil && !os.IsNotExist(err) { 450 return err 451 } 452 return nil 453 } 454 455 // otherwise marshal and write it out appropriately 456 b, err := json.Marshal(maintenanceForRestartType(rst)) 457 if err != nil { 458 return err 459 } 460 461 return osutil.AtomicWrite(dirs.SnapdMaintenanceFile, bytes.NewBuffer(b), 0644, 0) 462 } 463 464 // Stop shuts down the Daemon 465 func (d *Daemon) Stop(sigCh chan<- os.Signal) error { 466 // we need to schedule/wait for a system restart again 467 if d.expectedRebootDidNotHappen { 468 // make the reboot retry immediate 469 immediateReboot := true 470 return d.doReboot(sigCh, state.RestartSystem, immediateReboot, rebootRetryWaitTimeout) 471 } 472 if d.overlord == nil { 473 return fmt.Errorf("internal error: no Overlord") 474 } 475 476 d.tomb.Kill(nil) 477 478 // check the state associated with a potential restart with the lock to 479 // prevent races 480 d.mu.Lock() 481 // needsFullShutdown is whether the entire system will 482 // shutdown or not as a consequence of this request 483 needsFullShutdown := false 484 switch d.requestedRestart { 485 case state.RestartSystem, state.RestartSystemNow, state.RestartSystemHaltNow, state.RestartSystemPoweroffNow: 486 needsFullShutdown = true 487 } 488 immediateShutdown := false 489 switch d.requestedRestart { 490 case state.RestartSystemNow, state.RestartSystemHaltNow, state.RestartSystemPoweroffNow: 491 immediateShutdown = true 492 } 493 restartSocket := d.restartSocket 494 d.mu.Unlock() 495 496 // before not accepting any new client connections we need to write the 497 // maintenance.json file for potential clients to see after the daemon stops 498 // responding so they can read it correctly and handle the maintenance 499 if err := d.updateMaintenanceFile(d.requestedRestart); err != nil { 500 logger.Noticef("error writing maintenance file: %v", err) 501 } 502 503 d.snapdListener.Close() 504 d.standbyOpinions.Stop() 505 506 if d.snapListener != nil { 507 // stop running hooks first 508 // and do it more gracefully if we are restarting 509 hookMgr := d.overlord.HookManager() 510 if ok, _ := d.state.Restarting(); ok { 511 logger.Noticef("gracefully waiting for running hooks") 512 hookMgr.GracefullyWaitRunningHooks() 513 logger.Noticef("done waiting for running hooks") 514 } 515 hookMgr.StopHooks() 516 d.snapListener.Close() 517 } 518 519 if needsFullShutdown { 520 // give time to polling clients to notice restart 521 time.Sleep(rebootNoticeWait) 522 } 523 524 // We're using the background context here because the tomb's 525 // context will likely already have been cancelled when we are 526 // called. 527 ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout) 528 d.tomb.Kill(d.serve.Shutdown(ctx)) 529 cancel() 530 531 if !needsFullShutdown { 532 // tell systemd that we are stopping 533 systemdSdNotify("STOPPING=1") 534 } 535 536 if restartSocket { 537 // At this point we processed all open requests (and 538 // stopped accepting new requests) - before going into 539 // socket activated mode we need to check if any of 540 // those open requests resulted in something that 541 // prevents us from going into socket activation mode. 542 // 543 // If this is the case we do a "normal" snapd restart 544 // to process the new changes. 545 if !d.standbyOpinions.CanStandby() { 546 d.restartSocket = false 547 } 548 } 549 d.overlord.Stop() 550 551 if err := d.tomb.Wait(); err != nil { 552 if err == context.DeadlineExceeded { 553 logger.Noticef("WARNING: cannot gracefully shut down in-flight snapd API activity within: %v", shutdownTimeout) 554 // the process is shutting down anyway, so we may just 555 // as well close the active connections right now 556 d.serve.Close() 557 } else { 558 // do not stop the shutdown even if the tomb errors 559 // because we already scheduled a slow shutdown and 560 // exiting here will just restart snapd (via systemd) 561 // which will lead to confusing results. 562 if needsFullShutdown { 563 logger.Noticef("WARNING: cannot stop daemon: %v", err) 564 } else { 565 return err 566 } 567 } 568 } 569 570 if needsFullShutdown { 571 return d.doReboot(sigCh, d.requestedRestart, immediateShutdown, rebootWaitTimeout) 572 } 573 574 if d.restartSocket { 575 return ErrRestartSocket 576 } 577 578 return nil 579 } 580 581 func (d *Daemon) rebootDelay(immediate bool) (time.Duration, error) { 582 d.state.Lock() 583 defer d.state.Unlock() 584 now := time.Now() 585 // see whether a reboot had already been scheduled 586 var rebootAt time.Time 587 err := d.state.Get("daemon-system-restart-at", &rebootAt) 588 if err != nil && err != state.ErrNoState { 589 return 0, err 590 } 591 rebootDelay := 1 * time.Minute 592 if immediate { 593 rebootDelay = 0 594 } 595 if err == nil { 596 rebootDelay = rebootAt.Sub(now) 597 } else { 598 ovr := os.Getenv("SNAPD_REBOOT_DELAY") // for tests 599 if ovr != "" && !immediate { 600 d, err := time.ParseDuration(ovr) 601 if err == nil { 602 rebootDelay = d 603 } 604 } 605 rebootAt = now.Add(rebootDelay) 606 d.state.Set("daemon-system-restart-at", rebootAt) 607 } 608 return rebootDelay, nil 609 } 610 611 func (d *Daemon) doReboot(sigCh chan<- os.Signal, rst state.RestartType, immediate bool, waitTimeout time.Duration) error { 612 rebootDelay, err := d.rebootDelay(immediate) 613 if err != nil { 614 return err 615 } 616 action := rebootReboot 617 switch rst { 618 case state.RestartSystemHaltNow: 619 action = rebootHalt 620 case state.RestartSystemPoweroffNow: 621 action = rebootPoweroff 622 } 623 // ask for shutdown and wait for it to happen. 624 // if we exit snapd will be restared by systemd 625 if err := reboot(action, rebootDelay); err != nil { 626 return err 627 } 628 // wait for reboot to happen 629 logger.Noticef("Waiting for %s", action) 630 if sigCh != nil { 631 signal.Stop(sigCh) 632 if len(sigCh) > 0 { 633 // a signal arrived in between 634 return nil 635 } 636 close(sigCh) 637 } 638 time.Sleep(waitTimeout) 639 return fmt.Errorf("expected %s did not happen", action) 640 } 641 642 var ( 643 shutdownMsg = i18n.G("reboot scheduled to update the system") 644 haltMsg = i18n.G("system halt scheduled") 645 poweroffMsg = i18n.G("system poweroff scheduled") 646 ) 647 648 type rebootAction int 649 650 func (a rebootAction) String() string { 651 switch a { 652 case rebootReboot: 653 return "system reboot" 654 case rebootHalt: 655 return "system halt" 656 case rebootPoweroff: 657 return "system poweroff" 658 default: 659 panic(fmt.Sprintf("unknown reboot action %d", a)) 660 } 661 } 662 663 const ( 664 rebootReboot rebootAction = iota 665 rebootHalt 666 rebootPoweroff 667 ) 668 669 func rebootImpl(action rebootAction, rebootDelay time.Duration) error { 670 if rebootDelay < 0 { 671 rebootDelay = 0 672 } 673 mins := int64(rebootDelay / time.Minute) 674 var arg, msg string 675 switch action { 676 case rebootReboot: 677 arg = "-r" 678 msg = shutdownMsg 679 case rebootHalt: 680 arg = "--halt" 681 msg = haltMsg 682 case rebootPoweroff: 683 arg = "--poweroff" 684 msg = poweroffMsg 685 default: 686 return fmt.Errorf("unknown reboot action: %v", action) 687 } 688 cmd := exec.Command("shutdown", arg, fmt.Sprintf("+%d", mins), msg) 689 if out, err := cmd.CombinedOutput(); err != nil { 690 return osutil.OutputErr(out, err) 691 } 692 return nil 693 } 694 695 var reboot = rebootImpl 696 697 // Dying is a tomb-ish thing 698 func (d *Daemon) Dying() <-chan struct{} { 699 return d.tomb.Dying() 700 } 701 702 func clearReboot(st *state.State) { 703 st.Set("daemon-system-restart-at", nil) 704 st.Set("daemon-system-restart-tentative", nil) 705 } 706 707 // RebootAsExpected implements part of overlord.RestartBehavior. 708 func (d *Daemon) RebootAsExpected(st *state.State) error { 709 clearReboot(st) 710 return nil 711 } 712 713 // RebootDidNotHappen implements part of overlord.RestartBehavior. 714 func (d *Daemon) RebootDidNotHappen(st *state.State) error { 715 var nTentative int 716 err := st.Get("daemon-system-restart-tentative", &nTentative) 717 if err != nil && err != state.ErrNoState { 718 return err 719 } 720 nTentative++ 721 if nTentative > rebootMaxTentatives { 722 // giving up, proceed normally, some in-progress refresh 723 // might get rolled back!! 724 st.ClearReboot() 725 clearReboot(st) 726 logger.Noticef("snapd was restarted while a system restart was expected, snapd retried to schedule and waited again for a system restart %d times and is giving up", rebootMaxTentatives) 727 return nil 728 } 729 st.Set("daemon-system-restart-tentative", nTentative) 730 d.state = st 731 logger.Noticef("snapd was restarted while a system restart was expected, snapd will try to schedule and wait for a system restart again (tenative %d/%d)", nTentative, rebootMaxTentatives) 732 return state.ErrExpectedReboot 733 } 734 735 // New Daemon 736 func New() (*Daemon, error) { 737 d := &Daemon{} 738 ovld, err := overlord.New(d) 739 if err == state.ErrExpectedReboot { 740 // we proceed without overlord until we reach Stop 741 // where we will schedule and wait again for a system restart. 742 // ATM we cannot do that in New because we need to satisfy 743 // systemd notify mechanisms. 744 d.expectedRebootDidNotHappen = true 745 return d, nil 746 } 747 if err != nil { 748 return nil, err 749 } 750 d.overlord = ovld 751 d.state = ovld.State() 752 return d, nil 753 }