github.com/david-imola/snapd@v0.0.0-20210611180407-2de8ddeece6d/overlord/servicestate/servicemgr.go (about)

     1  // -*- Mode: Go; indent-tabs-mode: t -*-
     2  
     3  /*
     4   * Copyright (C) 2021 Canonical Ltd
     5   *
     6   * This program is free software: you can redistribute it and/or modify
     7   * it under the terms of the GNU General Public License version 3 as
     8   * published by the Free Software Foundation.
     9   *
    10   * This program is distributed in the hope that it will be useful,
    11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13   * GNU General Public License for more details.
    14   *
    15   * You should have received a copy of the GNU General Public License
    16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17   *
    18   */
    19  
    20  package servicestate
    21  
    22  import (
    23  	"fmt"
    24  	"os"
    25  	"os/exec"
    26  	"path/filepath"
    27  	"strings"
    28  	"time"
    29  
    30  	"github.com/snapcore/snapd/dirs"
    31  	"github.com/snapcore/snapd/logger"
    32  	"github.com/snapcore/snapd/osutil"
    33  	"github.com/snapcore/snapd/overlord/snapstate"
    34  	"github.com/snapcore/snapd/overlord/state"
    35  	"github.com/snapcore/snapd/progress"
    36  	"github.com/snapcore/snapd/snap"
    37  	"github.com/snapcore/snapd/snap/quota"
    38  	"github.com/snapcore/snapd/snapdenv"
    39  	"github.com/snapcore/snapd/systemd"
    40  	"github.com/snapcore/snapd/timings"
    41  	"github.com/snapcore/snapd/wrappers"
    42  )
    43  
// ServiceManager is responsible for starting and stopping snap services.
type ServiceManager struct {
	// state is the state this manager works on; ensureSnapServicesUpdated
	// holds its lock for the whole duration of the call.
	state *state.State

	// ensuredSnapSvcs is set once ensureSnapServicesUpdated has completed
	// successfully, so that subsequent Ensure calls return early.
	ensuredSnapSvcs bool
}
    50  
    51  // Manager returns a new service manager.
    52  func Manager(st *state.State, runner *state.TaskRunner) *ServiceManager {
    53  	delayedCrossMgrInit()
    54  	m := &ServiceManager{
    55  		state: st,
    56  	}
    57  	// TODO: undo handler
    58  	runner.AddHandler("service-control", m.doServiceControl, nil)
    59  	return m
    60  }
    61  
    62  func MockEnsuredSnapServices(mgr *ServiceManager, ensured bool) (restore func()) {
    63  	osutil.MustBeTestBinary("ensured snap services can only be mocked from tests")
    64  	old := mgr.ensuredSnapSvcs
    65  	mgr.ensuredSnapSvcs = ensured
    66  	return func() {
    67  		mgr.ensuredSnapSvcs = old
    68  	}
    69  }
    70  
    71  func (m *ServiceManager) ensureSnapServicesUpdated() (err error) {
    72  	m.state.Lock()
    73  	defer m.state.Unlock()
    74  	if m.ensuredSnapSvcs {
    75  		return nil
    76  	}
    77  
    78  	// only run after we are seeded
    79  	var seeded bool
    80  	err = m.state.Get("seeded", &seeded)
    81  	if err != nil && err != state.ErrNoState {
    82  		return err
    83  	}
    84  	if !seeded {
    85  		return nil
    86  	}
    87  
    88  	// we are seeded, now we need to find all snap services and re-generate
    89  	// services as necessary
    90  
    91  	// ensure all snap services are updated
    92  	allStates, err := snapstate.All(m.state)
    93  	if err != nil && err != state.ErrNoState {
    94  		return err
    95  	}
    96  
    97  	// if we have no snaps we can exit early
    98  	if len(allStates) == 0 {
    99  		m.ensuredSnapSvcs = true
   100  		return nil
   101  	}
   102  
   103  	allGrps, err := AllQuotas(m.state)
   104  	if err != nil && err != state.ErrNoState {
   105  		return err
   106  	}
   107  
   108  	snapsMap := map[*snap.Info]*wrappers.SnapServiceOptions{}
   109  
   110  	for _, snapSt := range allStates {
   111  		info, err := snapSt.CurrentInfo()
   112  		if err != nil {
   113  			return err
   114  		}
   115  
   116  		// don't use EnsureSnapServices with the snapd snap
   117  		if info.Type() == snap.TypeSnapd {
   118  			continue
   119  		}
   120  
   121  		// use the cached copy of all quota groups
   122  		snapSvcOpts, err := SnapServiceOptions(m.state, info.InstanceName(), allGrps)
   123  		if err != nil {
   124  			return err
   125  		}
   126  		snapsMap[info] = snapSvcOpts
   127  	}
   128  
   129  	// setup ensure options
   130  	ensureOpts := &wrappers.EnsureSnapServicesOptions{
   131  		Preseeding: snapdenv.Preseeding(),
   132  	}
   133  
   134  	// set RequireMountedSnapdSnap if we are on UC18+ only
   135  	deviceCtx, err := snapstate.DeviceCtx(m.state, nil, nil)
   136  	if err != nil {
   137  		return err
   138  	}
   139  
   140  	if !deviceCtx.Classic() && deviceCtx.Model().Base() != "" {
   141  		ensureOpts.RequireMountedSnapdSnap = true
   142  	}
   143  
   144  	rewrittenServices := make(map[*snap.Info][]*snap.AppInfo)
   145  	serviceKillingMightHaveOccurred := false
   146  	observeChange := func(app *snap.AppInfo, _ *quota.Group, unitType, name string, old, new string) {
   147  		if unitType == "service" {
   148  			rewrittenServices[app.Snap] = append(rewrittenServices[app.Snap], app)
   149  			if !serviceKillingMightHaveOccurred {
   150  				if strings.Contains(old, "\nRequires=usr-lib-snapd.mount\n") {
   151  					serviceKillingMightHaveOccurred = true
   152  				}
   153  			}
   154  		}
   155  	}
   156  
   157  	err = wrappers.EnsureSnapServices(snapsMap, ensureOpts, observeChange, progress.Null)
   158  	if err != nil {
   159  		return err
   160  	}
   161  
   162  	// if nothing was modified or we are not on UC18+, we are done
   163  	if len(rewrittenServices) == 0 || deviceCtx.Classic() || deviceCtx.Model().Base() == "" || !serviceKillingMightHaveOccurred {
   164  		m.ensuredSnapSvcs = true
   165  		return nil
   166  	}
   167  
   168  	// otherwise, we know now that we have rewritten some snap services, we need
   169  	// to handle the case of LP #1924805, and restart any services that were
   170  	// accidentally killed when we refreshed snapd
   171  	if err := restartServicesKilledInSnapdSnapRefresh(rewrittenServices); err != nil {
   172  		// we failed to restart services that were killed by a snapd refresh, so
   173  		// we need to immediately reboot in the hopes that this restores
   174  		// services to a functioning state
   175  
   176  		m.state.RequestRestart(state.RestartSystemNow)
   177  		return fmt.Errorf("error trying to restart killed services, immediately rebooting: %v", err)
   178  	}
   179  
   180  	m.ensuredSnapSvcs = true
   181  
   182  	return nil
   183  }
   184  
   185  // Ensure implements StateManager.Ensure.
   186  func (m *ServiceManager) Ensure() error {
   187  	if err := m.ensureSnapServicesUpdated(); err != nil {
   188  		return err
   189  	}
   190  	return nil
   191  }
   192  
// delayedCrossMgrInit installs this package's hooks into snapstate. It is
// called by Manager before the ServiceManager is created.
func delayedCrossMgrInit() {
	// hook into conflict checks mechanisms: tasks carrying a "service-action"
	// attribute get their affected snaps from serviceControlAffectedSnaps
	snapstate.AddAffectedSnapsByAttr("service-action", serviceControlAffectedSnaps)
	// point snapstate's package-level hooks at the service options and quota
	// helpers of this package
	snapstate.SnapServiceOptions = SnapServiceOptions
	snapstate.EnsureSnapAbsentFromQuotaGroup = EnsureSnapAbsentFromQuota
}
   199  
   200  func serviceControlAffectedSnaps(t *state.Task) ([]string, error) {
   201  	var serviceAction ServiceAction
   202  	if err := t.Get("service-action", &serviceAction); err != nil {
   203  		return nil, fmt.Errorf("internal error: cannot obtain service action from task: %s", t.Summary())
   204  	}
   205  	return []string{serviceAction.SnapName}, nil
   206  }
   207  
   208  func getBootTime() (time.Time, error) {
   209  	cmd := exec.Command("uptime", "-s")
   210  	cmd.Env = append(cmd.Env, "TZ=UTC")
   211  	out, err := cmd.CombinedOutput()
   212  	if err != nil {
   213  		return time.Time{}, osutil.OutputErr(out, err)
   214  	}
   215  
   216  	// parse the output from the command as a time
   217  	t, err := time.ParseInLocation("2006-01-02 15:04:05", strings.TrimSpace(string(out)), time.UTC)
   218  	if err != nil {
   219  		return time.Time{}, err
   220  	}
   221  
   222  	return t, nil
   223  }
   224  
   225  func restartServicesKilledInSnapdSnapRefresh(modified map[*snap.Info][]*snap.AppInfo) error {
   226  	// we decide on which services to restart by identifying (out of the set of
   227  	// services we just modified) services that were stopped after
   228  	// usr-lib-snapd.mount was written, but before usr-lib-snapd.mount was last
   229  	// stopped - this is the time window in which snapd (accidentally) killed
   230  	// all snap services using Requires=, see LP #1924805 for full details, so
   231  	// we need to undo that by restarting those snaps
   232  
   233  	st, err := os.Stat(filepath.Join(dirs.SnapServicesDir, wrappers.SnapdToolingMountUnit))
   234  	if err != nil {
   235  		return err
   236  	}
   237  
   238  	// always truncate all times to second precision, since that is the least
   239  	// precise time we have of all the times we consider, due to using systemctl
   240  	// for getting the InactiveEnterTimestamp for systemd units
   241  	// TODO: we should switch back to using D-Bus for this, where we get much
   242  	// more accurate times, down to the microsecond, which is the same precision
   243  	// we have for the modification time here, and thus we can more easily avoid
   244  	// the truncation issue, and we can ensure that we are minimizing the risk
   245  	// of inadvertently starting services that just so happened to have been
   246  	// stopped in the same second that we modified and usr-lib-snapd.mount.
   247  	lowerTimeBound := st.ModTime().Truncate(time.Second)
   248  
   249  	// if the time that the usr-lib-snapd.mount was modified is before the time
   250  	// that this device was booted up, then we can skip this since we know we
   251  	// that a refresh is not being performed
   252  	bootTime, err := getBootTime()
   253  	if err != nil {
   254  		// don't fail if we can't get the boot time, if we don't get it the
   255  		// below check will be always false (no time can be before zero time)
   256  		logger.Noticef("error getting boot time: %v", err)
   257  	}
   258  
   259  	if lowerTimeBound.Before(bootTime) {
   260  		return nil
   261  	}
   262  
   263  	// Get the InactiveEnterTimestamp property for the usr-lib-snapd.mount unit,
   264  	// this is the time that usr-lib-snapd.mount was transitioned from
   265  	// deactivating to inactive and was done being started. This is the correct
   266  	// upper bound for our window in which systemd killed snap services because
   267  	// systemd orders the transactions when we stop usr-lib-snapd.mount thusly:
   268  	//
   269  	// 1. Find all units which have Requires=usr-lib-snapd.mount (all snap
   270  	//    services which would have been refreshed during snapd 2.49.2)
   271  	// 2. Stop all such services found in 1.
   272  	// 3. Stop usr-lib-snapd.mount itself.
   273  	//
   274  	// Thus the time after all the services were killed is given by the time
   275  	// that systemd transitioned usr-lib-snapd.mount to inactive, which is given
   276  	// by InactiveEnterTimestamp.
   277  
   278  	// TODO: pass a real interactor here?
   279  	sysd := systemd.New(systemd.SystemMode, progress.Null)
   280  
   281  	upperTimeBound, err := sysd.InactiveEnterTimestamp(wrappers.SnapdToolingMountUnit)
   282  	if err != nil {
   283  		return err
   284  	}
   285  
   286  	if upperTimeBound.IsZero() {
   287  		// this means that the usr-lib-snapd.mount unit never exited during this
   288  		// boot, which means we are done in this ensure because the bug we care
   289  		// about (LP #1924805) here was never triggered
   290  		return nil
   291  	}
   292  
   293  	upperTimeBound = upperTimeBound.Truncate(time.Second)
   294  
   295  	// if the lower time bound is ever in the future past the upperTimeBound,
   296  	// then  just use the upperTimeBound as both limits, since we know that the
   297  	// upper bound and the time for each service being stopped are of the same
   298  	// precision
   299  	if lowerTimeBound.After(upperTimeBound) {
   300  		lowerTimeBound = upperTimeBound
   301  	}
   302  
   303  	candidateAppsToRestartBySnap := make(map[*snap.Info][]*snap.AppInfo)
   304  
   305  	for sn, apps := range modified {
   306  		for _, app := range apps {
   307  			// get the InactiveEnterTimestamp for the service
   308  			t, err := sysd.InactiveEnterTimestamp(app.ServiceName())
   309  			if err != nil {
   310  				return err
   311  			}
   312  
   313  			// always truncate to second precision
   314  			t = t.Truncate(time.Second)
   315  
   316  			// check if this unit entered the inactive state between the time
   317  			// range, but be careful about time precision here, we want an
   318  			// inclusive range i.e. [lower,upper] not (lower,upper) in case the
   319  			// time that systemd saves these events as is imprecise or slow and
   320  			// things get saved as having happened at the exact same time
   321  			if !t.Before(lowerTimeBound) && !t.After(upperTimeBound) {
   322  				candidateAppsToRestartBySnap[sn] = append(candidateAppsToRestartBySnap[sn], app)
   323  			}
   324  		}
   325  	}
   326  
   327  	// Second loop actually restarts the services per-snap by sorting them and
   328  	// removing disabled services. Note that we could have disabled services
   329  	// here because a service could have been running, but disabled when snapd
   330  	// was refreshed, hence it got killed, but we don't want to restart it,
   331  	// since it is disabled, and so that disabled running service is just SOL.
   332  	for sn, apps := range candidateAppsToRestartBySnap {
   333  		// TODO: should we try to start as many services as possible here before
   334  		// giving up given the severity of the bug?
   335  		disabledSvcs, err := wrappers.QueryDisabledServices(sn, progress.Null)
   336  		if err != nil {
   337  			return err
   338  		}
   339  
   340  		startupOrdered, err := snap.SortServices(apps)
   341  		if err != nil {
   342  			return err
   343  		}
   344  
   345  		// TODO: what to do about timings here?
   346  		nullPerfTimings := &timings.Timings{}
   347  		if err := wrappers.StartServices(startupOrdered, disabledSvcs, nil, progress.Null, nullPerfTimings); err != nil {
   348  			return err
   349  		}
   350  	}
   351  
   352  	return nil
   353  }