gopkg.in/ubuntu-core/snappy.v0@v0.0.0-20210902073436-25a8614f10a6/overlord/servicestate/servicemgr.go (about)

     1  // -*- Mode: Go; indent-tabs-mode: t -*-
     2  
     3  /*
     4   * Copyright (C) 2021 Canonical Ltd
     5   *
     6   * This program is free software: you can redistribute it and/or modify
     7   * it under the terms of the GNU General Public License version 3 as
     8   * published by the Free Software Foundation.
     9   *
    10   * This program is distributed in the hope that it will be useful,
    11   * but WITHOUT ANY WARRANTY; without even the implied warranty of
    12   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    13   * GNU General Public License for more details.
    14   *
    15   * You should have received a copy of the GNU General Public License
    16   * along with this program.  If not, see <http://www.gnu.org/licenses/>.
    17   *
    18   */
    19  
    20  package servicestate
    21  
    22  import (
    23  	"fmt"
    24  	"os"
    25  	"os/exec"
    26  	"path/filepath"
    27  	"strings"
    28  	"time"
    29  
    30  	"github.com/snapcore/snapd/dirs"
    31  	"github.com/snapcore/snapd/logger"
    32  	"github.com/snapcore/snapd/osutil"
    33  	"github.com/snapcore/snapd/overlord/snapstate"
    34  	"github.com/snapcore/snapd/overlord/state"
    35  	"github.com/snapcore/snapd/progress"
    36  	"github.com/snapcore/snapd/snap"
    37  	"github.com/snapcore/snapd/snap/quota"
    38  	"github.com/snapcore/snapd/snapdenv"
    39  	"github.com/snapcore/snapd/systemd"
    40  	"github.com/snapcore/snapd/timings"
    41  	"github.com/snapcore/snapd/wrappers"
    42  )
    43  
// ServiceManager is responsible for starting and stopping snap services.
type ServiceManager struct {
	// state is the state that is locked and queried while ensuring that the
	// snap services are up to date.
	state *state.State

	// ensuredSnapSvcs is set once ensureSnapServicesUpdated has run to
	// completion, it makes subsequent runs return early.
	ensuredSnapSvcs bool
}
    50  
    51  // Manager returns a new service manager.
    52  func Manager(st *state.State, runner *state.TaskRunner) *ServiceManager {
    53  	delayedCrossMgrInit()
    54  	m := &ServiceManager{
    55  		state: st,
    56  	}
    57  	// TODO: undo handler
    58  	runner.AddHandler("service-control", m.doServiceControl, nil)
    59  
    60  	// TODO: undo handler
    61  	runner.AddHandler("quota-control", m.doQuotaControl, nil)
    62  
    63  	snapstate.AddAffectedSnapsByKind("quota-control", quotaControlAffectedSnaps)
    64  
    65  	return m
    66  }
    67  
    68  func MockEnsuredSnapServices(mgr *ServiceManager, ensured bool) (restore func()) {
    69  	osutil.MustBeTestBinary("ensured snap services can only be mocked from tests")
    70  	old := mgr.ensuredSnapSvcs
    71  	mgr.ensuredSnapSvcs = ensured
    72  	return func() {
    73  		mgr.ensuredSnapSvcs = old
    74  	}
    75  }
    76  
    77  func (m *ServiceManager) ensureSnapServicesUpdated() (err error) {
    78  	m.state.Lock()
    79  	defer m.state.Unlock()
    80  	if m.ensuredSnapSvcs {
    81  		return nil
    82  	}
    83  
    84  	// only run after we are seeded
    85  	var seeded bool
    86  	err = m.state.Get("seeded", &seeded)
    87  	if err != nil && err != state.ErrNoState {
    88  		return err
    89  	}
    90  	if !seeded {
    91  		return nil
    92  	}
    93  
    94  	// we are seeded, now we need to find all snap services and re-generate
    95  	// services as necessary
    96  
    97  	// ensure all snap services are updated
    98  	allStates, err := snapstate.All(m.state)
    99  	if err != nil && err != state.ErrNoState {
   100  		return err
   101  	}
   102  
   103  	// if we have no snaps we can exit early
   104  	if len(allStates) == 0 {
   105  		m.ensuredSnapSvcs = true
   106  		return nil
   107  	}
   108  
   109  	allGrps, err := AllQuotas(m.state)
   110  	if err != nil && err != state.ErrNoState {
   111  		return err
   112  	}
   113  
   114  	snapsMap := map[*snap.Info]*wrappers.SnapServiceOptions{}
   115  
   116  	for _, snapSt := range allStates {
   117  		info, err := snapSt.CurrentInfo()
   118  		if err != nil {
   119  			return err
   120  		}
   121  
   122  		// don't use EnsureSnapServices with the snapd snap
   123  		if info.Type() == snap.TypeSnapd {
   124  			continue
   125  		}
   126  
   127  		// use the cached copy of all quota groups
   128  		snapSvcOpts, err := SnapServiceOptions(m.state, info.InstanceName(), allGrps)
   129  		if err != nil {
   130  			return err
   131  		}
   132  		snapsMap[info] = snapSvcOpts
   133  	}
   134  
   135  	// setup ensure options
   136  	ensureOpts := &wrappers.EnsureSnapServicesOptions{
   137  		Preseeding: snapdenv.Preseeding(),
   138  	}
   139  
   140  	// set RequireMountedSnapdSnap if we are on UC18+ only
   141  	deviceCtx, err := snapstate.DeviceCtx(m.state, nil, nil)
   142  	if err != nil {
   143  		return err
   144  	}
   145  
   146  	if !deviceCtx.Classic() && deviceCtx.Model().Base() != "" {
   147  		ensureOpts.RequireMountedSnapdSnap = true
   148  	}
   149  
   150  	rewrittenServices := make(map[*snap.Info][]*snap.AppInfo)
   151  	serviceKillingMightHaveOccurred := false
   152  	observeChange := func(app *snap.AppInfo, _ *quota.Group, unitType, name string, old, new string) {
   153  		if unitType == "service" {
   154  			rewrittenServices[app.Snap] = append(rewrittenServices[app.Snap], app)
   155  			if !serviceKillingMightHaveOccurred {
   156  				if strings.Contains(old, "\nRequires=usr-lib-snapd.mount\n") {
   157  					serviceKillingMightHaveOccurred = true
   158  				}
   159  			}
   160  		}
   161  	}
   162  
   163  	err = wrappers.EnsureSnapServices(snapsMap, ensureOpts, observeChange, progress.Null)
   164  	if err != nil {
   165  		return err
   166  	}
   167  
   168  	// if nothing was modified or we are not on UC18+, we are done
   169  	if len(rewrittenServices) == 0 || deviceCtx.Classic() || deviceCtx.Model().Base() == "" || !serviceKillingMightHaveOccurred {
   170  		m.ensuredSnapSvcs = true
   171  		return nil
   172  	}
   173  
   174  	// otherwise, we know now that we have rewritten some snap services, we need
   175  	// to handle the case of LP #1924805, and restart any services that were
   176  	// accidentally killed when we refreshed snapd
   177  	if err := restartServicesKilledInSnapdSnapRefresh(rewrittenServices); err != nil {
   178  		// we failed to restart services that were killed by a snapd refresh, so
   179  		// we need to immediately reboot in the hopes that this restores
   180  		// services to a functioning state
   181  
   182  		m.state.RequestRestart(state.RestartSystemNow)
   183  		return fmt.Errorf("error trying to restart killed services, immediately rebooting: %v", err)
   184  	}
   185  
   186  	m.ensuredSnapSvcs = true
   187  
   188  	return nil
   189  }
   190  
   191  // Ensure implements StateManager.Ensure.
   192  func (m *ServiceManager) Ensure() error {
   193  	if err := m.ensureSnapServicesUpdated(); err != nil {
   194  		return err
   195  	}
   196  	return nil
   197  }
   198  
   199  func delayedCrossMgrInit() {
   200  	// hook into conflict checks mechanisms
   201  	snapstate.AddAffectedSnapsByAttr("service-action", serviceControlAffectedSnaps)
   202  	snapstate.SnapServiceOptions = SnapServiceOptions
   203  	snapstate.EnsureSnapAbsentFromQuotaGroup = EnsureSnapAbsentFromQuota
   204  }
   205  
   206  func serviceControlAffectedSnaps(t *state.Task) ([]string, error) {
   207  	var serviceAction ServiceAction
   208  	if err := t.Get("service-action", &serviceAction); err != nil {
   209  		return nil, fmt.Errorf("internal error: cannot obtain service action from task: %s", t.Summary())
   210  	}
   211  	return []string{serviceAction.SnapName}, nil
   212  }
   213  
   214  func getBootTime() (time.Time, error) {
   215  	cmd := exec.Command("uptime", "-s")
   216  	cmd.Env = append(cmd.Env, "TZ=UTC")
   217  	out, err := cmd.CombinedOutput()
   218  	if err != nil {
   219  		return time.Time{}, osutil.OutputErr(out, err)
   220  	}
   221  
   222  	// parse the output from the command as a time
   223  	t, err := time.ParseInLocation("2006-01-02 15:04:05", strings.TrimSpace(string(out)), time.UTC)
   224  	if err != nil {
   225  		return time.Time{}, err
   226  	}
   227  
   228  	return t, nil
   229  }
   230  
   231  func restartServicesKilledInSnapdSnapRefresh(modified map[*snap.Info][]*snap.AppInfo) error {
   232  	// we decide on which services to restart by identifying (out of the set of
   233  	// services we just modified) services that were stopped after
   234  	// usr-lib-snapd.mount was written, but before usr-lib-snapd.mount was last
   235  	// stopped - this is the time window in which snapd (accidentally) killed
   236  	// all snap services using Requires=, see LP #1924805 for full details, so
   237  	// we need to undo that by restarting those snaps
   238  
   239  	st, err := os.Stat(filepath.Join(dirs.SnapServicesDir, wrappers.SnapdToolingMountUnit))
   240  	if err != nil {
   241  		return err
   242  	}
   243  
   244  	// always truncate all times to second precision, since that is the least
   245  	// precise time we have of all the times we consider, due to using systemctl
   246  	// for getting the InactiveEnterTimestamp for systemd units
   247  	// TODO: we should switch back to using D-Bus for this, where we get much
   248  	// more accurate times, down to the microsecond, which is the same precision
   249  	// we have for the modification time here, and thus we can more easily avoid
   250  	// the truncation issue, and we can ensure that we are minimizing the risk
   251  	// of inadvertently starting services that just so happened to have been
   252  	// stopped in the same second that we modified and usr-lib-snapd.mount.
   253  	lowerTimeBound := st.ModTime().Truncate(time.Second)
   254  
   255  	// if the time that the usr-lib-snapd.mount was modified is before the time
   256  	// that this device was booted up, then we can skip this since we know we
   257  	// that a refresh is not being performed
   258  	bootTime, err := getBootTime()
   259  	if err != nil {
   260  		// don't fail if we can't get the boot time, if we don't get it the
   261  		// below check will be always false (no time can be before zero time)
   262  		logger.Noticef("error getting boot time: %v", err)
   263  	}
   264  
   265  	if lowerTimeBound.Before(bootTime) {
   266  		return nil
   267  	}
   268  
   269  	// Get the InactiveEnterTimestamp property for the usr-lib-snapd.mount unit,
   270  	// this is the time that usr-lib-snapd.mount was transitioned from
   271  	// deactivating to inactive and was done being started. This is the correct
   272  	// upper bound for our window in which systemd killed snap services because
   273  	// systemd orders the transactions when we stop usr-lib-snapd.mount thusly:
   274  	//
   275  	// 1. Find all units which have Requires=usr-lib-snapd.mount (all snap
   276  	//    services which would have been refreshed during snapd 2.49.2)
   277  	// 2. Stop all such services found in 1.
   278  	// 3. Stop usr-lib-snapd.mount itself.
   279  	//
   280  	// Thus the time after all the services were killed is given by the time
   281  	// that systemd transitioned usr-lib-snapd.mount to inactive, which is given
   282  	// by InactiveEnterTimestamp.
   283  
   284  	// TODO: pass a real interactor here?
   285  	sysd := systemd.New(systemd.SystemMode, progress.Null)
   286  
   287  	upperTimeBound, err := sysd.InactiveEnterTimestamp(wrappers.SnapdToolingMountUnit)
   288  	if err != nil {
   289  		return err
   290  	}
   291  
   292  	if upperTimeBound.IsZero() {
   293  		// this means that the usr-lib-snapd.mount unit never exited during this
   294  		// boot, which means we are done in this ensure because the bug we care
   295  		// about (LP #1924805) here was never triggered
   296  		return nil
   297  	}
   298  
   299  	upperTimeBound = upperTimeBound.Truncate(time.Second)
   300  
   301  	// if the lower time bound is ever in the future past the upperTimeBound,
   302  	// then  just use the upperTimeBound as both limits, since we know that the
   303  	// upper bound and the time for each service being stopped are of the same
   304  	// precision
   305  	if lowerTimeBound.After(upperTimeBound) {
   306  		lowerTimeBound = upperTimeBound
   307  	}
   308  
   309  	candidateAppsToRestartBySnap := make(map[*snap.Info][]*snap.AppInfo)
   310  
   311  	for sn, apps := range modified {
   312  		for _, app := range apps {
   313  			// get the InactiveEnterTimestamp for the service
   314  			t, err := sysd.InactiveEnterTimestamp(app.ServiceName())
   315  			if err != nil {
   316  				return err
   317  			}
   318  
   319  			// always truncate to second precision
   320  			t = t.Truncate(time.Second)
   321  
   322  			// check if this unit entered the inactive state between the time
   323  			// range, but be careful about time precision here, we want an
   324  			// inclusive range i.e. [lower,upper] not (lower,upper) in case the
   325  			// time that systemd saves these events as is imprecise or slow and
   326  			// things get saved as having happened at the exact same time
   327  			if !t.Before(lowerTimeBound) && !t.After(upperTimeBound) {
   328  				candidateAppsToRestartBySnap[sn] = append(candidateAppsToRestartBySnap[sn], app)
   329  			}
   330  		}
   331  	}
   332  
   333  	// Second loop actually restarts the services per-snap by sorting them and
   334  	// removing disabled services. Note that we could have disabled services
   335  	// here because a service could have been running, but disabled when snapd
   336  	// was refreshed, hence it got killed, but we don't want to restart it,
   337  	// since it is disabled, and so that disabled running service is just SOL.
   338  	for sn, apps := range candidateAppsToRestartBySnap {
   339  		// TODO: should we try to start as many services as possible here before
   340  		// giving up given the severity of the bug?
   341  		disabledSvcs, err := wrappers.QueryDisabledServices(sn, progress.Null)
   342  		if err != nil {
   343  			return err
   344  		}
   345  
   346  		startupOrdered, err := snap.SortServices(apps)
   347  		if err != nil {
   348  			return err
   349  		}
   350  
   351  		// TODO: what to do about timings here?
   352  		nullPerfTimings := &timings.Timings{}
   353  		if err := wrappers.StartServices(startupOrdered, disabledSvcs, nil, progress.Null, nullPerfTimings); err != nil {
   354  			return err
   355  		}
   356  	}
   357  
   358  	return nil
   359  }