// -*- Mode: Go; indent-tabs-mode: t -*-

/*
 * Copyright (C) 2021 Canonical Ltd
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 3 as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 */

package servicestate

import (
	"fmt"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"time"

	"github.com/snapcore/snapd/dirs"
	"github.com/snapcore/snapd/logger"
	"github.com/snapcore/snapd/osutil"
	"github.com/snapcore/snapd/overlord/snapstate"
	"github.com/snapcore/snapd/overlord/state"
	"github.com/snapcore/snapd/progress"
	"github.com/snapcore/snapd/snap"
	"github.com/snapcore/snapd/snap/quota"
	"github.com/snapcore/snapd/snapdenv"
	"github.com/snapcore/snapd/systemd"
	"github.com/snapcore/snapd/timings"
	"github.com/snapcore/snapd/wrappers"
)

// ServiceManager is responsible for starting and stopping snap services.
type ServiceManager struct {
	// state is the shared state this manager reads from; it is locked for
	// the duration of ensureSnapServicesUpdated.
	state *state.State

	// ensuredSnapSvcs records that ensureSnapServicesUpdated has completed
	// its pass; once it is true that method returns immediately.
	ensuredSnapSvcs bool
}

// Manager returns a new service manager.
52 func Manager(st *state.State, runner *state.TaskRunner) *ServiceManager { 53 delayedCrossMgrInit() 54 m := &ServiceManager{ 55 state: st, 56 } 57 // TODO: undo handler 58 runner.AddHandler("service-control", m.doServiceControl, nil) 59 return m 60 } 61 62 func MockEnsuredSnapServices(mgr *ServiceManager, ensured bool) (restore func()) { 63 osutil.MustBeTestBinary("ensured snap services can only be mocked from tests") 64 old := mgr.ensuredSnapSvcs 65 mgr.ensuredSnapSvcs = ensured 66 return func() { 67 mgr.ensuredSnapSvcs = old 68 } 69 } 70 71 func (m *ServiceManager) ensureSnapServicesUpdated() (err error) { 72 m.state.Lock() 73 defer m.state.Unlock() 74 if m.ensuredSnapSvcs { 75 return nil 76 } 77 78 // only run after we are seeded 79 var seeded bool 80 err = m.state.Get("seeded", &seeded) 81 if err != nil && err != state.ErrNoState { 82 return err 83 } 84 if !seeded { 85 return nil 86 } 87 88 // we are seeded, now we need to find all snap services and re-generate 89 // services as necessary 90 91 // ensure all snap services are updated 92 allStates, err := snapstate.All(m.state) 93 if err != nil && err != state.ErrNoState { 94 return err 95 } 96 97 // if we have no snaps we can exit early 98 if len(allStates) == 0 { 99 m.ensuredSnapSvcs = true 100 return nil 101 } 102 103 allGrps, err := AllQuotas(m.state) 104 if err != nil && err != state.ErrNoState { 105 return err 106 } 107 108 snapsMap := map[*snap.Info]*wrappers.SnapServiceOptions{} 109 110 for _, snapSt := range allStates { 111 info, err := snapSt.CurrentInfo() 112 if err != nil { 113 return err 114 } 115 116 // don't use EnsureSnapServices with the snapd snap 117 if info.Type() == snap.TypeSnapd { 118 continue 119 } 120 121 // use the cached copy of all quota groups 122 snapSvcOpts, err := SnapServiceOptions(m.state, info.InstanceName(), allGrps) 123 if err != nil { 124 return err 125 } 126 snapsMap[info] = snapSvcOpts 127 } 128 129 // setup ensure options 130 ensureOpts := 
&wrappers.EnsureSnapServicesOptions{ 131 Preseeding: snapdenv.Preseeding(), 132 } 133 134 // set RequireMountedSnapdSnap if we are on UC18+ only 135 deviceCtx, err := snapstate.DeviceCtx(m.state, nil, nil) 136 if err != nil { 137 return err 138 } 139 140 if !deviceCtx.Classic() && deviceCtx.Model().Base() != "" { 141 ensureOpts.RequireMountedSnapdSnap = true 142 } 143 144 rewrittenServices := make(map[*snap.Info][]*snap.AppInfo) 145 serviceKillingMightHaveOccurred := false 146 observeChange := func(app *snap.AppInfo, _ *quota.Group, unitType, name string, old, new string) { 147 if unitType == "service" { 148 rewrittenServices[app.Snap] = append(rewrittenServices[app.Snap], app) 149 if !serviceKillingMightHaveOccurred { 150 if strings.Contains(old, "\nRequires=usr-lib-snapd.mount\n") { 151 serviceKillingMightHaveOccurred = true 152 } 153 } 154 } 155 } 156 157 err = wrappers.EnsureSnapServices(snapsMap, ensureOpts, observeChange, progress.Null) 158 if err != nil { 159 return err 160 } 161 162 // if nothing was modified or we are not on UC18+, we are done 163 if len(rewrittenServices) == 0 || deviceCtx.Classic() || deviceCtx.Model().Base() == "" || !serviceKillingMightHaveOccurred { 164 m.ensuredSnapSvcs = true 165 return nil 166 } 167 168 // otherwise, we know now that we have rewritten some snap services, we need 169 // to handle the case of LP #1924805, and restart any services that were 170 // accidentally killed when we refreshed snapd 171 if err := restartServicesKilledInSnapdSnapRefresh(rewrittenServices); err != nil { 172 // we failed to restart services that were killed by a snapd refresh, so 173 // we need to immediately reboot in the hopes that this restores 174 // services to a functioning state 175 176 m.state.RequestRestart(state.RestartSystemNow) 177 return fmt.Errorf("error trying to restart killed services, immediately rebooting: %v", err) 178 } 179 180 m.ensuredSnapSvcs = true 181 182 return nil 183 } 184 185 // Ensure implements StateManager.Ensure. 
186 func (m *ServiceManager) Ensure() error { 187 if err := m.ensureSnapServicesUpdated(); err != nil { 188 return err 189 } 190 return nil 191 } 192 193 func delayedCrossMgrInit() { 194 // hook into conflict checks mechanisms 195 snapstate.AddAffectedSnapsByAttr("service-action", serviceControlAffectedSnaps) 196 snapstate.SnapServiceOptions = SnapServiceOptions 197 snapstate.EnsureSnapAbsentFromQuotaGroup = EnsureSnapAbsentFromQuota 198 } 199 200 func serviceControlAffectedSnaps(t *state.Task) ([]string, error) { 201 var serviceAction ServiceAction 202 if err := t.Get("service-action", &serviceAction); err != nil { 203 return nil, fmt.Errorf("internal error: cannot obtain service action from task: %s", t.Summary()) 204 } 205 return []string{serviceAction.SnapName}, nil 206 } 207 208 func getBootTime() (time.Time, error) { 209 cmd := exec.Command("uptime", "-s") 210 cmd.Env = append(cmd.Env, "TZ=UTC") 211 out, err := cmd.CombinedOutput() 212 if err != nil { 213 return time.Time{}, osutil.OutputErr(out, err) 214 } 215 216 // parse the output from the command as a time 217 t, err := time.ParseInLocation("2006-01-02 15:04:05", strings.TrimSpace(string(out)), time.UTC) 218 if err != nil { 219 return time.Time{}, err 220 } 221 222 return t, nil 223 } 224 225 func restartServicesKilledInSnapdSnapRefresh(modified map[*snap.Info][]*snap.AppInfo) error { 226 // we decide on which services to restart by identifying (out of the set of 227 // services we just modified) services that were stopped after 228 // usr-lib-snapd.mount was written, but before usr-lib-snapd.mount was last 229 // stopped - this is the time window in which snapd (accidentally) killed 230 // all snap services using Requires=, see LP #1924805 for full details, so 231 // we need to undo that by restarting those snaps 232 233 st, err := os.Stat(filepath.Join(dirs.SnapServicesDir, wrappers.SnapdToolingMountUnit)) 234 if err != nil { 235 return err 236 } 237 238 // always truncate all times to second 
precision, since that is the least 239 // precise time we have of all the times we consider, due to using systemctl 240 // for getting the InactiveEnterTimestamp for systemd units 241 // TODO: we should switch back to using D-Bus for this, where we get much 242 // more accurate times, down to the microsecond, which is the same precision 243 // we have for the modification time here, and thus we can more easily avoid 244 // the truncation issue, and we can ensure that we are minimizing the risk 245 // of inadvertently starting services that just so happened to have been 246 // stopped in the same second that we modified and usr-lib-snapd.mount. 247 lowerTimeBound := st.ModTime().Truncate(time.Second) 248 249 // if the time that the usr-lib-snapd.mount was modified is before the time 250 // that this device was booted up, then we can skip this since we know we 251 // that a refresh is not being performed 252 bootTime, err := getBootTime() 253 if err != nil { 254 // don't fail if we can't get the boot time, if we don't get it the 255 // below check will be always false (no time can be before zero time) 256 logger.Noticef("error getting boot time: %v", err) 257 } 258 259 if lowerTimeBound.Before(bootTime) { 260 return nil 261 } 262 263 // Get the InactiveEnterTimestamp property for the usr-lib-snapd.mount unit, 264 // this is the time that usr-lib-snapd.mount was transitioned from 265 // deactivating to inactive and was done being started. This is the correct 266 // upper bound for our window in which systemd killed snap services because 267 // systemd orders the transactions when we stop usr-lib-snapd.mount thusly: 268 // 269 // 1. Find all units which have Requires=usr-lib-snapd.mount (all snap 270 // services which would have been refreshed during snapd 2.49.2) 271 // 2. Stop all such services found in 1. 272 // 3. Stop usr-lib-snapd.mount itself. 
273 // 274 // Thus the time after all the services were killed is given by the time 275 // that systemd transitioned usr-lib-snapd.mount to inactive, which is given 276 // by InactiveEnterTimestamp. 277 278 // TODO: pass a real interactor here? 279 sysd := systemd.New(systemd.SystemMode, progress.Null) 280 281 upperTimeBound, err := sysd.InactiveEnterTimestamp(wrappers.SnapdToolingMountUnit) 282 if err != nil { 283 return err 284 } 285 286 if upperTimeBound.IsZero() { 287 // this means that the usr-lib-snapd.mount unit never exited during this 288 // boot, which means we are done in this ensure because the bug we care 289 // about (LP #1924805) here was never triggered 290 return nil 291 } 292 293 upperTimeBound = upperTimeBound.Truncate(time.Second) 294 295 // if the lower time bound is ever in the future past the upperTimeBound, 296 // then just use the upperTimeBound as both limits, since we know that the 297 // upper bound and the time for each service being stopped are of the same 298 // precision 299 if lowerTimeBound.After(upperTimeBound) { 300 lowerTimeBound = upperTimeBound 301 } 302 303 candidateAppsToRestartBySnap := make(map[*snap.Info][]*snap.AppInfo) 304 305 for sn, apps := range modified { 306 for _, app := range apps { 307 // get the InactiveEnterTimestamp for the service 308 t, err := sysd.InactiveEnterTimestamp(app.ServiceName()) 309 if err != nil { 310 return err 311 } 312 313 // always truncate to second precision 314 t = t.Truncate(time.Second) 315 316 // check if this unit entered the inactive state between the time 317 // range, but be careful about time precision here, we want an 318 // inclusive range i.e. 
[lower,upper] not (lower,upper) in case the 319 // time that systemd saves these events as is imprecise or slow and 320 // things get saved as having happened at the exact same time 321 if !t.Before(lowerTimeBound) && !t.After(upperTimeBound) { 322 candidateAppsToRestartBySnap[sn] = append(candidateAppsToRestartBySnap[sn], app) 323 } 324 } 325 } 326 327 // Second loop actually restarts the services per-snap by sorting them and 328 // removing disabled services. Note that we could have disabled services 329 // here because a service could have been running, but disabled when snapd 330 // was refreshed, hence it got killed, but we don't want to restart it, 331 // since it is disabled, and so that disabled running service is just SOL. 332 for sn, apps := range candidateAppsToRestartBySnap { 333 // TODO: should we try to start as many services as possible here before 334 // giving up given the severity of the bug? 335 disabledSvcs, err := wrappers.QueryDisabledServices(sn, progress.Null) 336 if err != nil { 337 return err 338 } 339 340 startupOrdered, err := snap.SortServices(apps) 341 if err != nil { 342 return err 343 } 344 345 // TODO: what to do about timings here? 346 nullPerfTimings := &timings.Timings{} 347 if err := wrappers.StartServices(startupOrdered, disabledSvcs, nil, progress.Null, nullPerfTimings); err != nil { 348 return err 349 } 350 } 351 352 return nil 353 }