github.com/rkt/rkt@v1.30.1-0.20200224141603-171c416fac02/stage1/init/common/units.go (about) 1 // Copyright 2014 The rkt Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 //+build linux 16 17 package common 18 19 import ( 20 "errors" 21 "fmt" 22 "io" 23 "os" 24 "path" 25 "path/filepath" 26 "strconv" 27 "strings" 28 29 "github.com/appc/spec/schema" 30 "github.com/appc/spec/schema/types" 31 "github.com/rkt/rkt/common" 32 "github.com/rkt/rkt/pkg/user" 33 stage1commontypes "github.com/rkt/rkt/stage1/common/types" 34 35 "github.com/coreos/go-systemd/unit" 36 "github.com/hashicorp/errwrap" 37 ) 38 39 // The maximum value for the MilliValue of an appc resource limit. 40 const MaxMilliValue = int64(((1 << 63) - 1) / 1000) 41 42 func MutableEnv(p *stage1commontypes.Pod) error { 43 w := NewUnitWriter(p) 44 45 w.WriteUnit( 46 TargetUnitPath(p.Root, "default"), 47 "failed to write default.target", 48 unit.NewUnitOption("Unit", "Description", "rkt apps target"), 49 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 50 unit.NewUnitOption("Unit", "Requires", "systemd-journald.service"), 51 unit.NewUnitOption("Unit", "After", "systemd-journald.service"), 52 unit.NewUnitOption("Unit", "Wants", "supervisor-ready.service"), 53 unit.NewUnitOption("Unit", "Before", "supervisor-ready.service"), 54 unit.NewUnitOption("Unit", "Before", "halt.target"), 55 unit.NewUnitOption("Unit", "Conflicts", "halt.target"), 56 ) 57 58 w.WriteUnit( 59 ServiceUnitPath(p.Root, "prepare-app@"), 60 "failed to write prepare-app service template", 61 unit.NewUnitOption("Unit", "Description", "Prepare minimum environment for chrooted applications"), 62 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 63 unit.NewUnitOption("Unit", "OnFailureJobMode", "fail"), 64 65 // prepare-app is meant to be executed at most once. 66 // We must ensure that the prepare-app service unit remains started after the prepare-app binary exits 67 // such that it is not executed again during restarts of the target app. 68 unit.NewUnitOption("Service", "RemainAfterExit", "yes"), 69 70 unit.NewUnitOption("Service", "Type", "oneshot"), 71 unit.NewUnitOption("Service", "Restart", "no"), 72 unit.NewUnitOption("Service", "ExecStart", "/prepare-app %I"), 73 unit.NewUnitOption("Service", "User", "0"), 74 unit.NewUnitOption("Service", "Group", "0"), 75 unit.NewUnitOption("Service", "CapabilityBoundingSet", "CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_MKNOD"), 76 ) 77 78 w.WriteUnit( 79 TargetUnitPath(p.Root, "halt"), 80 "failed to write halt target", 81 unit.NewUnitOption("Unit", "Description", "Halt"), 82 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 83 unit.NewUnitOption("Unit", "AllowIsolate", "true"), 84 unit.NewUnitOption("Unit", "Requires", "shutdown.service"), 85 unit.NewUnitOption("Unit", "After", "shutdown.service"), 86 ) 87 88 w.writeShutdownService( 89 "ExecStart", 90 unit.NewUnitOption("Unit", "Description", "Pod shutdown"), 91 unit.NewUnitOption("Unit", "AllowIsolate", "true"), 92 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 93 unit.NewUnitOption("Service", "RemainAfterExit", "yes"), 94 ) 95 96 w.Activate("systemd-journal-flush.service", ServiceWantPath(p.Root, "systemd-journal-flush")) 97 98 return w.Error() 99 } 100 101 func ImmutableEnv(p *stage1commontypes.Pod) error { 102 uw := NewUnitWriter(p) 103 104 opts := []*unit.UnitOption{ 105 unit.NewUnitOption("Unit", "Description", "rkt apps target"), 106 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 107 unit.NewUnitOption("Unit", "Wants", "supervisor-ready.service"), 108 unit.NewUnitOption("Unit", "Before", "supervisor-ready.service"), 109 } 110 111 for i := range p.Manifest.Apps { 112 ra := &p.Manifest.Apps[i] 113 serviceName := ServiceUnitName(ra.Name) 114 opts = append(opts, unit.NewUnitOption("Unit", "After", serviceName)) 115 opts = append(opts, unit.NewUnitOption("Unit", "Wants", serviceName)) 116 } 117 118 uw.WriteUnit( 119 TargetUnitPath(p.Root, "default"), 120 "failed to write default.target", 121 opts..., 122 ) 123 124 uw.WriteUnit( 125 ServiceUnitPath(p.Root, "prepare-app@"), 126 "failed to write prepare-app service template", 127 unit.NewUnitOption("Unit", "Description", "Prepare minimum environment for chrooted applications"), 128 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 129 unit.NewUnitOption("Unit", "OnFailureJobMode", "fail"), 130 unit.NewUnitOption("Unit", "Requires", "systemd-journald.service"), 131 unit.NewUnitOption("Unit", "After", "systemd-journald.service"), 132 unit.NewUnitOption("Service", "Type", "oneshot"), 133 unit.NewUnitOption("Service", "Restart", "no"), 134 unit.NewUnitOption("Service", "ExecStart", "/prepare-app %I"), 135 unit.NewUnitOption("Service", "User", "0"), 136 unit.NewUnitOption("Service", "Group", "0"), 137 unit.NewUnitOption("Service", "CapabilityBoundingSet", "CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_MKNOD"), 138 ) 139 140 uw.WriteUnit( 141 TargetUnitPath(p.Root, "halt"), 142 "failed to write halt target", 143 unit.NewUnitOption("Unit", "Description", "Halt"), 144 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 145 unit.NewUnitOption("Unit", "AllowIsolate", "true"), 146 ) 147 148 uw.writeShutdownService( 149 "ExecStop", 150 unit.NewUnitOption("Unit", "Description", "Pod shutdown"), 151 unit.NewUnitOption("Unit", "AllowIsolate", "true"), 152 unit.NewUnitOption("Unit", "StopWhenUnneeded", "yes"), 153 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 154 unit.NewUnitOption("Service", "RemainAfterExit", "yes"), 155 ) 156 157 if err := uw.Error(); err != nil { 158 return err 159 } 160 161 for i := range p.Manifest.Apps { 162 ra := &p.Manifest.Apps[i] 163 164 if ra.App.WorkingDirectory == "" { 165 ra.App.WorkingDirectory = "/" 166 } 167 168 binPath, err := FindBinPath(p, ra) 169 if err != nil { 170 return err 171 } 172 173 uw.AppUnit(ra, binPath, 174 unit.NewUnitOption("Unit", "After", "systemd-journald.service"), 175 // When an app fails, we shut down the pod 176 unit.NewUnitOption("Unit", "OnFailure", "halt.target")) 177 178 uw.AppReaperUnit(ra.Name, binPath, 179 unit.NewUnitOption("Service", "Environment", `"EXIT_POD=true"`), 180 unit.NewUnitOption("Unit", "Wants", "shutdown.service"), 181 unit.NewUnitOption("Unit", "After", "shutdown.service"), 182 ) 183 } 184 185 uw.Activate("systemd-journal-flush.service", ServiceWantPath(p.Root, "systemd-journal-flush")) 186 187 return uw.Error() 188 } 189 190 // SetupAppIO prepares all properties related to streams (stdin/stdout/stderr) and TTY 191 // for an application service unit. 192 // 193 // It works according to the following steps: 194 // 1. short-circuit interactive pods and legacy systemd, for backward compatibility 195 // 2. parse app-level annotations to determine stdin/stdout/stderr mode 196 // 2a. if an annotation is missing/invalid, it fallbacks to legacy mode (in: null, out/err: journald) 197 // 2b. if a valid annotation is found, it prepares: 198 // - TTY and stream properties for the systemd service unit 199 // - env variables for iottymux binary 200 // 3. if any of stdin/stdout/stderr is in TTY or streaming mode: 201 // 3a. the env file for iottymux is written to `/rkt/iottymux/<appname>/env` with the above content 202 // 3b. for TTY mode, a `TTYPath` property and an `After=ttymux@<appname>.service` dependency are added 203 // 3c. for streaming mode, a `Before=iomux@<appname>.service` dependency is added 204 // 205 // For complete details, see dev-docs at Documentation/devel/log-attach-design.md 206 func (uw *UnitWriter) SetupAppIO(p *stage1commontypes.Pod, ra *schema.RuntimeApp, binPath string, opts ...*unit.UnitOption) []*unit.UnitOption { 207 if uw.err != nil { 208 return opts 209 } 210 211 if p.Interactive { 212 opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "tty")) 213 opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "tty")) 214 opts = append(opts, unit.NewUnitOption("Service", "StandardError", "tty")) 215 return opts 216 } 217 218 flavor, systemdVersion, err := GetFlavor(uw.p) 219 if err != nil { 220 uw.err = err 221 return opts 222 } 223 224 stdin, _ := ra.Annotations.Get(stage1commontypes.AppStdinMode) 225 stdout, _ := ra.Annotations.Get(stage1commontypes.AppStdoutMode) 226 stderr, _ := ra.Annotations.Get(stage1commontypes.AppStderrMode) 227 228 // Attach needs https://github.com/systemd/systemd/pull/4179, ie. systemd v232 or a backport 229 if ((flavor == "src" || flavor == "host") && systemdVersion < 232) || 230 ((flavor == "coreos" || flavor == "kvm") && systemdVersion < 231) { 231 // Explicit error if systemd is too old for attaching 232 if stdin != "" || stdout != "" || stderr != "" { 233 uw.err = fmt.Errorf("stage1 systemd %d does not support attachable I/O", systemdVersion) 234 return opts 235 } 236 opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "null")) 237 opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "journal+console")) 238 opts = append(opts, unit.NewUnitOption("Service", "StandardError", "journal+console")) 239 return opts 240 } 241 242 var iottymuxEnvFlags []string 243 needsIOMux := false 244 needsTTYMux := false 245 246 switch stdin { 247 case "stream": 248 needsIOMux = true 249 uw.AppSocketUnit(ra.Name, binPath, "stdin") 250 iottymuxEnvFlags = append(iottymuxEnvFlags, "STAGE2_STDIN=true") 251 opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "fd")) 252 opts = append(opts, unit.NewUnitOption("Service", "Sockets", fmt.Sprintf("%s-%s.socket", ra.Name, "stdin"))) 253 case "tty": 254 needsTTYMux = true 255 iottymuxEnvFlags = append(iottymuxEnvFlags, "STAGE2_STDIN=true") 256 opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "tty-force")) 257 case "interactive": 258 opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "tty")) 259 default: 260 // null mode 261 opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "null")) 262 } 263 264 switch stdout { 265 case "stream": 266 needsIOMux = true 267 uw.AppSocketUnit(ra.Name, binPath, "stdout") 268 iottymuxEnvFlags = append(iottymuxEnvFlags, "STAGE2_STDOUT=true") 269 opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "fd")) 270 opts = append(opts, unit.NewUnitOption("Service", "Sockets", fmt.Sprintf("%s-%s.socket", ra.Name, "stdout"))) 271 case "tty": 272 needsTTYMux = true 273 iottymuxEnvFlags = append(iottymuxEnvFlags, "STAGE2_STDOUT=true") 274 opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "tty")) 275 case "interactive": 276 opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "tty")) 277 case "null": 278 opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "null")) 279 default: 280 // log mode 281 opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "journal+console")) 282 } 283 284 switch stderr { 285 case "stream": 286 needsIOMux = true 287 uw.AppSocketUnit(ra.Name, binPath, "stderr") 288 iottymuxEnvFlags = append(iottymuxEnvFlags, "STAGE2_STDERR=true") 289 opts = append(opts, unit.NewUnitOption("Service", "StandardError", "fd")) 290 opts = append(opts, unit.NewUnitOption("Service", "Sockets", fmt.Sprintf("%s-%s.socket", ra.Name, "stderr"))) 291 case "tty": 292 needsTTYMux = true 293 iottymuxEnvFlags = append(iottymuxEnvFlags, "STAGE2_STDERR=true") 294 opts = append(opts, unit.NewUnitOption("Service", "StandardError", "tty")) 295 case "interactive": 296 opts = append(opts, unit.NewUnitOption("Service", "StandardError", "tty")) 297 case "null": 298 opts = append(opts, unit.NewUnitOption("Service", "StandardError", "null")) 299 default: 300 // log mode 301 opts = append(opts, unit.NewUnitOption("Service", "StandardError", "journal+console")) 302 } 303 304 // if at least one stream requires I/O muxing, an appropriate iottymux dependency needs to be setup 305 if needsIOMux || needsTTYMux { 306 // an env file is written here for iottymux, containing service configuration. 307 appIODir := IOMuxDir(p.Root, ra.Name) 308 os.MkdirAll(appIODir, 0644) 309 file, err := os.OpenFile(filepath.Join(appIODir, "env"), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) 310 if err != nil { 311 uw.err = err 312 return nil 313 } 314 defer file.Close() 315 316 // env file specifies: debug verbosity, which streams to mux and whether a dedicated TTY is needed. 317 file.WriteString(fmt.Sprintf("STAGE2_TTY=%t\n", needsTTYMux)) 318 file.WriteString(fmt.Sprintf("STAGE1_DEBUG=%t\n", p.Debug)) 319 for _, l := range iottymuxEnvFlags { 320 file.WriteString(l + "\n") 321 } 322 323 if needsIOMux { 324 // streaming mode brings in a `iomux@.service` before-dependency 325 opts = append(opts, unit.NewUnitOption("Unit", "Requires", fmt.Sprintf("iomux@%s.service", ra.Name))) 326 opts = append(opts, unit.NewUnitOption("Unit", "Before", fmt.Sprintf("iomux@%s.service", ra.Name))) 327 logMode, ok := p.Manifest.Annotations.Get("coreos.com/rkt/experiment/logmode") 328 if ok { 329 file.WriteString(fmt.Sprintf("STAGE1_LOGMODE=%s\n", logMode)) 330 } 331 switch logMode { 332 case "k8s-plain": 333 kubernetesLogPath, ok := ra.Annotations.Get("coreos.com/rkt/experiment/kubernetes-log-path") 334 if !ok { 335 uw.err = fmt.Errorf("kubernetes-log-path annotation needs to be specified when k8s-plain logging mode is used") 336 return nil 337 } 338 file.WriteString(fmt.Sprintf("KUBERNETES_LOG_PATH=%s\n", kubernetesLogPath)) 339 } 340 341 } else if needsTTYMux { 342 // tty mode brings in a `ttymux@.service` after-dependency (it needs to create the TTY first) 343 opts = append(opts, unit.NewUnitOption("Service", "TTYPath", filepath.Join("/rkt/iottymux", ra.Name.String(), "stage2-pts"))) 344 opts = append(opts, unit.NewUnitOption("Unit", "Requires", fmt.Sprintf("ttymux@%s.service", ra.Name))) 345 opts = append(opts, unit.NewUnitOption("Unit", "After", fmt.Sprintf("ttymux@%s.service", ra.Name))) 346 } 347 } 348 return opts 349 } 350 351 // UnitWriter is the type that writes systemd units preserving the first previously occurred error. 352 // Any method of this type can be invoked multiple times without error checking. 353 // If a previous invocation generated an error, any invoked method will be skipped. 354 // If an error occurred during method invocations, it can be retrieved using Error(). 355 type UnitWriter struct { 356 err error 357 p *stage1commontypes.Pod 358 } 359 360 // NewUnitWriter returns a new UnitWriter for the given pod. 361 func NewUnitWriter(p *stage1commontypes.Pod) *UnitWriter { 362 return &UnitWriter{p: p} 363 } 364 365 // WriteUnit writes a systemd unit in the given path with the given unit options 366 // if no previous error occurred. 367 func (uw *UnitWriter) WriteUnit(path string, errmsg string, opts ...*unit.UnitOption) { 368 if uw.err != nil { 369 return 370 } 371 372 file, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) 373 if err != nil { 374 uw.err = errwrap.Wrap(errors.New(errmsg), err) 375 return 376 } 377 defer file.Close() 378 379 if _, err = io.Copy(file, unit.Serialize(opts)); err != nil { 380 uw.err = errwrap.Wrap(errors.New(errmsg), err) 381 return 382 } 383 if err := user.ShiftFiles([]string{path}, &uw.p.UidRange); err != nil { 384 uw.err = errwrap.Wrap(errors.New(errmsg), err) 385 return 386 } 387 } 388 389 // writeShutdownService writes a shutdown.service unit with the given unit options 390 // if no previous error occurred. 391 // exec specifies how systemctl should be invoked, i.e. ExecStart, or ExecStop. 392 func (uw *UnitWriter) writeShutdownService(exec string, opts ...*unit.UnitOption) { 393 if uw.err != nil { 394 return 395 } 396 397 flavor, systemdVersion, err := GetFlavor(uw.p) 398 if err != nil { 399 uw.err = errwrap.Wrap(errors.New("failed to create shutdown service"), err) 400 return 401 } 402 403 opts = append(opts, []*unit.UnitOption{ 404 // The default stdout is /dev/console (the tty created by nspawn). 405 // But the tty might be destroyed if rkt is executed via ssh and 406 // the user terminates the ssh session. We still want 407 // shutdown.service to succeed in that case, so don't use 408 // /dev/console. 409 unit.NewUnitOption("Service", "StandardInput", "null"), 410 unit.NewUnitOption("Service", "StandardOutput", "null"), 411 unit.NewUnitOption("Service", "StandardError", "null"), 412 }...) 413 414 shutdownVerb := "exit" 415 // systemd <v227 doesn't allow the "exit" verb when running as PID 1, so 416 // use "halt". 417 // If systemdVersion is 0 it means it couldn't be guessed, assume it's new 418 // enough for "systemctl exit". 419 // This can happen, for example, when building rkt with: 420 // 421 // ./configure --with-stage1-flavors=src --with-stage1-systemd-version=master 422 // 423 // The patches for the "exit" verb are backported to the "coreos" flavor, so 424 // don't rely on the systemd version on the "coreos" flavor. 425 if flavor != "coreos" && systemdVersion != 0 && systemdVersion < 227 { 426 shutdownVerb = "halt" 427 } 428 429 opts = append( 430 opts, 431 unit.NewUnitOption("Service", exec, fmt.Sprintf("/usr/bin/systemctl --force %s", shutdownVerb)), 432 ) 433 434 uw.WriteUnit( 435 ServiceUnitPath(uw.p.Root, "shutdown"), 436 "failed to create shutdown service", 437 opts..., 438 ) 439 } 440 441 // Activate actives the given unit in the given wantPath. 442 func (uw *UnitWriter) Activate(unit, wantPath string) { 443 if uw.err != nil { 444 return 445 } 446 447 if err := os.Symlink(path.Join("..", unit), wantPath); err != nil && !os.IsExist(err) { 448 uw.err = errwrap.Wrap(errors.New("failed to link service want"), err) 449 } 450 } 451 452 // error returns the first error that occurred during write* invocations. 453 func (uw *UnitWriter) Error() error { 454 return uw.err 455 } 456 457 // AppUnit sets up the main systemd service unit for the application. 458 func (uw *UnitWriter) AppUnit(ra *schema.RuntimeApp, binPath string, opts ...*unit.UnitOption) { 459 if uw.err != nil { 460 return 461 } 462 463 if len(ra.App.Exec) == 0 { 464 uw.err = fmt.Errorf(`image %q has an empty "exec" (try --exec=BINARY)`, 465 uw.p.AppNameToImageName(ra.Name)) 466 return 467 } 468 469 pa, err := prepareApp(uw.p, ra) 470 if err != nil { 471 uw.err = err 472 return 473 } 474 475 appName := ra.Name.String() 476 imgName := uw.p.AppNameToImageName(ra.Name) 477 /* Write the generic unit options */ 478 opts = append(opts, []*unit.UnitOption{ 479 unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v", appName, imgName)), 480 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 481 unit.NewUnitOption("Unit", "Wants", fmt.Sprintf("reaper-%s.service", appName)), 482 unit.NewUnitOption("Service", "Restart", "no"), 483 484 // This helps working around a race 485 // (https://github.com/systemd/systemd/issues/2913) that causes the 486 // systemd unit name not getting written to the journal if the unit is 487 // short-lived and runs as non-root. 488 unit.NewUnitOption("Service", "SyslogIdentifier", appName), 489 }...) 490 491 // Setup I/O for iottymux (stdin/stdout/stderr) 492 opts = append(opts, uw.SetupAppIO(uw.p, ra, binPath)...) 493 494 if supportsNotify(uw.p, ra.Name.String()) { 495 opts = append(opts, unit.NewUnitOption("Service", "Type", "notify")) 496 } 497 498 // Some pre-start jobs take a long time, set the timeout to 0 499 opts = append(opts, unit.NewUnitOption("Service", "TimeoutStartSec", "0")) 500 501 opts = append(opts, unit.NewUnitOption("Unit", "Requires", "sysusers.service")) 502 opts = append(opts, unit.NewUnitOption("Unit", "After", "sysusers.service")) 503 504 opts = uw.appSystemdUnit(pa, binPath, opts) 505 506 uw.WriteUnit(ServiceUnitPath(uw.p.Root, ra.Name), "failed to create service unit file", opts...) 507 uw.Activate(ServiceUnitName(ra.Name), ServiceWantPath(uw.p.Root, ra.Name)) 508 509 } 510 511 // appSystemdUnit sets up an application for isolation via systemd 512 func (uw *UnitWriter) appSystemdUnit(pa *preparedApp, binPath string, opts []*unit.UnitOption) []*unit.UnitOption { 513 if uw.err != nil { 514 return nil 515 } 516 517 flavor, systemdVersion, err := GetFlavor(uw.p) 518 if err != nil { 519 uw.err = errwrap.Wrap(errors.New("unable to determine stage1 flavor"), err) 520 return nil 521 } 522 523 ra := pa.app 524 app := ra.App 525 appName := ra.Name 526 imgName := uw.p.AppNameToImageName(ra.Name) 527 528 podAbsRoot, err := filepath.Abs(uw.p.Root) 529 if err != nil { 530 uw.err = err 531 return nil 532 } 533 534 var supplementaryGroups []string 535 for _, g := range app.SupplementaryGIDs { 536 supplementaryGroups = append(supplementaryGroups, strconv.Itoa(g)) 537 } 538 539 // Write env file 540 if err := common.WriteEnvFile(common.ComposeEnviron(pa.env), &uw.p.UidRange, EnvFilePath(uw.p.Root, pa.app.Name)); err != nil { 541 uw.err = errwrap.Wrapf("unable to write environment file", err) 542 return nil 543 } 544 545 execStart := append([]string{binPath}, app.Exec[1:]...) 546 execStartString := quoteExec(execStart) 547 opts = append(opts, 548 unit.NewUnitOption("Service", "ExecStart", execStartString), 549 unit.NewUnitOption("Service", "RootDirectory", common.RelAppRootfsPath(appName)), 550 unit.NewUnitOption("Service", "WorkingDirectory", app.WorkingDirectory), 551 unit.NewUnitOption("Service", "EnvironmentFile", RelEnvFilePath(appName)), 552 unit.NewUnitOption("Service", "User", strconv.Itoa(int(pa.uid))), 553 unit.NewUnitOption("Service", "Group", strconv.Itoa(int(pa.gid))), 554 unit.NewUnitOption("Service", "PermissionsStartOnly", "true"), 555 unit.NewUnitOption("Unit", "Requires", InstantiatedPrepareAppUnitName(ra.Name)), 556 unit.NewUnitOption("Unit", "After", InstantiatedPrepareAppUnitName(ra.Name)), 557 ) 558 559 if len(supplementaryGroups) > 0 { 560 opts = appendOptionsList(opts, "Service", "SupplementaryGroups", "", supplementaryGroups...) 561 } 562 563 if !uw.p.InsecureOptions.DisableCapabilities { 564 opts = append(opts, unit.NewUnitOption("Service", "CapabilityBoundingSet", strings.Join(pa.capabilities, " "))) 565 } 566 567 // Apply seccomp isolator, if any and not opt-ing out; 568 // see https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter= 569 if pa.seccomp != nil { 570 opts, err = seccompUnitOptions(opts, pa.seccomp) 571 if err != nil { 572 uw.err = errwrap.Wrapf("unable to apply seccomp options", err) 573 return nil 574 } 575 } 576 opts = append(opts, unit.NewUnitOption("Service", "NoNewPrivileges", strconv.FormatBool(pa.noNewPrivileges))) 577 578 if ra.ReadOnlyRootFS { 579 for _, m := range pa.mounts { 580 mntPath, err := EvaluateSymlinksInsideApp(podAbsRoot, m.Mount.Path) 581 if err != nil { 582 uw.err = err 583 return nil 584 } 585 586 if !m.ReadOnly { 587 rwDir := filepath.Join(common.RelAppRootfsPath(ra.Name), mntPath) 588 opts = appendOptionsList(opts, "Service", "ReadWriteDirectories", "", rwDir) 589 } 590 } 591 opts = appendOptionsList(opts, "Service", "ReadOnlyDirectories", "", common.RelAppRootfsPath(ra.Name)) 592 } 593 594 // Unless we have --insecure-options=paths, then do some path protections: 595 // 596 // * prevent access to sensitive kernel tunables 597 // * Run the app in a separate mount namespace 598 // 599 if !uw.p.InsecureOptions.DisablePaths { 600 // Systemd 231+ has InaccessiblePaths 601 // older versions only have InaccessibleDirectories 602 // Paths prepended with "-" are ignored if they don't exist. 603 if systemdVersion >= 231 { 604 opts = appendOptionsList(opts, "Service", "InaccessiblePaths", "-", pa.relAppPaths(pa.hiddenPaths)...) 605 opts = appendOptionsList(opts, "Service", "InaccessiblePaths", "-", pa.relAppPaths(pa.hiddenDirs)...) 606 opts = appendOptionsList(opts, "Service", "ReadOnlyPaths", "-", pa.relAppPaths(pa.roPaths)...) 607 } else { 608 opts = appendOptionsList(opts, "Service", "InaccessibleDirectories", "-", pa.relAppPaths(pa.hiddenDirs)...) 609 opts = appendOptionsList(opts, "Service", "ReadOnlyDirectories", "-", pa.relAppPaths(pa.roPaths)...) 610 } 611 612 if systemdVersion >= 233 { 613 // ProtectKernelTunables is introduced in systemd-v232 but didn't work 614 // until v233 due to a systemd bug, see 615 // https://github.com/systemd/systemd/pull/4594 616 // However, from v233, setting ProtectKernelTunables + RootDirectory causes 617 // MountAPIVFS to be enabled unconditionally, which we don't want. 618 // 619 // opts = append(opts, unit.NewUnitOption("Service", "ProtectKernelTunables", "true")) 620 621 // MountAPIVFS is introduced in systemd-233. Don't let systemd mount /sys: 622 // it is mounted by prepare-app (tested by TestVolumeSysfs) 623 opts = append(opts, unit.NewUnitOption("Service", "MountAPIVFS", "false")) 624 } 625 626 // MountFlags=shared creates a new mount namespace and (as unintuitive 627 // as it might seem) makes sure the mount is slave+shared. 628 opts = append(opts, unit.NewUnitOption("Service", "MountFlags", "shared")) 629 } 630 631 // Generate default device policy for the app, as well as the list of allowed devices. 632 // For kvm flavor, devices are VM-specific and restricting them is not strictly needed. 633 if !uw.p.InsecureOptions.DisablePaths && flavor != "kvm" { 634 opts = append(opts, unit.NewUnitOption("Service", "DevicePolicy", "closed")) 635 deviceAllows, err := generateDeviceAllows(common.Stage1RootfsPath(podAbsRoot), appName, app.MountPoints, pa.mounts, &uw.p.UidRange) 636 if err != nil { 637 uw.err = err 638 return nil 639 } 640 for _, dev := range deviceAllows { 641 opts = append(opts, unit.NewUnitOption("Service", "DeviceAllow", dev)) 642 } 643 } 644 645 for _, eh := range app.EventHandlers { 646 var typ string 647 switch eh.Name { 648 case "pre-start": 649 typ = "ExecStartPre" 650 case "post-stop": 651 typ = "ExecStopPost" 652 default: 653 uw.err = fmt.Errorf("unrecognized eventHandler: %v", eh.Name) 654 return nil 655 } 656 exec := quoteExec(eh.Exec) 657 opts = append(opts, unit.NewUnitOption("Service", typ, exec)) 658 } 659 660 // Resource isolators 661 if pa.resources.MemoryLimit != nil { 662 opts = append(opts, unit.NewUnitOption("Service", "MemoryLimit", strconv.FormatUint(*pa.resources.MemoryLimit, 10))) 663 } 664 if pa.resources.CPUQuota != nil { 665 quota := strconv.FormatUint(*pa.resources.CPUQuota, 10) + "%" 666 opts = append(opts, unit.NewUnitOption("Service", "CPUQuota", quota)) 667 } 668 if pa.resources.LinuxCPUShares != nil { 669 opts = append(opts, unit.NewUnitOption("Service", "CPUShares", strconv.FormatUint(*pa.resources.LinuxCPUShares, 10))) 670 } 671 if pa.resources.LinuxOOMScoreAdjust != nil { 672 opts = append(opts, unit.NewUnitOption("Service", "OOMScoreAdjust", strconv.Itoa(*pa.resources.LinuxOOMScoreAdjust))) 673 } 674 675 var saPorts []types.Port 676 for _, p := range ra.App.Ports { 677 if p.SocketActivated { 678 saPorts = append(saPorts, p) 679 } 680 } 681 682 if len(saPorts) > 0 { 683 sockopts := []*unit.UnitOption{ 684 unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v %s", appName, imgName, "socket-activated ports")), 685 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 686 unit.NewUnitOption("Socket", "BindIPv6Only", "both"), 687 unit.NewUnitOption("Socket", "Service", ServiceUnitName(appName)), 688 } 689 690 for _, sap := range saPorts { 691 var proto string 692 switch sap.Protocol { 693 case "tcp": 694 proto = "ListenStream" 695 case "udp": 696 proto = "ListenDatagram" 697 default: 698 uw.err = fmt.Errorf("unrecognized protocol: %v", sap.Protocol) 699 return nil 700 } 701 // We find the host port for the pod's port and use that in the 702 // socket unit file. 703 // This is so because systemd inside the pod will match based on 704 // the socket port number, and since the socket was created on the 705 // host, it will have the host port number. 706 port := findHostPort(*uw.p.Manifest, sap.Name) 707 if port == 0 { 708 log.Printf("warning: no --port option for socket-activated port %q, assuming port %d as specified in the manifest", sap.Name, sap.Port) 709 port = sap.Port 710 } 711 sockopts = append(sockopts, unit.NewUnitOption("Socket", proto, fmt.Sprintf("%v", port))) 712 } 713 714 file, err := os.OpenFile(SocketUnitPath(uw.p.Root, appName), os.O_WRONLY|os.O_CREATE, 0644) 715 if err != nil { 716 uw.err = errwrap.Wrap(errors.New("failed to create socket file"), err) 717 return nil 718 } 719 defer file.Close() 720 721 if _, err = io.Copy(file, unit.Serialize(sockopts)); err != nil { 722 uw.err = errwrap.Wrap(errors.New("failed to write socket unit file"), err) 723 return nil 724 } 725 726 if err = os.Symlink(path.Join("..", SocketUnitName(appName)), SocketWantPath(uw.p.Root, appName)); err != nil { 727 uw.err = errwrap.Wrap(errors.New("failed to link socket want"), err) 728 return nil 729 } 730 731 opts = append(opts, unit.NewUnitOption("Unit", "Requires", SocketUnitName(appName))) 732 } 733 return opts 734 } 735 736 // AppReaperUnit writes an app reaper service unit for the given app in the given path using the given unit options. 737 func (uw *UnitWriter) AppReaperUnit(appName types.ACName, binPath string, opts ...*unit.UnitOption) { 738 if uw.err != nil { 739 return 740 } 741 742 opts = append(opts, []*unit.UnitOption{ 743 unit.NewUnitOption("Unit", "Description", fmt.Sprintf("%s Reaper", appName)), 744 unit.NewUnitOption("Unit", "DefaultDependencies", "false"), 745 unit.NewUnitOption("Unit", "StopWhenUnneeded", "yes"), 746 unit.NewUnitOption("Unit", "Before", "halt.target"), 747 unit.NewUnitOption("Unit", "Conflicts", "exit.target"), 748 unit.NewUnitOption("Unit", "Conflicts", "halt.target"), 749 unit.NewUnitOption("Unit", "Conflicts", "poweroff.target"), 750 unit.NewUnitOption("Service", "RemainAfterExit", "yes"), 751 unit.NewUnitOption("Service", "ExecStop", fmt.Sprintf( 752 "/reaper.sh \"%s\" \"%s\" \"%s\"", 753 appName, 754 common.RelAppRootfsPath(appName), 755 binPath, 756 )), 757 }...) 758 759 uw.WriteUnit( 760 ServiceUnitPath(uw.p.Root, types.ACName(fmt.Sprintf("reaper-%s", appName))), 761 fmt.Sprintf("failed to write app %q reaper service", appName), 762 opts..., 763 ) 764 } 765 766 // AppSocketUnits writes a stream socket-unit for the given app in the given path. 767 func (uw *UnitWriter) AppSocketUnit(appName types.ACName, binPath string, streamName string, opts ...*unit.UnitOption) { 768 opts = append(opts, []*unit.UnitOption{ 769 unit.NewUnitOption("Unit", "Description", fmt.Sprintf("%s socket for %s", streamName, appName)), 770 unit.NewUnitOption("Unit", "DefaultDependencies", "no"), 771 unit.NewUnitOption("Unit", "StopWhenUnneeded", "yes"), 772 unit.NewUnitOption("Unit", "RefuseManualStart", "yes"), 773 unit.NewUnitOption("Unit", "RefuseManualStop", "yes"), 774 unit.NewUnitOption("Unit", "BindsTo", fmt.Sprintf("%s.service", appName)), 775 unit.NewUnitOption("Socket", "RemoveOnStop", "yes"), 776 unit.NewUnitOption("Socket", "Service", fmt.Sprintf("%s.service", appName)), 777 unit.NewUnitOption("Socket", "FileDescriptorName", streamName), 778 unit.NewUnitOption("Socket", "ListenFIFO", filepath.Join("/rkt/iottymux", appName.String(), "stage2-"+streamName)), 779 }...) 780 781 uw.WriteUnit( 782 TypedUnitPath(uw.p.Root, appName.String()+"-"+streamName, "socket"), 783 fmt.Sprintf("failed to write %s socket for %q service", streamName, appName), 784 opts..., 785 ) 786 } 787 788 // appendOptionsList updates an existing unit options list appending 789 // an array of new properties, one entry at a time. 790 // This is the preferred method to avoid hitting line length limits 791 // in unit files. Target property must support multi-line entries. 792 func appendOptionsList(opts []*unit.UnitOption, section, property, prefix string, vals ...string) []*unit.UnitOption { 793 for _, v := range vals { 794 opts = append(opts, unit.NewUnitOption(section, property, fmt.Sprintf("%s%s", prefix, v))) 795 } 796 return opts 797 }