github.com/rkt/rkt@v1.30.1-0.20200224141603-171c416fac02/stage1/init/common/units.go (about)

     1  // Copyright 2014 The rkt Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //+build linux
    16  
    17  package common
    18  
    19  import (
    20  	"errors"
    21  	"fmt"
    22  	"io"
    23  	"os"
    24  	"path"
    25  	"path/filepath"
    26  	"strconv"
    27  	"strings"
    28  
    29  	"github.com/appc/spec/schema"
    30  	"github.com/appc/spec/schema/types"
    31  	"github.com/rkt/rkt/common"
    32  	"github.com/rkt/rkt/pkg/user"
    33  	stage1commontypes "github.com/rkt/rkt/stage1/common/types"
    34  
    35  	"github.com/coreos/go-systemd/unit"
    36  	"github.com/hashicorp/errwrap"
    37  )
    38  
    39  // The maximum value for the MilliValue of an appc resource limit.
    40  const MaxMilliValue = int64(((1 << 63) - 1) / 1000)
    41  
    42  func MutableEnv(p *stage1commontypes.Pod) error {
    43  	w := NewUnitWriter(p)
    44  
    45  	w.WriteUnit(
    46  		TargetUnitPath(p.Root, "default"),
    47  		"failed to write default.target",
    48  		unit.NewUnitOption("Unit", "Description", "rkt apps target"),
    49  		unit.NewUnitOption("Unit", "DefaultDependencies", "false"),
    50  		unit.NewUnitOption("Unit", "Requires", "systemd-journald.service"),
    51  		unit.NewUnitOption("Unit", "After", "systemd-journald.service"),
    52  		unit.NewUnitOption("Unit", "Wants", "supervisor-ready.service"),
    53  		unit.NewUnitOption("Unit", "Before", "supervisor-ready.service"),
    54  		unit.NewUnitOption("Unit", "Before", "halt.target"),
    55  		unit.NewUnitOption("Unit", "Conflicts", "halt.target"),
    56  	)
    57  
    58  	w.WriteUnit(
    59  		ServiceUnitPath(p.Root, "prepare-app@"),
    60  		"failed to write prepare-app service template",
    61  		unit.NewUnitOption("Unit", "Description", "Prepare minimum environment for chrooted applications"),
    62  		unit.NewUnitOption("Unit", "DefaultDependencies", "false"),
    63  		unit.NewUnitOption("Unit", "OnFailureJobMode", "fail"),
    64  
    65  		// prepare-app is meant to be executed at most once.
    66  		// We must ensure that the prepare-app service unit remains started after the prepare-app binary exits
    67  		// such that it is not executed again during restarts of the target app.
    68  		unit.NewUnitOption("Service", "RemainAfterExit", "yes"),
    69  
    70  		unit.NewUnitOption("Service", "Type", "oneshot"),
    71  		unit.NewUnitOption("Service", "Restart", "no"),
    72  		unit.NewUnitOption("Service", "ExecStart", "/prepare-app %I"),
    73  		unit.NewUnitOption("Service", "User", "0"),
    74  		unit.NewUnitOption("Service", "Group", "0"),
    75  		unit.NewUnitOption("Service", "CapabilityBoundingSet", "CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_MKNOD"),
    76  	)
    77  
    78  	w.WriteUnit(
    79  		TargetUnitPath(p.Root, "halt"),
    80  		"failed to write halt target",
    81  		unit.NewUnitOption("Unit", "Description", "Halt"),
    82  		unit.NewUnitOption("Unit", "DefaultDependencies", "false"),
    83  		unit.NewUnitOption("Unit", "AllowIsolate", "true"),
    84  		unit.NewUnitOption("Unit", "Requires", "shutdown.service"),
    85  		unit.NewUnitOption("Unit", "After", "shutdown.service"),
    86  	)
    87  
    88  	w.writeShutdownService(
    89  		"ExecStart",
    90  		unit.NewUnitOption("Unit", "Description", "Pod shutdown"),
    91  		unit.NewUnitOption("Unit", "AllowIsolate", "true"),
    92  		unit.NewUnitOption("Unit", "DefaultDependencies", "false"),
    93  		unit.NewUnitOption("Service", "RemainAfterExit", "yes"),
    94  	)
    95  
    96  	w.Activate("systemd-journal-flush.service", ServiceWantPath(p.Root, "systemd-journal-flush"))
    97  
    98  	return w.Error()
    99  }
   100  
   101  func ImmutableEnv(p *stage1commontypes.Pod) error {
   102  	uw := NewUnitWriter(p)
   103  
   104  	opts := []*unit.UnitOption{
   105  		unit.NewUnitOption("Unit", "Description", "rkt apps target"),
   106  		unit.NewUnitOption("Unit", "DefaultDependencies", "false"),
   107  		unit.NewUnitOption("Unit", "Wants", "supervisor-ready.service"),
   108  		unit.NewUnitOption("Unit", "Before", "supervisor-ready.service"),
   109  	}
   110  
   111  	for i := range p.Manifest.Apps {
   112  		ra := &p.Manifest.Apps[i]
   113  		serviceName := ServiceUnitName(ra.Name)
   114  		opts = append(opts, unit.NewUnitOption("Unit", "After", serviceName))
   115  		opts = append(opts, unit.NewUnitOption("Unit", "Wants", serviceName))
   116  	}
   117  
   118  	uw.WriteUnit(
   119  		TargetUnitPath(p.Root, "default"),
   120  		"failed to write default.target",
   121  		opts...,
   122  	)
   123  
   124  	uw.WriteUnit(
   125  		ServiceUnitPath(p.Root, "prepare-app@"),
   126  		"failed to write prepare-app service template",
   127  		unit.NewUnitOption("Unit", "Description", "Prepare minimum environment for chrooted applications"),
   128  		unit.NewUnitOption("Unit", "DefaultDependencies", "false"),
   129  		unit.NewUnitOption("Unit", "OnFailureJobMode", "fail"),
   130  		unit.NewUnitOption("Unit", "Requires", "systemd-journald.service"),
   131  		unit.NewUnitOption("Unit", "After", "systemd-journald.service"),
   132  		unit.NewUnitOption("Service", "Type", "oneshot"),
   133  		unit.NewUnitOption("Service", "Restart", "no"),
   134  		unit.NewUnitOption("Service", "ExecStart", "/prepare-app %I"),
   135  		unit.NewUnitOption("Service", "User", "0"),
   136  		unit.NewUnitOption("Service", "Group", "0"),
   137  		unit.NewUnitOption("Service", "CapabilityBoundingSet", "CAP_SYS_ADMIN CAP_DAC_OVERRIDE CAP_MKNOD"),
   138  	)
   139  
   140  	uw.WriteUnit(
   141  		TargetUnitPath(p.Root, "halt"),
   142  		"failed to write halt target",
   143  		unit.NewUnitOption("Unit", "Description", "Halt"),
   144  		unit.NewUnitOption("Unit", "DefaultDependencies", "false"),
   145  		unit.NewUnitOption("Unit", "AllowIsolate", "true"),
   146  	)
   147  
   148  	uw.writeShutdownService(
   149  		"ExecStop",
   150  		unit.NewUnitOption("Unit", "Description", "Pod shutdown"),
   151  		unit.NewUnitOption("Unit", "AllowIsolate", "true"),
   152  		unit.NewUnitOption("Unit", "StopWhenUnneeded", "yes"),
   153  		unit.NewUnitOption("Unit", "DefaultDependencies", "false"),
   154  		unit.NewUnitOption("Service", "RemainAfterExit", "yes"),
   155  	)
   156  
   157  	if err := uw.Error(); err != nil {
   158  		return err
   159  	}
   160  
   161  	for i := range p.Manifest.Apps {
   162  		ra := &p.Manifest.Apps[i]
   163  
   164  		if ra.App.WorkingDirectory == "" {
   165  			ra.App.WorkingDirectory = "/"
   166  		}
   167  
   168  		binPath, err := FindBinPath(p, ra)
   169  		if err != nil {
   170  			return err
   171  		}
   172  
   173  		uw.AppUnit(ra, binPath,
   174  			unit.NewUnitOption("Unit", "After", "systemd-journald.service"),
   175  			// When an app fails, we shut down the pod
   176  			unit.NewUnitOption("Unit", "OnFailure", "halt.target"))
   177  
   178  		uw.AppReaperUnit(ra.Name, binPath,
   179  			unit.NewUnitOption("Service", "Environment", `"EXIT_POD=true"`),
   180  			unit.NewUnitOption("Unit", "Wants", "shutdown.service"),
   181  			unit.NewUnitOption("Unit", "After", "shutdown.service"),
   182  		)
   183  	}
   184  
   185  	uw.Activate("systemd-journal-flush.service", ServiceWantPath(p.Root, "systemd-journal-flush"))
   186  
   187  	return uw.Error()
   188  }
   189  
   190  // SetupAppIO prepares all properties related to streams (stdin/stdout/stderr) and TTY
   191  // for an application service unit.
   192  //
   193  // It works according to the following steps:
   194  //  1. short-circuit interactive pods and legacy systemd, for backward compatibility
   195  //  2. parse app-level annotations to determine stdin/stdout/stderr mode
   196  //     2a. if an annotation is missing/invalid, it fallbacks to legacy mode (in: null, out/err: journald)
   197  //     2b. if a valid annotation is found, it prepares:
   198  //          - TTY and stream properties for the systemd service unit
   199  //          - env variables for iottymux binary
   200  //  3. if any of stdin/stdout/stderr is in TTY or streaming mode:
   201  //     3a. the env file for iottymux is written to `/rkt/iottymux/<appname>/env` with the above content
   202  //     3b. for TTY mode, a `TTYPath` property and an `After=ttymux@<appname>.service` dependency are added
   203  //     3c. for streaming mode, a `Before=iomux@<appname>.service` dependency is added
   204  //
   205  // For complete details, see dev-docs at Documentation/devel/log-attach-design.md
   206  func (uw *UnitWriter) SetupAppIO(p *stage1commontypes.Pod, ra *schema.RuntimeApp, binPath string, opts ...*unit.UnitOption) []*unit.UnitOption {
   207  	if uw.err != nil {
   208  		return opts
   209  	}
   210  
   211  	if p.Interactive {
   212  		opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "tty"))
   213  		opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "tty"))
   214  		opts = append(opts, unit.NewUnitOption("Service", "StandardError", "tty"))
   215  		return opts
   216  	}
   217  
   218  	flavor, systemdVersion, err := GetFlavor(uw.p)
   219  	if err != nil {
   220  		uw.err = err
   221  		return opts
   222  	}
   223  
   224  	stdin, _ := ra.Annotations.Get(stage1commontypes.AppStdinMode)
   225  	stdout, _ := ra.Annotations.Get(stage1commontypes.AppStdoutMode)
   226  	stderr, _ := ra.Annotations.Get(stage1commontypes.AppStderrMode)
   227  
   228  	// Attach needs https://github.com/systemd/systemd/pull/4179, ie. systemd v232 or a backport
   229  	if ((flavor == "src" || flavor == "host") && systemdVersion < 232) ||
   230  		((flavor == "coreos" || flavor == "kvm") && systemdVersion < 231) {
   231  		// Explicit error if systemd is too old for attaching
   232  		if stdin != "" || stdout != "" || stderr != "" {
   233  			uw.err = fmt.Errorf("stage1 systemd %d does not support attachable I/O", systemdVersion)
   234  			return opts
   235  		}
   236  		opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "null"))
   237  		opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "journal+console"))
   238  		opts = append(opts, unit.NewUnitOption("Service", "StandardError", "journal+console"))
   239  		return opts
   240  	}
   241  
   242  	var iottymuxEnvFlags []string
   243  	needsIOMux := false
   244  	needsTTYMux := false
   245  
   246  	switch stdin {
   247  	case "stream":
   248  		needsIOMux = true
   249  		uw.AppSocketUnit(ra.Name, binPath, "stdin")
   250  		iottymuxEnvFlags = append(iottymuxEnvFlags, "STAGE2_STDIN=true")
   251  		opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "fd"))
   252  		opts = append(opts, unit.NewUnitOption("Service", "Sockets", fmt.Sprintf("%s-%s.socket", ra.Name, "stdin")))
   253  	case "tty":
   254  		needsTTYMux = true
   255  		iottymuxEnvFlags = append(iottymuxEnvFlags, "STAGE2_STDIN=true")
   256  		opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "tty-force"))
   257  	case "interactive":
   258  		opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "tty"))
   259  	default:
   260  		// null mode
   261  		opts = append(opts, unit.NewUnitOption("Service", "StandardInput", "null"))
   262  	}
   263  
   264  	switch stdout {
   265  	case "stream":
   266  		needsIOMux = true
   267  		uw.AppSocketUnit(ra.Name, binPath, "stdout")
   268  		iottymuxEnvFlags = append(iottymuxEnvFlags, "STAGE2_STDOUT=true")
   269  		opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "fd"))
   270  		opts = append(opts, unit.NewUnitOption("Service", "Sockets", fmt.Sprintf("%s-%s.socket", ra.Name, "stdout")))
   271  	case "tty":
   272  		needsTTYMux = true
   273  		iottymuxEnvFlags = append(iottymuxEnvFlags, "STAGE2_STDOUT=true")
   274  		opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "tty"))
   275  	case "interactive":
   276  		opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "tty"))
   277  	case "null":
   278  		opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "null"))
   279  	default:
   280  		// log mode
   281  		opts = append(opts, unit.NewUnitOption("Service", "StandardOutput", "journal+console"))
   282  	}
   283  
   284  	switch stderr {
   285  	case "stream":
   286  		needsIOMux = true
   287  		uw.AppSocketUnit(ra.Name, binPath, "stderr")
   288  		iottymuxEnvFlags = append(iottymuxEnvFlags, "STAGE2_STDERR=true")
   289  		opts = append(opts, unit.NewUnitOption("Service", "StandardError", "fd"))
   290  		opts = append(opts, unit.NewUnitOption("Service", "Sockets", fmt.Sprintf("%s-%s.socket", ra.Name, "stderr")))
   291  	case "tty":
   292  		needsTTYMux = true
   293  		iottymuxEnvFlags = append(iottymuxEnvFlags, "STAGE2_STDERR=true")
   294  		opts = append(opts, unit.NewUnitOption("Service", "StandardError", "tty"))
   295  	case "interactive":
   296  		opts = append(opts, unit.NewUnitOption("Service", "StandardError", "tty"))
   297  	case "null":
   298  		opts = append(opts, unit.NewUnitOption("Service", "StandardError", "null"))
   299  	default:
   300  		// log mode
   301  		opts = append(opts, unit.NewUnitOption("Service", "StandardError", "journal+console"))
   302  	}
   303  
   304  	// if at least one stream requires I/O muxing, an appropriate iottymux dependency needs to be setup
   305  	if needsIOMux || needsTTYMux {
   306  		// an env file is written here for iottymux, containing service configuration.
   307  		appIODir := IOMuxDir(p.Root, ra.Name)
   308  		os.MkdirAll(appIODir, 0644)
   309  		file, err := os.OpenFile(filepath.Join(appIODir, "env"), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
   310  		if err != nil {
   311  			uw.err = err
   312  			return nil
   313  		}
   314  		defer file.Close()
   315  
   316  		// env file specifies: debug verbosity, which streams to mux and whether a dedicated TTY is needed.
   317  		file.WriteString(fmt.Sprintf("STAGE2_TTY=%t\n", needsTTYMux))
   318  		file.WriteString(fmt.Sprintf("STAGE1_DEBUG=%t\n", p.Debug))
   319  		for _, l := range iottymuxEnvFlags {
   320  			file.WriteString(l + "\n")
   321  		}
   322  
   323  		if needsIOMux {
   324  			// streaming mode brings in a `iomux@.service` before-dependency
   325  			opts = append(opts, unit.NewUnitOption("Unit", "Requires", fmt.Sprintf("iomux@%s.service", ra.Name)))
   326  			opts = append(opts, unit.NewUnitOption("Unit", "Before", fmt.Sprintf("iomux@%s.service", ra.Name)))
   327  			logMode, ok := p.Manifest.Annotations.Get("coreos.com/rkt/experiment/logmode")
   328  			if ok {
   329  				file.WriteString(fmt.Sprintf("STAGE1_LOGMODE=%s\n", logMode))
   330  			}
   331  			switch logMode {
   332  			case "k8s-plain":
   333  				kubernetesLogPath, ok := ra.Annotations.Get("coreos.com/rkt/experiment/kubernetes-log-path")
   334  				if !ok {
   335  					uw.err = fmt.Errorf("kubernetes-log-path annotation needs to be specified when k8s-plain logging mode is used")
   336  					return nil
   337  				}
   338  				file.WriteString(fmt.Sprintf("KUBERNETES_LOG_PATH=%s\n", kubernetesLogPath))
   339  			}
   340  
   341  		} else if needsTTYMux {
   342  			// tty mode brings in a `ttymux@.service` after-dependency (it needs to create the TTY first)
   343  			opts = append(opts, unit.NewUnitOption("Service", "TTYPath", filepath.Join("/rkt/iottymux", ra.Name.String(), "stage2-pts")))
   344  			opts = append(opts, unit.NewUnitOption("Unit", "Requires", fmt.Sprintf("ttymux@%s.service", ra.Name)))
   345  			opts = append(opts, unit.NewUnitOption("Unit", "After", fmt.Sprintf("ttymux@%s.service", ra.Name)))
   346  		}
   347  	}
   348  	return opts
   349  }
   350  
   351  // UnitWriter is the type that writes systemd units preserving the first previously occurred error.
   352  // Any method of this type can be invoked multiple times without error checking.
   353  // If a previous invocation generated an error, any invoked method will be skipped.
   354  // If an error occurred during method invocations, it can be retrieved using Error().
   355  type UnitWriter struct {
   356  	err error
   357  	p   *stage1commontypes.Pod
   358  }
   359  
   360  // NewUnitWriter returns a new UnitWriter for the given pod.
   361  func NewUnitWriter(p *stage1commontypes.Pod) *UnitWriter {
   362  	return &UnitWriter{p: p}
   363  }
   364  
   365  // WriteUnit writes a systemd unit in the given path with the given unit options
   366  // if no previous error occurred.
   367  func (uw *UnitWriter) WriteUnit(path string, errmsg string, opts ...*unit.UnitOption) {
   368  	if uw.err != nil {
   369  		return
   370  	}
   371  
   372  	file, err := os.OpenFile(path, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
   373  	if err != nil {
   374  		uw.err = errwrap.Wrap(errors.New(errmsg), err)
   375  		return
   376  	}
   377  	defer file.Close()
   378  
   379  	if _, err = io.Copy(file, unit.Serialize(opts)); err != nil {
   380  		uw.err = errwrap.Wrap(errors.New(errmsg), err)
   381  		return
   382  	}
   383  	if err := user.ShiftFiles([]string{path}, &uw.p.UidRange); err != nil {
   384  		uw.err = errwrap.Wrap(errors.New(errmsg), err)
   385  		return
   386  	}
   387  }
   388  
   389  // writeShutdownService writes a shutdown.service unit with the given unit options
   390  // if no previous error occurred.
   391  // exec specifies how systemctl should be invoked, i.e. ExecStart, or ExecStop.
   392  func (uw *UnitWriter) writeShutdownService(exec string, opts ...*unit.UnitOption) {
   393  	if uw.err != nil {
   394  		return
   395  	}
   396  
   397  	flavor, systemdVersion, err := GetFlavor(uw.p)
   398  	if err != nil {
   399  		uw.err = errwrap.Wrap(errors.New("failed to create shutdown service"), err)
   400  		return
   401  	}
   402  
   403  	opts = append(opts, []*unit.UnitOption{
   404  		// The default stdout is /dev/console (the tty created by nspawn).
   405  		// But the tty might be destroyed if rkt is executed via ssh and
   406  		// the user terminates the ssh session. We still want
   407  		// shutdown.service to succeed in that case, so don't use
   408  		// /dev/console.
   409  		unit.NewUnitOption("Service", "StandardInput", "null"),
   410  		unit.NewUnitOption("Service", "StandardOutput", "null"),
   411  		unit.NewUnitOption("Service", "StandardError", "null"),
   412  	}...)
   413  
   414  	shutdownVerb := "exit"
   415  	// systemd <v227 doesn't allow the "exit" verb when running as PID 1, so
   416  	// use "halt".
   417  	// If systemdVersion is 0 it means it couldn't be guessed, assume it's new
   418  	// enough for "systemctl exit".
   419  	// This can happen, for example, when building rkt with:
   420  	//
   421  	// ./configure --with-stage1-flavors=src --with-stage1-systemd-version=master
   422  	//
   423  	// The patches for the "exit" verb are backported to the "coreos" flavor, so
   424  	// don't rely on the systemd version on the "coreos" flavor.
   425  	if flavor != "coreos" && systemdVersion != 0 && systemdVersion < 227 {
   426  		shutdownVerb = "halt"
   427  	}
   428  
   429  	opts = append(
   430  		opts,
   431  		unit.NewUnitOption("Service", exec, fmt.Sprintf("/usr/bin/systemctl --force %s", shutdownVerb)),
   432  	)
   433  
   434  	uw.WriteUnit(
   435  		ServiceUnitPath(uw.p.Root, "shutdown"),
   436  		"failed to create shutdown service",
   437  		opts...,
   438  	)
   439  }
   440  
   441  // Activate actives the given unit in the given wantPath.
   442  func (uw *UnitWriter) Activate(unit, wantPath string) {
   443  	if uw.err != nil {
   444  		return
   445  	}
   446  
   447  	if err := os.Symlink(path.Join("..", unit), wantPath); err != nil && !os.IsExist(err) {
   448  		uw.err = errwrap.Wrap(errors.New("failed to link service want"), err)
   449  	}
   450  }
   451  
   452  // error returns the first error that occurred during write* invocations.
   453  func (uw *UnitWriter) Error() error {
   454  	return uw.err
   455  }
   456  
   457  // AppUnit sets up the main systemd service unit for the application.
   458  func (uw *UnitWriter) AppUnit(ra *schema.RuntimeApp, binPath string, opts ...*unit.UnitOption) {
   459  	if uw.err != nil {
   460  		return
   461  	}
   462  
   463  	if len(ra.App.Exec) == 0 {
   464  		uw.err = fmt.Errorf(`image %q has an empty "exec" (try --exec=BINARY)`,
   465  			uw.p.AppNameToImageName(ra.Name))
   466  		return
   467  	}
   468  
   469  	pa, err := prepareApp(uw.p, ra)
   470  	if err != nil {
   471  		uw.err = err
   472  		return
   473  	}
   474  
   475  	appName := ra.Name.String()
   476  	imgName := uw.p.AppNameToImageName(ra.Name)
   477  	/* Write the generic unit options */
   478  	opts = append(opts, []*unit.UnitOption{
   479  		unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v", appName, imgName)),
   480  		unit.NewUnitOption("Unit", "DefaultDependencies", "false"),
   481  		unit.NewUnitOption("Unit", "Wants", fmt.Sprintf("reaper-%s.service", appName)),
   482  		unit.NewUnitOption("Service", "Restart", "no"),
   483  
   484  		// This helps working around a race
   485  		// (https://github.com/systemd/systemd/issues/2913) that causes the
   486  		// systemd unit name not getting written to the journal if the unit is
   487  		// short-lived and runs as non-root.
   488  		unit.NewUnitOption("Service", "SyslogIdentifier", appName),
   489  	}...)
   490  
   491  	// Setup I/O for iottymux (stdin/stdout/stderr)
   492  	opts = append(opts, uw.SetupAppIO(uw.p, ra, binPath)...)
   493  
   494  	if supportsNotify(uw.p, ra.Name.String()) {
   495  		opts = append(opts, unit.NewUnitOption("Service", "Type", "notify"))
   496  	}
   497  
   498  	// Some pre-start jobs take a long time, set the timeout to 0
   499  	opts = append(opts, unit.NewUnitOption("Service", "TimeoutStartSec", "0"))
   500  
   501  	opts = append(opts, unit.NewUnitOption("Unit", "Requires", "sysusers.service"))
   502  	opts = append(opts, unit.NewUnitOption("Unit", "After", "sysusers.service"))
   503  
   504  	opts = uw.appSystemdUnit(pa, binPath, opts)
   505  
   506  	uw.WriteUnit(ServiceUnitPath(uw.p.Root, ra.Name), "failed to create service unit file", opts...)
   507  	uw.Activate(ServiceUnitName(ra.Name), ServiceWantPath(uw.p.Root, ra.Name))
   508  
   509  }
   510  
   511  // appSystemdUnit sets up an application for isolation via systemd
   512  func (uw *UnitWriter) appSystemdUnit(pa *preparedApp, binPath string, opts []*unit.UnitOption) []*unit.UnitOption {
   513  	if uw.err != nil {
   514  		return nil
   515  	}
   516  
   517  	flavor, systemdVersion, err := GetFlavor(uw.p)
   518  	if err != nil {
   519  		uw.err = errwrap.Wrap(errors.New("unable to determine stage1 flavor"), err)
   520  		return nil
   521  	}
   522  
   523  	ra := pa.app
   524  	app := ra.App
   525  	appName := ra.Name
   526  	imgName := uw.p.AppNameToImageName(ra.Name)
   527  
   528  	podAbsRoot, err := filepath.Abs(uw.p.Root)
   529  	if err != nil {
   530  		uw.err = err
   531  		return nil
   532  	}
   533  
   534  	var supplementaryGroups []string
   535  	for _, g := range app.SupplementaryGIDs {
   536  		supplementaryGroups = append(supplementaryGroups, strconv.Itoa(g))
   537  	}
   538  
   539  	// Write env file
   540  	if err := common.WriteEnvFile(common.ComposeEnviron(pa.env), &uw.p.UidRange, EnvFilePath(uw.p.Root, pa.app.Name)); err != nil {
   541  		uw.err = errwrap.Wrapf("unable to write environment file", err)
   542  		return nil
   543  	}
   544  
   545  	execStart := append([]string{binPath}, app.Exec[1:]...)
   546  	execStartString := quoteExec(execStart)
   547  	opts = append(opts,
   548  		unit.NewUnitOption("Service", "ExecStart", execStartString),
   549  		unit.NewUnitOption("Service", "RootDirectory", common.RelAppRootfsPath(appName)),
   550  		unit.NewUnitOption("Service", "WorkingDirectory", app.WorkingDirectory),
   551  		unit.NewUnitOption("Service", "EnvironmentFile", RelEnvFilePath(appName)),
   552  		unit.NewUnitOption("Service", "User", strconv.Itoa(int(pa.uid))),
   553  		unit.NewUnitOption("Service", "Group", strconv.Itoa(int(pa.gid))),
   554  		unit.NewUnitOption("Service", "PermissionsStartOnly", "true"),
   555  		unit.NewUnitOption("Unit", "Requires", InstantiatedPrepareAppUnitName(ra.Name)),
   556  		unit.NewUnitOption("Unit", "After", InstantiatedPrepareAppUnitName(ra.Name)),
   557  	)
   558  
   559  	if len(supplementaryGroups) > 0 {
   560  		opts = appendOptionsList(opts, "Service", "SupplementaryGroups", "", supplementaryGroups...)
   561  	}
   562  
   563  	if !uw.p.InsecureOptions.DisableCapabilities {
   564  		opts = append(opts, unit.NewUnitOption("Service", "CapabilityBoundingSet", strings.Join(pa.capabilities, " ")))
   565  	}
   566  
   567  	// Apply seccomp isolator, if any and not opt-ing out;
   568  	// see https://www.freedesktop.org/software/systemd/man/systemd.exec.html#SystemCallFilter=
   569  	if pa.seccomp != nil {
   570  		opts, err = seccompUnitOptions(opts, pa.seccomp)
   571  		if err != nil {
   572  			uw.err = errwrap.Wrapf("unable to apply seccomp options", err)
   573  			return nil
   574  		}
   575  	}
   576  	opts = append(opts, unit.NewUnitOption("Service", "NoNewPrivileges", strconv.FormatBool(pa.noNewPrivileges)))
   577  
   578  	if ra.ReadOnlyRootFS {
   579  		for _, m := range pa.mounts {
   580  			mntPath, err := EvaluateSymlinksInsideApp(podAbsRoot, m.Mount.Path)
   581  			if err != nil {
   582  				uw.err = err
   583  				return nil
   584  			}
   585  
   586  			if !m.ReadOnly {
   587  				rwDir := filepath.Join(common.RelAppRootfsPath(ra.Name), mntPath)
   588  				opts = appendOptionsList(opts, "Service", "ReadWriteDirectories", "", rwDir)
   589  			}
   590  		}
   591  		opts = appendOptionsList(opts, "Service", "ReadOnlyDirectories", "", common.RelAppRootfsPath(ra.Name))
   592  	}
   593  
   594  	// Unless we have --insecure-options=paths, then do some path protections:
   595  	//
   596  	// * prevent access to sensitive kernel tunables
   597  	// * Run the app in a separate mount namespace
   598  	//
   599  	if !uw.p.InsecureOptions.DisablePaths {
   600  		// Systemd 231+ has InaccessiblePaths
   601  		// older versions only have InaccessibleDirectories
   602  		// Paths prepended with "-" are ignored if they don't exist.
   603  		if systemdVersion >= 231 {
   604  			opts = appendOptionsList(opts, "Service", "InaccessiblePaths", "-", pa.relAppPaths(pa.hiddenPaths)...)
   605  			opts = appendOptionsList(opts, "Service", "InaccessiblePaths", "-", pa.relAppPaths(pa.hiddenDirs)...)
   606  			opts = appendOptionsList(opts, "Service", "ReadOnlyPaths", "-", pa.relAppPaths(pa.roPaths)...)
   607  		} else {
   608  			opts = appendOptionsList(opts, "Service", "InaccessibleDirectories", "-", pa.relAppPaths(pa.hiddenDirs)...)
   609  			opts = appendOptionsList(opts, "Service", "ReadOnlyDirectories", "-", pa.relAppPaths(pa.roPaths)...)
   610  		}
   611  
   612  		if systemdVersion >= 233 {
   613  			// ProtectKernelTunables is introduced in systemd-v232 but didn't work
   614  			// until v233 due to a systemd bug, see
   615  			// https://github.com/systemd/systemd/pull/4594
   616  			// However, from v233, setting ProtectKernelTunables + RootDirectory causes
   617  			// MountAPIVFS to be enabled unconditionally, which we don't want.
   618  			//
   619  			// opts = append(opts, unit.NewUnitOption("Service", "ProtectKernelTunables", "true"))
   620  
   621  			// MountAPIVFS is introduced in systemd-233. Don't let systemd mount /sys:
   622  			// it is mounted by prepare-app (tested by TestVolumeSysfs)
   623  			opts = append(opts, unit.NewUnitOption("Service", "MountAPIVFS", "false"))
   624  		}
   625  
   626  		// MountFlags=shared creates a new mount namespace and (as unintuitive
   627  		// as it might seem) makes sure the mount is slave+shared.
   628  		opts = append(opts, unit.NewUnitOption("Service", "MountFlags", "shared"))
   629  	}
   630  
   631  	// Generate default device policy for the app, as well as the list of allowed devices.
   632  	// For kvm flavor, devices are VM-specific and restricting them is not strictly needed.
   633  	if !uw.p.InsecureOptions.DisablePaths && flavor != "kvm" {
   634  		opts = append(opts, unit.NewUnitOption("Service", "DevicePolicy", "closed"))
   635  		deviceAllows, err := generateDeviceAllows(common.Stage1RootfsPath(podAbsRoot), appName, app.MountPoints, pa.mounts, &uw.p.UidRange)
   636  		if err != nil {
   637  			uw.err = err
   638  			return nil
   639  		}
   640  		for _, dev := range deviceAllows {
   641  			opts = append(opts, unit.NewUnitOption("Service", "DeviceAllow", dev))
   642  		}
   643  	}
   644  
   645  	for _, eh := range app.EventHandlers {
   646  		var typ string
   647  		switch eh.Name {
   648  		case "pre-start":
   649  			typ = "ExecStartPre"
   650  		case "post-stop":
   651  			typ = "ExecStopPost"
   652  		default:
   653  			uw.err = fmt.Errorf("unrecognized eventHandler: %v", eh.Name)
   654  			return nil
   655  		}
   656  		exec := quoteExec(eh.Exec)
   657  		opts = append(opts, unit.NewUnitOption("Service", typ, exec))
   658  	}
   659  
   660  	// Resource isolators
   661  	if pa.resources.MemoryLimit != nil {
   662  		opts = append(opts, unit.NewUnitOption("Service", "MemoryLimit", strconv.FormatUint(*pa.resources.MemoryLimit, 10)))
   663  	}
   664  	if pa.resources.CPUQuota != nil {
   665  		quota := strconv.FormatUint(*pa.resources.CPUQuota, 10) + "%"
   666  		opts = append(opts, unit.NewUnitOption("Service", "CPUQuota", quota))
   667  	}
   668  	if pa.resources.LinuxCPUShares != nil {
   669  		opts = append(opts, unit.NewUnitOption("Service", "CPUShares", strconv.FormatUint(*pa.resources.LinuxCPUShares, 10)))
   670  	}
   671  	if pa.resources.LinuxOOMScoreAdjust != nil {
   672  		opts = append(opts, unit.NewUnitOption("Service", "OOMScoreAdjust", strconv.Itoa(*pa.resources.LinuxOOMScoreAdjust)))
   673  	}
   674  
   675  	var saPorts []types.Port
   676  	for _, p := range ra.App.Ports {
   677  		if p.SocketActivated {
   678  			saPorts = append(saPorts, p)
   679  		}
   680  	}
   681  
   682  	if len(saPorts) > 0 {
   683  		sockopts := []*unit.UnitOption{
   684  			unit.NewUnitOption("Unit", "Description", fmt.Sprintf("Application=%v Image=%v %s", appName, imgName, "socket-activated ports")),
   685  			unit.NewUnitOption("Unit", "DefaultDependencies", "false"),
   686  			unit.NewUnitOption("Socket", "BindIPv6Only", "both"),
   687  			unit.NewUnitOption("Socket", "Service", ServiceUnitName(appName)),
   688  		}
   689  
   690  		for _, sap := range saPorts {
   691  			var proto string
   692  			switch sap.Protocol {
   693  			case "tcp":
   694  				proto = "ListenStream"
   695  			case "udp":
   696  				proto = "ListenDatagram"
   697  			default:
   698  				uw.err = fmt.Errorf("unrecognized protocol: %v", sap.Protocol)
   699  				return nil
   700  			}
   701  			// We find the host port for the pod's port and use that in the
   702  			// socket unit file.
   703  			// This is so because systemd inside the pod will match based on
   704  			// the socket port number, and since the socket was created on the
   705  			// host, it will have the host port number.
   706  			port := findHostPort(*uw.p.Manifest, sap.Name)
   707  			if port == 0 {
   708  				log.Printf("warning: no --port option for socket-activated port %q, assuming port %d as specified in the manifest", sap.Name, sap.Port)
   709  				port = sap.Port
   710  			}
   711  			sockopts = append(sockopts, unit.NewUnitOption("Socket", proto, fmt.Sprintf("%v", port)))
   712  		}
   713  
   714  		file, err := os.OpenFile(SocketUnitPath(uw.p.Root, appName), os.O_WRONLY|os.O_CREATE, 0644)
   715  		if err != nil {
   716  			uw.err = errwrap.Wrap(errors.New("failed to create socket file"), err)
   717  			return nil
   718  		}
   719  		defer file.Close()
   720  
   721  		if _, err = io.Copy(file, unit.Serialize(sockopts)); err != nil {
   722  			uw.err = errwrap.Wrap(errors.New("failed to write socket unit file"), err)
   723  			return nil
   724  		}
   725  
   726  		if err = os.Symlink(path.Join("..", SocketUnitName(appName)), SocketWantPath(uw.p.Root, appName)); err != nil {
   727  			uw.err = errwrap.Wrap(errors.New("failed to link socket want"), err)
   728  			return nil
   729  		}
   730  
   731  		opts = append(opts, unit.NewUnitOption("Unit", "Requires", SocketUnitName(appName)))
   732  	}
   733  	return opts
   734  }
   735  
   736  // AppReaperUnit writes an app reaper service unit for the given app in the given path using the given unit options.
   737  func (uw *UnitWriter) AppReaperUnit(appName types.ACName, binPath string, opts ...*unit.UnitOption) {
   738  	if uw.err != nil {
   739  		return
   740  	}
   741  
   742  	opts = append(opts, []*unit.UnitOption{
   743  		unit.NewUnitOption("Unit", "Description", fmt.Sprintf("%s Reaper", appName)),
   744  		unit.NewUnitOption("Unit", "DefaultDependencies", "false"),
   745  		unit.NewUnitOption("Unit", "StopWhenUnneeded", "yes"),
   746  		unit.NewUnitOption("Unit", "Before", "halt.target"),
   747  		unit.NewUnitOption("Unit", "Conflicts", "exit.target"),
   748  		unit.NewUnitOption("Unit", "Conflicts", "halt.target"),
   749  		unit.NewUnitOption("Unit", "Conflicts", "poweroff.target"),
   750  		unit.NewUnitOption("Service", "RemainAfterExit", "yes"),
   751  		unit.NewUnitOption("Service", "ExecStop", fmt.Sprintf(
   752  			"/reaper.sh \"%s\" \"%s\" \"%s\"",
   753  			appName,
   754  			common.RelAppRootfsPath(appName),
   755  			binPath,
   756  		)),
   757  	}...)
   758  
   759  	uw.WriteUnit(
   760  		ServiceUnitPath(uw.p.Root, types.ACName(fmt.Sprintf("reaper-%s", appName))),
   761  		fmt.Sprintf("failed to write app %q reaper service", appName),
   762  		opts...,
   763  	)
   764  }
   765  
   766  // AppSocketUnits writes a stream socket-unit for the given app in the given path.
   767  func (uw *UnitWriter) AppSocketUnit(appName types.ACName, binPath string, streamName string, opts ...*unit.UnitOption) {
   768  	opts = append(opts, []*unit.UnitOption{
   769  		unit.NewUnitOption("Unit", "Description", fmt.Sprintf("%s socket for %s", streamName, appName)),
   770  		unit.NewUnitOption("Unit", "DefaultDependencies", "no"),
   771  		unit.NewUnitOption("Unit", "StopWhenUnneeded", "yes"),
   772  		unit.NewUnitOption("Unit", "RefuseManualStart", "yes"),
   773  		unit.NewUnitOption("Unit", "RefuseManualStop", "yes"),
   774  		unit.NewUnitOption("Unit", "BindsTo", fmt.Sprintf("%s.service", appName)),
   775  		unit.NewUnitOption("Socket", "RemoveOnStop", "yes"),
   776  		unit.NewUnitOption("Socket", "Service", fmt.Sprintf("%s.service", appName)),
   777  		unit.NewUnitOption("Socket", "FileDescriptorName", streamName),
   778  		unit.NewUnitOption("Socket", "ListenFIFO", filepath.Join("/rkt/iottymux", appName.String(), "stage2-"+streamName)),
   779  	}...)
   780  
   781  	uw.WriteUnit(
   782  		TypedUnitPath(uw.p.Root, appName.String()+"-"+streamName, "socket"),
   783  		fmt.Sprintf("failed to write %s socket for %q service", streamName, appName),
   784  		opts...,
   785  	)
   786  }
   787  
   788  // appendOptionsList updates an existing unit options list appending
   789  // an array of new properties, one entry at a time.
   790  // This is the preferred method to avoid hitting line length limits
   791  // in unit files. Target property must support multi-line entries.
   792  func appendOptionsList(opts []*unit.UnitOption, section, property, prefix string, vals ...string) []*unit.UnitOption {
   793  	for _, v := range vals {
   794  		opts = append(opts, unit.NewUnitOption(section, property, fmt.Sprintf("%s%s", prefix, v)))
   795  	}
   796  	return opts
   797  }