github.com/rkt/rkt@v1.30.1-0.20200224141603-171c416fac02/stage1/init/init.go (about)

     1  // Copyright 2014 The rkt Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //+build linux
    16  
    17  package main
    18  
    19  // this implements /init of stage1/nspawn+systemd
    20  
    21  import (
    22  	"errors"
    23  	"flag"
    24  	"fmt"
    25  	"io"
    26  	"io/ioutil"
    27  	"net"
    28  	"os"
    29  	"os/exec"
    30  	"path/filepath"
    31  	"runtime"
    32  	"strings"
    33  	"syscall"
    34  
    35  	"github.com/appc/goaci/proj2aci"
    36  	"github.com/appc/spec/schema/types"
    37  	"github.com/coreos/go-systemd/util"
    38  	"github.com/coreos/pkg/dlopen"
    39  	"github.com/godbus/dbus"
    40  	"github.com/godbus/dbus/introspect"
    41  	"github.com/hashicorp/errwrap"
    42  
    43  	stage1common "github.com/rkt/rkt/stage1/common"
    44  	stage1commontypes "github.com/rkt/rkt/stage1/common/types"
    45  	stage1initcommon "github.com/rkt/rkt/stage1/init/common"
    46  
    47  	"github.com/rkt/rkt/common"
    48  	"github.com/rkt/rkt/common/cgroup"
    49  	"github.com/rkt/rkt/common/cgroup/v1"
    50  	"github.com/rkt/rkt/common/cgroup/v2"
    51  	commonnet "github.com/rkt/rkt/common/networking"
    52  	"github.com/rkt/rkt/networking"
    53  	pkgflag "github.com/rkt/rkt/pkg/flag"
    54  	"github.com/rkt/rkt/pkg/fs"
    55  	rktlog "github.com/rkt/rkt/pkg/log"
    56  	"github.com/rkt/rkt/pkg/sys"
    57  	"github.com/rkt/rkt/pkg/user"
    58  	"github.com/rkt/rkt/stage1/init/kvm"
    59  	"github.com/rkt/rkt/stage1/init/kvm/hypervisor/hvlkvm"
    60  	"github.com/rkt/rkt/stage1/init/kvm/hypervisor/hvqemu"
    61  )
    62  
const (
	// nspawnBin is the path to the systemd-nspawn binary, relative to the
	// stage1 rootfs (it is joined onto the stage1 rootfs path before use).
	nspawnBin = "/usr/bin/systemd-nspawn"
	// localtimePath is the path to the localtime file/symlink on the host.
	localtimePath = "/etc/localtime"
)
    69  
    70  // mirrorLocalZoneInfo tries to reproduce the /etc/localtime target in stage1/ to satisfy systemd-nspawn
    71  func mirrorLocalZoneInfo(root string) {
    72  	zif, err := os.Readlink(localtimePath)
    73  	if err != nil {
    74  		return
    75  	}
    76  
    77  	// On some systems /etc/localtime is a relative symlink, make it absolute
    78  	if !filepath.IsAbs(zif) {
    79  		zif = filepath.Join(filepath.Dir(localtimePath), zif)
    80  		zif = filepath.Clean(zif)
    81  	}
    82  
    83  	src, err := os.Open(zif)
    84  	if err != nil {
    85  		return
    86  	}
    87  	defer src.Close()
    88  
    89  	destp := filepath.Join(common.Stage1RootfsPath(root), zif)
    90  
    91  	if err = os.MkdirAll(filepath.Dir(destp), 0755); err != nil {
    92  		return
    93  	}
    94  
    95  	dest, err := os.OpenFile(destp, os.O_CREATE|os.O_WRONLY, 0644)
    96  	if err != nil {
    97  		return
    98  	}
    99  	defer dest.Close()
   100  
   101  	_, _ = io.Copy(dest, src)
   102  }
   103  
var (
	// debug mirrors the -debug command line flag.
	debug       bool
	// localhostIP is 127.0.0.1, parsed once in init.
	localhostIP net.IP
	// localConfig mirrors the -local-config command line flag.
	localConfig string
	// log is the logger used for errors and warnings.
	// NOTE(review): initialised outside this part of the file — confirm.
	log         *rktlog.Logger
	// diag is the logger used for diagnostic output (args, env, cgroups).
	// NOTE(review): initialised outside this part of the file — confirm.
	diag        *rktlog.Logger
	interpBin   string // Path to the interpreter within the stage1 rootfs, set by the linker
)
   112  
   113  func parseFlags() *stage1commontypes.RuntimePod {
   114  	rp := stage1commontypes.RuntimePod{}
   115  
   116  	flag.BoolVar(&debug, "debug", false, "Run in debug mode")
   117  	flag.StringVar(&localConfig, "local-config", common.DefaultLocalConfigDir, "Local config path")
   118  
   119  	// These flags are persisted in the PodRuntime
   120  	flag.BoolVar(&rp.Interactive, "interactive", false, "The pod is interactive")
   121  	flag.BoolVar(&rp.Mutable, "mutable", false, "Enable mutable operations on this pod, including starting an empty one")
   122  	flag.Var(&rp.NetList, "net", "Setup networking")
   123  	flag.StringVar(&rp.PrivateUsers, "private-users", "", "Run within user namespace. Can be set to [=UIDBASE[:NUIDS]]")
   124  	flag.StringVar(&rp.MDSToken, "mds-token", "", "MDS auth token")
   125  	flag.StringVar(&rp.Hostname, "hostname", "", "Hostname of the pod")
   126  	flag.BoolVar(&rp.InsecureOptions.DisableCapabilities, "disable-capabilities-restriction", false, "Disable capability restrictions")
   127  	flag.BoolVar(&rp.InsecureOptions.DisablePaths, "disable-paths", false, "Disable paths restrictions")
   128  	flag.BoolVar(&rp.InsecureOptions.DisableSeccomp, "disable-seccomp", false, "Disable seccomp restrictions")
   129  	dnsConfMode := pkgflag.MustNewPairList(map[string][]string{
   130  		"resolv": {"host", "stage0", "none", "default"},
   131  		"hosts":  {"host", "stage0", "default"},
   132  	}, map[string]string{
   133  		"resolv": "default",
   134  		"hosts":  "default",
   135  	})
   136  	flag.Var(dnsConfMode, "dns-conf-mode", "DNS config file modes")
   137  	flag.StringVar(&rp.IPCMode, "ipc", "", "IPC mode --ipc=[auto|private|parent]")
   138  
   139  	flag.Parse()
   140  
   141  	rp.Debug = debug
   142  	rp.ResolvConfMode = dnsConfMode.Pairs["resolv"]
   143  	rp.EtcHostsMode = dnsConfMode.Pairs["hosts"]
   144  
   145  	return &rp
   146  }
   147  
   148  func init() {
   149  	// this ensures that main runs only on main thread (thread group leader).
   150  	// since namespace ops (unshare, setns) are done for a single thread, we
   151  	// must ensure that the goroutine does not jump from OS thread to thread
   152  	runtime.LockOSThread()
   153  
   154  	// We'll need this later
   155  	localhostIP = net.ParseIP("127.0.0.1")
   156  	if localhostIP == nil {
   157  		panic("localhost IP failed to parse")
   158  	}
   159  }
   160  
   161  // machinedRegister checks if nspawn should register the pod to machined
   162  func machinedRegister() bool {
   163  	// machined has a D-Bus interface following versioning guidelines, see:
   164  	// http://www.freedesktop.org/wiki/Software/systemd/machined/
   165  	// Therefore we can just check if the D-Bus method we need exists and we
   166  	// don't need to check the signature.
   167  	var found int
   168  
   169  	conn, err := dbus.SystemBus()
   170  	if err != nil {
   171  		return false
   172  	}
   173  	node, err := introspect.Call(conn.Object("org.freedesktop.machine1", "/org/freedesktop/machine1"))
   174  	if err != nil {
   175  		return false
   176  	}
   177  	for _, iface := range node.Interfaces {
   178  		if iface.Name != "org.freedesktop.machine1.Manager" {
   179  			continue
   180  		}
   181  		// machined v215 supports methods "RegisterMachine" and "CreateMachine" called by nspawn v215.
   182  		// machined v216+ (since commit 5aa4bb) additionally supports methods "CreateMachineWithNetwork"
   183  		// and "RegisterMachineWithNetwork", called by nspawn v216+.
   184  		for _, method := range iface.Methods {
   185  			if method.Name == "CreateMachineWithNetwork" || method.Name == "RegisterMachineWithNetwork" {
   186  				found++
   187  			}
   188  		}
   189  		break
   190  	}
   191  	return found == 2
   192  }
   193  
   194  func installAssets(systemdVersion int) error {
   195  	systemctlBin, err := common.LookupPath("systemctl", os.Getenv("PATH"))
   196  	if err != nil {
   197  		return err
   198  	}
   199  	systemdSysusersBin, err := common.LookupPath("systemd-sysusers", os.Getenv("PATH"))
   200  	if err != nil {
   201  		return err
   202  	}
   203  	systemdTmpfilesBin, err := common.LookupPath("systemd-tmpfiles", os.Getenv("PATH"))
   204  	if err != nil {
   205  		return err
   206  	}
   207  	bashBin, err := common.LookupPath("bash", os.Getenv("PATH"))
   208  	if err != nil {
   209  		return err
   210  	}
   211  	mountBin, err := common.LookupPath("mount", os.Getenv("PATH"))
   212  	if err != nil {
   213  		return err
   214  	}
   215  	umountBin, err := common.LookupPath("umount", os.Getenv("PATH"))
   216  	if err != nil {
   217  		return err
   218  	}
   219  	// More paths could be added in that list if some Linux distributions install it in a different path
   220  	// Note that we look in /usr/lib/... first because of the merge:
   221  	// http://www.freedesktop.org/wiki/Software/systemd/TheCaseForTheUsrMerge/
   222  	systemdShutdownBin, err := common.LookupPath("systemd-shutdown", "/usr/lib/systemd:/lib/systemd")
   223  	if err != nil {
   224  		return err
   225  	}
   226  	systemdBin, err := common.LookupPath("systemd", "/usr/lib/systemd:/lib/systemd")
   227  	if err != nil {
   228  		return err
   229  	}
   230  	systemdJournaldBin, err := common.LookupPath("systemd-journald", "/usr/lib/systemd:/lib/systemd")
   231  	if err != nil {
   232  		return err
   233  	}
   234  
   235  	systemdUnitsPath := "/lib/systemd/system"
   236  	assets := []string{
   237  		proj2aci.GetAssetString("/usr/lib/systemd/systemd", systemdBin),
   238  		proj2aci.GetAssetString("/usr/bin/systemctl", systemctlBin),
   239  		proj2aci.GetAssetString("/usr/bin/systemd-sysusers", systemdSysusersBin),
   240  		proj2aci.GetAssetString("/usr/bin/systemd-tmpfiles", systemdTmpfilesBin),
   241  		proj2aci.GetAssetString("/usr/lib/systemd/systemd-journald", systemdJournaldBin),
   242  		proj2aci.GetAssetString("/usr/bin/bash", bashBin),
   243  		proj2aci.GetAssetString("/bin/mount", mountBin),
   244  		proj2aci.GetAssetString("/bin/umount", umountBin),
   245  		proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald.service", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald.service", systemdUnitsPath)),
   246  		proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald.socket", systemdUnitsPath)),
   247  		proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald-dev-log.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald-dev-log.socket", systemdUnitsPath)),
   248  		proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald-audit.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald-audit.socket", systemdUnitsPath)),
   249  		// systemd-shutdown has to be installed at the same path as on the host
   250  		// because it depends on systemd build flag -DSYSTEMD_SHUTDOWN_BINARY_PATH=
   251  		proj2aci.GetAssetString(systemdShutdownBin, systemdShutdownBin),
   252  	}
   253  
   254  	// systemd-journal-flush.service was added in systemd-v233. Required to place
   255  	// the logs in /var/log/journal instead of /run/log/journal. See:
   256  	// https://github.com/systemd/systemd/commit/f78273c8dacf678cc8fd7387f678e6344a99405c
   257  	if systemdVersion >= 233 {
   258  		assets = append(assets, proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journal-flush.service", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald.service", systemdUnitsPath)))
   259  	}
   260  
   261  	return proj2aci.PrepareAssets(assets, "./stage1/rootfs/", nil)
   262  }
   263  
   264  // getArgsEnv returns the nspawn or lkvm args and env according to the flavor
   265  // as the first two return values respectively.
   266  func getArgsEnv(p *stage1commontypes.Pod, flavor string, canMachinedRegister bool, debug bool, n *networking.Networking, parentIPC bool) ([]string, []string, error) {
   267  	var args []string
   268  	env := os.Environ()
   269  
   270  	// We store the pod's flavor so we can later garbage collect it correctly
   271  	if err := os.Symlink(flavor, filepath.Join(p.Root, stage1initcommon.FlavorFile)); err != nil {
   272  		return nil, nil, errwrap.Wrap(errors.New("failed to create flavor symlink"), err)
   273  	}
   274  
   275  	// systemd-nspawn needs /etc/machine-id to link the container's journal
   276  	// to the host. Since systemd-v230, /etc/machine-id is mandatory, see
   277  	// https://github.com/systemd/systemd/commit/e01ff70a77e781734e1e73a2238af2e9bf7967a8
   278  	mPath := filepath.Join(common.Stage1RootfsPath(p.Root), "etc", "machine-id")
   279  	machineID := strings.Replace(p.UUID.String(), "-", "", -1)
   280  
   281  	switch flavor {
   282  	case "kvm":
   283  		if p.PrivateUsers != "" {
   284  			return nil, nil, fmt.Errorf("flag --private-users cannot be used with an lkvm stage1")
   285  		}
   286  
   287  		// kernel and hypervisor binaries are located relative to the working directory
   288  		// of init (/var/lib/rkt/..../uuid)
   289  		// TODO: move to path.go
   290  		kernelPath := filepath.Join(common.Stage1RootfsPath(p.Root), "kernel_image")
   291  		netDescriptions := kvm.GetNetworkDescriptions(n)
   292  
   293  		cpu, mem := kvm.GetAppsResources(p.Manifest.Apps)
   294  
   295  		// Parse hypervisor
   296  		hv, err := KvmCheckHypervisor(common.Stage1RootfsPath(p.Root))
   297  		if err != nil {
   298  			return nil, nil, err
   299  		}
   300  
   301  		// Set start command for hypervisor
   302  		StartCmd := hvlkvm.StartCmd
   303  		switch hv {
   304  		case "lkvm":
   305  			StartCmd = hvlkvm.StartCmd
   306  		case "qemu":
   307  			StartCmd = hvqemu.StartCmd
   308  		default:
   309  			return nil, nil, fmt.Errorf("unrecognized hypervisor")
   310  		}
   311  
   312  		hvStartCmd := StartCmd(
   313  			common.Stage1RootfsPath(p.Root),
   314  			p.UUID.String(),
   315  			kernelPath,
   316  			netDescriptions,
   317  			cpu,
   318  			mem,
   319  			debug,
   320  		)
   321  
   322  		if hvStartCmd == nil {
   323  			return nil, nil, fmt.Errorf("no hypervisor")
   324  		}
   325  
   326  		args = append(args, hvStartCmd...)
   327  
   328  		// lkvm requires $HOME to be defined,
   329  		// see https://github.com/rkt/rkt/issues/1393
   330  		if os.Getenv("HOME") == "" {
   331  			env = append(env, "HOME=/root")
   332  		}
   333  
   334  		if err := linkJournal(common.Stage1RootfsPath(p.Root), machineID); err != nil {
   335  			return nil, nil, errwrap.Wrap(errors.New("error linking pod's journal"), err)
   336  		}
   337  
   338  		// use only dynamic libraries provided in the image
   339  		// from systemd v231 there's a new internal libsystemd-shared-v231.so
   340  		// which is present in /usr/lib/systemd
   341  		env = append(env, "LD_LIBRARY_PATH="+filepath.Join(common.Stage1RootfsPath(p.Root), "usr/lib/systemd"))
   342  
   343  		return args, env, nil
   344  
   345  	case "coreos":
   346  		args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), interpBin))
   347  		args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), nspawnBin))
   348  		args = append(args, "--boot")             // Launch systemd in the pod
   349  		args = append(args, "--notify-ready=yes") // From systemd v231
   350  
   351  		if context := os.Getenv(common.EnvSELinuxContext); context != "" {
   352  			args = append(args, fmt.Sprintf("-Z%s", context))
   353  		}
   354  
   355  		if context := os.Getenv(common.EnvSELinuxMountContext); context != "" {
   356  			args = append(args, fmt.Sprintf("-L%s", context))
   357  		}
   358  
   359  		if canMachinedRegister {
   360  			args = append(args, fmt.Sprintf("--register=true"))
   361  		} else {
   362  			args = append(args, fmt.Sprintf("--register=false"))
   363  		}
   364  
   365  		kubernetesLogDir, ok := p.Manifest.Annotations.Get("coreos.com/rkt/experiment/kubernetes-log-dir")
   366  		if ok {
   367  			args = append(args, fmt.Sprintf("--bind=%s:/rkt/kubernetes/log", kubernetesLogDir))
   368  		}
   369  
   370  		// use only dynamic libraries provided in the image
   371  		// from systemd v231 there's a new internal libsystemd-shared-v231.so
   372  		// which is present in /usr/lib/systemd
   373  		env = append(env, "LD_LIBRARY_PATH="+
   374  			filepath.Join(common.Stage1RootfsPath(p.Root), "usr/lib")+":"+
   375  			filepath.Join(common.Stage1RootfsPath(p.Root), "usr/lib/systemd"))
   376  
   377  	case "src":
   378  		args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), interpBin))
   379  		args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), nspawnBin))
   380  		args = append(args, "--boot")             // Launch systemd in the pod
   381  		args = append(args, "--notify-ready=yes") // From systemd v231
   382  
   383  		if context := os.Getenv(common.EnvSELinuxContext); context != "" {
   384  			args = append(args, fmt.Sprintf("-Z%s", context))
   385  		}
   386  
   387  		if context := os.Getenv(common.EnvSELinuxMountContext); context != "" {
   388  			args = append(args, fmt.Sprintf("-L%s", context))
   389  		}
   390  
   391  		if canMachinedRegister {
   392  			args = append(args, fmt.Sprintf("--register=true"))
   393  		} else {
   394  			args = append(args, fmt.Sprintf("--register=false"))
   395  		}
   396  
   397  		kubernetesLogDir, ok := p.Manifest.Annotations.Get("coreos.com/rkt/experiment/kubernetes-log-dir")
   398  		if ok {
   399  			args = append(args, fmt.Sprintf("--bind=%s:/rkt/kubernetes/log", kubernetesLogDir))
   400  		}
   401  
   402  		// use only dynamic libraries provided in the image
   403  		// from systemd v231 there's a new internal libsystemd-shared-v231.so
   404  		// which is present in /usr/lib/systemd
   405  		env = append(env, "LD_LIBRARY_PATH="+
   406  			filepath.Join(common.Stage1RootfsPath(p.Root), "usr/lib")+":"+
   407  			filepath.Join(common.Stage1RootfsPath(p.Root), "usr/lib/systemd"))
   408  
   409  	case "host":
   410  		hostNspawnBin, err := common.LookupPath("systemd-nspawn", os.Getenv("PATH"))
   411  		if err != nil {
   412  			return nil, nil, err
   413  		}
   414  
   415  		// Check dynamically which version is installed on the host
   416  		// Support version >= 220
   417  		versionBytes, err := exec.Command(hostNspawnBin, "--version").CombinedOutput()
   418  		if err != nil {
   419  			return nil, nil, errwrap.Wrap(fmt.Errorf("unable to probe %s version", hostNspawnBin), err)
   420  		}
   421  		versionStr := strings.SplitN(string(versionBytes), "\n", 2)[0]
   422  		var version int
   423  		n, err := fmt.Sscanf(versionStr, "systemd %d", &version)
   424  		if err != nil {
   425  			return nil, nil, fmt.Errorf("cannot parse version: %q", versionStr)
   426  		}
   427  		if n != 1 || version < 220 {
   428  			return nil, nil, fmt.Errorf("rkt needs systemd-nspawn >= 220. %s version not supported: %v", hostNspawnBin, versionStr)
   429  		}
   430  
   431  		// Copy systemd, bash, etc. in stage1 at run-time
   432  		if err := installAssets(version); err != nil {
   433  			return nil, nil, errwrap.Wrap(errors.New("cannot install assets from the host"), err)
   434  		}
   435  
   436  		args = append(args, hostNspawnBin)
   437  		args = append(args, "--boot") // Launch systemd in the pod
   438  		args = append(args, fmt.Sprintf("--register=true"))
   439  
   440  		if version >= 231 {
   441  			args = append(args, "--notify-ready=yes") // From systemd v231
   442  		}
   443  
   444  		if context := os.Getenv(common.EnvSELinuxContext); context != "" {
   445  			args = append(args, fmt.Sprintf("-Z%s", context))
   446  		}
   447  
   448  		if context := os.Getenv(common.EnvSELinuxMountContext); context != "" {
   449  			args = append(args, fmt.Sprintf("-L%s", context))
   450  		}
   451  
   452  		kubernetesLogDir, ok := p.Manifest.Annotations.Get("coreos.com/rkt/experiment/kubernetes-log-dir")
   453  		if ok {
   454  			args = append(args, fmt.Sprintf("--bind=%s:/rkt/kubernetes/log", kubernetesLogDir))
   455  		}
   456  
   457  	default:
   458  		return nil, nil, fmt.Errorf("unrecognized stage1 flavor: %q", flavor)
   459  	}
   460  
   461  	machineIDBytes := append([]byte(machineID), '\n')
   462  	if err := ioutil.WriteFile(mPath, machineIDBytes, 0644); err != nil {
   463  		return nil, nil, errwrap.Wrap(errors.New("error writing /etc/machine-id"), err)
   464  	}
   465  	if err := user.ShiftFiles([]string{mPath}, &p.UidRange); err != nil {
   466  		return nil, nil, errwrap.Wrap(errors.New("error shifting /etc/machine-id"), err)
   467  	}
   468  
   469  	// link journal only if the host is running systemd
   470  	if util.IsRunningSystemd() {
   471  		args = append(args, "--link-journal=try-guest")
   472  
   473  		keepUnit, err := util.RunningFromSystemService()
   474  		if err != nil {
   475  			if err == dlopen.ErrSoNotFound {
   476  				log.Print("warning: libsystemd not found even though systemd is running. Cgroup limits set by the environment (e.g. a systemd service) won't be enforced.")
   477  			} else {
   478  				return nil, nil, errwrap.Wrap(errors.New("error determining if we're running from a system service"), err)
   479  			}
   480  		}
   481  
   482  		if keepUnit {
   483  			args = append(args, "--keep-unit")
   484  		}
   485  	} else {
   486  		args = append(args, "--link-journal=no")
   487  	}
   488  
   489  	if !debug {
   490  		args = append(args, "--quiet")             // silence most nspawn output (log_warning is currently not covered by this)
   491  		env = append(env, "SYSTEMD_LOG_LEVEL=err") // silence log_warning too
   492  	}
   493  
   494  	if parentIPC {
   495  		env = append(env, "SYSTEMD_NSPAWN_SHARE_NS_IPC=true")
   496  	}
   497  
   498  	env = append(env, "SYSTEMD_NSPAWN_CONTAINER_SERVICE=rkt")
   499  	// TODO (alepuccetti) remove this line when rkt will use cgroup namespace
   500  	// If the kernel has the cgroup namespace enabled, systemd v232 will use it by default.
   501  	// This was introduced by https://github.com/systemd/systemd/pull/3809 and it will cause
   502  	// problems in rkt when cgns is enabled and cgroup-v1 is used. For more information see
   503  	// https://github.com/systemd/systemd/pull/3589#discussion_r70277625.
   504  	// The following line tells systemd-nspawn not to use cgroup namespace using the environment variable
   505  	// introduced by https://github.com/systemd/systemd/pull/3809.
   506  	env = append(env, "SYSTEMD_NSPAWN_USE_CGNS=no")
   507  
   508  	if p.InsecureOptions.DisablePaths {
   509  		env = append(env, "SYSTEMD_NSPAWN_API_VFS_WRITABLE=yes")
   510  	}
   511  
   512  	if len(p.PrivateUsers) > 0 {
   513  		args = append(args, "--private-users="+p.PrivateUsers)
   514  	}
   515  
   516  	nsargs, err := stage1initcommon.PodToNspawnArgs(p)
   517  	if err != nil {
   518  		return nil, nil, errwrap.Wrap(errors.New("failed to generate nspawn args"), err)
   519  	}
   520  	args = append(args, nsargs...)
   521  
   522  	// Arguments to systemd
   523  	args = append(args, "--")
   524  	args = append(args, "--default-standard-output=tty") // redirect all service logs straight to tty
   525  	if !debug {
   526  		args = append(args, "--log-target=null") // silence systemd output inside pod
   527  		args = append(args, "--show-status=0")   // silence systemd initialization status output
   528  	}
   529  
   530  	return args, env, nil
   531  }
   532  
   533  func stage1(rp *stage1commontypes.RuntimePod) int {
   534  	uuid, err := types.NewUUID(flag.Arg(0))
   535  	if err != nil {
   536  		log.FatalE("UUID is missing or malformed", err)
   537  	}
   538  
   539  	root := "."
   540  	p, err := stage1commontypes.LoadPod(root, uuid, rp)
   541  	if err != nil {
   542  		log.FatalE("failed to load pod", err)
   543  	}
   544  
   545  	if err := p.SaveRuntime(); err != nil {
   546  		log.FatalE("failed to save runtime parameters", err)
   547  	}
   548  
   549  	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking
   550  	// network plugins
   551  	lfd, err := common.GetRktLockFD()
   552  	if err != nil {
   553  		log.FatalE("failed to get rkt lock fd", err)
   554  	}
   555  
   556  	if err := sys.CloseOnExec(lfd, true); err != nil {
   557  		log.FatalE("failed to set FD_CLOEXEC on rkt lock", err)
   558  	}
   559  
   560  	mirrorLocalZoneInfo(p.Root)
   561  
   562  	flavor, _, err := stage1initcommon.GetFlavor(p)
   563  	if err != nil {
   564  		log.FatalE("failed to get stage1 flavor", err)
   565  	}
   566  
   567  	var n *networking.Networking
   568  	if p.NetList.Contained() {
   569  		fps, err := commonnet.ForwardedPorts(p.Manifest)
   570  		if err != nil {
   571  			log.FatalE("error initializing forwarding ports", err)
   572  		}
   573  
   574  		noDNS := p.ResolvConfMode != "default" // force ignore CNI DNS results
   575  		n, err = networking.Setup(root, p.UUID, fps, p.NetList, localConfig, flavor, noDNS, debug)
   576  		if err != nil {
   577  			log.FatalE("failed to setup network", err)
   578  		}
   579  
   580  		if err = n.Save(); err != nil {
   581  			log.PrintE("failed to save networking state", err)
   582  			n.Teardown(flavor, debug)
   583  			return 254
   584  		}
   585  
   586  		if len(p.MDSToken) > 0 {
   587  			hostIP, err := n.GetForwardableNetHostIP()
   588  			if err != nil {
   589  				log.FatalE("failed to get default Host IP", err)
   590  			}
   591  
   592  			p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, p.MDSToken)
   593  		}
   594  	} else {
   595  		if flavor == "kvm" {
   596  			log.Fatal("flavor kvm requires private network configuration (try --net)")
   597  		}
   598  		if len(p.MDSToken) > 0 {
   599  			p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, p.MDSToken)
   600  		}
   601  	}
   602  
   603  	mnt := fs.NewLoggingMounter(
   604  		fs.MounterFunc(syscall.Mount),
   605  		fs.UnmounterFunc(syscall.Unmount),
   606  		diag.Printf,
   607  	)
   608  
   609  	// set hostname inside pod
   610  	// According to systemd manual (https://www.freedesktop.org/software/systemd/man/hostname.html) :
   611  	// "The /etc/hostname file configures the name of the local system that is set
   612  	// during boot using the sethostname system call"
   613  	if p.Hostname == "" {
   614  		p.Hostname = stage1initcommon.GetMachineID(p)
   615  	}
   616  	hostnamePath := filepath.Join(common.Stage1RootfsPath(p.Root), "etc/hostname")
   617  	if err := ioutil.WriteFile(hostnamePath, []byte(p.Hostname), 0644); err != nil {
   618  		log.PrintE("error writing "+hostnamePath, err)
   619  		return 254
   620  	}
   621  	if err := user.ShiftFiles([]string{hostnamePath}, &p.UidRange); err != nil {
   622  		log.PrintE("error shifting "+hostnamePath, err)
   623  	}
   624  
   625  	if p.ResolvConfMode == "host" {
   626  		stage1initcommon.UseHostResolv(mnt, root)
   627  	}
   628  
   629  	// Set up the hosts file.
   630  	// We write <stage1>/etc/rkt-hosts if we want to override each app's hosts,
   631  	// and <stage1>/etc/hosts-fallback if we want to let the app "win"
   632  	// Either way, we should add our hostname to it, unless the hosts's
   633  	// /etc/hosts is bind-mounted in.
   634  	if p.EtcHostsMode == "host" { // We should bind-mount the hosts's /etc/hosts
   635  		stage1initcommon.UseHostHosts(mnt, root)
   636  	} else if p.EtcHostsMode == "default" { // Create hosts-fallback
   637  		hostsFile := filepath.Join(common.Stage1RootfsPath(p.Root), "etc", "hosts-fallback")
   638  		if err := stage1initcommon.AddHostsEntry(hostsFile, "127.0.0.1", p.Hostname); err != nil {
   639  			log.PrintE("Failed to write hostname to "+hostsFile, err)
   640  			return 254
   641  		}
   642  	} else if p.EtcHostsMode == "stage0" { // The stage0 has created rkt-hosts
   643  		hostsFile := filepath.Join(common.Stage1RootfsPath(p.Root), "etc", "rkt-hosts")
   644  		if err := stage1initcommon.AddHostsEntry(hostsFile, "127.0.0.1", p.Hostname); err != nil {
   645  			log.PrintE("Failed to write hostname to "+hostsFile, err)
   646  			return 254
   647  		}
   648  	}
   649  
   650  	if p.Mutable {
   651  		if err = stage1initcommon.MutableEnv(p); err != nil {
   652  			log.FatalE("cannot initialize mutable environment", err)
   653  		}
   654  	} else {
   655  		if err = stage1initcommon.ImmutableEnv(p); err != nil {
   656  			log.FatalE("cannot initialize immutable environment", err)
   657  		}
   658  	}
   659  
   660  	if err := stage1initcommon.SetJournalPermissions(p); err != nil {
   661  		log.PrintE("warning: error setting journal ACLs, you'll need root to read the pod journal", err)
   662  	}
   663  
   664  	if flavor == "kvm" {
   665  		kvm.InitDebug(debug)
   666  		if err := KvmNetworkingToSystemd(p, n); err != nil {
   667  			log.FatalE("failed to configure systemd for kvm", err)
   668  		}
   669  	}
   670  
   671  	canMachinedRegister := false
   672  	if flavor != "kvm" {
   673  		// kvm doesn't register with systemd right now, see #2664.
   674  		canMachinedRegister = machinedRegister()
   675  	}
   676  	diag.Printf("canMachinedRegister %t", canMachinedRegister)
   677  
   678  	// --ipc=[auto|private|parent]
   679  	// default to private
   680  	parentIPC := false
   681  	switch p.IPCMode {
   682  	case "parent":
   683  		parentIPC = true
   684  	case "private":
   685  		parentIPC = false
   686  	case "auto":
   687  		fallthrough
   688  	case "":
   689  		parentIPC = false
   690  	default:
   691  		log.Fatalf("unknown value for --ipc parameter: %v", p.IPCMode)
   692  	}
   693  	if parentIPC && flavor == "kvm" {
   694  		log.Fatal("flavor kvm requires private IPC namespace (try to remove --ipc)")
   695  	}
   696  
   697  	args, env, err := getArgsEnv(p, flavor, canMachinedRegister, debug, n, parentIPC)
   698  	if err != nil {
   699  		log.FatalE("cannot get environment", err)
   700  	}
   701  	diag.Printf("args %q", args)
   702  	diag.Printf("env %q", env)
   703  
   704  	// create a separate mount namespace so the cgroup filesystems
   705  	// are unmounted when exiting the pod
   706  	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
   707  		log.FatalE("error unsharing", err)
   708  	}
   709  
   710  	// we recursively make / a "shared and slave" so mount events from the
   711  	// new namespace don't propagate to the host namespace but mount events
   712  	// from the host propagate to the new namespace and are forwarded to
   713  	// its peer group
   714  	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
   715  	if err := mnt.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil {
   716  		log.FatalE("error making / a slave mount", err)
   717  	}
   718  	if err := mnt.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
   719  		log.FatalE("error making / a shared and slave mount", err)
   720  	}
   721  
   722  	unifiedCgroup, err := cgroup.IsCgroupUnified("/")
   723  	if err != nil {
   724  		log.FatalE("error determining cgroup version", err)
   725  	}
   726  	diag.Printf("unifiedCgroup %t", unifiedCgroup)
   727  
   728  	machineID := stage1initcommon.GetMachineID(p)
   729  
   730  	subcgroup, err := getContainerSubCgroup(machineID, canMachinedRegister, unifiedCgroup)
   731  	if err != nil {
   732  		log.FatalE("error getting container subcgroup", err)
   733  	}
   734  	diag.Printf("subcgroup %q", subcgroup)
   735  
   736  	if err := ioutil.WriteFile(filepath.Join(p.Root, "subcgroup"),
   737  		[]byte(fmt.Sprintf("%s", subcgroup)), 0644); err != nil {
   738  		log.FatalE("cannot write subcgroup file", err)
   739  	}
   740  
   741  	if !unifiedCgroup {
   742  		enabledCgroups, err := v1.GetEnabledCgroups()
   743  		if err != nil {
   744  			log.FatalE("error getting v1 cgroups", err)
   745  		}
   746  		diag.Printf("enabledCgroups %q", enabledCgroups)
   747  
   748  		if err := mountHostV1Cgroups(mnt, enabledCgroups); err != nil {
   749  			log.FatalE("couldn't mount the host v1 cgroups", err)
   750  		}
   751  
   752  		if !canMachinedRegister {
   753  			if err := v1.JoinSubcgroup("systemd", subcgroup); err != nil {
   754  				log.FatalE(fmt.Sprintf("error joining subcgroup %q", subcgroup), err)
   755  			}
   756  		}
   757  
   758  		var serviceNames []string
   759  		for _, app := range p.Manifest.Apps {
   760  			serviceNames = append(serviceNames, stage1initcommon.ServiceUnitName(app.Name))
   761  		}
   762  		diag.Printf("serviceNames %q", serviceNames)
   763  
   764  		if err := mountContainerV1Cgroups(mnt, p, enabledCgroups, subcgroup, serviceNames); err != nil {
   765  			log.FatalE("couldn't mount the container v1 cgroups", err)
   766  		}
   767  
   768  	}
   769  
   770  	// KVM flavor has a bit different logic in handling pid vs ppid, for details look into #2389
   771  	// it doesn't require the existence of a "ppid", instead it registers the current pid (which
   772  	// will be reused by lkvm binary) as a pod process pid used during entering
   773  	pid_filename := "ppid"
   774  	if flavor == "kvm" {
   775  		pid_filename = "pid"
   776  	}
   777  
   778  	if err = stage1common.WritePid(os.Getpid(), pid_filename); err != nil {
   779  		log.FatalE("error writing pid", err)
   780  	}
   781  
   782  	if flavor == "kvm" {
   783  		if err := KvmPrepareMounts(p); err != nil {
   784  			log.FatalE("error preparing mounts", err)
   785  		}
   786  	}
   787  
   788  	err = stage1common.WithClearedCloExec(lfd, func() error {
   789  		return syscall.Exec(args[0], args, env)
   790  	})
   791  
   792  	if err != nil {
   793  		log.FatalE(fmt.Sprintf("failed to execute %q", args[0]), err)
   794  	}
   795  
   796  	return 0
   797  }
   798  
   799  func areHostV1CgroupsMounted(enabledV1Cgroups map[int][]string) bool {
   800  	controllers := v1.GetControllerDirs(enabledV1Cgroups)
   801  	for _, c := range controllers {
   802  		if mounted, _ := v1.IsControllerMounted(c); !mounted {
   803  			return false
   804  		}
   805  	}
   806  
   807  	return true
   808  }
   809  
   810  // mountHostV1Cgroups mounts the host v1 cgroup hierarchy as required by
   811  // systemd-nspawn. We need this because some distributions don't have the
   812  // "name=systemd" cgroup or don't mount the cgroup controllers in
   813  // "/sys/fs/cgroup", and systemd-nspawn needs this. Since this is mounted
   814  // inside the rkt mount namespace, it doesn't affect the host.
   815  func mountHostV1Cgroups(m fs.Mounter, enabledCgroups map[int][]string) error {
   816  	systemdControllerPath := "/sys/fs/cgroup/systemd"
   817  	if !areHostV1CgroupsMounted(enabledCgroups) {
   818  		mountContext := os.Getenv(common.EnvSELinuxMountContext)
   819  		if err := v1.CreateCgroups(m, "/", enabledCgroups, mountContext); err != nil {
   820  			return errwrap.Wrap(errors.New("error creating host cgroups"), err)
   821  		}
   822  	}
   823  
   824  	mounted, err := v1.IsControllerMounted("systemd")
   825  	if err != nil {
   826  		return err
   827  	}
   828  	if !mounted {
   829  		if err := os.MkdirAll(systemdControllerPath, 0700); err != nil {
   830  			return err
   831  		}
   832  		if err := m.Mount("cgroup", systemdControllerPath, "cgroup", 0, "none,name=systemd"); err != nil {
   833  			return errwrap.Wrap(fmt.Errorf("error mounting name=systemd hierarchy on %q", systemdControllerPath), err)
   834  		}
   835  	}
   836  
   837  	return nil
   838  }
   839  
   840  // mountContainerV1Cgroups mounts the cgroup controllers hierarchy in the container's
   841  // namespace read-only, leaving the needed knobs in the subcgroup for each-app
   842  // read-write so systemd inside stage1 can apply isolators to them
   843  func mountContainerV1Cgroups(m fs.Mounter, p *stage1commontypes.Pod, enabledCgroups map[int][]string, subcgroup string, serviceNames []string) error {
   844  	mountContext := os.Getenv(common.EnvSELinuxMountContext)
   845  	stage1Root := common.Stage1RootfsPath(p.Root)
   846  	if err := v1.CreateCgroups(m, stage1Root, enabledCgroups, mountContext); err != nil {
   847  		return errwrap.Wrap(errors.New("error creating container cgroups"), err)
   848  	}
   849  
   850  	if err := v1.RemountCgroups(m, stage1Root, enabledCgroups, subcgroup, p.InsecureOptions.DisablePaths); err != nil {
   851  		return errwrap.Wrap(errors.New("error restricting container cgroups"), err)
   852  	}
   853  
   854  	return nil
   855  }
   856  
   857  func getContainerSubCgroup(machineID string, canMachinedRegister, unified bool) (string, error) {
   858  	var fromUnit bool
   859  
   860  	if util.IsRunningSystemd() {
   861  		var err error
   862  		if fromUnit, err = util.RunningFromSystemService(); err != nil {
   863  			return "", errwrap.Wrap(errors.New("could not determine if we're running from a unit file"), err)
   864  		}
   865  	}
   866  
   867  	if fromUnit {
   868  		slice, err := util.GetRunningSlice()
   869  		if err != nil {
   870  			return "", errwrap.Wrap(errors.New("could not get slice name"), err)
   871  		}
   872  		slicePath, err := common.SliceToPath(slice)
   873  		if err != nil {
   874  			return "", errwrap.Wrap(errors.New("could not convert slice name to path"), err)
   875  		}
   876  		unit, err := util.CurrentUnitName()
   877  		if err != nil {
   878  			return "", errwrap.Wrap(errors.New("could not get unit name"), err)
   879  		}
   880  		subcgroup := filepath.Join(slicePath, unit)
   881  
   882  		if unified {
   883  			return filepath.Join(subcgroup, "payload"), nil
   884  		}
   885  
   886  		return subcgroup, nil
   887  	}
   888  
   889  	escapedmID := strings.Replace(machineID, "-", "\\x2d", -1)
   890  	machineDir := "machine-" + escapedmID + ".scope"
   891  
   892  	if canMachinedRegister {
   893  		// we are not in the final cgroup yet: systemd-nspawn will move us
   894  		// to the correct cgroup later during registration so we can't
   895  		// look it up in /proc/self/cgroup
   896  		return filepath.Join("machine.slice", machineDir), nil
   897  	}
   898  
   899  	if unified {
   900  		subcgroup, err := v2.GetOwnCgroupPath()
   901  		if err != nil {
   902  			return "", errwrap.Wrap(errors.New("could not get own v2 cgroup path"), err)
   903  		}
   904  		return subcgroup, nil
   905  	}
   906  
   907  	// when registration is disabled the container will be directly
   908  	// under the current cgroup so we can look it up in /proc/self/cgroup
   909  	// Try the systemd slice first, falling back to cpu if that fails (e.g. on
   910  	// systems not running systemd). See issue #3502.
   911  	ownV1CgroupPath, err := v1.GetOwnCgroupPath("name=systemd")
   912  	if err != nil {
   913  		ownV1CgroupPath, err = v1.GetOwnCgroupPath("cpu")
   914  		if err != nil {
   915  			return "", errwrap.Wrap(errors.New("could not get own v1 cgroup path"), err)
   916  		}
   917  	}
   918  
   919  	// systemd-nspawn won't work if we are in the root cgroup. In addition,
   920  	// we want all rkt instances to be in distinct cgroups. Create a
   921  	// subcgroup and add ourselves to it.
   922  	return filepath.Join(ownV1CgroupPath, machineDir), nil
   923  }
   924  
   925  func main() {
   926  	rp := parseFlags()
   927  	stage1initcommon.InitDebug(debug)
   928  
   929  	log, diag, _ = rktlog.NewLogSet("stage1", debug)
   930  	if !debug {
   931  		diag.SetOutput(ioutil.Discard)
   932  	}
   933  
   934  	// move code into stage1() helper so deferred fns get run
   935  	os.Exit(stage1(rp))
   936  }