github.com/blixtra/rkt@v0.8.1-0.20160204105720-ab0d1add1a43/stage1/init/init.go (about)

     1  // Copyright 2014 The rkt Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //+build linux
    16  
    17  package main
    18  
    19  // this implements /init of stage1/nspawn+systemd
    20  
    21  import (
    22  	"errors"
    23  	"flag"
    24  	"fmt"
    25  	"io"
    26  	"io/ioutil"
    27  	"net"
    28  	"os"
    29  	"os/exec"
    30  	"path/filepath"
    31  	"runtime"
    32  	"strconv"
    33  	"strings"
    34  	"syscall"
    35  
    36  	"github.com/appc/goaci/proj2aci"
    37  	"github.com/appc/spec/schema/types"
    38  	"github.com/coreos/go-systemd/util"
    39  	"github.com/godbus/dbus"
    40  	"github.com/godbus/dbus/introspect"
    41  	"github.com/hashicorp/errwrap"
    42  
    43  	stage1common "github.com/coreos/rkt/stage1/common"
    44  	stage1commontypes "github.com/coreos/rkt/stage1/common/types"
    45  	stage1initcommon "github.com/coreos/rkt/stage1/init/common"
    46  
    47  	"github.com/coreos/rkt/common"
    48  	"github.com/coreos/rkt/common/cgroup"
    49  	"github.com/coreos/rkt/networking"
    50  	rktlog "github.com/coreos/rkt/pkg/log"
    51  	"github.com/coreos/rkt/pkg/sys"
    52  	"github.com/coreos/rkt/stage1/init/kvm"
    53  )
    54  
const (
	// nspawnBin is the path to the systemd-nspawn binary within the stage1 rootfs.
	nspawnBin = "/usr/bin/systemd-nspawn"
	// interpBin is the path to the interpreter (dynamic loader) within the
	// stage1 rootfs; the "coreos" flavor puts it in front of nspawnBin on the
	// command line.
	interpBin = "/usr/lib/ld-linux-x86-64.so.2"
	// localtimePath is the path to the localtime file/symlink on the host.
	localtimePath = "/etc/localtime"
)
    63  
    64  // mirrorLocalZoneInfo tries to reproduce the /etc/localtime target in stage1/ to satisfy systemd-nspawn
    65  func mirrorLocalZoneInfo(root string) {
    66  	zif, err := os.Readlink(localtimePath)
    67  	if err != nil {
    68  		return
    69  	}
    70  
    71  	// On some systems /etc/localtime is a relative symlink, make it absolute
    72  	if !filepath.IsAbs(zif) {
    73  		zif = filepath.Join(filepath.Dir(localtimePath), zif)
    74  		zif = filepath.Clean(zif)
    75  	}
    76  
    77  	src, err := os.Open(zif)
    78  	if err != nil {
    79  		return
    80  	}
    81  	defer src.Close()
    82  
    83  	destp := filepath.Join(common.Stage1RootfsPath(root), zif)
    84  
    85  	if err = os.MkdirAll(filepath.Dir(destp), 0755); err != nil {
    86  		return
    87  	}
    88  
    89  	dest, err := os.OpenFile(destp, os.O_CREATE|os.O_WRONLY, 0644)
    90  	if err != nil {
    91  		return
    92  	}
    93  	defer dest.Close()
    94  
    95  	_, _ = io.Copy(dest, src)
    96  }
    97  
// Package-level state. debug, netList, interactive, privateUsers, mdsToken and
// localConfig are bound to command-line flags in init(); localhostIP is also
// set in init(); log and diag are created in main() before stage1() runs.
var (
	debug        bool
	netList      common.NetList
	interactive  bool
	privateUsers string
	mdsToken     string
	localhostIP  net.IP
	localConfig  string
	log          *rktlog.Logger
	diag         *rktlog.Logger
)
   109  
   110  func init() {
   111  	flag.BoolVar(&debug, "debug", false, "Run in debug mode")
   112  	flag.Var(&netList, "net", "Setup networking")
   113  	flag.BoolVar(&interactive, "interactive", false, "The pod is interactive")
   114  	flag.StringVar(&privateUsers, "private-users", "", "Run within user namespace. Can be set to [=UIDBASE[:NUIDS]]")
   115  	flag.StringVar(&mdsToken, "mds-token", "", "MDS auth token")
   116  	flag.StringVar(&localConfig, "local-config", common.DefaultLocalConfigDir, "Local config path")
   117  	// this ensures that main runs only on main thread (thread group leader).
   118  	// since namespace ops (unshare, setns) are done for a single thread, we
   119  	// must ensure that the goroutine does not jump from OS thread to thread
   120  	runtime.LockOSThread()
   121  
   122  	localhostIP = net.ParseIP("127.0.0.1")
   123  	if localhostIP == nil {
   124  		panic("localhost IP failed to parse")
   125  	}
   126  }
   127  
   128  // machinedRegister checks if nspawn should register the pod to machined
   129  func machinedRegister() bool {
   130  	// machined has a D-Bus interface following versioning guidelines, see:
   131  	// http://www.freedesktop.org/wiki/Software/systemd/machined/
   132  	// Therefore we can just check if the D-Bus method we need exists and we
   133  	// don't need to check the signature.
   134  	var found int
   135  
   136  	conn, err := dbus.SystemBus()
   137  	if err != nil {
   138  		return false
   139  	}
   140  	node, err := introspect.Call(conn.Object("org.freedesktop.machine1", "/org/freedesktop/machine1"))
   141  	if err != nil {
   142  		return false
   143  	}
   144  	for _, iface := range node.Interfaces {
   145  		if iface.Name != "org.freedesktop.machine1.Manager" {
   146  			continue
   147  		}
   148  		// machined v215 supports methods "RegisterMachine" and "CreateMachine" called by nspawn v215.
   149  		// machined v216+ (since commit 5aa4bb) additionally supports methods "CreateMachineWithNetwork"
   150  		// and "RegisterMachineWithNetwork", called by nspawn v216+.
   151  		for _, method := range iface.Methods {
   152  			if method.Name == "CreateMachineWithNetwork" || method.Name == "RegisterMachineWithNetwork" {
   153  				found++
   154  			}
   155  		}
   156  		break
   157  	}
   158  	return found == 2
   159  }
   160  
   161  func lookupPath(bin string, paths string) (string, error) {
   162  	pathsArr := filepath.SplitList(paths)
   163  	for _, path := range pathsArr {
   164  		binPath := filepath.Join(path, bin)
   165  		binAbsPath, err := filepath.Abs(binPath)
   166  		if err != nil {
   167  			return "", fmt.Errorf("unable to find absolute path for %s", binPath)
   168  		}
   169  		d, err := os.Stat(binAbsPath)
   170  		if err != nil {
   171  			continue
   172  		}
   173  		// Check the executable bit, inspired by os.exec.LookPath()
   174  		if m := d.Mode(); !m.IsDir() && m&0111 != 0 {
   175  			return binAbsPath, nil
   176  		}
   177  	}
   178  	return "", fmt.Errorf("unable to find %q in %q", bin, paths)
   179  }
   180  
   181  func installAssets() error {
   182  	systemctlBin, err := lookupPath("systemctl", os.Getenv("PATH"))
   183  	if err != nil {
   184  		return err
   185  	}
   186  	bashBin, err := lookupPath("bash", os.Getenv("PATH"))
   187  	if err != nil {
   188  		return err
   189  	}
   190  	// More paths could be added in that list if some Linux distributions install it in a different path
   191  	// Note that we look in /usr/lib/... first because of the merge:
   192  	// http://www.freedesktop.org/wiki/Software/systemd/TheCaseForTheUsrMerge/
   193  	systemdShutdownBin, err := lookupPath("systemd-shutdown", "/usr/lib/systemd:/lib/systemd")
   194  	if err != nil {
   195  		return err
   196  	}
   197  	systemdBin, err := lookupPath("systemd", "/usr/lib/systemd:/lib/systemd")
   198  	if err != nil {
   199  		return err
   200  	}
   201  	systemdJournaldBin, err := lookupPath("systemd-journald", "/usr/lib/systemd:/lib/systemd")
   202  	if err != nil {
   203  		return err
   204  	}
   205  
   206  	systemdUnitsPath := "/lib/systemd/system"
   207  	assets := []string{
   208  		proj2aci.GetAssetString("/usr/lib/systemd/systemd", systemdBin),
   209  		proj2aci.GetAssetString("/usr/bin/systemctl", systemctlBin),
   210  		proj2aci.GetAssetString("/usr/lib/systemd/systemd-journald", systemdJournaldBin),
   211  		proj2aci.GetAssetString("/usr/bin/bash", bashBin),
   212  		proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald.service", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald.service", systemdUnitsPath)),
   213  		proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald.socket", systemdUnitsPath)),
   214  		proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald-dev-log.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald-dev-log.socket", systemdUnitsPath)),
   215  		proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald-audit.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald-audit.socket", systemdUnitsPath)),
   216  		// systemd-shutdown has to be installed at the same path as on the host
   217  		// because it depends on systemd build flag -DSYSTEMD_SHUTDOWN_BINARY_PATH=
   218  		proj2aci.GetAssetString(systemdShutdownBin, systemdShutdownBin),
   219  	}
   220  
   221  	return proj2aci.PrepareAssets(assets, "./stage1/rootfs/", nil)
   222  }
   223  
   224  // getArgsEnv returns the nspawn or lkvm args and env according to the flavor used
   225  func getArgsEnv(p *stage1commontypes.Pod, flavor string, debug bool, n *networking.Networking) ([]string, []string, error) {
   226  	var args []string
   227  	env := os.Environ()
   228  
   229  	// We store the pod's flavor so we can later garbage collect it correctly
   230  	if err := os.Symlink(flavor, filepath.Join(p.Root, stage1initcommon.FlavorFile)); err != nil {
   231  		return nil, nil, errwrap.Wrap(errors.New("failed to create flavor symlink"), err)
   232  	}
   233  
   234  	switch flavor {
   235  	case "kvm":
   236  		if privateUsers != "" {
   237  			return nil, nil, fmt.Errorf("flag --private-users cannot be used with an lkvm stage1")
   238  		}
   239  
   240  		// kernel and lkvm are relative path, because init has /var/lib/rkt/..../uuid as its working directory
   241  		// TODO: move to path.go
   242  		kernelPath := filepath.Join(common.Stage1RootfsPath(p.Root), "bzImage")
   243  		lkvmPath := filepath.Join(common.Stage1RootfsPath(p.Root), "lkvm")
   244  		netDescriptions := kvm.GetNetworkDescriptions(n)
   245  		lkvmNetArgs, err := kvm.GetKVMNetArgs(netDescriptions)
   246  		if err != nil {
   247  			return nil, nil, err
   248  		}
   249  
   250  		cpu, mem := kvm.GetAppsResources(p.Manifest.Apps)
   251  
   252  		kernelParams := []string{
   253  			"console=hvc0",
   254  			"init=/usr/lib/systemd/systemd",
   255  			"no_timer_check",
   256  			"noreplace-smp",
   257  			"systemd.default_standard_error=journal+console",
   258  			"systemd.default_standard_output=journal+console",
   259  			// "systemd.default_standard_output=tty",
   260  			"tsc=reliable",
   261  			"MACHINEID=" + p.UUID.String(),
   262  		}
   263  
   264  		if debug {
   265  			kernelParams = append(kernelParams, []string{
   266  				"debug",
   267  				"systemd.log_level=debug",
   268  				"systemd.show_status=true",
   269  				// "systemd.confirm_spawn=true",
   270  			}...)
   271  		} else {
   272  			kernelParams = append(kernelParams, "quiet")
   273  		}
   274  
   275  		args = append(args, []string{
   276  			"./" + lkvmPath, // relative path
   277  			"run",
   278  			"--name", "rkt-" + p.UUID.String(),
   279  			"--no-dhcp", // speed bootup
   280  			"--cpu", strconv.FormatInt(cpu, 10),
   281  			"--mem", strconv.FormatInt(mem, 10),
   282  			"--console=virtio",
   283  			"--kernel", kernelPath,
   284  			"--disk", "stage1/rootfs", // relative to run/pods/uuid dir this is a place where systemd resides
   285  			// MACHINEID will be available as environment variable
   286  			"--params", strings.Join(kernelParams, " "),
   287  		}...,
   288  		)
   289  		args = append(args, lkvmNetArgs...)
   290  
   291  		if debug {
   292  			args = append(args, "--debug")
   293  		}
   294  
   295  		// host volume sharing with 9p
   296  		nsargs := stage1initcommon.VolumesToKvmDiskArgs(p.Manifest.Volumes)
   297  		args = append(args, nsargs...)
   298  
   299  		// lkvm requires $HOME to be defined,
   300  		// see https://github.com/coreos/rkt/issues/1393
   301  		if os.Getenv("HOME") == "" {
   302  			env = append(env, "HOME=/root")
   303  		}
   304  
   305  		return args, env, nil
   306  
   307  	case "coreos":
   308  		args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), interpBin))
   309  		args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), nspawnBin))
   310  		args = append(args, "--boot") // Launch systemd in the pod
   311  
   312  		if context := os.Getenv(common.EnvSELinuxContext); context != "" {
   313  			args = append(args, fmt.Sprintf("-Z%s", context))
   314  		}
   315  
   316  		if machinedRegister() {
   317  			args = append(args, fmt.Sprintf("--register=true"))
   318  		} else {
   319  			args = append(args, fmt.Sprintf("--register=false"))
   320  		}
   321  
   322  		// use only dynamic libraries provided in the image
   323  		env = append(env, "LD_LIBRARY_PATH="+filepath.Join(common.Stage1RootfsPath(p.Root), "usr/lib"))
   324  
   325  	case "src":
   326  		args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), nspawnBin))
   327  		args = append(args, "--boot") // Launch systemd in the pod
   328  
   329  		if context := os.Getenv(common.EnvSELinuxContext); context != "" {
   330  			args = append(args, fmt.Sprintf("-Z%s", context))
   331  		}
   332  
   333  		if machinedRegister() {
   334  			args = append(args, fmt.Sprintf("--register=true"))
   335  		} else {
   336  			args = append(args, fmt.Sprintf("--register=false"))
   337  		}
   338  
   339  	case "host":
   340  		hostNspawnBin, err := lookupPath("systemd-nspawn", os.Getenv("PATH"))
   341  		if err != nil {
   342  			return nil, nil, err
   343  		}
   344  
   345  		// Check dynamically which version is installed on the host
   346  		// Support version >= 220
   347  		versionBytes, err := exec.Command(hostNspawnBin, "--version").CombinedOutput()
   348  		if err != nil {
   349  			return nil, nil, errwrap.Wrap(fmt.Errorf("unable to probe %s version", hostNspawnBin), err)
   350  		}
   351  		versionStr := strings.SplitN(string(versionBytes), "\n", 2)[0]
   352  		var version int
   353  		n, err := fmt.Sscanf(versionStr, "systemd %d", &version)
   354  		if err != nil {
   355  			return nil, nil, fmt.Errorf("cannot parse version: %q", versionStr)
   356  		}
   357  		if n != 1 || version < 220 {
   358  			return nil, nil, fmt.Errorf("rkt needs systemd-nspawn >= 220. %s version not supported: %v", hostNspawnBin, versionStr)
   359  		}
   360  
   361  		// Copy systemd, bash, etc. in stage1 at run-time
   362  		if err := installAssets(); err != nil {
   363  			return nil, nil, errwrap.Wrap(errors.New("cannot install assets from the host"), err)
   364  		}
   365  
   366  		args = append(args, hostNspawnBin)
   367  		args = append(args, "--boot") // Launch systemd in the pod
   368  		args = append(args, fmt.Sprintf("--register=true"))
   369  
   370  		if context := os.Getenv(common.EnvSELinuxContext); context != "" {
   371  			args = append(args, fmt.Sprintf("-Z%s", context))
   372  		}
   373  
   374  	default:
   375  		return nil, nil, fmt.Errorf("unrecognized stage1 flavor: %q", flavor)
   376  	}
   377  
   378  	// link journal only if the host is running systemd
   379  	if util.IsRunningSystemd() {
   380  		// we write /etc/machine-id here because systemd-nspawn needs it to link
   381  		// the container's journal to the host
   382  		mPath := filepath.Join(common.Stage1RootfsPath(p.Root), "etc", "machine-id")
   383  		mID := strings.Replace(p.UUID.String(), "-", "", -1)
   384  
   385  		if err := ioutil.WriteFile(mPath, []byte(mID), 0644); err != nil {
   386  			log.FatalE("error writing /etc/machine-id", err)
   387  		}
   388  
   389  		args = append(args, "--link-journal=try-guest")
   390  
   391  		keepUnit, err := util.RunningFromSystemService()
   392  		if err != nil {
   393  			if err == util.ErrSoNotFound {
   394  				diag.Print("warning: libsystemd not found even though systemd is running. Cgroup limits set by the environment (e.g. a systemd service) won't be enforced.")
   395  			} else {
   396  				return nil, nil, errwrap.Wrap(errors.New("error determining if we're running from a system service"), err)
   397  			}
   398  		}
   399  
   400  		if keepUnit {
   401  			args = append(args, "--keep-unit")
   402  		}
   403  	}
   404  
   405  	if !debug {
   406  		args = append(args, "--quiet")             // silence most nspawn output (log_warning is currently not covered by this)
   407  		env = append(env, "SYSTEMD_LOG_LEVEL=err") // silence log_warning too
   408  	}
   409  
   410  	env = append(env, "SYSTEMD_NSPAWN_CONTAINER_SERVICE=rkt")
   411  
   412  	if len(privateUsers) > 0 {
   413  		args = append(args, "--private-users="+privateUsers)
   414  	}
   415  
   416  	nsargs, err := stage1initcommon.PodToNspawnArgs(p)
   417  	if err != nil {
   418  		return nil, nil, errwrap.Wrap(errors.New("failed to generate nspawn args"), err)
   419  	}
   420  	args = append(args, nsargs...)
   421  
   422  	// Arguments to systemd
   423  	args = append(args, "--")
   424  	args = append(args, "--default-standard-output=tty") // redirect all service logs straight to tty
   425  	if !debug {
   426  		args = append(args, "--log-target=null") // silence systemd output inside pod
   427  		// TODO remove --log-level=warning when we update stage1 to systemd v222
   428  		args = append(args, "--log-level=warning") // limit log output (systemd-shutdown ignores --log-target)
   429  		args = append(args, "--show-status=0")     // silence systemd initialization status output
   430  	}
   431  
   432  	return args, env, nil
   433  }
   434  
   435  func forwardedPorts(pod *stage1commontypes.Pod) ([]networking.ForwardedPort, error) {
   436  	var fps []networking.ForwardedPort
   437  
   438  	for _, ep := range pod.Manifest.Ports {
   439  		n := ""
   440  		fp := networking.ForwardedPort{}
   441  
   442  		for _, a := range pod.Manifest.Apps {
   443  			for _, p := range a.App.Ports {
   444  				if p.Name == ep.Name {
   445  					if n == "" {
   446  						fp.Protocol = p.Protocol
   447  						fp.HostPort = ep.HostPort
   448  						fp.PodPort = p.Port
   449  						n = a.Name.String()
   450  					} else {
   451  						return nil, fmt.Errorf("ambiguous exposed port in PodManifest: %q and %q both define port %q", n, a.Name, p.Name)
   452  					}
   453  				}
   454  			}
   455  		}
   456  
   457  		if n == "" {
   458  			return nil, fmt.Errorf("port name %q is not defined by any apps", ep.Name)
   459  		}
   460  
   461  		fps = append(fps, fp)
   462  	}
   463  
   464  	// TODO(eyakubovich): validate that there're no conflicts
   465  
   466  	return fps, nil
   467  }
   468  
// stage1 prepares the pod identified by the UUID in the first positional
// argument and then replaces the current process with the pod's runtime
// (the command built by getArgsEnv).
//
// It returns a process exit code; on a successful exec it does not return.
// Codes returned below:
//   1: bad UUID, pod load failure, or rkt lock fd failure
//   2: failure generating systemd units/targets
//   3: failure determining the flavor or building the exec arguments
//   4: failure writing the ppid
//   5: cgroup setup failure
//   6: networking failure
//   7: exec failure
//
// NOTE(review): several branches call log.FatalE, which presumably terminates
// the process itself; if so, the `return 5` statements that follow such calls
// are never reached — confirm against pkg/log.
func stage1() int {
	uuid, err := types.NewUUID(flag.Arg(0))
	if err != nil {
		log.PrintE("UUID is missing or malformed", err)
		return 1
	}

	// The pod is loaded relative to the current working directory.
	root := "."
	p, err := stage1commontypes.LoadPod(root, uuid)
	if err != nil {
		log.PrintE("failed to load pod", err)
		return 1
	}

	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking
	// network plugins
	lfd, err := common.GetRktLockFD()
	if err != nil {
		log.PrintE("failed to get rkt lock fd", err)
		return 1
	}

	if err := sys.CloseOnExec(lfd, true); err != nil {
		log.PrintE("failed to set FD_CLOEXEC on rkt lock", err)
		return 1
	}

	// Best-effort: failures to mirror the host zone info are ignored.
	mirrorLocalZoneInfo(p.Root)

	flavor, _, err := stage1initcommon.GetFlavor(p)
	if err != nil {
		log.PrintE("failed to get stage1 flavor", err)
		return 3
	}

	// n stays nil when the pod does not get its own (contained) network.
	var n *networking.Networking
	if netList.Contained() {
		fps, err := forwardedPorts(p)
		if err != nil {
			log.Error(err)
			return 6
		}

		n, err = networking.Setup(root, p.UUID, fps, netList, localConfig, flavor, debug)
		if err != nil {
			log.PrintE("failed to setup network", err)
			return 6
		}

		// Undo the network setup if its state cannot be persisted.
		if err = n.Save(); err != nil {
			log.PrintE("failed to save networking state", err)
			n.Teardown(flavor, debug)
			return 6
		}

		if len(mdsToken) > 0 {
			hostIP, err := n.GetDefaultHostIP()
			if err != nil {
				log.PrintE("failed to get default Host IP", err)
				return 6
			}

			p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, mdsToken)
		}
	} else {
		if flavor == "kvm" {
			log.Print("flavor kvm requires private network configuration (try --net)")
			return 6
		}
		// Without a contained network the metadata service URL uses localhost.
		if len(mdsToken) > 0 {
			p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, mdsToken)
		}
	}

	if err = stage1initcommon.WriteDefaultTarget(p); err != nil {
		log.PrintE("failed to write default.target", err)
		return 2
	}

	if err = stage1initcommon.WritePrepareAppTemplate(p); err != nil {
		log.PrintE("failed to write prepare-app service template", err)
		return 2
	}

	// Non-fatal: the pod still runs, only journal access is affected.
	if err := stage1initcommon.SetJournalPermissions(p); err != nil {
		log.PrintE("warning: error setting journal ACLs, you'll need root to read the pod journal", err)
	}

	if flavor == "kvm" {
		if err := KvmPodToSystemd(p, n); err != nil {
			log.PrintE("failed to configure systemd for kvm", err)
			return 2
		}
	}

	if err = stage1initcommon.PodToSystemd(p, interactive, flavor, privateUsers); err != nil {
		log.PrintE("failed to configure systemd", err)
		return 2
	}

	args, env, err := getArgsEnv(p, flavor, debug, n)
	if err != nil {
		log.Error(err)
		return 3
	}

	// create a separate mount namespace so the cgroup filesystems
	// are unmounted when exiting the pod
	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
		log.FatalE("error unsharing", err)
	}

	// we recursively make / a "shared and slave" so mount events from the
	// new namespace don't propagate to the host namespace but mount events
	// from the host propagate to the new namespace and are forwarded to
	// its peer group
	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil {
		log.FatalE("error making / a slave mount", err)
	}
	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
		log.FatalE("error making / a shared and slave mount", err)
	}

	enabledCgroups, err := cgroup.GetEnabledCgroups()
	if err != nil {
		log.FatalE("error getting cgroups", err)
		return 5
	}

	// mount host cgroups in the rkt mount namespace
	if err := mountHostCgroups(enabledCgroups); err != nil {
		log.FatalE("couldn't mount the host cgroups", err)
		return 5
	}

	// Collect one systemd service unit name per app; these are passed to
	// mountContainerCgroups below.
	var serviceNames []string
	for _, app := range p.Manifest.Apps {
		serviceNames = append(serviceNames, stage1initcommon.ServiceUnitName(app.Name))
	}
	s1Root := common.Stage1RootfsPath(p.Root)
	machineID := stage1initcommon.GetMachineID(p)
	subcgroup, err := getContainerSubCgroup(machineID)
	if err == nil {
		if err := mountContainerCgroups(s1Root, enabledCgroups, subcgroup, serviceNames); err != nil {
			log.PrintE("couldn't mount the container cgroups", err)
			return 5
		}
	} else {
		// Failing to determine the subcgroup is not fatal: the pod still
		// starts, without the container cgroup mounts.
		log.PrintE("continuing with per-app isolators disabled", err)
	}

	if err = stage1common.WritePpid(os.Getpid()); err != nil {
		log.Error(err)
		return 4
	}

	// Exec the pod runtime inside WithClearedCloExec, passing the lock fd
	// (presumably so the fd stays open across the exec — confirm in
	// stage1/common). syscall.Exec only returns on failure.
	err = stage1common.WithClearedCloExec(lfd, func() error {
		return syscall.Exec(args[0], args, env)
	})
	if err != nil {
		log.PrintE(fmt.Sprintf("failed to execute %q", args[0]), err)
		return 7
	}

	return 0
}
   636  
   637  func areHostCgroupsMounted(enabledCgroups map[int][]string) bool {
   638  	controllers := cgroup.GetControllerDirs(enabledCgroups)
   639  	for _, c := range controllers {
   640  		if !cgroup.IsControllerMounted(c) {
   641  			return false
   642  		}
   643  	}
   644  
   645  	return true
   646  }
   647  
   648  // mountHostCgroups mounts the host cgroup hierarchy as required by
   649  // systemd-nspawn. We need this because some distributions don't have the
   650  // "name=systemd" cgroup or don't mount the cgroup controllers in
   651  // "/sys/fs/cgroup", and systemd-nspawn needs this. Since this is mounted
   652  // inside the rkt mount namespace, it doesn't affect the host.
   653  func mountHostCgroups(enabledCgroups map[int][]string) error {
   654  	systemdControllerPath := "/sys/fs/cgroup/systemd"
   655  	if !areHostCgroupsMounted(enabledCgroups) {
   656  		if err := cgroup.CreateCgroups("/", enabledCgroups); err != nil {
   657  			return errwrap.Wrap(errors.New("error creating host cgroups"), err)
   658  		}
   659  	}
   660  
   661  	if !cgroup.IsControllerMounted("systemd") {
   662  		if err := os.MkdirAll(systemdControllerPath, 0700); err != nil {
   663  			return err
   664  		}
   665  		if err := syscall.Mount("cgroup", systemdControllerPath, "cgroup", 0, "none,name=systemd"); err != nil {
   666  			return errwrap.Wrap(fmt.Errorf("error mounting name=systemd hierarchy on %q", systemdControllerPath), err)
   667  		}
   668  	}
   669  
   670  	return nil
   671  }
   672  
   673  // mountContainerCgroups mounts the cgroup controllers hierarchy in the container's
   674  // namespace read-only, leaving the needed knobs in the subcgroup for each-app
   675  // read-write so systemd inside stage1 can apply isolators to them
   676  func mountContainerCgroups(s1Root string, enabledCgroups map[int][]string, subcgroup string, serviceNames []string) error {
   677  	if err := cgroup.CreateCgroups(s1Root, enabledCgroups); err != nil {
   678  		return errwrap.Wrap(errors.New("error creating container cgroups"), err)
   679  	}
   680  	if err := cgroup.RemountCgroupsRO(s1Root, enabledCgroups, subcgroup, serviceNames); err != nil {
   681  		return errwrap.Wrap(errors.New("error restricting container cgroups"), err)
   682  	}
   683  
   684  	return nil
   685  }
   686  
   687  func getContainerSubCgroup(machineID string) (string, error) {
   688  	var subcgroup string
   689  	fromUnit, err := util.RunningFromSystemService()
   690  	if err != nil {
   691  		return "", errwrap.Wrap(errors.New("could not determine if we're running from a unit file"), err)
   692  	}
   693  	if fromUnit {
   694  		slice, err := util.GetRunningSlice()
   695  		if err != nil {
   696  			return "", errwrap.Wrap(errors.New("could not get slice name"), err)
   697  		}
   698  		slicePath, err := common.SliceToPath(slice)
   699  		if err != nil {
   700  			return "", errwrap.Wrap(errors.New("could not convert slice name to path"), err)
   701  		}
   702  		unit, err := util.CurrentUnitName()
   703  		if err != nil {
   704  			return "", errwrap.Wrap(errors.New("could not get unit name"), err)
   705  		}
   706  		subcgroup = filepath.Join(slicePath, unit, "system.slice")
   707  	} else {
   708  		escapedmID := strings.Replace(machineID, "-", "\\x2d", -1)
   709  		machineDir := "machine-" + escapedmID + ".scope"
   710  		if machinedRegister() {
   711  			// we are not in the final cgroup yet: systemd-nspawn will move us
   712  			// to the correct cgroup later during registration so we can't
   713  			// look it up in /proc/self/cgroup
   714  			subcgroup = filepath.Join("machine.slice", machineDir, "system.slice")
   715  		} else {
   716  			// when registration is disabled the container will be directly
   717  			// under the current cgroup so we can look it up in /proc/self/cgroup
   718  			ownCgroupPath, err := cgroup.GetOwnCgroupPath("name=systemd")
   719  			if err != nil {
   720  				return "", errwrap.Wrap(errors.New("could not get own cgroup path"), err)
   721  			}
   722  			// systemd-nspawn won't work if we are in the root cgroup. In addition,
   723  			// we want all rkt instances to be in distinct cgroups. Create a
   724  			// subcgroup and add ourselves to it.
   725  			ownCgroupPath = filepath.Join(ownCgroupPath, machineDir)
   726  			if err := cgroup.JoinSubcgroup("systemd", ownCgroupPath); err != nil {
   727  				return "", errwrap.Wrap(fmt.Errorf("error joining %s subcgroup", ownCgroupPath), err)
   728  			}
   729  			subcgroup = filepath.Join(ownCgroupPath, "system.slice")
   730  		}
   731  	}
   732  
   733  	return subcgroup, nil
   734  }
   735  
   736  func main() {
   737  	flag.Parse()
   738  
   739  	stage1initcommon.InitDebug(debug)
   740  
   741  	log, diag, _ = rktlog.NewLogSet("stage1", debug)
   742  	if !debug {
   743  		diag.SetOutput(ioutil.Discard)
   744  	}
   745  
   746  	// move code into stage1() helper so deferred fns get run
   747  	os.Exit(stage1())
   748  }