github.com/stackdocker/rkt@v0.10.1-0.20151109095037-1aa827478248/stage1/init/init.go (about)

     1  // Copyright 2014 The rkt Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //+build linux
    16  
    17  package main
    18  
    19  // #cgo LDFLAGS: -ldl
    20  // #include <stdlib.h>
    21  // #include <dlfcn.h>
    22  // #include <sys/types.h>
    23  // #include <unistd.h>
    24  //
    25  // int
    26  // my_sd_pid_get_owner_uid(void *f, pid_t pid, uid_t *uid)
    27  // {
    28  //   int (*sd_pid_get_owner_uid)(pid_t, uid_t *);
    29  //
    30  //   sd_pid_get_owner_uid = (int (*)(pid_t, uid_t *))f;
    31  //   return sd_pid_get_owner_uid(pid, uid);
    32  // }
    33  //
    34  // int
    35  // my_sd_pid_get_unit(void *f, pid_t pid, char **unit)
    36  // {
    37  //   int (*sd_pid_get_unit)(pid_t, char **);
    38  //
    39  //   sd_pid_get_unit = (int (*)(pid_t, char **))f;
    40  //   return sd_pid_get_unit(pid, unit);
    41  // }
    42  //
    43  // int
    44  // my_sd_pid_get_slice(void *f, pid_t pid, char **slice)
    45  // {
    46  //   int (*sd_pid_get_slice)(pid_t, char **);
    47  //
    48  //   sd_pid_get_slice = (int (*)(pid_t, char **))f;
    49  //   return sd_pid_get_slice(pid, slice);
    50  // }
    51  //
    52  // int
    53  // am_session_leader()
    54  // {
    55  //   return (getsid(0) == getpid());
    56  // }
    57  import "C"
    58  
    59  // this implements /init of stage1/nspawn+systemd
    60  
    61  import (
    62  	"flag"
    63  	"fmt"
    64  	"io"
    65  	"io/ioutil"
    66  	"log"
    67  	"net"
    68  	"os"
    69  	"os/exec"
    70  	"path/filepath"
    71  	"runtime"
    72  	"strconv"
    73  	"strings"
    74  	"syscall"
    75  	"unsafe"
    76  
    77  	"github.com/coreos/rkt/Godeps/_workspace/src/github.com/appc/goaci/proj2aci"
    78  	"github.com/coreos/rkt/Godeps/_workspace/src/github.com/appc/spec/schema/types"
    79  	"github.com/coreos/rkt/Godeps/_workspace/src/github.com/coreos/go-systemd/util"
    80  	"github.com/coreos/rkt/Godeps/_workspace/src/github.com/godbus/dbus"
    81  	"github.com/coreos/rkt/Godeps/_workspace/src/github.com/godbus/dbus/introspect"
    82  
    83  	"github.com/coreos/rkt/common"
    84  	"github.com/coreos/rkt/common/cgroup"
    85  	"github.com/coreos/rkt/networking"
    86  	"github.com/coreos/rkt/pkg/sys"
    87  	"github.com/coreos/rkt/stage1/init/kvm"
    88  )
    89  
    90  const (
    91  	// Path to systemd-nspawn binary within the stage1 rootfs
    92  	nspawnBin = "/usr/bin/systemd-nspawn"
    93  	// Path to the interpreter within the stage1 rootfs
    94  	interpBin = "/usr/lib/ld-linux-x86-64.so.2"
    95  	// Path to the localtime file/symlink in host
    96  	localtimePath = "/etc/localtime"
    97  )
    98  
    99  // mirrorLocalZoneInfo tries to reproduce the /etc/localtime target in stage1/ to satisfy systemd-nspawn
   100  func mirrorLocalZoneInfo(root string) {
   101  	zif, err := os.Readlink(localtimePath)
   102  	if err != nil {
   103  		return
   104  	}
   105  
   106  	// On some systems /etc/localtime is a relative symlink, make it absolute
   107  	if !filepath.IsAbs(zif) {
   108  		zif = filepath.Join(filepath.Dir(localtimePath), zif)
   109  		zif = filepath.Clean(zif)
   110  	}
   111  
   112  	src, err := os.Open(zif)
   113  	if err != nil {
   114  		return
   115  	}
   116  	defer src.Close()
   117  
   118  	destp := filepath.Join(common.Stage1RootfsPath(root), zif)
   119  
   120  	if err = os.MkdirAll(filepath.Dir(destp), 0755); err != nil {
   121  		return
   122  	}
   123  
   124  	dest, err := os.OpenFile(destp, os.O_CREATE|os.O_WRONLY, 0644)
   125  	if err != nil {
   126  		return
   127  	}
   128  	defer dest.Close()
   129  
   130  	_, _ = io.Copy(dest, src)
   131  }
   132  
   133  var (
   134  	debug        bool
   135  	netList      common.NetList
   136  	interactive  bool
   137  	privateUsers string
   138  	mdsToken     string
   139  	localhostIP  net.IP
   140  	localConfig  string
   141  )
   142  
   143  func init() {
   144  	flag.BoolVar(&debug, "debug", false, "Run in debug mode")
   145  	flag.Var(&netList, "net", "Setup networking")
   146  	flag.BoolVar(&interactive, "interactive", false, "The pod is interactive")
   147  	flag.StringVar(&privateUsers, "private-users", "", "Run within user namespace. Can be set to [=UIDBASE[:NUIDS]]")
   148  	flag.StringVar(&mdsToken, "mds-token", "", "MDS auth token")
   149  	flag.StringVar(&localConfig, "local-config", common.DefaultLocalConfigDir, "Local config path")
   150  	// this ensures that main runs only on main thread (thread group leader).
   151  	// since namespace ops (unshare, setns) are done for a single thread, we
   152  	// must ensure that the goroutine does not jump from OS thread to thread
   153  	runtime.LockOSThread()
   154  
   155  	localhostIP = net.ParseIP("127.0.0.1")
   156  	if localhostIP == nil {
   157  		panic("localhost IP failed to parse")
   158  	}
   159  }
   160  
   161  // machinedRegister checks if nspawn should register the pod to machined
   162  func machinedRegister() bool {
   163  	// machined has a D-Bus interface following versioning guidelines, see:
   164  	// http://www.freedesktop.org/wiki/Software/systemd/machined/
   165  	// Therefore we can just check if the D-Bus method we need exists and we
   166  	// don't need to check the signature.
   167  	var found int
   168  
   169  	conn, err := dbus.SystemBus()
   170  	if err != nil {
   171  		return false
   172  	}
   173  	node, err := introspect.Call(conn.Object("org.freedesktop.machine1", "/org/freedesktop/machine1"))
   174  	if err != nil {
   175  		return false
   176  	}
   177  	for _, iface := range node.Interfaces {
   178  		if iface.Name != "org.freedesktop.machine1.Manager" {
   179  			continue
   180  		}
   181  		// machined v215 supports methods "RegisterMachine" and "CreateMachine" called by nspawn v215.
   182  		// machined v216+ (since commit 5aa4bb) additionally supports methods "CreateMachineWithNetwork"
   183  		// and "RegisterMachineWithNetwork", called by nspawn v216+.
   184  		for _, method := range iface.Methods {
   185  			if method.Name == "CreateMachineWithNetwork" || method.Name == "RegisterMachineWithNetwork" {
   186  				found++
   187  			}
   188  		}
   189  		break
   190  	}
   191  	return found == 2
   192  }
   193  
   194  func lookupPath(bin string, paths string) (string, error) {
   195  	pathsArr := filepath.SplitList(paths)
   196  	for _, path := range pathsArr {
   197  		binPath := filepath.Join(path, bin)
   198  		binAbsPath, err := filepath.Abs(binPath)
   199  		if err != nil {
   200  			return "", fmt.Errorf("unable to find absolute path for %s", binPath)
   201  		}
   202  		d, err := os.Stat(binAbsPath)
   203  		if err != nil {
   204  			continue
   205  		}
   206  		// Check the executable bit, inspired by os.exec.LookPath()
   207  		if m := d.Mode(); !m.IsDir() && m&0111 != 0 {
   208  			return binAbsPath, nil
   209  		}
   210  	}
   211  	return "", fmt.Errorf("unable to find %q in %q", bin, paths)
   212  }
   213  
   214  func installAssets() error {
   215  	systemctlBin, err := lookupPath("systemctl", os.Getenv("PATH"))
   216  	if err != nil {
   217  		return err
   218  	}
   219  	bashBin, err := lookupPath("bash", os.Getenv("PATH"))
   220  	if err != nil {
   221  		return err
   222  	}
   223  	// More paths could be added in that list if some Linux distributions install it in a different path
   224  	// Note that we look in /usr/lib/... first because of the merge:
   225  	// http://www.freedesktop.org/wiki/Software/systemd/TheCaseForTheUsrMerge/
   226  	systemdShutdownBin, err := lookupPath("systemd-shutdown", "/usr/lib/systemd:/lib/systemd")
   227  	if err != nil {
   228  		return err
   229  	}
   230  	systemdBin, err := lookupPath("systemd", "/usr/lib/systemd:/lib/systemd")
   231  	if err != nil {
   232  		return err
   233  	}
   234  	systemdJournaldBin, err := lookupPath("systemd-journald", "/usr/lib/systemd:/lib/systemd")
   235  	if err != nil {
   236  		return err
   237  	}
   238  
   239  	systemdUnitsPath := "/usr/lib/systemd/system"
   240  	assets := []string{
   241  		proj2aci.GetAssetString("/usr/lib/systemd/systemd", systemdBin),
   242  		proj2aci.GetAssetString("/usr/bin/systemctl", systemctlBin),
   243  		proj2aci.GetAssetString("/usr/lib/systemd/systemd-journald", systemdJournaldBin),
   244  		proj2aci.GetAssetString("/usr/bin/bash", bashBin),
   245  		proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald.service", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald.service", systemdUnitsPath)),
   246  		proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald.socket", systemdUnitsPath)),
   247  		proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald-dev-log.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald-dev-log.socket", systemdUnitsPath)),
   248  		proj2aci.GetAssetString(fmt.Sprintf("%s/systemd-journald-audit.socket", systemdUnitsPath), fmt.Sprintf("%s/systemd-journald-audit.socket", systemdUnitsPath)),
   249  		// systemd-shutdown has to be installed at the same path as on the host
   250  		// because it depends on systemd build flag -DSYSTEMD_SHUTDOWN_BINARY_PATH=
   251  		proj2aci.GetAssetString(systemdShutdownBin, systemdShutdownBin),
   252  	}
   253  
   254  	return proj2aci.PrepareAssets(assets, "./stage1/rootfs/", nil)
   255  }
   256  
   257  // getArgsEnv returns the nspawn or lkvm args and env according to the flavor used
   258  func getArgsEnv(p *Pod, flavor string, debug bool, n *networking.Networking) ([]string, []string, error) {
   259  	var args []string
   260  	env := os.Environ()
   261  
   262  	// We store the pod's flavor so we can later garbage collect it correctly
   263  	if err := os.Symlink(flavor, filepath.Join(p.Root, flavorFile)); err != nil {
   264  		return nil, nil, fmt.Errorf("failed to create flavor symlink: %v", err)
   265  	}
   266  
   267  	switch flavor {
   268  	case "kvm":
   269  		if privateUsers != "" {
   270  			return nil, nil, fmt.Errorf("flag --private-users cannot be used with an lkvm stage1")
   271  		}
   272  
   273  		// kernel and lkvm are relative path, because init has /var/lib/rkt/..../uuid as its working directory
   274  		// TODO: move to path.go
   275  		kernelPath := filepath.Join(common.Stage1RootfsPath(p.Root), "bzImage")
   276  		lkvmPath := filepath.Join(common.Stage1RootfsPath(p.Root), "lkvm")
   277  		netDescriptions := kvm.GetNetworkDescriptions(n)
   278  		lkvmNetArgs, kernelNetParams, err := kvm.GetKVMNetArgs(netDescriptions)
   279  		if err != nil {
   280  			return nil, nil, err
   281  		}
   282  
   283  		// TODO: base on resource isolators
   284  		cpu := 1
   285  		mem := 128
   286  
   287  		kernelParams := []string{
   288  			"console=hvc0",
   289  			"init=/usr/lib/systemd/systemd",
   290  			"no_timer_check",
   291  			"noreplace-smp",
   292  			"systemd.default_standard_error=journal+console",
   293  			"systemd.default_standard_output=journal+console",
   294  			strings.Join(kernelNetParams, " "),
   295  			// "systemd.default_standard_output=tty",
   296  			"tsc=reliable",
   297  			"MACHINEID=" + p.UUID.String(),
   298  		}
   299  
   300  		if debug {
   301  			kernelParams = append(kernelParams, []string{
   302  				"debug",
   303  				"systemd.log_level=debug",
   304  				"systemd.show_status=true",
   305  				// "systemd.confirm_spawn=true",
   306  			}...)
   307  		} else {
   308  			kernelParams = append(kernelParams, "quiet")
   309  		}
   310  
   311  		args = append(args, []string{
   312  			"./" + lkvmPath, // relative path
   313  			"run",
   314  			"--name", "rkt-" + p.UUID.String(),
   315  			"--no-dhcp", // speed bootup
   316  			"--cpu", strconv.Itoa(cpu),
   317  			"--mem", strconv.Itoa(mem),
   318  			"--console=virtio",
   319  			"--kernel", kernelPath,
   320  			"--disk", "stage1/rootfs", // relative to run/pods/uuid dir this is a place where systemd resides
   321  			// MACHINEID will be available as environment variable
   322  			"--params", strings.Join(kernelParams, " "),
   323  		}...,
   324  		)
   325  		args = append(args, lkvmNetArgs...)
   326  
   327  		if debug {
   328  			args = append(args, "--debug")
   329  		}
   330  
   331  		// host volume sharing with 9p
   332  		nsargs := kvm.VolumesToKvmDiskArgs(p.Manifest.Volumes)
   333  		args = append(args, nsargs...)
   334  
   335  		// lkvm requires $HOME to be defined,
   336  		// see https://github.com/coreos/rkt/issues/1393
   337  		if os.Getenv("HOME") == "" {
   338  			env = append(env, "HOME=/root")
   339  		}
   340  
   341  		return args, env, nil
   342  
   343  	case "coreos":
   344  		args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), interpBin))
   345  		args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), nspawnBin))
   346  		args = append(args, "--boot") // Launch systemd in the pod
   347  
   348  		if context := os.Getenv(common.EnvSELinuxContext); context != "" {
   349  			args = append(args, fmt.Sprintf("-Z%s", context))
   350  		}
   351  
   352  		if machinedRegister() {
   353  			args = append(args, fmt.Sprintf("--register=true"))
   354  		} else {
   355  			args = append(args, fmt.Sprintf("--register=false"))
   356  		}
   357  
   358  		// use only dynamic libraries provided in the image
   359  		env = append(env, "LD_LIBRARY_PATH="+filepath.Join(common.Stage1RootfsPath(p.Root), "usr/lib"))
   360  
   361  	case "src":
   362  		args = append(args, filepath.Join(common.Stage1RootfsPath(p.Root), nspawnBin))
   363  		args = append(args, "--boot") // Launch systemd in the pod
   364  
   365  		if context := os.Getenv(common.EnvSELinuxContext); context != "" {
   366  			args = append(args, fmt.Sprintf("-Z%s", context))
   367  		}
   368  
   369  		if machinedRegister() {
   370  			args = append(args, fmt.Sprintf("--register=true"))
   371  		} else {
   372  			args = append(args, fmt.Sprintf("--register=false"))
   373  		}
   374  
   375  	case "host":
   376  		hostNspawnBin, err := lookupPath("systemd-nspawn", os.Getenv("PATH"))
   377  		if err != nil {
   378  			return nil, nil, err
   379  		}
   380  
   381  		// Check dynamically which version is installed on the host
   382  		// Support version >= 220
   383  		versionBytes, err := exec.Command(hostNspawnBin, "--version").CombinedOutput()
   384  		if err != nil {
   385  			return nil, nil, fmt.Errorf("unable to probe %s version: %v", hostNspawnBin, err)
   386  		}
   387  		versionStr := strings.SplitN(string(versionBytes), "\n", 2)[0]
   388  		var version int
   389  		n, err := fmt.Sscanf(versionStr, "systemd %d", &version)
   390  		if err != nil {
   391  			return nil, nil, fmt.Errorf("cannot parse version: %q", versionStr)
   392  		}
   393  		if n != 1 || version < 220 {
   394  			return nil, nil, fmt.Errorf("rkt needs systemd-nspawn >= 220. %s version not supported: %v", hostNspawnBin, versionStr)
   395  		}
   396  
   397  		// Copy systemd, bash, etc. in stage1 at run-time
   398  		if err := installAssets(); err != nil {
   399  			return nil, nil, fmt.Errorf("cannot install assets from the host: %v", err)
   400  		}
   401  
   402  		args = append(args, hostNspawnBin)
   403  		args = append(args, "--boot") // Launch systemd in the pod
   404  		args = append(args, fmt.Sprintf("--register=true"))
   405  
   406  		if context := os.Getenv(common.EnvSELinuxContext); context != "" {
   407  			args = append(args, fmt.Sprintf("-Z%s", context))
   408  		}
   409  
   410  	default:
   411  		return nil, nil, fmt.Errorf("unrecognized stage1 flavor: %q", flavor)
   412  	}
   413  
   414  	// link journal only if the host is running systemd
   415  	if util.IsRunningSystemd() {
   416  		// we write /etc/machine-id here because systemd-nspawn needs it to link
   417  		// the container's journal to the host
   418  		mPath := filepath.Join(common.Stage1RootfsPath(p.Root), "etc", "machine-id")
   419  		mId := strings.Replace(p.UUID.String(), "-", "", -1)
   420  
   421  		if err := ioutil.WriteFile(mPath, []byte(mId), 0644); err != nil {
   422  			log.Fatalf("error writing /etc/machine-id: %v\n", err)
   423  		}
   424  
   425  		args = append(args, "--link-journal=try-guest")
   426  	}
   427  
   428  	if !debug {
   429  		args = append(args, "--quiet")             // silence most nspawn output (log_warning is currently not covered by this)
   430  		env = append(env, "SYSTEMD_LOG_LEVEL=err") // silence log_warning too
   431  	}
   432  
   433  	if len(privateUsers) > 0 {
   434  		args = append(args, "--private-users="+privateUsers)
   435  	}
   436  
   437  	keepUnit, err := isRunningFromUnitFile()
   438  	if err != nil {
   439  		return nil, nil, fmt.Errorf("error determining if we're running from a unit file: %v", err)
   440  	}
   441  
   442  	if keepUnit {
   443  		args = append(args, "--keep-unit")
   444  	}
   445  
   446  	nsargs, err := p.PodToNspawnArgs()
   447  	if err != nil {
   448  		return nil, nil, fmt.Errorf("failed to generate nspawn args: %v", err)
   449  	}
   450  	args = append(args, nsargs...)
   451  
   452  	// Arguments to systemd
   453  	args = append(args, "--")
   454  	args = append(args, "--default-standard-output=tty") // redirect all service logs straight to tty
   455  	if !debug {
   456  		args = append(args, "--log-target=null") // silence systemd output inside pod
   457  		// TODO remove --log-level=warning when we update stage1 to systemd v222
   458  		args = append(args, "--log-level=warning") // limit log output (systemd-shutdown ignores --log-target)
   459  		args = append(args, "--show-status=0")     // silence systemd initialization status output
   460  	}
   461  
   462  	return args, env, nil
   463  }
   464  
   465  func withClearedCloExec(lfd int, f func() error) error {
   466  	err := sys.CloseOnExec(lfd, false)
   467  	if err != nil {
   468  		return err
   469  	}
   470  	defer sys.CloseOnExec(lfd, true)
   471  
   472  	return f()
   473  }
   474  
   475  func forwardedPorts(pod *Pod) ([]networking.ForwardedPort, error) {
   476  	var fps []networking.ForwardedPort
   477  
   478  	for _, ep := range pod.Manifest.Ports {
   479  		n := ""
   480  		fp := networking.ForwardedPort{}
   481  
   482  		for _, a := range pod.Manifest.Apps {
   483  			for _, p := range a.App.Ports {
   484  				if p.Name == ep.Name {
   485  					if n == "" {
   486  						fp.Protocol = p.Protocol
   487  						fp.HostPort = ep.HostPort
   488  						fp.PodPort = p.Port
   489  						n = a.Name.String()
   490  					} else {
   491  						return nil, fmt.Errorf("Ambiguous exposed port in PodManifest: %q and %q both define port %q", n, a.Name, p.Name)
   492  					}
   493  				}
   494  			}
   495  		}
   496  
   497  		if n == "" {
   498  			return nil, fmt.Errorf("Port name %q is not defined by any apps", ep.Name)
   499  		}
   500  
   501  		fps = append(fps, fp)
   502  	}
   503  
   504  	// TODO(eyakubovich): validate that there're no conflicts
   505  
   506  	return fps, nil
   507  }
   508  
   509  func writePpid(pid int) error {
   510  	// write ppid file as specified in
   511  	// Documentation/devel/stage1-implementors-guide.md
   512  	out, err := os.Getwd()
   513  	if err != nil {
   514  		return fmt.Errorf("Cannot get current working directory: %v\n", err)
   515  	}
   516  	// we are the parent of the process that is PID 1 in the container so we write our PID to "ppid"
   517  	err = ioutil.WriteFile(filepath.Join(out, "ppid"),
   518  		[]byte(fmt.Sprintf("%d\n", pid)), 0644)
   519  	if err != nil {
   520  		return fmt.Errorf("Cannot write ppid file: %v\n", err)
   521  	}
   522  	return nil
   523  }
   524  
   525  func stage1() int {
   526  	uuid, err := types.NewUUID(flag.Arg(0))
   527  	if err != nil {
   528  		fmt.Fprintln(os.Stderr, "UUID is missing or malformed")
   529  		return 1
   530  	}
   531  
   532  	root := "."
   533  	p, err := LoadPod(root, uuid)
   534  	if err != nil {
   535  		fmt.Fprintf(os.Stderr, "Failed to load pod: %v\n", err)
   536  		return 1
   537  	}
   538  
   539  	// set close-on-exec flag on RKT_LOCK_FD so it gets correctly closed when invoking
   540  	// network plugins
   541  	lfd, err := common.GetRktLockFD()
   542  	if err != nil {
   543  		fmt.Fprintf(os.Stderr, "Failed to get rkt lock fd: %v\n", err)
   544  		return 1
   545  	}
   546  
   547  	if err := sys.CloseOnExec(lfd, true); err != nil {
   548  		fmt.Fprintf(os.Stderr, "Failed to set FD_CLOEXEC on rkt lock: %v\n", err)
   549  		return 1
   550  	}
   551  
   552  	mirrorLocalZoneInfo(p.Root)
   553  
   554  	flavor, _, err := p.getFlavor()
   555  	if err != nil {
   556  		fmt.Fprintf(os.Stderr, "Failed to get stage1 flavor: %v\n", err)
   557  		return 3
   558  	}
   559  
   560  	var n *networking.Networking
   561  	if netList.Contained() {
   562  		fps, err := forwardedPorts(p)
   563  		if err != nil {
   564  			fmt.Fprintln(os.Stderr, err.Error())
   565  			return 6
   566  		}
   567  
   568  		n, err = networking.Setup(root, p.UUID, fps, netList, localConfig, flavor)
   569  		if err != nil {
   570  			fmt.Fprintf(os.Stderr, "Failed to setup network: %v\n", err)
   571  			return 6
   572  		}
   573  
   574  		if err = n.Save(); err != nil {
   575  			fmt.Fprintf(os.Stderr, "Failed to save networking state %v\n", err)
   576  			n.Teardown(flavor)
   577  			return 6
   578  		}
   579  
   580  		if len(mdsToken) > 0 {
   581  			hostIP, err := n.GetDefaultHostIP()
   582  			if err != nil {
   583  				fmt.Fprintf(os.Stderr, "Failed to get default Host IP: %v\n", err)
   584  				return 6
   585  			}
   586  
   587  			p.MetadataServiceURL = common.MetadataServicePublicURL(hostIP, mdsToken)
   588  		}
   589  	} else {
   590  		if flavor == "kvm" {
   591  			fmt.Fprintf(os.Stderr, "Flavor kvm requires private network configuration (try --net).\n")
   592  			return 6
   593  		}
   594  		if len(mdsToken) > 0 {
   595  			p.MetadataServiceURL = common.MetadataServicePublicURL(localhostIP, mdsToken)
   596  		}
   597  	}
   598  
   599  	if err = p.WriteDefaultTarget(); err != nil {
   600  		fmt.Fprintf(os.Stderr, "Failed to write default.target: %v\n", err)
   601  		return 2
   602  	}
   603  
   604  	if err = p.WritePrepareAppTemplate(); err != nil {
   605  		fmt.Fprintf(os.Stderr, "Failed to write prepare-app service template: %v\n", err)
   606  		return 2
   607  	}
   608  
   609  	if err = p.PodToSystemd(interactive, flavor, privateUsers); err != nil {
   610  		fmt.Fprintf(os.Stderr, "Failed to configure systemd: %v\n", err)
   611  		return 2
   612  	}
   613  
   614  	args, env, err := getArgsEnv(p, flavor, debug, n)
   615  	if err != nil {
   616  		fmt.Fprintf(os.Stderr, "Error: %v\n", err)
   617  		return 3
   618  	}
   619  
   620  	// create a separate mount namespace so the cgroup filesystems
   621  	// are unmounted when exiting the pod
   622  	if err := syscall.Unshare(syscall.CLONE_NEWNS); err != nil {
   623  		log.Fatalf("Error unsharing: %v", err)
   624  	}
   625  
   626  	// we recursively make / a "shared and slave" so mount events from the
   627  	// new namespace don't propagate to the host namespace but mount events
   628  	// from the host propagate to the new namespace and are forwarded to
   629  	// its peer group
   630  	// See https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
   631  	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SLAVE, ""); err != nil {
   632  		log.Fatalf("Error making / a slave mount: %v", err)
   633  	}
   634  	if err := syscall.Mount("", "/", "none", syscall.MS_REC|syscall.MS_SHARED, ""); err != nil {
   635  		log.Fatalf("Error making / a shared and slave mount: %v", err)
   636  	}
   637  
   638  	enabledCgroups, err := cgroup.GetEnabledCgroups()
   639  	if err != nil {
   640  		fmt.Fprintf(os.Stderr, "Error getting cgroups: %v", err)
   641  		return 5
   642  	}
   643  
   644  	// mount host cgroups in the rkt mount namespace
   645  	if err := mountHostCgroups(enabledCgroups); err != nil {
   646  		log.Fatalf("Couldn't mount the host cgroups: %v\n", err)
   647  		return 5
   648  	}
   649  
   650  	var serviceNames []string
   651  	for _, app := range p.Manifest.Apps {
   652  		serviceNames = append(serviceNames, ServiceUnitName(app.Name))
   653  	}
   654  	s1Root := common.Stage1RootfsPath(p.Root)
   655  	machineID := p.GetMachineID()
   656  	subcgroup, err := getContainerSubCgroup(machineID)
   657  	if err == nil {
   658  		if err := mountContainerCgroups(s1Root, enabledCgroups, subcgroup, serviceNames); err != nil {
   659  			fmt.Fprintf(os.Stderr, "Couldn't mount the container cgroups: %v\n", err)
   660  			return 5
   661  		}
   662  	} else {
   663  		fmt.Fprintf(os.Stderr, "Continuing with per-app isolators disabled: %v\n", err)
   664  	}
   665  
   666  	if err = writePpid(os.Getpid()); err != nil {
   667  		fmt.Fprintln(os.Stderr, err.Error())
   668  		return 4
   669  	}
   670  
   671  	err = withClearedCloExec(lfd, func() error {
   672  		return syscall.Exec(args[0], args, env)
   673  	})
   674  	if err != nil {
   675  		fmt.Fprintf(os.Stderr, "Failed to execute %q: %v\n", args[0], err)
   676  		return 7
   677  	}
   678  
   679  	return 0
   680  }
   681  
   682  func areHostCgroupsMounted(enabledCgroups map[int][]string) bool {
   683  	controllers := cgroup.GetControllerDirs(enabledCgroups)
   684  	for _, c := range controllers {
   685  		if !cgroup.IsControllerMounted(c) {
   686  			return false
   687  		}
   688  	}
   689  
   690  	return true
   691  }
   692  
   693  // mountHostCgroups mounts the host cgroup hierarchy as required by
   694  // systemd-nspawn. We need this because some distributions don't have the
   695  // "name=systemd" cgroup or don't mount the cgroup controllers in
   696  // "/sys/fs/cgroup", and systemd-nspawn needs this. Since this is mounted
   697  // inside the rkt mount namespace, it doesn't affect the host.
   698  func mountHostCgroups(enabledCgroups map[int][]string) error {
   699  	systemdControllerPath := "/sys/fs/cgroup/systemd"
   700  	if !areHostCgroupsMounted(enabledCgroups) {
   701  		if err := cgroup.CreateCgroups("/", enabledCgroups); err != nil {
   702  			return fmt.Errorf("error creating host cgroups: %v\n", err)
   703  		}
   704  	}
   705  
   706  	if !cgroup.IsControllerMounted("systemd") {
   707  		if err := os.MkdirAll(systemdControllerPath, 0700); err != nil {
   708  			return err
   709  		}
   710  		if err := syscall.Mount("cgroup", systemdControllerPath, "cgroup", 0, "none,name=systemd"); err != nil {
   711  			return fmt.Errorf("error mounting name=systemd hierarchy on %q: %v", systemdControllerPath, err)
   712  		}
   713  	}
   714  
   715  	return nil
   716  }
   717  
   718  // mountContainerCgroups mounts the cgroup controllers hierarchy in the container's
   719  // namespace read-only, leaving the needed knobs in the subcgroup for each-app
   720  // read-write so systemd inside stage1 can apply isolators to them
   721  func mountContainerCgroups(s1Root string, enabledCgroups map[int][]string, subcgroup string, serviceNames []string) error {
   722  	if err := cgroup.CreateCgroups(s1Root, enabledCgroups); err != nil {
   723  		return fmt.Errorf("error creating container cgroups: %v\n", err)
   724  	}
   725  	if err := cgroup.RemountCgroupsRO(s1Root, enabledCgroups, subcgroup, serviceNames); err != nil {
   726  		return fmt.Errorf("error restricting container cgroups: %v\n", err)
   727  	}
   728  
   729  	return nil
   730  }
   731  
   732  func getContainerSubCgroup(machineID string) (string, error) {
   733  	var subcgroup string
   734  	fromUnit, err := isRunningFromUnitFile()
   735  	if err != nil {
   736  		return "", fmt.Errorf("could not determine if we're running from a unit file: %v", err)
   737  	}
   738  	if fromUnit {
   739  		slice, err := getSlice()
   740  		if err != nil {
   741  			return "", fmt.Errorf("could not get slice name: %v", err)
   742  		}
   743  		slicePath, err := common.SliceToPath(slice)
   744  		if err != nil {
   745  			return "", fmt.Errorf("could not convert slice name to path: %v", err)
   746  		}
   747  		unit, err := getUnitFileName()
   748  		if err != nil {
   749  			return "", fmt.Errorf("could not get unit name: %v", err)
   750  		}
   751  		subcgroup = filepath.Join(slicePath, unit, "system.slice")
   752  	} else {
   753  		if machinedRegister() {
   754  			// we are not in the final cgroup yet: systemd-nspawn will move us
   755  			// to the correct cgroup later during registration so we can't
   756  			// look it up in /proc/self/cgroup
   757  			escapedmID := strings.Replace(machineID, "-", "\\x2d", -1)
   758  			machineDir := "machine-" + escapedmID + ".scope"
   759  			subcgroup = filepath.Join("machine.slice", machineDir, "system.slice")
   760  		} else {
   761  			// when registration is disabled the container will be directly
   762  			// under rkt's cgroup so we can look it up in /proc/self/cgroup
   763  			ownCgroupPath, err := cgroup.GetOwnCgroupPath("name=systemd")
   764  			if err != nil {
   765  				return "", fmt.Errorf("could not get own cgroup path: %v", err)
   766  			}
   767  			// systemd-nspawn won't work unless we're in a subcgroup. If we're
   768  			// in the root cgroup, we create a "rkt" subcgroup and we add
   769  			// ourselves to it
   770  			if ownCgroupPath == "/" {
   771  				ownCgroupPath = "/rkt"
   772  				if err := cgroup.JoinSubcgroup("systemd", ownCgroupPath); err != nil {
   773  					return "", fmt.Errorf("error joining %s subcgroup: %v", ownCgroupPath, err)
   774  				}
   775  			}
   776  			subcgroup = filepath.Join(ownCgroupPath, "system.slice")
   777  		}
   778  	}
   779  
   780  	return subcgroup, nil
   781  }
   782  
   783  func getUnitFileName() (unit string, err error) {
   784  	libname := C.CString("libsystemd.so")
   785  	defer C.free(unsafe.Pointer(libname))
   786  	handle := C.dlopen(libname, C.RTLD_LAZY)
   787  	if handle == nil {
   788  		err = fmt.Errorf("error opening libsystemd.so")
   789  		return
   790  	}
   791  	defer func() {
   792  		if r := C.dlclose(handle); r != 0 {
   793  			err = fmt.Errorf("error closing libsystemd.so")
   794  		}
   795  	}()
   796  
   797  	sym := C.CString("sd_pid_get_unit")
   798  	defer C.free(unsafe.Pointer(sym))
   799  	sd_pid_get_unit := C.dlsym(handle, sym)
   800  	if sd_pid_get_unit == nil {
   801  		err = fmt.Errorf("error resolving sd_pid_get_unit function")
   802  		return
   803  	}
   804  
   805  	var s string
   806  	u := C.CString(s)
   807  	defer C.free(unsafe.Pointer(u))
   808  
   809  	ret := C.my_sd_pid_get_unit(sd_pid_get_unit, 0, &u)
   810  	if ret < 0 {
   811  		err = fmt.Errorf("error calling sd_pid_get_unit: %v", syscall.Errno(-ret))
   812  		return
   813  	}
   814  
   815  	unit = C.GoString(u)
   816  	return
   817  }
   818  
   819  func getSlice() (slice string, err error) {
   820  	libname := C.CString("libsystemd.so")
   821  	defer C.free(unsafe.Pointer(libname))
   822  	handle := C.dlopen(libname, C.RTLD_LAZY)
   823  	if handle == nil {
   824  		err = fmt.Errorf("error opening libsystemd.so")
   825  		return
   826  	}
   827  	defer func() {
   828  		if r := C.dlclose(handle); r != 0 {
   829  			err = fmt.Errorf("error closing libsystemd.so")
   830  		}
   831  	}()
   832  
   833  	sym := C.CString("sd_pid_get_slice")
   834  	defer C.free(unsafe.Pointer(sym))
   835  	sd_pid_get_slice := C.dlsym(handle, sym)
   836  	if sd_pid_get_slice == nil {
   837  		err = fmt.Errorf("error resolving sd_pid_get_slice function")
   838  		return
   839  	}
   840  
   841  	var s string
   842  	sl := C.CString(s)
   843  	defer C.free(unsafe.Pointer(sl))
   844  
   845  	ret := C.my_sd_pid_get_slice(sd_pid_get_slice, 0, &sl)
   846  	if ret < 0 {
   847  		err = fmt.Errorf("error calling sd_pid_get_slice: %v", syscall.Errno(-ret))
   848  		return
   849  	}
   850  
   851  	slice = C.GoString(sl)
   852  	return
   853  }
   854  
   855  func isRunningFromUnitFile() (ret bool, err error) {
   856  	libname := C.CString("libsystemd.so")
   857  	defer C.free(unsafe.Pointer(libname))
   858  	handle := C.dlopen(libname, C.RTLD_LAZY)
   859  	if handle == nil {
   860  		// we can't open libsystemd.so so we assume systemd is not
   861  		// installed and we're not running from a unit file
   862  		return
   863  	}
   864  	defer func() {
   865  		if r := C.dlclose(handle); r != 0 {
   866  			err = fmt.Errorf("error closing libsystemd.so")
   867  		}
   868  	}()
   869  
   870  	sd_pid_get_owner_uid := C.dlsym(handle, C.CString("sd_pid_get_owner_uid"))
   871  	if sd_pid_get_owner_uid == nil {
   872  		err = fmt.Errorf("error resolving sd_pid_get_owner_uid function")
   873  		return
   874  	}
   875  
   876  	var uid C.uid_t
   877  	errno := C.my_sd_pid_get_owner_uid(sd_pid_get_owner_uid, 0, &uid)
   878  	// when we're running from a unit file, sd_pid_get_owner_uid returns
   879  	// ENOENT (systemd <220) or ENXIO (systemd >=220)
   880  	switch {
   881  	case errno >= 0:
   882  	case syscall.Errno(-errno) == syscall.ENOENT || syscall.Errno(-errno) == syscall.ENXIO:
   883  		if C.am_session_leader() == 1 {
   884  			ret = true
   885  		}
   886  	default:
   887  		err = fmt.Errorf("error calling sd_pid_get_owner_uid: %v", syscall.Errno(-errno))
   888  	}
   889  	return
   890  }
   891  
   892  func main() {
   893  	flag.Parse()
   894  
   895  	if !debug {
   896  		log.SetOutput(ioutil.Discard)
   897  	}
   898  
   899  	// move code into stage1() helper so defered fns get run
   900  	os.Exit(stage1())
   901  }