github.com/coreos/rocket@v1.30.1-0.20200224141603-171c416fac02/stage0/run.go (about)

     1  // Copyright 2014 The rkt Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  //+build linux
    16  
    17  package stage0
    18  
    19  //
    20  // rkt is a reference implementation of the app container specification.
    21  //
    22  // Execution on rkt is divided into a number of stages, and the `rkt`
    23  // binary implements the first stage (stage0)
    24  //
    25  
import (
	"encoding/json"
	"errors"
	"fmt"
	"io/ioutil"
	"os"
	"path"
	"path/filepath"
	"runtime"
	"sort"
	"strconv"
	"strings"
	"syscall"
	"time"

	"github.com/appc/spec/schema"
	"github.com/appc/spec/schema/types"
	cnitypes "github.com/containernetworking/cni/pkg/types"
	"github.com/hashicorp/errwrap"
	"github.com/opencontainers/selinux/go-selinux/label"
	"github.com/rkt/rkt/common"
	"github.com/rkt/rkt/common/apps"
	commonnet "github.com/rkt/rkt/common/networking"
	"github.com/rkt/rkt/common/overlay"
	"github.com/rkt/rkt/pkg/aci"
	"github.com/rkt/rkt/pkg/fileutil"
	"github.com/rkt/rkt/pkg/sys"
	"github.com/rkt/rkt/pkg/tpm"
	"github.com/rkt/rkt/pkg/user"
	"github.com/rkt/rkt/store/imagestore"
	"github.com/rkt/rkt/store/treestore"
	"github.com/rkt/rkt/version"
)
    58  
// debugEnabled gates the output of debug; it is switched on by InitDebug.
var debugEnabled bool
    60  
// PrepareConfig defines the configuration parameters required by Prepare.
// It embeds *CommonConfig, so the shared fields are promoted onto it.
type PrepareConfig struct {
	*CommonConfig
	Apps            *apps.Apps            // apps to prepare
	InheritEnv      bool                  // inherit parent environment into apps (does not override the image environment)
	ExplicitEnv     []string              // always set these environment variables for all the apps (key=value form)
	EnvFromFile     []string              // environment variables loaded from files, set for all the apps (key=value form)
	Ports           []types.ExposedPort   // list of ports that rkt will expose on the host
	UseOverlay      bool                  // prepare pod with overlay fs
	PodManifest     string                // use the pod manifest specified by the user, this will ignore flags such as '--volume', '--port', etc.
	PrivateUsers    *user.UidRange        // UID range for user namespaces; a marker file is written by Prepare when its Shift is > 0
	UserAnnotations types.UserAnnotations // user annotations for the pod.
	UserLabels      types.UserLabels      // user labels for the pod.
}
    75  
// RunConfig defines the configuration parameters needed by Run.
// It embeds *CommonConfig, so the shared fields are promoted onto it.
type RunConfig struct {
	*CommonConfig
	Net                  common.NetList // pod should have its own network stack
	LockFd               int            // lock file descriptor
	Interactive          bool           // whether the pod is interactive or not
	MDSRegister          bool           // whether to register with metadata service or not
	Apps                 schema.AppList // applications (prepare gets them via Apps)
	LocalConfig          string         // Path to local configuration
	Hostname             string         // hostname of the pod
	RktGid               int            // group id of the 'rkt' group, -1 if there's no rkt group.
	DNSConfMode          DNSConfMode    // dns configuration file mode - for stage1
	DNSConfig            cnitypes.DNS   // the DNS configuration (nameservers, search, options)
	InsecureCapabilities bool           // Do not restrict capabilities
	InsecurePaths        bool           // Do not restrict access to files in sysfs or procfs
	InsecureSeccomp      bool           // Do not add seccomp restrictions
	UseOverlay           bool           // run pod with overlay fs
	HostsEntries         HostsEntries   // The entries in /etc/hosts
	IPCMode              string         // whether to stay in the host IPC namespace
}
    96  
// CommonConfig defines the configuration shared by both Run and Prepare.
type CommonConfig struct {
	DataDir      string                        // The path to the data directory, e.g. /var/lib/rkt/pods
	Store        *imagestore.Store             // store containing all of the configured application images
	TreeStore    *treestore.Store              // tree store used to render image trees (used when preparing with overlay)
	Stage1Image  types.Hash                    // stage1 image containing usable /init and /enter entrypoints
	UUID         *types.UUID                   // UUID of the pod
	RootHash     string                        // hash of the root filesystem
	ManifestData string                        // the pod manifest data
	Debug        bool                          // debug mode
	MountLabel   string                        // SELinux label to use for fs
	ProcessLabel string                        // SELinux label to use
	Mutable      bool                          // whether this pod is mutable
	Annotations  map[types.ACIdentifier]string // pod-level annotations, for internal/experimental usage
}
   112  
// HostsEntries encapsulates the entries in an etc-hosts file: a mapping from
// an IP address to an arbitrary list of hostnames.
type HostsEntries map[string][]string
   116  
// DNSConfMode indicates what the stage1 should do with dns config files.
// Each field holds one of the following values:
//
//	'host':    bind-mount from host
//	'stage0':  the stage0 has generated it
//	'none':    do not generate it
//	'default': do whatever was the default
type DNSConfMode struct {
	Resolv string // mode for /etc/rkt-resolv.conf
	Hosts  string // mode for /etc/rkt-hosts
}
   127  
// init pins the initial goroutine to its current OS thread.
func init() {
	// This ensures that main runs only on the main thread (thread group leader).
	// Since namespace ops (unshare, setns) are done for a single thread, we
	// must ensure that the goroutine does not jump from OS thread to thread.
	runtime.LockOSThread()
}
   134  
// InitDebug enables debugging: it sets the package-level flag consulted by
// debug and calls SetDebug(true) on the package logger.
func InitDebug() {
	debugEnabled = true
	log.SetDebug(true)
}
   140  
   141  func debug(format string, i ...interface{}) {
   142  	if debugEnabled {
   143  		log.Printf(format, i...)
   144  	}
   145  }
   146  
   147  // mergeEnvs merges environment variables from env into the current appEnv
   148  // if override is set to true, then variables with the same name will be set to the value in env
   149  // env is expected to be in the os.Environ() key=value format
   150  func mergeEnvs(appEnv *types.Environment, env []string, override bool) {
   151  	for _, ev := range env {
   152  		pair := strings.SplitN(ev, "=", 2)
   153  		if _, exists := appEnv.Get(pair[0]); override || !exists {
   154  			appEnv.Set(pair[0], pair[1])
   155  		}
   156  	}
   157  }
   158  
   159  // deduplicateMPs removes Mounts with duplicated paths. If there's more than
   160  // one Mount with the same path, it keeps the first one encountered.
   161  func deduplicateMPs(mounts []schema.Mount) []schema.Mount {
   162  	var res []schema.Mount
   163  	seen := make(map[string]struct{})
   164  	for _, m := range mounts {
   165  		cleanPath := path.Clean(m.Path)
   166  		if _, ok := seen[cleanPath]; !ok {
   167  			res = append(res, m)
   168  			seen[cleanPath] = struct{}{}
   169  		}
   170  	}
   171  	return res
   172  }
   173  
   174  // MergeMounts combines the global and per-app mount slices
   175  func MergeMounts(mounts []schema.Mount, appMounts []schema.Mount) []schema.Mount {
   176  	ml := append(appMounts, mounts...)
   177  	return deduplicateMPs(ml)
   178  }
   179  
   180  // generatePodManifest creates the pod manifest from the command line input.
   181  // It returns the pod manifest as []byte on success.
   182  // This is invoked if no pod manifest is specified at the command line.
   183  func generatePodManifest(cfg PrepareConfig, dir string) ([]byte, error) {
   184  	pm := schema.PodManifest{
   185  		ACKind: "PodManifest",
   186  		Apps:   make(schema.AppList, 0),
   187  	}
   188  
   189  	v, err := types.NewSemVer(version.Version)
   190  	if err != nil {
   191  		return nil, errwrap.Wrap(errors.New("error creating version"), err)
   192  	}
   193  	pm.ACVersion = *v
   194  
   195  	if err := cfg.Apps.Walk(func(app *apps.App) error {
   196  		img := app.ImageID
   197  
   198  		am, err := cfg.Store.GetImageManifest(img.String())
   199  		if err != nil {
   200  			return errwrap.Wrap(errors.New("error getting the manifest"), err)
   201  		}
   202  
   203  		if app.Name == "" {
   204  			appName, err := common.ImageNameToAppName(am.Name)
   205  			if err != nil {
   206  				return errwrap.Wrap(errors.New("error converting image name to app name"), err)
   207  			}
   208  			app.Name = appName.String()
   209  		}
   210  
   211  		appName, err := types.NewACName(app.Name)
   212  		if err != nil {
   213  			return errwrap.Wrap(errors.New("invalid app name format"), err)
   214  		}
   215  
   216  		if _, err := prepareAppImage(cfg, *appName, img, dir, cfg.UseOverlay); err != nil {
   217  			return errwrap.Wrap(fmt.Errorf("error preparing image %s", img), err)
   218  		}
   219  		if pm.Apps.Get(*appName) != nil {
   220  			return fmt.Errorf("error: multiple apps with name %s", app.Name)
   221  		}
   222  		if am.App == nil && app.Exec == "" {
   223  			return fmt.Errorf("error: image %s has no app section and --exec argument is not provided", img)
   224  		}
   225  
   226  		ra, err := generateRuntimeApp(app, am, cfg.Apps.Mounts)
   227  		if err != nil {
   228  			return err
   229  		}
   230  
   231  		// loading the environment from the lowest priority to highest
   232  		if cfg.InheritEnv {
   233  			// Inherit environment does not override app image environment
   234  			mergeEnvs(&ra.App.Environment, os.Environ(), false)
   235  		}
   236  
   237  		mergeEnvs(&ra.App.Environment, cfg.EnvFromFile, true)
   238  		mergeEnvs(&ra.App.Environment, cfg.ExplicitEnv, true)
   239  
   240  		pm.Apps = append(pm.Apps, ra)
   241  
   242  		return nil
   243  	}); err != nil {
   244  		return nil, err
   245  	}
   246  
   247  	// TODO(jonboulle): check that app mountpoint expectations are
   248  	// satisfied here, rather than waiting for stage1
   249  	pm.Volumes = cfg.Apps.Volumes
   250  
   251  	// Check to see if ports have any errors
   252  	pm.Ports = cfg.Ports
   253  	if _, err := commonnet.ForwardedPorts(&pm); err != nil {
   254  		return nil, err
   255  	}
   256  
   257  	pm.Annotations = append(pm.Annotations, types.Annotation{
   258  		Name:  "coreos.com/rkt/stage1/mutable",
   259  		Value: strconv.FormatBool(cfg.Mutable),
   260  	})
   261  
   262  	pm.UserAnnotations = cfg.UserAnnotations
   263  	pm.UserLabels = cfg.UserLabels
   264  
   265  	// Add internal annotations for rkt experiments
   266  	for k, v := range cfg.Annotations {
   267  		if _, ok := pm.Annotations.Get(k.String()); ok {
   268  			continue
   269  		}
   270  		pm.Annotations.Set(k, v)
   271  	}
   272  
   273  	pmb, err := json.Marshal(pm)
   274  	if err != nil {
   275  		return nil, errwrap.Wrap(errors.New("error marshalling pod manifest"), err)
   276  	}
   277  	return pmb, nil
   278  }
   279  
   280  // prepareIsolators merges the CLI app parameters with the manifest's app
   281  func prepareIsolators(setup *apps.App, app *types.App) error {
   282  	if memoryOverride := setup.MemoryLimit; memoryOverride != nil {
   283  		isolator := memoryOverride.AsIsolator()
   284  		app.Isolators = append(app.Isolators, isolator)
   285  	}
   286  
   287  	if cpuOverride := setup.CPULimit; cpuOverride != nil {
   288  		isolator := cpuOverride.AsIsolator()
   289  		app.Isolators = append(app.Isolators, isolator)
   290  	}
   291  
   292  	if cpuSharesOverride := setup.CPUShares; cpuSharesOverride != nil {
   293  		isolator := cpuSharesOverride.AsIsolator()
   294  		app.Isolators.ReplaceIsolatorsByName(isolator, []types.ACIdentifier{types.LinuxCPUSharesName})
   295  	}
   296  
   297  	if oomAdjOverride := setup.OOMScoreAdj; oomAdjOverride != nil {
   298  		app.Isolators.ReplaceIsolatorsByName(oomAdjOverride.AsIsolator(), []types.ACIdentifier{types.LinuxOOMScoreAdjName})
   299  	}
   300  
   301  	if setup.CapsRetain != nil && setup.CapsRemove != nil {
   302  		return fmt.Errorf("error: cannot use both --caps-retain and --caps-remove on the same image")
   303  	}
   304  
   305  	// Delete existing caps isolators if the user wants to override
   306  	// them with either --caps-retain or --caps-remove
   307  	if setup.CapsRetain != nil || setup.CapsRemove != nil {
   308  		for i := len(app.Isolators) - 1; i >= 0; i-- {
   309  			isolator := app.Isolators[i]
   310  			if _, ok := isolator.Value().(types.LinuxCapabilitiesSet); ok {
   311  				app.Isolators = append(app.Isolators[:i],
   312  					app.Isolators[i+1:]...)
   313  			}
   314  		}
   315  	}
   316  
   317  	if capsRetain := setup.CapsRetain; capsRetain != nil {
   318  		isolator, err := capsRetain.AsIsolator()
   319  		if err != nil {
   320  			return err
   321  		}
   322  		app.Isolators = append(app.Isolators, *isolator)
   323  	} else if capsRemove := setup.CapsRemove; capsRemove != nil {
   324  		isolator, err := capsRemove.AsIsolator()
   325  		if err != nil {
   326  			return err
   327  		}
   328  		app.Isolators = append(app.Isolators, *isolator)
   329  	}
   330  
   331  	// Override seccomp isolators via --seccomp CLI switch
   332  	if setup.SeccompFilter != "" {
   333  		var is *types.Isolator
   334  		mode, errno, set, err := setup.SeccompOverride()
   335  		if err != nil {
   336  			return err
   337  		}
   338  		switch mode {
   339  		case "retain":
   340  			lss, err := types.NewLinuxSeccompRetainSet(errno, set...)
   341  			if err != nil {
   342  				return err
   343  			}
   344  			if is, err = lss.AsIsolator(); err != nil {
   345  				return err
   346  			}
   347  		case "remove":
   348  			lss, err := types.NewLinuxSeccompRemoveSet(errno, set...)
   349  			if err != nil {
   350  				return err
   351  			}
   352  			if is, err = lss.AsIsolator(); err != nil {
   353  				return err
   354  			}
   355  		default:
   356  			return apps.ErrInvalidSeccompMode
   357  		}
   358  		app.Isolators.ReplaceIsolatorsByName(*is, []types.ACIdentifier{types.LinuxSeccompRemoveSetName, types.LinuxSeccompRetainSetName})
   359  	}
   360  	return nil
   361  }
   362  
   363  // validatePodManifest reads the user-specified pod manifest, prepares the app images
   364  // and validates the pod manifest. If the pod manifest passes validation, it returns
   365  // the manifest as []byte.
   366  // TODO(yifan): More validation in the future.
   367  func validatePodManifest(cfg PrepareConfig, dir string) ([]byte, error) {
   368  	pmb, err := ioutil.ReadFile(cfg.PodManifest)
   369  	if err != nil {
   370  		return nil, errwrap.Wrap(errors.New("error reading pod manifest"), err)
   371  	}
   372  	var pm schema.PodManifest
   373  	if err := json.Unmarshal(pmb, &pm); err != nil {
   374  		return nil, errwrap.Wrap(errors.New("error unmarshaling pod manifest"), err)
   375  	}
   376  
   377  	appNames := make(map[types.ACName]struct{})
   378  	for _, ra := range pm.Apps {
   379  		img := ra.Image
   380  
   381  		if img.ID.Empty() {
   382  			return nil, fmt.Errorf("no image ID for app %q", ra.Name)
   383  		}
   384  		am, err := cfg.Store.GetImageManifest(img.ID.String())
   385  		if err != nil {
   386  			return nil, errwrap.Wrap(errors.New("error getting the image manifest from store"), err)
   387  		}
   388  		if _, err := prepareAppImage(cfg, ra.Name, img.ID, dir, cfg.UseOverlay); err != nil {
   389  			return nil, errwrap.Wrap(fmt.Errorf("error preparing image %s", img), err)
   390  		}
   391  		if _, ok := appNames[ra.Name]; ok {
   392  			return nil, fmt.Errorf("multiple apps with same name %s", ra.Name)
   393  		}
   394  		appNames[ra.Name] = struct{}{}
   395  		if ra.App == nil && am.App == nil {
   396  			return nil, fmt.Errorf("no app section in the pod manifest or the image manifest")
   397  		}
   398  	}
   399  
   400  	// Validate forwarded ports
   401  	if _, err := commonnet.ForwardedPorts(&pm); err != nil {
   402  		return nil, err
   403  	}
   404  	return pmb, nil
   405  }
   406  
   407  // Prepare sets up a pod based on the given config.
   408  func Prepare(cfg PrepareConfig, dir string, uuid *types.UUID) error {
   409  	if err := os.MkdirAll(common.AppsInfoPath(dir), common.DefaultRegularDirPerm); err != nil {
   410  		return errwrap.Wrap(errors.New("error creating apps info directory"), err)
   411  	}
   412  	debug("Preparing stage1")
   413  	if err := prepareStage1Image(cfg, dir); err != nil {
   414  		return errwrap.Wrap(errors.New("error preparing stage1"), err)
   415  	}
   416  
   417  	var pmb []byte
   418  	var err error
   419  	if len(cfg.PodManifest) > 0 {
   420  		pmb, err = validatePodManifest(cfg, dir)
   421  	} else {
   422  		pmb, err = generatePodManifest(cfg, dir)
   423  	}
   424  	if err != nil {
   425  		return err
   426  	}
   427  
   428  	cfg.CommonConfig.ManifestData = string(pmb)
   429  
   430  	// create pod lock file for app add/rm operations.
   431  	f, err := os.OpenFile(common.PodManifestLockPath(dir), os.O_CREATE|os.O_RDWR, 0600)
   432  	if err != nil {
   433  		return err
   434  	}
   435  	f.Close()
   436  
   437  	debug("Writing pod manifest")
   438  	fn := common.PodManifestPath(dir)
   439  	if err := ioutil.WriteFile(fn, pmb, common.DefaultRegularFilePerm); err != nil {
   440  		return errwrap.Wrap(errors.New("error writing pod manifest"), err)
   441  	}
   442  
   443  	f, err = os.OpenFile(common.PodCreatedPath(dir), os.O_CREATE|os.O_RDWR, common.DefaultRegularFilePerm)
   444  	if err != nil {
   445  		return err
   446  	}
   447  	f.Close()
   448  
   449  	if cfg.UseOverlay {
   450  		// mark the pod as prepared with overlay
   451  		f, err := os.Create(filepath.Join(dir, common.OverlayPreparedFilename))
   452  		if err != nil {
   453  			return errwrap.Wrap(errors.New("error writing overlay marker file"), err)
   454  		}
   455  		defer f.Close()
   456  	}
   457  
   458  	if cfg.PrivateUsers.Shift > 0 {
   459  		// mark the pod as prepared for user namespaces
   460  		uidrangeBytes := cfg.PrivateUsers.Serialize()
   461  
   462  		if err := ioutil.WriteFile(filepath.Join(dir, common.PrivateUsersPreparedFilename), uidrangeBytes, common.DefaultRegularFilePerm); err != nil {
   463  			return errwrap.Wrap(errors.New("error writing userns marker file"), err)
   464  		}
   465  	}
   466  
   467  	return nil
   468  }
   469  
   470  func preparedWithPrivateUsers(dir string) (string, error) {
   471  	bytes, err := ioutil.ReadFile(filepath.Join(dir, common.PrivateUsersPreparedFilename))
   472  	if os.IsNotExist(err) {
   473  		return "", nil
   474  	}
   475  	if err != nil {
   476  		return "", err
   477  	}
   478  
   479  	return string(bytes), nil
   480  }
   481  
// writeDnsConfig writes the DNS-related configuration files into rootfs by
// calling writeResolvConf followed by writeEtcHosts; each of them decides on
// its own, based on cfg.DNSConfMode, whether a file has to be generated.
func writeDnsConfig(cfg *RunConfig, rootfs string) {
	writeResolvConf(cfg, rootfs)
	writeEtcHosts(cfg, rootfs)
}
   486  
   487  // writeResolvConf will generate <stage1>/etc/rkt-resolv.conf if needed
   488  func writeResolvConf(cfg *RunConfig, rootfs string) {
   489  	if cfg.DNSConfMode.Resolv != "stage0" {
   490  		return
   491  	}
   492  
   493  	if err := os.Mkdir(filepath.Join(rootfs, "etc"), common.DefaultRegularDirPerm); err != nil {
   494  		if !os.IsExist(err) {
   495  			log.Fatalf("error creating dir %q: %v\n", "/etc", err)
   496  		}
   497  	}
   498  	resolvPath := filepath.Join(rootfs, "etc/rkt-resolv.conf")
   499  	f, err := os.Create(resolvPath)
   500  	if err != nil {
   501  		log.Fatalf("error writing etc/rkt-resolv.conf: %v\n", err)
   502  	}
   503  	defer f.Close()
   504  
   505  	_, err = f.WriteString(common.MakeResolvConf(cfg.DNSConfig, "Generated by rkt run"))
   506  	if err != nil {
   507  		log.Fatalf("error writing etc/rkt-resolv.conf: %v\n", err)
   508  	}
   509  }
   510  
   511  // writeEtcHosts writes the file /etc/rkt-hosts into the stage1 rootfs.
   512  // This will read defaults from <rootfs>/etc/hosts-fallback if it exists.
   513  // Therefore, this should be called after the stage1 is mounted
   514  func writeEtcHosts(cfg *RunConfig, rootfs string) {
   515  	if cfg.DNSConfMode.Hosts != "stage0" {
   516  		return
   517  	}
   518  
   519  	// Read <stage1>/rootfs/etc/hosts-fallback to get some sane defaults
   520  	hostsTextb, err := ioutil.ReadFile(filepath.Join(rootfs, "etc/hosts-fallback"))
   521  	if err != nil {
   522  		// fallback-fallback :-)
   523  		hostsTextb = []byte("#created by rkt stage0\n127.0.0.1 localhost localhost.localdomain\n")
   524  	}
   525  	hostsText := string(hostsTextb)
   526  
   527  	hostsText += "\n\n# Added by rkt run --hosts-entry\n"
   528  
   529  	for ip, hostnames := range cfg.HostsEntries {
   530  		hostsText = fmt.Sprintf("%s%s %s\n", hostsText, ip, strings.Join(hostnames, " "))
   531  	}
   532  
   533  	// Create /etc if it does not exist
   534  	etcPath := filepath.Join(rootfs, "etc")
   535  	if _, err := os.Stat(etcPath); err != nil && os.IsNotExist(err) {
   536  		err = os.Mkdir(etcPath, 0755)
   537  		if err != nil {
   538  			log.FatalE("failed to make stage1 etc directory", err)
   539  		}
   540  	} else if err != nil {
   541  		log.FatalE("Failed to stat stage1 etc", err)
   542  	}
   543  
   544  	hostsPath := filepath.Join(etcPath, "rkt-hosts")
   545  	err = ioutil.WriteFile(hostsPath, []byte(hostsText), 0644)
   546  	if err != nil {
   547  		log.FatalE("failed to write etc/rkt-hosts", err)
   548  	}
   549  }
   550  
   551  // Run mounts the right overlay filesystems and actually runs the prepared
   552  // pod by exec()ing the stage1 init inside the pod filesystem.
   553  func Run(cfg RunConfig, dir string, dataDir string) {
   554  	privateUsers, err := preparedWithPrivateUsers(dir)
   555  	if err != nil {
   556  		log.FatalE("error preparing private users", err)
   557  	}
   558  
   559  	debug("Setting up stage1")
   560  	if err := setupStage1Image(cfg, dir, cfg.UseOverlay); err != nil {
   561  		log.FatalE("error setting up stage1", err)
   562  	}
   563  	debug("Wrote filesystem to %s\n", dir)
   564  
   565  	for _, app := range cfg.Apps {
   566  		if err := setupAppImage(cfg, app.Name, app.Image.ID, dir, cfg.UseOverlay); err != nil {
   567  			log.FatalE("error setting up app image", err)
   568  		}
   569  	}
   570  
   571  	destRootfs := common.Stage1RootfsPath(dir)
   572  
   573  	writeDnsConfig(&cfg, destRootfs)
   574  
   575  	if err := os.Setenv(common.EnvLockFd, fmt.Sprintf("%v", cfg.LockFd)); err != nil {
   576  		log.FatalE("setting lock fd environment", err)
   577  	}
   578  
   579  	if err := os.Setenv(common.EnvSELinuxContext, fmt.Sprintf("%v", cfg.ProcessLabel)); err != nil {
   580  		log.FatalE("setting SELinux context environment", err)
   581  	}
   582  
   583  	if err := os.Setenv(common.EnvSELinuxMountContext, fmt.Sprintf("%v", cfg.MountLabel)); err != nil {
   584  		log.FatalE("setting SELinux mount context environment", err)
   585  	}
   586  
   587  	debug("Pivoting to filesystem %s", dir)
   588  	if err := os.Chdir(dir); err != nil {
   589  		log.FatalE("failed changing to dir", err)
   590  	}
   591  
   592  	ep, err := getStage1Entrypoint(dir, runEntrypoint)
   593  	if err != nil {
   594  		log.FatalE("error determining 'run' entrypoint", err)
   595  	}
   596  	args := []string{filepath.Join(destRootfs, ep)}
   597  
   598  	if cfg.Debug {
   599  		args = append(args, "--debug")
   600  	}
   601  
   602  	args = append(args, "--net="+cfg.Net.String())
   603  
   604  	if cfg.Interactive {
   605  		args = append(args, "--interactive")
   606  	}
   607  	if len(privateUsers) > 0 {
   608  		args = append(args, "--private-users="+privateUsers)
   609  	}
   610  	if cfg.MDSRegister {
   611  		mdsToken, err := registerPod(".", cfg.UUID, cfg.Apps)
   612  		if err != nil {
   613  			log.FatalE("failed to register the pod", err)
   614  		}
   615  
   616  		args = append(args, "--mds-token="+mdsToken)
   617  	}
   618  
   619  	if cfg.LocalConfig != "" {
   620  		args = append(args, "--local-config="+cfg.LocalConfig)
   621  	}
   622  
   623  	s1v, err := getStage1InterfaceVersion(dir)
   624  	if err != nil {
   625  		log.FatalE("error determining stage1 interface version", err)
   626  	}
   627  
   628  	if cfg.Hostname != "" {
   629  		if interfaceVersionSupportsHostname(s1v) {
   630  			args = append(args, "--hostname="+cfg.Hostname)
   631  		} else {
   632  			log.Printf("warning: --hostname option is not supported by stage1")
   633  		}
   634  	}
   635  
   636  	if cfg.DNSConfMode.Hosts != "default" || cfg.DNSConfMode.Resolv != "default" {
   637  		if interfaceVersionSupportsDNSConfMode(s1v) {
   638  			args = append(args, fmt.Sprintf("--dns-conf-mode=resolv=%s,hosts=%s", cfg.DNSConfMode.Resolv, cfg.DNSConfMode.Hosts))
   639  		} else {
   640  			log.Printf("warning: --dns-conf-mode option not supported by stage1")
   641  		}
   642  	}
   643  
   644  	if interfaceVersionSupportsInsecureOptions(s1v) {
   645  		if cfg.InsecureCapabilities {
   646  			args = append(args, "--disable-capabilities-restriction")
   647  		}
   648  		if cfg.InsecurePaths {
   649  			args = append(args, "--disable-paths")
   650  		}
   651  		if cfg.InsecureSeccomp {
   652  			args = append(args, "--disable-seccomp")
   653  		}
   654  	}
   655  
   656  	if cfg.Mutable {
   657  		mutable, err := supportsMutableEnvironment(dir)
   658  
   659  		switch {
   660  		case err != nil:
   661  			log.FatalE("error determining stage1 mutable support", err)
   662  		case !mutable:
   663  			log.Fatalln("stage1 does not support mutable pods")
   664  		}
   665  
   666  		args = append(args, "--mutable")
   667  	}
   668  
   669  	if cfg.IPCMode != "" {
   670  		if interfaceVersionSupportsIPCMode(s1v) {
   671  			args = append(args, "--ipc="+cfg.IPCMode)
   672  		} else {
   673  			log.Printf("warning: --ipc option is not supported by stage1")
   674  		}
   675  	}
   676  
   677  	args = append(args, cfg.UUID.String())
   678  
   679  	// make sure the lock fd stays open across exec
   680  	if err := sys.CloseOnExec(cfg.LockFd, false); err != nil {
   681  		log.Fatalf("error clearing FD_CLOEXEC on lock fd")
   682  	}
   683  
   684  	tpmEvent := fmt.Sprintf("rkt: Rootfs: %s Manifest: %s Stage1 args: %s", cfg.CommonConfig.RootHash, cfg.CommonConfig.ManifestData, strings.Join(args, " "))
   685  	// If there's no TPM available or there's a failure for some other
   686  	// reason, ignore it and continue anyway. Long term we'll want policy
   687  	// that enforces TPM behaviour, but we don't have any infrastructure
   688  	// around that yet.
   689  	_ = tpm.Extend(tpmEvent)
   690  
   691  	debug("Execing %s", args)
   692  	if err := syscall.Exec(args[0], args, os.Environ()); err != nil {
   693  		log.FatalE("error execing init", err)
   694  	}
   695  }
   696  
   697  // prepareAppImage renders and verifies the tree cache of the app image that
   698  // corresponds to the given app name.
   699  // When useOverlay is false, it attempts to render and expand the app image.
   700  // It returns the tree store ID if overlay is being used.
   701  func prepareAppImage(cfg PrepareConfig, appName types.ACName, img types.Hash, cdir string, useOverlay bool) (string, error) {
   702  	debug("Loading image %s", img.String())
   703  
   704  	am, err := cfg.Store.GetImageManifest(img.String())
   705  	if err != nil {
   706  		return "", errwrap.Wrap(errors.New("error getting the manifest"), err)
   707  	}
   708  
   709  	if _, hasOS := am.Labels.Get("os"); !hasOS {
   710  		return "", fmt.Errorf("missing os label in the image manifest")
   711  	}
   712  
   713  	if _, hasArch := am.Labels.Get("arch"); !hasArch {
   714  		return "", fmt.Errorf("missing arch label in the image manifest")
   715  	}
   716  
   717  	if err := types.IsValidOSArch(am.Labels.ToMap(), ValidOSArch); err != nil {
   718  		return "", err
   719  	}
   720  
   721  	appInfoDir := common.AppInfoPath(cdir, appName)
   722  	if err := os.MkdirAll(appInfoDir, common.DefaultRegularDirPerm); err != nil {
   723  		return "", errwrap.Wrap(errors.New("error creating apps info directory"), err)
   724  	}
   725  
   726  	var treeStoreID string
   727  	if useOverlay {
   728  		if cfg.PrivateUsers.Shift > 0 {
   729  			return "", fmt.Errorf("cannot use both overlay and user namespace: not implemented yet. (Try --no-overlay)")
   730  		}
   731  
   732  		treeStoreID, _, err = cfg.TreeStore.Render(img.String(), false)
   733  		if err != nil {
   734  			return "", errwrap.Wrap(errors.New("error rendering tree image"), err)
   735  		}
   736  
   737  		if err := ioutil.WriteFile(common.AppTreeStoreIDPath(cdir, appName), []byte(treeStoreID), common.DefaultRegularFilePerm); err != nil {
   738  			return "", errwrap.Wrap(errors.New("error writing app treeStoreID"), err)
   739  		}
   740  	} else {
   741  		ad := common.AppPath(cdir, appName)
   742  
   743  		err := os.MkdirAll(ad, common.DefaultRegularDirPerm)
   744  		if err != nil {
   745  			return "", errwrap.Wrap(errors.New("error creating image directory"), err)
   746  		}
   747  
   748  		shiftedUid, shiftedGid, err := cfg.PrivateUsers.ShiftRange(uint32(os.Getuid()), uint32(os.Getgid()))
   749  		if err != nil {
   750  			return "", errwrap.Wrap(errors.New("error getting uid, gid"), err)
   751  		}
   752  
   753  		if err := os.Chown(ad, int(shiftedUid), int(shiftedGid)); err != nil {
   754  			return "", errwrap.Wrap(fmt.Errorf("error shifting app %q's stage2 dir", appName), err)
   755  		}
   756  
   757  		if err := aci.RenderACIWithImageID(img, ad, cfg.Store, cfg.PrivateUsers); err != nil {
   758  			return "", errwrap.Wrap(errors.New("error rendering ACI"), err)
   759  		}
   760  	}
   761  
   762  	if err := writeManifest(*cfg.CommonConfig, img, appInfoDir); err != nil {
   763  		return "", errwrap.Wrap(errors.New("error writing manifest"), err)
   764  	}
   765  
   766  	return treeStoreID, nil
   767  }
   768  
   769  // setupAppImage mounts the overlay filesystem for the app image that
   770  // corresponds to the given hash if useOverlay is true.
   771  // It also creates an mtab file in the application's rootfs if one is not
   772  // present.
   773  func setupAppImage(cfg RunConfig, appName types.ACName, img types.Hash, cdir string, useOverlay bool) error {
   774  	ad := common.AppPath(cdir, appName)
   775  	if useOverlay {
   776  		err := os.MkdirAll(ad, common.DefaultRegularDirPerm)
   777  		if err != nil {
   778  			return errwrap.Wrap(errors.New("error creating image directory"), err)
   779  		}
   780  		treeStoreID, err := ioutil.ReadFile(common.AppTreeStoreIDPath(cdir, appName))
   781  		if err != nil {
   782  			return err
   783  		}
   784  		if err := copyAppManifest(cdir, appName, ad); err != nil {
   785  			return err
   786  		}
   787  		if err := overlayRender(cfg, string(treeStoreID), cdir, ad, appName.String()); err != nil {
   788  			return errwrap.Wrap(errors.New("error rendering overlay filesystem"), err)
   789  		}
   790  	}
   791  	return ensureMtabExists(filepath.Join(ad, "rootfs"))
   792  }
   793  
   794  // ensureMtabExists creates a symlink from /etc/mtab -> /proc/self/mounts if
   795  // nothing exists at /etc/mtab.
   796  // Various tools, such as mount from util-linux 2.25, expect the mtab file to
   797  // be populated.
   798  func ensureMtabExists(rootfs string) error {
   799  	stat, err := os.Stat(filepath.Join(rootfs, "etc"))
   800  	if os.IsNotExist(err) {
   801  		// If your image has no /etc you don't get /etc/mtab either
   802  		return nil
   803  	}
   804  	if err != nil {
   805  		return errwrap.Wrap(errors.New("error determining if /etc existed in the image"), err)
   806  	}
   807  	if !stat.IsDir() {
   808  		return nil
   809  	}
   810  	mtabPath := filepath.Join(rootfs, "etc", "mtab")
   811  	if _, err = os.Lstat(mtabPath); err == nil {
   812  		// If the image already has an mtab, don't replace it
   813  		return nil
   814  	}
   815  	if !os.IsNotExist(err) {
   816  		return errwrap.Wrap(errors.New("error determining if /etc/mtab exists in the image"), err)
   817  	}
   818  
   819  	target := "../proc/self/mounts"
   820  	err = os.Symlink(target, mtabPath)
   821  	if err != nil {
   822  		return errwrap.Wrap(errors.New("error creating mtab symlink"), err)
   823  	}
   824  	return nil
   825  }
   826  
   827  // prepareStage1Image renders and verifies tree cache of the given hash
   828  // when using overlay.
   829  // When useOverlay is false, it attempts to render and expand the stage1.
   830  func prepareStage1Image(cfg PrepareConfig, cdir string) error {
   831  	s1 := common.Stage1ImagePath(cdir)
   832  	if err := os.MkdirAll(s1, common.DefaultRegularDirPerm); err != nil {
   833  		return errwrap.Wrap(errors.New("error creating stage1 directory"), err)
   834  	}
   835  
   836  	treeStoreID, _, err := cfg.TreeStore.Render(cfg.Stage1Image.String(), false)
   837  	if err != nil {
   838  		return errwrap.Wrap(errors.New("error rendering tree image"), err)
   839  	}
   840  
   841  	if err := writeManifest(*cfg.CommonConfig, cfg.Stage1Image, s1); err != nil {
   842  		return errwrap.Wrap(errors.New("error writing manifest"), err)
   843  	}
   844  
   845  	if !cfg.UseOverlay {
   846  		destRootfs := filepath.Join(s1, "rootfs")
   847  		cachedTreePath := cfg.TreeStore.GetRootFS(treeStoreID)
   848  		if err := fileutil.CopyTree(cachedTreePath, destRootfs, cfg.PrivateUsers); err != nil {
   849  			return errwrap.Wrap(errors.New("error rendering ACI"), err)
   850  		}
   851  	}
   852  
   853  	fn := path.Join(cdir, common.Stage1TreeStoreIDFilename)
   854  	if err := ioutil.WriteFile(fn, []byte(treeStoreID), common.DefaultRegularFilePerm); err != nil {
   855  		return errwrap.Wrap(errors.New("error writing stage1 treeStoreID"), err)
   856  	}
   857  	return nil
   858  }
   859  
   860  // setupStage1Image mounts the overlay filesystem for stage1.
   861  // When useOverlay is false it is a noop
   862  func setupStage1Image(cfg RunConfig, cdir string, useOverlay bool) error {
   863  	s1 := common.Stage1ImagePath(cdir)
   864  	if useOverlay {
   865  		treeStoreID, err := ioutil.ReadFile(filepath.Join(cdir, common.Stage1TreeStoreIDFilename))
   866  		if err != nil {
   867  			return err
   868  		}
   869  
   870  		// pass an empty appName
   871  		if err := overlayRender(cfg, string(treeStoreID), cdir, s1, ""); err != nil {
   872  			return errwrap.Wrap(errors.New("error rendering overlay filesystem"), err)
   873  		}
   874  
   875  		// we will later read the status from the upper layer of the overlay fs
   876  		// force the status directory to be there by touching it
   877  		statusPath := filepath.Join(s1, "rootfs", "rkt", "status")
   878  		if err := os.Chtimes(statusPath, time.Now(), time.Now()); err != nil {
   879  			return errwrap.Wrap(errors.New("error touching status dir"), err)
   880  		}
   881  	}
   882  
   883  	return nil
   884  }
   885  
   886  // writeManifest takes an img ID and writes the corresponding manifest in dest
   887  func writeManifest(cfg CommonConfig, img types.Hash, dest string) error {
   888  	mb, err := cfg.Store.GetImageManifestJSON(img.String())
   889  	if err != nil {
   890  		return err
   891  	}
   892  
   893  	debug("Writing image manifest")
   894  	if err := ioutil.WriteFile(filepath.Join(dest, "manifest"), mb, common.DefaultRegularFilePerm); err != nil {
   895  		return errwrap.Wrap(errors.New("error writing image manifest"), err)
   896  	}
   897  
   898  	return nil
   899  }
   900  
   901  // copyAppManifest copies to saved image manifest for the given appName and
   902  // writes it in the dest directory.
   903  func copyAppManifest(cdir string, appName types.ACName, dest string) error {
   904  	appInfoDir := common.AppInfoPath(cdir, appName)
   905  	sourceFn := filepath.Join(appInfoDir, "manifest")
   906  	destFn := filepath.Join(dest, "manifest")
   907  	if err := fileutil.CopyRegularFile(sourceFn, destFn); err != nil {
   908  		return errwrap.Wrap(errors.New("error copying image manifest"), err)
   909  	}
   910  	return nil
   911  }
   912  
   913  // overlayRender renders the image that corresponds to the given hash using the
   914  // overlay filesystem. It mounts an overlay filesystem from the cached tree of
   915  // the image as rootfs.
   916  func overlayRender(cfg RunConfig, treeStoreID string, cdir string, dest string, appName string) error {
   917  	cachedTreePath := cfg.TreeStore.GetRootFS(treeStoreID)
   918  	mc, err := prepareOverlay(cachedTreePath, treeStoreID, cdir, dest, appName, cfg.MountLabel,
   919  		cfg.RktGid, common.DefaultRegularDirPerm)
   920  	if err != nil {
   921  		return errwrap.Wrap(errors.New("problem preparing overlay directories"), err)
   922  	}
   923  	if err = overlay.Mount(mc); err != nil {
   924  		return errwrap.Wrap(errors.New("problem mounting overlay filesystem"), err)
   925  	}
   926  
   927  	return nil
   928  }
   929  
   930  // prepateOverlay sets up the needed directories, files and permissions for the
   931  // overlay-rendered pods
   932  func prepareOverlay(lower, treeStoreID, cdir, dest, appName, lbl string,
   933  	gid int, fm os.FileMode) (*overlay.MountCfg, error) {
   934  	fi, err := os.Stat(lower)
   935  	if err != nil {
   936  		return nil, err
   937  	}
   938  	imgMode := fi.Mode()
   939  
   940  	dst := path.Join(dest, "rootfs")
   941  	if err := os.MkdirAll(dst, imgMode); err != nil {
   942  		return nil, err
   943  	}
   944  
   945  	overlayDir := path.Join(cdir, "overlay")
   946  	if err := os.MkdirAll(overlayDir, fm); err != nil {
   947  		return nil, err
   948  	}
   949  
   950  	// Since the parent directory (rkt/pods/$STATE/$POD_UUID) has the 'S_ISGID' bit, here
   951  	// we need to explicitly turn the bit off when creating this overlay
   952  	// directory so that it won't inherit the bit. Otherwise the files
   953  	// created by users within the pod will inherit the 'S_ISGID' bit
   954  	// as well.
   955  	if err := os.Chmod(overlayDir, fm); err != nil {
   956  		return nil, err
   957  	}
   958  
   959  	imgDir := path.Join(overlayDir, treeStoreID)
   960  	if err := os.MkdirAll(imgDir, fm); err != nil {
   961  		return nil, err
   962  	}
   963  	// Also make 'rkt/pods/$STATE/$POD_UUID/overlay/$IMAGE_ID' to be readable by 'rkt' group
   964  	// As 'rkt' status will read the 'rkt/pods/$STATE/$POD_UUID/overlay/$IMAGE_ID/upper/rkt/status/$APP'
   965  	// to get exgid
   966  	if err := os.Chown(imgDir, -1, gid); err != nil {
   967  		return nil, err
   968  	}
   969  
   970  	upper := path.Join(imgDir, "upper", appName)
   971  	if err := os.MkdirAll(upper, imgMode); err != nil {
   972  		return nil, err
   973  	}
   974  	if err := label.SetFileLabel(upper, lbl); err != nil {
   975  		return nil, err
   976  	}
   977  
   978  	work := path.Join(imgDir, "work", appName)
   979  	if err := os.MkdirAll(work, fm); err != nil {
   980  		return nil, err
   981  	}
   982  	if err := label.SetFileLabel(work, lbl); err != nil {
   983  		return nil, err
   984  	}
   985  
   986  	return &overlay.MountCfg{lower, upper, work, dst, lbl}, nil
   987  }