github.com/zhuohuang-hust/src-cbuild@v0.0.0-20230105071821-c7aab3e7c840/mergeCode/runc/libcontainer/container_linux.go (about)

     1  // +build linux
     2  
     3  package libcontainer
     4  
     5  import (
     6  	"bytes"
     7  	"encoding/json"
     8  	"fmt"
     9  	"io"
    10  	"io/ioutil"
    11  	"os"
    12  	"os/exec"
    13  	"path/filepath"
    14  	"reflect"
    15  	"strings"
    16  	"sync"
    17  	"syscall"
    18  	"time"
    19  
    20  	"github.com/Sirupsen/logrus"
    21  	"github.com/golang/protobuf/proto"
    22  	"github.com/opencontainers/runc/libcontainer/cgroups"
    23  	"github.com/opencontainers/runc/libcontainer/configs"
    24  	"github.com/opencontainers/runc/libcontainer/criurpc"
    25  	"github.com/opencontainers/runc/libcontainer/system"
    26  	"github.com/opencontainers/runc/libcontainer/utils"
    27  	"github.com/syndtr/gocapability/capability"
    28  	"github.com/vishvananda/netlink/nl"
    29  )
    30  
// stdioFdCount is the number of standard descriptors (stdin, stdout, stderr).
// commandTemplate adds it to positions within cmd.ExtraFiles to compute the
// descriptor numbers it exports through the _LIBCONTAINER_* environment.
const stdioFdCount = 3
    32  
// linuxContainer carries the per-container data used by the Linux
// implementations of the container methods defined in this file.
type linuxContainer struct {
	id                   string          // identifier returned by ID
	root                 string          // directory holding this container's files (exec fifo, criu.work, criu-root)
	config               *configs.Config // active configuration; replaced by Set
	cgroupManager        cgroups.Manager // cgroup backend used for pids, stats, freezing and resource updates
	initArgs             []string        // argv of the command spawned by commandTemplate
	initProcess          parentProcess   // handle used to signal and inspect the container's init process
	initProcessStartTime string          // copied from the state returned by updateState when init starts
	criuPath             string          // criu binary executed for version checks and checkpoint/restore
	m                    sync.Mutex      // serializes the exported methods that take it
	criuVersion          int             // major*10000 + minor*100 + patch, filled in by checkCriuVersion
	state                containerState  // current state object; handles transitions and destroy
	created              time.Time       // UTC timestamp recorded by start
}
    47  
// State represents a running container's state: the embedded BaseState plus
// the platform specific cgroup, namespace and descriptor information below.
type State struct {
	BaseState

	// Platform specific fields below here

	// CgroupPaths maps each cgroup subsystem name to the path of the cgroup
	// set up for the container in that subsystem.
	CgroupPaths map[string]string `json:"cgroup_paths"`

	// NamespacePaths maps each namespace type to the file path of the
	// container's namespace of that type.
	NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`

	// ExternalDescriptors lists the container's standard descriptors
	// (std{in,out,err}), which are needed for checkpoint and restore.
	ExternalDescriptors []string `json:"external_descriptors,omitempty"`
}
    65  
// Container is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found.
type Container interface {
	BaseContainer

	// Methods below here are platform specific

	// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
	//
	// errors:
	// Systemerror - System error.
	Checkpoint(criuOpts *CriuOpts) error

	// Restore restores the checkpointed container to a running state using the criu(8) utility.
	//
	// errors:
	// Systemerror - System error.
	Restore(process *Process, criuOpts *CriuOpts) error

	// Pause pauses the execution of any user processes in the container.
	//
	// The linuxContainer implementation in this file accepts the RUNNING and
	// CREATED states, freezes the container's cgroup and then moves the
	// container to PAUSED. For every other status, including PAUSED, it
	// returns ContainerNotRunning.
	// NOTE(review): earlier documentation described an asynchronous PAUSING
	// state and a no-op when the container is already PAUSED; confirm which
	// contract is intended.
	//
	// errors:
	// ContainerNotExists - Container no longer exists,
	// ContainerNotRunning - Container not running or created,
	// Systemerror - System error.
	Pause() error

	// Resume resumes the execution of any user processes in a PAUSED
	// container and then sets the container state to RUNNING.
	//
	// The linuxContainer implementation in this file returns
	// ContainerNotPaused for every status other than PAUSED, including
	// RUNNING.
	// NOTE(review): earlier documentation described a no-op when the
	// container is already RUNNING; confirm which contract is intended.
	//
	// errors:
	// ContainerNotExists - Container no longer exists,
	// ContainerNotPaused - Container is not paused,
	// Systemerror - System error.
	Resume() error

	// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
	//
	// errors:
	// Systemerror - System error.
	NotifyOOM() (<-chan struct{}, error)

	// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level.
	//
	// errors:
	// Systemerror - System error.
	NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}
   121  
   122  // ID returns the container's unique ID
   123  func (c *linuxContainer) ID() string {
   124  	return c.id
   125  }
   126  
   127  // Config returns the container's configuration
   128  func (c *linuxContainer) Config() configs.Config {
   129  	return *c.config
   130  }
   131  
   132  func (c *linuxContainer) Status() (Status, error) {
   133  	c.m.Lock()
   134  	defer c.m.Unlock()
   135  	return c.currentStatus()
   136  }
   137  
   138  func (c *linuxContainer) State() (*State, error) {
   139  	c.m.Lock()
   140  	defer c.m.Unlock()
   141  	return c.currentState()
   142  }
   143  
   144  func (c *linuxContainer) Processes() ([]int, error) {
   145  	pids, err := c.cgroupManager.GetAllPids()
   146  	if err != nil {
   147  		return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
   148  	}
   149  	return pids, nil
   150  }
   151  
   152  func (c *linuxContainer) Stats() (*Stats, error) {
   153  	var (
   154  		err   error
   155  		stats = &Stats{}
   156  	)
   157  	if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
   158  		return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
   159  	}
   160  	for _, iface := range c.config.Networks {
   161  		switch iface.Type {
   162  		case "veth":
   163  			istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
   164  			if err != nil {
   165  				return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
   166  			}
   167  			stats.Interfaces = append(stats.Interfaces, istats)
   168  		}
   169  	}
   170  	return stats, nil
   171  }
   172  
   173  func (c *linuxContainer) Set(config configs.Config) error {
   174  	c.m.Lock()
   175  	defer c.m.Unlock()
   176  	status, err := c.currentStatus()
   177  	if err != nil {
   178  		return err
   179  	}
   180  	if status == Stopped {
   181  		return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
   182  	}
   183  	c.config = &config
   184  	return c.cgroupManager.Set(c.config)
   185  }
   186  
   187  func (c *linuxContainer) Start(process *Process) error {
   188  	c.m.Lock()
   189  	defer c.m.Unlock()
   190  	status, err := c.currentStatus()
   191  	if err != nil {
   192  		return err
   193  	}
   194  	return c.start(process, status == Stopped)
   195  }
   196  
   197  func (c *linuxContainer) Run(process *Process) error {
   198  	c.m.Lock()
   199  	defer c.m.Unlock()
   200  	status, err := c.currentStatus()
   201  	if err != nil {
   202  		return err
   203  	}
   204  	if err := c.start(process, status == Stopped); err != nil {
   205  		return err
   206  	}
   207  	if status == Stopped {
   208  		return c.exec()
   209  	}
   210  	return nil
   211  }
   212  
   213  func (c *linuxContainer) Exec() error {
   214  	c.m.Lock()
   215  	defer c.m.Unlock()
   216  	return c.exec()
   217  }
   218  
   219  func (c *linuxContainer) exec() error {
   220  	path := filepath.Join(c.root, execFifoFilename)
   221  	f, err := os.OpenFile(path, os.O_RDONLY, 0)
   222  	if err != nil {
   223  		return newSystemErrorWithCause(err, "open exec fifo for reading")
   224  	}
   225  	defer f.Close()
   226  	data, err := ioutil.ReadAll(f)
   227  	if err != nil {
   228  		return err
   229  	}
   230  	if len(data) > 0 {
   231  		os.Remove(path)
   232  		return nil
   233  	}
   234  	return fmt.Errorf("cannot start an already running container")
   235  }
   236  
   237  func (c *linuxContainer) start(process *Process, isInit bool) error {
   238  	parent, err := c.newParentProcess(process, isInit)
   239  	if err != nil {
   240  		return newSystemErrorWithCause(err, "creating new parent process")
   241  	}
   242  	if err := parent.start(); err != nil {
   243  		// terminate the process to ensure that it properly is reaped.
   244  		if err := parent.terminate(); err != nil {
   245  			logrus.Warn(err)
   246  		}
   247  		return newSystemErrorWithCause(err, "starting container process")
   248  	}
   249  	// generate a timestamp indicating when the container was started
   250  	c.created = time.Now().UTC()
   251  	c.state = &runningState{
   252  		c: c,
   253  	}
   254  	if isInit {
   255  		c.state = &createdState{
   256  			c: c,
   257  		}
   258  		state, err := c.updateState(parent)
   259  		if err != nil {
   260  			return err
   261  		}
   262  		c.initProcessStartTime = state.InitProcessStartTime
   263  
   264  		if c.config.Hooks != nil {
   265  			s := configs.HookState{
   266  				Version:    c.config.Version,
   267  				ID:         c.id,
   268  				Pid:        parent.pid(),
   269  				Root:       c.config.Rootfs,
   270  				BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
   271  			}
   272  			for i, hook := range c.config.Hooks.Poststart {
   273  				if err := hook.Run(s); err != nil {
   274  					if err := parent.terminate(); err != nil {
   275  						logrus.Warn(err)
   276  					}
   277  					return newSystemErrorWithCausef(err, "running poststart hook %d", i)
   278  				}
   279  			}
   280  		}
   281  	}
   282  	return nil
   283  }
   284  
   285  func (c *linuxContainer) Signal(s os.Signal, all bool) error {
   286  	if all {
   287  		return signalAllProcesses(c.cgroupManager, s)
   288  	}
   289  	if err := c.initProcess.signal(s); err != nil {
   290  		return newSystemErrorWithCause(err, "signaling init process")
   291  	}
   292  	return nil
   293  }
   294  
   295  func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
   296  	parentPipe, childPipe, err := newPipe()
   297  	if err != nil {
   298  		return nil, newSystemErrorWithCause(err, "creating new init pipe")
   299  	}
   300  	rootDir, err := os.Open(c.root)
   301  	if err != nil {
   302  		return nil, err
   303  	}
   304  	cmd, err := c.commandTemplate(p, childPipe, rootDir)
   305  	if err != nil {
   306  		return nil, newSystemErrorWithCause(err, "creating new command template")
   307  	}
   308  	if !doInit {
   309  		return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir)
   310  	}
   311  	return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
   312  }
   313  
   314  func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) {
   315  	cmd := exec.Command(c.initArgs[0], c.initArgs[1:]...)
   316  	cmd.Stdin = p.Stdin
   317  	cmd.Stdout = p.Stdout
   318  	cmd.Stderr = p.Stderr
   319  	cmd.Dir = c.config.Rootfs
   320  	if cmd.SysProcAttr == nil {
   321  		cmd.SysProcAttr = &syscall.SysProcAttr{}
   322  	}
   323  	cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir)
   324  	cmd.Env = append(cmd.Env,
   325  		fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2),
   326  		fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
   327  	// NOTE: when running a container with no PID namespace and the parent process spawning the container is
   328  	// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
   329  	// even with the parent still running.
   330  	if c.config.ParentDeathSignal > 0 {
   331  		cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal)
   332  	}
   333  	return cmd, nil
   334  }
   335  
   336  func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
   337  	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
   338  	nsMaps := make(map[configs.NamespaceType]string)
   339  	for _, ns := range c.config.Namespaces {
   340  		if ns.Path != "" {
   341  			nsMaps[ns.Type] = ns.Path
   342  		}
   343  	}
   344  	_, sharePidns := nsMaps[configs.NEWPID]
   345  	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "")
   346  	if err != nil {
   347  		return nil, err
   348  	}
   349  	return &initProcess{
   350  		cmd:           cmd,
   351  		childPipe:     childPipe,
   352  		parentPipe:    parentPipe,
   353  		manager:       c.cgroupManager,
   354  		config:        c.newInitConfig(p),
   355  		container:     c,
   356  		process:       p,
   357  		bootstrapData: data,
   358  		sharePidns:    sharePidns,
   359  		rootDir:       rootDir,
   360  	}, nil
   361  }
   362  
   363  func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) {
   364  	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
   365  	state, err := c.currentState()
   366  	if err != nil {
   367  		return nil, newSystemErrorWithCause(err, "getting container's current state")
   368  	}
   369  	// for setns process, we dont have to set cloneflags as the process namespaces
   370  	// will only be set via setns syscall
   371  	data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath)
   372  	if err != nil {
   373  		return nil, err
   374  	}
   375  	// TODO: set on container for process management
   376  	return &setnsProcess{
   377  		cmd:           cmd,
   378  		cgroupPaths:   c.cgroupManager.GetPaths(),
   379  		childPipe:     childPipe,
   380  		parentPipe:    parentPipe,
   381  		config:        c.newInitConfig(p),
   382  		process:       p,
   383  		bootstrapData: data,
   384  		rootDir:       rootDir,
   385  	}, nil
   386  }
   387  
   388  func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
   389  	cfg := &initConfig{
   390  		Config:           c.config,
   391  		Args:             process.Args,
   392  		Env:              process.Env,
   393  		User:             process.User,
   394  		AdditionalGroups: process.AdditionalGroups,
   395  		Cwd:              process.Cwd,
   396  		Console:          process.consolePath,
   397  		Capabilities:     process.Capabilities,
   398  		PassedFilesCount: len(process.ExtraFiles),
   399  		ContainerId:      c.ID(),
   400  		NoNewPrivileges:  c.config.NoNewPrivileges,
   401  		AppArmorProfile:  c.config.AppArmorProfile,
   402  		ProcessLabel:     c.config.ProcessLabel,
   403  		Rlimits:          c.config.Rlimits,
   404  		ExecFifoPath:     filepath.Join(c.root, execFifoFilename),
   405  	}
   406  	if process.NoNewPrivileges != nil {
   407  		cfg.NoNewPrivileges = *process.NoNewPrivileges
   408  	}
   409  	if process.AppArmorProfile != "" {
   410  		cfg.AppArmorProfile = process.AppArmorProfile
   411  	}
   412  	if process.Label != "" {
   413  		cfg.ProcessLabel = process.Label
   414  	}
   415  	if len(process.Rlimits) > 0 {
   416  		cfg.Rlimits = process.Rlimits
   417  	}
   418  	return cfg
   419  }
   420  
   421  func newPipe() (parent *os.File, child *os.File, err error) {
   422  	fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
   423  	if err != nil {
   424  		return nil, nil, err
   425  	}
   426  	return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil
   427  }
   428  
   429  func (c *linuxContainer) Destroy() error {
   430  	c.m.Lock()
   431  	defer c.m.Unlock()
   432  	return c.state.destroy()
   433  }
   434  
   435  func (c *linuxContainer) Pause() error {
   436  	c.m.Lock()
   437  	defer c.m.Unlock()
   438  	status, err := c.currentStatus()
   439  	if err != nil {
   440  		return err
   441  	}
   442  	switch status {
   443  	case Running, Created:
   444  		if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
   445  			return err
   446  		}
   447  		return c.state.transition(&pausedState{
   448  			c: c,
   449  		})
   450  	}
   451  	return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning)
   452  }
   453  
   454  func (c *linuxContainer) Resume() error {
   455  	c.m.Lock()
   456  	defer c.m.Unlock()
   457  	status, err := c.currentStatus()
   458  	if err != nil {
   459  		return err
   460  	}
   461  	if status != Paused {
   462  		return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
   463  	}
   464  	if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
   465  		return err
   466  	}
   467  	return c.state.transition(&runningState{
   468  		c: c,
   469  	})
   470  }
   471  
   472  func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
   473  	return notifyOnOOM(c.cgroupManager.GetPaths())
   474  }
   475  
   476  func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
   477  	return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
   478  }
   479  
   480  // checkCriuVersion checks Criu version greater than or equal to minVersion
   481  func (c *linuxContainer) checkCriuVersion(minVersion string) error {
   482  	var x, y, z, versionReq int
   483  
   484  	_, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
   485  	if err != nil {
   486  		_, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6
   487  	}
   488  	versionReq = x*10000 + y*100 + z
   489  
   490  	out, err := exec.Command(c.criuPath, "-V").Output()
   491  	if err != nil {
   492  		return fmt.Errorf("Unable to execute CRIU command: %s", c.criuPath)
   493  	}
   494  
   495  	x = 0
   496  	y = 0
   497  	z = 0
   498  	if ep := strings.Index(string(out), "-"); ep >= 0 {
   499  		// criu Git version format
   500  		var version string
   501  		if sp := strings.Index(string(out), "GitID"); sp > 0 {
   502  			version = string(out)[sp:ep]
   503  		} else {
   504  			return fmt.Errorf("Unable to parse the CRIU version: %s", c.criuPath)
   505  		}
   506  
   507  		n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2
   508  		if err != nil {
   509  			n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6
   510  			y++
   511  		} else {
   512  			z++
   513  		}
   514  		if n < 2 || err != nil {
   515  			return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err)
   516  		}
   517  	} else {
   518  		// criu release version format
   519  		n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2
   520  		if err != nil {
   521  			n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6
   522  		}
   523  		if n < 2 || err != nil {
   524  			return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err)
   525  		}
   526  	}
   527  
   528  	c.criuVersion = x*10000 + y*100 + z
   529  
   530  	if c.criuVersion < versionReq {
   531  		return fmt.Errorf("CRIU version must be %s or higher", minVersion)
   532  	}
   533  
   534  	return nil
   535  }
   536  
// descriptorsFilename is the name of the JSON file inside the checkpoint
// images directory in which Checkpoint stores the init process's external
// descriptors and from which Restore reads them back.
const descriptorsFilename = "descriptors.json"
   538  
   539  func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
   540  	mountDest := m.Destination
   541  	if strings.HasPrefix(mountDest, c.config.Rootfs) {
   542  		mountDest = mountDest[len(c.config.Rootfs):]
   543  	}
   544  
   545  	extMnt := &criurpc.ExtMountMap{
   546  		Key: proto.String(mountDest),
   547  		Val: proto.String(mountDest),
   548  	}
   549  	req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
   550  }
   551  
   552  func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
   553  	c.m.Lock()
   554  	defer c.m.Unlock()
   555  
   556  	if err := c.checkCriuVersion("1.5.2"); err != nil {
   557  		return err
   558  	}
   559  
   560  	if criuOpts.ImagesDirectory == "" {
   561  		return fmt.Errorf("invalid directory to save checkpoint")
   562  	}
   563  
   564  	// Since a container can be C/R'ed multiple times,
   565  	// the checkpoint directory may already exist.
   566  	if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) {
   567  		return err
   568  	}
   569  
   570  	if criuOpts.WorkDirectory == "" {
   571  		criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
   572  	}
   573  
   574  	if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) {
   575  		return err
   576  	}
   577  
   578  	workDir, err := os.Open(criuOpts.WorkDirectory)
   579  	if err != nil {
   580  		return err
   581  	}
   582  	defer workDir.Close()
   583  
   584  	imageDir, err := os.Open(criuOpts.ImagesDirectory)
   585  	if err != nil {
   586  		return err
   587  	}
   588  	defer imageDir.Close()
   589  
   590  	rpcOpts := criurpc.CriuOpts{
   591  		ImagesDirFd:    proto.Int32(int32(imageDir.Fd())),
   592  		WorkDirFd:      proto.Int32(int32(workDir.Fd())),
   593  		LogLevel:       proto.Int32(4),
   594  		LogFile:        proto.String("dump.log"),
   595  		Root:           proto.String(c.config.Rootfs),
   596  		ManageCgroups:  proto.Bool(true),
   597  		NotifyScripts:  proto.Bool(true),
   598  		Pid:            proto.Int32(int32(c.initProcess.pid())),
   599  		ShellJob:       proto.Bool(criuOpts.ShellJob),
   600  		LeaveRunning:   proto.Bool(criuOpts.LeaveRunning),
   601  		TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
   602  		ExtUnixSk:      proto.Bool(criuOpts.ExternalUnixConnections),
   603  		FileLocks:      proto.Bool(criuOpts.FileLocks),
   604  		EmptyNs:        proto.Uint32(criuOpts.EmptyNs),
   605  	}
   606  
   607  	// append optional criu opts, e.g., page-server and port
   608  	if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
   609  		rpcOpts.Ps = &criurpc.CriuPageServerInfo{
   610  			Address: proto.String(criuOpts.PageServer.Address),
   611  			Port:    proto.Int32(criuOpts.PageServer.Port),
   612  		}
   613  	}
   614  
   615  	// append optional manage cgroups mode
   616  	if criuOpts.ManageCgroupsMode != 0 {
   617  		if err := c.checkCriuVersion("1.7"); err != nil {
   618  			return err
   619  		}
   620  		mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
   621  		rpcOpts.ManageCgroupsMode = &mode
   622  	}
   623  
   624  	t := criurpc.CriuReqType_DUMP
   625  	req := &criurpc.CriuReq{
   626  		Type: &t,
   627  		Opts: &rpcOpts,
   628  	}
   629  
   630  	for _, m := range c.config.Mounts {
   631  		switch m.Device {
   632  		case "bind":
   633  			c.addCriuDumpMount(req, m)
   634  			break
   635  		case "cgroup":
   636  			binds, err := getCgroupMounts(m)
   637  			if err != nil {
   638  				return err
   639  			}
   640  			for _, b := range binds {
   641  				c.addCriuDumpMount(req, b)
   642  			}
   643  			break
   644  		}
   645  	}
   646  
   647  	// Write the FD info to a file in the image directory
   648  
   649  	fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
   650  	if err != nil {
   651  		return err
   652  	}
   653  
   654  	err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655)
   655  	if err != nil {
   656  		return err
   657  	}
   658  
   659  	err = c.criuSwrk(nil, req, criuOpts, false)
   660  	if err != nil {
   661  		return err
   662  	}
   663  	return nil
   664  }
   665  
   666  func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
   667  	mountDest := m.Destination
   668  	if strings.HasPrefix(mountDest, c.config.Rootfs) {
   669  		mountDest = mountDest[len(c.config.Rootfs):]
   670  	}
   671  
   672  	extMnt := &criurpc.ExtMountMap{
   673  		Key: proto.String(mountDest),
   674  		Val: proto.String(m.Source),
   675  	}
   676  	req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
   677  }
   678  
   679  func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
   680  	for _, iface := range c.config.Networks {
   681  		switch iface.Type {
   682  		case "veth":
   683  			veth := new(criurpc.CriuVethPair)
   684  			veth.IfOut = proto.String(iface.HostInterfaceName)
   685  			veth.IfIn = proto.String(iface.Name)
   686  			req.Opts.Veths = append(req.Opts.Veths, veth)
   687  			break
   688  		case "loopback":
   689  			break
   690  		}
   691  	}
   692  	for _, i := range criuOpts.VethPairs {
   693  		veth := new(criurpc.CriuVethPair)
   694  		veth.IfOut = proto.String(i.HostInterfaceName)
   695  		veth.IfIn = proto.String(i.ContainerInterfaceName)
   696  		req.Opts.Veths = append(req.Opts.Veths, veth)
   697  	}
   698  }
   699  
   700  func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
   701  	c.m.Lock()
   702  	defer c.m.Unlock()
   703  	if err := c.checkCriuVersion("1.5.2"); err != nil {
   704  		return err
   705  	}
   706  	if criuOpts.WorkDirectory == "" {
   707  		criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
   708  	}
   709  	// Since a container can be C/R'ed multiple times,
   710  	// the work directory may already exist.
   711  	if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
   712  		return err
   713  	}
   714  	workDir, err := os.Open(criuOpts.WorkDirectory)
   715  	if err != nil {
   716  		return err
   717  	}
   718  	defer workDir.Close()
   719  	if criuOpts.ImagesDirectory == "" {
   720  		return fmt.Errorf("invalid directory to restore checkpoint")
   721  	}
   722  	imageDir, err := os.Open(criuOpts.ImagesDirectory)
   723  	if err != nil {
   724  		return err
   725  	}
   726  	defer imageDir.Close()
   727  	// CRIU has a few requirements for a root directory:
   728  	// * it must be a mount point
   729  	// * its parent must not be overmounted
   730  	// c.config.Rootfs is bind-mounted to a temporary directory
   731  	// to satisfy these requirements.
   732  	root := filepath.Join(c.root, "criu-root")
   733  	if err := os.Mkdir(root, 0755); err != nil {
   734  		return err
   735  	}
   736  	defer os.Remove(root)
   737  	root, err = filepath.EvalSymlinks(root)
   738  	if err != nil {
   739  		return err
   740  	}
   741  	err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
   742  	if err != nil {
   743  		return err
   744  	}
   745  	defer syscall.Unmount(root, syscall.MNT_DETACH)
   746  	t := criurpc.CriuReqType_RESTORE
   747  	req := &criurpc.CriuReq{
   748  		Type: &t,
   749  		Opts: &criurpc.CriuOpts{
   750  			ImagesDirFd:    proto.Int32(int32(imageDir.Fd())),
   751  			WorkDirFd:      proto.Int32(int32(workDir.Fd())),
   752  			EvasiveDevices: proto.Bool(true),
   753  			LogLevel:       proto.Int32(4),
   754  			LogFile:        proto.String("restore.log"),
   755  			RstSibling:     proto.Bool(true),
   756  			Root:           proto.String(root),
   757  			ManageCgroups:  proto.Bool(true),
   758  			NotifyScripts:  proto.Bool(true),
   759  			ShellJob:       proto.Bool(criuOpts.ShellJob),
   760  			ExtUnixSk:      proto.Bool(criuOpts.ExternalUnixConnections),
   761  			TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
   762  			FileLocks:      proto.Bool(criuOpts.FileLocks),
   763  			EmptyNs:        proto.Uint32(criuOpts.EmptyNs),
   764  		},
   765  	}
   766  
   767  	for _, m := range c.config.Mounts {
   768  		switch m.Device {
   769  		case "bind":
   770  			c.addCriuRestoreMount(req, m)
   771  			break
   772  		case "cgroup":
   773  			binds, err := getCgroupMounts(m)
   774  			if err != nil {
   775  				return err
   776  			}
   777  			for _, b := range binds {
   778  				c.addCriuRestoreMount(req, b)
   779  			}
   780  			break
   781  		}
   782  	}
   783  
   784  	if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
   785  		c.restoreNetwork(req, criuOpts)
   786  	}
   787  
   788  	// append optional manage cgroups mode
   789  	if criuOpts.ManageCgroupsMode != 0 {
   790  		if err := c.checkCriuVersion("1.7"); err != nil {
   791  			return err
   792  		}
   793  		mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
   794  		req.Opts.ManageCgroupsMode = &mode
   795  	}
   796  
   797  	var (
   798  		fds    []string
   799  		fdJSON []byte
   800  	)
   801  	if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
   802  		return err
   803  	}
   804  
   805  	if err := json.Unmarshal(fdJSON, &fds); err != nil {
   806  		return err
   807  	}
   808  	for i := range fds {
   809  		if s := fds[i]; strings.Contains(s, "pipe:") {
   810  			inheritFd := new(criurpc.InheritFd)
   811  			inheritFd.Key = proto.String(s)
   812  			inheritFd.Fd = proto.Int32(int32(i))
   813  			req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
   814  		}
   815  	}
   816  	return c.criuSwrk(process, req, criuOpts, true)
   817  }
   818  
   819  func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
   820  	if err := c.cgroupManager.Apply(pid); err != nil {
   821  		return err
   822  	}
   823  
   824  	path := fmt.Sprintf("/proc/%d/cgroup", pid)
   825  	cgroupsPaths, err := cgroups.ParseCgroupFile(path)
   826  	if err != nil {
   827  		return err
   828  	}
   829  
   830  	for c, p := range cgroupsPaths {
   831  		cgroupRoot := &criurpc.CgroupRoot{
   832  			Ctrl: proto.String(c),
   833  			Path: proto.String(p),
   834  		}
   835  		req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
   836  	}
   837  
   838  	return nil
   839  }
   840  
// criuSwrk starts the CRIU binary at c.criuPath in "swrk" mode, sends it req
// over a SOCK_SEQPACKET socketpair and services its responses until the
// request completes.
//
// process, when non-nil, supplies stdin/stdout/stderr for the CRIU command
// and is forwarded to criuNotifications. opts supplies the work directory
// used to build the log path quoted in error messages and is also forwarded
// to criuNotifications. When applyCgroups is true, criuApplyCgroups is called
// with the CRIU pid and req before the request is sent.
//
// It returns nil only when CRIU answers with a successful RESTORE or DUMP
// response and the CRIU process then exits successfully.
func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error {
	// fds[0] becomes our (client) end of the transport, fds[1] the end handed
	// to the CRIU process.
	fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0)
	if err != nil {
		return err
	}

	// logPath is only used to point the user at the CRIU log in error messages.
	logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
	criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
	criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
	// These deferred closes cover the early-return paths (e.g. cmd.Start
	// failing). Both files are also closed explicitly further down, so on the
	// normal path these run on already-closed files and their errors are
	// ignored.
	defer criuClient.Close()
	defer criuServer.Close()

	// criuServer is the first ExtraFiles entry, which os/exec exposes to the
	// child as file descriptor 3; hence the "3" argument.
	args := []string{"swrk", "3"}
	logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
	logrus.Debugf("Using CRIU with following args: %s", args)
	cmd := exec.Command(c.criuPath, args...)
	if process != nil {
		cmd.Stdin = process.Stdin
		cmd.Stdout = process.Stdout
		cmd.Stderr = process.Stderr
	}
	cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)

	if err := cmd.Start(); err != nil {
		return err
	}
	// The child has its own copy of the server end now; drop ours.
	criuServer.Close()

	// From here on every return path closes the client end and reaps the CRIU
	// process. Wait's error is deliberately discarded: on the success path the
	// process has already been waited for at the bottom of this function.
	defer func() {
		criuClient.Close()
		_, err := cmd.Process.Wait()
		if err != nil {
			return
		}
	}()

	if applyCgroups {
		err := c.criuApplyCgroups(cmd.Process.Pid, req)
		if err != nil {
			return err
		}
	}

	// extFds is only collected when a process was supplied; it is passed
	// through to criuNotifications unchanged.
	// NOTE(review): getPipeFds is defined outside this chunk — presumably it
	// records the stdio pipe targets of the CRIU process; confirm.
	var extFds []string
	if process != nil {
		extFds, err = getPipeFds(cmd.Process.Pid)
		if err != nil {
			return err
		}
	}

	logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
	// Debug-log every option in the request by calling its Get<Field> accessor
	// via reflection, skipping fields whose name starts with "XXX_".
	val := reflect.ValueOf(req.GetOpts())
	v := reflect.Indirect(val)
	for i := 0; i < v.NumField(); i++ {
		st := v.Type()
		name := st.Field(i).Name
		if strings.HasPrefix(name, "XXX_") {
			continue
		}
		value := val.MethodByName("Get" + name).Call([]reflect.Value{})
		logrus.Debugf("CRIU option %s with value %v", name, value[0])
	}
	data, err := proto.Marshal(req)
	if err != nil {
		return err
	}
	_, err = criuClient.Write(data)
	if err != nil {
		return err
	}

	// Response loop: the bytes returned by each Read are decoded as one
	// complete CriuResp. A read that fills the whole buffer is rejected as
	// "buffer is too small".
	buf := make([]byte, 10*4096)
	for true {
		n, err := criuClient.Read(buf)
		if err != nil {
			return err
		}
		if n == 0 {
			return fmt.Errorf("unexpected EOF")
		}
		if n == len(buf) {
			return fmt.Errorf("buffer is too small")
		}

		resp := new(criurpc.CriuResp)
		err = proto.Unmarshal(buf[:n], resp)
		if err != nil {
			return err
		}
		if !resp.GetSuccess() {
			typeString := req.GetType().String()
			return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
		}

		t := resp.GetType()
		switch {
		case t == criurpc.CriuReqType_NOTIFY:
			// Handle the notification, then acknowledge it with a NOTIFY
			// request carrying NotifySuccess=true and keep reading. Note that
			// req is overwritten here, so a later failure message reports the
			// NOTIFY type rather than the original request type.
			if err := c.criuNotifications(resp, process, opts, extFds); err != nil {
				return err
			}
			t = criurpc.CriuReqType_NOTIFY
			req = &criurpc.CriuReq{
				Type:          &t,
				NotifySuccess: proto.Bool(true),
			}
			data, err = proto.Marshal(req)
			if err != nil {
				return err
			}
			_, err = criuClient.Write(data)
			if err != nil {
				return err
			}
			continue
		// Go cases do not fall through: the empty RESTORE case and the DUMP
		// case both simply leave the switch, reaching the loop-level break
		// below. The break inside DUMP only exits the switch.
		case t == criurpc.CriuReqType_RESTORE:
		case t == criurpc.CriuReqType_DUMP:
			break
		default:
			return fmt.Errorf("unable to parse the response %s", resp.String())
		}

		// A successful RESTORE or DUMP response ends the conversation.
		break
	}

	// cmd.Wait() would also wait for the goroutines os/exec uses to proxy
	// file descriptors; here we only want to wait for the CRIU process itself.
	st, err := cmd.Process.Wait()
	if err != nil {
		return err
	}
	if !st.Success() {
		return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
	}
	return nil
}
   977  
   978  // block any external network activity
   979  func lockNetwork(config *configs.Config) error {
   980  	for _, config := range config.Networks {
   981  		strategy, err := getStrategy(config.Type)
   982  		if err != nil {
   983  			return err
   984  		}
   985  
   986  		if err := strategy.detach(config); err != nil {
   987  			return err
   988  		}
   989  	}
   990  	return nil
   991  }
   992  
   993  func unlockNetwork(config *configs.Config) error {
   994  	for _, config := range config.Networks {
   995  		strategy, err := getStrategy(config.Type)
   996  		if err != nil {
   997  			return err
   998  		}
   999  		if err = strategy.attach(config); err != nil {
  1000  			return err
  1001  		}
  1002  	}
  1003  	return nil
  1004  }
  1005  
  1006  func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string) error {
  1007  	notify := resp.GetNotify()
  1008  	if notify == nil {
  1009  		return fmt.Errorf("invalid response: %s", resp.String())
  1010  	}
  1011  	switch {
  1012  	case notify.GetScript() == "post-dump":
  1013  		f, err := os.Create(filepath.Join(c.root, "checkpoint"))
  1014  		if err != nil {
  1015  			return err
  1016  		}
  1017  		f.Close()
  1018  	case notify.GetScript() == "network-unlock":
  1019  		if err := unlockNetwork(c.config); err != nil {
  1020  			return err
  1021  		}
  1022  	case notify.GetScript() == "network-lock":
  1023  		if err := lockNetwork(c.config); err != nil {
  1024  			return err
  1025  		}
  1026  	case notify.GetScript() == "setup-namespaces":
  1027  		if c.config.Hooks != nil {
  1028  			s := configs.HookState{
  1029  				Version: c.config.Version,
  1030  				ID:      c.id,
  1031  				Pid:     int(notify.GetPid()),
  1032  				Root:    c.config.Rootfs,
  1033  			}
  1034  			for i, hook := range c.config.Hooks.Prestart {
  1035  				if err := hook.Run(s); err != nil {
  1036  					return newSystemErrorWithCausef(err, "running prestart hook %d", i)
  1037  				}
  1038  			}
  1039  		}
  1040  	case notify.GetScript() == "post-restore":
  1041  		pid := notify.GetPid()
  1042  		r, err := newRestoredProcess(int(pid), fds)
  1043  		if err != nil {
  1044  			return err
  1045  		}
  1046  		process.ops = r
  1047  		if err := c.state.transition(&restoredState{
  1048  			imageDir: opts.ImagesDirectory,
  1049  			c:        c,
  1050  		}); err != nil {
  1051  			return err
  1052  		}
  1053  		// create a timestamp indicating when the restored checkpoint was started
  1054  		c.created = time.Now().UTC()
  1055  		if _, err := c.updateState(r); err != nil {
  1056  			return err
  1057  		}
  1058  		if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
  1059  			if !os.IsNotExist(err) {
  1060  				logrus.Error(err)
  1061  			}
  1062  		}
  1063  	}
  1064  	return nil
  1065  }
  1066  
  1067  func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
  1068  	c.initProcess = process
  1069  	state, err := c.currentState()
  1070  	if err != nil {
  1071  		return nil, err
  1072  	}
  1073  	err = c.saveState(state)
  1074  	if err != nil {
  1075  		return nil, err
  1076  	}
  1077  	return state, nil
  1078  }
  1079  
  1080  func (c *linuxContainer) saveState(s *State) error {
  1081  	f, err := os.Create(filepath.Join(c.root, stateFilename))
  1082  	if err != nil {
  1083  		return err
  1084  	}
  1085  	defer f.Close()
  1086  	return utils.WriteJSON(f, s)
  1087  }
  1088  
  1089  func (c *linuxContainer) deleteState() error {
  1090  	return os.Remove(filepath.Join(c.root, stateFilename))
  1091  }
  1092  
  1093  func (c *linuxContainer) currentStatus() (Status, error) {
  1094  	if err := c.refreshState(); err != nil {
  1095  		return -1, err
  1096  	}
  1097  	return c.state.status(), nil
  1098  }
  1099  
  1100  // refreshState needs to be called to verify that the current state on the
  1101  // container is what is true.  Because consumers of libcontainer can use it
  1102  // out of process we need to verify the container's status based on runtime
  1103  // information and not rely on our in process info.
  1104  func (c *linuxContainer) refreshState() error {
  1105  	paused, err := c.isPaused()
  1106  	if err != nil {
  1107  		return err
  1108  	}
  1109  	if paused {
  1110  		return c.state.transition(&pausedState{c: c})
  1111  	}
  1112  	t, err := c.runType()
  1113  	if err != nil {
  1114  		return err
  1115  	}
  1116  	switch t {
  1117  	case Created:
  1118  		return c.state.transition(&createdState{c: c})
  1119  	case Running:
  1120  		return c.state.transition(&runningState{c: c})
  1121  	}
  1122  	return c.state.transition(&stoppedState{c: c})
  1123  }
  1124  
  1125  // doesInitProcessExist checks if the init process is still the same process
  1126  // as the initial one, it could happen that the original process has exited
  1127  // and a new process has been created with the same pid, in this case, the
  1128  // container would already be stopped.
  1129  func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) {
  1130  	startTime, err := system.GetProcessStartTime(initPid)
  1131  	if err != nil {
  1132  		return false, newSystemErrorWithCausef(err, "getting init process %d start time", initPid)
  1133  	}
  1134  	if c.initProcessStartTime != startTime {
  1135  		return false, nil
  1136  	}
  1137  	return true, nil
  1138  }
  1139  
  1140  func (c *linuxContainer) runType() (Status, error) {
  1141  	if c.initProcess == nil {
  1142  		return Stopped, nil
  1143  	}
  1144  	pid := c.initProcess.pid()
  1145  	// return Running if the init process is alive
  1146  	if err := syscall.Kill(pid, 0); err != nil {
  1147  		if err == syscall.ESRCH {
  1148  			// It means the process does not exist anymore, could happen when the
  1149  			// process exited just when we call the function, we should not return
  1150  			// error in this case.
  1151  			return Stopped, nil
  1152  		}
  1153  		return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid)
  1154  	}
  1155  	// check if the process is still the original init process.
  1156  	exist, err := c.doesInitProcessExist(pid)
  1157  	if !exist || err != nil {
  1158  		return Stopped, err
  1159  	}
  1160  	// check if the process that is running is the init process or the user's process.
  1161  	// this is the difference between the container Running and Created.
  1162  	environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid))
  1163  	if err != nil {
  1164  		return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid)
  1165  	}
  1166  	check := []byte("_LIBCONTAINER")
  1167  	if bytes.Contains(environ, check) {
  1168  		return Created, nil
  1169  	}
  1170  	return Running, nil
  1171  }
  1172  
  1173  func (c *linuxContainer) isPaused() (bool, error) {
  1174  	data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
  1175  	if err != nil {
  1176  		// If freezer cgroup is not mounted, the container would just be not paused.
  1177  		if os.IsNotExist(err) {
  1178  			return false, nil
  1179  		}
  1180  		return false, newSystemErrorWithCause(err, "checking if container is paused")
  1181  	}
  1182  	return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
  1183  }
  1184  
  1185  func (c *linuxContainer) currentState() (*State, error) {
  1186  	var (
  1187  		startTime           string
  1188  		externalDescriptors []string
  1189  		pid                 = -1
  1190  	)
  1191  	if c.initProcess != nil {
  1192  		pid = c.initProcess.pid()
  1193  		startTime, _ = c.initProcess.startTime()
  1194  		externalDescriptors = c.initProcess.externalDescriptors()
  1195  	}
  1196  	state := &State{
  1197  		BaseState: BaseState{
  1198  			ID:                   c.ID(),
  1199  			Config:               *c.config,
  1200  			InitProcessPid:       pid,
  1201  			InitProcessStartTime: startTime,
  1202  			Created:              c.created,
  1203  		},
  1204  		CgroupPaths:         c.cgroupManager.GetPaths(),
  1205  		NamespacePaths:      make(map[configs.NamespaceType]string),
  1206  		ExternalDescriptors: externalDescriptors,
  1207  	}
  1208  	if pid > 0 {
  1209  		for _, ns := range c.config.Namespaces {
  1210  			state.NamespacePaths[ns.Type] = ns.GetPath(pid)
  1211  		}
  1212  		for _, nsType := range configs.NamespaceTypes() {
  1213  			if !configs.IsNamespaceSupported(nsType) {
  1214  				continue
  1215  			}
  1216  			if _, ok := state.NamespacePaths[nsType]; !ok {
  1217  				ns := configs.Namespace{Type: nsType}
  1218  				state.NamespacePaths[ns.Type] = ns.GetPath(pid)
  1219  			}
  1220  		}
  1221  	}
  1222  	return state, nil
  1223  }
  1224  
  1225  // orderNamespacePaths sorts namespace paths into a list of paths that we
  1226  // can setns in order.
  1227  func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
  1228  	paths := []string{}
  1229  	order := []configs.NamespaceType{
  1230  		// The user namespace *must* be done first.
  1231  		configs.NEWUSER,
  1232  		configs.NEWIPC,
  1233  		configs.NEWUTS,
  1234  		configs.NEWNET,
  1235  		configs.NEWPID,
  1236  		configs.NEWNS,
  1237  	}
  1238  
  1239  	// Remove namespaces that we don't need to join.
  1240  	var nsTypes []configs.NamespaceType
  1241  	for _, ns := range order {
  1242  		if c.config.Namespaces.Contains(ns) {
  1243  			nsTypes = append(nsTypes, ns)
  1244  		}
  1245  	}
  1246  	for _, nsType := range nsTypes {
  1247  		if p, ok := namespaces[nsType]; ok && p != "" {
  1248  			// check if the requested namespace is supported
  1249  			if !configs.IsNamespaceSupported(nsType) {
  1250  				return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType))
  1251  			}
  1252  			// only set to join this namespace if it exists
  1253  			if _, err := os.Lstat(p); err != nil {
  1254  				return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
  1255  			}
  1256  			// do not allow namespace path with comma as we use it to separate
  1257  			// the namespace paths
  1258  			if strings.ContainsRune(p, ',') {
  1259  				return nil, newSystemError(fmt.Errorf("invalid path %s", p))
  1260  			}
  1261  			paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(nsType), p))
  1262  		}
  1263  	}
  1264  	return paths, nil
  1265  }
  1266  
  1267  func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
  1268  	data := bytes.NewBuffer(nil)
  1269  	for _, im := range idMap {
  1270  		line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
  1271  		if _, err := data.WriteString(line); err != nil {
  1272  			return nil, err
  1273  		}
  1274  	}
  1275  	return data.Bytes(), nil
  1276  }
  1277  
  1278  // bootstrapData encodes the necessary data in netlink binary format
  1279  // as a io.Reader.
  1280  // Consumer can write the data to a bootstrap program
  1281  // such as one that uses nsenter package to bootstrap the container's
  1282  // init process correctly, i.e. with correct namespaces, uid/gid
  1283  // mapping etc.
  1284  func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) {
  1285  	// create the netlink message
  1286  	r := nl.NewNetlinkRequest(int(InitMsg), 0)
  1287  
  1288  	// write cloneFlags
  1289  	r.AddData(&Int32msg{
  1290  		Type:  CloneFlagsAttr,
  1291  		Value: uint32(cloneFlags),
  1292  	})
  1293  
  1294  	// write console path
  1295  	if consolePath != "" {
  1296  		r.AddData(&Bytemsg{
  1297  			Type:  ConsolePathAttr,
  1298  			Value: []byte(consolePath),
  1299  		})
  1300  	}
  1301  
  1302  	// write custom namespace paths
  1303  	if len(nsMaps) > 0 {
  1304  		nsPaths, err := c.orderNamespacePaths(nsMaps)
  1305  		if err != nil {
  1306  			return nil, err
  1307  		}
  1308  		r.AddData(&Bytemsg{
  1309  			Type:  NsPathsAttr,
  1310  			Value: []byte(strings.Join(nsPaths, ",")),
  1311  		})
  1312  	}
  1313  
  1314  	// write namespace paths only when we are not joining an existing user ns
  1315  	_, joinExistingUser := nsMaps[configs.NEWUSER]
  1316  	if !joinExistingUser {
  1317  		// write uid mappings
  1318  		if len(c.config.UidMappings) > 0 {
  1319  			b, err := encodeIDMapping(c.config.UidMappings)
  1320  			if err != nil {
  1321  				return nil, err
  1322  			}
  1323  			r.AddData(&Bytemsg{
  1324  				Type:  UidmapAttr,
  1325  				Value: b,
  1326  			})
  1327  		}
  1328  
  1329  		// write gid mappings
  1330  		if len(c.config.GidMappings) > 0 {
  1331  			b, err := encodeIDMapping(c.config.GidMappings)
  1332  			if err != nil {
  1333  				return nil, err
  1334  			}
  1335  			r.AddData(&Bytemsg{
  1336  				Type:  GidmapAttr,
  1337  				Value: b,
  1338  			})
  1339  			// check if we have CAP_SETGID to setgroup properly
  1340  			pid, err := capability.NewPid(os.Getpid())
  1341  			if err != nil {
  1342  				return nil, err
  1343  			}
  1344  			if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
  1345  				r.AddData(&Boolmsg{
  1346  					Type:  SetgroupAttr,
  1347  					Value: true,
  1348  				})
  1349  			}
  1350  		}
  1351  	}
  1352  
  1353  	return bytes.NewReader(r.Serialize()), nil
  1354  }