github.com/apptainer/singularity@v3.1.1+incompatible/internal/pkg/runtime/engines/oci/create.go (about)

     1  // Copyright (c) 2018, Sylabs Inc. All rights reserved.
     2  // This software is licensed under a 3-clause BSD license. Please consult the
     3  // LICENSE.md file distributed with the sources of this project regarding your
     4  // rights to use or distribute this software.
     5  
     6  package oci
     7  
     8  import (
     9  	"bufio"
    10  	"encoding/json"
    11  	"fmt"
    12  	"net"
    13  	"net/rpc"
    14  	"os"
    15  	"path/filepath"
    16  	"strings"
    17  	"syscall"
    18  	"time"
    19  
    20  	"github.com/sylabs/singularity/pkg/ociruntime"
    21  	"github.com/sylabs/singularity/pkg/util/namespaces"
    22  	"github.com/sylabs/singularity/pkg/util/sysctl"
    23  	"github.com/sylabs/singularity/pkg/util/unix"
    24  
    25  	specs "github.com/opencontainers/runtime-spec/specs-go"
    26  	"github.com/sylabs/singularity/internal/pkg/cgroups"
    27  	"github.com/sylabs/singularity/internal/pkg/instance"
    28  	"github.com/sylabs/singularity/internal/pkg/runtime/engines/oci/rpc/client"
    29  	"github.com/sylabs/singularity/internal/pkg/sylog"
    30  	"github.com/sylabs/singularity/internal/pkg/util/fs"
    31  	"github.com/sylabs/singularity/internal/pkg/util/fs/mount"
    32  	"github.com/sylabs/singularity/pkg/util/fs/proc"
    33  )
    34  
    35  var symlinkDevices = []struct {
    36  	old string
    37  	new string
    38  }{
    39  	{"/proc/self/fd", "/dev/fd"},
    40  	{"/proc/kcore", "/dev/core"},
    41  	{"pts/ptmx", "/dev/ptmx"},
    42  	{"/proc/self/fd/0", "/dev/stdin"},
    43  	{"/proc/self/fd/1", "/dev/stdout"},
    44  	{"/proc/self/fd/2", "/dev/stderr"},
    45  }
    46  
    47  type device struct {
    48  	major int64
    49  	minor int64
    50  	path  string
    51  	mode  os.FileMode
    52  	uid   int
    53  	gid   int
    54  }
    55  
    56  var devices = []device{
    57  	{1, 7, "/dev/full", syscall.S_IFCHR | 0666, 0, 0},
    58  	{1, 3, "/dev/null", syscall.S_IFCHR | 0666, 0, 0},
    59  	{1, 8, "/dev/random", syscall.S_IFCHR | 0666, 0, 0},
    60  	{5, 0, "/dev/tty", syscall.S_IFCHR | 0666, 0, 0},
    61  	{1, 9, "/dev/urandom", syscall.S_IFCHR | 0666, 0, 0},
    62  	{1, 5, "/dev/zero", syscall.S_IFCHR | 0666, 0, 0},
    63  }
    64  
    65  func int64ptr(i int) *int64 {
    66  	t := int64(i)
    67  	return &t
    68  }
    69  
    70  var cgroupDevices = []specs.LinuxDeviceCgroup{
    71  	{
    72  		Allow:  true,
    73  		Type:   "c",
    74  		Major:  int64ptr(1),
    75  		Minor:  int64ptr(7),
    76  		Access: "rw",
    77  	},
    78  	{
    79  		Allow:  true,
    80  		Type:   "c",
    81  		Major:  int64ptr(1),
    82  		Minor:  int64ptr(3),
    83  		Access: "rw",
    84  	},
    85  	{
    86  		Allow:  true,
    87  		Type:   "c",
    88  		Major:  int64ptr(1),
    89  		Minor:  int64ptr(8),
    90  		Access: "rw",
    91  	},
    92  	{
    93  		Allow:  true,
    94  		Type:   "c",
    95  		Major:  int64ptr(5),
    96  		Minor:  int64ptr(0),
    97  		Access: "rw",
    98  	},
    99  	{
   100  		Allow:  true,
   101  		Type:   "c",
   102  		Major:  int64ptr(1),
   103  		Minor:  int64ptr(9),
   104  		Access: "rw",
   105  	},
   106  	{
   107  		Allow:  true,
   108  		Type:   "c",
   109  		Major:  int64ptr(1),
   110  		Minor:  int64ptr(5),
   111  		Access: "rw",
   112  	},
   113  	{
   114  		Allow:  true,
   115  		Type:   "c",
   116  		Major:  int64ptr(136),
   117  		Access: "rwm",
   118  	},
   119  	{
   120  		Allow:  true,
   121  		Type:   "c",
   122  		Major:  int64ptr(5),
   123  		Minor:  int64ptr(1),
   124  		Access: "rw",
   125  	},
   126  	{
   127  		Allow:  true,
   128  		Type:   "c",
   129  		Major:  int64ptr(5),
   130  		Minor:  int64ptr(2),
   131  		Access: "rw",
   132  	},
   133  }
   134  
   135  type container struct {
   136  	engine      *EngineOperations
   137  	rpcOps      *client.RPC
   138  	rootfs      string
   139  	rpcRoot     string
   140  	userNS      bool
   141  	utsNS       bool
   142  	mntNS       bool
   143  	devIndex    int
   144  	cgroupIndex int
   145  }
   146  
   147  var statusChan = make(chan string, 1)
   148  
   149  func (engine *EngineOperations) createState(pid int) error {
   150  	engine.EngineConfig.Lock()
   151  	defer engine.EngineConfig.Unlock()
   152  
   153  	name := engine.CommonConfig.ContainerID
   154  
   155  	file, err := instance.Add(name, true, instance.OciSubDir)
   156  	if err != nil {
   157  		return err
   158  	}
   159  
   160  	engine.EngineConfig.State.Version = specs.Version
   161  	engine.EngineConfig.State.Bundle = engine.EngineConfig.GetBundlePath()
   162  	engine.EngineConfig.State.ID = engine.CommonConfig.ContainerID
   163  	engine.EngineConfig.State.Pid = pid
   164  	engine.EngineConfig.State.Status = ociruntime.Creating
   165  	engine.EngineConfig.State.Annotations = engine.EngineConfig.OciConfig.Annotations
   166  
   167  	file.Config, err = json.Marshal(engine.CommonConfig)
   168  	if err != nil {
   169  		return err
   170  	}
   171  
   172  	file.User = "root"
   173  	file.Pid = pid
   174  	file.PPid = os.Getpid()
   175  	file.Image = filepath.Join(engine.EngineConfig.GetBundlePath(), engine.EngineConfig.OciConfig.Root.Path)
   176  
   177  	if err := file.Update(); err != nil {
   178  		return err
   179  	}
   180  
   181  	socketPath := engine.EngineConfig.SyncSocket
   182  
   183  	if socketPath != "" {
   184  		data, err := json.Marshal(engine.EngineConfig.State)
   185  		if err != nil {
   186  			sylog.Warningf("failed to marshal state data: %s", err)
   187  		} else if err := unix.WriteSocket(socketPath, data); err != nil {
   188  			sylog.Warningf("%s", err)
   189  		}
   190  	}
   191  
   192  	return nil
   193  }
   194  
   195  func (engine *EngineOperations) updateState(status string) error {
   196  	engine.EngineConfig.Lock()
   197  	defer engine.EngineConfig.Unlock()
   198  
   199  	file, err := instance.Get(engine.CommonConfig.ContainerID, instance.OciSubDir)
   200  	if err != nil {
   201  		return err
   202  	}
   203  	// do nothing if already stopped
   204  	if engine.EngineConfig.State.Status == ociruntime.Stopped {
   205  		return nil
   206  	}
   207  	oldStatus := engine.EngineConfig.State.Status
   208  	engine.EngineConfig.State.Status = status
   209  
   210  	t := time.Now().UnixNano()
   211  
   212  	switch status {
   213  	case ociruntime.Created:
   214  		if engine.EngineConfig.State.CreatedAt == nil {
   215  			engine.EngineConfig.State.CreatedAt = &t
   216  		}
   217  	case ociruntime.Running:
   218  		if engine.EngineConfig.State.StartedAt == nil {
   219  			engine.EngineConfig.State.StartedAt = &t
   220  		}
   221  	case ociruntime.Stopped:
   222  		if engine.EngineConfig.State.FinishedAt == nil {
   223  			engine.EngineConfig.State.FinishedAt = &t
   224  		}
   225  	}
   226  
   227  	file.Config, err = json.Marshal(engine.CommonConfig)
   228  	if err != nil {
   229  		return err
   230  	}
   231  
   232  	if err := file.Update(); err != nil {
   233  		return err
   234  	}
   235  
   236  	socketPath := engine.EngineConfig.SyncSocket
   237  
   238  	if socketPath != "" {
   239  		data, err := json.Marshal(engine.EngineConfig.State)
   240  		if err != nil {
   241  			sylog.Warningf("failed to marshal state data: %s", err)
   242  		} else if err := unix.WriteSocket(socketPath, data); err != nil {
   243  			sylog.Warningf("%s", err)
   244  		}
   245  	}
   246  
   247  	// send running or stopped status right after container creation
   248  	// to notify that container process started
   249  	if statusChan != nil && oldStatus == ociruntime.Created &&
   250  		(status == ociruntime.Running || status == ociruntime.Stopped) {
   251  		statusChan <- status
   252  	}
   253  	return nil
   254  }
   255  
   256  // one shot function to wait on running or stopped status
   257  func (engine *EngineOperations) waitStatusUpdate() {
   258  	if statusChan == nil {
   259  		return
   260  	}
   261  	// block until status update is sent
   262  	<-statusChan
   263  	// close channel and set it to nil
   264  	close(statusChan)
   265  	statusChan = nil
   266  }
   267  
   268  // CreateContainer creates a container
   269  func (engine *EngineOperations) CreateContainer(pid int, rpcConn net.Conn) error {
   270  	var err error
   271  
   272  	if engine.CommonConfig.EngineName != Name {
   273  		return fmt.Errorf("engineName configuration doesn't match runtime name")
   274  	}
   275  
   276  	rpcOps := &client.RPC{}
   277  	rpcOps.Client = rpc.NewClient(rpcConn)
   278  	rpcOps.Name = engine.CommonConfig.EngineName
   279  
   280  	if rpcOps.Client == nil {
   281  		return fmt.Errorf("failed to initialize RPC client")
   282  	}
   283  
   284  	if err := engine.createState(pid); err != nil {
   285  		return err
   286  	}
   287  
   288  	rootfs := engine.EngineConfig.OciConfig.Root.Path
   289  
   290  	if !filepath.IsAbs(rootfs) {
   291  		rootfs = filepath.Join(engine.EngineConfig.GetBundlePath(), rootfs)
   292  	}
   293  
   294  	resolvedRootfs, err := filepath.EvalSymlinks(rootfs)
   295  	if err != nil {
   296  		return fmt.Errorf("failed to resolve %s path: %s", rootfs, err)
   297  	}
   298  
   299  	c := &container{
   300  		engine:      engine,
   301  		rpcOps:      rpcOps,
   302  		rootfs:      resolvedRootfs,
   303  		rpcRoot:     fmt.Sprintf("/proc/%d/root", pid),
   304  		cgroupIndex: -1,
   305  		devIndex:    -1,
   306  	}
   307  
   308  	for _, ns := range engine.EngineConfig.OciConfig.Linux.Namespaces {
   309  		switch ns.Type {
   310  		case specs.UserNamespace:
   311  			c.userNS = true
   312  		case specs.UTSNamespace:
   313  			c.utsNS = true
   314  		case specs.MountNamespace:
   315  			c.mntNS = true
   316  		}
   317  	}
   318  
   319  	p := &mount.Points{}
   320  	if engine.EngineConfig.OciConfig.Linux.MountLabel != "" {
   321  		if err := p.SetContext(engine.EngineConfig.OciConfig.Linux.MountLabel); err != nil {
   322  			return err
   323  		}
   324  	}
   325  
   326  	system := &mount.System{Points: p, Mount: c.mount}
   327  
   328  	for i, point := range engine.EngineConfig.OciConfig.Config.Mounts {
   329  		// cgroup creation
   330  		if point.Type == "cgroup" {
   331  			c.cgroupIndex = i
   332  			continue
   333  		}
   334  		// dev creation
   335  		if point.Destination == "/dev" && point.Type == "tmpfs" {
   336  			c.devIndex = i
   337  		}
   338  	}
   339  
   340  	if err := c.addDevices(system); err != nil {
   341  		return err
   342  	}
   343  
   344  	if err := c.addCgroups(pid, system); err != nil {
   345  		return err
   346  	}
   347  
   348  	// import OCI mount spec
   349  	if err := system.Points.ImportFromSpec(engine.EngineConfig.OciConfig.Config.Mounts); err != nil {
   350  		return err
   351  	}
   352  
   353  	if err := c.addRootfsMount(system); err != nil {
   354  		return err
   355  	}
   356  
   357  	if err := system.RunAfterTag(mount.KernelTag, c.addDefaultDevices); err != nil {
   358  		return err
   359  	}
   360  
   361  	if err := system.RunAfterTag(mount.KernelTag, c.addAllPaths); err != nil {
   362  		return err
   363  	}
   364  
   365  	if err := proc.SetOOMScoreAdj(pid, engine.EngineConfig.OciConfig.Process.OOMScoreAdj); err != nil {
   366  		return err
   367  	}
   368  
   369  	if err := namespaces.Enter(pid, "ipc"); err != nil {
   370  		return err
   371  	}
   372  	if err := namespaces.Enter(pid, "net"); err != nil {
   373  		return err
   374  	}
   375  
   376  	for key, value := range engine.EngineConfig.OciConfig.Linux.Sysctl {
   377  		if err := sysctl.Set(key, value); err != nil {
   378  			return err
   379  		}
   380  	}
   381  
   382  	if err := namespaces.Enter(os.Getpid(), "ipc"); err != nil {
   383  		return err
   384  	}
   385  	if err := namespaces.Enter(os.Getpid(), "net"); err != nil {
   386  		return err
   387  	}
   388  
   389  	sylog.Debugf("Mount all")
   390  	if err := system.MountAll(); err != nil {
   391  		return err
   392  	}
   393  
   394  	if c.utsNS && engine.EngineConfig.OciConfig.Hostname != "" {
   395  		if _, err := rpcOps.SetHostname(engine.EngineConfig.OciConfig.Hostname); err != nil {
   396  			return err
   397  		}
   398  	}
   399  
   400  	// update namespaces configuration path
   401  	namespaces := []struct {
   402  		nstype       string
   403  		ns           specs.LinuxNamespaceType
   404  		checkEnabled bool
   405  	}{
   406  		{"pid", specs.PIDNamespace, false},
   407  		{"uts", specs.UTSNamespace, false},
   408  		{"ipc", specs.IPCNamespace, false},
   409  		{"mnt", specs.MountNamespace, false},
   410  		{"cgroup", specs.CgroupNamespace, false},
   411  		{"net", specs.NetworkNamespace, false},
   412  		{"user", specs.UserNamespace, true},
   413  	}
   414  
   415  	path := fmt.Sprintf("/proc/%d/ns", pid)
   416  
   417  	for _, n := range namespaces {
   418  		has, err := proc.HasNamespace(pid, n.nstype)
   419  		if err == nil && (has || n.checkEnabled) {
   420  			enabled := false
   421  			if n.checkEnabled {
   422  				if engine.EngineConfig.OciConfig.Linux != nil {
   423  					for _, namespace := range engine.EngineConfig.OciConfig.Linux.Namespaces {
   424  						if n.ns == namespace.Type {
   425  							enabled = true
   426  							break
   427  						}
   428  					}
   429  				}
   430  			}
   431  			if has || enabled {
   432  				nspath := filepath.Join(path, n.nstype)
   433  				engine.EngineConfig.OciConfig.AddOrReplaceLinuxNamespace(string(n.ns), nspath)
   434  			}
   435  		} else if err != nil {
   436  			return fmt.Errorf("failed to check %s root and container namespace: %s", n.ns, err)
   437  		}
   438  	}
   439  
   440  	method := "pivot"
   441  	if !c.mntNS {
   442  		method = "chroot"
   443  	}
   444  
   445  	_, err = rpcOps.Chroot(c.rootfs, method)
   446  	if err != nil {
   447  		return fmt.Errorf("chroot failed: %s", err)
   448  	}
   449  
   450  	if engine.EngineConfig.SlavePts != -1 {
   451  		if err := syscall.Close(engine.EngineConfig.SlavePts); err != nil {
   452  			return fmt.Errorf("failed to close slave part: %s", err)
   453  		}
   454  	}
   455  	if engine.EngineConfig.OutputStreams[0] != -1 {
   456  		if err := syscall.Close(engine.EngineConfig.OutputStreams[1]); err != nil {
   457  			return fmt.Errorf("failed to close write output stream: %s", err)
   458  		}
   459  	}
   460  	if engine.EngineConfig.ErrorStreams[0] != -1 {
   461  		if err := syscall.Close(engine.EngineConfig.ErrorStreams[1]); err != nil {
   462  			return fmt.Errorf("failed to close write error stream: %s", err)
   463  		}
   464  	}
   465  	if engine.EngineConfig.InputStreams[0] != -1 {
   466  		if err := syscall.Close(engine.EngineConfig.InputStreams[1]); err != nil {
   467  			return fmt.Errorf("failed to close write input stream: %s", err)
   468  		}
   469  	}
   470  
   471  	return nil
   472  }
   473  
   474  func (c *container) addCgroups(pid int, system *mount.System) error {
   475  	name := c.engine.CommonConfig.ContainerID
   476  	cgroupsPath := c.engine.EngineConfig.OciConfig.Linux.CgroupsPath
   477  
   478  	if !filepath.IsAbs(cgroupsPath) {
   479  		if cgroupsPath == "" {
   480  			cgroupsPath = filepath.Join("/singularity-oci", name)
   481  		} else {
   482  			cgroupsPath = filepath.Join("/", cgroupsPath)
   483  		}
   484  	}
   485  
   486  	c.engine.EngineConfig.OciConfig.Linux.CgroupsPath = cgroupsPath
   487  
   488  	manager := &cgroups.Manager{Path: cgroupsPath, Pid: pid}
   489  
   490  	if err := manager.ApplyFromSpec(c.engine.EngineConfig.OciConfig.Linux.Resources); err != nil {
   491  		return fmt.Errorf("Failed to apply cgroups ressources restriction: %s", err)
   492  	}
   493  
   494  	if c.cgroupIndex >= 0 {
   495  		m := c.engine.EngineConfig.OciConfig.Config.Mounts[c.cgroupIndex]
   496  		c.engine.EngineConfig.OciConfig.Config.Mounts = append(
   497  			c.engine.EngineConfig.OciConfig.Config.Mounts[:c.cgroupIndex],
   498  			c.engine.EngineConfig.OciConfig.Config.Mounts[c.cgroupIndex+1:]...,
   499  		)
   500  
   501  		cgroupRootPath := manager.GetCgroupRootPath()
   502  		if cgroupRootPath == "" {
   503  			return fmt.Errorf("failed to determine cgroup root path")
   504  		}
   505  
   506  		flags, opt := mount.ConvertOptions(m.Options)
   507  		options := strings.Join(opt, ",")
   508  
   509  		readOnly := false
   510  		if flags&syscall.MS_RDONLY != 0 {
   511  			readOnly = true
   512  			flags &^= uintptr(syscall.MS_RDONLY)
   513  		}
   514  
   515  		hasMode := false
   516  		for _, o := range opt {
   517  			if strings.HasPrefix(o, "mode=") {
   518  				hasMode = true
   519  				break
   520  			}
   521  		}
   522  		if !hasMode {
   523  			options += ",mode=755"
   524  		}
   525  
   526  		if err := system.Points.AddFS(mount.OtherTag, m.Destination, "tmpfs", flags, options); err != nil {
   527  			return err
   528  		}
   529  
   530  		createSymlinks := func(*mount.System) error {
   531  			cgroupPath := filepath.Join(c.rpcRoot, c.rootfs, m.Destination)
   532  			if _, err := os.Stat(filepath.Join(cgroupPath, "cpu")); err != nil && os.IsNotExist(err) {
   533  				if _, err := c.rpcOps.Symlink("cpu,cpuacct", filepath.Join(c.rootfs, m.Destination, "cpu")); err != nil {
   534  					return err
   535  				}
   536  				if _, err := c.rpcOps.Symlink("cpu,cpuacct", filepath.Join(c.rootfs, m.Destination, "cpuacct")); err != nil {
   537  					return err
   538  				}
   539  			}
   540  
   541  			if _, err := os.Stat(filepath.Join(cgroupPath, "net_cls")); err != nil && os.IsNotExist(err) {
   542  				if _, err := c.rpcOps.Symlink("net_cls,net_prio", filepath.Join(c.rootfs, m.Destination, "net_cls")); err != nil {
   543  					return err
   544  				}
   545  				if _, err := c.rpcOps.Symlink("net_cls,net_prio", filepath.Join(c.rootfs, m.Destination, "net_prio")); err != nil {
   546  					return err
   547  				}
   548  			}
   549  			return nil
   550  		}
   551  
   552  		if err := system.RunAfterTag(mount.OtherTag, createSymlinks); err != nil {
   553  			return err
   554  		}
   555  
   556  		f, err := os.Open(fmt.Sprintf("/proc/%d/cgroup", pid))
   557  		if err != nil {
   558  			return err
   559  		}
   560  		defer f.Close()
   561  
   562  		flags |= uintptr(syscall.MS_BIND)
   563  		if readOnly {
   564  			flags |= syscall.MS_RDONLY
   565  		}
   566  
   567  		scanner := bufio.NewScanner(f)
   568  		for scanner.Scan() {
   569  			cgroupLine := strings.Split(scanner.Text(), ":")
   570  			if strings.HasPrefix(cgroupLine[1], "name=") {
   571  				cgroupLine[1] = strings.Replace(cgroupLine[1], "name=", "", 1)
   572  			}
   573  			if cgroupLine[1] != "" {
   574  				source := filepath.Join(cgroupRootPath, cgroupLine[1], cgroupLine[2])
   575  				dest := filepath.Join(m.Destination, cgroupLine[1])
   576  				if err := system.Points.AddBind(mount.OtherTag, source, dest, flags); err != nil {
   577  					return err
   578  				}
   579  				if readOnly {
   580  					if err := system.Points.AddRemount(mount.OtherTag, dest, flags); err != nil {
   581  						return err
   582  					}
   583  				}
   584  			}
   585  		}
   586  
   587  		if readOnly {
   588  			if err := system.Points.AddRemount(mount.FinalTag, m.Destination, flags); err != nil {
   589  				return err
   590  			}
   591  		}
   592  	}
   593  
   594  	c.engine.EngineConfig.Cgroups = manager
   595  
   596  	return nil
   597  }
   598  
   599  func (c *container) addAllPaths(system *mount.System) error {
   600  	// add masked path
   601  	if err := c.addMaskedPathsMount(system); err != nil {
   602  		return err
   603  	}
   604  
   605  	// add read-only path
   606  	if !c.userNS {
   607  		if err := c.addReadonlyPathsMount(system); err != nil {
   608  			return err
   609  		}
   610  	}
   611  
   612  	return nil
   613  }
   614  
   615  func (c *container) addRootfsMount(system *mount.System) error {
   616  	flags := uintptr(syscall.MS_BIND)
   617  
   618  	if c.engine.EngineConfig.OciConfig.Root.Readonly {
   619  		sylog.Debugf("Mounted read-only")
   620  		flags |= syscall.MS_RDONLY
   621  	}
   622  
   623  	parentRootfs, err := proc.ParentMount(c.rootfs)
   624  	if err != nil {
   625  		return err
   626  	}
   627  
   628  	sylog.Debugf("Parent rootfs: %s", parentRootfs)
   629  
   630  	if _, err := c.rpcOps.Mount("", parentRootfs, "", syscall.MS_PRIVATE, ""); err != nil {
   631  		return err
   632  	}
   633  	if err := system.Points.AddBind(mount.RootfsTag, c.rootfs, c.rootfs, flags); err != nil {
   634  		return err
   635  	}
   636  	if flags&syscall.MS_RDONLY != 0 {
   637  		return system.Points.AddRemount(mount.FinalTag, c.rootfs, flags)
   638  	}
   639  
   640  	return nil
   641  }
   642  
   643  func (c *container) addDefaultDevices(system *mount.System) error {
   644  	oldmask := syscall.Umask(0)
   645  	defer syscall.Umask(oldmask)
   646  
   647  	rootfsPath := filepath.Join(c.rpcRoot, c.rootfs)
   648  
   649  	devPath := filepath.Join(rootfsPath, fs.EvalRelative("/dev", rootfsPath))
   650  	if _, err := os.Lstat(devPath); os.IsNotExist(err) {
   651  		if err := os.Mkdir(devPath, 0755); err != nil {
   652  			return err
   653  		}
   654  	}
   655  
   656  	for _, symlink := range symlinkDevices {
   657  		path := filepath.Join(rootfsPath, symlink.new)
   658  		if _, err := os.Lstat(path); os.IsNotExist(err) {
   659  			if c.userNS {
   660  				path = filepath.Join(c.rootfs, symlink.new)
   661  				if _, err := c.rpcOps.Symlink(symlink.old, path); err != nil {
   662  					return err
   663  				}
   664  			} else {
   665  				if err := os.Symlink(symlink.old, path); err != nil {
   666  					return err
   667  				}
   668  			}
   669  		}
   670  	}
   671  
   672  	if c.engine.EngineConfig.OciConfig.Process.Terminal {
   673  		path := filepath.Join(rootfsPath, "dev", "console")
   674  		if _, err := os.Lstat(path); os.IsNotExist(err) {
   675  			if c.userNS {
   676  				if _, err := c.rpcOps.Touch(filepath.Join(c.rootfs, "dev", "console")); err != nil {
   677  					return err
   678  				}
   679  			} else {
   680  				if err := fs.Touch(path); err != nil {
   681  					return err
   682  				}
   683  			}
   684  			path = fmt.Sprintf("/proc/self/fd/%d", c.engine.EngineConfig.SlavePts)
   685  			console, err := os.Readlink(path)
   686  			if err != nil {
   687  				return err
   688  			}
   689  			if err := system.Points.AddBind(mount.OtherTag, console, "/dev/console", syscall.MS_BIND); err != nil {
   690  				return err
   691  			}
   692  		}
   693  	}
   694  
   695  	for _, device := range devices {
   696  		dev := int((device.major << 8) | (device.minor & 0xff) | ((device.minor & 0xfff00) << 12))
   697  		path := filepath.Join(rootfsPath, device.path)
   698  		if _, err := os.Lstat(path); os.IsNotExist(err) {
   699  			if c.userNS {
   700  				path = filepath.Join(c.rootfs, device.path)
   701  				if _, err := os.Stat(device.path); os.IsNotExist(err) {
   702  					sylog.Debugf("skipping mount, %s doesn't exists", device.path)
   703  					continue
   704  				}
   705  				if _, err := c.rpcOps.Touch(path); err != nil {
   706  					return err
   707  				}
   708  				if _, err := c.rpcOps.Mount(device.path, path, "", syscall.MS_BIND, ""); err != nil {
   709  					return err
   710  				}
   711  			} else {
   712  				if err := syscall.Mknod(path, uint32(device.mode), dev); err != nil {
   713  					return fmt.Errorf("mknod: %s", err)
   714  				}
   715  				if device.uid != 0 || device.gid != 0 {
   716  					if err := os.Chown(path, device.uid, device.gid); err != nil {
   717  						return err
   718  					}
   719  				}
   720  			}
   721  		}
   722  	}
   723  
   724  	return nil
   725  }
   726  
   727  func (c *container) addDevices(system *mount.System) error {
   728  	for _, d := range c.engine.EngineConfig.OciConfig.Linux.Devices {
   729  		var dev device
   730  
   731  		if d.Path == "" {
   732  			return fmt.Errorf("device path required")
   733  		}
   734  		dev.path = d.Path
   735  
   736  		if d.FileMode != nil {
   737  			dev.mode = *d.FileMode
   738  		} else {
   739  			dev.mode = 0644
   740  		}
   741  
   742  		switch d.Type {
   743  		case "c", "u":
   744  			dev.mode |= syscall.S_IFCHR
   745  			dev.major = d.Major
   746  			dev.minor = d.Minor
   747  		case "b":
   748  			dev.mode |= syscall.S_IFBLK
   749  			dev.major = d.Major
   750  			dev.minor = d.Minor
   751  		case "p":
   752  			dev.mode |= syscall.S_IFIFO
   753  		default:
   754  			return fmt.Errorf("device type unknown for %s", d.Path)
   755  		}
   756  
   757  		if d.UID != nil {
   758  			dev.uid = int(*d.UID)
   759  		}
   760  		if d.GID != nil {
   761  			dev.gid = int(*d.GID)
   762  		}
   763  
   764  		devices = append(devices, dev)
   765  	}
   766  
   767  	if c.devIndex >= 0 {
   768  		m := &c.engine.EngineConfig.OciConfig.Config.Mounts[c.devIndex]
   769  
   770  		flags, _ := mount.ConvertOptions(m.Options)
   771  
   772  		flags |= uintptr(syscall.MS_BIND)
   773  		if flags&syscall.MS_RDONLY != 0 {
   774  			if err := system.Points.AddRemount(mount.FinalTag, m.Destination, flags); err != nil {
   775  				return err
   776  			}
   777  			for i := len(m.Options) - 1; i >= 0; i-- {
   778  				if m.Options[i] == "ro" {
   779  					m.Options = append(m.Options[:i], m.Options[i+1:]...)
   780  					break
   781  				}
   782  			}
   783  		}
   784  
   785  		if c.engine.EngineConfig.OciConfig.Linux.Resources == nil {
   786  			c.engine.EngineConfig.OciConfig.Linux.Resources = &specs.LinuxResources{}
   787  		}
   788  
   789  		c.engine.EngineConfig.OciConfig.Linux.Resources.Devices = append(c.engine.EngineConfig.OciConfig.Linux.Resources.Devices, cgroupDevices...)
   790  	}
   791  
   792  	return nil
   793  }
   794  
   795  func (c *container) addMaskedPathsMount(system *mount.System) error {
   796  	paths := c.engine.EngineConfig.OciConfig.Linux.MaskedPaths
   797  
   798  	dir, err := instance.GetDirPrivileged(c.engine.CommonConfig.ContainerID, instance.OciSubDir)
   799  	if err != nil {
   800  		return err
   801  	}
   802  	nullPath := filepath.Join(dir, "null")
   803  
   804  	if _, err := os.Stat(nullPath); os.IsNotExist(err) {
   805  		oldmask := syscall.Umask(0)
   806  		defer syscall.Umask(oldmask)
   807  
   808  		if err := os.Mkdir(nullPath, 0755); err != nil {
   809  			return err
   810  		}
   811  	}
   812  
   813  	for _, path := range paths {
   814  		relativePath := filepath.Join(c.rootfs, path)
   815  		rpcPath := filepath.Join(c.rpcRoot, relativePath)
   816  		fi, err := os.Stat(rpcPath)
   817  		if err != nil {
   818  			sylog.Debugf("ignoring masked path %s: %s", path, err)
   819  			continue
   820  		}
   821  		if fi.IsDir() {
   822  			if err := system.Points.AddBind(mount.OtherTag, nullPath, relativePath, syscall.MS_BIND); err != nil {
   823  				return err
   824  			}
   825  		} else if err := system.Points.AddBind(mount.OtherTag, "/dev/null", relativePath, syscall.MS_BIND); err != nil {
   826  			return err
   827  		}
   828  	}
   829  	return nil
   830  }
   831  
   832  func (c *container) addReadonlyPathsMount(system *mount.System) error {
   833  	paths := c.engine.EngineConfig.OciConfig.Linux.ReadonlyPaths
   834  
   835  	for _, path := range paths {
   836  		relativePath := filepath.Join(c.rootfs, path)
   837  		rpcPath := filepath.Join(c.rpcRoot, relativePath)
   838  		_, err := os.Stat(rpcPath)
   839  		if err != nil {
   840  			sylog.Debugf("ignoring read-only path %s: %s", path, err)
   841  			continue
   842  		}
   843  		if err := system.Points.AddBind(mount.OtherTag, relativePath, relativePath, syscall.MS_BIND|syscall.MS_RDONLY); err != nil {
   844  			return err
   845  		}
   846  		if err := system.Points.AddRemount(mount.OtherTag, relativePath, syscall.MS_BIND|syscall.MS_RDONLY); err != nil {
   847  			return err
   848  		}
   849  	}
   850  	return nil
   851  }
   852  
   853  func (c *container) mount(point *mount.Point) error {
   854  	source := point.Source
   855  	dest := point.Destination
   856  	flags, opts := mount.ConvertOptions(point.Options)
   857  	optsString := strings.Join(opts, ",")
   858  	ignore := false
   859  
   860  	if flags&syscall.MS_REMOUNT != 0 {
   861  		ignore = true
   862  	}
   863  
   864  	if !strings.HasPrefix(dest, c.rootfs) {
   865  		rootfsPath := filepath.Join(c.rpcRoot, c.rootfs)
   866  		relativeDest := fs.EvalRelative(dest, c.rootfs)
   867  		procDest := filepath.Join(rootfsPath, relativeDest)
   868  
   869  		dest = filepath.Join(c.rootfs, relativeDest)
   870  
   871  		sylog.Debugf("Checking if %s exists", procDest)
   872  		if _, err := os.Stat(procDest); os.IsNotExist(err) && !ignore {
   873  			oldmask := syscall.Umask(0)
   874  			defer syscall.Umask(oldmask)
   875  
   876  			if point.Type != "" {
   877  				sylog.Debugf("Creating %s", procDest)
   878  				if c.userNS {
   879  					if _, err := c.rpcOps.MkdirAll(dest, 0755); err != nil {
   880  						return err
   881  					}
   882  				} else {
   883  					if err := os.MkdirAll(procDest, 0755); err != nil {
   884  						return err
   885  					}
   886  				}
   887  			} else {
   888  				var st syscall.Stat_t
   889  
   890  				dir := filepath.Dir(procDest)
   891  				if _, err := os.Stat(dir); os.IsNotExist(err) {
   892  					sylog.Debugf("Creating parent %s", dir)
   893  					if c.userNS {
   894  						if _, err := c.rpcOps.Mkdir(filepath.Dir(dest), 0755); err != nil {
   895  							return err
   896  						}
   897  					} else {
   898  						if err := os.MkdirAll(dir, 0755); err != nil {
   899  							return err
   900  						}
   901  					}
   902  				}
   903  
   904  				if err := syscall.Stat(source, &st); err != nil {
   905  					sylog.Debugf("Ignoring %s: %s", source, err)
   906  					return nil
   907  				}
   908  				switch st.Mode & syscall.S_IFMT {
   909  				case syscall.S_IFDIR:
   910  					sylog.Debugf("Creating dir %s", filepath.Base(procDest))
   911  					if c.userNS {
   912  						if _, err := c.rpcOps.Mkdir(dest, 0755); err != nil {
   913  							return err
   914  						}
   915  					} else {
   916  						if err := os.Mkdir(procDest, 0755); err != nil {
   917  							return err
   918  						}
   919  					}
   920  				case syscall.S_IFREG:
   921  					sylog.Debugf("Creating file %s", filepath.Base(procDest))
   922  					if c.userNS {
   923  						if _, err := c.rpcOps.Touch(dest); err != nil {
   924  							return err
   925  						}
   926  					} else {
   927  						if err := fs.Touch(procDest); err != nil {
   928  							return err
   929  						}
   930  					}
   931  				}
   932  			}
   933  		}
   934  	} else {
   935  		procDest := filepath.Join(c.rpcRoot, dest)
   936  
   937  		sylog.Debugf("Checking if %s exists", procDest)
   938  		if _, err := os.Stat(procDest); os.IsNotExist(err) {
   939  			sylog.Warningf("destination %s doesn't exist", dest)
   940  			return nil
   941  		}
   942  	}
   943  
   944  	if ignore {
   945  		sylog.Debugf("(re)mount %s", dest)
   946  	} else {
   947  		sylog.Debugf("Mount %s to %s : %s [%s]", source, dest, point.Type, optsString)
   948  	}
   949  
   950  	_, err := c.rpcOps.Mount(source, dest, point.Type, flags, optsString)
   951  	if err != nil {
   952  		sylog.Debugf("RPC mount error: %s", err)
   953  	}
   954  
   955  	return err
   956  }