github.com/ncdc/docker@v0.10.1-0.20160129113957-6c6729ef5b74/daemon/execdriver/native/create.go (about)

     1  // +build linux,cgo
     2  
     3  package native
     4  
     5  import (
     6  	"fmt"
     7  	"path/filepath"
     8  	"strings"
     9  	"syscall"
    10  
    11  	"github.com/docker/docker/daemon/execdriver"
    12  	derr "github.com/docker/docker/errors"
    13  	"github.com/docker/docker/pkg/mount"
    14  	"github.com/docker/docker/profiles/seccomp"
    15  
    16  	"github.com/docker/docker/volume"
    17  	"github.com/opencontainers/runc/libcontainer/apparmor"
    18  	"github.com/opencontainers/runc/libcontainer/configs"
    19  	"github.com/opencontainers/runc/libcontainer/devices"
    20  )
    21  
// createContainer populates and configures the container type with the
// data provided by the execdriver.Command.
//
// Configuration proceeds in a fixed order: namespaces first, then
// privilege/capability handling, seccomp, cgroups, readonly-fs flags,
// mounts, and finally labels/rlimits. Several later steps mutate state
// set up by earlier ones (e.g. privileged mode clears readonly flags on
// mounts created by InitContainer), so the ordering matters.
func (d *Driver) createContainer(c *execdriver.Command, hooks execdriver.Hooks) (container *configs.Config, err error) {
	container = execdriver.InitContainer(c)

	// Namespace setup: each helper removes the namespace (host sharing)
	// or joins another container's namespace, based on the command.
	if err := d.createIpc(container, c); err != nil {
		return nil, err
	}

	if err := d.createPid(container, c); err != nil {
		return nil, err
	}

	if err := d.createUTS(container, c); err != nil {
		return nil, err
	}

	if err := d.setupRemappedRoot(container, c); err != nil {
		return nil, err
	}

	if err := d.createNetwork(container, c, hooks); err != nil {
		return nil, err
	}

	if c.ProcessConfig.Privileged {
		if !container.Readonlyfs {
			// clear readonly for /sys
			for i := range container.Mounts {
				if container.Mounts[i].Destination == "/sys" {
					container.Mounts[i].Flags &= ^syscall.MS_RDONLY
				}
			}
			container.ReadonlyPaths = nil
		}

		// clear readonly for cgroup
		for i := range container.Mounts {
			if container.Mounts[i].Device == "cgroup" {
				container.Mounts[i].Flags &= ^syscall.MS_RDONLY
			}
		}

		// Privileged containers see the real /proc and /sys entries.
		container.MaskPaths = nil
		if err := d.setPrivileged(container); err != nil {
			return nil, err
		}
	} else {
		if err := d.setCapabilities(container, c); err != nil {
			return nil, err
		}

		// No explicit profile requested: fall back to the built-in
		// default seccomp profile (non-privileged containers only).
		if c.SeccompProfile == "" {
			container.Seccomp = seccomp.GetDefaultProfile()
		}
	}
	// add CAP_ prefix to all caps for new libcontainer update to match
	// the spec format.
	for i, s := range container.Capabilities {
		if !strings.HasPrefix(s, "CAP_") {
			container.Capabilities[i] = fmt.Sprintf("CAP_%s", s)
		}
	}
	container.AdditionalGroups = c.GroupAdd

	if c.AppArmorProfile != "" {
		container.AppArmorProfile = c.AppArmorProfile
	}

	// A user-supplied seccomp profile (other than "unconfined") overrides
	// whatever was set above.
	if c.SeccompProfile != "" && c.SeccompProfile != "unconfined" {
		container.Seccomp, err = seccomp.LoadProfile(c.SeccompProfile)
		if err != nil {
			return nil, err
		}
	}

	if err := execdriver.SetupCgroups(container, c); err != nil {
		return nil, err
	}

	container.OomScoreAdj = c.OomScoreAdj

	if container.Readonlyfs {
		for i := range container.Mounts {
			// /proc, /dev and /dev/pts must stay writable even on a
			// readonly rootfs.
			switch container.Mounts[i].Destination {
			case "/proc", "/dev", "/dev/pts":
				continue
			}
			container.Mounts[i].Flags |= syscall.MS_RDONLY
		}

		/* These paths must be remounted as r/o */
		container.ReadonlyPaths = append(container.ReadonlyPaths, "/dev")
	}

	if err := d.setupMounts(container, c); err != nil {
		return nil, err
	}

	d.setupLabels(container, c)
	d.setupRlimits(container, c)
	return container, nil
}
   125  
// createNetwork configures the container's network namespace. Three
// mutually-exclusive cases are handled in order: joining another running
// container's namespace, using an explicit namespace path, or installing
// a prestart hook that runs the driver's network setup callbacks.
func (d *Driver) createNetwork(container *configs.Config, c *execdriver.Command, hooks execdriver.Hooks) error {
	if c.Network == nil {
		return nil
	}
	// Join the network namespace of another container that is already
	// running under this driver (e.g. --net=container:<id>).
	if c.Network.ContainerID != "" {
		d.Lock()
		active := d.activeContainers[c.Network.ContainerID]
		d.Unlock()

		if active == nil {
			return fmt.Errorf("%s is not a valid running container to join", c.Network.ContainerID)
		}

		state, err := active.State()
		if err != nil {
			return err
		}

		container.Namespaces.Add(configs.NEWNET, state.NamespacePaths[configs.NEWNET])
		return nil
	}

	// An explicit namespace path was provided; use it directly.
	if c.Network.NamespacePath != "" {
		container.Namespaces.Add(configs.NEWNET, c.Network.NamespacePath)
		return nil
	}
	// only set up prestart hook if the namespace path is not set (this should be
	// all cases *except* for --net=host shared networking)
	container.Hooks = &configs.Hooks{
		Prestart: []configs.Hook{
			configs.NewFunctionHook(func(s configs.HookState) error {
				if len(hooks.PreStart) > 0 {
					for _, fnHook := range hooks.PreStart {
						// A closed channel for OOM is returned here as it will be
						// non-blocking and return the correct result when read.
						chOOM := make(chan struct{})
						close(chOOM)
						if err := fnHook(&c.ProcessConfig, s.Pid, chOOM); err != nil {
							return err
						}
					}
				}
				return nil
			}),
		},
	}
	return nil
}
   174  
   175  func (d *Driver) createIpc(container *configs.Config, c *execdriver.Command) error {
   176  	if c.Ipc.HostIpc {
   177  		container.Namespaces.Remove(configs.NEWIPC)
   178  		return nil
   179  	}
   180  
   181  	if c.Ipc.ContainerID != "" {
   182  		d.Lock()
   183  		active := d.activeContainers[c.Ipc.ContainerID]
   184  		d.Unlock()
   185  
   186  		if active == nil {
   187  			return fmt.Errorf("%s is not a valid running container to join", c.Ipc.ContainerID)
   188  		}
   189  
   190  		state, err := active.State()
   191  		if err != nil {
   192  			return err
   193  		}
   194  		container.Namespaces.Add(configs.NEWIPC, state.NamespacePaths[configs.NEWIPC])
   195  	}
   196  
   197  	return nil
   198  }
   199  
   200  func (d *Driver) createPid(container *configs.Config, c *execdriver.Command) error {
   201  	if c.Pid.HostPid {
   202  		container.Namespaces.Remove(configs.NEWPID)
   203  		return nil
   204  	}
   205  
   206  	return nil
   207  }
   208  
   209  func (d *Driver) createUTS(container *configs.Config, c *execdriver.Command) error {
   210  	if c.UTS.HostUTS {
   211  		container.Namespaces.Remove(configs.NEWUTS)
   212  		container.Hostname = ""
   213  		return nil
   214  	}
   215  
   216  	return nil
   217  }
   218  
   219  func (d *Driver) setupRemappedRoot(container *configs.Config, c *execdriver.Command) error {
   220  	if c.RemappedRoot.UID == 0 {
   221  		container.Namespaces.Remove(configs.NEWUSER)
   222  		return nil
   223  	}
   224  
   225  	// convert the Docker daemon id map to the libcontainer variant of the same struct
   226  	// this keeps us from having to import libcontainer code across Docker client + daemon packages
   227  	cuidMaps := []configs.IDMap{}
   228  	cgidMaps := []configs.IDMap{}
   229  	for _, idMap := range c.UIDMapping {
   230  		cuidMaps = append(cuidMaps, configs.IDMap(idMap))
   231  	}
   232  	for _, idMap := range c.GIDMapping {
   233  		cgidMaps = append(cgidMaps, configs.IDMap(idMap))
   234  	}
   235  	container.UidMappings = cuidMaps
   236  	container.GidMappings = cgidMaps
   237  
   238  	for _, node := range container.Devices {
   239  		node.Uid = uint32(c.RemappedRoot.UID)
   240  		node.Gid = uint32(c.RemappedRoot.GID)
   241  	}
   242  	// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
   243  	// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
   244  	for i := range container.Mounts {
   245  		if container.Mounts[i].Device == "cgroup" {
   246  			container.Mounts[i].Flags &= ^syscall.MS_RDONLY
   247  		}
   248  	}
   249  
   250  	return nil
   251  }
   252  
   253  func (d *Driver) setPrivileged(container *configs.Config) (err error) {
   254  	container.Capabilities = execdriver.GetAllCapabilities()
   255  	container.Cgroups.Resources.AllowAllDevices = true
   256  
   257  	hostDevices, err := devices.HostDevices()
   258  	if err != nil {
   259  		return err
   260  	}
   261  	container.Devices = hostDevices
   262  
   263  	if apparmor.IsEnabled() {
   264  		container.AppArmorProfile = "unconfined"
   265  	}
   266  	return nil
   267  }
   268  
   269  func (d *Driver) setCapabilities(container *configs.Config, c *execdriver.Command) (err error) {
   270  	container.Capabilities, err = execdriver.TweakCapabilities(container.Capabilities, c.CapAdd, c.CapDrop)
   271  	return err
   272  }
   273  
   274  func (d *Driver) setupRlimits(container *configs.Config, c *execdriver.Command) {
   275  	if c.Resources == nil {
   276  		return
   277  	}
   278  
   279  	for _, rlimit := range c.Resources.Rlimits {
   280  		container.Rlimits = append(container.Rlimits, configs.Rlimit{
   281  			Type: rlimit.Type,
   282  			Hard: rlimit.Hard,
   283  			Soft: rlimit.Soft,
   284  		})
   285  	}
   286  }
   287  
   288  // If rootfs mount propagation is RPRIVATE, that means all the volumes are
   289  // going to be private anyway. There is no need to apply per volume
   290  // propagation on top. This is just an optimization so that cost of per volume
   291  // propagation is paid only if user decides to make some volume non-private
   292  // which will force rootfs mount propagation to be non RPRIVATE.
   293  func checkResetVolumePropagation(container *configs.Config) {
   294  	if container.RootPropagation != mount.RPRIVATE {
   295  		return
   296  	}
   297  	for _, m := range container.Mounts {
   298  		m.PropagationFlags = nil
   299  	}
   300  }
   301  
   302  func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
   303  	for _, m := range mountinfo {
   304  		if m.Mountpoint == dir {
   305  			return m
   306  		}
   307  	}
   308  	return nil
   309  }
   310  
   311  // Get the source mount point of directory passed in as argument. Also return
   312  // optional fields.
   313  func getSourceMount(source string) (string, string, error) {
   314  	// Ensure any symlinks are resolved.
   315  	sourcePath, err := filepath.EvalSymlinks(source)
   316  	if err != nil {
   317  		return "", "", err
   318  	}
   319  
   320  	mountinfos, err := mount.GetMounts()
   321  	if err != nil {
   322  		return "", "", err
   323  	}
   324  
   325  	mountinfo := getMountInfo(mountinfos, sourcePath)
   326  	if mountinfo != nil {
   327  		return sourcePath, mountinfo.Optional, nil
   328  	}
   329  
   330  	path := sourcePath
   331  	for {
   332  		path = filepath.Dir(path)
   333  
   334  		mountinfo = getMountInfo(mountinfos, path)
   335  		if mountinfo != nil {
   336  			return path, mountinfo.Optional, nil
   337  		}
   338  
   339  		if path == "/" {
   340  			break
   341  		}
   342  	}
   343  
   344  	// If we are here, we did not find parent mount. Something is wrong.
   345  	return "", "", fmt.Errorf("Could not find source mount of %s", source)
   346  }
   347  
   348  // Ensure mount point on which path is mounted, is shared.
   349  func ensureShared(path string) error {
   350  	sharedMount := false
   351  
   352  	sourceMount, optionalOpts, err := getSourceMount(path)
   353  	if err != nil {
   354  		return err
   355  	}
   356  	// Make sure source mount point is shared.
   357  	optsSplit := strings.Split(optionalOpts, " ")
   358  	for _, opt := range optsSplit {
   359  		if strings.HasPrefix(opt, "shared:") {
   360  			sharedMount = true
   361  			break
   362  		}
   363  	}
   364  
   365  	if !sharedMount {
   366  		return fmt.Errorf("Path %s is mounted on %s but it is not a shared mount.", path, sourceMount)
   367  	}
   368  	return nil
   369  }
   370  
   371  // Ensure mount point on which path is mounted, is either shared or slave.
   372  func ensureSharedOrSlave(path string) error {
   373  	sharedMount := false
   374  	slaveMount := false
   375  
   376  	sourceMount, optionalOpts, err := getSourceMount(path)
   377  	if err != nil {
   378  		return err
   379  	}
   380  	// Make sure source mount point is shared.
   381  	optsSplit := strings.Split(optionalOpts, " ")
   382  	for _, opt := range optsSplit {
   383  		if strings.HasPrefix(opt, "shared:") {
   384  			sharedMount = true
   385  			break
   386  		} else if strings.HasPrefix(opt, "master:") {
   387  			slaveMount = true
   388  			break
   389  		}
   390  	}
   391  
   392  	if !sharedMount && !slaveMount {
   393  		return fmt.Errorf("Path %s is mounted on %s but it is not a shared or slave mount.", path, sourceMount)
   394  	}
   395  	return nil
   396  }
   397  
   398  func (d *Driver) setupMounts(container *configs.Config, c *execdriver.Command) error {
   399  	userMounts := make(map[string]struct{})
   400  	for _, m := range c.Mounts {
   401  		userMounts[m.Destination] = struct{}{}
   402  	}
   403  
   404  	// Filter out mounts that are overridden by user supplied mounts
   405  	var defaultMounts []*configs.Mount
   406  	_, mountDev := userMounts["/dev"]
   407  	for _, m := range container.Mounts {
   408  		if _, ok := userMounts[m.Destination]; !ok {
   409  			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
   410  				container.Devices = nil
   411  				continue
   412  			}
   413  			defaultMounts = append(defaultMounts, m)
   414  		}
   415  	}
   416  	container.Mounts = defaultMounts
   417  
   418  	mountPropagationMap := map[string]int{
   419  		"private":  mount.PRIVATE,
   420  		"rprivate": mount.RPRIVATE,
   421  		"shared":   mount.SHARED,
   422  		"rshared":  mount.RSHARED,
   423  		"slave":    mount.SLAVE,
   424  		"rslave":   mount.RSLAVE,
   425  	}
   426  
   427  	for _, m := range c.Mounts {
   428  		for _, cm := range container.Mounts {
   429  			if cm.Destination == m.Destination {
   430  				return derr.ErrorCodeMountDup.WithArgs(m.Destination)
   431  			}
   432  		}
   433  
   434  		if m.Source == "tmpfs" {
   435  			var (
   436  				data  = "size=65536k"
   437  				flags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
   438  				err   error
   439  			)
   440  			if m.Data != "" {
   441  				flags, data, err = mount.ParseTmpfsOptions(m.Data)
   442  				if err != nil {
   443  					return err
   444  				}
   445  			}
   446  			container.Mounts = append(container.Mounts, &configs.Mount{
   447  				Source:           m.Source,
   448  				Destination:      m.Destination,
   449  				Data:             data,
   450  				Device:           "tmpfs",
   451  				Flags:            flags,
   452  				PropagationFlags: []int{mountPropagationMap[volume.DefaultPropagationMode]},
   453  			})
   454  			continue
   455  		}
   456  		flags := syscall.MS_BIND | syscall.MS_REC
   457  		var pFlag int
   458  		if !m.Writable {
   459  			flags |= syscall.MS_RDONLY
   460  		}
   461  
   462  		// Determine property of RootPropagation based on volume
   463  		// properties. If a volume is shared, then keep root propagation
   464  		// shared. This should work for slave and private volumes too.
   465  		//
   466  		// For slave volumes, it can be either [r]shared/[r]slave.
   467  		//
   468  		// For private volumes any root propagation value should work.
   469  
   470  		pFlag = mountPropagationMap[m.Propagation]
   471  		if pFlag == mount.SHARED || pFlag == mount.RSHARED {
   472  			if err := ensureShared(m.Source); err != nil {
   473  				return err
   474  			}
   475  			rootpg := container.RootPropagation
   476  			if rootpg != mount.SHARED && rootpg != mount.RSHARED {
   477  				execdriver.SetRootPropagation(container, mount.SHARED)
   478  			}
   479  		} else if pFlag == mount.SLAVE || pFlag == mount.RSLAVE {
   480  			if err := ensureSharedOrSlave(m.Source); err != nil {
   481  				return err
   482  			}
   483  			rootpg := container.RootPropagation
   484  			if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
   485  				execdriver.SetRootPropagation(container, mount.RSLAVE)
   486  			}
   487  		}
   488  
   489  		mount := &configs.Mount{
   490  			Source:      m.Source,
   491  			Destination: m.Destination,
   492  			Device:      "bind",
   493  			Flags:       flags,
   494  		}
   495  
   496  		if pFlag != 0 {
   497  			mount.PropagationFlags = []int{pFlag}
   498  		}
   499  
   500  		container.Mounts = append(container.Mounts, mount)
   501  	}
   502  
   503  	checkResetVolumePropagation(container)
   504  	return nil
   505  }
   506  
   507  func (d *Driver) setupLabels(container *configs.Config, c *execdriver.Command) {
   508  	container.ProcessLabel = c.ProcessLabel
   509  	container.MountLabel = c.MountLabel
   510  }