github.com/hustcat/docker@v1.3.3-0.20160314103604-901c67a8eeab/daemon/execdriver/native/create.go (about)

     1  // +build linux,cgo
     2  
     3  package native
     4  
     5  import (
     6  	"fmt"
     7  	"path/filepath"
     8  	"strings"
     9  	"syscall"
    10  
    11  	"github.com/docker/docker/daemon/execdriver"
    12  	"github.com/docker/docker/pkg/mount"
    13  	"github.com/docker/docker/profiles/seccomp"
    14  
    15  	"github.com/docker/docker/volume"
    16  	"github.com/opencontainers/runc/libcontainer/apparmor"
    17  	"github.com/opencontainers/runc/libcontainer/configs"
    18  	"github.com/opencontainers/runc/libcontainer/devices"
    19  )
    20  
// createContainer populates and configures the container type with the
// data provided by the execdriver.Command
func (d *Driver) createContainer(c *execdriver.Command, hooks execdriver.Hooks) (container *configs.Config, err error) {
	container = execdriver.InitContainer(c)

	// Namespace and user-remapping setup must run before network setup and
	// the privileged/capability tweaks below; each step mutates container
	// in place and returns early on error.
	if err := d.createIpc(container, c); err != nil {
		return nil, err
	}

	if err := d.createPid(container, c); err != nil {
		return nil, err
	}

	if err := d.createUTS(container, c); err != nil {
		return nil, err
	}

	if err := d.setupRemappedRoot(container, c); err != nil {
		return nil, err
	}

	if err := d.createNetwork(container, c, hooks); err != nil {
		return nil, err
	}

	if c.ProcessConfig.Privileged {
		// Privileged containers get write access to /sys (unless the whole
		// rootfs is read-only) and to the cgroup mounts, and have no masked
		// or read-only paths.
		if !container.Readonlyfs {
			// clear readonly for /sys
			for i := range container.Mounts {
				if container.Mounts[i].Destination == "/sys" {
					container.Mounts[i].Flags &= ^syscall.MS_RDONLY
				}
			}
			container.ReadonlyPaths = nil
		}

		// clear readonly for cgroup
		for i := range container.Mounts {
			if container.Mounts[i].Device == "cgroup" {
				container.Mounts[i].Flags &= ^syscall.MS_RDONLY
			}
		}

		container.MaskPaths = nil
		if err := d.setPrivileged(container); err != nil {
			return nil, err
		}
	} else {
		// Unprivileged: apply --cap-add/--cap-drop and, when no explicit
		// seccomp profile was requested, install the default profile.
		if err := d.setCapabilities(container, c); err != nil {
			return nil, err
		}

		if c.SeccompProfile == "" {
			container.Seccomp, err = seccomp.GetDefaultProfile()
			if err != nil {
				return nil, err
			}
		}
	}
	// add CAP_ prefix to all caps for new libcontainer update to match
	// the spec format.
	for i, s := range container.Capabilities {
		if !strings.HasPrefix(s, "CAP_") {
			container.Capabilities[i] = fmt.Sprintf("CAP_%s", s)
		}
	}
	container.AdditionalGroups = c.GroupAdd

	if c.AppArmorProfile != "" {
		container.AppArmorProfile = c.AppArmorProfile
	}

	// An explicit profile other than "unconfined" overrides whatever was
	// set above (including the default profile for unprivileged containers).
	if c.SeccompProfile != "" && c.SeccompProfile != "unconfined" {
		container.Seccomp, err = seccomp.LoadProfile(c.SeccompProfile)
		if err != nil {
			return nil, err
		}
	}

	if err := execdriver.SetupCgroups(container, c); err != nil {
		return nil, err
	}

	container.OomScoreAdj = c.OomScoreAdj

	if container.Readonlyfs {
		// Mark every mount read-only except the pseudo-filesystems that
		// must stay writable for the container to function.
		for i := range container.Mounts {
			switch container.Mounts[i].Destination {
			case "/proc", "/dev", "/dev/pts", "/dev/mqueue":
				continue
			}
			container.Mounts[i].Flags |= syscall.MS_RDONLY
		}

		/* These paths must be remounted as r/o */
		container.ReadonlyPaths = append(container.ReadonlyPaths, "/dev")
	}

	// User mounts are added last so they can override the defaults
	// configured above.
	if err := d.setupMounts(container, c); err != nil {
		return nil, err
	}

	d.setupLabels(container, c)
	d.setupRlimits(container, c)

	container.NoNewPrivileges = c.NoNewPrivileges
	return container, nil
}
   129  
   130  func (d *Driver) createNetwork(container *configs.Config, c *execdriver.Command, hooks execdriver.Hooks) error {
   131  	if c.Network == nil {
   132  		return nil
   133  	}
   134  	if c.Network.ContainerID != "" {
   135  		d.Lock()
   136  		active := d.activeContainers[c.Network.ContainerID]
   137  		d.Unlock()
   138  
   139  		if active == nil {
   140  			return fmt.Errorf("%s is not a valid running container to join", c.Network.ContainerID)
   141  		}
   142  
   143  		state, err := active.State()
   144  		if err != nil {
   145  			return err
   146  		}
   147  
   148  		container.Namespaces.Add(configs.NEWNET, state.NamespacePaths[configs.NEWNET])
   149  		return nil
   150  	}
   151  
   152  	if c.Network.NamespacePath != "" {
   153  		container.Namespaces.Add(configs.NEWNET, c.Network.NamespacePath)
   154  		return nil
   155  	}
   156  	// only set up prestart hook if the namespace path is not set (this should be
   157  	// all cases *except* for --net=host shared networking)
   158  	container.Hooks = &configs.Hooks{
   159  		Prestart: []configs.Hook{
   160  			configs.NewFunctionHook(func(s configs.HookState) error {
   161  				if len(hooks.PreStart) > 0 {
   162  					for _, fnHook := range hooks.PreStart {
   163  						// A closed channel for OOM is returned here as it will be
   164  						// non-blocking and return the correct result when read.
   165  						chOOM := make(chan struct{})
   166  						close(chOOM)
   167  						if err := fnHook(&c.ProcessConfig, s.Pid, chOOM); err != nil {
   168  							return err
   169  						}
   170  					}
   171  				}
   172  				return nil
   173  			}),
   174  		},
   175  	}
   176  	return nil
   177  }
   178  
   179  func (d *Driver) createIpc(container *configs.Config, c *execdriver.Command) error {
   180  	if c.Ipc.HostIpc {
   181  		container.Namespaces.Remove(configs.NEWIPC)
   182  		return nil
   183  	}
   184  
   185  	if c.Ipc.ContainerID != "" {
   186  		d.Lock()
   187  		active := d.activeContainers[c.Ipc.ContainerID]
   188  		d.Unlock()
   189  
   190  		if active == nil {
   191  			return fmt.Errorf("%s is not a valid running container to join", c.Ipc.ContainerID)
   192  		}
   193  
   194  		state, err := active.State()
   195  		if err != nil {
   196  			return err
   197  		}
   198  		container.Namespaces.Add(configs.NEWIPC, state.NamespacePaths[configs.NEWIPC])
   199  	}
   200  
   201  	return nil
   202  }
   203  
   204  func (d *Driver) createPid(container *configs.Config, c *execdriver.Command) error {
   205  	if c.Pid.HostPid {
   206  		container.Namespaces.Remove(configs.NEWPID)
   207  		return nil
   208  	}
   209  
   210  	return nil
   211  }
   212  
   213  func (d *Driver) createUTS(container *configs.Config, c *execdriver.Command) error {
   214  	if c.UTS.HostUTS {
   215  		container.Namespaces.Remove(configs.NEWUTS)
   216  		container.Hostname = ""
   217  		return nil
   218  	}
   219  
   220  	return nil
   221  }
   222  
   223  func (d *Driver) setupRemappedRoot(container *configs.Config, c *execdriver.Command) error {
   224  	if c.RemappedRoot.UID == 0 {
   225  		container.Namespaces.Remove(configs.NEWUSER)
   226  		return nil
   227  	}
   228  
   229  	// convert the Docker daemon id map to the libcontainer variant of the same struct
   230  	// this keeps us from having to import libcontainer code across Docker client + daemon packages
   231  	cuidMaps := []configs.IDMap{}
   232  	cgidMaps := []configs.IDMap{}
   233  	for _, idMap := range c.UIDMapping {
   234  		cuidMaps = append(cuidMaps, configs.IDMap(idMap))
   235  	}
   236  	for _, idMap := range c.GIDMapping {
   237  		cgidMaps = append(cgidMaps, configs.IDMap(idMap))
   238  	}
   239  	container.UidMappings = cuidMaps
   240  	container.GidMappings = cgidMaps
   241  
   242  	for _, node := range container.Devices {
   243  		node.Uid = uint32(c.RemappedRoot.UID)
   244  		node.Gid = uint32(c.RemappedRoot.GID)
   245  	}
   246  	// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
   247  	// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
   248  	for i := range container.Mounts {
   249  		if container.Mounts[i].Device == "cgroup" {
   250  			container.Mounts[i].Flags &= ^syscall.MS_RDONLY
   251  		}
   252  	}
   253  
   254  	return nil
   255  }
   256  
   257  func (d *Driver) setPrivileged(container *configs.Config) (err error) {
   258  	container.Capabilities = execdriver.GetAllCapabilities()
   259  	container.Cgroups.Resources.AllowAllDevices = true
   260  
   261  	hostDevices, err := devices.HostDevices()
   262  	if err != nil {
   263  		return err
   264  	}
   265  	container.Devices = hostDevices
   266  
   267  	if apparmor.IsEnabled() {
   268  		container.AppArmorProfile = "unconfined"
   269  	}
   270  	return nil
   271  }
   272  
// setCapabilities applies the command's --cap-add/--cap-drop adjustments on
// top of the container's current capability set, storing the result back on
// the config. On error, container.Capabilities holds whatever
// TweakCapabilities returned (the caller discards the config in that case).
func (d *Driver) setCapabilities(container *configs.Config, c *execdriver.Command) (err error) {
	container.Capabilities, err = execdriver.TweakCapabilities(container.Capabilities, c.CapAdd, c.CapDrop)
	return err
}
   277  
   278  func (d *Driver) setupRlimits(container *configs.Config, c *execdriver.Command) {
   279  	if c.Resources == nil {
   280  		return
   281  	}
   282  
   283  	for _, rlimit := range c.Resources.Rlimits {
   284  		container.Rlimits = append(container.Rlimits, configs.Rlimit{
   285  			Type: rlimit.Type,
   286  			Hard: rlimit.Hard,
   287  			Soft: rlimit.Soft,
   288  		})
   289  	}
   290  }
   291  
   292  // If rootfs mount propagation is RPRIVATE, that means all the volumes are
   293  // going to be private anyway. There is no need to apply per volume
   294  // propagation on top. This is just an optimization so that cost of per volume
   295  // propagation is paid only if user decides to make some volume non-private
   296  // which will force rootfs mount propagation to be non RPRIVATE.
   297  func checkResetVolumePropagation(container *configs.Config) {
   298  	if container.RootPropagation != mount.RPRIVATE {
   299  		return
   300  	}
   301  	for _, m := range container.Mounts {
   302  		m.PropagationFlags = nil
   303  	}
   304  }
   305  
   306  func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
   307  	for _, m := range mountinfo {
   308  		if m.Mountpoint == dir {
   309  			return m
   310  		}
   311  	}
   312  	return nil
   313  }
   314  
   315  // Get the source mount point of directory passed in as argument. Also return
   316  // optional fields.
   317  func getSourceMount(source string) (string, string, error) {
   318  	// Ensure any symlinks are resolved.
   319  	sourcePath, err := filepath.EvalSymlinks(source)
   320  	if err != nil {
   321  		return "", "", err
   322  	}
   323  
   324  	mountinfos, err := mount.GetMounts()
   325  	if err != nil {
   326  		return "", "", err
   327  	}
   328  
   329  	mountinfo := getMountInfo(mountinfos, sourcePath)
   330  	if mountinfo != nil {
   331  		return sourcePath, mountinfo.Optional, nil
   332  	}
   333  
   334  	path := sourcePath
   335  	for {
   336  		path = filepath.Dir(path)
   337  
   338  		mountinfo = getMountInfo(mountinfos, path)
   339  		if mountinfo != nil {
   340  			return path, mountinfo.Optional, nil
   341  		}
   342  
   343  		if path == "/" {
   344  			break
   345  		}
   346  	}
   347  
   348  	// If we are here, we did not find parent mount. Something is wrong.
   349  	return "", "", fmt.Errorf("Could not find source mount of %s", source)
   350  }
   351  
   352  // Ensure mount point on which path is mounted, is shared.
   353  func ensureShared(path string) error {
   354  	sharedMount := false
   355  
   356  	sourceMount, optionalOpts, err := getSourceMount(path)
   357  	if err != nil {
   358  		return err
   359  	}
   360  	// Make sure source mount point is shared.
   361  	optsSplit := strings.Split(optionalOpts, " ")
   362  	for _, opt := range optsSplit {
   363  		if strings.HasPrefix(opt, "shared:") {
   364  			sharedMount = true
   365  			break
   366  		}
   367  	}
   368  
   369  	if !sharedMount {
   370  		return fmt.Errorf("Path %s is mounted on %s but it is not a shared mount.", path, sourceMount)
   371  	}
   372  	return nil
   373  }
   374  
   375  // Ensure mount point on which path is mounted, is either shared or slave.
   376  func ensureSharedOrSlave(path string) error {
   377  	sharedMount := false
   378  	slaveMount := false
   379  
   380  	sourceMount, optionalOpts, err := getSourceMount(path)
   381  	if err != nil {
   382  		return err
   383  	}
   384  	// Make sure source mount point is shared.
   385  	optsSplit := strings.Split(optionalOpts, " ")
   386  	for _, opt := range optsSplit {
   387  		if strings.HasPrefix(opt, "shared:") {
   388  			sharedMount = true
   389  			break
   390  		} else if strings.HasPrefix(opt, "master:") {
   391  			slaveMount = true
   392  			break
   393  		}
   394  	}
   395  
   396  	if !sharedMount && !slaveMount {
   397  		return fmt.Errorf("Path %s is mounted on %s but it is not a shared or slave mount.", path, sourceMount)
   398  	}
   399  	return nil
   400  }
   401  
   402  func (d *Driver) setupMounts(container *configs.Config, c *execdriver.Command) error {
   403  	userMounts := make(map[string]struct{})
   404  	for _, m := range c.Mounts {
   405  		userMounts[m.Destination] = struct{}{}
   406  	}
   407  
   408  	// Filter out mounts that are overridden by user supplied mounts
   409  	var defaultMounts []*configs.Mount
   410  	_, mountDev := userMounts["/dev"]
   411  	for _, m := range container.Mounts {
   412  		if _, ok := userMounts[m.Destination]; !ok {
   413  			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
   414  				container.Devices = nil
   415  				continue
   416  			}
   417  			defaultMounts = append(defaultMounts, m)
   418  		}
   419  	}
   420  	container.Mounts = defaultMounts
   421  
   422  	mountPropagationMap := map[string]int{
   423  		"private":  mount.PRIVATE,
   424  		"rprivate": mount.RPRIVATE,
   425  		"shared":   mount.SHARED,
   426  		"rshared":  mount.RSHARED,
   427  		"slave":    mount.SLAVE,
   428  		"rslave":   mount.RSLAVE,
   429  	}
   430  
   431  	for _, m := range c.Mounts {
   432  		for _, cm := range container.Mounts {
   433  			if cm.Destination == m.Destination {
   434  				return fmt.Errorf("Duplicate mount point '%s'", m.Destination)
   435  			}
   436  		}
   437  
   438  		if m.Source == "tmpfs" {
   439  			var (
   440  				data  = "size=65536k"
   441  				flags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
   442  				err   error
   443  			)
   444  			if m.Data != "" {
   445  				flags, data, err = mount.ParseTmpfsOptions(m.Data)
   446  				if err != nil {
   447  					return err
   448  				}
   449  			}
   450  			container.Mounts = append(container.Mounts, &configs.Mount{
   451  				Source:           m.Source,
   452  				Destination:      m.Destination,
   453  				Data:             data,
   454  				Device:           "tmpfs",
   455  				Flags:            flags,
   456  				PropagationFlags: []int{mountPropagationMap[volume.DefaultPropagationMode]},
   457  			})
   458  			continue
   459  		}
   460  		flags := syscall.MS_BIND | syscall.MS_REC
   461  		var pFlag int
   462  		if !m.Writable {
   463  			flags |= syscall.MS_RDONLY
   464  		}
   465  
   466  		// Determine property of RootPropagation based on volume
   467  		// properties. If a volume is shared, then keep root propagation
   468  		// shared. This should work for slave and private volumes too.
   469  		//
   470  		// For slave volumes, it can be either [r]shared/[r]slave.
   471  		//
   472  		// For private volumes any root propagation value should work.
   473  
   474  		pFlag = mountPropagationMap[m.Propagation]
   475  		if pFlag == mount.SHARED || pFlag == mount.RSHARED {
   476  			if err := ensureShared(m.Source); err != nil {
   477  				return err
   478  			}
   479  			rootpg := container.RootPropagation
   480  			if rootpg != mount.SHARED && rootpg != mount.RSHARED {
   481  				execdriver.SetRootPropagation(container, mount.SHARED)
   482  			}
   483  		} else if pFlag == mount.SLAVE || pFlag == mount.RSLAVE {
   484  			if err := ensureSharedOrSlave(m.Source); err != nil {
   485  				return err
   486  			}
   487  			rootpg := container.RootPropagation
   488  			if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
   489  				execdriver.SetRootPropagation(container, mount.RSLAVE)
   490  			}
   491  		}
   492  
   493  		mount := &configs.Mount{
   494  			Source:      m.Source,
   495  			Destination: m.Destination,
   496  			Device:      "bind",
   497  			Flags:       flags,
   498  		}
   499  
   500  		if pFlag != 0 {
   501  			mount.PropagationFlags = []int{pFlag}
   502  		}
   503  
   504  		container.Mounts = append(container.Mounts, mount)
   505  	}
   506  
   507  	checkResetVolumePropagation(container)
   508  	return nil
   509  }
   510  
   511  func (d *Driver) setupLabels(container *configs.Config, c *execdriver.Command) {
   512  	container.ProcessLabel = c.ProcessLabel
   513  	container.MountLabel = c.MountLabel
   514  }