github.com/hustcat/docker@v1.3.3-0.20160314103604-901c67a8eeab/daemon/execdriver/native/driver.go

// +build linux,cgo

package native

import (
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/daemon/execdriver"
	"github.com/docker/docker/pkg/parsers"
	"github.com/docker/docker/pkg/pools"
	"github.com/docker/docker/pkg/reexec"
	sysinfo "github.com/docker/docker/pkg/system"
	"github.com/docker/docker/pkg/term"
	aaprofile "github.com/docker/docker/profiles/apparmor"
	"github.com/opencontainers/runc/libcontainer"
	"github.com/opencontainers/runc/libcontainer/apparmor"
	"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
	"github.com/opencontainers/runc/libcontainer/configs"
	"github.com/opencontainers/runc/libcontainer/system"
	"github.com/opencontainers/runc/libcontainer/utils"
)

// Define constants for the native driver.
const (
	DriverName = "native"
	Version    = "0.2"

	defaultApparmorProfile = "docker-default"
)

// Driver contains all information for the native driver;
// it implements execdriver.Driver.
type Driver struct {
	root             string
	activeContainers map[string]libcontainer.Container
	machineMemory    int64
	factory          libcontainer.Factory
	sync.Mutex
}

// NewDriver returns a new native driver; it is called from execdriver.NewDriver.
func NewDriver(root string, options []string) (*Driver, error) {
	meminfo, err := sysinfo.ReadMemInfo()
	if err != nil {
		return nil, err
	}

	if err := sysinfo.MkdirAll(root, 0700); err != nil {
		return nil, err
	}

	if apparmor.IsEnabled() {
		if err := aaprofile.InstallDefault(defaultApparmorProfile); err != nil {
			apparmorProfiles := []string{defaultApparmorProfile}

			// Allow the daemon to run if loading failed but the profiles
			// are already active (possibly loaded by another run, manually,
			// or via system startup).
			for _, policy := range apparmorProfiles {
				if err := aaprofile.IsLoaded(policy); err != nil {
					return nil, fmt.Errorf("AppArmor enabled on system but the %s profile could not be loaded", policy)
				}
			}
		}
	}

	// Choose the cgroup manager. Defaulting to cgroupfs ensures there are
	// no breaking changes for people who upgrade from versions without the
	// native.cgroupdriver option.
	cgm := libcontainer.Cgroupfs

	// parse the options
	for _, option := range options {
		key, val, err := parsers.ParseKeyValueOpt(option)
		if err != nil {
			return nil, err
		}
		key = strings.ToLower(key)
		switch key {
		case "native.cgroupdriver":
			// override the default if they set options
			switch val {
			case "systemd":
				if systemd.UseSystemd() {
					cgm = libcontainer.SystemdCgroups
				} else {
					// warn them that they chose the wrong driver
					logrus.Warn("You cannot use systemd as native.cgroupdriver, using cgroupfs instead")
				}
			case "cgroupfs":
				cgm = libcontainer.Cgroupfs
			default:
				return nil, fmt.Errorf("Unknown native.cgroupdriver given %q; try cgroupfs or systemd", val)
			}
		default:
			return nil, fmt.Errorf("Unknown option %s", key)
		}
	}

	f, err := libcontainer.New(
		root,
		cgm,
		libcontainer.InitPath(reexec.Self(), DriverName),
	)
	if err != nil {
		return nil, err
	}

	return &Driver{
		root:             root,
		activeContainers: make(map[string]libcontainer.Container),
		machineMemory:    meminfo.MemTotal,
		factory:          f,
	}, nil
}

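// A minimal usage sketch for NewDriver (illustrative only; the state-root
// path and the option value below are assumptions, not taken from this file):
//
//	drv, err := NewDriver("/var/run/docker/execdriver/native",
//		[]string{"native.cgroupdriver=cgroupfs"})
//	if err != nil {
//		logrus.Fatal(err)
//	}
//	logrus.Infof("using exec driver %s", drv.Name()) // "native-0.2"
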
// Run implements the exec driver Driver interface;
// it calls libcontainer APIs to run a container.
func (d *Driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, hooks execdriver.Hooks) (execdriver.ExitStatus, error) {
	destroyed := false
	var err error
	c.TmpDir, err = ioutil.TempDir("", c.ID)
	if err != nil {
		return execdriver.ExitStatus{ExitCode: -1}, err
	}
	defer os.RemoveAll(c.TmpDir)

	// take the Command and populate the libcontainer.Config from it
	container, err := d.createContainer(c, hooks)
	if err != nil {
		return execdriver.ExitStatus{ExitCode: -1}, err
	}

	p := &libcontainer.Process{
		Args: append([]string{c.ProcessConfig.Entrypoint}, c.ProcessConfig.Arguments...),
		Env:  c.ProcessConfig.Env,
		Cwd:  c.WorkingDir,
		User: c.ProcessConfig.User,
	}

	wg := sync.WaitGroup{}
	writers, err := setupPipes(container, &c.ProcessConfig, p, pipes, &wg)
	if err != nil {
		return execdriver.ExitStatus{ExitCode: -1}, err
	}

	cont, err := d.factory.Create(c.ID, container)
	if err != nil {
		return execdriver.ExitStatus{ExitCode: -1}, err
	}

	if err := cont.Start(p); err != nil {
		return execdriver.ExitStatus{ExitCode: -1}, err
	}
	d.Lock()
	d.activeContainers[c.ID] = cont
	d.Unlock()
	defer func() {
		if !destroyed {
			cont.Destroy()
		}
		d.cleanContainer(c.ID)
	}()

	// close the write end of any opened pipes now that they are dup'ed into the container
	for _, writer := range writers {
		writer.Close()
	}
	// 'oom' is used to emit 'oom' events to the eventstream, 'oomKilled' is used
	// to set the 'OOMKilled' flag in state
	oom := notifyOnOOM(cont)
	oomKilled := notifyOnOOM(cont)
	if hooks.Start != nil {
		pid, err := p.Pid()
		if err != nil {
			p.Signal(os.Kill)
			p.Wait()
			return execdriver.ExitStatus{ExitCode: -1}, err
		}
		hooks.Start(&c.ProcessConfig, pid, oom)
	}

	waitF := p.Wait
	if nss := cont.Config().Namespaces; !nss.Contains(configs.NEWPID) {
		// We need this hack to track processes with inherited fds, because
		// cmd.Wait() would otherwise wait for all streams to be copied.
		waitF = waitInPIDHost(p, cont)
	}
	ps, err := waitF()
	if err != nil {
		execErr, ok := err.(*exec.ExitError)
		if !ok {
			return execdriver.ExitStatus{ExitCode: -1}, err
		}
		ps = execErr.ProcessState
	}
	// wait for all IO goroutine copiers to finish
	wg.Wait()

	cont.Destroy()
	destroyed = true
	// oomKilled will have an oom event if any process within the container was
	// OOM killed at any time, not only if the init process OOMed.
	//
	// Perhaps we only want the OOMKilled flag to be set if the OOM
	// resulted in a container death, but there isn't a good way to do this
	// because the kernel's cgroup oom notification does not provide information
	// such as the PID. This could be done heuristically by checking that the OOM
	// happened within some very small time slice of the container dying (and
	// optionally exit-code 137), but I don't think the cgroup oom notification
	// can be used to reliably determine this.
	//
	// Even if there were multiple OOMs, it's sufficient to read one value
	// because libcontainer's oom notify will discard the channel after the
	// cgroup is destroyed.
	_, oomKill := <-oomKilled
	return execdriver.ExitStatus{ExitCode: utils.ExitStatus(ps.Sys().(syscall.WaitStatus)), OOMKilled: oomKill}, nil
}

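// A sketch of how a caller might interpret the ExitStatus returned by Run
// (illustrative only; drv, cmd, pipes, and hooks are hypothetical values):
//
//	status, err := drv.Run(cmd, pipes, hooks)
//	if err != nil {
//		logrus.Error(err)
//	} else if status.OOMKilled {
//		logrus.Warnf("container was OOM killed (exit code %d)", status.ExitCode)
//	}
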
// notifyOnOOM returns a channel that signals if the container received an OOM
// notification for any process. If it is unable to subscribe to OOM
// notifications then a closed channel is returned, as it will be non-blocking
// and return the correct result when read.
func notifyOnOOM(container libcontainer.Container) <-chan struct{} {
	oom, err := container.NotifyOOM()
	if err != nil {
		logrus.Warnf("Your kernel does not support OOM notifications: %s", err)
		c := make(chan struct{})
		close(c)
		return c
	}
	return oom
}
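
// A minimal sketch of consuming the OOM channel, mirroring what Run does
// above (illustrative only; cont is a hypothetical libcontainer.Container):
//
//	oom := notifyOnOOM(cont)
//	go func() {
//		if _, ok := <-oom; ok {
//			logrus.Warn("container received an OOM notification")
//		}
//	}()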

// killCgroupProcs forcibly terminates every process still running in the
// container's cgroup. The container is paused first so that no new children
// can be spawned while the kill is in progress, and resumed afterwards so the
// killed processes can actually exit and be reaped.
func killCgroupProcs(c libcontainer.Container) {
	var procs []*os.Process
	if err := c.Pause(); err != nil {
		logrus.Warn(err)
	}
	pids, err := c.Processes()
	if err != nil {
		// Don't worry about the children if we can't get them; this mostly
		// means the cgroup has already been deleted.
		logrus.Warnf("Failed to get processes from container %s: %v", c.ID(), err)
	}
	for _, pid := range pids {
		if p, err := os.FindProcess(pid); err == nil {
			procs = append(procs, p)
			if err := p.Kill(); err != nil {
				logrus.Warn(err)
			}
		}
	}
	if err := c.Resume(); err != nil {
		logrus.Warn(err)
	}
	for _, p := range procs {
		if _, err := p.Wait(); err != nil {
			logrus.Warn(err)
		}
	}
}

func waitInPIDHost(p *libcontainer.Process, c libcontainer.Container) func() (*os.ProcessState, error) {
	return func() (*os.ProcessState, error) {
		pid, err := p.Pid()
		if err != nil {
			return nil, err
		}

		process, err := os.FindProcess(pid)
		if err != nil {
			return nil, err
		}
		s, err := process.Wait()
		if err != nil {
			execErr, ok := err.(*exec.ExitError)
			if !ok {
				return s, err
			}
			s = execErr.ProcessState
		}
		killCgroupProcs(c)
		p.Wait()
		return s, err
	}
}

// Kill implements the exec driver Driver interface.
func (d *Driver) Kill(c *execdriver.Command, sig int) error {
	d.Lock()
	active := d.activeContainers[c.ID]
	d.Unlock()
	if active == nil {
		return fmt.Errorf("active container for %s does not exist", c.ID)
	}
	state, err := active.State()
	if err != nil {
		return err
	}
	if state.InitProcessPid == -1 {
		return fmt.Errorf("avoid sending signal %d to container %s with pid -1", sig, c.ID)
	}
	return syscall.Kill(state.InitProcessPid, syscall.Signal(sig))
}
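
// A minimal sketch of sending a signal through the driver (illustrative
// only; drv and cmd are hypothetical values):
//
//	// Ask the container's init process to shut down gracefully.
//	if err := drv.Kill(cmd, int(syscall.SIGTERM)); err != nil {
//		logrus.Error(err)
//	}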

// Pause implements the exec driver Driver interface;
// it calls the libcontainer API to pause a container.
func (d *Driver) Pause(c *execdriver.Command) error {
	d.Lock()
	active := d.activeContainers[c.ID]
	d.Unlock()
	if active == nil {
		return fmt.Errorf("active container for %s does not exist", c.ID)
	}
	return active.Pause()
}

// Unpause implements the exec driver Driver interface;
// it calls the libcontainer API to unpause a container.
func (d *Driver) Unpause(c *execdriver.Command) error {
	d.Lock()
	active := d.activeContainers[c.ID]
	d.Unlock()
	if active == nil {
		return fmt.Errorf("active container for %s does not exist", c.ID)
	}
	return active.Resume()
}

// Terminate implements the exec driver Driver interface.
func (d *Driver) Terminate(c *execdriver.Command) error {
	defer d.cleanContainer(c.ID)
	container, err := d.factory.Load(c.ID)
	if err != nil {
		return err
	}
	defer container.Destroy()
	state, err := container.State()
	if err != nil {
		return err
	}
	pid := state.InitProcessPid
	currentStartTime, err := system.GetProcessStartTime(pid)
	if err != nil {
		return err
	}
	// Only kill the process if it is still the one the container started:
	// comparing start times guards against PID reuse.
	if state.InitProcessStartTime == currentStartTime {
		err = syscall.Kill(pid, syscall.SIGKILL)
		syscall.Wait4(pid, nil, 0, nil)
	}
	return err
}

// Name implements the exec driver Driver interface.
func (d *Driver) Name() string {
	return fmt.Sprintf("%s-%s", DriverName, Version)
}

// GetPidsForContainer implements the exec driver Driver interface.
func (d *Driver) GetPidsForContainer(id string) ([]int, error) {
	d.Lock()
	active := d.activeContainers[id]
	d.Unlock()

	if active == nil {
		return nil, fmt.Errorf("active container for %s does not exist", id)
	}
	return active.Processes()
}

func (d *Driver) cleanContainer(id string) error {
	d.Lock()
	delete(d.activeContainers, id)
	d.Unlock()
	return os.RemoveAll(filepath.Join(d.root, id))
}

func (d *Driver) createContainerRoot(id string) error {
	return os.MkdirAll(filepath.Join(d.root, id), 0655)
}

// Clean implements the exec driver Driver interface.
func (d *Driver) Clean(id string) error {
	return os.RemoveAll(filepath.Join(d.root, id))
}

// Stats implements the exec driver Driver interface.
func (d *Driver) Stats(id string) (*execdriver.ResourceStats, error) {
	d.Lock()
	c := d.activeContainers[id]
	d.Unlock()
	if c == nil {
		return nil, execdriver.ErrNotRunning
	}
	now := time.Now()
	stats, err := c.Stats()
	if err != nil {
		return nil, err
	}
	memoryLimit := c.Config().Cgroups.Resources.Memory
	// If the container does not have any memory limit specified, set the
	// limit to the machine's memory.
	if memoryLimit == 0 {
		memoryLimit = d.machineMemory
	}
	return &execdriver.ResourceStats{
		Stats:       stats,
		Read:        now,
		MemoryLimit: memoryLimit,
	}, nil
}

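// A sketch of turning a Stats result into a memory-usage percentage
// (illustrative only; the exact field path into the libcontainer stats
// differs between versions, so MemoryStats.Usage.Usage is an assumption):
//
//	rs, err := drv.Stats(id)
//	if err == nil && rs.MemoryLimit > 0 {
//		used := rs.Stats.CgroupStats.MemoryStats.Usage.Usage
//		logrus.Infof("memory: %.1f%%", float64(used)/float64(rs.MemoryLimit)*100)
//	}
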
// Update updates the configuration of a running container.
func (d *Driver) Update(c *execdriver.Command) error {
	d.Lock()
	cont := d.activeContainers[c.ID]
	d.Unlock()
	if cont == nil {
		return execdriver.ErrNotRunning
	}
	config := cont.Config()
	if err := execdriver.SetupCgroups(&config, c); err != nil {
		return err
	}

	if err := cont.Set(config); err != nil {
		return err
	}

	return nil
}

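// A sketch of a live resource update (illustrative only; the Resources
// field on execdriver.Command and its Memory member are assumptions about
// the surrounding code base):
//
//	cmd.Resources.Memory = 512 * 1024 * 1024 // 512 MiB
//	if err := drv.Update(cmd); err != nil {
//		logrus.Error(err)
//	}
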
// TtyConsole implements the exec driver Terminal interface.
type TtyConsole struct {
	console libcontainer.Console
}

// NewTtyConsole returns a new TtyConsole struct.
func NewTtyConsole(console libcontainer.Console, pipes *execdriver.Pipes, wg *sync.WaitGroup) (*TtyConsole, error) {
	tty := &TtyConsole{
		console: console,
	}

	if err := tty.AttachPipes(pipes, wg); err != nil {
		tty.Close()
		return nil, err
	}

	return tty, nil
}

// Resize implements the Resize method of the Terminal interface.
func (t *TtyConsole) Resize(h, w int) error {
	return term.SetWinsize(t.console.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)})
}

// AttachPipes attaches the given pipes to the TtyConsole.
func (t *TtyConsole) AttachPipes(pipes *execdriver.Pipes, wg *sync.WaitGroup) error {
	wg.Add(1)
	go func() {
		defer wg.Done()
		if wb, ok := pipes.Stdout.(interface {
			CloseWriters() error
		}); ok {
			defer wb.CloseWriters()
		}

		pools.Copy(pipes.Stdout, t.console)
	}()

	if pipes.Stdin != nil {
		go func() {
			pools.Copy(t.console, pipes.Stdin)

			pipes.Stdin.Close()
		}()
	}

	return nil
}

// Close implements the Close method of the Terminal interface.
func (t *TtyConsole) Close() error {
	return t.console.Close()
}

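// A brief usage sketch for TtyConsole (illustrative only; cons, pipes, and
// wg are hypothetical values created elsewhere):
//
//	tty, err := NewTtyConsole(cons, pipes, &wg)
//	if err != nil {
//		return err
//	}
//	defer tty.Close()
//	tty.Resize(24, 80) // rows, columns
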
func setupPipes(container *configs.Config, processConfig *execdriver.ProcessConfig, p *libcontainer.Process, pipes *execdriver.Pipes, wg *sync.WaitGroup) ([]io.WriteCloser, error) {

	writers := []io.WriteCloser{}

	rootuid, err := container.HostUID()
	if err != nil {
		return writers, err
	}

	if processConfig.Tty {
		cons, err := p.NewConsole(rootuid)
		if err != nil {
			return writers, err
		}
		term, err := NewTtyConsole(cons, pipes, wg)
		if err != nil {
			return writers, err
		}
		processConfig.Terminal = term
		return writers, nil
	}
	// not a tty; set up stdio pipes
	term := &execdriver.StdConsole{}
	processConfig.Terminal = term

	// If we are not in a user namespace, there is no reason to go through
	// the hassle of setting up os-level pipes with proper (remapped)
	// ownership, so we take this shortcut for non-userns containers.
	if rootuid == 0 {
		p.Stdout = pipes.Stdout
		p.Stderr = pipes.Stderr

		if pipes.Stdin != nil {
			r, w, err := os.Pipe()
			if err != nil {
				return writers, err
			}
			go func() {
				io.Copy(w, pipes.Stdin)
				w.Close()
			}()
			p.Stdin = r
		}
		return writers, nil
	}

	// If we have user namespaces enabled (rootuid != 0), we will set up
	// os pipes for stderr, stdout, and stdin so we can chown them to the
	// remapped root user, allowing proper access to the underlying fds.
	var fds []uintptr

	copyPipes := func(out io.Writer, in io.ReadCloser) {
		defer wg.Done()
		io.Copy(out, in)
		in.Close()
	}

	// set up stdout
	r, w, err := os.Pipe()
	if err != nil {
		return writers, err
	}
	writers = append(writers, w)
	fds = append(fds, r.Fd(), w.Fd())
	if pipes.Stdout != nil {
		wg.Add(1)
		go copyPipes(pipes.Stdout, r)
	}
	term.Closers = append(term.Closers, r)
	p.Stdout = w

	// set up stderr
	r, w, err = os.Pipe()
	if err != nil {
		return writers, err
	}
	writers = append(writers, w)
	fds = append(fds, r.Fd(), w.Fd())
	if pipes.Stderr != nil {
		wg.Add(1)
		go copyPipes(pipes.Stderr, r)
	}
	term.Closers = append(term.Closers, r)
	p.Stderr = w

	// set up stdin
	r, w, err = os.Pipe()
	if err != nil {
		return writers, err
	}
	fds = append(fds, r.Fd(), w.Fd())
	if pipes.Stdin != nil {
		go func() {
			io.Copy(w, pipes.Stdin)
			w.Close()
		}()
		p.Stdin = r
	}
	// chown both ends of every pipe to the remapped root uid so the
	// container side can access the fds across the user namespace boundary
	for _, fd := range fds {
		if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil {
			return writers, fmt.Errorf("Failed to chown pipes fd: %v", err)
		}
	}
	return writers, nil
}

// SupportsHooks implements the execdriver Driver interface.
// The libcontainer/runC-based native execdriver makes use of the hook mechanism.
func (d *Driver) SupportsHooks() bool {
	return true
}