github.com/ncdc/docker@v0.10.1-0.20160129113957-6c6729ef5b74/container/monitor.go

github.com/ncdc/docker@v0.10.1-0.20160129113957-6c6729ef5b74/container/monitor.go (about)

     1  package container
     2  
     3  import (
     4  	"io"
     5  	"os/exec"
     6  	"strings"
     7  	"sync"
     8  	"syscall"
     9  	"time"
    10  
    11  	"github.com/Sirupsen/logrus"
    12  	"github.com/docker/docker/daemon/execdriver"
    13  	derr "github.com/docker/docker/errors"
    14  	"github.com/docker/docker/pkg/promise"
    15  	"github.com/docker/docker/pkg/stringid"
    16  	"github.com/docker/docker/utils"
    17  	"github.com/docker/engine-api/types/container"
    18  )
    19  
    20  const (
    21  	defaultTimeIncrement = 100 // initial wait between restarts, in milliseconds
    22  	loggerCloseTimeout   = 10 * time.Second // how long resetContainer waits for the log copier before closing it
    23  )
    24  
    25  // supervisor is the set of operations the containerMonitor needs from whoever supervises the container
    26  type supervisor interface {
    27  	// LogContainerEvent records the named event (the monitor emits "start", "die" and "oom") for the given container
    28  	LogContainerEvent(*Container, string)
    29  	// Cleanup ensures that the container is properly unmounted
    30  	Cleanup(*Container)
    31  	// StartLogging starts the logging driver for the container
    32  	StartLogging(*Container) error
    33  	// Run starts a container and returns its exit status, or an error if the container could not be run
    34  	Run(c *Container, pipes *execdriver.Pipes, startCallback execdriver.DriverCallback) (execdriver.ExitStatus, error)
    35  	// IsShuttingDown tells whether the supervisor is shutting down or not
    36  	IsShuttingDown() bool
    37  }
    38  
    39  // containerMonitor monitors the execution of a container's main process.
    40  // If a restart policy is specified for the container, the monitor will ensure that the
    41  // process is restarted based on the rules of the policy. When the container is finally stopped,
    42  // the monitor will reset and clean up any of the container resources such as networking allocations
    43  // and the rootfs.
    44  type containerMonitor struct {
    45  	mux sync.Mutex // held by ExitOnNext and shouldRestart while they touch shouldStop
    46  
    47  	// supervisor keeps track of the container and the events it generates
    48  	supervisor supervisor
    49  
    50  	// container is the container being monitored
    51  	container *Container
    52  
    53  	// restartPolicy is the current policy being applied to the container monitor
    54  	restartPolicy container.RestartPolicy
    55  
    56  	// failureCount is the number of consecutive runs that ended in failure;
    57  	// resetMonitor zeroes it after a successful run
    58  	failureCount int
    59  
    60  	// shouldStop signals the monitor that the next time the container exits it is
    61  	// either because docker or the user asked for the container to be stopped
    62  	shouldStop bool
    63  
    64  	// startSignal is a channel that is closed after the container initially starts
    65  	startSignal chan struct{}
    66  
    67  	// stopChan is closed by ExitOnNext to interrupt the wait before the
    68  	// next restart, so that the timeIncrement is not honored and the user is not
    69  	// left waiting for nothing to happen during this time
    70  	stopChan chan struct{}
    71  
    72  	// timeIncrement is the amount of time to wait between restarts,
    73  	// expressed in milliseconds
    74  	timeIncrement int
    75  
    76  	// lastStartTime is the time at which the monitor last exec'd the container's process
    77  	lastStartTime time.Time
    78  }
    79  
    80  // StartMonitor initializes a containerMonitor for this container with the provided supervisor and restart policy
    81  // and starts the container's process.
    82  func (container *Container) StartMonitor(s supervisor, policy container.RestartPolicy) error {
    83  	container.monitor = &containerMonitor{
    84  		supervisor:    s,
    85  		container:     container,
    86  		restartPolicy: policy,
    87  		timeIncrement: defaultTimeIncrement,
    88  		stopChan:      make(chan struct{}),
    89  		startSignal:   make(chan struct{}),
    90  	}
    91  
    92  	return container.monitor.wait()
    93  }
    94  
    95  // wait starts the container and wait until
    96  // we either receive an error from the initial start of the container's
    97  // process or until the process is running in the container
    98  func (m *containerMonitor) wait() error {
    99  	select {
   100  	case <-m.startSignal:
   101  	case err := <-promise.Go(m.start):
   102  		return err
   103  	}
   104  
   105  	return nil
   106  }
   107  
   108  // Stop signals to the container monitor that it should stop monitoring the container
   109  // for exits the next time the process dies
   110  func (m *containerMonitor) ExitOnNext() {
   111  	m.mux.Lock()
   112  
   113  	// we need to protect having a double close of the channel when stop is called
   114  	// twice or else we will get a panic
   115  	if !m.shouldStop {
   116  		m.shouldStop = true
   117  		close(m.stopChan)
   118  	}
   119  
   120  	m.mux.Unlock()
   121  }
   122  
   123  // Close closes the container's resources such as networking allocations and
   124  // unmounts the container's root filesystem
   125  func (m *containerMonitor) Close() error {
   126  	// Cleanup networking and mounts
   127  	m.supervisor.Cleanup(m.container)
   128  
   129  	// FIXME: here is race condition between two RUN instructions in Dockerfile
   130  	// because they share same runconfig and change image. Must be fixed
   131  	// in builder/builder.go
   132  	if err := m.container.ToDisk(); err != nil {
   133  		logrus.Errorf("Error dumping container %s state to disk: %s", m.container.ID, err)
   134  
   135  		return err
   136  	}
   137  
   138  	return nil
   139  }
   140  
   141  // Start starts the containers process and monitors it according to the restart policy
   142  func (m *containerMonitor) start() error {
   143  	var (
   144  		err        error
   145  		exitStatus execdriver.ExitStatus
   146  		// this variable indicates where we in execution flow:
   147  		// before Run or after
   148  		afterRun bool
   149  	)
   150  
   151  	// ensure that when the monitor finally exits we release the networking and unmount the rootfs
   152  	defer func() {
   153  		if afterRun {
   154  			m.container.Lock()
   155  			defer m.container.Unlock()
   156  			m.container.SetStopped(&exitStatus)
   157  		}
   158  		m.Close()
   159  	}()
   160  	// reset stopped flag
   161  	if m.container.HasBeenManuallyStopped {
   162  		m.container.HasBeenManuallyStopped = false
   163  	}
   164  
   165  	// reset the restart count
   166  	m.container.RestartCount = -1
   167  
   168  	for {
   169  		m.container.RestartCount++
   170  
   171  		if err := m.supervisor.StartLogging(m.container); err != nil {
   172  			m.resetContainer(false)
   173  
   174  			return err
   175  		}
   176  
   177  		pipes := execdriver.NewPipes(m.container.Stdin(), m.container.Stdout(), m.container.Stderr(), m.container.Config.OpenStdin)
   178  
   179  		m.logEvent("start")
   180  
   181  		m.lastStartTime = time.Now()
   182  
   183  		if exitStatus, err = m.supervisor.Run(m.container, pipes, m.callback); err != nil {
   184  			// if we receive an internal error from the initial start of a container then lets
   185  			// return it instead of entering the restart loop
   186  			// set to 127 for container cmd not found/does not exist)
   187  			if strings.Contains(err.Error(), "executable file not found") ||
   188  				strings.Contains(err.Error(), "no such file or directory") ||
   189  				strings.Contains(err.Error(), "system cannot find the file specified") {
   190  				if m.container.RestartCount == 0 {
   191  					m.container.ExitCode = 127
   192  					m.resetContainer(false)
   193  					return derr.ErrorCodeCmdNotFound
   194  				}
   195  			}
   196  			// set to 126 for container cmd can't be invoked errors
   197  			if strings.Contains(err.Error(), syscall.EACCES.Error()) {
   198  				if m.container.RestartCount == 0 {
   199  					m.container.ExitCode = 126
   200  					m.resetContainer(false)
   201  					return derr.ErrorCodeCmdCouldNotBeInvoked
   202  				}
   203  			}
   204  
   205  			if m.container.RestartCount == 0 {
   206  				m.container.ExitCode = -1
   207  				m.resetContainer(false)
   208  
   209  				return derr.ErrorCodeCantStart.WithArgs(m.container.ID, utils.GetErrorMessage(err))
   210  			}
   211  
   212  			logrus.Errorf("Error running container: %s", err)
   213  		}
   214  
   215  		// here container.Lock is already lost
   216  		afterRun = true
   217  
   218  		m.resetMonitor(err == nil && exitStatus.ExitCode == 0)
   219  
   220  		if m.shouldRestart(exitStatus.ExitCode) {
   221  			m.container.SetRestarting(&exitStatus)
   222  			m.logEvent("die")
   223  			m.resetContainer(true)
   224  
   225  			// sleep with a small time increment between each restart to help avoid issues cased by quickly
   226  			// restarting the container because of some types of errors ( networking cut out, etc... )
   227  			m.waitForNextRestart()
   228  
   229  			// we need to check this before reentering the loop because the waitForNextRestart could have
   230  			// been terminated by a request from a user
   231  			if m.shouldStop {
   232  				return err
   233  			}
   234  			continue
   235  		}
   236  
   237  		m.logEvent("die")
   238  		m.resetContainer(true)
   239  		return err
   240  	}
   241  }
   242  
   243  // resetMonitor resets the stateful fields on the containerMonitor based on the
   244  // previous runs success or failure.  Regardless of success, if the container had
   245  // an execution time of more than 10s then reset the timer back to the default
   246  func (m *containerMonitor) resetMonitor(successful bool) {
   247  	executionTime := time.Now().Sub(m.lastStartTime).Seconds()
   248  
   249  	if executionTime > 10 {
   250  		m.timeIncrement = defaultTimeIncrement
   251  	} else {
   252  		// otherwise we need to increment the amount of time we wait before restarting
   253  		// the process.  We will build up by multiplying the increment by 2
   254  		m.timeIncrement *= 2
   255  	}
   256  
   257  	// the container exited successfully so we need to reset the failure counter
   258  	if successful {
   259  		m.failureCount = 0
   260  	} else {
   261  		m.failureCount++
   262  	}
   263  }
   264  
   265  // waitForNextRestart waits with the default time increment to restart the container unless
   266  // a user or docker asks for the container to be stopped
   267  func (m *containerMonitor) waitForNextRestart() {
   268  	select {
   269  	case <-time.After(time.Duration(m.timeIncrement) * time.Millisecond):
   270  	case <-m.stopChan:
   271  	}
   272  }
   273  
   274  // shouldRestart checks the restart policy and applies the rules to determine if
   275  // the container's process should be restarted
   276  func (m *containerMonitor) shouldRestart(exitCode int) bool {
   277  	m.mux.Lock()
   278  	defer m.mux.Unlock()
   279  
   280  	// do not restart if the user or docker has requested that this container be stopped
   281  	if m.shouldStop {
   282  		m.container.HasBeenManuallyStopped = !m.supervisor.IsShuttingDown()
   283  		return false
   284  	}
   285  
   286  	switch {
   287  	case m.restartPolicy.IsAlways(), m.restartPolicy.IsUnlessStopped():
   288  		return true
   289  	case m.restartPolicy.IsOnFailure():
   290  		// the default value of 0 for MaximumRetryCount means that we will not enforce a maximum count
   291  		if max := m.restartPolicy.MaximumRetryCount; max != 0 && m.failureCount > max {
   292  			logrus.Debugf("stopping restart of container %s because maximum failure could of %d has been reached",
   293  				stringid.TruncateID(m.container.ID), max)
   294  			return false
   295  		}
   296  
   297  		return exitCode != 0
   298  	}
   299  
   300  	return false
   301  }
   302  
   303  // callback ensures that the container's state is properly updated after we
   304  // received ack from the execution drivers
   305  func (m *containerMonitor) callback(processConfig *execdriver.ProcessConfig, pid int, chOOM <-chan struct{}) error {
   306  	go func() {
   307  		for range chOOM {
   308  			m.logEvent("oom")
   309  		}
   310  	}()
   311  
   312  	if processConfig.Tty {
   313  		// The callback is called after the process start()
   314  		// so we are in the parent process. In TTY mode, stdin/out/err is the PtySlave
   315  		// which we close here.
   316  		if c, ok := processConfig.Stdout.(io.Closer); ok {
   317  			c.Close()
   318  		}
   319  	}
   320  
   321  	m.container.SetRunning(pid)
   322  
   323  	// signal that the process has started
   324  	// close channel only if not closed
   325  	select {
   326  	case <-m.startSignal:
   327  	default:
   328  		close(m.startSignal)
   329  	}
   330  
   331  	if err := m.container.ToDiskLocking(); err != nil {
   332  		logrus.Errorf("Error saving container to disk: %v", err)
   333  	}
   334  	return nil
   335  }
   336  
   337  // resetContainer resets the container's IO and ensures that the command is able to be executed again
   338  // by copying the data into a new struct
   339  // if lock is true, then container locked during reset
   340  func (m *containerMonitor) resetContainer(lock bool) {
   341  	container := m.container
   342  	if lock {
   343  		container.Lock()
   344  		defer container.Unlock()
   345  	}
   346  
   347  	if err := container.CloseStreams(); err != nil {
   348  		logrus.Errorf("%s: %s", container.ID, err)
   349  	}
   350  
   351  	if container.Command != nil && container.Command.ProcessConfig.Terminal != nil {
   352  		if err := container.Command.ProcessConfig.Terminal.Close(); err != nil {
   353  			logrus.Errorf("%s: Error closing terminal: %s", container.ID, err)
   354  		}
   355  	}
   356  
   357  	// Re-create a brand new stdin pipe once the container exited
   358  	if container.Config.OpenStdin {
   359  		container.NewInputPipes()
   360  	}
   361  
   362  	if container.LogDriver != nil {
   363  		if container.LogCopier != nil {
   364  			exit := make(chan struct{})
   365  			go func() {
   366  				container.LogCopier.Wait()
   367  				close(exit)
   368  			}()
   369  			select {
   370  			case <-time.After(loggerCloseTimeout):
   371  				logrus.Warnf("Logger didn't exit in time: logs may be truncated")
   372  				container.LogCopier.Close()
   373  				// always waits for the LogCopier to finished before closing
   374  				<-exit
   375  			case <-exit:
   376  			}
   377  		}
   378  		container.LogDriver.Close()
   379  		container.LogCopier = nil
   380  		container.LogDriver = nil
   381  	}
   382  
   383  	c := container.Command.ProcessConfig.Cmd
   384  
   385  	container.Command.ProcessConfig.Cmd = exec.Cmd{
   386  		Stdin:       c.Stdin,
   387  		Stdout:      c.Stdout,
   388  		Stderr:      c.Stderr,
   389  		Path:        c.Path,
   390  		Env:         c.Env,
   391  		ExtraFiles:  c.ExtraFiles,
   392  		Args:        c.Args,
   393  		Dir:         c.Dir,
   394  		SysProcAttr: c.SysProcAttr,
   395  	}
   396  }
   397  
   398  func (m *containerMonitor) logEvent(action string) {
   399  	m.supervisor.LogContainerEvent(m.container, action)
   400  }