github.com/endocode/docker@v1.4.2-0.20160113120958-46eb4700391e/container/monitor.go

github.com/endocode/docker@v1.4.2-0.20160113120958-46eb4700391e/container/monitor.go (about)

     1  package container
     2  
     3  import (
     4  	"io"
     5  	"os/exec"
     6  	"strings"
     7  	"sync"
     8  	"syscall"
     9  	"time"
    10  
    11  	"github.com/Sirupsen/logrus"
    12  	"github.com/docker/docker/daemon/execdriver"
    13  	derr "github.com/docker/docker/errors"
    14  	"github.com/docker/docker/pkg/promise"
    15  	"github.com/docker/docker/pkg/stringid"
    16  	"github.com/docker/docker/utils"
    17  	"github.com/docker/engine-api/types/container"
    18  )
    19  
const (
	// defaultTimeIncrement is the initial value of containerMonitor.timeIncrement,
	// the delay between restarts expressed in milliseconds. resetMonitor doubles
	// the delay after short runs and falls back to this value once a run lasted
	// more than 10 seconds.
	defaultTimeIncrement = 100
	// loggerCloseTimeout bounds how long resetContainer waits for the log copier
	// to finish before it closes the log driver anyway.
	loggerCloseTimeout   = 10 * time.Second
)
    24  
// supervisor defines the interface that a supervisor must implement.
// The containerMonitor calls back into it to emit events, start logging,
// run the container's process and clean up once the monitor is done.
type supervisor interface {
	// LogContainerEvent generates events related to a given container.
	// The string argument is the action name; the monitor emits "start",
	// "die" and "oom".
	LogContainerEvent(*Container, string)
	// Cleanup ensures that the container is properly unmounted
	Cleanup(*Container)
	// StartLogging starts the logging driver for the container
	StartLogging(*Container) error
	// Run starts a container. startCallback is handed to the execution
	// driver; the monitor passes its own callback method here. The returned
	// ExitStatus carries the exit code the monitor feeds to the restart policy.
	Run(c *Container, pipes *execdriver.Pipes, startCallback execdriver.DriverCallback) (execdriver.ExitStatus, error)
	// IsShuttingDown tells whether the supervisor is shutting down or not
	IsShuttingDown() bool
}
    38  
// containerMonitor monitors the execution of a container's main process.
// If a restart policy is specified for the container the monitor will ensure that the
// process is restarted based on the rules of the policy.  When the container is finally stopped
// the monitor will reset and cleanup any of the container resources such as networking allocations
// and the rootfs
type containerMonitor struct {
	// mux guards shouldStop and makes sure stopChan is closed only once
	// (see ExitOnNext and shouldRestart)
	mux sync.Mutex

	// supervisor keeps track of the container and the events it generates
	supervisor supervisor

	// container is the container being monitored
	container *Container

	// restartPolicy is the current policy being applied to the container monitor
	restartPolicy container.RestartPolicy

	// failureCount is the number of consecutive runs that ended with an
	// error or a non-zero exit code; resetMonitor resets it to zero after a
	// successful run
	failureCount int

	// shouldStop signals the monitor that the next time the container exits it is
	// either because docker or the user asked for the container to be stopped
	shouldStop bool

	// startSignal is a channel that is closed after the container initially starts
	startSignal chan struct{}

	// stopChan is used to signal to the monitor whenever there is a wait for the
	// next restart so that the timeIncrement is not honored and the user is not
	// left waiting for nothing to happen during this time
	stopChan chan struct{}

	// timeIncrement is the amount of time to wait between restarts
	// this is in milliseconds
	timeIncrement int

	// lastStartTime is the time at which the monitor last asked the supervisor
	// to run the container's process; resetMonitor uses it to measure how long
	// the run lasted
	lastStartTime time.Time
}
    79  
    80  // StartMonitor initializes a containerMonitor for this container with the provided supervisor and restart policy
    81  // and starts the container's process.
    82  func (container *Container) StartMonitor(s supervisor, policy container.RestartPolicy) error {
    83  	container.Lock()
    84  	container.monitor = &containerMonitor{
    85  		supervisor:    s,
    86  		container:     container,
    87  		restartPolicy: policy,
    88  		timeIncrement: defaultTimeIncrement,
    89  		stopChan:      make(chan struct{}),
    90  		startSignal:   make(chan struct{}),
    91  	}
    92  	container.Unlock()
    93  
    94  	return container.monitor.wait()
    95  }
    96  
    97  // wait starts the container and wait until
    98  // we either receive an error from the initial start of the container's
    99  // process or until the process is running in the container
   100  func (m *containerMonitor) wait() error {
   101  	select {
   102  	case <-m.startSignal:
   103  	case err := <-promise.Go(m.start):
   104  		return err
   105  	}
   106  
   107  	return nil
   108  }
   109  
   110  // Stop signals to the container monitor that it should stop monitoring the container
   111  // for exits the next time the process dies
   112  func (m *containerMonitor) ExitOnNext() {
   113  	m.mux.Lock()
   114  
   115  	// we need to protect having a double close of the channel when stop is called
   116  	// twice or else we will get a panic
   117  	if !m.shouldStop {
   118  		m.shouldStop = true
   119  		close(m.stopChan)
   120  	}
   121  
   122  	m.mux.Unlock()
   123  }
   124  
   125  // Close closes the container's resources such as networking allocations and
   126  // unmounts the container's root filesystem
   127  func (m *containerMonitor) Close() error {
   128  	// Cleanup networking and mounts
   129  	m.supervisor.Cleanup(m.container)
   130  
   131  	// FIXME: here is race condition between two RUN instructions in Dockerfile
   132  	// because they share same runconfig and change image. Must be fixed
   133  	// in builder/builder.go
   134  	if err := m.container.ToDisk(); err != nil {
   135  		logrus.Errorf("Error dumping container %s state to disk: %s", m.container.ID, err)
   136  
   137  		return err
   138  	}
   139  
   140  	return nil
   141  }
   142  
   143  // Start starts the containers process and monitors it according to the restart policy
   144  func (m *containerMonitor) start() error {
   145  	var (
   146  		err        error
   147  		exitStatus execdriver.ExitStatus
   148  		// this variable indicates where we in execution flow:
   149  		// before Run or after
   150  		afterRun bool
   151  	)
   152  
   153  	// ensure that when the monitor finally exits we release the networking and unmount the rootfs
   154  	defer func() {
   155  		if afterRun {
   156  			m.container.Lock()
   157  			defer m.container.Unlock()
   158  			m.container.SetStopped(&exitStatus)
   159  		}
   160  		m.Close()
   161  	}()
   162  
   163  	m.container.Lock()
   164  	// reset stopped flag
   165  	if m.container.HasBeenManuallyStopped {
   166  		m.container.HasBeenManuallyStopped = false
   167  	}
   168  
   169  	// reset the restart count
   170  	m.container.RestartCount = -1
   171  
   172  	for {
   173  		m.container.RestartCount++
   174  
   175  		if err := m.supervisor.StartLogging(m.container); err != nil {
   176  			m.resetContainer(false)
   177  
   178  			m.container.Unlock()
   179  			return err
   180  		}
   181  
   182  		pipes := execdriver.NewPipes(m.container.Stdin(), m.container.Stdout(), m.container.Stderr(), m.container.Config.OpenStdin)
   183  		m.container.Unlock()
   184  
   185  		m.logEvent("start")
   186  
   187  		m.lastStartTime = time.Now()
   188  
   189  		// don't lock Run because m.callback has own lock
   190  		if exitStatus, err = m.supervisor.Run(m.container, pipes, m.callback); err != nil {
   191  			m.container.Lock()
   192  			// if we receive an internal error from the initial start of a container then lets
   193  			// return it instead of entering the restart loop
   194  			// set to 127 for container cmd not found/does not exist)
   195  			if strings.Contains(err.Error(), "executable file not found") ||
   196  				strings.Contains(err.Error(), "no such file or directory") ||
   197  				strings.Contains(err.Error(), "system cannot find the file specified") {
   198  				if m.container.RestartCount == 0 {
   199  					m.container.ExitCode = 127
   200  					m.resetContainer(false)
   201  					m.container.Unlock()
   202  					return derr.ErrorCodeCmdNotFound
   203  				}
   204  			}
   205  			// set to 126 for container cmd can't be invoked errors
   206  			if strings.Contains(err.Error(), syscall.EACCES.Error()) {
   207  				if m.container.RestartCount == 0 {
   208  					m.container.ExitCode = 126
   209  					m.resetContainer(false)
   210  					m.container.Unlock()
   211  					return derr.ErrorCodeCmdCouldNotBeInvoked
   212  				}
   213  			}
   214  
   215  			if m.container.RestartCount == 0 {
   216  				m.container.ExitCode = -1
   217  				m.resetContainer(false)
   218  
   219  				m.container.Unlock()
   220  				return derr.ErrorCodeCantStart.WithArgs(m.container.ID, utils.GetErrorMessage(err))
   221  			}
   222  
   223  			m.container.Unlock()
   224  			logrus.Errorf("Error running container: %s", err)
   225  		} // end if
   226  
   227  		// here container.Lock is already lost
   228  		afterRun = true
   229  
   230  		m.resetMonitor(err == nil && exitStatus.ExitCode == 0)
   231  
   232  		if m.shouldRestart(exitStatus.ExitCode) {
   233  			m.container.SetRestarting(&exitStatus)
   234  			m.logEvent("die")
   235  			m.resetContainer(true)
   236  
   237  			// sleep with a small time increment between each restart to help avoid issues cased by quickly
   238  			// restarting the container because of some types of errors ( networking cut out, etc... )
   239  			m.waitForNextRestart()
   240  
   241  			// we need to check this before reentering the loop because the waitForNextRestart could have
   242  			// been terminated by a request from a user
   243  			if m.shouldStop {
   244  				return err
   245  			}
   246  			m.container.Lock()
   247  			continue
   248  		}
   249  
   250  		m.logEvent("die")
   251  		m.resetContainer(true)
   252  		return err
   253  	} // end for
   254  }
   255  
   256  // resetMonitor resets the stateful fields on the containerMonitor based on the
   257  // previous runs success or failure.  Regardless of success, if the container had
   258  // an execution time of more than 10s then reset the timer back to the default
   259  func (m *containerMonitor) resetMonitor(successful bool) {
   260  	executionTime := time.Now().Sub(m.lastStartTime).Seconds()
   261  
   262  	if executionTime > 10 {
   263  		m.timeIncrement = defaultTimeIncrement
   264  	} else {
   265  		// otherwise we need to increment the amount of time we wait before restarting
   266  		// the process.  We will build up by multiplying the increment by 2
   267  		m.timeIncrement *= 2
   268  	}
   269  
   270  	// the container exited successfully so we need to reset the failure counter
   271  	if successful {
   272  		m.failureCount = 0
   273  	} else {
   274  		m.failureCount++
   275  	}
   276  }
   277  
   278  // waitForNextRestart waits with the default time increment to restart the container unless
   279  // a user or docker asks for the container to be stopped
   280  func (m *containerMonitor) waitForNextRestart() {
   281  	select {
   282  	case <-time.After(time.Duration(m.timeIncrement) * time.Millisecond):
   283  	case <-m.stopChan:
   284  	}
   285  }
   286  
   287  // shouldRestart checks the restart policy and applies the rules to determine if
   288  // the container's process should be restarted
   289  func (m *containerMonitor) shouldRestart(exitCode int) bool {
   290  	m.mux.Lock()
   291  	defer m.mux.Unlock()
   292  
   293  	// do not restart if the user or docker has requested that this container be stopped
   294  	if m.shouldStop {
   295  		m.container.HasBeenManuallyStopped = !m.supervisor.IsShuttingDown()
   296  		return false
   297  	}
   298  
   299  	switch {
   300  	case m.restartPolicy.IsAlways(), m.restartPolicy.IsUnlessStopped():
   301  		return true
   302  	case m.restartPolicy.IsOnFailure():
   303  		// the default value of 0 for MaximumRetryCount means that we will not enforce a maximum count
   304  		if max := m.restartPolicy.MaximumRetryCount; max != 0 && m.failureCount > max {
   305  			logrus.Debugf("stopping restart of container %s because maximum failure could of %d has been reached",
   306  				stringid.TruncateID(m.container.ID), max)
   307  			return false
   308  		}
   309  
   310  		return exitCode != 0
   311  	}
   312  
   313  	return false
   314  }
   315  
   316  // callback ensures that the container's state is properly updated after we
   317  // received ack from the execution drivers
   318  func (m *containerMonitor) callback(processConfig *execdriver.ProcessConfig, pid int, chOOM <-chan struct{}) error {
   319  	go func() {
   320  		for range chOOM {
   321  			m.logEvent("oom")
   322  		}
   323  	}()
   324  
   325  	if processConfig.Tty {
   326  		// The callback is called after the process start()
   327  		// so we are in the parent process. In TTY mode, stdin/out/err is the PtySlave
   328  		// which we close here.
   329  		if c, ok := processConfig.Stdout.(io.Closer); ok {
   330  			c.Close()
   331  		}
   332  	}
   333  
   334  	m.container.SetRunningLocking(pid)
   335  
   336  	// signal that the process has started
   337  	// close channel only if not closed
   338  	select {
   339  	case <-m.startSignal:
   340  	default:
   341  		close(m.startSignal)
   342  	}
   343  
   344  	if err := m.container.ToDiskLocking(); err != nil {
   345  		logrus.Errorf("Error saving container to disk: %v", err)
   346  	}
   347  	return nil
   348  }
   349  
   350  // resetContainer resets the container's IO and ensures that the command is able to be executed again
   351  // by copying the data into a new struct
   352  // if lock is true, then container locked during reset
   353  func (m *containerMonitor) resetContainer(lock bool) {
   354  	container := m.container
   355  	if lock {
   356  		container.Lock()
   357  		defer container.Unlock()
   358  	}
   359  
   360  	if err := container.CloseStreams(); err != nil {
   361  		logrus.Errorf("%s: %s", container.ID, err)
   362  	}
   363  
   364  	if container.Command != nil && container.Command.ProcessConfig.Terminal != nil {
   365  		if err := container.Command.ProcessConfig.Terminal.Close(); err != nil {
   366  			logrus.Errorf("%s: Error closing terminal: %s", container.ID, err)
   367  		}
   368  	}
   369  
   370  	// Re-create a brand new stdin pipe once the container exited
   371  	if container.Config.OpenStdin {
   372  		container.NewInputPipes()
   373  	}
   374  
   375  	if container.LogDriver != nil {
   376  		if container.LogCopier != nil {
   377  			exit := make(chan struct{})
   378  			go func() {
   379  				container.LogCopier.Wait()
   380  				close(exit)
   381  			}()
   382  			select {
   383  			case <-time.After(loggerCloseTimeout):
   384  				logrus.Warnf("Logger didn't exit in time: logs may be truncated")
   385  			case <-exit:
   386  			}
   387  		}
   388  		container.LogDriver.Close()
   389  		container.LogCopier = nil
   390  		container.LogDriver = nil
   391  	}
   392  
   393  	c := container.Command.ProcessConfig.Cmd
   394  
   395  	container.Command.ProcessConfig.Cmd = exec.Cmd{
   396  		Stdin:       c.Stdin,
   397  		Stdout:      c.Stdout,
   398  		Stderr:      c.Stderr,
   399  		Path:        c.Path,
   400  		Env:         c.Env,
   401  		ExtraFiles:  c.ExtraFiles,
   402  		Args:        c.Args,
   403  		Dir:         c.Dir,
   404  		SysProcAttr: c.SysProcAttr,
   405  	}
   406  }
   407  
   408  func (m *containerMonitor) logEvent(action string) {
   409  	m.supervisor.LogContainerEvent(m.container, action)
   410  }