github.com/mheon/docker@v0.11.2-0.20150922122814-44f47903a831/daemon/monitor.go

github.com/mheon/docker@v0.11.2-0.20150922122814-44f47903a831/daemon/monitor.go (about)

     1  package daemon
     2  
     3  import (
     4  	"io"
     5  	"os/exec"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/Sirupsen/logrus"
    10  	"github.com/docker/docker/daemon/execdriver"
    11  	"github.com/docker/docker/pkg/stringid"
    12  	"github.com/docker/docker/runconfig"
    13  )
    14  
const (
	defaultTimeIncrement = 100              // initial value of containerMonitor.timeIncrement, in milliseconds
	loggerCloseTimeout   = 10 * time.Second // longest resetContainer waits for the log copier before closing the log driver
)
    19  
// containerMonitor monitors the execution of a container's main process.
// If a restart policy is specified for the container the monitor will ensure that the
// process is restarted based on the rules of the policy.  When the container is finally stopped
// the monitor will reset and cleanup any of the container resources such as networking allocations
// and the rootfs
type containerMonitor struct {
	// mux is held by ExitOnNext and shouldRestart while they read or write
	// shouldStop (and while ExitOnNext closes stopChan)
	mux sync.Mutex

	// container is the container being monitored
	container *Container

	// restartPolicy is the current policy being applied to the container monitor
	restartPolicy runconfig.RestartPolicy

	// failureCount is the number of consecutive unsuccessful runs; resetMonitor
	// zeroes it after a successful run and increments it otherwise
	failureCount int

	// shouldStop signals the monitor that the next time the container exits it is
	// either because docker or the user asked for the container to be stopped
	shouldStop bool

	// startSignal is a channel that is closed after the container initially starts
	startSignal chan struct{}

	// stopChan is used to signal to the monitor whenever there is a wait for the
	// next restart so that the timeIncrement is not honored and the user is not
	// left waiting for nothing to happen during this time
	stopChan chan struct{}

	// timeIncrement is the amount of time to wait between restarts
	// this is in milliseconds
	timeIncrement int

	// lastStartTime is the time which the monitor last exec'd the container's process
	lastStartTime time.Time
}
    57  
    58  // newContainerMonitor returns an initialized containerMonitor for the provided container
    59  // honoring the provided restart policy
    60  func newContainerMonitor(container *Container, policy runconfig.RestartPolicy) *containerMonitor {
    61  	return &containerMonitor{
    62  		container:     container,
    63  		restartPolicy: policy,
    64  		timeIncrement: defaultTimeIncrement,
    65  		stopChan:      make(chan struct{}),
    66  		startSignal:   make(chan struct{}),
    67  	}
    68  }
    69  
    70  // Stop signals to the container monitor that it should stop monitoring the container
    71  // for exits the next time the process dies
    72  func (m *containerMonitor) ExitOnNext() {
    73  	m.mux.Lock()
    74  
    75  	// we need to protect having a double close of the channel when stop is called
    76  	// twice or else we will get a panic
    77  	if !m.shouldStop {
    78  		m.shouldStop = true
    79  		close(m.stopChan)
    80  	}
    81  
    82  	m.mux.Unlock()
    83  }
    84  
    85  // Close closes the container's resources such as networking allocations and
    86  // unmounts the contatiner's root filesystem
    87  func (m *containerMonitor) Close() error {
    88  	// Cleanup networking and mounts
    89  	m.container.cleanup()
    90  
    91  	// FIXME: here is race condition between two RUN instructions in Dockerfile
    92  	// because they share same runconfig and change image. Must be fixed
    93  	// in builder/builder.go
    94  	if err := m.container.toDisk(); err != nil {
    95  		logrus.Errorf("Error dumping container %s state to disk: %s", m.container.ID, err)
    96  
    97  		return err
    98  	}
    99  
   100  	return nil
   101  }
   102  
   103  // Start starts the containers process and monitors it according to the restart policy
   104  func (m *containerMonitor) Start() error {
   105  	var (
   106  		err        error
   107  		exitStatus execdriver.ExitStatus
   108  		// this variable indicates where we in execution flow:
   109  		// before Run or after
   110  		afterRun bool
   111  	)
   112  
   113  	// ensure that when the monitor finally exits we release the networking and unmount the rootfs
   114  	defer func() {
   115  		if afterRun {
   116  			m.container.Lock()
   117  			m.container.setStopped(&exitStatus)
   118  			defer m.container.Unlock()
   119  		}
   120  		m.Close()
   121  	}()
   122  	// reset stopped flag
   123  	if m.container.HasBeenManuallyStopped {
   124  		m.container.HasBeenManuallyStopped = false
   125  	}
   126  
   127  	// reset the restart count
   128  	m.container.RestartCount = -1
   129  
   130  	for {
   131  		m.container.RestartCount++
   132  
   133  		if err := m.container.startLogging(); err != nil {
   134  			m.resetContainer(false)
   135  
   136  			return err
   137  		}
   138  
   139  		pipes := execdriver.NewPipes(m.container.stdin, m.container.stdout, m.container.stderr, m.container.Config.OpenStdin)
   140  
   141  		m.container.logEvent("start")
   142  
   143  		m.lastStartTime = time.Now()
   144  
   145  		if exitStatus, err = m.container.daemon.run(m.container, pipes, m.callback); err != nil {
   146  			// if we receive an internal error from the initial start of a container then lets
   147  			// return it instead of entering the restart loop
   148  			if m.container.RestartCount == 0 {
   149  				m.container.ExitCode = -1
   150  				m.resetContainer(false)
   151  
   152  				return err
   153  			}
   154  
   155  			logrus.Errorf("Error running container: %s", err)
   156  		}
   157  
   158  		// here container.Lock is already lost
   159  		afterRun = true
   160  
   161  		m.resetMonitor(err == nil && exitStatus.ExitCode == 0)
   162  
   163  		if m.shouldRestart(exitStatus.ExitCode) {
   164  			m.container.setRestarting(&exitStatus)
   165  			if exitStatus.OOMKilled {
   166  				m.container.logEvent("oom")
   167  			}
   168  			m.container.logEvent("die")
   169  			m.resetContainer(true)
   170  
   171  			// sleep with a small time increment between each restart to help avoid issues cased by quickly
   172  			// restarting the container because of some types of errors ( networking cut out, etc... )
   173  			m.waitForNextRestart()
   174  
   175  			// we need to check this before reentering the loop because the waitForNextRestart could have
   176  			// been terminated by a request from a user
   177  			if m.shouldStop {
   178  				return err
   179  			}
   180  			continue
   181  		}
   182  		if exitStatus.OOMKilled {
   183  			m.container.logEvent("oom")
   184  		}
   185  		m.container.logEvent("die")
   186  		m.resetContainer(true)
   187  		return err
   188  	}
   189  }
   190  
   191  // resetMonitor resets the stateful fields on the containerMonitor based on the
   192  // previous runs success or failure.  Regardless of success, if the container had
   193  // an execution time of more than 10s then reset the timer back to the default
   194  func (m *containerMonitor) resetMonitor(successful bool) {
   195  	executionTime := time.Now().Sub(m.lastStartTime).Seconds()
   196  
   197  	if executionTime > 10 {
   198  		m.timeIncrement = defaultTimeIncrement
   199  	} else {
   200  		// otherwise we need to increment the amount of time we wait before restarting
   201  		// the process.  We will build up by multiplying the increment by 2
   202  		m.timeIncrement *= 2
   203  	}
   204  
   205  	// the container exited successfully so we need to reset the failure counter
   206  	if successful {
   207  		m.failureCount = 0
   208  	} else {
   209  		m.failureCount++
   210  	}
   211  }
   212  
   213  // waitForNextRestart waits with the default time increment to restart the container unless
   214  // a user or docker asks for the container to be stopped
   215  func (m *containerMonitor) waitForNextRestart() {
   216  	select {
   217  	case <-time.After(time.Duration(m.timeIncrement) * time.Millisecond):
   218  	case <-m.stopChan:
   219  	}
   220  }
   221  
   222  // shouldRestart checks the restart policy and applies the rules to determine if
   223  // the container's process should be restarted
   224  func (m *containerMonitor) shouldRestart(exitCode int) bool {
   225  	m.mux.Lock()
   226  	defer m.mux.Unlock()
   227  
   228  	// do not restart if the user or docker has requested that this container be stopped
   229  	if m.shouldStop {
   230  		m.container.HasBeenManuallyStopped = !m.container.daemon.shutdown
   231  		return false
   232  	}
   233  
   234  	switch {
   235  	case m.restartPolicy.IsAlways(), m.restartPolicy.IsUnlessStopped():
   236  		return true
   237  	case m.restartPolicy.IsOnFailure():
   238  		// the default value of 0 for MaximumRetryCount means that we will not enforce a maximum count
   239  		if max := m.restartPolicy.MaximumRetryCount; max != 0 && m.failureCount > max {
   240  			logrus.Debugf("stopping restart of container %s because maximum failure could of %d has been reached",
   241  				stringid.TruncateID(m.container.ID), max)
   242  			return false
   243  		}
   244  
   245  		return exitCode != 0
   246  	}
   247  
   248  	return false
   249  }
   250  
   251  // callback ensures that the container's state is properly updated after we
   252  // received ack from the execution drivers
   253  func (m *containerMonitor) callback(processConfig *execdriver.ProcessConfig, pid int) error {
   254  	if processConfig.Tty {
   255  		// The callback is called after the process Start()
   256  		// so we are in the parent process. In TTY mode, stdin/out/err is the PtySlave
   257  		// which we close here.
   258  		if c, ok := processConfig.Stdout.(io.Closer); ok {
   259  			c.Close()
   260  		}
   261  	}
   262  
   263  	m.container.setRunning(pid)
   264  
   265  	// signal that the process has started
   266  	// close channel only if not closed
   267  	select {
   268  	case <-m.startSignal:
   269  	default:
   270  		close(m.startSignal)
   271  	}
   272  
   273  	if err := m.container.toDiskLocking(); err != nil {
   274  		logrus.Errorf("Error saving container to disk: %v", err)
   275  	}
   276  	return nil
   277  }
   278  
   279  // resetContainer resets the container's IO and ensures that the command is able to be executed again
   280  // by copying the data into a new struct
   281  // if lock is true, then container locked during reset
   282  func (m *containerMonitor) resetContainer(lock bool) {
   283  	container := m.container
   284  	if lock {
   285  		container.Lock()
   286  		defer container.Unlock()
   287  	}
   288  
   289  	if container.Config.OpenStdin {
   290  		if err := container.stdin.Close(); err != nil {
   291  			logrus.Errorf("%s: Error close stdin: %s", container.ID, err)
   292  		}
   293  	}
   294  
   295  	if err := container.stdout.Clean(); err != nil {
   296  		logrus.Errorf("%s: Error close stdout: %s", container.ID, err)
   297  	}
   298  
   299  	if err := container.stderr.Clean(); err != nil {
   300  		logrus.Errorf("%s: Error close stderr: %s", container.ID, err)
   301  	}
   302  
   303  	if container.command != nil && container.command.ProcessConfig.Terminal != nil {
   304  		if err := container.command.ProcessConfig.Terminal.Close(); err != nil {
   305  			logrus.Errorf("%s: Error closing terminal: %s", container.ID, err)
   306  		}
   307  	}
   308  
   309  	// Re-create a brand new stdin pipe once the container exited
   310  	if container.Config.OpenStdin {
   311  		container.stdin, container.stdinPipe = io.Pipe()
   312  	}
   313  
   314  	if container.logDriver != nil {
   315  		if container.logCopier != nil {
   316  			exit := make(chan struct{})
   317  			go func() {
   318  				container.logCopier.Wait()
   319  				close(exit)
   320  			}()
   321  			select {
   322  			case <-time.After(loggerCloseTimeout):
   323  				logrus.Warnf("Logger didn't exit in time: logs may be truncated")
   324  			case <-exit:
   325  			}
   326  		}
   327  		container.logDriver.Close()
   328  		container.logCopier = nil
   329  		container.logDriver = nil
   330  	}
   331  
   332  	c := container.command.ProcessConfig.Cmd
   333  
   334  	container.command.ProcessConfig.Cmd = exec.Cmd{
   335  		Stdin:       c.Stdin,
   336  		Stdout:      c.Stdout,
   337  		Stderr:      c.Stderr,
   338  		Path:        c.Path,
   339  		Env:         c.Env,
   340  		ExtraFiles:  c.ExtraFiles,
   341  		Args:        c.Args,
   342  		Dir:         c.Dir,
   343  		SysProcAttr: c.SysProcAttr,
   344  	}
   345  }