github.com/erriapo/docker@v1.6.0-rc2/daemon/monitor.go (about)

     1  package daemon
     2  
     3  import (
     4  	"io"
     5  	"os/exec"
     6  	"sync"
     7  	"time"
     8  
     9  	log "github.com/Sirupsen/logrus"
    10  	"github.com/docker/docker/daemon/execdriver"
    11  	"github.com/docker/docker/pkg/common"
    12  	"github.com/docker/docker/runconfig"
    13  )
    14  
// defaultTimeIncrement is the initial value, in milliseconds, of the delay the
// monitor waits between restarts of a container. newContainerMonitor seeds
// containerMonitor.timeIncrement with it and resetMonitor falls back to it
// once a run has lasted longer than 10 seconds.
const defaultTimeIncrement = 100
    16  
// containerMonitor monitors the execution of a container's main process.
// If a restart policy is specified for the container the monitor will ensure that the
// process is restarted based on the rules of the policy.  When the container is finally stopped
// the monitor will reset and clean up any of the container resources such as networking allocations
// and the rootfs.
type containerMonitor struct {
	// mux is held by ExitOnNext and shouldRestart while they update or
	// inspect shouldStop
	mux sync.Mutex

	// container is the container being monitored
	container *Container

	// restartPolicy is the current policy being applied to the container monitor
	restartPolicy runconfig.RestartPolicy

	// failureCount is the number of consecutive runs that did not end
	// successfully; resetMonitor sets it back to zero after a successful run
	failureCount int

	// shouldStop signals the monitor that the next time the container exits it is
	// either because docker or the user asked for the container to be stopped
	shouldStop bool

	// startSignal is a channel that is closed by callback after the container
	// initially starts
	startSignal chan struct{}

	// stopChan is closed by ExitOnNext to signal the monitor while it is waiting
	// for the next restart, so that the timeIncrement is not honored and the
	// user is not left waiting for nothing to happen during this time
	stopChan chan struct{}

	// timeIncrement is the amount of time to wait between restarts;
	// this is in milliseconds
	timeIncrement int

	// lastStartTime is the time at which the monitor last exec'd the container's process
	lastStartTime time.Time
}
    54  
    55  // newContainerMonitor returns an initialized containerMonitor for the provided container
    56  // honoring the provided restart policy
    57  func newContainerMonitor(container *Container, policy runconfig.RestartPolicy) *containerMonitor {
    58  	return &containerMonitor{
    59  		container:     container,
    60  		restartPolicy: policy,
    61  		timeIncrement: defaultTimeIncrement,
    62  		stopChan:      make(chan struct{}),
    63  		startSignal:   make(chan struct{}),
    64  	}
    65  }
    66  
    67  // Stop signals to the container monitor that it should stop monitoring the container
    68  // for exits the next time the process dies
    69  func (m *containerMonitor) ExitOnNext() {
    70  	m.mux.Lock()
    71  
    72  	// we need to protect having a double close of the channel when stop is called
    73  	// twice or else we will get a panic
    74  	if !m.shouldStop {
    75  		m.shouldStop = true
    76  		close(m.stopChan)
    77  	}
    78  
    79  	m.mux.Unlock()
    80  }
    81  
    82  // Close closes the container's resources such as networking allocations and
    83  // unmounts the contatiner's root filesystem
    84  func (m *containerMonitor) Close() error {
    85  	// Cleanup networking and mounts
    86  	m.container.cleanup()
    87  
    88  	// FIXME: here is race condition between two RUN instructions in Dockerfile
    89  	// because they share same runconfig and change image. Must be fixed
    90  	// in builder/builder.go
    91  	if err := m.container.toDisk(); err != nil {
    92  		log.Errorf("Error dumping container %s state to disk: %s", m.container.ID, err)
    93  
    94  		return err
    95  	}
    96  
    97  	return nil
    98  }
    99  
   100  // Start starts the containers process and monitors it according to the restart policy
   101  func (m *containerMonitor) Start() error {
   102  	var (
   103  		err        error
   104  		exitStatus execdriver.ExitStatus
   105  		// this variable indicates where we in execution flow:
   106  		// before Run or after
   107  		afterRun bool
   108  	)
   109  
   110  	// ensure that when the monitor finally exits we release the networking and unmount the rootfs
   111  	defer func() {
   112  		if afterRun {
   113  			m.container.Lock()
   114  			m.container.setStopped(&exitStatus)
   115  			defer m.container.Unlock()
   116  		}
   117  		m.Close()
   118  	}()
   119  
   120  	// reset the restart count
   121  	m.container.RestartCount = -1
   122  
   123  	for {
   124  		m.container.RestartCount++
   125  
   126  		if err := m.container.startLogging(); err != nil {
   127  			m.resetContainer(false)
   128  
   129  			return err
   130  		}
   131  
   132  		pipes := execdriver.NewPipes(m.container.stdin, m.container.stdout, m.container.stderr, m.container.Config.OpenStdin)
   133  
   134  		m.container.LogEvent("start")
   135  
   136  		m.lastStartTime = time.Now()
   137  
   138  		if exitStatus, err = m.container.daemon.Run(m.container, pipes, m.callback); err != nil {
   139  			// if we receive an internal error from the initial start of a container then lets
   140  			// return it instead of entering the restart loop
   141  			if m.container.RestartCount == 0 {
   142  				m.container.ExitCode = -1
   143  				m.resetContainer(false)
   144  
   145  				return err
   146  			}
   147  
   148  			log.Errorf("Error running container: %s", err)
   149  		}
   150  
   151  		// here container.Lock is already lost
   152  		afterRun = true
   153  
   154  		m.resetMonitor(err == nil && exitStatus.ExitCode == 0)
   155  
   156  		if m.shouldRestart(exitStatus.ExitCode) {
   157  			m.container.SetRestarting(&exitStatus)
   158  			if exitStatus.OOMKilled {
   159  				m.container.LogEvent("oom")
   160  			}
   161  			m.container.LogEvent("die")
   162  			m.resetContainer(true)
   163  
   164  			// sleep with a small time increment between each restart to help avoid issues cased by quickly
   165  			// restarting the container because of some types of errors ( networking cut out, etc... )
   166  			m.waitForNextRestart()
   167  
   168  			// we need to check this before reentering the loop because the waitForNextRestart could have
   169  			// been terminated by a request from a user
   170  			if m.shouldStop {
   171  				return err
   172  			}
   173  			continue
   174  		}
   175  		if exitStatus.OOMKilled {
   176  			m.container.LogEvent("oom")
   177  		}
   178  		m.container.LogEvent("die")
   179  		m.resetContainer(true)
   180  		return err
   181  	}
   182  }
   183  
   184  // resetMonitor resets the stateful fields on the containerMonitor based on the
   185  // previous runs success or failure.  Regardless of success, if the container had
   186  // an execution time of more than 10s then reset the timer back to the default
   187  func (m *containerMonitor) resetMonitor(successful bool) {
   188  	executionTime := time.Now().Sub(m.lastStartTime).Seconds()
   189  
   190  	if executionTime > 10 {
   191  		m.timeIncrement = defaultTimeIncrement
   192  	} else {
   193  		// otherwise we need to increment the amount of time we wait before restarting
   194  		// the process.  We will build up by multiplying the increment by 2
   195  		m.timeIncrement *= 2
   196  	}
   197  
   198  	// the container exited successfully so we need to reset the failure counter
   199  	if successful {
   200  		m.failureCount = 0
   201  	} else {
   202  		m.failureCount++
   203  	}
   204  }
   205  
   206  // waitForNextRestart waits with the default time increment to restart the container unless
   207  // a user or docker asks for the container to be stopped
   208  func (m *containerMonitor) waitForNextRestart() {
   209  	select {
   210  	case <-time.After(time.Duration(m.timeIncrement) * time.Millisecond):
   211  	case <-m.stopChan:
   212  	}
   213  }
   214  
   215  // shouldRestart checks the restart policy and applies the rules to determine if
   216  // the container's process should be restarted
   217  func (m *containerMonitor) shouldRestart(exitCode int) bool {
   218  	m.mux.Lock()
   219  	defer m.mux.Unlock()
   220  
   221  	// do not restart if the user or docker has requested that this container be stopped
   222  	if m.shouldStop {
   223  		return false
   224  	}
   225  
   226  	switch m.restartPolicy.Name {
   227  	case "always":
   228  		return true
   229  	case "on-failure":
   230  		// the default value of 0 for MaximumRetryCount means that we will not enforce a maximum count
   231  		if max := m.restartPolicy.MaximumRetryCount; max != 0 && m.failureCount > max {
   232  			log.Debugf("stopping restart of container %s because maximum failure could of %d has been reached",
   233  				common.TruncateID(m.container.ID), max)
   234  			return false
   235  		}
   236  
   237  		return exitCode != 0
   238  	}
   239  
   240  	return false
   241  }
   242  
   243  // callback ensures that the container's state is properly updated after we
   244  // received ack from the execution drivers
   245  func (m *containerMonitor) callback(processConfig *execdriver.ProcessConfig, pid int) {
   246  	if processConfig.Tty {
   247  		// The callback is called after the process Start()
   248  		// so we are in the parent process. In TTY mode, stdin/out/err is the PtySlave
   249  		// which we close here.
   250  		if c, ok := processConfig.Stdout.(io.Closer); ok {
   251  			c.Close()
   252  		}
   253  	}
   254  
   255  	m.container.setRunning(pid)
   256  
   257  	// signal that the process has started
   258  	// close channel only if not closed
   259  	select {
   260  	case <-m.startSignal:
   261  	default:
   262  		close(m.startSignal)
   263  	}
   264  
   265  	if err := m.container.ToDisk(); err != nil {
   266  		log.Debugf("%s", err)
   267  	}
   268  }
   269  
   270  // resetContainer resets the container's IO and ensures that the command is able to be executed again
   271  // by copying the data into a new struct
   272  // if lock is true, then container locked during reset
   273  func (m *containerMonitor) resetContainer(lock bool) {
   274  	container := m.container
   275  	if lock {
   276  		container.Lock()
   277  		defer container.Unlock()
   278  	}
   279  
   280  	if container.Config.OpenStdin {
   281  		if err := container.stdin.Close(); err != nil {
   282  			log.Errorf("%s: Error close stdin: %s", container.ID, err)
   283  		}
   284  	}
   285  
   286  	if err := container.stdout.Clean(); err != nil {
   287  		log.Errorf("%s: Error close stdout: %s", container.ID, err)
   288  	}
   289  
   290  	if err := container.stderr.Clean(); err != nil {
   291  		log.Errorf("%s: Error close stderr: %s", container.ID, err)
   292  	}
   293  
   294  	if container.command != nil && container.command.ProcessConfig.Terminal != nil {
   295  		if err := container.command.ProcessConfig.Terminal.Close(); err != nil {
   296  			log.Errorf("%s: Error closing terminal: %s", container.ID, err)
   297  		}
   298  	}
   299  
   300  	// Re-create a brand new stdin pipe once the container exited
   301  	if container.Config.OpenStdin {
   302  		container.stdin, container.stdinPipe = io.Pipe()
   303  	}
   304  
   305  	if container.logDriver != nil {
   306  		if container.logCopier != nil {
   307  			exit := make(chan struct{})
   308  			go func() {
   309  				container.logCopier.Wait()
   310  				close(exit)
   311  			}()
   312  			select {
   313  			case <-time.After(1 * time.Second):
   314  				log.Warnf("Logger didn't exit in time: logs may be truncated")
   315  			case <-exit:
   316  			}
   317  		}
   318  		container.logDriver.Close()
   319  		container.logCopier = nil
   320  		container.logDriver = nil
   321  	}
   322  
   323  	c := container.command.ProcessConfig.Cmd
   324  
   325  	container.command.ProcessConfig.Cmd = exec.Cmd{
   326  		Stdin:       c.Stdin,
   327  		Stdout:      c.Stdout,
   328  		Stderr:      c.Stderr,
   329  		Path:        c.Path,
   330  		Env:         c.Env,
   331  		ExtraFiles:  c.ExtraFiles,
   332  		Args:        c.Args,
   333  		Dir:         c.Dir,
   334  		SysProcAttr: c.SysProcAttr,
   335  	}
   336  }