github.com/jiasir/docker@v1.3.3-0.20170609024000-252e610103e7/daemon/health.go (about)

     1  package daemon
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"runtime"
     7  	"strings"
     8  	"sync"
     9  	"time"
    10  
    11  	"golang.org/x/net/context"
    12  
    13  	"github.com/Sirupsen/logrus"
    14  	"github.com/docker/docker/api/types"
    15  	containertypes "github.com/docker/docker/api/types/container"
    16  	"github.com/docker/docker/api/types/strslice"
    17  	"github.com/docker/docker/container"
    18  	"github.com/docker/docker/daemon/exec"
    19  )
    20  
    21  const (
    22  	// Longest healthcheck probe output message to store. Longer messages will be truncated.
    23  	maxOutputLen = 4096
    24  
    25  	// Default interval between probe runs (from the end of the first to the start of the second).
    26  	// Also the time before the first probe.
    27  	defaultProbeInterval = 30 * time.Second
    28  
    29  	// The maximum length of time a single probe run should take. If the probe takes longer
    30  	// than this, the check is considered to have failed.
    31  	defaultProbeTimeout = 30 * time.Second
    32  
    33  	// The time given for the container to start before the health check starts considering
    34  	// the container unstable. Defaults to none.
    35  	defaultStartPeriod = 0 * time.Second
    36  
    37  	// Default number of consecutive failures of the health check
    38  	// for the container to be considered unhealthy.
    39  	defaultProbeRetries = 3
    40  
    41  	// Maximum number of entries to record
    42  	maxLogEntries = 5
    43  )
    44  
    45  const (
    46  	// Exit status codes that can be returned by the probe command.
    47  
    48  	exitStatusHealthy   = 0 // Container is healthy
    49  	exitStatusUnhealthy = 1 // Container is unhealthy
    50  )
    51  
    52  // probe implementations know how to run a particular type of probe.
    53  type probe interface {
    54  	// Perform one run of the check. Returns the exit code and an optional
    55  	// short diagnostic string.
    56  	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
    57  }
    58  
    59  // cmdProbe implements the "CMD" probe type.
    60  type cmdProbe struct {
    61  	// Run the command with the system's default shell instead of execing it directly.
    62  	shell bool
    63  }
    64  
    65  // exec the healthcheck command in the container.
    66  // Returns the exit code and probe output (if any)
    67  func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
    68  	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
    69  	if p.shell {
    70  		cmdSlice = append(getShell(cntr.Config), cmdSlice...)
    71  	}
    72  	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
    73  	execConfig := exec.NewConfig()
    74  	execConfig.OpenStdin = false
    75  	execConfig.OpenStdout = true
    76  	execConfig.OpenStderr = true
    77  	execConfig.ContainerID = cntr.ID
    78  	execConfig.DetachKeys = []byte{}
    79  	execConfig.Entrypoint = entrypoint
    80  	execConfig.Args = args
    81  	execConfig.Tty = false
    82  	execConfig.Privileged = false
    83  	execConfig.User = cntr.Config.User
    84  
    85  	linkedEnv, err := d.setupLinkedContainers(cntr)
    86  	if err != nil {
    87  		return nil, err
    88  	}
    89  	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)
    90  
    91  	d.registerExecCommand(cntr, execConfig)
    92  	d.LogContainerEvent(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))
    93  
    94  	output := &limitedBuffer{}
    95  	err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
    96  	if err != nil {
    97  		return nil, err
    98  	}
    99  	info, err := d.getExecConfig(execConfig.ID)
   100  	if err != nil {
   101  		return nil, err
   102  	}
   103  	if info.ExitCode == nil {
   104  		return nil, fmt.Errorf("Healthcheck for container %s has no exit code!", cntr.ID)
   105  	}
   106  	// Note: Go's json package will handle invalid UTF-8 for us
   107  	out := output.String()
   108  	return &types.HealthcheckResult{
   109  		End:      time.Now(),
   110  		ExitCode: *info.ExitCode,
   111  		Output:   out,
   112  	}, nil
   113  }
   114  
   115  // Update the container's Status.Health struct based on the latest probe's result.
   116  func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
   117  	c.Lock()
   118  	defer c.Unlock()
   119  
   120  	// probe may have been cancelled while waiting on lock. Ignore result then
   121  	select {
   122  	case <-done:
   123  		return
   124  	default:
   125  	}
   126  
   127  	retries := c.Config.Healthcheck.Retries
   128  	if retries <= 0 {
   129  		retries = defaultProbeRetries
   130  	}
   131  
   132  	h := c.State.Health
   133  	oldStatus := h.Status
   134  
   135  	if len(h.Log) >= maxLogEntries {
   136  		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
   137  	} else {
   138  		h.Log = append(h.Log, result)
   139  	}
   140  
   141  	if result.ExitCode == exitStatusHealthy {
   142  		h.FailingStreak = 0
   143  		h.Status = types.Healthy
   144  	} else { // Failure (including invalid exit code)
   145  		shouldIncrementStreak := true
   146  
   147  		// If the container is starting (i.e. we never had a successful health check)
   148  		// then we check if we are within the start period of the container in which
   149  		// case we do not increment the failure streak.
   150  		if h.Status == types.Starting {
   151  			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
   152  			timeSinceStart := result.Start.Sub(c.State.StartedAt)
   153  
   154  			// If still within the start period, then don't increment failing streak.
   155  			if timeSinceStart < startPeriod {
   156  				shouldIncrementStreak = false
   157  			}
   158  		}
   159  
   160  		if shouldIncrementStreak {
   161  			h.FailingStreak++
   162  
   163  			if h.FailingStreak >= retries {
   164  				h.Status = types.Unhealthy
   165  			}
   166  		}
   167  		// Else we're starting or healthy. Stay in that state.
   168  	}
   169  
   170  	if oldStatus != h.Status {
   171  		d.LogContainerEvent(c, "health_status: "+h.Status)
   172  	}
   173  }
   174  
   175  // Run the container's monitoring thread until notified via "stop".
   176  // There is never more than one monitor thread running per container at a time.
   177  func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
   178  	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
   179  	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
   180  	for {
   181  		select {
   182  		case <-stop:
   183  			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
   184  			return
   185  		case <-time.After(probeInterval):
   186  			logrus.Debugf("Running health check for container %s ...", c.ID)
   187  			startTime := time.Now()
   188  			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
   189  			results := make(chan *types.HealthcheckResult)
   190  			go func() {
   191  				healthChecksCounter.Inc()
   192  				result, err := probe.run(ctx, d, c)
   193  				if err != nil {
   194  					healthChecksFailedCounter.Inc()
   195  					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
   196  					results <- &types.HealthcheckResult{
   197  						ExitCode: -1,
   198  						Output:   err.Error(),
   199  						Start:    startTime,
   200  						End:      time.Now(),
   201  					}
   202  				} else {
   203  					result.Start = startTime
   204  					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
   205  					results <- result
   206  				}
   207  				close(results)
   208  			}()
   209  			select {
   210  			case <-stop:
   211  				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
   212  				// Stop timeout and kill probe, but don't wait for probe to exit.
   213  				cancelProbe()
   214  				return
   215  			case result := <-results:
   216  				handleProbeResult(d, c, result, stop)
   217  				// Stop timeout
   218  				cancelProbe()
   219  			case <-ctx.Done():
   220  				logrus.Debugf("Health check for container %s taking too long", c.ID)
   221  				handleProbeResult(d, c, &types.HealthcheckResult{
   222  					ExitCode: -1,
   223  					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
   224  					Start:    startTime,
   225  					End:      time.Now(),
   226  				}, stop)
   227  				cancelProbe()
   228  				// Wait for probe to exit (it might take a while to respond to the TERM
   229  				// signal and we don't want dying probes to pile up).
   230  				<-results
   231  			}
   232  		}
   233  	}
   234  }
   235  
   236  // Get a suitable probe implementation for the container's healthcheck configuration.
   237  // Nil will be returned if no healthcheck was configured or NONE was set.
   238  func getProbe(c *container.Container) probe {
   239  	config := c.Config.Healthcheck
   240  	if config == nil || len(config.Test) == 0 {
   241  		return nil
   242  	}
   243  	switch config.Test[0] {
   244  	case "CMD":
   245  		return &cmdProbe{shell: false}
   246  	case "CMD-SHELL":
   247  		return &cmdProbe{shell: true}
   248  	default:
   249  		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
   250  		return nil
   251  	}
   252  }
   253  
   254  // Ensure the health-check monitor is running or not, depending on the current
   255  // state of the container.
   256  // Called from monitor.go, with c locked.
   257  func (d *Daemon) updateHealthMonitor(c *container.Container) {
   258  	h := c.State.Health
   259  	if h == nil {
   260  		return // No healthcheck configured
   261  	}
   262  
   263  	probe := getProbe(c)
   264  	wantRunning := c.Running && !c.Paused && probe != nil
   265  	if wantRunning {
   266  		if stop := h.OpenMonitorChannel(); stop != nil {
   267  			go monitor(d, c, stop, probe)
   268  		}
   269  	} else {
   270  		h.CloseMonitorChannel()
   271  	}
   272  }
   273  
   274  // Reset the health state for a newly-started, restarted or restored container.
   275  // initHealthMonitor is called from monitor.go and we should never be running
   276  // two instances at once.
   277  // Called with c locked.
   278  func (d *Daemon) initHealthMonitor(c *container.Container) {
   279  	// If no healthcheck is setup then don't init the monitor
   280  	if getProbe(c) == nil {
   281  		return
   282  	}
   283  
   284  	// This is needed in case we're auto-restarting
   285  	d.stopHealthchecks(c)
   286  
   287  	if h := c.State.Health; h != nil {
   288  		h.Status = types.Starting
   289  		h.FailingStreak = 0
   290  	} else {
   291  		h := &container.Health{}
   292  		h.Status = types.Starting
   293  		c.State.Health = h
   294  	}
   295  
   296  	d.updateHealthMonitor(c)
   297  }
   298  
   299  // Called when the container is being stopped (whether because the health check is
   300  // failing or for any other reason).
   301  func (d *Daemon) stopHealthchecks(c *container.Container) {
   302  	h := c.State.Health
   303  	if h != nil {
   304  		h.CloseMonitorChannel()
   305  	}
   306  }
   307  
   308  // Buffer up to maxOutputLen bytes. Further data is discarded.
   309  type limitedBuffer struct {
   310  	buf       bytes.Buffer
   311  	mu        sync.Mutex
   312  	truncated bool // indicates that data has been lost
   313  }
   314  
   315  // Append to limitedBuffer while there is room.
   316  func (b *limitedBuffer) Write(data []byte) (int, error) {
   317  	b.mu.Lock()
   318  	defer b.mu.Unlock()
   319  
   320  	bufLen := b.buf.Len()
   321  	dataLen := len(data)
   322  	keep := min(maxOutputLen-bufLen, dataLen)
   323  	if keep > 0 {
   324  		b.buf.Write(data[:keep])
   325  	}
   326  	if keep < dataLen {
   327  		b.truncated = true
   328  	}
   329  	return dataLen, nil
   330  }
   331  
   332  // The contents of the buffer, with "..." appended if it overflowed.
   333  func (b *limitedBuffer) String() string {
   334  	b.mu.Lock()
   335  	defer b.mu.Unlock()
   336  
   337  	out := b.buf.String()
   338  	if b.truncated {
   339  		out = out + "..."
   340  	}
   341  	return out
   342  }
   343  
   344  // If configuredValue is zero, use defaultValue instead.
   345  func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
   346  	if configuredValue == 0 {
   347  		return defaultValue
   348  	}
   349  	return configuredValue
   350  }
   351  
   352  func min(x, y int) int {
   353  	if x < y {
   354  		return x
   355  	}
   356  	return y
   357  }
   358  
   359  func getShell(config *containertypes.Config) []string {
   360  	if len(config.Shell) != 0 {
   361  		return config.Shell
   362  	}
   363  	if runtime.GOOS != "windows" {
   364  		return []string{"/bin/sh", "-c"}
   365  	}
   366  	return []string{"cmd", "/S", "/C"}
   367  }