github.com/jen20/docker@v1.13.1/daemon/health.go (about)

     1  package daemon
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"runtime"
     7  	"strings"
     8  	"sync"
     9  	"time"
    10  
    11  	"golang.org/x/net/context"
    12  
    13  	"github.com/Sirupsen/logrus"
    14  	"github.com/docker/docker/api/types"
    15  	containertypes "github.com/docker/docker/api/types/container"
    16  	"github.com/docker/docker/api/types/strslice"
    17  	"github.com/docker/docker/container"
    18  	"github.com/docker/docker/daemon/exec"
    19  )
    20  
    21  const (
    22  	// Longest healthcheck probe output message to store. Longer messages will be truncated.
    23  	maxOutputLen = 4096
    24  
    25  	// Default interval between probe runs (from the end of the first to the start of the second).
    26  	// Also the time before the first probe.
    27  	defaultProbeInterval = 30 * time.Second
    28  
    29  	// The maximum length of time a single probe run should take. If the probe takes longer
    30  	// than this, the check is considered to have failed.
    31  	defaultProbeTimeout = 30 * time.Second
    32  
    33  	// Default number of consecutive failures of the health check
    34  	// for the container to be considered unhealthy.
    35  	defaultProbeRetries = 3
    36  
    37  	// Maximum number of entries to record
    38  	maxLogEntries = 5
    39  )
    40  
    41  const (
    42  	// Exit status codes that can be returned by the probe command.
    43  
    44  	exitStatusHealthy   = 0 // Container is healthy
    45  	exitStatusUnhealthy = 1 // Container is unhealthy
    46  )
    47  
    48  // probe implementations know how to run a particular type of probe.
    49  type probe interface {
    50  	// Perform one run of the check. Returns the exit code and an optional
    51  	// short diagnostic string.
    52  	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
    53  }
    54  
    55  // cmdProbe implements the "CMD" probe type.
    56  type cmdProbe struct {
    57  	// Run the command with the system's default shell instead of execing it directly.
    58  	shell bool
    59  }
    60  
    61  // exec the healthcheck command in the container.
    62  // Returns the exit code and probe output (if any)
    63  func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
    64  
    65  	cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
    66  	if p.shell {
    67  		cmdSlice = append(getShell(container.Config), cmdSlice...)
    68  	}
    69  	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
    70  	execConfig := exec.NewConfig()
    71  	execConfig.OpenStdin = false
    72  	execConfig.OpenStdout = true
    73  	execConfig.OpenStderr = true
    74  	execConfig.ContainerID = container.ID
    75  	execConfig.DetachKeys = []byte{}
    76  	execConfig.Entrypoint = entrypoint
    77  	execConfig.Args = args
    78  	execConfig.Tty = false
    79  	execConfig.Privileged = false
    80  	execConfig.User = container.Config.User
    81  
    82  	d.registerExecCommand(container, execConfig)
    83  	d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))
    84  
    85  	output := &limitedBuffer{}
    86  	err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
    87  	if err != nil {
    88  		return nil, err
    89  	}
    90  	info, err := d.getExecConfig(execConfig.ID)
    91  	if err != nil {
    92  		return nil, err
    93  	}
    94  	if info.ExitCode == nil {
    95  		return nil, fmt.Errorf("Healthcheck for container %s has no exit code!", container.ID)
    96  	}
    97  	// Note: Go's json package will handle invalid UTF-8 for us
    98  	out := output.String()
    99  	return &types.HealthcheckResult{
   100  		End:      time.Now(),
   101  		ExitCode: *info.ExitCode,
   102  		Output:   out,
   103  	}, nil
   104  }
   105  
   106  // Update the container's Status.Health struct based on the latest probe's result.
   107  func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
   108  	c.Lock()
   109  	defer c.Unlock()
   110  
   111  	// probe may have been cancelled while waiting on lock. Ignore result then
   112  	select {
   113  	case <-done:
   114  		return
   115  	default:
   116  	}
   117  
   118  	retries := c.Config.Healthcheck.Retries
   119  	if retries <= 0 {
   120  		retries = defaultProbeRetries
   121  	}
   122  
   123  	h := c.State.Health
   124  	oldStatus := h.Status
   125  
   126  	if len(h.Log) >= maxLogEntries {
   127  		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
   128  	} else {
   129  		h.Log = append(h.Log, result)
   130  	}
   131  
   132  	if result.ExitCode == exitStatusHealthy {
   133  		h.FailingStreak = 0
   134  		h.Status = types.Healthy
   135  	} else {
   136  		// Failure (including invalid exit code)
   137  		h.FailingStreak++
   138  		if h.FailingStreak >= retries {
   139  			h.Status = types.Unhealthy
   140  		}
   141  		// Else we're starting or healthy. Stay in that state.
   142  	}
   143  
   144  	if oldStatus != h.Status {
   145  		d.LogContainerEvent(c, "health_status: "+h.Status)
   146  	}
   147  }
   148  
   149  // Run the container's monitoring thread until notified via "stop".
   150  // There is never more than one monitor thread running per container at a time.
   151  func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
   152  	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
   153  	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
   154  	for {
   155  		select {
   156  		case <-stop:
   157  			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
   158  			return
   159  		case <-time.After(probeInterval):
   160  			logrus.Debugf("Running health check for container %s ...", c.ID)
   161  			startTime := time.Now()
   162  			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
   163  			results := make(chan *types.HealthcheckResult)
   164  			go func() {
   165  				healthChecksCounter.Inc()
   166  				result, err := probe.run(ctx, d, c)
   167  				if err != nil {
   168  					healthChecksFailedCounter.Inc()
   169  					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
   170  					results <- &types.HealthcheckResult{
   171  						ExitCode: -1,
   172  						Output:   err.Error(),
   173  						Start:    startTime,
   174  						End:      time.Now(),
   175  					}
   176  				} else {
   177  					result.Start = startTime
   178  					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
   179  					results <- result
   180  				}
   181  				close(results)
   182  			}()
   183  			select {
   184  			case <-stop:
   185  				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
   186  				// Stop timeout and kill probe, but don't wait for probe to exit.
   187  				cancelProbe()
   188  				return
   189  			case result := <-results:
   190  				handleProbeResult(d, c, result, stop)
   191  				// Stop timeout
   192  				cancelProbe()
   193  			case <-ctx.Done():
   194  				logrus.Debugf("Health check for container %s taking too long", c.ID)
   195  				handleProbeResult(d, c, &types.HealthcheckResult{
   196  					ExitCode: -1,
   197  					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
   198  					Start:    startTime,
   199  					End:      time.Now(),
   200  				}, stop)
   201  				cancelProbe()
   202  				// Wait for probe to exit (it might take a while to respond to the TERM
   203  				// signal and we don't want dying probes to pile up).
   204  				<-results
   205  			}
   206  		}
   207  	}
   208  }
   209  
   210  // Get a suitable probe implementation for the container's healthcheck configuration.
   211  // Nil will be returned if no healthcheck was configured or NONE was set.
   212  func getProbe(c *container.Container) probe {
   213  	config := c.Config.Healthcheck
   214  	if config == nil || len(config.Test) == 0 {
   215  		return nil
   216  	}
   217  	switch config.Test[0] {
   218  	case "CMD":
   219  		return &cmdProbe{shell: false}
   220  	case "CMD-SHELL":
   221  		return &cmdProbe{shell: true}
   222  	default:
   223  		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
   224  		return nil
   225  	}
   226  }
   227  
   228  // Ensure the health-check monitor is running or not, depending on the current
   229  // state of the container.
   230  // Called from monitor.go, with c locked.
   231  func (d *Daemon) updateHealthMonitor(c *container.Container) {
   232  	h := c.State.Health
   233  	if h == nil {
   234  		return // No healthcheck configured
   235  	}
   236  
   237  	probe := getProbe(c)
   238  	wantRunning := c.Running && !c.Paused && probe != nil
   239  	if wantRunning {
   240  		if stop := h.OpenMonitorChannel(); stop != nil {
   241  			go monitor(d, c, stop, probe)
   242  		}
   243  	} else {
   244  		h.CloseMonitorChannel()
   245  	}
   246  }
   247  
   248  // Reset the health state for a newly-started, restarted or restored container.
   249  // initHealthMonitor is called from monitor.go and we should never be running
   250  // two instances at once.
   251  // Called with c locked.
   252  func (d *Daemon) initHealthMonitor(c *container.Container) {
   253  	// If no healthcheck is setup then don't init the monitor
   254  	if getProbe(c) == nil {
   255  		return
   256  	}
   257  
   258  	// This is needed in case we're auto-restarting
   259  	d.stopHealthchecks(c)
   260  
   261  	if h := c.State.Health; h != nil {
   262  		h.Status = types.Starting
   263  		h.FailingStreak = 0
   264  	} else {
   265  		h := &container.Health{}
   266  		h.Status = types.Starting
   267  		c.State.Health = h
   268  	}
   269  
   270  	d.updateHealthMonitor(c)
   271  }
   272  
   273  // Called when the container is being stopped (whether because the health check is
   274  // failing or for any other reason).
   275  func (d *Daemon) stopHealthchecks(c *container.Container) {
   276  	h := c.State.Health
   277  	if h != nil {
   278  		h.CloseMonitorChannel()
   279  	}
   280  }
   281  
   282  // Buffer up to maxOutputLen bytes. Further data is discarded.
   283  type limitedBuffer struct {
   284  	buf       bytes.Buffer
   285  	mu        sync.Mutex
   286  	truncated bool // indicates that data has been lost
   287  }
   288  
   289  // Append to limitedBuffer while there is room.
   290  func (b *limitedBuffer) Write(data []byte) (int, error) {
   291  	b.mu.Lock()
   292  	defer b.mu.Unlock()
   293  
   294  	bufLen := b.buf.Len()
   295  	dataLen := len(data)
   296  	keep := min(maxOutputLen-bufLen, dataLen)
   297  	if keep > 0 {
   298  		b.buf.Write(data[:keep])
   299  	}
   300  	if keep < dataLen {
   301  		b.truncated = true
   302  	}
   303  	return dataLen, nil
   304  }
   305  
   306  // The contents of the buffer, with "..." appended if it overflowed.
   307  func (b *limitedBuffer) String() string {
   308  	b.mu.Lock()
   309  	defer b.mu.Unlock()
   310  
   311  	out := b.buf.String()
   312  	if b.truncated {
   313  		out = out + "..."
   314  	}
   315  	return out
   316  }
   317  
   318  // If configuredValue is zero, use defaultValue instead.
   319  func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
   320  	if configuredValue == 0 {
   321  		return defaultValue
   322  	}
   323  	return configuredValue
   324  }
   325  
   326  func min(x, y int) int {
   327  	if x < y {
   328  		return x
   329  	}
   330  	return y
   331  }
   332  
   333  func getShell(config *containertypes.Config) []string {
   334  	if len(config.Shell) != 0 {
   335  		return config.Shell
   336  	}
   337  	if runtime.GOOS != "windows" {
   338  		return []string{"/bin/sh", "-c"}
   339  	}
   340  	return []string{"cmd", "/S", "/C"}
   341  }