github.com/kim0/docker@v0.6.2-0.20161130212042-4addda3f07e7/daemon/health.go (about)

     1  package daemon
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"runtime"
     7  	"strings"
     8  	"sync"
     9  	"time"
    10  
    11  	"golang.org/x/net/context"
    12  
    13  	"github.com/Sirupsen/logrus"
    14  	"github.com/docker/docker/api/types"
    15  	"github.com/docker/docker/api/types/strslice"
    16  	"github.com/docker/docker/container"
    17  	"github.com/docker/docker/daemon/exec"
    18  )
    19  
const (
	// Longest healthcheck probe output message to store, in bytes. Longer messages
	// will be truncated (see limitedBuffer).
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of one run to the start of the next).
	// Also the time before the first probe. monitor uses it when the container's
	// healthcheck configuration leaves Interval unset.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed. monitor uses it when the
	// container's healthcheck configuration leaves Timeout unset.
	defaultProbeTimeout = 30 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy. handleProbeResult uses it
	// when the configured Retries value is zero or negative.
	defaultProbeRetries = 3

	// Maximum number of probe results to keep in the container's health log.
	// handleProbeResult drops the oldest entries once this is reached.
	maxLogEntries = 5
)
    39  
const (
	// Exit status codes that can be returned by the probe command.
	// handleProbeResult counts every code other than exitStatusHealthy
	// (including invalid ones) as a failed check.

	exitStatusHealthy   = 0 // Container is healthy
	exitStatusUnhealthy = 1 // Container is unhealthy
)
    46  
// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check against the given container.
	// On success it returns a HealthcheckResult describing the run (cmdProbe,
	// the implementation in this file, fills in ExitCode, Output and End and
	// leaves Start for monitor to set). A non-nil error means the check could
	// not be carried out; monitor records that as a failed run.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}
    53  
// cmdProbe implements the "CMD" and "CMD-SHELL" probe types (see getProbe).
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	// When set, run prefixes the command with "/bin/sh -c" ("cmd /S /C" on Windows).
	shell bool
}
    59  
    60  // exec the healthcheck command in the container.
    61  // Returns the exit code and probe output (if any)
    62  func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
    63  
    64  	cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
    65  	if p.shell {
    66  		if runtime.GOOS != "windows" {
    67  			cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...)
    68  		} else {
    69  			cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...)
    70  		}
    71  	}
    72  	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
    73  	execConfig := exec.NewConfig()
    74  	execConfig.OpenStdin = false
    75  	execConfig.OpenStdout = true
    76  	execConfig.OpenStderr = true
    77  	execConfig.ContainerID = container.ID
    78  	execConfig.DetachKeys = []byte{}
    79  	execConfig.Entrypoint = entrypoint
    80  	execConfig.Args = args
    81  	execConfig.Tty = false
    82  	execConfig.Privileged = false
    83  	execConfig.User = container.Config.User
    84  
    85  	d.registerExecCommand(container, execConfig)
    86  	d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))
    87  
    88  	output := &limitedBuffer{}
    89  	err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
    90  	if err != nil {
    91  		return nil, err
    92  	}
    93  	info, err := d.getExecConfig(execConfig.ID)
    94  	if err != nil {
    95  		return nil, err
    96  	}
    97  	if info.ExitCode == nil {
    98  		return nil, fmt.Errorf("Healthcheck for container %s has no exit code!", container.ID)
    99  	}
   100  	// Note: Go's json package will handle invalid UTF-8 for us
   101  	out := output.String()
   102  	return &types.HealthcheckResult{
   103  		End:      time.Now(),
   104  		ExitCode: *info.ExitCode,
   105  		Output:   out,
   106  	}, nil
   107  }
   108  
   109  // Update the container's Status.Health struct based on the latest probe's result.
   110  func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) {
   111  	c.Lock()
   112  	defer c.Unlock()
   113  
   114  	retries := c.Config.Healthcheck.Retries
   115  	if retries <= 0 {
   116  		retries = defaultProbeRetries
   117  	}
   118  
   119  	h := c.State.Health
   120  	oldStatus := h.Status
   121  
   122  	if len(h.Log) >= maxLogEntries {
   123  		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
   124  	} else {
   125  		h.Log = append(h.Log, result)
   126  	}
   127  
   128  	if result.ExitCode == exitStatusHealthy {
   129  		h.FailingStreak = 0
   130  		h.Status = types.Healthy
   131  	} else {
   132  		// Failure (including invalid exit code)
   133  		h.FailingStreak++
   134  		if h.FailingStreak >= retries {
   135  			h.Status = types.Unhealthy
   136  		}
   137  		// Else we're starting or healthy. Stay in that state.
   138  	}
   139  
   140  	if oldStatus != h.Status {
   141  		d.LogContainerEvent(c, "health_status: "+h.Status)
   142  	}
   143  }
   144  
   145  // Run the container's monitoring thread until notified via "stop".
   146  // There is never more than one monitor thread running per container at a time.
   147  func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
   148  	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
   149  	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
   150  	for {
   151  		select {
   152  		case <-stop:
   153  			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
   154  			return
   155  		case <-time.After(probeInterval):
   156  			logrus.Debugf("Running health check for container %s ...", c.ID)
   157  			startTime := time.Now()
   158  			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
   159  			results := make(chan *types.HealthcheckResult)
   160  			go func() {
   161  				healthChecksCounter.Inc()
   162  				result, err := probe.run(ctx, d, c)
   163  				if err != nil {
   164  					healthChecksFailedCounter.Inc()
   165  					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
   166  					results <- &types.HealthcheckResult{
   167  						ExitCode: -1,
   168  						Output:   err.Error(),
   169  						Start:    startTime,
   170  						End:      time.Now(),
   171  					}
   172  				} else {
   173  					result.Start = startTime
   174  					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
   175  					results <- result
   176  				}
   177  				close(results)
   178  			}()
   179  			select {
   180  			case <-stop:
   181  				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
   182  				// Stop timeout and kill probe, but don't wait for probe to exit.
   183  				cancelProbe()
   184  				return
   185  			case result := <-results:
   186  				handleProbeResult(d, c, result)
   187  				// Stop timeout
   188  				cancelProbe()
   189  			case <-ctx.Done():
   190  				logrus.Debugf("Health check for container %s taking too long", c.ID)
   191  				handleProbeResult(d, c, &types.HealthcheckResult{
   192  					ExitCode: -1,
   193  					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
   194  					Start:    startTime,
   195  					End:      time.Now(),
   196  				})
   197  				cancelProbe()
   198  				// Wait for probe to exit (it might take a while to respond to the TERM
   199  				// signal and we don't want dying probes to pile up).
   200  				<-results
   201  			}
   202  		}
   203  	}
   204  }
   205  
   206  // Get a suitable probe implementation for the container's healthcheck configuration.
   207  // Nil will be returned if no healthcheck was configured or NONE was set.
   208  func getProbe(c *container.Container) probe {
   209  	config := c.Config.Healthcheck
   210  	if config == nil || len(config.Test) == 0 {
   211  		return nil
   212  	}
   213  	switch config.Test[0] {
   214  	case "CMD":
   215  		return &cmdProbe{shell: false}
   216  	case "CMD-SHELL":
   217  		return &cmdProbe{shell: true}
   218  	default:
   219  		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
   220  		return nil
   221  	}
   222  }
   223  
   224  // Ensure the health-check monitor is running or not, depending on the current
   225  // state of the container.
   226  // Called from monitor.go, with c locked.
   227  func (d *Daemon) updateHealthMonitor(c *container.Container) {
   228  	h := c.State.Health
   229  	if h == nil {
   230  		return // No healthcheck configured
   231  	}
   232  
   233  	probe := getProbe(c)
   234  	wantRunning := c.Running && !c.Paused && probe != nil
   235  	if wantRunning {
   236  		if stop := h.OpenMonitorChannel(); stop != nil {
   237  			go monitor(d, c, stop, probe)
   238  		}
   239  	} else {
   240  		h.CloseMonitorChannel()
   241  	}
   242  }
   243  
   244  // Reset the health state for a newly-started, restarted or restored container.
   245  // initHealthMonitor is called from monitor.go and we should never be running
   246  // two instances at once.
   247  // Called with c locked.
   248  func (d *Daemon) initHealthMonitor(c *container.Container) {
   249  	// If no healthcheck is setup then don't init the monitor
   250  	if getProbe(c) == nil {
   251  		return
   252  	}
   253  
   254  	// This is needed in case we're auto-restarting
   255  	d.stopHealthchecks(c)
   256  
   257  	if h := c.State.Health; h != nil {
   258  		h.Status = types.Starting
   259  		h.FailingStreak = 0
   260  	} else {
   261  		h := &container.Health{}
   262  		h.Status = types.Starting
   263  		c.State.Health = h
   264  	}
   265  
   266  	d.updateHealthMonitor(c)
   267  }
   268  
   269  // Called when the container is being stopped (whether because the health check is
   270  // failing or for any other reason).
   271  func (d *Daemon) stopHealthchecks(c *container.Container) {
   272  	h := c.State.Health
   273  	if h != nil {
   274  		h.CloseMonitorChannel()
   275  	}
   276  }
   277  
   278  // Buffer up to maxOutputLen bytes. Further data is discarded.
   279  type limitedBuffer struct {
   280  	buf       bytes.Buffer
   281  	mu        sync.Mutex
   282  	truncated bool // indicates that data has been lost
   283  }
   284  
   285  // Append to limitedBuffer while there is room.
   286  func (b *limitedBuffer) Write(data []byte) (int, error) {
   287  	b.mu.Lock()
   288  	defer b.mu.Unlock()
   289  
   290  	bufLen := b.buf.Len()
   291  	dataLen := len(data)
   292  	keep := min(maxOutputLen-bufLen, dataLen)
   293  	if keep > 0 {
   294  		b.buf.Write(data[:keep])
   295  	}
   296  	if keep < dataLen {
   297  		b.truncated = true
   298  	}
   299  	return dataLen, nil
   300  }
   301  
   302  // The contents of the buffer, with "..." appended if it overflowed.
   303  func (b *limitedBuffer) String() string {
   304  	b.mu.Lock()
   305  	defer b.mu.Unlock()
   306  
   307  	out := b.buf.String()
   308  	if b.truncated {
   309  		out = out + "..."
   310  	}
   311  	return out
   312  }
   313  
   314  // If configuredValue is zero, use defaultValue instead.
   315  func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
   316  	if configuredValue == 0 {
   317  		return defaultValue
   318  	}
   319  	return configuredValue
   320  }
   321  
   322  func min(x, y int) int {
   323  	if x < y {
   324  		return x
   325  	}
   326  	return y
   327  }