github.com/noxiouz/docker@v0.7.3-0.20160629055221-3d231c78e8c5/daemon/health.go (about)

     1  package daemon
     2  
import (
	"bytes"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/context"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/docker/engine-api/types"
	"github.com/docker/engine-api/types/strslice"
)
    18  
    19  const (
    20  	// Longest healthcheck probe output message to store. Longer messages will be truncated.
    21  	maxOutputLen = 4096
    22  
    23  	// Default interval between probe runs (from the end of the first to the start of the second).
    24  	// Also the time before the first probe.
    25  	defaultProbeInterval = 30 * time.Second
    26  
    27  	// The maximum length of time a single probe run should take. If the probe takes longer
    28  	// than this, the check is considered to have failed.
    29  	defaultProbeTimeout = 30 * time.Second
    30  
    31  	// Default number of consecutive failures of the health check
    32  	// for the container to be considered unhealthy.
    33  	defaultProbeRetries = 3
    34  
    35  	// Maximum number of entries to record
    36  	maxLogEntries = 5
    37  )
    38  
    39  const (
    40  	// Exit status codes that can be returned by the probe command.
    41  
    42  	exitStatusHealthy   = 0 // Container is healthy
    43  	exitStatusUnhealthy = 1 // Container is unhealthy
    44  	exitStatusStarting  = 2 // Container needs more time to start
    45  )
    46  
    47  // probe implementations know how to run a particular type of probe.
    48  type probe interface {
    49  	// Perform one run of the check. Returns the exit code and an optional
    50  	// short diagnostic string.
    51  	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
    52  }
    53  
    54  // cmdProbe implements the "CMD" probe type.
    55  type cmdProbe struct {
    56  	// Run the command with the system's default shell instead of execing it directly.
    57  	shell bool
    58  }
    59  
    60  // exec the healthcheck command in the container.
    61  // Returns the exit code and probe output (if any)
    62  func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
    63  	cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
    64  	if p.shell {
    65  		if runtime.GOOS != "windows" {
    66  			cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...)
    67  		} else {
    68  			cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...)
    69  		}
    70  	}
    71  	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
    72  	execConfig := exec.NewConfig()
    73  	execConfig.OpenStdin = false
    74  	execConfig.OpenStdout = true
    75  	execConfig.OpenStderr = true
    76  	execConfig.ContainerID = container.ID
    77  	execConfig.DetachKeys = []byte{}
    78  	execConfig.Entrypoint = entrypoint
    79  	execConfig.Args = args
    80  	execConfig.Tty = false
    81  	execConfig.Privileged = false
    82  	execConfig.User = container.Config.User
    83  
    84  	d.registerExecCommand(container, execConfig)
    85  	d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))
    86  
    87  	output := &limitedBuffer{}
    88  	err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
    89  	if err != nil {
    90  		return nil, err
    91  	}
    92  	info, err := d.getExecConfig(execConfig.ID)
    93  	if err != nil {
    94  		return nil, err
    95  	}
    96  	if info.ExitCode == nil {
    97  		return nil, fmt.Errorf("Healthcheck has no exit code!")
    98  	}
    99  	// Note: Go's json package will handle invalid UTF-8 for us
   100  	out := output.String()
   101  	return &types.HealthcheckResult{
   102  		End:      time.Now(),
   103  		ExitCode: *info.ExitCode,
   104  		Output:   out,
   105  	}, nil
   106  }
   107  
   108  // Update the container's Status.Health struct based on the latest probe's result.
   109  func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) {
   110  	c.Lock()
   111  	defer c.Unlock()
   112  
   113  	retries := c.Config.Healthcheck.Retries
   114  	if retries <= 0 {
   115  		retries = defaultProbeRetries
   116  	}
   117  
   118  	h := c.State.Health
   119  	oldStatus := h.Status
   120  
   121  	if len(h.Log) >= maxLogEntries {
   122  		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
   123  	} else {
   124  		h.Log = append(h.Log, result)
   125  	}
   126  
   127  	if result.ExitCode == exitStatusHealthy {
   128  		h.FailingStreak = 0
   129  		h.Status = types.Healthy
   130  	} else if result.ExitCode == exitStatusStarting && c.State.Health.Status == types.Starting {
   131  		// The container is not ready yet. Remain in the starting state.
   132  	} else {
   133  		// Failure (including invalid exit code)
   134  		h.FailingStreak++
   135  		if c.State.Health.FailingStreak >= retries {
   136  			h.Status = types.Unhealthy
   137  		}
   138  		// Else we're starting or healthy. Stay in that state.
   139  	}
   140  
   141  	if oldStatus != h.Status {
   142  		d.LogContainerEvent(c, "health_status: "+h.Status)
   143  	}
   144  }
   145  
   146  // Run the container's monitoring thread until notified via "stop".
   147  // There is never more than one monitor thread running per container at a time.
   148  func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
   149  	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
   150  	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
   151  	for {
   152  		select {
   153  		case <-stop:
   154  			logrus.Debug("Stop healthcheck monitoring (received while idle)")
   155  			return
   156  		case <-time.After(probeInterval):
   157  			logrus.Debug("Running health check...")
   158  			startTime := time.Now()
   159  			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
   160  			results := make(chan *types.HealthcheckResult)
   161  			go func() {
   162  				result, err := probe.run(ctx, d, c)
   163  				if err != nil {
   164  					logrus.Warnf("Health check error: %v", err)
   165  					results <- &types.HealthcheckResult{
   166  						ExitCode: -1,
   167  						Output:   err.Error(),
   168  						Start:    startTime,
   169  						End:      time.Now(),
   170  					}
   171  				} else {
   172  					result.Start = startTime
   173  					logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode)
   174  					results <- result
   175  				}
   176  				close(results)
   177  			}()
   178  			select {
   179  			case <-stop:
   180  				logrus.Debug("Stop healthcheck monitoring (received while probing)")
   181  				// Stop timeout and kill probe, but don't wait for probe to exit.
   182  				cancelProbe()
   183  				return
   184  			case result := <-results:
   185  				handleProbeResult(d, c, result)
   186  				// Stop timeout
   187  				cancelProbe()
   188  			case <-ctx.Done():
   189  				logrus.Debug("Health check taking too long")
   190  				handleProbeResult(d, c, &types.HealthcheckResult{
   191  					ExitCode: -1,
   192  					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
   193  					Start:    startTime,
   194  					End:      time.Now(),
   195  				})
   196  				cancelProbe()
   197  				// Wait for probe to exit (it might take a while to respond to the TERM
   198  				// signal and we don't want dying probes to pile up).
   199  				<-results
   200  			}
   201  		}
   202  	}
   203  }
   204  
   205  // Get a suitable probe implementation for the container's healthcheck configuration.
   206  func getProbe(c *container.Container) probe {
   207  	config := c.Config.Healthcheck
   208  	if config == nil || len(config.Test) == 0 {
   209  		return nil
   210  	}
   211  	switch config.Test[0] {
   212  	case "CMD":
   213  		return &cmdProbe{shell: false}
   214  	case "CMD-SHELL":
   215  		return &cmdProbe{shell: true}
   216  	default:
   217  		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0])
   218  		return nil
   219  	}
   220  }
   221  
   222  // Ensure the health-check monitor is running or not, depending on the current
   223  // state of the container.
   224  // Called from monitor.go, with c locked.
   225  func (d *Daemon) updateHealthMonitor(c *container.Container) {
   226  	h := c.State.Health
   227  	if h == nil {
   228  		return // No healthcheck configured
   229  	}
   230  
   231  	probe := getProbe(c)
   232  	wantRunning := c.Running && !c.Paused && probe != nil
   233  	if wantRunning {
   234  		if stop := h.OpenMonitorChannel(); stop != nil {
   235  			go monitor(d, c, stop, probe)
   236  		}
   237  	} else {
   238  		h.CloseMonitorChannel()
   239  	}
   240  }
   241  
   242  // Reset the health state for a newly-started, restarted or restored container.
   243  // initHealthMonitor is called from monitor.go and we should never be running
   244  // two instances at once.
   245  // Called with c locked.
   246  func (d *Daemon) initHealthMonitor(c *container.Container) {
   247  	if c.Config.Healthcheck == nil {
   248  		return
   249  	}
   250  
   251  	// This is needed in case we're auto-restarting
   252  	d.stopHealthchecks(c)
   253  
   254  	if c.State.Health == nil {
   255  		h := &container.Health{}
   256  		h.Status = types.Starting
   257  		h.FailingStreak = 0
   258  		c.State.Health = h
   259  	}
   260  
   261  	d.updateHealthMonitor(c)
   262  }
   263  
   264  // Called when the container is being stopped (whether because the health check is
   265  // failing or for any other reason).
   266  func (d *Daemon) stopHealthchecks(c *container.Container) {
   267  	h := c.State.Health
   268  	if h != nil {
   269  		h.CloseMonitorChannel()
   270  	}
   271  }
   272  
   273  // Buffer up to maxOutputLen bytes. Further data is discarded.
   274  type limitedBuffer struct {
   275  	buf       bytes.Buffer
   276  	truncated bool // indicates that data has been lost
   277  }
   278  
   279  // Append to limitedBuffer while there is room.
   280  func (b *limitedBuffer) Write(data []byte) (int, error) {
   281  	bufLen := b.buf.Len()
   282  	dataLen := len(data)
   283  	keep := min(maxOutputLen-bufLen, dataLen)
   284  	if keep > 0 {
   285  		b.buf.Write(data[:keep])
   286  	}
   287  	if keep < dataLen {
   288  		b.truncated = true
   289  	}
   290  	return dataLen, nil
   291  }
   292  
   293  // The contents of the buffer, with "..." appended if it overflowed.
   294  func (b *limitedBuffer) String() string {
   295  	out := b.buf.String()
   296  	if b.truncated {
   297  		out = out + "..."
   298  	}
   299  	return out
   300  }
   301  
   302  // If configuredValue is zero, use defaultValue instead.
   303  func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
   304  	if configuredValue == 0 {
   305  		return defaultValue
   306  	}
   307  	return configuredValue
   308  }
   309  
   310  func min(x, y int) int {
   311  	if x < y {
   312  		return x
   313  	}
   314  	return y
   315  }