github.com/brahmaroutu/docker@v1.2.1-0.20160809185609-eb28dde01f16/daemon/health.go (about)

     1  package daemon
     2  
import (
	"bytes"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/context"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/docker/engine-api/types"
	"github.com/docker/engine-api/types/strslice"
)
    18  
const (
	// maxOutputLen is the longest healthcheck probe output, in bytes, that is
	// stored. Output beyond this length is discarded and the stored message is
	// marked as truncated.
	maxOutputLen = 4096

	// defaultProbeInterval is the interval between probe runs (from the end of
	// one run to the start of the next) used when the healthcheck configuration
	// leaves the interval at zero. It is also the delay before the first probe.
	defaultProbeInterval = 30 * time.Second

	// defaultProbeTimeout is the maximum length of time a single probe run
	// should take when the healthcheck configuration leaves the timeout at
	// zero. If the probe takes longer than this, the check is considered to
	// have failed.
	defaultProbeTimeout = 30 * time.Second

	// defaultProbeRetries is the number of consecutive failures of the health
	// check for the container to be considered unhealthy. It applies when the
	// configured retry count is zero or negative.
	defaultProbeRetries = 3

	// maxLogEntries is the maximum number of probe results kept in the
	// container's health log; older entries are dropped first.
	maxLogEntries = 5
)
    38  
const (
	// Exit status codes that can be returned by the probe command.
	// Any status other than exitStatusHealthy is counted as a failed check
	// when the probe result is processed.

	exitStatusHealthy   = 0 // Container is healthy
	exitStatusUnhealthy = 1 // Container is unhealthy
)
    45  
// probe implementations know how to run a particular type of probe.
type probe interface {
	// run performs one run of the check against the given container.
	// On success it returns a result describing the run (for the cmdProbe
	// implementation: exit code, captured output and end time). A non-nil
	// error means the check could not be carried out; the result is nil then.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}
    52  
// cmdProbe implements the "CMD" and "CMD-SHELL" probe types, both of which
// execute a command inside the container.
type cmdProbe struct {
	// shell selects the "CMD-SHELL" behavior: run the command with the
	// system's default shell ("/bin/sh -c", or "cmd /S /C" on Windows)
	// instead of execing it directly.
	shell bool
}
    58  
    59  // exec the healthcheck command in the container.
    60  // Returns the exit code and probe output (if any)
    61  func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
    62  	cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
    63  	if p.shell {
    64  		if runtime.GOOS != "windows" {
    65  			cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...)
    66  		} else {
    67  			cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...)
    68  		}
    69  	}
    70  	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
    71  	execConfig := exec.NewConfig()
    72  	execConfig.OpenStdin = false
    73  	execConfig.OpenStdout = true
    74  	execConfig.OpenStderr = true
    75  	execConfig.ContainerID = container.ID
    76  	execConfig.DetachKeys = []byte{}
    77  	execConfig.Entrypoint = entrypoint
    78  	execConfig.Args = args
    79  	execConfig.Tty = false
    80  	execConfig.Privileged = false
    81  	execConfig.User = container.Config.User
    82  
    83  	d.registerExecCommand(container, execConfig)
    84  	d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))
    85  
    86  	output := &limitedBuffer{}
    87  	err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
    88  	if err != nil {
    89  		return nil, err
    90  	}
    91  	info, err := d.getExecConfig(execConfig.ID)
    92  	if err != nil {
    93  		return nil, err
    94  	}
    95  	if info.ExitCode == nil {
    96  		return nil, fmt.Errorf("Healthcheck has no exit code!")
    97  	}
    98  	// Note: Go's json package will handle invalid UTF-8 for us
    99  	out := output.String()
   100  	return &types.HealthcheckResult{
   101  		End:      time.Now(),
   102  		ExitCode: *info.ExitCode,
   103  		Output:   out,
   104  	}, nil
   105  }
   106  
   107  // Update the container's Status.Health struct based on the latest probe's result.
   108  func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) {
   109  	c.Lock()
   110  	defer c.Unlock()
   111  
   112  	retries := c.Config.Healthcheck.Retries
   113  	if retries <= 0 {
   114  		retries = defaultProbeRetries
   115  	}
   116  
   117  	h := c.State.Health
   118  	oldStatus := h.Status
   119  
   120  	if len(h.Log) >= maxLogEntries {
   121  		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
   122  	} else {
   123  		h.Log = append(h.Log, result)
   124  	}
   125  
   126  	if result.ExitCode == exitStatusHealthy {
   127  		h.FailingStreak = 0
   128  		h.Status = types.Healthy
   129  	} else {
   130  		// Failure (including invalid exit code)
   131  		h.FailingStreak++
   132  		if h.FailingStreak >= retries {
   133  			h.Status = types.Unhealthy
   134  		}
   135  		// Else we're starting or healthy. Stay in that state.
   136  	}
   137  
   138  	if oldStatus != h.Status {
   139  		d.LogContainerEvent(c, "health_status: "+h.Status)
   140  	}
   141  }
   142  
   143  // Run the container's monitoring thread until notified via "stop".
   144  // There is never more than one monitor thread running per container at a time.
   145  func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
   146  	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
   147  	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
   148  	for {
   149  		select {
   150  		case <-stop:
   151  			logrus.Debug("Stop healthcheck monitoring (received while idle)")
   152  			return
   153  		case <-time.After(probeInterval):
   154  			logrus.Debug("Running health check...")
   155  			startTime := time.Now()
   156  			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
   157  			results := make(chan *types.HealthcheckResult)
   158  			go func() {
   159  				result, err := probe.run(ctx, d, c)
   160  				if err != nil {
   161  					logrus.Warnf("Health check error: %v", err)
   162  					results <- &types.HealthcheckResult{
   163  						ExitCode: -1,
   164  						Output:   err.Error(),
   165  						Start:    startTime,
   166  						End:      time.Now(),
   167  					}
   168  				} else {
   169  					result.Start = startTime
   170  					logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode)
   171  					results <- result
   172  				}
   173  				close(results)
   174  			}()
   175  			select {
   176  			case <-stop:
   177  				logrus.Debug("Stop healthcheck monitoring (received while probing)")
   178  				// Stop timeout and kill probe, but don't wait for probe to exit.
   179  				cancelProbe()
   180  				return
   181  			case result := <-results:
   182  				handleProbeResult(d, c, result)
   183  				// Stop timeout
   184  				cancelProbe()
   185  			case <-ctx.Done():
   186  				logrus.Debug("Health check taking too long")
   187  				handleProbeResult(d, c, &types.HealthcheckResult{
   188  					ExitCode: -1,
   189  					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
   190  					Start:    startTime,
   191  					End:      time.Now(),
   192  				})
   193  				cancelProbe()
   194  				// Wait for probe to exit (it might take a while to respond to the TERM
   195  				// signal and we don't want dying probes to pile up).
   196  				<-results
   197  			}
   198  		}
   199  	}
   200  }
   201  
   202  // Get a suitable probe implementation for the container's healthcheck configuration.
   203  // Nil will be returned if no healthcheck was configured or NONE was set.
   204  func getProbe(c *container.Container) probe {
   205  	config := c.Config.Healthcheck
   206  	if config == nil || len(config.Test) == 0 {
   207  		return nil
   208  	}
   209  	switch config.Test[0] {
   210  	case "CMD":
   211  		return &cmdProbe{shell: false}
   212  	case "CMD-SHELL":
   213  		return &cmdProbe{shell: true}
   214  	default:
   215  		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0])
   216  		return nil
   217  	}
   218  }
   219  
   220  // Ensure the health-check monitor is running or not, depending on the current
   221  // state of the container.
   222  // Called from monitor.go, with c locked.
   223  func (d *Daemon) updateHealthMonitor(c *container.Container) {
   224  	h := c.State.Health
   225  	if h == nil {
   226  		return // No healthcheck configured
   227  	}
   228  
   229  	probe := getProbe(c)
   230  	wantRunning := c.Running && !c.Paused && probe != nil
   231  	if wantRunning {
   232  		if stop := h.OpenMonitorChannel(); stop != nil {
   233  			go monitor(d, c, stop, probe)
   234  		}
   235  	} else {
   236  		h.CloseMonitorChannel()
   237  	}
   238  }
   239  
   240  // Reset the health state for a newly-started, restarted or restored container.
   241  // initHealthMonitor is called from monitor.go and we should never be running
   242  // two instances at once.
   243  // Called with c locked.
   244  func (d *Daemon) initHealthMonitor(c *container.Container) {
   245  	// If no healthcheck is setup then don't init the monitor
   246  	if getProbe(c) == nil {
   247  		return
   248  	}
   249  
   250  	// This is needed in case we're auto-restarting
   251  	d.stopHealthchecks(c)
   252  
   253  	if c.State.Health == nil {
   254  		h := &container.Health{}
   255  		h.Status = types.Starting
   256  		c.State.Health = h
   257  	}
   258  
   259  	d.updateHealthMonitor(c)
   260  }
   261  
   262  // Called when the container is being stopped (whether because the health check is
   263  // failing or for any other reason).
   264  func (d *Daemon) stopHealthchecks(c *container.Container) {
   265  	h := c.State.Health
   266  	if h != nil {
   267  		h.CloseMonitorChannel()
   268  	}
   269  }
   270  
   271  // Buffer up to maxOutputLen bytes. Further data is discarded.
   272  type limitedBuffer struct {
   273  	buf       bytes.Buffer
   274  	truncated bool // indicates that data has been lost
   275  }
   276  
   277  // Append to limitedBuffer while there is room.
   278  func (b *limitedBuffer) Write(data []byte) (int, error) {
   279  	bufLen := b.buf.Len()
   280  	dataLen := len(data)
   281  	keep := min(maxOutputLen-bufLen, dataLen)
   282  	if keep > 0 {
   283  		b.buf.Write(data[:keep])
   284  	}
   285  	if keep < dataLen {
   286  		b.truncated = true
   287  	}
   288  	return dataLen, nil
   289  }
   290  
   291  // The contents of the buffer, with "..." appended if it overflowed.
   292  func (b *limitedBuffer) String() string {
   293  	out := b.buf.String()
   294  	if b.truncated {
   295  		out = out + "..."
   296  	}
   297  	return out
   298  }
   299  
   300  // If configuredValue is zero, use defaultValue instead.
   301  func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
   302  	if configuredValue == 0 {
   303  		return defaultValue
   304  	}
   305  	return configuredValue
   306  }
   307  
   308  func min(x, y int) int {
   309  	if x < y {
   310  		return x
   311  	}
   312  	return y
   313  }