github.com/DaoCloud/dao@v0.0.0-20161212064103-c3dbfd13ee36/daemon/health.go (about)

     1  package daemon
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"runtime"
     7  	"strings"
     8  	"sync"
     9  	"time"
    10  
    11  	"golang.org/x/net/context"
    12  
    13  	"github.com/Sirupsen/logrus"
    14  	"github.com/docker/docker/container"
    15  	"github.com/docker/docker/daemon/exec"
    16  	"github.com/docker/engine-api/types"
    17  	"github.com/docker/engine-api/types/strslice"
    18  )
    19  
    20  const (
    21  	// Longest healthcheck probe output message to store. Longer messages will be truncated.
    22  	maxOutputLen = 4096
    23  
    24  	// Default interval between probe runs (from the end of the first to the start of the second).
    25  	// Also the time before the first probe.
    26  	defaultProbeInterval = 30 * time.Second
    27  
    28  	// The maximum length of time a single probe run should take. If the probe takes longer
    29  	// than this, the check is considered to have failed.
    30  	defaultProbeTimeout = 30 * time.Second
    31  
    32  	// Default number of consecutive failures of the health check
    33  	// for the container to be considered unhealthy.
    34  	defaultProbeRetries = 3
    35  
    36  	// Maximum number of entries to record
    37  	maxLogEntries = 5
    38  )
    39  
    40  const (
    41  	// Exit status codes that can be returned by the probe command.
    42  
    43  	exitStatusHealthy   = 0 // Container is healthy
    44  	exitStatusUnhealthy = 1 // Container is unhealthy
    45  )
    46  
    47  // probe implementations know how to run a particular type of probe.
    48  type probe interface {
    49  	// Perform one run of the check. Returns the exit code and an optional
    50  	// short diagnostic string.
    51  	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
    52  }
    53  
    54  // cmdProbe implements the "CMD" probe type.
    55  type cmdProbe struct {
    56  	// Run the command with the system's default shell instead of execing it directly.
    57  	shell bool
    58  }
    59  
    60  // exec the healthcheck command in the container.
    61  // Returns the exit code and probe output (if any)
    62  func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
    63  	cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
    64  	if p.shell {
    65  		if runtime.GOOS != "windows" {
    66  			cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...)
    67  		} else {
    68  			cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...)
    69  		}
    70  	}
    71  	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
    72  	execConfig := exec.NewConfig()
    73  	execConfig.OpenStdin = false
    74  	execConfig.OpenStdout = true
    75  	execConfig.OpenStderr = true
    76  	execConfig.ContainerID = container.ID
    77  	execConfig.DetachKeys = []byte{}
    78  	execConfig.Entrypoint = entrypoint
    79  	execConfig.Args = args
    80  	execConfig.Tty = false
    81  	execConfig.Privileged = false
    82  	execConfig.User = container.Config.User
    83  
    84  	d.registerExecCommand(container, execConfig)
    85  	d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))
    86  
    87  	output := &limitedBuffer{}
    88  	err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
    89  	if err != nil {
    90  		return nil, err
    91  	}
    92  	info, err := d.getExecConfig(execConfig.ID)
    93  	if err != nil {
    94  		return nil, err
    95  	}
    96  	if info.ExitCode == nil {
    97  		return nil, fmt.Errorf("Healthcheck has no exit code!")
    98  	}
    99  	// Note: Go's json package will handle invalid UTF-8 for us
   100  	out := output.String()
   101  	return &types.HealthcheckResult{
   102  		End:      time.Now(),
   103  		ExitCode: *info.ExitCode,
   104  		Output:   out,
   105  	}, nil
   106  }
   107  
   108  // Update the container's Status.Health struct based on the latest probe's result.
   109  func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) {
   110  	c.Lock()
   111  	defer c.Unlock()
   112  
   113  	retries := c.Config.Healthcheck.Retries
   114  	if retries <= 0 {
   115  		retries = defaultProbeRetries
   116  	}
   117  
   118  	h := c.State.Health
   119  	oldStatus := h.Status
   120  
   121  	if len(h.Log) >= maxLogEntries {
   122  		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
   123  	} else {
   124  		h.Log = append(h.Log, result)
   125  	}
   126  
   127  	if result.ExitCode == exitStatusHealthy {
   128  		h.FailingStreak = 0
   129  		h.Status = types.Healthy
   130  	} else {
   131  		// Failure (including invalid exit code)
   132  		h.FailingStreak++
   133  		if h.FailingStreak >= retries {
   134  			h.Status = types.Unhealthy
   135  		}
   136  		// Else we're starting or healthy. Stay in that state.
   137  	}
   138  
   139  	if oldStatus != h.Status {
   140  		d.LogContainerEvent(c, "health_status: "+h.Status)
   141  	}
   142  }
   143  
   144  // Run the container's monitoring thread until notified via "stop".
   145  // There is never more than one monitor thread running per container at a time.
   146  func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
   147  	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
   148  	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
   149  	for {
   150  		select {
   151  		case <-stop:
   152  			logrus.Debug("Stop healthcheck monitoring (received while idle)")
   153  			return
   154  		case <-time.After(probeInterval):
   155  			logrus.Debug("Running health check...")
   156  			startTime := time.Now()
   157  			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
   158  			results := make(chan *types.HealthcheckResult)
   159  			go func() {
   160  				result, err := probe.run(ctx, d, c)
   161  				if err != nil {
   162  					logrus.Warnf("Health check error: %v", err)
   163  					results <- &types.HealthcheckResult{
   164  						ExitCode: -1,
   165  						Output:   err.Error(),
   166  						Start:    startTime,
   167  						End:      time.Now(),
   168  					}
   169  				} else {
   170  					result.Start = startTime
   171  					logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode)
   172  					results <- result
   173  				}
   174  				close(results)
   175  			}()
   176  			select {
   177  			case <-stop:
   178  				logrus.Debug("Stop healthcheck monitoring (received while probing)")
   179  				// Stop timeout and kill probe, but don't wait for probe to exit.
   180  				cancelProbe()
   181  				return
   182  			case result := <-results:
   183  				handleProbeResult(d, c, result)
   184  				// Stop timeout
   185  				cancelProbe()
   186  			case <-ctx.Done():
   187  				logrus.Debug("Health check taking too long")
   188  				handleProbeResult(d, c, &types.HealthcheckResult{
   189  					ExitCode: -1,
   190  					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
   191  					Start:    startTime,
   192  					End:      time.Now(),
   193  				})
   194  				cancelProbe()
   195  				// Wait for probe to exit (it might take a while to respond to the TERM
   196  				// signal and we don't want dying probes to pile up).
   197  				<-results
   198  			}
   199  		}
   200  	}
   201  }
   202  
   203  // Get a suitable probe implementation for the container's healthcheck configuration.
   204  // Nil will be returned if no healthcheck was configured or NONE was set.
   205  func getProbe(c *container.Container) probe {
   206  	config := c.Config.Healthcheck
   207  	if config == nil || len(config.Test) == 0 {
   208  		return nil
   209  	}
   210  	switch config.Test[0] {
   211  	case "CMD":
   212  		return &cmdProbe{shell: false}
   213  	case "CMD-SHELL":
   214  		return &cmdProbe{shell: true}
   215  	default:
   216  		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0])
   217  		return nil
   218  	}
   219  }
   220  
   221  // Ensure the health-check monitor is running or not, depending on the current
   222  // state of the container.
   223  // Called from monitor.go, with c locked.
   224  func (d *Daemon) updateHealthMonitor(c *container.Container) {
   225  	h := c.State.Health
   226  	if h == nil {
   227  		return // No healthcheck configured
   228  	}
   229  
   230  	probe := getProbe(c)
   231  	wantRunning := c.Running && !c.Paused && probe != nil
   232  	if wantRunning {
   233  		if stop := h.OpenMonitorChannel(); stop != nil {
   234  			go monitor(d, c, stop, probe)
   235  		}
   236  	} else {
   237  		h.CloseMonitorChannel()
   238  	}
   239  }
   240  
   241  // Reset the health state for a newly-started, restarted or restored container.
   242  // initHealthMonitor is called from monitor.go and we should never be running
   243  // two instances at once.
   244  // Called with c locked.
   245  func (d *Daemon) initHealthMonitor(c *container.Container) {
   246  	// If no healthcheck is setup then don't init the monitor
   247  	if getProbe(c) == nil {
   248  		return
   249  	}
   250  
   251  	// This is needed in case we're auto-restarting
   252  	d.stopHealthchecks(c)
   253  
   254  	if h := c.State.Health; h != nil {
   255  		h.Status = types.Starting
   256  		h.FailingStreak = 0
   257  	} else {
   258  		h := &container.Health{}
   259  		h.Status = types.Starting
   260  		c.State.Health = h
   261  	}
   262  
   263  	d.updateHealthMonitor(c)
   264  }
   265  
   266  // Called when the container is being stopped (whether because the health check is
   267  // failing or for any other reason).
   268  func (d *Daemon) stopHealthchecks(c *container.Container) {
   269  	h := c.State.Health
   270  	if h != nil {
   271  		h.CloseMonitorChannel()
   272  	}
   273  }
   274  
   275  // Buffer up to maxOutputLen bytes. Further data is discarded.
   276  type limitedBuffer struct {
   277  	buf       bytes.Buffer
   278  	mu        sync.Mutex
   279  	truncated bool // indicates that data has been lost
   280  }
   281  
   282  // Append to limitedBuffer while there is room.
   283  func (b *limitedBuffer) Write(data []byte) (int, error) {
   284  	b.mu.Lock()
   285  	defer b.mu.Unlock()
   286  
   287  	bufLen := b.buf.Len()
   288  	dataLen := len(data)
   289  	keep := min(maxOutputLen-bufLen, dataLen)
   290  	if keep > 0 {
   291  		b.buf.Write(data[:keep])
   292  	}
   293  	if keep < dataLen {
   294  		b.truncated = true
   295  	}
   296  	return dataLen, nil
   297  }
   298  
   299  // The contents of the buffer, with "..." appended if it overflowed.
   300  func (b *limitedBuffer) String() string {
   301  	b.mu.Lock()
   302  	defer b.mu.Unlock()
   303  
   304  	out := b.buf.String()
   305  	if b.truncated {
   306  		out = out + "..."
   307  	}
   308  	return out
   309  }
   310  
   311  // If configuredValue is zero, use defaultValue instead.
   312  func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
   313  	if configuredValue == 0 {
   314  		return defaultValue
   315  	}
   316  	return configuredValue
   317  }
   318  
   319  func min(x, y int) int {
   320  	if x < y {
   321  		return x
   322  	}
   323  	return y
   324  }