github.com/zhouyu0/docker-note@v0.0.0-20190722021225-b8d3825084db/daemon/health.go

package daemon // import "github.com/docker/docker/daemon"

import (
	"bytes"
	"context"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/docker/docker/api/types"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/sirupsen/logrus"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of one probe to the start of the next).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of probe results to record in the health log.
	maxLogEntries = 5
)
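
// For reference, these defaults correspond to the Dockerfile HEALTHCHECK
// options --interval, --timeout, --start-period and --retries. A minimal
// sketch of a configuration that overrides all of them (the command and URL
// are illustrative):
//
//	HEALTHCHECK --interval=10s --timeout=5s --start-period=30s --retries=5 \
//	    CMD curl -f http://localhost/ || exit 1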

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy = 0 // Container is healthy
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" and "CMD-SHELL" probe types.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// exec the healthcheck command in the container.
// Returns the exit code and probe output (if any).
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr.Config), cmdSlice...)
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := exec.NewConfig()
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.ContainerID = cntr.ID
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User
	execConfig.WorkingDir = cntr.Config.WorkingDir

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	attributes := map[string]string{
		"execID": execConfig.ID,
	}
	d.LogContainerEventWithAttributes(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "), attributes)

	output := &limitedBuffer{}
	err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
	if err != nil {
		return nil, err
	}
	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	if info.ExitCode == nil {
		return nil, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: *info.ExitCode,
		Output:   out,
	}, nil
}
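
// For example (a sketch, assuming a Linux container with no custom Shell
// configured): a Test of ["CMD-SHELL", "curl -f http://localhost"] is run as
// ["/bin/sh", "-c", "curl -f http://localhost"], while
// ["CMD", "curl", "-f", "http://localhost"] is exec'd directly as
// ["curl", "-f", "http://localhost"].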

// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// The probe may have been cancelled while we were waiting on the lock. Ignore its result then.
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status()

	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.SetStatus(types.Healthy)
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health check)
		// then we check whether we are within the start period of the container, in
		// which case we do not increment the failure streak.
		if h.Status() == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++

			if h.FailingStreak >= retries {
				h.SetStatus(types.Unhealthy)
			}
		}
		// Else we're starting or healthy. Stay in that state.
	}

	// Replicate health status changes. If this fails, queries will be
	// inconsistent until the next probe runs or another state mutation
	// checkpoints the container.
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	current := h.Status()
	if oldStatus != current {
		d.LogContainerEvent(c, "health_status: "+current)
	}
}
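
// A worked example of the streak logic above, with the defaults (retries=3,
// start-period=0s): three consecutive probes exiting non-zero move the
// container to "unhealthy", while a single probe exiting zero resets the
// streak and marks it "healthy" again. With a non-zero start period, failures
// while the container is still "starting" and within that period leave
// FailingStreak untouched.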

// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
	for {
		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-time.After(probeInterval):
			logrus.Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				// Stop timeout
				cancelProbe()
			case <-ctx.Done():
				logrus.Debugf("Health check for container %s taking too long", c.ID)
				handleProbeResult(d, c, &types.HealthcheckResult{
					ExitCode: -1,
					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
					Start:    startTime,
					End:      time.Now(),
				}, stop)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
			}
		}
	}
}
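
// Note on timing: time.After is re-armed only after the previous result has
// been handled, so probeInterval measures the gap from the end of one probe
// to the start of the next (as documented on defaultProbeInterval above),
// rather than a fixed wall-clock period.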

// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	case "NONE":
		return nil
	default:
		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD', 'CMD-SHELL' or 'NONE') in container %s", config.Test[0], c.ID)
		return nil
	}
}
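
// For example (a sketch): a Test of ["CMD", "curl", "-f", "http://localhost"]
// yields &cmdProbe{shell: false}, ["CMD-SHELL", "curl -f http://localhost"]
// yields &cmdProbe{shell: true}, and ["NONE"] (or a missing/empty Test)
// yields nil, which disables monitoring.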

// Ensure the health-check monitor is running or stopped, depending on the
// current state of the container.
// Called from monitor.go, with c locked.
func (d *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(d, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}

// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (d *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is set up then don't init the monitor
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting
	d.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.SetStatus(types.Starting)
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.SetStatus(types.Starting)
		c.State.Health = h
	}

	d.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (d *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}

// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := min(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
	return dataLen, nil
}

// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}
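
// A sketch of the truncation behavior: writing 5000 bytes into a limitedBuffer
// stores only the first maxOutputLen (4096) bytes and sets truncated; Write
// still reports all 5000 bytes as consumed so the caller keeps streaming, and
// String() returns the stored prefix with "..." appended.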

// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}

func getShell(config *containertypes.Config) []string {
	if len(config.Shell) != 0 {
		return config.Shell
	}
	if runtime.GOOS != "windows" {
		return []string{"/bin/sh", "-c"}
	}
	return []string{"cmd", "/S", "/C"}
}
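
// For example: with no Shell configured, getShell returns ["/bin/sh", "-c"]
// on non-Windows platforms and ["cmd", "/S", "/C"] on Windows. A Dockerfile
// SHELL instruction (e.g. SHELL ["/bin/bash", "-c"]) populates config.Shell
// and overrides both defaults.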