github.com/jfrazelle/docker@v1.1.2-0.20210712172922-bf78e25fe508/daemon/health.go

package daemon // import "github.com/docker/docker/daemon"

import (
	"bytes"
	"context"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/sirupsen/logrus"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of one run to the start of the next).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record in the health log.
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy = 0 // Container is healthy
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// exec the healthcheck command in the container.
// Returns the exit code and probe output (if any).
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr), cmdSlice...)
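		// Illustrative (not in the original source): for a CMD-SHELL test such as
		// "curl -f http://localhost/ || exit 1", cmdSlice is now the shell wrapper
		// from getShell (below) plus that string, e.g.
		// ["/bin/sh", "-c", "curl -f http://localhost/ || exit 1"] on Linux.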
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := exec.NewConfig()
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.ContainerID = cntr.ID
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User
	execConfig.WorkingDir = cntr.Config.WorkingDir

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	attributes := map[string]string{
		"execID": execConfig.ID,
	}
	d.LogContainerEventWithAttributes(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "), attributes)

	output := &limitedBuffer{}
	err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
	if err != nil {
		return nil, err
	}
	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	if info.ExitCode == nil {
		return nil, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: *info.ExitCode,
		Output:   out,
	}, nil
}

// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// probe may have been cancelled while waiting on lock. Ignore result then
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status()

	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.SetStatus(types.Healthy)
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health check)
		// then we check if we are within the start period of the container, in which
		// case we do not increment the failure streak.
		if h.Status() == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++

			if h.FailingStreak >= retries {
				h.SetStatus(types.Unhealthy)
			}
		}
		// Else we're starting or healthy. Stay in that state.
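		// Net effect, summarized from the logic above: the container leaves
		// Starting only on its first passing probe (-> Healthy) or after
		// `retries` consecutive failures outside the start period (-> Unhealthy);
		// a later passing probe returns an Unhealthy container to Healthy.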
	}

	// replicate Health status changes
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		// queries will be inconsistent until the next probe runs or other state mutations
		// checkpoint the container
		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	current := h.Status()
	if oldStatus != current {
		d.LogContainerEvent(c, "health_status: "+current)
	}
}

// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)

	intervalTimer := time.NewTimer(probeInterval)
	defer intervalTimer.Stop()

	for {
		intervalTimer.Reset(probeInterval)

		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-intervalTimer.C:
			logrus.Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				// Stop timeout
				cancelProbe()
			case <-ctx.Done():
				logrus.Debugf("Health check for container %s taking too long", c.ID)
				handleProbeResult(d, c, &types.HealthcheckResult{
					ExitCode: -1,
					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
					Start:    startTime,
					End:      time.Now(),
				}, stop)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
			}
		}
	}
}
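// For reference (illustrative; not part of the original file): a Dockerfile
// directive such as
//
//	HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
//	    CMD curl -f http://localhost/ || exit 1
//
// reaches this code as a Config.Healthcheck with Test
// ["CMD-SHELL", "curl -f http://localhost/ || exit 1"], Interval 30s,
// Timeout 5s, and Retries 3; zero values fall back to the defaults at the
// top of this file via timeoutWithDefault.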
// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	case "NONE":
		return nil
	default:
		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
		return nil
	}
}

// Ensure the health-check monitor is running or not, depending on the current
// state of the container.
// Called from monitor.go, with c locked.
func (daemon *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(daemon, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}

// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (daemon *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is set up then don't init the monitor
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting
	daemon.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.SetStatus(types.Starting)
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.SetStatus(types.Starting)
		c.State.Health = h
	}

	daemon.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (daemon *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}

// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := min(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
	return dataLen, nil
}

// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}
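// Illustrative behaviour of limitedBuffer (derived from the code above): writing
// 5000 bytes to a fresh buffer stores the first 4096 (maxOutputLen), still
// reports all 5000 bytes as written so the copier upstream does not error, and
// String() then returns the stored prefix with "..." appended.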
// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}

func getShell(cntr *container.Container) []string {
	if len(cntr.Config.Shell) != 0 {
		return cntr.Config.Shell
	}
	if runtime.GOOS != "windows" {
		return []string{"/bin/sh", "-c"}
	}
	if cntr.OS != runtime.GOOS {
		return []string{"/bin/sh", "-c"}
	}
	return []string{"cmd", "/S", "/C"}
}
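// getShell resolution, summarized for reference:
//   - Config.Shell set in the container config          -> that value
//   - non-Windows daemon, or non-Windows container when
//     the daemon runs on Windows                        -> ["/bin/sh", "-c"]
//   - Windows container on a Windows daemon             -> ["cmd", "/S", "/C"]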