github.com/moby/docker@v26.1.3+incompatible/daemon/health.go

package daemon // import "github.com/docker/docker/daemon"

import (
	"bytes"
	"context"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/containerd/log"
	"github.com/docker/docker/api/types"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/events"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of health check log entries to record.
	maxLogEntries = 5
)
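
// As an illustrative sketch only (the image command below is hypothetical),
// a HealthConfig that leaves its duration and retry fields at their zero
// values picks up the defaults above via timeoutWithDefault, monitor and
// handleProbeResult:
//
//	cfg := &containertypes.HealthConfig{
//		Test: []string{"CMD-SHELL", "curl -f http://localhost/ || exit 1"},
//		// Interval == 0      -> defaultProbeInterval (30s)
//		// Timeout == 0       -> defaultProbeTimeout (30s)
//		// StartPeriod == 0   -> defaultStartPeriod (none)
//		// StartInterval == 0 -> falls back to the resolved Interval
//		// Retries == 0       -> defaultProbeRetries (3)
//	}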

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy = 0 // Container is healthy
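
	// Any other exit code is treated as a probe failure by handleProbeResult.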
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}
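
// For illustration, a new probe type would only need to satisfy this
// interface. The sketch below is hypothetical (cmdProbe is the sole
// implementation in the daemon):
//
//	type noopProbe struct{}
//
//	func (p *noopProbe) run(ctx context.Context, d *Daemon, c *container.Container) (*types.HealthcheckResult, error) {
//		// Always report success without executing anything in the container.
//		return &types.HealthcheckResult{ExitCode: exitStatusHealthy, End: time.Now()}, nil
//	}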

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// run executes the healthcheck command in the container and returns the
// exit code and probe output (if any).
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	startTime := time.Now()
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr), cmdSlice...)
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := container.NewExecConfig(cntr)
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User
	execConfig.WorkingDir = cntr.Config.WorkingDir

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	d.LogContainerEventWithAttributes(cntr, events.Action(string(events.ActionExecCreate)+": "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " ")), map[string]string{
		"execID": execConfig.ID,
	})

	output := &limitedBuffer{}
	probeCtx, cancelProbe := context.WithCancel(ctx)
	defer cancelProbe()
	execErr := make(chan error, 1)

	options := containertypes.ExecStartOptions{
		Stdout: output,
		Stderr: output,
	}

	go func() { execErr <- d.ContainerExecStart(probeCtx, execConfig.ID, options) }()

	// Starting an exec can take a significant amount of time: on the order
	// of 1s in extreme cases. The time it takes dockerd and containerd to
	// start the exec is time that the probe process is not running, and so
	// should not count towards the health check's timeout. Apply a separate
	// timeout to abort if the exec request is wedged.
	tm := time.NewTimer(30 * time.Second)
	defer tm.Stop()
	select {
	case <-tm.C:
		return nil, fmt.Errorf("timed out starting health check for container %s", cntr.ID)
	case err := <-execErr:
		if err != nil {
			return nil, err
		}
	case <-execConfig.Started:
		healthCheckStartDuration.UpdateSince(startTime)
	}

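	// Stop the start-timeout timer and drain its channel if it already
	// fired, so the same timer can be safely reused for the probe timeout
	// below.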
	if !tm.Stop() {
		<-tm.C
	}
	probeTimeout := timeoutWithDefault(cntr.Config.Healthcheck.Timeout, defaultProbeTimeout)
	tm.Reset(probeTimeout)
	select {
	case <-tm.C:
		cancelProbe()
		log.G(ctx).Debugf("Health check for container %s taking too long", cntr.ID)
		// Wait for probe to exit (it might take some time to call containerd to kill
		// the process and we don't want dying probes to pile up).
		<-execErr

		var msg string
		if out := output.String(); len(out) > 0 {
			msg = fmt.Sprintf("Health check exceeded timeout (%v): %s", probeTimeout, out)
		} else {
			msg = fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout)
		}
		return &types.HealthcheckResult{
			ExitCode: -1,
			Output:   msg,
			End:      time.Now(),
		}, nil
	case err := <-execErr:
		if err != nil {
			return nil, err
		}
	}

	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	exitCode, err := func() (int, error) {
		info.Lock()
		defer info.Unlock()
		if info.ExitCode == nil {
			return 0, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
		}
		return *info.ExitCode, nil
	}()
	if err != nil {
		return nil, err
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: exitCode,
		Output:   out,
	}, nil
}

// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// The probe may have been cancelled while we were waiting on the lock.
	// If so, ignore the result.
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status()

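	// Keep only the most recent maxLogEntries results, dropping the oldest
	// entry once the log is full.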
	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.SetStatus(types.Healthy)
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health
		// check), check whether we are still within the container's start
		// period; if so, do not increment the failure streak.
		if h.Status() == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

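		// As a worked example (illustrative numbers): with StartPeriod=60s,
		// a probe failing 45s after the container started leaves the streak
		// untouched (provided the status is still starting), while the same
		// failure at 75s counts toward Retries.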
		if shouldIncrementStreak {
			h.FailingStreak++

			if h.FailingStreak >= retries {
				h.SetStatus(types.Unhealthy)
			}
		}
		// Else we're starting or healthy. Stay in that state.
	}

	// Replicate Health status changes to the API, skipping persistent storage
	// to avoid unnecessary disk writes. The health state is only best-effort
	// persisted across restarts of the daemon. It will get written to disk on
	// the next checkpoint, such as when the container state changes.
	if err := c.CommitInMemory(d.containersReplica); err != nil {
		// Queries will be inconsistent until the next probe runs or other
		// state mutations checkpoint the container.
		log.G(context.TODO()).Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	current := h.Status()
	if oldStatus != current {
		d.LogContainerEvent(c, events.Action(string(events.ActionHealthStatus)+": "+current))
	}
}

// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
	startInterval := timeoutWithDefault(c.Config.Healthcheck.StartInterval, probeInterval)
	startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)

	c.Lock()
	started := c.State.StartedAt
	c.Unlock()

	getInterval := func() time.Duration {
		if time.Since(started) >= startPeriod {
			return probeInterval
		}
		c.Lock()
		status := c.Health.Health.Status
		c.Unlock()

		if status == types.Starting {
			return startInterval
		}
		return probeInterval
	}
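
	// For example (illustrative values): with Interval=30s, StartInterval=5s
	// and StartPeriod=60s, probes run every 5s while the container is still
	// reporting "starting" inside the first 60s, and every 30s afterwards or
	// as soon as a probe has reported healthy.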

	intervalTimer := time.NewTimer(getInterval())
	defer intervalTimer.Stop()

	for {
		select {
		case <-stop:
			log.G(context.TODO()).Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-intervalTimer.C:
			log.G(context.TODO()).Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			ctx, cancelProbe := context.WithCancel(context.Background())
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					log.G(ctx).Warnf("Health check for container %s error: %v", c.ID, err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					log.G(ctx).Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				log.G(ctx).Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				cancelProbe()
			}
		}
		intervalTimer.Reset(getInterval())
	}
}

// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	case "NONE":
		return nil
	default:
		log.G(context.TODO()).Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
		return nil
	}
}
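
// The first element of Healthcheck.Test selects the probe type. As a sketch
// (the commands are illustrative):
//
//	[]string{"CMD", "curl", "-f", "http://localhost/"} // exec'd directly
//	[]string{"CMD-SHELL", "curl -f http://localhost/"} // run via getShell
//	[]string{"NONE"}                                   // healthcheck disabled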

// Ensure the health-check monitor is running or not, depending on the current
// state of the container.
// Called from monitor.go, with c locked.
func (daemon *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(daemon, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}

// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (daemon *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is set up then don't init the monitor.
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting.
	daemon.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.SetStatus(types.Starting)
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.SetStatus(types.Starting)
		c.State.Health = h
	}

	daemon.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (daemon *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}

// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := minInt(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
	return dataLen, nil
}

// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}
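
// A minimal usage sketch: writes past maxOutputLen still report success to
// the caller, but the excess bytes are dropped and String flags the loss.
//
//	var buf limitedBuffer
//	buf.Write(bytes.Repeat([]byte("x"), maxOutputLen+1)) // keeps 4096 bytes, discards 1
//	out := buf.String()                                  // "xxx...x" plus the "..." truncation marker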

// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}
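
// For example:
//
//	timeoutWithDefault(0, defaultProbeTimeout)             // 30s (default applied)
//	timeoutWithDefault(5*time.Second, defaultProbeTimeout) // 5s (configured value kept)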

func minInt(x, y int) int {
	if x < y {
		return x
	}
	return y
}

func getShell(cntr *container.Container) []string {
	if len(cntr.Config.Shell) != 0 {
		return cntr.Config.Shell
	}
	if runtime.GOOS != "windows" {
		return []string{"/bin/sh", "-c"}
	}
	if cntr.OS != runtime.GOOS {
		return []string{"/bin/sh", "-c"}
	}
	return []string{"cmd", "/S", "/C"}
}
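
// Illustrative outcomes: a container with no Shell configured gets
// ["/bin/sh", "-c"] on non-Windows daemons, and likewise for a non-Windows
// container running on a Windows daemon; only a Windows container on a
// Windows daemon gets ["cmd", "/S", "/C"].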