github.com/moby/docker@v26.1.3+incompatible/daemon/health.go

package daemon // import "github.com/docker/docker/daemon"

import (
	"bytes"
	"context"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/containerd/log"
	"github.com/docker/docker/api/types"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/events"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of one run to the start of the next).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy = 0 // Container is healthy
)
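
// As a rough worked example of these defaults (hypothetical timing, assuming
// no per-container overrides): probes are spaced by the interval and each may
// run for the full timeout before counting as a failure, so a container whose
// probe always fails is first reported unhealthy after up to about three full
// cycles:
//
//	worst := time.Duration(defaultProbeRetries) * (defaultProbeInterval + defaultProbeTimeout) // ~3m
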
// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// exec the healthcheck command in the container.
// Returns the exit code and probe output (if any)
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	startTime := time.Now()
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr), cmdSlice...)
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := container.NewExecConfig(cntr)
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User
	execConfig.WorkingDir = cntr.Config.WorkingDir

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	d.LogContainerEventWithAttributes(cntr, events.Action(string(events.ActionExecCreate)+": "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " ")), map[string]string{
		"execID": execConfig.ID,
	})

	output := &limitedBuffer{}
	probeCtx, cancelProbe := context.WithCancel(ctx)
	defer cancelProbe()
	execErr := make(chan error, 1)

	options := containertypes.ExecStartOptions{
		Stdout: output,
		Stderr: output,
	}

	go func() { execErr <- d.ContainerExecStart(probeCtx, execConfig.ID, options) }()

	// Starting an exec can take a significant amount of time: on the order
	// of 1s in extreme cases. The time it takes dockerd and containerd to
	// start the exec is time that the probe process is not running, and so
	// should not count towards the health check's timeout. Apply a separate
	// timeout to abort if the exec request is wedged.
	tm := time.NewTimer(30 * time.Second)
	defer tm.Stop()
	select {
	case <-tm.C:
		return nil, fmt.Errorf("timed out starting health check for container %s", cntr.ID)
	case err := <-execErr:
		if err != nil {
			return nil, err
		}
	case <-execConfig.Started:
		healthCheckStartDuration.UpdateSince(startTime)
	}

	if !tm.Stop() {
		<-tm.C
	}
	probeTimeout := timeoutWithDefault(cntr.Config.Healthcheck.Timeout, defaultProbeTimeout)
	tm.Reset(probeTimeout)
	select {
	case <-tm.C:
		cancelProbe()
		log.G(ctx).WithContext(ctx).Debugf("Health check for container %s taking too long", cntr.ID)
		// Wait for probe to exit (it might take some time to call containerd to kill
		// the process and we don't want dying probes to pile up).
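		// This read cannot deadlock: execErr is buffered (size 1), so the
		// goroutine running ContainerExecStart can always deliver its result.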
		<-execErr

		var msg string
		if out := output.String(); len(out) > 0 {
			msg = fmt.Sprintf("Health check exceeded timeout (%v): %s", probeTimeout, out)
		} else {
			msg = fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout)
		}
		return &types.HealthcheckResult{
			ExitCode: -1,
			Output:   msg,
			End:      time.Now(),
		}, nil
	case err := <-execErr:
		if err != nil {
			return nil, err
		}
	}

	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	exitCode, err := func() (int, error) {
		info.Lock()
		defer info.Unlock()
		if info.ExitCode == nil {
			return 0, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
		}
		return *info.ExitCode, nil
	}()
	if err != nil {
		return nil, err
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: exitCode,
		Output:   out,
	}, nil
}

// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// The probe may have been cancelled while we were waiting on the lock.
	// Ignore the result in that case.
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status()

	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.SetStatus(types.Healthy)
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health check)
		// then we check if we are within the start period of the container in which
		// case we do not increment the failure streak.
		if h.Status() == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++

			if h.FailingStreak >= retries {
				h.SetStatus(types.Unhealthy)
			}
		}
		// Else we're starting or healthy. Stay in that state.
	}

	// Replicate Health status changes to the API, skipping persistent storage
	// to avoid unnecessary disk writes. The health state is only best-effort
	// persisted across restarts of the daemon. It will get written to disk on
	// the next checkpoint, such as when the container state changes.
	if err := c.CommitInMemory(d.containersReplica); err != nil {
		// queries will be inconsistent until the next probe runs or other state mutations
		// checkpoint the container
		log.G(context.TODO()).Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	current := h.Status()
	if oldStatus != current {
		d.LogContainerEvent(c, events.Action(string(events.ActionHealthStatus)+": "+current))
	}
}
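
// As a hypothetical illustration of the logic above (values chosen for the
// example): with Retries=3 and StartPeriod=30s, a probe that fails 10s after
// the container starts leaves FailingStreak at 0 because the container is
// still "starting"; once the start period has elapsed, three consecutive
// failures set the status to "unhealthy".
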
// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
	startInterval := timeoutWithDefault(c.Config.Healthcheck.StartInterval, probeInterval)
	startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)

	c.Lock()
	started := c.State.StartedAt
	c.Unlock()

	getInterval := func() time.Duration {
		if time.Since(started) >= startPeriod {
			return probeInterval
		}
		c.Lock()
		status := c.Health.Health.Status
		c.Unlock()

		if status == types.Starting {
			return startInterval
		}
		return probeInterval
	}

	intervalTimer := time.NewTimer(getInterval())
	defer intervalTimer.Stop()

	for {
		select {
		case <-stop:
			log.G(context.TODO()).Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-intervalTimer.C:
			log.G(context.TODO()).Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			ctx, cancelProbe := context.WithCancel(context.Background())
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					log.G(ctx).Warnf("Health check for container %s error: %v", c.ID, err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					log.G(ctx).Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				log.G(ctx).Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				cancelProbe()
			}
		}
		intervalTimer.Reset(getInterval())
	}
}

// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	case "NONE":
		return nil
	default:
		log.G(context.TODO()).Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
		return nil
	}
}
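
// For reference, the Test encodings handled above look like the following
// (values are hypothetical; the first element selects the probe type):
//
//	[]string{"CMD", "curl", "-f", "http://localhost/"} // exec form, run directly
//	[]string{"CMD-SHELL", "curl -f http://localhost/"} // run via the shell from getShell
//	[]string{"NONE"}                                   // healthcheck explicitly disabled
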
// Ensure the health-check monitor is running or not, depending on the current
// state of the container.
// Called from monitor.go, with c locked.
func (daemon *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(daemon, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}

// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (daemon *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is set up then don't init the monitor
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting
	daemon.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.SetStatus(types.Starting)
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.SetStatus(types.Starting)
		c.State.Health = h
	}

	daemon.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (daemon *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}

// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := minInt(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
	return dataLen, nil
}

// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}

// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}

func minInt(x, y int) int {
	if x < y {
		return x
	}
	return y
}

func getShell(cntr *container.Container) []string {
	if len(cntr.Config.Shell) != 0 {
		return cntr.Config.Shell
	}
	if runtime.GOOS != "windows" {
		return []string{"/bin/sh", "-c"}
	}
	if cntr.OS != runtime.GOOS {
		return []string{"/bin/sh", "-c"}
	}
	return []string{"cmd", "/S", "/C"}
}
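
// A minimal usage sketch of limitedBuffer (illustrative; the behavior follows
// from Write and String above): writes report full success but store at most
// maxOutputLen bytes, and String marks the overflow.
//
//	var b limitedBuffer
//	b.Write(bytes.Repeat([]byte("x"), maxOutputLen+10)) // keeps the first 4096 bytes
//	out := b.String()                                   // 4096 x's followed by "..."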