github.com/rawahars/moby@v24.0.4+incompatible/daemon/health.go

package daemon // import "github.com/docker/docker/daemon"

import (
	"bytes"
	"context"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/docker/docker/api/types"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
	"github.com/sirupsen/logrus"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of log entries to record per container.
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy = 0 // Container is healthy
)
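
// exampleHealthcheckDefaults is an illustrative sketch added for this write-up,
// not part of the original file: it shows how zero-valued HealthConfig fields
// fall back to the defaults above, via timeoutWithDefault (defined at the
// bottom of this file) and the retries check in handleProbeResult.
func exampleHealthcheckDefaults() (time.Duration, time.Duration, int) {
	cfg := containertypes.HealthConfig{
		Test: []string{"CMD-SHELL", "curl -f http://localhost/ || exit 1"},
		// Interval, Timeout and Retries are left at their zero values.
	}
	interval := timeoutWithDefault(cfg.Interval, defaultProbeInterval) // 30s
	timeout := timeoutWithDefault(cfg.Timeout, defaultProbeTimeout)   // 30s
	retries := cfg.Retries
	if retries <= 0 {
		retries = defaultProbeRetries // 3
	}
	return interval, timeout, retries
}
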
// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// run execs the healthcheck command in the container.
// Returns the exit code and probe output (if any).
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	startTime := time.Now()
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr), cmdSlice...)
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := container.NewExecConfig(cntr)
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User
	execConfig.WorkingDir = cntr.Config.WorkingDir

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	attributes := map[string]string{
		"execID": execConfig.ID,
	}
	d.LogContainerEventWithAttributes(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "), attributes)

	output := &limitedBuffer{}
	probeCtx, cancelProbe := context.WithCancel(ctx)
	defer cancelProbe()
	execErr := make(chan error, 1)

	options := containertypes.ExecStartOptions{
		Stdout: output,
		Stderr: output,
	}

	go func() { execErr <- d.ContainerExecStart(probeCtx, execConfig.ID, options) }()

	// Starting an exec can take a significant amount of time: on the order
	// of 1s in extreme cases. The time it takes dockerd and containerd to
	// start the exec is time that the probe process is not running, and so
	// should not count towards the health check's timeout. Apply a separate
	// timeout to abort if the exec request is wedged.
	tm := time.NewTimer(30 * time.Second)
	defer tm.Stop()
	select {
	case <-tm.C:
		return nil, fmt.Errorf("timed out starting health check for container %s", cntr.ID)
	case err := <-execErr:
		if err != nil {
			return nil, err
		}
	case <-execConfig.Started:
		healthCheckStartDuration.UpdateSince(startTime)
	}

	if !tm.Stop() {
		<-tm.C
	}
	probeTimeout := timeoutWithDefault(cntr.Config.Healthcheck.Timeout, defaultProbeTimeout)
	tm.Reset(probeTimeout)
	select {
	case <-tm.C:
		cancelProbe()
		logrus.WithContext(ctx).Debugf("Health check for container %s taking too long", cntr.ID)
		// Wait for the probe to exit (it might take some time to call containerd to kill
		// the process and we don't want dying probes to pile up).
		<-execErr

		var msg string
		if out := output.String(); len(out) > 0 {
			msg = fmt.Sprintf("Health check exceeded timeout (%v): %s", probeTimeout, out)
		} else {
			msg = fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout)
		}
		return &types.HealthcheckResult{
			ExitCode: -1,
			Output:   msg,
			End:      time.Now(),
		}, nil
	case err := <-execErr:
		if err != nil {
			return nil, err
		}
	}

	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	exitCode, err := func() (int, error) {
		info.Lock()
		defer info.Unlock()
		if info.ExitCode == nil {
			return 0, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
		}
		return *info.ExitCode, nil
	}()
	if err != nil {
		return nil, err
	}
	// Note: Go's json package will handle invalid UTF-8 for us.
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: exitCode,
		Output:   out,
	}, nil
}
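
// illustrateTimerReuse is an illustrative sketch added for this write-up, not
// part of the original file: it distils the Stop/drain/Reset pattern that
// cmdProbe.run uses to reuse a single timer, first as a deadline on starting
// the exec and then as the probe timeout proper.
func illustrateTimerReuse(startDeadline, probeTimeout time.Duration, started <-chan struct{}, done <-chan error) error {
	tm := time.NewTimer(startDeadline)
	defer tm.Stop()
	select {
	case <-tm.C:
		return fmt.Errorf("timed out waiting for the probe to start")
	case <-started:
	}
	// A timer that may already have fired must be stopped and drained before
	// Reset, otherwise the stale tick would end the second wait immediately.
	if !tm.Stop() {
		<-tm.C
	}
	tm.Reset(probeTimeout)
	select {
	case <-tm.C:
		return fmt.Errorf("probe exceeded timeout (%v)", probeTimeout)
	case err := <-done:
		return err
	}
}
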
// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// The probe may have been cancelled while waiting on the lock. Ignore the result then.
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status()

	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.SetStatus(types.Healthy)
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health check),
		// then check whether we are within the start period of the container, in which
		// case we do not increment the failure streak.
		if h.Status() == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment the failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++

			if h.FailingStreak >= retries {
				h.SetStatus(types.Unhealthy)
			}
		}
		// Else we're starting or healthy. Stay in that state.
	}

	// Replicate Health status changes.
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		// Queries will be inconsistent until the next probe runs or some
		// other state mutation checkpoints the container.
		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	current := h.Status()
	if oldStatus != current {
		d.LogContainerEvent(c, "health_status: "+current)
	}
}
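
// appendBounded is an illustrative sketch added for this write-up, not part of
// the original file: it restates the h.Log bookkeeping in handleProbeResult,
// which keeps only the most recent maxLogEntries probe results. When the log
// is full, the reslice drops just enough old entries that appending one more
// leaves exactly maxLogEntries.
func appendBounded(log []*types.HealthcheckResult, r *types.HealthcheckResult) []*types.HealthcheckResult {
	if len(log) >= maxLogEntries {
		return append(log[len(log)+1-maxLogEntries:], r)
	}
	return append(log, r)
}
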
// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)

	intervalTimer := time.NewTimer(probeInterval)
	defer intervalTimer.Stop()

	for {
		intervalTimer.Reset(probeInterval)

		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-intervalTimer.C:
			logrus.Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			ctx, cancelProbe := context.WithCancel(context.Background())
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for the probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				cancelProbe()
			}
		}
	}
}

// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	case "NONE":
		return nil
	default:
		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
		return nil
	}
}

// Ensure the health-check monitor is running or stopped, depending on the
// current state of the container.
// Called from monitor.go, with c locked.
func (daemon *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(daemon, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}
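
// exampleTestForms is an illustrative sketch added for this write-up, not part
// of the original file: it lists the Healthcheck.Test shapes that getProbe
// dispatches on. The first element selects the probe type; for "CMD" the rest
// are exec'd directly, while "CMD-SHELL" wraps a single string with the shell
// from getShell.
func exampleTestForms() [][]string {
	return [][]string{
		{"CMD", "curl", "-f", "http://localhost/"}, // exec'd without a shell
		{"CMD-SHELL", "curl -f http://localhost/"}, // run via /bin/sh -c (or cmd /S /C)
		{"NONE"},                                   // disables any healthcheck inherited from the image
	}
}
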
// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (daemon *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is set up, then don't init the monitor.
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting.
	daemon.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.SetStatus(types.Starting)
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.SetStatus(types.Starting)
		c.State.Health = h
	}

	daemon.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (daemon *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}

// Write appends to the limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := min(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
	return dataLen, nil
}

// String returns the contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}

// timeoutWithDefault returns defaultValue if configuredValue is zero.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}

// getShell returns the shell used to run "CMD-SHELL" probes: the container's
// configured shell if one is set, otherwise a platform default.
func getShell(cntr *container.Container) []string {
	if len(cntr.Config.Shell) != 0 {
		return cntr.Config.Shell
	}
	if runtime.GOOS != "windows" {
		return []string{"/bin/sh", "-c"}
	}
	if cntr.OS != runtime.GOOS {
		// A non-Windows container running on a Windows daemon uses the Linux shell.
		return []string{"/bin/sh", "-c"}
	}
	return []string{"cmd", "/S", "/C"}
}
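
// exampleLimitedBuffer is an illustrative sketch added for this write-up, not
// part of the original file: limitedBuffer is an io.Writer that reports the
// full write count so upstream copies never fail, while silently capping what
// it retains at maxOutputLen bytes and flagging the truncation.
func exampleLimitedBuffer() string {
	b := &limitedBuffer{}
	b.Write(bytes.Repeat([]byte("x"), maxOutputLen+1)) // one byte over the cap
	return b.String()                                  // maxOutputLen x's followed by "..."
}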