github.com/adityamillind98/moby@v23.0.0-rc.4+incompatible/daemon/health.go

package daemon // import "github.com/docker/docker/daemon"

import (
	"bytes"
	"context"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/docker/docker/api/types"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/sirupsen/logrus"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy = 0 // Container is healthy
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// run execs the healthcheck command in the container and returns the exit
// code and probe output (if any).
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	startTime := time.Now()
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr), cmdSlice...)
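		// cmdSlice is now the shell invocation followed by the script,
		// e.g. ["/bin/sh", "-c", "curl -f http://localhost/"] on Linux.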
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := exec.NewConfig()
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.ContainerID = cntr.ID
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User
	execConfig.WorkingDir = cntr.Config.WorkingDir

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	attributes := map[string]string{
		"execID": execConfig.ID,
	}
	d.LogContainerEventWithAttributes(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "), attributes)

	output := &limitedBuffer{}
	probeCtx, cancelProbe := context.WithCancel(ctx)
	defer cancelProbe()
	execErr := make(chan error, 1)

	options := containertypes.ExecStartOptions{
		Stdout: output,
		Stderr: output,
	}

	go func() { execErr <- d.ContainerExecStart(probeCtx, execConfig.ID, options) }()

	// Starting an exec can take a significant amount of time: on the order
	// of 1s in extreme cases. The time it takes dockerd and containerd to
	// start the exec is time that the probe process is not running, and so
	// should not count towards the health check's timeout. Apply a separate
	// timeout to abort if the exec request is wedged.
	tm := time.NewTimer(30 * time.Second)
	defer tm.Stop()
	select {
	case <-tm.C:
		return nil, fmt.Errorf("timed out starting health check for container %s", cntr.ID)
	case err := <-execErr:
		if err != nil {
			return nil, err
		}
	case <-execConfig.Started:
		healthCheckStartDuration.UpdateSince(startTime)
	}

	if !tm.Stop() {
		<-tm.C
	}
	probeTimeout := timeoutWithDefault(cntr.Config.Healthcheck.Timeout, defaultProbeTimeout)
	tm.Reset(probeTimeout)
	select {
	case <-tm.C:
		cancelProbe()
		logrus.WithContext(ctx).Debugf("Health check for container %s taking too long", cntr.ID)
		// Wait for probe to exit (it might take some time to call containerd to kill
		// the process and we don't want dying probes to pile up).
		<-execErr
		return &types.HealthcheckResult{
			ExitCode: -1,
			Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
			End:      time.Now(),
		}, nil
	case err := <-execErr:
		if err != nil {
			return nil, err
		}
	}

	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	if info.ExitCode == nil {
		return nil, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: *info.ExitCode,
		Output:   out,
	}, nil
}
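
// For orientation (an illustrative sketch, not code from this file): a
// Dockerfile healthcheck such as
//
//	HEALTHCHECK --interval=30s --timeout=5s CMD ["curl", "-f", "http://localhost/"]
//
// reaches run above as Healthcheck.Test = {"CMD", "curl", "-f", "http://localhost/"},
// so the probe execs the command directly; the shell form (CMD curl -f ...)
// arrives as {"CMD-SHELL", "curl -f http://localhost/"} and is wrapped by
// getShell first.
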
// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// Probe may have been cancelled while waiting on the lock. If so,
	// ignore the result.
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status()

	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.SetStatus(types.Healthy)
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health check)
		// then we check if we are within the start period of the container in which
		// case we do not increment the failure streak.
		if h.Status() == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++

			if h.FailingStreak >= retries {
				h.SetStatus(types.Unhealthy)
			}
		}
		// Else we're starting or healthy. Stay in that state.
	}

	// Replicate Health status changes.
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		// Queries will be inconsistent until the next probe runs or other
		// state mutations checkpoint the container.
		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	current := h.Status()
	if oldStatus != current {
		d.LogContainerEvent(c, "health_status: "+current)
	}
}

// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)

	intervalTimer := time.NewTimer(probeInterval)
	defer intervalTimer.Stop()

	for {
		intervalTimer.Reset(probeInterval)

		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-intervalTimer.C:
			logrus.Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			ctx, cancelProbe := context.WithCancel(context.Background())
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
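			// results is buffered (size 1) so the send above never blocks;
			// both branches below receive from it, either to use the result
			// or to drain it after cancelling a stopped probe.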
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				cancelProbe()
			}
		}
	}
}

// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	case "NONE":
		return nil
	default:
		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
		return nil
	}
}

// Ensure the health-check monitor is running or not, depending on the current
// state of the container.
// Called from monitor.go, with c locked.
func (daemon *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(daemon, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}

// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (daemon *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is set up then don't init the monitor.
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting.
	daemon.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.SetStatus(types.Starting)
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.SetStatus(types.Starting)
		c.State.Health = h
	}

	daemon.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (daemon *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}

// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := min(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
	return dataLen, nil
}

// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}
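
// For illustration: writing more than maxOutputLen bytes reports the full
// length as written but keeps only the first maxOutputLen bytes, and
// String() marks the loss with a trailing "...":
//
//	var b limitedBuffer
//	n, _ := b.Write(make([]byte, maxOutputLen+1)) // n == maxOutputLen+1
//	out := b.String()                             // len(out) == maxOutputLen+3
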
// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}

func getShell(cntr *container.Container) []string {
	if len(cntr.Config.Shell) != 0 {
		return cntr.Config.Shell
	}
	if runtime.GOOS != "windows" {
		return []string{"/bin/sh", "-c"}
	}
	if cntr.OS != runtime.GOOS {
		return []string{"/bin/sh", "-c"}
	}
	return []string{"cmd", "/S", "/C"}
}
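
// Overall flow, as a sketch of what the functions above wire together:
//
//	initHealthMonitor(c)          // container (re)start: status -> Starting
//	  updateHealthMonitor(c)      // opens the monitor's stop channel
//	    go monitor(d, c, stop, p) // at most one monitor goroutine per container
//	      p.run(ctx, d, c)        // exec the probe, collect limited output
//	      handleProbeResult(...)  // update streak, set Healthy/Unhealthy, emit event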