github.com/jfrazelle/docker@v1.1.2-0.20210712172922-bf78e25fe508/daemon/health.go

package daemon // import "github.com/docker/docker/daemon"

import (
	"bytes"
	"context"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/sirupsen/logrus"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of one run to the start of the next).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record in the health log.
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy = 0 // Container is healthy
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// exec the healthcheck command in the container.
// Returns the exit code and probe output (if any).
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr), cmdSlice...)
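		// Illustrative (not in the original source): for a CMD-SHELL test such as
		// "curl -f http://localhost/ || exit 1", cmdSlice is now the shell wrapper
		// from getShell (below) plus that string, e.g.
		// ["/bin/sh", "-c", "curl -f http://localhost/ || exit 1"] on Linux.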
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := exec.NewConfig()
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.ContainerID = cntr.ID
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User
	execConfig.WorkingDir = cntr.Config.WorkingDir

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	attributes := map[string]string{
		"execID": execConfig.ID,
	}
	d.LogContainerEventWithAttributes(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "), attributes)

	output := &limitedBuffer{}
	err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
	if err != nil {
		return nil, err
	}
	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	if info.ExitCode == nil {
		return nil, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: *info.ExitCode,
		Output:   out,
	}, nil
}

// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// probe may have been cancelled while waiting on lock. Ignore result then
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status()

	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.SetStatus(types.Healthy)
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health check)
		// then we check if we are within the start period of the container, in which
		// case we do not increment the failure streak.
		if h.Status() == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++

			if h.FailingStreak >= retries {
				h.SetStatus(types.Unhealthy)
			}
		}
		// Else we're starting or healthy. Stay in that state.
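		// Net effect, summarized from the logic above: the container leaves
		// Starting only on its first passing probe (-> Healthy) or after
		// `retries` consecutive failures outside the start period (-> Unhealthy);
		// a later passing probe returns an Unhealthy container to Healthy.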
	}

	// replicate Health status changes
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		// queries will be inconsistent until the next probe runs or other state mutations
		// checkpoint the container
		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	current := h.Status()
	if oldStatus != current {
		d.LogContainerEvent(c, "health_status: "+current)
	}
}

// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)

	intervalTimer := time.NewTimer(probeInterval)
	defer intervalTimer.Stop()

	for {
		intervalTimer.Reset(probeInterval)

		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-intervalTimer.C:
			logrus.Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				// Stop timeout
				cancelProbe()
			case <-ctx.Done():
				logrus.Debugf("Health check for container %s taking too long", c.ID)
				handleProbeResult(d, c, &types.HealthcheckResult{
					ExitCode: -1,
					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
					Start:    startTime,
					End:      time.Now(),
				}, stop)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
			}
		}
	}
}
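// For reference (illustrative; not part of the original file): a Dockerfile
// directive such as
//
//	HEALTHCHECK --interval=30s --timeout=5s --retries=3 \
//	    CMD curl -f http://localhost/ || exit 1
//
// reaches this code as a Config.Healthcheck with Test
// ["CMD-SHELL", "curl -f http://localhost/ || exit 1"], Interval 30s,
// Timeout 5s, and Retries 3; zero values fall back to the defaults at the
// top of this file via timeoutWithDefault.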
// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	case "NONE":
		return nil
	default:
		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
		return nil
	}
}

// Ensure the health-check monitor is running or not, depending on the current
// state of the container.
// Called from monitor.go, with c locked.
func (daemon *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(daemon, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}

// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (daemon *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is set up then don't init the monitor
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting
	daemon.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.SetStatus(types.Starting)
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.SetStatus(types.Starting)
		c.State.Health = h
	}

	daemon.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (daemon *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}

// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := min(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
	return dataLen, nil
}

// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}
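// Illustrative behaviour of limitedBuffer (derived from the code above): writing
// 5000 bytes to a fresh buffer stores the first 4096 (maxOutputLen), still
// reports all 5000 bytes as written so the copier upstream does not error, and
// String() then returns the stored prefix with "..." appended.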
// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}

func getShell(cntr *container.Container) []string {
	if len(cntr.Config.Shell) != 0 {
		return cntr.Config.Shell
	}
	if runtime.GOOS != "windows" {
		return []string{"/bin/sh", "-c"}
	}
	if cntr.OS != runtime.GOOS {
		return []string{"/bin/sh", "-c"}
	}
	return []string{"cmd", "/S", "/C"}
}
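// getShell resolution, summarized for reference:
//   - Config.Shell set in the container config          -> that value
//   - non-Windows daemon, or non-Windows container when
//     the daemon runs on Windows                        -> ["/bin/sh", "-c"]
//   - Windows container on a Windows daemon             -> ["cmd", "/S", "/C"]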