// github.com/ssdev-go/moby@v17.12.1-ce-rc2+incompatible/daemon/health.go

package daemon

import (
	"bytes"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/context"

	"github.com/docker/docker/api/types"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/sirupsen/logrus"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy = 0 // Container is healthy
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// exec the healthcheck command in the container.
// Returns the exit code and probe output (if any)
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr.Config), cmdSlice...)
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := exec.NewConfig()
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.ContainerID = cntr.ID
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User
	execConfig.WorkingDir = cntr.Config.WorkingDir

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	d.LogContainerEvent(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))

	output := &limitedBuffer{}
	err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
	if err != nil {
		return nil, err
	}
	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	if info.ExitCode == nil {
		return nil, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: *info.ExitCode,
		Output:   out,
	}, nil
}
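
// Illustration only, not part of the upstream file: a minimal sketch of how
// run above assembles the probe command. A "CMD-SHELL" test such as
// ["CMD-SHELL", "curl -f http://localhost/"] expands on Linux to
// ["/bin/sh", "-c", "curl -f http://localhost/"], while a "CMD" test execs
// Test[1:] directly. The function name is hypothetical.
func exampleProbeCommand(cfg *containertypes.Config, shell bool) []string {
	cmd := cfg.Healthcheck.Test[1:] // drop the "CMD"/"CMD-SHELL" marker
	if shell {
		cmd = append(getShell(cfg), cmd...) // prepend the container's shell
	}
	return cmd
}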

// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// probe may have been cancelled while waiting on lock. Ignore result then
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status()

	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.SetStatus(types.Healthy)
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health check)
		// then we check if we are within the start period of the container in which
		// case we do not increment the failure streak.
		if h.Status() == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++

			if h.FailingStreak >= retries {
				h.SetStatus(types.Unhealthy)
			}
		}
		// Else we're starting or healthy. Stay in that state.
	}

	// replicate Health status changes
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		// queries will be inconsistent until the next probe runs or other state mutations
		// checkpoint the container
		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	current := h.Status()
	if oldStatus != current {
		d.LogContainerEvent(c, "health_status: "+current)
	}
}
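
// Illustration only, not part of the upstream file: a sketch of the bounded
// log maintained by handleProbeResult above. With maxLogEntries == 5, a sixth
// result evicts the oldest entry, so the log always holds the latest five
// probes. The helper name is hypothetical.
func exampleTrimHealthLog(log []*types.HealthcheckResult, result *types.HealthcheckResult) []*types.HealthcheckResult {
	if len(log) >= maxLogEntries {
		// Keep the newest maxLogEntries-1 entries, then append the new result.
		return append(log[len(log)+1-maxLogEntries:], result)
	}
	return append(log, result)
}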

// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
	for {
		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-time.After(probeInterval):
			logrus.Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				// Stop timeout
				cancelProbe()
			case <-ctx.Done():
				logrus.Debugf("Health check for container %s taking too long", c.ID)
				handleProbeResult(d, c, &types.HealthcheckResult{
					ExitCode: -1,
					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
					Start:    startTime,
					End:      time.Now(),
				}, stop)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
			}
		}
	}
}

// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	case "NONE":
		return nil
	default:
		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
		return nil
	}
}
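
// Illustration only, not part of the upstream file: a sketch of how the
// monitor loop above derives its timing from the container's health config.
// Zero values fall back to the package defaults (30s interval, 30s timeout).
// The function name is hypothetical.
func exampleProbeTiming(cfg *containertypes.HealthConfig) (interval, timeout time.Duration) {
	interval = timeoutWithDefault(cfg.Interval, defaultProbeInterval)
	timeout = timeoutWithDefault(cfg.Timeout, defaultProbeTimeout)
	return interval, timeout
}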

// Ensure the health-check monitor is running or not, depending on the current
// state of the container.
// Called from monitor.go, with c locked.
func (d *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(d, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}

// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (d *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is set up then don't init the monitor
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting
	d.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.SetStatus(types.Starting)
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.SetStatus(types.Starting)
		c.State.Health = h
	}

	d.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (d *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}

// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := min(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
	return dataLen, nil
}

// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}

// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}

func getShell(config *containertypes.Config) []string {
	if len(config.Shell) != 0 {
		return config.Shell
	}
	if runtime.GOOS != "windows" {
		return []string{"/bin/sh", "-c"}
	}
	return []string{"cmd", "/S", "/C"}
}
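
// Illustration only, not part of the upstream file: a minimal sketch showing
// how limitedBuffer caps probe output. Writing past maxOutputLen keeps the
// first 4096 bytes and marks the buffer truncated, so String appends "...".
// The function name is hypothetical.
func exampleLimitedBufferTruncation() string {
	b := &limitedBuffer{}
	b.Write(bytes.Repeat([]byte("x"), maxOutputLen+10)) // 10 bytes too many
	return b.String() // 4096 "x"s followed by "..."
}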