// Origin (page header): github.com/jen20/docker@v1.13.1/daemon/health.go

package daemon

import (
	"bytes"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/context"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/api/types"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
)

const (
	// Longest healthcheck probe output message to store, in bytes.
	// Longer messages will be truncated (see limitedBuffer).
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	// Used when the container's Healthcheck.Interval is zero.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	// Used when the container's Healthcheck.Timeout is zero.
	defaultProbeTimeout = 30 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	// Used when the container's Healthcheck.Retries is zero or negative.
	defaultProbeRetries = 3

	// Maximum number of probe results to keep in a container's health log.
	// When the log is full the oldest entry is dropped to make room.
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.
	// handleProbeResult treats every exit code other than exitStatusHealthy
	// as a failure.

	exitStatusHealthy   = 0 // Container is healthy
	exitStatusUnhealthy = 1 // Container is unhealthy
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the probe's result (exit code
	// and captured output), or an error if the probe could not be run to
	// completion.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type (and, with shell set, "CMD-SHELL").
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// exec the healthcheck command in the container.
62 // Returns the exit code and probe output (if any) 63 func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) { 64 65 cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:] 66 if p.shell { 67 cmdSlice = append(getShell(container.Config), cmdSlice...) 68 } 69 entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice) 70 execConfig := exec.NewConfig() 71 execConfig.OpenStdin = false 72 execConfig.OpenStdout = true 73 execConfig.OpenStderr = true 74 execConfig.ContainerID = container.ID 75 execConfig.DetachKeys = []byte{} 76 execConfig.Entrypoint = entrypoint 77 execConfig.Args = args 78 execConfig.Tty = false 79 execConfig.Privileged = false 80 execConfig.User = container.Config.User 81 82 d.registerExecCommand(container, execConfig) 83 d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " ")) 84 85 output := &limitedBuffer{} 86 err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output) 87 if err != nil { 88 return nil, err 89 } 90 info, err := d.getExecConfig(execConfig.ID) 91 if err != nil { 92 return nil, err 93 } 94 if info.ExitCode == nil { 95 return nil, fmt.Errorf("Healthcheck for container %s has no exit code!", container.ID) 96 } 97 // Note: Go's json package will handle invalid UTF-8 for us 98 out := output.String() 99 return &types.HealthcheckResult{ 100 End: time.Now(), 101 ExitCode: *info.ExitCode, 102 Output: out, 103 }, nil 104 } 105 106 // Update the container's Status.Health struct based on the latest probe's result. 107 func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) { 108 c.Lock() 109 defer c.Unlock() 110 111 // probe may have been cancelled while waiting on lock. 
Ignore result then 112 select { 113 case <-done: 114 return 115 default: 116 } 117 118 retries := c.Config.Healthcheck.Retries 119 if retries <= 0 { 120 retries = defaultProbeRetries 121 } 122 123 h := c.State.Health 124 oldStatus := h.Status 125 126 if len(h.Log) >= maxLogEntries { 127 h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result) 128 } else { 129 h.Log = append(h.Log, result) 130 } 131 132 if result.ExitCode == exitStatusHealthy { 133 h.FailingStreak = 0 134 h.Status = types.Healthy 135 } else { 136 // Failure (including invalid exit code) 137 h.FailingStreak++ 138 if h.FailingStreak >= retries { 139 h.Status = types.Unhealthy 140 } 141 // Else we're starting or healthy. Stay in that state. 142 } 143 144 if oldStatus != h.Status { 145 d.LogContainerEvent(c, "health_status: "+h.Status) 146 } 147 } 148 149 // Run the container's monitoring thread until notified via "stop". 150 // There is never more than one monitor thread running per container at a time. 151 func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) { 152 probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout) 153 probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval) 154 for { 155 select { 156 case <-stop: 157 logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID) 158 return 159 case <-time.After(probeInterval): 160 logrus.Debugf("Running health check for container %s ...", c.ID) 161 startTime := time.Now() 162 ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout) 163 results := make(chan *types.HealthcheckResult) 164 go func() { 165 healthChecksCounter.Inc() 166 result, err := probe.run(ctx, d, c) 167 if err != nil { 168 healthChecksFailedCounter.Inc() 169 logrus.Warnf("Health check for container %s error: %v", c.ID, err) 170 results <- &types.HealthcheckResult{ 171 ExitCode: -1, 172 Output: err.Error(), 173 Start: startTime, 174 End: 
time.Now(), 175 } 176 } else { 177 result.Start = startTime 178 logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode) 179 results <- result 180 } 181 close(results) 182 }() 183 select { 184 case <-stop: 185 logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID) 186 // Stop timeout and kill probe, but don't wait for probe to exit. 187 cancelProbe() 188 return 189 case result := <-results: 190 handleProbeResult(d, c, result, stop) 191 // Stop timeout 192 cancelProbe() 193 case <-ctx.Done(): 194 logrus.Debugf("Health check for container %s taking too long", c.ID) 195 handleProbeResult(d, c, &types.HealthcheckResult{ 196 ExitCode: -1, 197 Output: fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout), 198 Start: startTime, 199 End: time.Now(), 200 }, stop) 201 cancelProbe() 202 // Wait for probe to exit (it might take a while to respond to the TERM 203 // signal and we don't want dying probes to pile up). 204 <-results 205 } 206 } 207 } 208 } 209 210 // Get a suitable probe implementation for the container's healthcheck configuration. 211 // Nil will be returned if no healthcheck was configured or NONE was set. 212 func getProbe(c *container.Container) probe { 213 config := c.Config.Healthcheck 214 if config == nil || len(config.Test) == 0 { 215 return nil 216 } 217 switch config.Test[0] { 218 case "CMD": 219 return &cmdProbe{shell: false} 220 case "CMD-SHELL": 221 return &cmdProbe{shell: true} 222 default: 223 logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID) 224 return nil 225 } 226 } 227 228 // Ensure the health-check monitor is running or not, depending on the current 229 // state of the container. 230 // Called from monitor.go, with c locked. 
231 func (d *Daemon) updateHealthMonitor(c *container.Container) { 232 h := c.State.Health 233 if h == nil { 234 return // No healthcheck configured 235 } 236 237 probe := getProbe(c) 238 wantRunning := c.Running && !c.Paused && probe != nil 239 if wantRunning { 240 if stop := h.OpenMonitorChannel(); stop != nil { 241 go monitor(d, c, stop, probe) 242 } 243 } else { 244 h.CloseMonitorChannel() 245 } 246 } 247 248 // Reset the health state for a newly-started, restarted or restored container. 249 // initHealthMonitor is called from monitor.go and we should never be running 250 // two instances at once. 251 // Called with c locked. 252 func (d *Daemon) initHealthMonitor(c *container.Container) { 253 // If no healthcheck is setup then don't init the monitor 254 if getProbe(c) == nil { 255 return 256 } 257 258 // This is needed in case we're auto-restarting 259 d.stopHealthchecks(c) 260 261 if h := c.State.Health; h != nil { 262 h.Status = types.Starting 263 h.FailingStreak = 0 264 } else { 265 h := &container.Health{} 266 h.Status = types.Starting 267 c.State.Health = h 268 } 269 270 d.updateHealthMonitor(c) 271 } 272 273 // Called when the container is being stopped (whether because the health check is 274 // failing or for any other reason). 275 func (d *Daemon) stopHealthchecks(c *container.Container) { 276 h := c.State.Health 277 if h != nil { 278 h.CloseMonitorChannel() 279 } 280 } 281 282 // Buffer up to maxOutputLen bytes. Further data is discarded. 283 type limitedBuffer struct { 284 buf bytes.Buffer 285 mu sync.Mutex 286 truncated bool // indicates that data has been lost 287 } 288 289 // Append to limitedBuffer while there is room. 
290 func (b *limitedBuffer) Write(data []byte) (int, error) { 291 b.mu.Lock() 292 defer b.mu.Unlock() 293 294 bufLen := b.buf.Len() 295 dataLen := len(data) 296 keep := min(maxOutputLen-bufLen, dataLen) 297 if keep > 0 { 298 b.buf.Write(data[:keep]) 299 } 300 if keep < dataLen { 301 b.truncated = true 302 } 303 return dataLen, nil 304 } 305 306 // The contents of the buffer, with "..." appended if it overflowed. 307 func (b *limitedBuffer) String() string { 308 b.mu.Lock() 309 defer b.mu.Unlock() 310 311 out := b.buf.String() 312 if b.truncated { 313 out = out + "..." 314 } 315 return out 316 } 317 318 // If configuredValue is zero, use defaultValue instead. 319 func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration { 320 if configuredValue == 0 { 321 return defaultValue 322 } 323 return configuredValue 324 } 325 326 func min(x, y int) int { 327 if x < y { 328 return x 329 } 330 return y 331 } 332 333 func getShell(config *containertypes.Config) []string { 334 if len(config.Shell) != 0 { 335 return config.Shell 336 } 337 if runtime.GOOS != "windows" { 338 return []string{"/bin/sh", "-c"} 339 } 340 return []string{"cmd", "/S", "/C"} 341 }