github.com/kim0/docker@v0.6.2-0.20161130212042-4addda3f07e7/daemon/health.go (about) 1 package daemon 2 3 import ( 4 "bytes" 5 "fmt" 6 "runtime" 7 "strings" 8 "sync" 9 "time" 10 11 "golang.org/x/net/context" 12 13 "github.com/Sirupsen/logrus" 14 "github.com/docker/docker/api/types" 15 "github.com/docker/docker/api/types/strslice" 16 "github.com/docker/docker/container" 17 "github.com/docker/docker/daemon/exec" 18 ) 19 20 const ( 21 // Longest healthcheck probe output message to store. Longer messages will be truncated. 22 maxOutputLen = 4096 23 24 // Default interval between probe runs (from the end of the first to the start of the second). 25 // Also the time before the first probe. 26 defaultProbeInterval = 30 * time.Second 27 28 // The maximum length of time a single probe run should take. If the probe takes longer 29 // than this, the check is considered to have failed. 30 defaultProbeTimeout = 30 * time.Second 31 32 // Default number of consecutive failures of the health check 33 // for the container to be considered unhealthy. 34 defaultProbeRetries = 3 35 36 // Maximum number of entries to record 37 maxLogEntries = 5 38 ) 39 40 const ( 41 // Exit status codes that can be returned by the probe command. 42 43 exitStatusHealthy = 0 // Container is healthy 44 exitStatusUnhealthy = 1 // Container is unhealthy 45 ) 46 47 // probe implementations know how to run a particular type of probe. 48 type probe interface { 49 // Perform one run of the check. Returns the exit code and an optional 50 // short diagnostic string. 51 run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error) 52 } 53 54 // cmdProbe implements the "CMD" probe type. 55 type cmdProbe struct { 56 // Run the command with the system's default shell instead of execing it directly. 57 shell bool 58 } 59 60 // exec the healthcheck command in the container. 61 // Returns the exit code and probe output (if any) 62 func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) { 63 64 cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:] 65 if p.shell { 66 if runtime.GOOS != "windows" { 67 cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...) 68 } else { 69 cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...) 70 } 71 } 72 entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice) 73 execConfig := exec.NewConfig() 74 execConfig.OpenStdin = false 75 execConfig.OpenStdout = true 76 execConfig.OpenStderr = true 77 execConfig.ContainerID = container.ID 78 execConfig.DetachKeys = []byte{} 79 execConfig.Entrypoint = entrypoint 80 execConfig.Args = args 81 execConfig.Tty = false 82 execConfig.Privileged = false 83 execConfig.User = container.Config.User 84 85 d.registerExecCommand(container, execConfig) 86 d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " ")) 87 88 output := &limitedBuffer{} 89 err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output) 90 if err != nil { 91 return nil, err 92 } 93 info, err := d.getExecConfig(execConfig.ID) 94 if err != nil { 95 return nil, err 96 } 97 if info.ExitCode == nil { 98 return nil, fmt.Errorf("Healthcheck for container %s has no exit code!", container.ID) 99 } 100 // Note: Go's json package will handle invalid UTF-8 for us 101 out := output.String() 102 return &types.HealthcheckResult{ 103 End: time.Now(), 104 ExitCode: *info.ExitCode, 105 Output: out, 106 }, nil 107 } 108 109 // Update the container's Status.Health struct based on the latest probe's result. 110 func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) { 111 c.Lock() 112 defer c.Unlock() 113 114 retries := c.Config.Healthcheck.Retries 115 if retries <= 0 { 116 retries = defaultProbeRetries 117 } 118 119 h := c.State.Health 120 oldStatus := h.Status 121 122 if len(h.Log) >= maxLogEntries { 123 h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result) 124 } else { 125 h.Log = append(h.Log, result) 126 } 127 128 if result.ExitCode == exitStatusHealthy { 129 h.FailingStreak = 0 130 h.Status = types.Healthy 131 } else { 132 // Failure (including invalid exit code) 133 h.FailingStreak++ 134 if h.FailingStreak >= retries { 135 h.Status = types.Unhealthy 136 } 137 // Else we're starting or healthy. Stay in that state. 138 } 139 140 if oldStatus != h.Status { 141 d.LogContainerEvent(c, "health_status: "+h.Status) 142 } 143 } 144 145 // Run the container's monitoring thread until notified via "stop". 146 // There is never more than one monitor thread running per container at a time. 147 func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) { 148 probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout) 149 probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval) 150 for { 151 select { 152 case <-stop: 153 logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID) 154 return 155 case <-time.After(probeInterval): 156 logrus.Debugf("Running health check for container %s ...", c.ID) 157 startTime := time.Now() 158 ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout) 159 results := make(chan *types.HealthcheckResult) 160 go func() { 161 healthChecksCounter.Inc() 162 result, err := probe.run(ctx, d, c) 163 if err != nil { 164 healthChecksFailedCounter.Inc() 165 logrus.Warnf("Health check for container %s error: %v", c.ID, err) 166 results <- &types.HealthcheckResult{ 167 ExitCode: -1, 168 Output: err.Error(), 169 Start: startTime, 170 End: time.Now(), 171 } 172 } else { 173 result.Start = startTime 174 logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode) 175 results <- result 176 } 177 close(results) 178 }() 179 select { 180 case <-stop: 181 logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID) 182 // Stop timeout and kill probe, but don't wait for probe to exit. 183 cancelProbe() 184 return 185 case result := <-results: 186 handleProbeResult(d, c, result) 187 // Stop timeout 188 cancelProbe() 189 case <-ctx.Done(): 190 logrus.Debugf("Health check for container %s taking too long", c.ID) 191 handleProbeResult(d, c, &types.HealthcheckResult{ 192 ExitCode: -1, 193 Output: fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout), 194 Start: startTime, 195 End: time.Now(), 196 }) 197 cancelProbe() 198 // Wait for probe to exit (it might take a while to respond to the TERM 199 // signal and we don't want dying probes to pile up). 200 <-results 201 } 202 } 203 } 204 } 205 206 // Get a suitable probe implementation for the container's healthcheck configuration. 207 // Nil will be returned if no healthcheck was configured or NONE was set. 208 func getProbe(c *container.Container) probe { 209 config := c.Config.Healthcheck 210 if config == nil || len(config.Test) == 0 { 211 return nil 212 } 213 switch config.Test[0] { 214 case "CMD": 215 return &cmdProbe{shell: false} 216 case "CMD-SHELL": 217 return &cmdProbe{shell: true} 218 default: 219 logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID) 220 return nil 221 } 222 } 223 224 // Ensure the health-check monitor is running or not, depending on the current 225 // state of the container. 226 // Called from monitor.go, with c locked. 227 func (d *Daemon) updateHealthMonitor(c *container.Container) { 228 h := c.State.Health 229 if h == nil { 230 return // No healthcheck configured 231 } 232 233 probe := getProbe(c) 234 wantRunning := c.Running && !c.Paused && probe != nil 235 if wantRunning { 236 if stop := h.OpenMonitorChannel(); stop != nil { 237 go monitor(d, c, stop, probe) 238 } 239 } else { 240 h.CloseMonitorChannel() 241 } 242 } 243 244 // Reset the health state for a newly-started, restarted or restored container. 245 // initHealthMonitor is called from monitor.go and we should never be running 246 // two instances at once. 247 // Called with c locked. 248 func (d *Daemon) initHealthMonitor(c *container.Container) { 249 // If no healthcheck is setup then don't init the monitor 250 if getProbe(c) == nil { 251 return 252 } 253 254 // This is needed in case we're auto-restarting 255 d.stopHealthchecks(c) 256 257 if h := c.State.Health; h != nil { 258 h.Status = types.Starting 259 h.FailingStreak = 0 260 } else { 261 h := &container.Health{} 262 h.Status = types.Starting 263 c.State.Health = h 264 } 265 266 d.updateHealthMonitor(c) 267 } 268 269 // Called when the container is being stopped (whether because the health check is 270 // failing or for any other reason). 271 func (d *Daemon) stopHealthchecks(c *container.Container) { 272 h := c.State.Health 273 if h != nil { 274 h.CloseMonitorChannel() 275 } 276 } 277 278 // Buffer up to maxOutputLen bytes. Further data is discarded. 279 type limitedBuffer struct { 280 buf bytes.Buffer 281 mu sync.Mutex 282 truncated bool // indicates that data has been lost 283 } 284 285 // Append to limitedBuffer while there is room. 286 func (b *limitedBuffer) Write(data []byte) (int, error) { 287 b.mu.Lock() 288 defer b.mu.Unlock() 289 290 bufLen := b.buf.Len() 291 dataLen := len(data) 292 keep := min(maxOutputLen-bufLen, dataLen) 293 if keep > 0 { 294 b.buf.Write(data[:keep]) 295 } 296 if keep < dataLen { 297 b.truncated = true 298 } 299 return dataLen, nil 300 } 301 302 // The contents of the buffer, with "..." appended if it overflowed. 303 func (b *limitedBuffer) String() string { 304 b.mu.Lock() 305 defer b.mu.Unlock() 306 307 out := b.buf.String() 308 if b.truncated { 309 out = out + "..." 310 } 311 return out 312 } 313 314 // If configuredValue is zero, use defaultValue instead. 315 func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration { 316 if configuredValue == 0 { 317 return defaultValue 318 } 319 return configuredValue 320 } 321 322 func min(x, y int) int { 323 if x < y { 324 return x 325 } 326 return y 327 }