// Source: github.com/noxiouz/docker@v0.7.3-0.20160629055221-3d231c78e8c5/daemon/health.go

package daemon

import (
	"bytes"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/context"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/docker/engine-api/types"
	"github.com/docker/engine-api/types/strslice"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy   = 0 // Container is healthy
	exitStatusUnhealthy = 1 // Container is unhealthy
	exitStatusStarting  = 2 // Container needs more time to start
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// exec the healthcheck command in the container.
61 // Returns the exit code and probe output (if any) 62 func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) { 63 cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:] 64 if p.shell { 65 if runtime.GOOS != "windows" { 66 cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...) 67 } else { 68 cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...) 69 } 70 } 71 entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice) 72 execConfig := exec.NewConfig() 73 execConfig.OpenStdin = false 74 execConfig.OpenStdout = true 75 execConfig.OpenStderr = true 76 execConfig.ContainerID = container.ID 77 execConfig.DetachKeys = []byte{} 78 execConfig.Entrypoint = entrypoint 79 execConfig.Args = args 80 execConfig.Tty = false 81 execConfig.Privileged = false 82 execConfig.User = container.Config.User 83 84 d.registerExecCommand(container, execConfig) 85 d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " ")) 86 87 output := &limitedBuffer{} 88 err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output) 89 if err != nil { 90 return nil, err 91 } 92 info, err := d.getExecConfig(execConfig.ID) 93 if err != nil { 94 return nil, err 95 } 96 if info.ExitCode == nil { 97 return nil, fmt.Errorf("Healthcheck has no exit code!") 98 } 99 // Note: Go's json package will handle invalid UTF-8 for us 100 out := output.String() 101 return &types.HealthcheckResult{ 102 End: time.Now(), 103 ExitCode: *info.ExitCode, 104 Output: out, 105 }, nil 106 } 107 108 // Update the container's Status.Health struct based on the latest probe's result. 
109 func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) { 110 c.Lock() 111 defer c.Unlock() 112 113 retries := c.Config.Healthcheck.Retries 114 if retries <= 0 { 115 retries = defaultProbeRetries 116 } 117 118 h := c.State.Health 119 oldStatus := h.Status 120 121 if len(h.Log) >= maxLogEntries { 122 h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result) 123 } else { 124 h.Log = append(h.Log, result) 125 } 126 127 if result.ExitCode == exitStatusHealthy { 128 h.FailingStreak = 0 129 h.Status = types.Healthy 130 } else if result.ExitCode == exitStatusStarting && c.State.Health.Status == types.Starting { 131 // The container is not ready yet. Remain in the starting state. 132 } else { 133 // Failure (including invalid exit code) 134 h.FailingStreak++ 135 if c.State.Health.FailingStreak >= retries { 136 h.Status = types.Unhealthy 137 } 138 // Else we're starting or healthy. Stay in that state. 139 } 140 141 if oldStatus != h.Status { 142 d.LogContainerEvent(c, "health_status: "+h.Status) 143 } 144 } 145 146 // Run the container's monitoring thread until notified via "stop". 147 // There is never more than one monitor thread running per container at a time. 
148 func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) { 149 probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout) 150 probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval) 151 for { 152 select { 153 case <-stop: 154 logrus.Debug("Stop healthcheck monitoring (received while idle)") 155 return 156 case <-time.After(probeInterval): 157 logrus.Debug("Running health check...") 158 startTime := time.Now() 159 ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout) 160 results := make(chan *types.HealthcheckResult) 161 go func() { 162 result, err := probe.run(ctx, d, c) 163 if err != nil { 164 logrus.Warnf("Health check error: %v", err) 165 results <- &types.HealthcheckResult{ 166 ExitCode: -1, 167 Output: err.Error(), 168 Start: startTime, 169 End: time.Now(), 170 } 171 } else { 172 result.Start = startTime 173 logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode) 174 results <- result 175 } 176 close(results) 177 }() 178 select { 179 case <-stop: 180 logrus.Debug("Stop healthcheck monitoring (received while probing)") 181 // Stop timeout and kill probe, but don't wait for probe to exit. 182 cancelProbe() 183 return 184 case result := <-results: 185 handleProbeResult(d, c, result) 186 // Stop timeout 187 cancelProbe() 188 case <-ctx.Done(): 189 logrus.Debug("Health check taking too long") 190 handleProbeResult(d, c, &types.HealthcheckResult{ 191 ExitCode: -1, 192 Output: fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout), 193 Start: startTime, 194 End: time.Now(), 195 }) 196 cancelProbe() 197 // Wait for probe to exit (it might take a while to respond to the TERM 198 // signal and we don't want dying probes to pile up). 199 <-results 200 } 201 } 202 } 203 } 204 205 // Get a suitable probe implementation for the container's healthcheck configuration. 
206 func getProbe(c *container.Container) probe { 207 config := c.Config.Healthcheck 208 if config == nil || len(config.Test) == 0 { 209 return nil 210 } 211 switch config.Test[0] { 212 case "CMD": 213 return &cmdProbe{shell: false} 214 case "CMD-SHELL": 215 return &cmdProbe{shell: true} 216 default: 217 logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0]) 218 return nil 219 } 220 } 221 222 // Ensure the health-check monitor is running or not, depending on the current 223 // state of the container. 224 // Called from monitor.go, with c locked. 225 func (d *Daemon) updateHealthMonitor(c *container.Container) { 226 h := c.State.Health 227 if h == nil { 228 return // No healthcheck configured 229 } 230 231 probe := getProbe(c) 232 wantRunning := c.Running && !c.Paused && probe != nil 233 if wantRunning { 234 if stop := h.OpenMonitorChannel(); stop != nil { 235 go monitor(d, c, stop, probe) 236 } 237 } else { 238 h.CloseMonitorChannel() 239 } 240 } 241 242 // Reset the health state for a newly-started, restarted or restored container. 243 // initHealthMonitor is called from monitor.go and we should never be running 244 // two instances at once. 245 // Called with c locked. 246 func (d *Daemon) initHealthMonitor(c *container.Container) { 247 if c.Config.Healthcheck == nil { 248 return 249 } 250 251 // This is needed in case we're auto-restarting 252 d.stopHealthchecks(c) 253 254 if c.State.Health == nil { 255 h := &container.Health{} 256 h.Status = types.Starting 257 h.FailingStreak = 0 258 c.State.Health = h 259 } 260 261 d.updateHealthMonitor(c) 262 } 263 264 // Called when the container is being stopped (whether because the health check is 265 // failing or for any other reason). 266 func (d *Daemon) stopHealthchecks(c *container.Container) { 267 h := c.State.Health 268 if h != nil { 269 h.CloseMonitorChannel() 270 } 271 } 272 273 // Buffer up to maxOutputLen bytes. Further data is discarded. 
274 type limitedBuffer struct { 275 buf bytes.Buffer 276 truncated bool // indicates that data has been lost 277 } 278 279 // Append to limitedBuffer while there is room. 280 func (b *limitedBuffer) Write(data []byte) (int, error) { 281 bufLen := b.buf.Len() 282 dataLen := len(data) 283 keep := min(maxOutputLen-bufLen, dataLen) 284 if keep > 0 { 285 b.buf.Write(data[:keep]) 286 } 287 if keep < dataLen { 288 b.truncated = true 289 } 290 return dataLen, nil 291 } 292 293 // The contents of the buffer, with "..." appended if it overflowed. 294 func (b *limitedBuffer) String() string { 295 out := b.buf.String() 296 if b.truncated { 297 out = out + "..." 298 } 299 return out 300 } 301 302 // If configuredValue is zero, use defaultValue instead. 303 func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration { 304 if configuredValue == 0 { 305 return defaultValue 306 } 307 return configuredValue 308 } 309 310 func min(x, y int) int { 311 if x < y { 312 return x 313 } 314 return y 315 }