// Origin: github.com/brahmaroutu/docker@v1.2.1-0.20160809185609-eb28dde01f16/daemon/health.go

package daemon

import (
	"bytes"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/context"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/docker/engine-api/types"
	"github.com/docker/engine-api/types/strslice"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy   = 0 // Container is healthy
	exitStatusUnhealthy = 1 // Container is unhealthy
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// exec the healthcheck command in the container.
60 // Returns the exit code and probe output (if any) 61 func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) { 62 cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:] 63 if p.shell { 64 if runtime.GOOS != "windows" { 65 cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...) 66 } else { 67 cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...) 68 } 69 } 70 entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice) 71 execConfig := exec.NewConfig() 72 execConfig.OpenStdin = false 73 execConfig.OpenStdout = true 74 execConfig.OpenStderr = true 75 execConfig.ContainerID = container.ID 76 execConfig.DetachKeys = []byte{} 77 execConfig.Entrypoint = entrypoint 78 execConfig.Args = args 79 execConfig.Tty = false 80 execConfig.Privileged = false 81 execConfig.User = container.Config.User 82 83 d.registerExecCommand(container, execConfig) 84 d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " ")) 85 86 output := &limitedBuffer{} 87 err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output) 88 if err != nil { 89 return nil, err 90 } 91 info, err := d.getExecConfig(execConfig.ID) 92 if err != nil { 93 return nil, err 94 } 95 if info.ExitCode == nil { 96 return nil, fmt.Errorf("Healthcheck has no exit code!") 97 } 98 // Note: Go's json package will handle invalid UTF-8 for us 99 out := output.String() 100 return &types.HealthcheckResult{ 101 End: time.Now(), 102 ExitCode: *info.ExitCode, 103 Output: out, 104 }, nil 105 } 106 107 // Update the container's Status.Health struct based on the latest probe's result. 
108 func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) { 109 c.Lock() 110 defer c.Unlock() 111 112 retries := c.Config.Healthcheck.Retries 113 if retries <= 0 { 114 retries = defaultProbeRetries 115 } 116 117 h := c.State.Health 118 oldStatus := h.Status 119 120 if len(h.Log) >= maxLogEntries { 121 h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result) 122 } else { 123 h.Log = append(h.Log, result) 124 } 125 126 if result.ExitCode == exitStatusHealthy { 127 h.FailingStreak = 0 128 h.Status = types.Healthy 129 } else { 130 // Failure (including invalid exit code) 131 h.FailingStreak++ 132 if h.FailingStreak >= retries { 133 h.Status = types.Unhealthy 134 } 135 // Else we're starting or healthy. Stay in that state. 136 } 137 138 if oldStatus != h.Status { 139 d.LogContainerEvent(c, "health_status: "+h.Status) 140 } 141 } 142 143 // Run the container's monitoring thread until notified via "stop". 144 // There is never more than one monitor thread running per container at a time. 
145 func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) { 146 probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout) 147 probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval) 148 for { 149 select { 150 case <-stop: 151 logrus.Debug("Stop healthcheck monitoring (received while idle)") 152 return 153 case <-time.After(probeInterval): 154 logrus.Debug("Running health check...") 155 startTime := time.Now() 156 ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout) 157 results := make(chan *types.HealthcheckResult) 158 go func() { 159 result, err := probe.run(ctx, d, c) 160 if err != nil { 161 logrus.Warnf("Health check error: %v", err) 162 results <- &types.HealthcheckResult{ 163 ExitCode: -1, 164 Output: err.Error(), 165 Start: startTime, 166 End: time.Now(), 167 } 168 } else { 169 result.Start = startTime 170 logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode) 171 results <- result 172 } 173 close(results) 174 }() 175 select { 176 case <-stop: 177 logrus.Debug("Stop healthcheck monitoring (received while probing)") 178 // Stop timeout and kill probe, but don't wait for probe to exit. 179 cancelProbe() 180 return 181 case result := <-results: 182 handleProbeResult(d, c, result) 183 // Stop timeout 184 cancelProbe() 185 case <-ctx.Done(): 186 logrus.Debug("Health check taking too long") 187 handleProbeResult(d, c, &types.HealthcheckResult{ 188 ExitCode: -1, 189 Output: fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout), 190 Start: startTime, 191 End: time.Now(), 192 }) 193 cancelProbe() 194 // Wait for probe to exit (it might take a while to respond to the TERM 195 // signal and we don't want dying probes to pile up). 196 <-results 197 } 198 } 199 } 200 } 201 202 // Get a suitable probe implementation for the container's healthcheck configuration. 203 // Nil will be returned if no healthcheck was configured or NONE was set. 
204 func getProbe(c *container.Container) probe { 205 config := c.Config.Healthcheck 206 if config == nil || len(config.Test) == 0 { 207 return nil 208 } 209 switch config.Test[0] { 210 case "CMD": 211 return &cmdProbe{shell: false} 212 case "CMD-SHELL": 213 return &cmdProbe{shell: true} 214 default: 215 logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0]) 216 return nil 217 } 218 } 219 220 // Ensure the health-check monitor is running or not, depending on the current 221 // state of the container. 222 // Called from monitor.go, with c locked. 223 func (d *Daemon) updateHealthMonitor(c *container.Container) { 224 h := c.State.Health 225 if h == nil { 226 return // No healthcheck configured 227 } 228 229 probe := getProbe(c) 230 wantRunning := c.Running && !c.Paused && probe != nil 231 if wantRunning { 232 if stop := h.OpenMonitorChannel(); stop != nil { 233 go monitor(d, c, stop, probe) 234 } 235 } else { 236 h.CloseMonitorChannel() 237 } 238 } 239 240 // Reset the health state for a newly-started, restarted or restored container. 241 // initHealthMonitor is called from monitor.go and we should never be running 242 // two instances at once. 243 // Called with c locked. 244 func (d *Daemon) initHealthMonitor(c *container.Container) { 245 // If no healthcheck is setup then don't init the monitor 246 if getProbe(c) == nil { 247 return 248 } 249 250 // This is needed in case we're auto-restarting 251 d.stopHealthchecks(c) 252 253 if c.State.Health == nil { 254 h := &container.Health{} 255 h.Status = types.Starting 256 c.State.Health = h 257 } 258 259 d.updateHealthMonitor(c) 260 } 261 262 // Called when the container is being stopped (whether because the health check is 263 // failing or for any other reason). 264 func (d *Daemon) stopHealthchecks(c *container.Container) { 265 h := c.State.Health 266 if h != nil { 267 h.CloseMonitorChannel() 268 } 269 } 270 271 // Buffer up to maxOutputLen bytes. Further data is discarded. 
272 type limitedBuffer struct { 273 buf bytes.Buffer 274 truncated bool // indicates that data has been lost 275 } 276 277 // Append to limitedBuffer while there is room. 278 func (b *limitedBuffer) Write(data []byte) (int, error) { 279 bufLen := b.buf.Len() 280 dataLen := len(data) 281 keep := min(maxOutputLen-bufLen, dataLen) 282 if keep > 0 { 283 b.buf.Write(data[:keep]) 284 } 285 if keep < dataLen { 286 b.truncated = true 287 } 288 return dataLen, nil 289 } 290 291 // The contents of the buffer, with "..." appended if it overflowed. 292 func (b *limitedBuffer) String() string { 293 out := b.buf.String() 294 if b.truncated { 295 out = out + "..." 296 } 297 return out 298 } 299 300 // If configuredValue is zero, use defaultValue instead. 301 func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration { 302 if configuredValue == 0 { 303 return defaultValue 304 } 305 return configuredValue 306 } 307 308 func min(x, y int) int { 309 if x < y { 310 return x 311 } 312 return y 313 }