// Source: github.com/DaoCloud/dao@v0.0.0-20161212064103-c3dbfd13ee36/daemon/health.go

package daemon

import (
	"bytes"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/context"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/docker/engine-api/types"
	"github.com/docker/engine-api/types/strslice"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of probe results to keep in a container's health log.
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.
	// Any exit code other than exitStatusHealthy is counted as a failure.

	exitStatusHealthy   = 0 // Container is healthy
	exitStatusUnhealthy = 1 // Container is unhealthy
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the probe's result (which carries
	// the exit code and the captured output), or an error if the probe could
	// not be run to completion.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" and "CMD-SHELL" probe types.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// exec the healthcheck command in the container.
61 // Returns the exit code and probe output (if any) 62 func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) { 63 cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:] 64 if p.shell { 65 if runtime.GOOS != "windows" { 66 cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...) 67 } else { 68 cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...) 69 } 70 } 71 entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice) 72 execConfig := exec.NewConfig() 73 execConfig.OpenStdin = false 74 execConfig.OpenStdout = true 75 execConfig.OpenStderr = true 76 execConfig.ContainerID = container.ID 77 execConfig.DetachKeys = []byte{} 78 execConfig.Entrypoint = entrypoint 79 execConfig.Args = args 80 execConfig.Tty = false 81 execConfig.Privileged = false 82 execConfig.User = container.Config.User 83 84 d.registerExecCommand(container, execConfig) 85 d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " ")) 86 87 output := &limitedBuffer{} 88 err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output) 89 if err != nil { 90 return nil, err 91 } 92 info, err := d.getExecConfig(execConfig.ID) 93 if err != nil { 94 return nil, err 95 } 96 if info.ExitCode == nil { 97 return nil, fmt.Errorf("Healthcheck has no exit code!") 98 } 99 // Note: Go's json package will handle invalid UTF-8 for us 100 out := output.String() 101 return &types.HealthcheckResult{ 102 End: time.Now(), 103 ExitCode: *info.ExitCode, 104 Output: out, 105 }, nil 106 } 107 108 // Update the container's Status.Health struct based on the latest probe's result. 
109 func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) { 110 c.Lock() 111 defer c.Unlock() 112 113 retries := c.Config.Healthcheck.Retries 114 if retries <= 0 { 115 retries = defaultProbeRetries 116 } 117 118 h := c.State.Health 119 oldStatus := h.Status 120 121 if len(h.Log) >= maxLogEntries { 122 h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result) 123 } else { 124 h.Log = append(h.Log, result) 125 } 126 127 if result.ExitCode == exitStatusHealthy { 128 h.FailingStreak = 0 129 h.Status = types.Healthy 130 } else { 131 // Failure (including invalid exit code) 132 h.FailingStreak++ 133 if h.FailingStreak >= retries { 134 h.Status = types.Unhealthy 135 } 136 // Else we're starting or healthy. Stay in that state. 137 } 138 139 if oldStatus != h.Status { 140 d.LogContainerEvent(c, "health_status: "+h.Status) 141 } 142 } 143 144 // Run the container's monitoring thread until notified via "stop". 145 // There is never more than one monitor thread running per container at a time. 
146 func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) { 147 probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout) 148 probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval) 149 for { 150 select { 151 case <-stop: 152 logrus.Debug("Stop healthcheck monitoring (received while idle)") 153 return 154 case <-time.After(probeInterval): 155 logrus.Debug("Running health check...") 156 startTime := time.Now() 157 ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout) 158 results := make(chan *types.HealthcheckResult) 159 go func() { 160 result, err := probe.run(ctx, d, c) 161 if err != nil { 162 logrus.Warnf("Health check error: %v", err) 163 results <- &types.HealthcheckResult{ 164 ExitCode: -1, 165 Output: err.Error(), 166 Start: startTime, 167 End: time.Now(), 168 } 169 } else { 170 result.Start = startTime 171 logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode) 172 results <- result 173 } 174 close(results) 175 }() 176 select { 177 case <-stop: 178 logrus.Debug("Stop healthcheck monitoring (received while probing)") 179 // Stop timeout and kill probe, but don't wait for probe to exit. 180 cancelProbe() 181 return 182 case result := <-results: 183 handleProbeResult(d, c, result) 184 // Stop timeout 185 cancelProbe() 186 case <-ctx.Done(): 187 logrus.Debug("Health check taking too long") 188 handleProbeResult(d, c, &types.HealthcheckResult{ 189 ExitCode: -1, 190 Output: fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout), 191 Start: startTime, 192 End: time.Now(), 193 }) 194 cancelProbe() 195 // Wait for probe to exit (it might take a while to respond to the TERM 196 // signal and we don't want dying probes to pile up). 197 <-results 198 } 199 } 200 } 201 } 202 203 // Get a suitable probe implementation for the container's healthcheck configuration. 204 // Nil will be returned if no healthcheck was configured or NONE was set. 
205 func getProbe(c *container.Container) probe { 206 config := c.Config.Healthcheck 207 if config == nil || len(config.Test) == 0 { 208 return nil 209 } 210 switch config.Test[0] { 211 case "CMD": 212 return &cmdProbe{shell: false} 213 case "CMD-SHELL": 214 return &cmdProbe{shell: true} 215 default: 216 logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0]) 217 return nil 218 } 219 } 220 221 // Ensure the health-check monitor is running or not, depending on the current 222 // state of the container. 223 // Called from monitor.go, with c locked. 224 func (d *Daemon) updateHealthMonitor(c *container.Container) { 225 h := c.State.Health 226 if h == nil { 227 return // No healthcheck configured 228 } 229 230 probe := getProbe(c) 231 wantRunning := c.Running && !c.Paused && probe != nil 232 if wantRunning { 233 if stop := h.OpenMonitorChannel(); stop != nil { 234 go monitor(d, c, stop, probe) 235 } 236 } else { 237 h.CloseMonitorChannel() 238 } 239 } 240 241 // Reset the health state for a newly-started, restarted or restored container. 242 // initHealthMonitor is called from monitor.go and we should never be running 243 // two instances at once. 244 // Called with c locked. 245 func (d *Daemon) initHealthMonitor(c *container.Container) { 246 // If no healthcheck is setup then don't init the monitor 247 if getProbe(c) == nil { 248 return 249 } 250 251 // This is needed in case we're auto-restarting 252 d.stopHealthchecks(c) 253 254 if h := c.State.Health; h != nil { 255 h.Status = types.Starting 256 h.FailingStreak = 0 257 } else { 258 h := &container.Health{} 259 h.Status = types.Starting 260 c.State.Health = h 261 } 262 263 d.updateHealthMonitor(c) 264 } 265 266 // Called when the container is being stopped (whether because the health check is 267 // failing or for any other reason). 
268 func (d *Daemon) stopHealthchecks(c *container.Container) { 269 h := c.State.Health 270 if h != nil { 271 h.CloseMonitorChannel() 272 } 273 } 274 275 // Buffer up to maxOutputLen bytes. Further data is discarded. 276 type limitedBuffer struct { 277 buf bytes.Buffer 278 mu sync.Mutex 279 truncated bool // indicates that data has been lost 280 } 281 282 // Append to limitedBuffer while there is room. 283 func (b *limitedBuffer) Write(data []byte) (int, error) { 284 b.mu.Lock() 285 defer b.mu.Unlock() 286 287 bufLen := b.buf.Len() 288 dataLen := len(data) 289 keep := min(maxOutputLen-bufLen, dataLen) 290 if keep > 0 { 291 b.buf.Write(data[:keep]) 292 } 293 if keep < dataLen { 294 b.truncated = true 295 } 296 return dataLen, nil 297 } 298 299 // The contents of the buffer, with "..." appended if it overflowed. 300 func (b *limitedBuffer) String() string { 301 b.mu.Lock() 302 defer b.mu.Unlock() 303 304 out := b.buf.String() 305 if b.truncated { 306 out = out + "..." 307 } 308 return out 309 } 310 311 // If configuredValue is zero, use defaultValue instead. 312 func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration { 313 if configuredValue == 0 { 314 return defaultValue 315 } 316 return configuredValue 317 } 318 319 func min(x, y int) int { 320 if x < y { 321 return x 322 } 323 return y 324 }