// Source: github.com/jiasir/docker@v1.3.3-0.20170609024000-252e610103e7/daemon/health.go

package daemon

import (
	"bytes"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/context"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/api/types"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of probe results to keep in a container's health log.
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy   = 0 // Container is healthy
	exitStatusUnhealthy = 1 // Container is unhealthy
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check against the given container. Returns the
	// outcome of the run as a HealthcheckResult, or a non-nil error.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type.
60 type cmdProbe struct { 61 // Run the command with the system's default shell instead of execing it directly. 62 shell bool 63 } 64 65 // exec the healthcheck command in the container. 66 // Returns the exit code and probe output (if any) 67 func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) { 68 cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:] 69 if p.shell { 70 cmdSlice = append(getShell(cntr.Config), cmdSlice...) 71 } 72 entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice) 73 execConfig := exec.NewConfig() 74 execConfig.OpenStdin = false 75 execConfig.OpenStdout = true 76 execConfig.OpenStderr = true 77 execConfig.ContainerID = cntr.ID 78 execConfig.DetachKeys = []byte{} 79 execConfig.Entrypoint = entrypoint 80 execConfig.Args = args 81 execConfig.Tty = false 82 execConfig.Privileged = false 83 execConfig.User = cntr.Config.User 84 85 linkedEnv, err := d.setupLinkedContainers(cntr) 86 if err != nil { 87 return nil, err 88 } 89 execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env) 90 91 d.registerExecCommand(cntr, execConfig) 92 d.LogContainerEvent(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " ")) 93 94 output := &limitedBuffer{} 95 err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output) 96 if err != nil { 97 return nil, err 98 } 99 info, err := d.getExecConfig(execConfig.ID) 100 if err != nil { 101 return nil, err 102 } 103 if info.ExitCode == nil { 104 return nil, fmt.Errorf("Healthcheck for container %s has no exit code!", cntr.ID) 105 } 106 // Note: Go's json package will handle invalid UTF-8 for us 107 out := output.String() 108 return &types.HealthcheckResult{ 109 End: time.Now(), 110 ExitCode: *info.ExitCode, 111 Output: out, 112 }, nil 113 } 114 115 // Update the container's Status.Health struct based on the latest probe's result. 
116 func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) { 117 c.Lock() 118 defer c.Unlock() 119 120 // probe may have been cancelled while waiting on lock. Ignore result then 121 select { 122 case <-done: 123 return 124 default: 125 } 126 127 retries := c.Config.Healthcheck.Retries 128 if retries <= 0 { 129 retries = defaultProbeRetries 130 } 131 132 h := c.State.Health 133 oldStatus := h.Status 134 135 if len(h.Log) >= maxLogEntries { 136 h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result) 137 } else { 138 h.Log = append(h.Log, result) 139 } 140 141 if result.ExitCode == exitStatusHealthy { 142 h.FailingStreak = 0 143 h.Status = types.Healthy 144 } else { // Failure (including invalid exit code) 145 shouldIncrementStreak := true 146 147 // If the container is starting (i.e. we never had a successful health check) 148 // then we check if we are within the start period of the container in which 149 // case we do not increment the failure streak. 150 if h.Status == types.Starting { 151 startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod) 152 timeSinceStart := result.Start.Sub(c.State.StartedAt) 153 154 // If still within the start period, then don't increment failing streak. 155 if timeSinceStart < startPeriod { 156 shouldIncrementStreak = false 157 } 158 } 159 160 if shouldIncrementStreak { 161 h.FailingStreak++ 162 163 if h.FailingStreak >= retries { 164 h.Status = types.Unhealthy 165 } 166 } 167 // Else we're starting or healthy. Stay in that state. 168 } 169 170 if oldStatus != h.Status { 171 d.LogContainerEvent(c, "health_status: "+h.Status) 172 } 173 } 174 175 // Run the container's monitoring thread until notified via "stop". 176 // There is never more than one monitor thread running per container at a time. 
177 func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) { 178 probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout) 179 probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval) 180 for { 181 select { 182 case <-stop: 183 logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID) 184 return 185 case <-time.After(probeInterval): 186 logrus.Debugf("Running health check for container %s ...", c.ID) 187 startTime := time.Now() 188 ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout) 189 results := make(chan *types.HealthcheckResult) 190 go func() { 191 healthChecksCounter.Inc() 192 result, err := probe.run(ctx, d, c) 193 if err != nil { 194 healthChecksFailedCounter.Inc() 195 logrus.Warnf("Health check for container %s error: %v", c.ID, err) 196 results <- &types.HealthcheckResult{ 197 ExitCode: -1, 198 Output: err.Error(), 199 Start: startTime, 200 End: time.Now(), 201 } 202 } else { 203 result.Start = startTime 204 logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode) 205 results <- result 206 } 207 close(results) 208 }() 209 select { 210 case <-stop: 211 logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID) 212 // Stop timeout and kill probe, but don't wait for probe to exit. 
213 cancelProbe() 214 return 215 case result := <-results: 216 handleProbeResult(d, c, result, stop) 217 // Stop timeout 218 cancelProbe() 219 case <-ctx.Done(): 220 logrus.Debugf("Health check for container %s taking too long", c.ID) 221 handleProbeResult(d, c, &types.HealthcheckResult{ 222 ExitCode: -1, 223 Output: fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout), 224 Start: startTime, 225 End: time.Now(), 226 }, stop) 227 cancelProbe() 228 // Wait for probe to exit (it might take a while to respond to the TERM 229 // signal and we don't want dying probes to pile up). 230 <-results 231 } 232 } 233 } 234 } 235 236 // Get a suitable probe implementation for the container's healthcheck configuration. 237 // Nil will be returned if no healthcheck was configured or NONE was set. 238 func getProbe(c *container.Container) probe { 239 config := c.Config.Healthcheck 240 if config == nil || len(config.Test) == 0 { 241 return nil 242 } 243 switch config.Test[0] { 244 case "CMD": 245 return &cmdProbe{shell: false} 246 case "CMD-SHELL": 247 return &cmdProbe{shell: true} 248 default: 249 logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID) 250 return nil 251 } 252 } 253 254 // Ensure the health-check monitor is running or not, depending on the current 255 // state of the container. 256 // Called from monitor.go, with c locked. 257 func (d *Daemon) updateHealthMonitor(c *container.Container) { 258 h := c.State.Health 259 if h == nil { 260 return // No healthcheck configured 261 } 262 263 probe := getProbe(c) 264 wantRunning := c.Running && !c.Paused && probe != nil 265 if wantRunning { 266 if stop := h.OpenMonitorChannel(); stop != nil { 267 go monitor(d, c, stop, probe) 268 } 269 } else { 270 h.CloseMonitorChannel() 271 } 272 } 273 274 // Reset the health state for a newly-started, restarted or restored container. 
275 // initHealthMonitor is called from monitor.go and we should never be running 276 // two instances at once. 277 // Called with c locked. 278 func (d *Daemon) initHealthMonitor(c *container.Container) { 279 // If no healthcheck is setup then don't init the monitor 280 if getProbe(c) == nil { 281 return 282 } 283 284 // This is needed in case we're auto-restarting 285 d.stopHealthchecks(c) 286 287 if h := c.State.Health; h != nil { 288 h.Status = types.Starting 289 h.FailingStreak = 0 290 } else { 291 h := &container.Health{} 292 h.Status = types.Starting 293 c.State.Health = h 294 } 295 296 d.updateHealthMonitor(c) 297 } 298 299 // Called when the container is being stopped (whether because the health check is 300 // failing or for any other reason). 301 func (d *Daemon) stopHealthchecks(c *container.Container) { 302 h := c.State.Health 303 if h != nil { 304 h.CloseMonitorChannel() 305 } 306 } 307 308 // Buffer up to maxOutputLen bytes. Further data is discarded. 309 type limitedBuffer struct { 310 buf bytes.Buffer 311 mu sync.Mutex 312 truncated bool // indicates that data has been lost 313 } 314 315 // Append to limitedBuffer while there is room. 316 func (b *limitedBuffer) Write(data []byte) (int, error) { 317 b.mu.Lock() 318 defer b.mu.Unlock() 319 320 bufLen := b.buf.Len() 321 dataLen := len(data) 322 keep := min(maxOutputLen-bufLen, dataLen) 323 if keep > 0 { 324 b.buf.Write(data[:keep]) 325 } 326 if keep < dataLen { 327 b.truncated = true 328 } 329 return dataLen, nil 330 } 331 332 // The contents of the buffer, with "..." appended if it overflowed. 333 func (b *limitedBuffer) String() string { 334 b.mu.Lock() 335 defer b.mu.Unlock() 336 337 out := b.buf.String() 338 if b.truncated { 339 out = out + "..." 340 } 341 return out 342 } 343 344 // If configuredValue is zero, use defaultValue instead. 
345 func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration { 346 if configuredValue == 0 { 347 return defaultValue 348 } 349 return configuredValue 350 } 351 352 func min(x, y int) int { 353 if x < y { 354 return x 355 } 356 return y 357 } 358 359 func getShell(config *containertypes.Config) []string { 360 if len(config.Shell) != 0 { 361 return config.Shell 362 } 363 if runtime.GOOS != "windows" { 364 return []string{"/bin/sh", "-c"} 365 } 366 return []string{"cmd", "/S", "/C"} 367 }