package daemon // import "github.com/docker/docker/daemon"

import (
	"bytes"
	"context"
	"fmt"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/docker/docker/api/types"
	containertypes "github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/strslice"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/sirupsen/logrus"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of one to the start of the next).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy = 0 // Container is healthy
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// exec the healthcheck command in the container.
// Returns the exit code and probe output (if any)
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr.Config), cmdSlice...)
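		// Illustrative example (not from the original source): a healthcheck configured as
		// ["CMD-SHELL", "curl -f http://localhost/ || exit 1"] has already had the
		// "CMD-SHELL" marker sliced off above, and with the default Linux shell the
		// command becomes ["/bin/sh", "-c", "curl -f http://localhost/ || exit 1"];
		// a ["CMD", ...] test skips this branch and is exec'd directly.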
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := exec.NewConfig()
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.ContainerID = cntr.ID
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User
	execConfig.WorkingDir = cntr.Config.WorkingDir

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	attributes := map[string]string{
		"execID": execConfig.ID,
	}
	d.LogContainerEventWithAttributes(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "), attributes)

	output := &limitedBuffer{}
	err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
	if err != nil {
		return nil, err
	}
	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	if info.ExitCode == nil {
		return nil, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: *info.ExitCode,
		Output:   out,
	}, nil
}

// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// probe may have been cancelled while waiting on lock. Ignore result then
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status()

	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.SetStatus(types.Healthy)
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health check)
		// then we check if we are within the start period of the container in which
		// case we do not increment the failure streak.
		if h.Status() == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++

			if h.FailingStreak >= retries {
				h.SetStatus(types.Unhealthy)
			}
		}
		// Else we're starting or healthy. Stay in that state.
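
		// Worked example (illustrative, not part of the original file): with the
		// default retries of 3 and the start period already elapsed, three
		// consecutive non-zero exits flip the status to unhealthy; any single zero
		// exit resets FailingStreak and the status to healthy. Failures that land
		// inside the start period are still appended to h.Log above, but do not
		// count toward the streak.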
	}

	// replicate Health status changes
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		// queries will be inconsistent until the next probe runs or other state mutations
		// checkpoint the container
		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	current := h.Status()
	if oldStatus != current {
		d.LogContainerEvent(c, "health_status: "+current)
	}
}

// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
	for {
		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-time.After(probeInterval):
			logrus.Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				// Stop timeout
				cancelProbe()
			case <-ctx.Done():
				logrus.Debugf("Health check for container %s taking too long", c.ID)
				handleProbeResult(d, c, &types.HealthcheckResult{
					ExitCode: -1,
					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
					Start:    startTime,
					End:      time.Now(),
				}, stop)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
			}
		}
	}
}

// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	case "NONE":
		return nil
	default:
		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
		return nil
	}
}

// Ensure the health-check monitor is running or not, depending on the current
// state of the container.
// Called from monitor.go, with c locked.
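//
// Rough sketch of the intended lifecycle (assuming the semantics of
// container.Health's monitor channel, as used below): OpenMonitorChannel hands
// back a fresh stop channel only when no monitor goroutine is active, which is
// what keeps this to a single monitor loop per container, and
// CloseMonitorChannel signals that loop to exit when the container stops,
// pauses, or no longer has a usable probe.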
func (d *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(d, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}

// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (d *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is setup then don't init the monitor
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting
	d.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.SetStatus(types.Starting)
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.SetStatus(types.Starting)
		c.State.Health = h
	}

	d.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (d *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}

// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := min(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
	return dataLen, nil
}

// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}

// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}

func getShell(config *containertypes.Config) []string {
	if len(config.Shell) != 0 {
		return config.Shell
	}
	if runtime.GOOS != "windows" {
		return []string{"/bin/sh", "-c"}
	}
	return []string{"cmd", "/S", "/C"}
}
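
// Illustrative sketch (not part of the original file): a health-check
// configuration like the one below is what drives the code above. getProbe
// selects cmdProbe{shell: true} for the "CMD-SHELL" test, monitor waits
// Interval between runs, cmdProbe.run execs the command through the default
// shell under a Timeout deadline, and handleProbeResult marks the container
// unhealthy after Retries consecutive failures (failures during StartPeriod
// do not count while the container is still "starting").
//
//	healthConfig := &containertypes.HealthConfig{
//		Test:        []string{"CMD-SHELL", "curl -f http://localhost/ || exit 1"},
//		Interval:    30 * time.Second,
//		Timeout:     3 * time.Second,
//		StartPeriod: 10 * time.Second,
//		Retries:     3,
//	}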