github.com/mheon/docker@v0.11.2-0.20150922122814-44f47903a831/daemon/monitor.go

package daemon

import (
	"io"
	"os/exec"
	"sync"
	"time"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/daemon/execdriver"
	"github.com/docker/docker/pkg/stringid"
	"github.com/docker/docker/runconfig"
)

const (
	defaultTimeIncrement = 100
	loggerCloseTimeout   = 10 * time.Second
)

// containerMonitor monitors the execution of a container's main process.
// If a restart policy is specified for the container, the monitor will ensure that the
// process is restarted based on the rules of the policy. When the container is finally stopped,
// the monitor will reset and clean up any of the container's resources, such as networking
// allocations and the rootfs.
type containerMonitor struct {
	mux sync.Mutex

	// container is the container being monitored
	container *Container

	// restartPolicy is the current policy being applied to the container monitor
	restartPolicy runconfig.RestartPolicy

	// failureCount is the number of times the container has failed to
	// start in a row
	failureCount int

	// shouldStop signals the monitor that the next time the container exits it is
	// either because docker or the user asked for the container to be stopped
	shouldStop bool

	// startSignal is a channel that is closed after the container initially starts
	startSignal chan struct{}

	// stopChan is used to signal to the monitor whenever there is a wait for the
	// next restart so that the timeIncrement is not honored and the user is not
	// left waiting for nothing to happen during this time
	stopChan chan struct{}

	// timeIncrement is the amount of time to wait between restarts,
	// in milliseconds
	timeIncrement int

	// lastStartTime is the time at which the monitor last exec'd the container's process
	lastStartTime time.Time
}

// newContainerMonitor returns an initialized containerMonitor for the provided container,
// honoring the provided restart policy
func newContainerMonitor(container *Container, policy runconfig.RestartPolicy) *containerMonitor {
	return &containerMonitor{
		container:     container,
		restartPolicy: policy,
		timeIncrement: defaultTimeIncrement,
		stopChan:      make(chan struct{}),
		startSignal:   make(chan struct{}),
	}
}
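
// A caller is expected to run Start in its own goroutine and wait on
// startSignal to learn when the container's process is up. A minimal sketch of
// that wiring (hypothetical; the daemon's actual start path differs in detail):
//
//	monitor := newContainerMonitor(container, policy)
//	errC := make(chan error, 1)
//	go func() { errC <- monitor.Start() }()
//	select {
//	case <-monitor.startSignal: // process started; Start keeps monitoring it
//	case err := <-errC: // the initial start failed before the callback fired
//		return err
//	}
//
// The stop path is expected to call ExitOnNext before killing the process so
// that the monitor does not restart it.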

// ExitOnNext signals to the container monitor that it should stop monitoring the container
// for exits the next time the process dies
func (m *containerMonitor) ExitOnNext() {
	m.mux.Lock()

	// we need to protect against a double close of the channel when stop is called
	// twice or else we will get a panic
	if !m.shouldStop {
		m.shouldStop = true
		close(m.stopChan)
	}

	m.mux.Unlock()
}

// Close closes the container's resources such as networking allocations and
// unmounts the container's root filesystem
func (m *containerMonitor) Close() error {
	// Cleanup networking and mounts
	m.container.cleanup()

	// FIXME: there is a race condition between two RUN instructions in a Dockerfile
	// because they share the same runconfig and change the image. Must be fixed
	// in builder/builder.go
	if err := m.container.toDisk(); err != nil {
		logrus.Errorf("Error dumping container %s state to disk: %s", m.container.ID, err)

		return err
	}

	return nil
}

// Start starts the container's process and monitors it according to the restart policy
func (m *containerMonitor) Start() error {
	var (
		err        error
		exitStatus execdriver.ExitStatus
		// this variable indicates where we are in the execution flow:
		// before Run or after
		afterRun bool
	)

	// ensure that when the monitor finally exits we release the networking and unmount the rootfs
	defer func() {
		if afterRun {
			m.container.Lock()
			m.container.setStopped(&exitStatus)
			defer m.container.Unlock()
		}
		m.Close()
	}()
	// reset stopped flag
	if m.container.HasBeenManuallyStopped {
		m.container.HasBeenManuallyStopped = false
	}

	// reset the restart count
	m.container.RestartCount = -1

	for {
		m.container.RestartCount++

		if err := m.container.startLogging(); err != nil {
			m.resetContainer(false)

			return err
		}

		pipes := execdriver.NewPipes(m.container.stdin, m.container.stdout, m.container.stderr, m.container.Config.OpenStdin)

		m.container.logEvent("start")

		m.lastStartTime = time.Now()

		if exitStatus, err = m.container.daemon.run(m.container, pipes, m.callback); err != nil {
			// if we receive an internal error from the initial start of a container then let's
			// return it instead of entering the restart loop
			if m.container.RestartCount == 0 {
				m.container.ExitCode = -1
				m.resetContainer(false)

				return err
			}

			logrus.Errorf("Error running container: %s", err)
		}

		// here container.Lock is already lost
		afterRun = true

		m.resetMonitor(err == nil && exitStatus.ExitCode == 0)

		if m.shouldRestart(exitStatus.ExitCode) {
			m.container.setRestarting(&exitStatus)
			if exitStatus.OOMKilled {
				m.container.logEvent("oom")
			}
			m.container.logEvent("die")
			m.resetContainer(true)

			// sleep with a small time increment between each restart to help avoid issues caused by quickly
			// restarting the container because of some types of errors (networking cut out, etc...)
			m.waitForNextRestart()

			// we need to check this before reentering the loop because the waitForNextRestart could have
			// been terminated by a request from a user
			if m.shouldStop {
				return err
			}
			continue
		}
		if exitStatus.OOMKilled {
			m.container.logEvent("oom")
		}
		m.container.logEvent("die")
		m.resetContainer(true)
		return err
	}
}

// resetMonitor resets the stateful fields on the containerMonitor based on the
// previous run's success or failure. Regardless of success, if the container had
// an execution time of more than 10s then reset the timer back to the default
func (m *containerMonitor) resetMonitor(successful bool) {
	executionTime := time.Now().Sub(m.lastStartTime).Seconds()

	if executionTime > 10 {
		m.timeIncrement = defaultTimeIncrement
	} else {
		// otherwise we need to increment the amount of time we wait before restarting
		// the process. We will build up by multiplying the increment by 2
		m.timeIncrement *= 2
	}

	// the container exited successfully so we need to reset the failure counter
	if successful {
		m.failureCount = 0
	} else {
		m.failureCount++
	}
}
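
// For example, with defaultTimeIncrement = 100, a container that keeps dying
// quickly waits 200ms, 400ms, 800ms, ... before successive restarts, because
// resetMonitor doubles timeIncrement after every run shorter than ten seconds;
// the delay drops back to 100ms as soon as a single run lasts longer than that.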

// waitForNextRestart waits with the current time increment to restart the container unless
// a user or docker asks for the container to be stopped
func (m *containerMonitor) waitForNextRestart() {
	select {
	case <-time.After(time.Duration(m.timeIncrement) * time.Millisecond):
	case <-m.stopChan:
	}
}

// shouldRestart checks the restart policy and applies the rules to determine if
// the container's process should be restarted
func (m *containerMonitor) shouldRestart(exitCode int) bool {
	m.mux.Lock()
	defer m.mux.Unlock()

	// do not restart if the user or docker has requested that this container be stopped
	if m.shouldStop {
		m.container.HasBeenManuallyStopped = !m.container.daemon.shutdown
		return false
	}

	switch {
	case m.restartPolicy.IsAlways(), m.restartPolicy.IsUnlessStopped():
		return true
	case m.restartPolicy.IsOnFailure():
		// the default value of 0 for MaximumRetryCount means that we will not enforce a maximum count
		if max := m.restartPolicy.MaximumRetryCount; max != 0 && m.failureCount > max {
			logrus.Debugf("stopping restart of container %s because maximum failure count of %d has been reached",
				stringid.TruncateID(m.container.ID), max)
			return false
		}

		return exitCode != 0
	}

	return false
}
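
// In terms of the CLI restart policies this means, for example:
//
//	--restart=no               never restart (the default case above)
//	--restart=always           restart regardless of the exit code
//	--restart=unless-stopped   restart unless the container was manually stopped
//	--restart=on-failure:3     restart on a non-zero exit code until the failure
//	                           count exceeds 3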

// callback ensures that the container's state is properly updated after we
// receive an ack from the execution driver
func (m *containerMonitor) callback(processConfig *execdriver.ProcessConfig, pid int) error {
	if processConfig.Tty {
		// The callback is called after the process Start()
		// so we are in the parent process. In TTY mode, stdin/out/err is the PtySlave
		// which we close here.
		if c, ok := processConfig.Stdout.(io.Closer); ok {
			c.Close()
		}
	}

	m.container.setRunning(pid)

	// signal that the process has started
	// close channel only if not closed
	select {
	case <-m.startSignal:
	default:
		close(m.startSignal)
	}

	if err := m.container.toDiskLocking(); err != nil {
		logrus.Errorf("Error saving container to disk: %v", err)
	}
	return nil
}

// resetContainer resets the container's IO and ensures that the command is able to be executed again
// by copying the data into a new struct.
// If lock is true, the container is locked during the reset.
func (m *containerMonitor) resetContainer(lock bool) {
	container := m.container
	if lock {
		container.Lock()
		defer container.Unlock()
	}

	if container.Config.OpenStdin {
		if err := container.stdin.Close(); err != nil {
			logrus.Errorf("%s: Error closing stdin: %s", container.ID, err)
		}
	}

	if err := container.stdout.Clean(); err != nil {
		logrus.Errorf("%s: Error closing stdout: %s", container.ID, err)
	}

	if err := container.stderr.Clean(); err != nil {
		logrus.Errorf("%s: Error closing stderr: %s", container.ID, err)
	}

	if container.command != nil && container.command.ProcessConfig.Terminal != nil {
		if err := container.command.ProcessConfig.Terminal.Close(); err != nil {
			logrus.Errorf("%s: Error closing terminal: %s", container.ID, err)
		}
	}

	// Re-create a brand new stdin pipe once the container exits
	if container.Config.OpenStdin {
		container.stdin, container.stdinPipe = io.Pipe()
	}

	if container.logDriver != nil {
		if container.logCopier != nil {
			exit := make(chan struct{})
			go func() {
				container.logCopier.Wait()
				close(exit)
			}()
			select {
			case <-time.After(loggerCloseTimeout):
				logrus.Warnf("Logger didn't exit in time: logs may be truncated")
			case <-exit:
			}
		}
		container.logDriver.Close()
		container.logCopier = nil
		container.logDriver = nil
	}

	c := container.command.ProcessConfig.Cmd

	container.command.ProcessConfig.Cmd = exec.Cmd{
		Stdin:       c.Stdin,
		Stdout:      c.Stdout,
		Stderr:      c.Stderr,
		Path:        c.Path,
		Env:         c.Env,
		ExtraFiles:  c.ExtraFiles,
		Args:        c.Args,
		Dir:         c.Dir,
		SysProcAttr: c.SysProcAttr,
	}
}
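
// Note: resetContainer builds a fresh exec.Cmd above because an os/exec.Cmd
// cannot be started a second time once it has been run; copying the original
// fields into a new struct lets the daemon execute the same command again on
// the next restart.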