github.com/endocode/docker@v1.4.2-0.20160113120958-46eb4700391e/container/monitor.go (about) 1 package container 2 3 import ( 4 "io" 5 "os/exec" 6 "strings" 7 "sync" 8 "syscall" 9 "time" 10 11 "github.com/Sirupsen/logrus" 12 "github.com/docker/docker/daemon/execdriver" 13 derr "github.com/docker/docker/errors" 14 "github.com/docker/docker/pkg/promise" 15 "github.com/docker/docker/pkg/stringid" 16 "github.com/docker/docker/utils" 17 "github.com/docker/engine-api/types/container" 18 ) 19 20 const ( 21 defaultTimeIncrement = 100 22 loggerCloseTimeout = 10 * time.Second 23 ) 24 25 // supervisor defines the interface that a supervisor must implement 26 type supervisor interface { 27 // LogContainerEvent generates events related to a given container 28 LogContainerEvent(*Container, string) 29 // Cleanup ensures that the container is properly unmounted 30 Cleanup(*Container) 31 // StartLogging starts the logging driver for the container 32 StartLogging(*Container) error 33 // Run starts a container 34 Run(c *Container, pipes *execdriver.Pipes, startCallback execdriver.DriverCallback) (execdriver.ExitStatus, error) 35 // IsShuttingDown tells whether the supervisor is shutting down or not 36 IsShuttingDown() bool 37 } 38 39 // containerMonitor monitors the execution of a container's main process. 40 // If a restart policy is specified for the container the monitor will ensure that the 41 // process is restarted based on the rules of the policy. When the container is finally stopped 42 // the monitor will reset and cleanup any of the container resources such as networking allocations 43 // and the rootfs 44 type containerMonitor struct { 45 mux sync.Mutex 46 47 // supervisor keeps track of the container and the events it generates 48 supervisor supervisor 49 50 // container is the container being monitored 51 container *Container 52 53 // restartPolicy is the current policy being applied to the container monitor 54 restartPolicy container.RestartPolicy 55 56 // failureCount is the number of times the container has failed to 57 // start in a row 58 failureCount int 59 60 // shouldStop signals the monitor that the next time the container exits it is 61 // either because docker or the user asked for the container to be stopped 62 shouldStop bool 63 64 // startSignal is a channel that is closes after the container initially starts 65 startSignal chan struct{} 66 67 // stopChan is used to signal to the monitor whenever there is a wait for the 68 // next restart so that the timeIncrement is not honored and the user is not 69 // left waiting for nothing to happen during this time 70 stopChan chan struct{} 71 72 // timeIncrement is the amount of time to wait between restarts 73 // this is in milliseconds 74 timeIncrement int 75 76 // lastStartTime is the time which the monitor last exec'd the container's process 77 lastStartTime time.Time 78 } 79 80 // StartMonitor initializes a containerMonitor for this container with the provided supervisor and restart policy 81 // and starts the container's process. 82 func (container *Container) StartMonitor(s supervisor, policy container.RestartPolicy) error { 83 container.Lock() 84 container.monitor = &containerMonitor{ 85 supervisor: s, 86 container: container, 87 restartPolicy: policy, 88 timeIncrement: defaultTimeIncrement, 89 stopChan: make(chan struct{}), 90 startSignal: make(chan struct{}), 91 } 92 container.Unlock() 93 94 return container.monitor.wait() 95 } 96 97 // wait starts the container and wait until 98 // we either receive an error from the initial start of the container's 99 // process or until the process is running in the container 100 func (m *containerMonitor) wait() error { 101 select { 102 case <-m.startSignal: 103 case err := <-promise.Go(m.start): 104 return err 105 } 106 107 return nil 108 } 109 110 // Stop signals to the container monitor that it should stop monitoring the container 111 // for exits the next time the process dies 112 func (m *containerMonitor) ExitOnNext() { 113 m.mux.Lock() 114 115 // we need to protect having a double close of the channel when stop is called 116 // twice or else we will get a panic 117 if !m.shouldStop { 118 m.shouldStop = true 119 close(m.stopChan) 120 } 121 122 m.mux.Unlock() 123 } 124 125 // Close closes the container's resources such as networking allocations and 126 // unmounts the container's root filesystem 127 func (m *containerMonitor) Close() error { 128 // Cleanup networking and mounts 129 m.supervisor.Cleanup(m.container) 130 131 // FIXME: here is race condition between two RUN instructions in Dockerfile 132 // because they share same runconfig and change image. Must be fixed 133 // in builder/builder.go 134 if err := m.container.ToDisk(); err != nil { 135 logrus.Errorf("Error dumping container %s state to disk: %s", m.container.ID, err) 136 137 return err 138 } 139 140 return nil 141 } 142 143 // Start starts the containers process and monitors it according to the restart policy 144 func (m *containerMonitor) start() error { 145 var ( 146 err error 147 exitStatus execdriver.ExitStatus 148 // this variable indicates where we in execution flow: 149 // before Run or after 150 afterRun bool 151 ) 152 153 // ensure that when the monitor finally exits we release the networking and unmount the rootfs 154 defer func() { 155 if afterRun { 156 m.container.Lock() 157 defer m.container.Unlock() 158 m.container.SetStopped(&exitStatus) 159 } 160 m.Close() 161 }() 162 163 m.container.Lock() 164 // reset stopped flag 165 if m.container.HasBeenManuallyStopped { 166 m.container.HasBeenManuallyStopped = false 167 } 168 169 // reset the restart count 170 m.container.RestartCount = -1 171 172 for { 173 m.container.RestartCount++ 174 175 if err := m.supervisor.StartLogging(m.container); err != nil { 176 m.resetContainer(false) 177 178 m.container.Unlock() 179 return err 180 } 181 182 pipes := execdriver.NewPipes(m.container.Stdin(), m.container.Stdout(), m.container.Stderr(), m.container.Config.OpenStdin) 183 m.container.Unlock() 184 185 m.logEvent("start") 186 187 m.lastStartTime = time.Now() 188 189 // don't lock Run because m.callback has own lock 190 if exitStatus, err = m.supervisor.Run(m.container, pipes, m.callback); err != nil { 191 m.container.Lock() 192 // if we receive an internal error from the initial start of a container then lets 193 // return it instead of entering the restart loop 194 // set to 127 for container cmd not found/does not exist) 195 if strings.Contains(err.Error(), "executable file not found") || 196 strings.Contains(err.Error(), "no such file or directory") || 197 strings.Contains(err.Error(), "system cannot find the file specified") { 198 if m.container.RestartCount == 0 { 199 m.container.ExitCode = 127 200 m.resetContainer(false) 201 m.container.Unlock() 202 return derr.ErrorCodeCmdNotFound 203 } 204 } 205 // set to 126 for container cmd can't be invoked errors 206 if strings.Contains(err.Error(), syscall.EACCES.Error()) { 207 if m.container.RestartCount == 0 { 208 m.container.ExitCode = 126 209 m.resetContainer(false) 210 m.container.Unlock() 211 return derr.ErrorCodeCmdCouldNotBeInvoked 212 } 213 } 214 215 if m.container.RestartCount == 0 { 216 m.container.ExitCode = -1 217 m.resetContainer(false) 218 219 m.container.Unlock() 220 return derr.ErrorCodeCantStart.WithArgs(m.container.ID, utils.GetErrorMessage(err)) 221 } 222 223 m.container.Unlock() 224 logrus.Errorf("Error running container: %s", err) 225 } // end if 226 227 // here container.Lock is already lost 228 afterRun = true 229 230 m.resetMonitor(err == nil && exitStatus.ExitCode == 0) 231 232 if m.shouldRestart(exitStatus.ExitCode) { 233 m.container.SetRestarting(&exitStatus) 234 m.logEvent("die") 235 m.resetContainer(true) 236 237 // sleep with a small time increment between each restart to help avoid issues cased by quickly 238 // restarting the container because of some types of errors ( networking cut out, etc... ) 239 m.waitForNextRestart() 240 241 // we need to check this before reentering the loop because the waitForNextRestart could have 242 // been terminated by a request from a user 243 if m.shouldStop { 244 return err 245 } 246 m.container.Lock() 247 continue 248 } 249 250 m.logEvent("die") 251 m.resetContainer(true) 252 return err 253 } // end for 254 } 255 256 // resetMonitor resets the stateful fields on the containerMonitor based on the 257 // previous runs success or failure. Regardless of success, if the container had 258 // an execution time of more than 10s then reset the timer back to the default 259 func (m *containerMonitor) resetMonitor(successful bool) { 260 executionTime := time.Now().Sub(m.lastStartTime).Seconds() 261 262 if executionTime > 10 { 263 m.timeIncrement = defaultTimeIncrement 264 } else { 265 // otherwise we need to increment the amount of time we wait before restarting 266 // the process. We will build up by multiplying the increment by 2 267 m.timeIncrement *= 2 268 } 269 270 // the container exited successfully so we need to reset the failure counter 271 if successful { 272 m.failureCount = 0 273 } else { 274 m.failureCount++ 275 } 276 } 277 278 // waitForNextRestart waits with the default time increment to restart the container unless 279 // a user or docker asks for the container to be stopped 280 func (m *containerMonitor) waitForNextRestart() { 281 select { 282 case <-time.After(time.Duration(m.timeIncrement) * time.Millisecond): 283 case <-m.stopChan: 284 } 285 } 286 287 // shouldRestart checks the restart policy and applies the rules to determine if 288 // the container's process should be restarted 289 func (m *containerMonitor) shouldRestart(exitCode int) bool { 290 m.mux.Lock() 291 defer m.mux.Unlock() 292 293 // do not restart if the user or docker has requested that this container be stopped 294 if m.shouldStop { 295 m.container.HasBeenManuallyStopped = !m.supervisor.IsShuttingDown() 296 return false 297 } 298 299 switch { 300 case m.restartPolicy.IsAlways(), m.restartPolicy.IsUnlessStopped(): 301 return true 302 case m.restartPolicy.IsOnFailure(): 303 // the default value of 0 for MaximumRetryCount means that we will not enforce a maximum count 304 if max := m.restartPolicy.MaximumRetryCount; max != 0 && m.failureCount > max { 305 logrus.Debugf("stopping restart of container %s because maximum failure could of %d has been reached", 306 stringid.TruncateID(m.container.ID), max) 307 return false 308 } 309 310 return exitCode != 0 311 } 312 313 return false 314 } 315 316 // callback ensures that the container's state is properly updated after we 317 // received ack from the execution drivers 318 func (m *containerMonitor) callback(processConfig *execdriver.ProcessConfig, pid int, chOOM <-chan struct{}) error { 319 go func() { 320 for range chOOM { 321 m.logEvent("oom") 322 } 323 }() 324 325 if processConfig.Tty { 326 // The callback is called after the process start() 327 // so we are in the parent process. In TTY mode, stdin/out/err is the PtySlave 328 // which we close here. 329 if c, ok := processConfig.Stdout.(io.Closer); ok { 330 c.Close() 331 } 332 } 333 334 m.container.SetRunningLocking(pid) 335 336 // signal that the process has started 337 // close channel only if not closed 338 select { 339 case <-m.startSignal: 340 default: 341 close(m.startSignal) 342 } 343 344 if err := m.container.ToDiskLocking(); err != nil { 345 logrus.Errorf("Error saving container to disk: %v", err) 346 } 347 return nil 348 } 349 350 // resetContainer resets the container's IO and ensures that the command is able to be executed again 351 // by copying the data into a new struct 352 // if lock is true, then container locked during reset 353 func (m *containerMonitor) resetContainer(lock bool) { 354 container := m.container 355 if lock { 356 container.Lock() 357 defer container.Unlock() 358 } 359 360 if err := container.CloseStreams(); err != nil { 361 logrus.Errorf("%s: %s", container.ID, err) 362 } 363 364 if container.Command != nil && container.Command.ProcessConfig.Terminal != nil { 365 if err := container.Command.ProcessConfig.Terminal.Close(); err != nil { 366 logrus.Errorf("%s: Error closing terminal: %s", container.ID, err) 367 } 368 } 369 370 // Re-create a brand new stdin pipe once the container exited 371 if container.Config.OpenStdin { 372 container.NewInputPipes() 373 } 374 375 if container.LogDriver != nil { 376 if container.LogCopier != nil { 377 exit := make(chan struct{}) 378 go func() { 379 container.LogCopier.Wait() 380 close(exit) 381 }() 382 select { 383 case <-time.After(loggerCloseTimeout): 384 logrus.Warnf("Logger didn't exit in time: logs may be truncated") 385 case <-exit: 386 } 387 } 388 container.LogDriver.Close() 389 container.LogCopier = nil 390 container.LogDriver = nil 391 } 392 393 c := container.Command.ProcessConfig.Cmd 394 395 container.Command.ProcessConfig.Cmd = exec.Cmd{ 396 Stdin: c.Stdin, 397 Stdout: c.Stdout, 398 Stderr: c.Stderr, 399 Path: c.Path, 400 Env: c.Env, 401 ExtraFiles: c.ExtraFiles, 402 Args: c.Args, 403 Dir: c.Dir, 404 SysProcAttr: c.SysProcAttr, 405 } 406 } 407 408 func (m *containerMonitor) logEvent(action string) { 409 m.supervisor.LogContainerEvent(m.container, action) 410 }