github.com/hernad/nomad@v1.6.112/drivers/docker/handle.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package docker 5 6 import ( 7 "context" 8 "fmt" 9 "os" 10 "runtime" 11 "strings" 12 "sync" 13 "syscall" 14 "time" 15 16 "github.com/armon/circbuf" 17 docker "github.com/fsouza/go-dockerclient" 18 "github.com/hernad/consul-template/signals" 19 "github.com/hashicorp/go-hclog" 20 "github.com/hashicorp/go-plugin" 21 22 "github.com/hernad/nomad/drivers/docker/docklog" 23 "github.com/hernad/nomad/plugins/drivers" 24 pstructs "github.com/hernad/nomad/plugins/shared/structs" 25 ) 26 27 type taskHandle struct { 28 // dockerClient is useful for normal docker API calls. It should be used 29 // for all calls that aren't Wait() or Stop() (and their variations). 30 dockerClient *docker.Client 31 32 // infinityClient is useful for 33 // - the Wait docker API call(s) (no limit on container lifetime) 34 // - the Stop docker API call(s) (context with task kill_timeout required) 35 // Do not use this client for any other docker API calls, instead use the 36 // normal dockerClient which includes a default timeout. 37 infinityClient *docker.Client 38 39 logger hclog.Logger 40 dlogger docklog.DockerLogger 41 dloggerPluginClient *plugin.Client 42 task *drivers.TaskConfig 43 containerID string 44 containerImage string 45 doneCh chan bool 46 waitCh chan struct{} 47 removeContainerOnExit bool 48 net *drivers.DriverNetwork 49 50 exitResult *drivers.ExitResult 51 exitResultLock sync.Mutex 52 } 53 54 func (h *taskHandle) ExitResult() *drivers.ExitResult { 55 h.exitResultLock.Lock() 56 defer h.exitResultLock.Unlock() 57 return h.exitResult.Copy() 58 } 59 60 type taskHandleState struct { 61 // ReattachConfig for the docker logger plugin 62 ReattachConfig *pstructs.ReattachConfig 63 64 ContainerID string 65 DriverNetwork *drivers.DriverNetwork 66 } 67 68 func (h *taskHandle) buildState() *taskHandleState { 69 s := &taskHandleState{ 70 ContainerID: h.containerID, 71 DriverNetwork: h.net, 72 } 73 if h.dloggerPluginClient != nil { 74 s.ReattachConfig = pstructs.ReattachConfigFromGoPlugin(h.dloggerPluginClient.ReattachConfig()) 75 } 76 return s 77 } 78 79 func (h *taskHandle) Exec(ctx context.Context, cmd string, args []string) (*drivers.ExecTaskResult, error) { 80 fullCmd := make([]string, len(args)+1) 81 fullCmd[0] = cmd 82 copy(fullCmd[1:], args) 83 createExecOpts := docker.CreateExecOptions{ 84 AttachStdin: false, 85 AttachStdout: true, 86 AttachStderr: true, 87 Tty: false, 88 Cmd: fullCmd, 89 Container: h.containerID, 90 Context: ctx, 91 } 92 exec, err := h.dockerClient.CreateExec(createExecOpts) 93 if err != nil { 94 return nil, err 95 } 96 97 execResult := &drivers.ExecTaskResult{ExitResult: &drivers.ExitResult{}} 98 stdout, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize)) 99 stderr, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize)) 100 startOpts := docker.StartExecOptions{ 101 Detach: false, 102 Tty: false, 103 OutputStream: stdout, 104 ErrorStream: stderr, 105 Context: ctx, 106 } 107 if err := h.dockerClient.StartExec(exec.ID, startOpts); err != nil { 108 return nil, err 109 } 110 execResult.Stdout = stdout.Bytes() 111 execResult.Stderr = stderr.Bytes() 112 res, err := h.dockerClient.InspectExec(exec.ID) 113 if err != nil { 114 return execResult, err 115 } 116 117 execResult.ExitResult.ExitCode = res.ExitCode 118 return execResult, nil 119 } 120 121 func (h *taskHandle) Signal(ctx context.Context, s os.Signal) error { 122 // Convert types 123 sysSig, ok := s.(syscall.Signal) 124 if !ok { 125 return fmt.Errorf("Failed to determine signal number") 126 } 127 128 // TODO When we expose signals we will need a mapping layer that converts 129 // MacOS signals to the correct signal number for docker. Or we change the 130 // interface to take a signal string and leave it up to driver to map? 131 132 opts := docker.KillContainerOptions{ 133 ID: h.containerID, 134 Signal: docker.Signal(sysSig), 135 Context: ctx, 136 } 137 138 // remember Kill just means send a signal; this is not the complex StopContainer case 139 return h.dockerClient.KillContainer(opts) 140 } 141 142 // parseSignal interprets the signal name into an os.Signal. If no name is 143 // provided, the docker driver defaults to SIGTERM. If the OS is Windows and 144 // SIGINT is provided, the signal is converted to SIGTERM. 145 func parseSignal(os, signal string) (os.Signal, error) { 146 // Unlike other drivers, docker defaults to SIGTERM, aiming for consistency 147 // with the 'docker stop' command. 148 // https://docs.docker.com/engine/reference/commandline/stop/#extended-description 149 if signal == "" { 150 signal = "SIGTERM" 151 } 152 153 // Windows Docker daemon does not support SIGINT, SIGTERM is the semantic equivalent that 154 // allows for graceful shutdown before being followed up by a SIGKILL. 155 // Supported signals: 156 // https://github.com/moby/moby/blob/0111ee70874a4947d93f64b672f66a2a35071ee2/pkg/signal/signal_windows.go#L17-L26 157 if os == "windows" && signal == "SIGINT" { 158 signal = "SIGTERM" 159 } 160 161 return signals.Parse(signal) 162 } 163 164 // Kill is used to terminate the task. 165 func (h *taskHandle) Kill(killTimeout time.Duration, signal string) error { 166 var err error 167 // Calling StopContainer lets docker handle the stop signal (specified 168 // in the Dockerfile or defaulting to SIGTERM). If kill_signal is specified, 169 // Signal is used to kill the container with the desired signal before 170 // calling StopContainer 171 if signal == "" { 172 // give the context timeout some wiggle room beyond the kill timeout 173 // docker will use, so we can happy path even in the force kill case 174 graciousTimeout := killTimeout + dockerTimeout 175 ctx, cancel := context.WithTimeout(context.Background(), graciousTimeout) 176 defer cancel() 177 apiTimeout := uint(killTimeout.Seconds()) 178 err = h.infinityClient.StopContainerWithContext(h.containerID, apiTimeout, ctx) 179 } else { 180 ctx, cancel := context.WithTimeout(context.Background(), killTimeout) 181 defer cancel() 182 183 sig, parseErr := parseSignal(runtime.GOOS, signal) 184 if parseErr != nil { 185 return fmt.Errorf("failed to parse signal: %v", parseErr) 186 } 187 188 if err := h.Signal(ctx, sig); err != nil { 189 // Container has already been removed. 190 if strings.Contains(err.Error(), NoSuchContainerError) { 191 h.logger.Debug("attempted to signal nonexistent container") 192 return nil 193 } 194 // Container has already been stopped. 195 if strings.Contains(err.Error(), ContainerNotRunningError) { 196 h.logger.Debug("attempted to signal a not-running container") 197 return nil 198 } 199 200 h.logger.Error("failed to signal container while killing", "error", err) 201 return fmt.Errorf("Failed to signal container %q while killing: %v", h.containerID, err) 202 } 203 204 select { 205 case <-h.waitCh: 206 return nil 207 case <-ctx.Done(): 208 } 209 210 // Stop the container forcefully. 211 err = h.dockerClient.StopContainer(h.containerID, 0) 212 } 213 214 if err != nil { 215 // Container has already been removed. 216 if strings.Contains(err.Error(), NoSuchContainerError) { 217 h.logger.Debug("attempted to stop nonexistent container") 218 return nil 219 } 220 // Container has already been stopped. 221 if strings.Contains(err.Error(), ContainerNotRunningError) { 222 h.logger.Debug("attempted to stop an not-running container") 223 return nil 224 } 225 226 h.logger.Error("failed to stop container", "error", err) 227 return fmt.Errorf("Failed to stop container %s: %s", h.containerID, err) 228 } 229 230 h.logger.Info("stopped container") 231 return nil 232 } 233 234 func (h *taskHandle) shutdownLogger() { 235 if h.dlogger == nil { 236 return 237 } 238 239 if err := h.dlogger.Stop(); err != nil { 240 h.logger.Error("failed to stop docker logger process during StopTask", 241 "error", err, "logger_pid", h.dloggerPluginClient.ReattachConfig().Pid) 242 } 243 h.dloggerPluginClient.Kill() 244 } 245 246 func (h *taskHandle) run() { 247 defer h.shutdownLogger() 248 249 exitCode, werr := h.infinityClient.WaitContainer(h.containerID) 250 if werr != nil { 251 h.logger.Error("failed to wait for container; already terminated") 252 } 253 254 if exitCode != 0 { 255 werr = fmt.Errorf("Docker container exited with non-zero exit code: %d", exitCode) 256 } 257 258 container, ierr := h.dockerClient.InspectContainerWithOptions(docker.InspectContainerOptions{ 259 ID: h.containerID, 260 }) 261 oom := false 262 if ierr != nil { 263 h.logger.Error("failed to inspect container", "error", ierr) 264 } else if container.State.OOMKilled { 265 h.logger.Error("OOM Killed", 266 "container_id", h.containerID, 267 "container_image", h.containerImage, 268 "nomad_job_name", h.task.JobName, 269 "nomad_task_name", h.task.Name, 270 "nomad_alloc_id", h.task.AllocID) 271 272 // Note that with cgroups.v2 the cgroup OOM killer is not 273 // observed by docker container status. But we can't test the 274 // exit code, as 137 is used for any SIGKILL 275 oom = true 276 werr = fmt.Errorf("OOM Killed") 277 } 278 279 // Shutdown stats collection 280 close(h.doneCh) 281 282 // Stop the container just incase the docker daemon's wait returned 283 // incorrectly. 284 if err := h.dockerClient.StopContainer(h.containerID, 0); err != nil { 285 _, noSuchContainer := err.(*docker.NoSuchContainer) 286 _, containerNotRunning := err.(*docker.ContainerNotRunning) 287 if !containerNotRunning && !noSuchContainer { 288 h.logger.Error("error stopping container", "error", err) 289 } 290 } 291 292 // Set the result 293 h.exitResultLock.Lock() 294 h.exitResult = &drivers.ExitResult{ 295 ExitCode: exitCode, 296 Signal: 0, 297 OOMKilled: oom, 298 Err: werr, 299 } 300 h.exitResultLock.Unlock() 301 close(h.waitCh) 302 }