github.com/hernad/nomad@v1.6.112/drivers/docker/handle.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package docker
     5  
     6  import (
     7  	"context"
     8  	"fmt"
     9  	"os"
    10  	"runtime"
    11  	"strings"
    12  	"sync"
    13  	"syscall"
    14  	"time"
    15  
    16  	"github.com/armon/circbuf"
    17  	docker "github.com/fsouza/go-dockerclient"
    18  	"github.com/hernad/consul-template/signals"
    19  	"github.com/hashicorp/go-hclog"
    20  	"github.com/hashicorp/go-plugin"
    21  
    22  	"github.com/hernad/nomad/drivers/docker/docklog"
    23  	"github.com/hernad/nomad/plugins/drivers"
    24  	pstructs "github.com/hernad/nomad/plugins/shared/structs"
    25  )
    26  
// taskHandle holds everything the driver needs to manage one docker
// container: the docker API clients, the docker logger plugin, the task
// configuration, and the channels/result used to report container exit.
type taskHandle struct {
	// dockerClient is useful for normal docker API calls. It should be used
	// for all calls that aren't Wait() or Stop() (and their variations).
	// NOTE(review): the zero-timeout StopContainer calls in Kill and run go
	// through this client rather than infinityClient.
	dockerClient *docker.Client

	// infinityClient is useful for
	// - the Wait docker API call(s) (no limit on container lifetime)
	// - the Stop docker API call(s) (context with task kill_timeout required)
	// Do not use this client for any other docker API calls, instead use the
	// normal dockerClient which includes a default timeout.
	infinityClient *docker.Client

	// Per-container state:
	//   - logger: structured logger used by every method on the handle.
	//   - dlogger / dloggerPluginClient: the docker logger and the go-plugin
	//     client behind it; shutdownLogger stops the former and kills the
	//     latter, and buildState records the client's reattach config.
	//   - task: task configuration; its JobName, Name and AllocID are logged
	//     when the container is OOM killed.
	//   - containerID / containerImage: the container this handle manages and
	//     the image name reported in OOM log entries.
	//   - doneCh: closed by run once the container has exited (the in-code
	//     comment there says this shuts down stats collection).
	//   - waitCh: closed by run after exitResult has been set; Kill selects on
	//     it to detect that the container has exited.
	//   - removeContainerOnExit: not read in this file; presumably consulted
	//     when the task is destroyed — TODO confirm against the driver.
	//   - net: driver network, copied into taskHandleState by buildState.
	logger                hclog.Logger
	dlogger               docklog.DockerLogger
	dloggerPluginClient   *plugin.Client
	task                  *drivers.TaskConfig
	containerID           string
	containerImage        string
	doneCh                chan bool
	waitCh                chan struct{}
	removeContainerOnExit bool
	net                   *drivers.DriverNetwork

	// exitResult is written by run when the container exits and read through
	// ExitResult(); exitResultLock guards both accesses.
	exitResult     *drivers.ExitResult
	exitResultLock sync.Mutex
}
    53  
    54  func (h *taskHandle) ExitResult() *drivers.ExitResult {
    55  	h.exitResultLock.Lock()
    56  	defer h.exitResultLock.Unlock()
    57  	return h.exitResult.Copy()
    58  }
    59  
// taskHandleState is the snapshot of a taskHandle produced by buildState.
// NOTE(review): presumably this is what gets persisted so the driver can
// recover the task later — confirm against the callers of buildState.
type taskHandleState struct {
	// ReattachConfig for the docker logger plugin
	// (left nil by buildState when the handle has no logger plugin client).
	ReattachConfig *pstructs.ReattachConfig

	// ContainerID is the handle's containerID; DriverNetwork is its net field.
	ContainerID   string
	DriverNetwork *drivers.DriverNetwork
}
    67  
    68  func (h *taskHandle) buildState() *taskHandleState {
    69  	s := &taskHandleState{
    70  		ContainerID:   h.containerID,
    71  		DriverNetwork: h.net,
    72  	}
    73  	if h.dloggerPluginClient != nil {
    74  		s.ReattachConfig = pstructs.ReattachConfigFromGoPlugin(h.dloggerPluginClient.ReattachConfig())
    75  	}
    76  	return s
    77  }
    78  
    79  func (h *taskHandle) Exec(ctx context.Context, cmd string, args []string) (*drivers.ExecTaskResult, error) {
    80  	fullCmd := make([]string, len(args)+1)
    81  	fullCmd[0] = cmd
    82  	copy(fullCmd[1:], args)
    83  	createExecOpts := docker.CreateExecOptions{
    84  		AttachStdin:  false,
    85  		AttachStdout: true,
    86  		AttachStderr: true,
    87  		Tty:          false,
    88  		Cmd:          fullCmd,
    89  		Container:    h.containerID,
    90  		Context:      ctx,
    91  	}
    92  	exec, err := h.dockerClient.CreateExec(createExecOpts)
    93  	if err != nil {
    94  		return nil, err
    95  	}
    96  
    97  	execResult := &drivers.ExecTaskResult{ExitResult: &drivers.ExitResult{}}
    98  	stdout, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize))
    99  	stderr, _ := circbuf.NewBuffer(int64(drivers.CheckBufSize))
   100  	startOpts := docker.StartExecOptions{
   101  		Detach:       false,
   102  		Tty:          false,
   103  		OutputStream: stdout,
   104  		ErrorStream:  stderr,
   105  		Context:      ctx,
   106  	}
   107  	if err := h.dockerClient.StartExec(exec.ID, startOpts); err != nil {
   108  		return nil, err
   109  	}
   110  	execResult.Stdout = stdout.Bytes()
   111  	execResult.Stderr = stderr.Bytes()
   112  	res, err := h.dockerClient.InspectExec(exec.ID)
   113  	if err != nil {
   114  		return execResult, err
   115  	}
   116  
   117  	execResult.ExitResult.ExitCode = res.ExitCode
   118  	return execResult, nil
   119  }
   120  
   121  func (h *taskHandle) Signal(ctx context.Context, s os.Signal) error {
   122  	// Convert types
   123  	sysSig, ok := s.(syscall.Signal)
   124  	if !ok {
   125  		return fmt.Errorf("Failed to determine signal number")
   126  	}
   127  
   128  	// TODO When we expose signals we will need a mapping layer that converts
   129  	// MacOS signals to the correct signal number for docker. Or we change the
   130  	// interface to take a signal string and leave it up to driver to map?
   131  
   132  	opts := docker.KillContainerOptions{
   133  		ID:      h.containerID,
   134  		Signal:  docker.Signal(sysSig),
   135  		Context: ctx,
   136  	}
   137  
   138  	// remember Kill just means send a signal; this is not the complex StopContainer case
   139  	return h.dockerClient.KillContainer(opts)
   140  }
   141  
   142  // parseSignal interprets the signal name into an os.Signal. If no name is
   143  // provided, the docker driver defaults to SIGTERM. If the OS is Windows and
   144  // SIGINT is provided, the signal is converted to SIGTERM.
   145  func parseSignal(os, signal string) (os.Signal, error) {
   146  	// Unlike other drivers, docker defaults to SIGTERM, aiming for consistency
   147  	// with the 'docker stop' command.
   148  	// https://docs.docker.com/engine/reference/commandline/stop/#extended-description
   149  	if signal == "" {
   150  		signal = "SIGTERM"
   151  	}
   152  
   153  	// Windows Docker daemon does not support SIGINT, SIGTERM is the semantic equivalent that
   154  	// allows for graceful shutdown before being followed up by a SIGKILL.
   155  	// Supported signals:
   156  	//   https://github.com/moby/moby/blob/0111ee70874a4947d93f64b672f66a2a35071ee2/pkg/signal/signal_windows.go#L17-L26
   157  	if os == "windows" && signal == "SIGINT" {
   158  		signal = "SIGTERM"
   159  	}
   160  
   161  	return signals.Parse(signal)
   162  }
   163  
   164  // Kill is used to terminate the task.
   165  func (h *taskHandle) Kill(killTimeout time.Duration, signal string) error {
   166  	var err error
   167  	// Calling StopContainer lets docker handle the stop signal (specified
   168  	// in the Dockerfile or defaulting to SIGTERM). If kill_signal is specified,
   169  	// Signal is used to kill the container with the desired signal before
   170  	// calling StopContainer
   171  	if signal == "" {
   172  		// give the context timeout some wiggle room beyond the kill timeout
   173  		// docker will use, so we can happy path even in the force kill case
   174  		graciousTimeout := killTimeout + dockerTimeout
   175  		ctx, cancel := context.WithTimeout(context.Background(), graciousTimeout)
   176  		defer cancel()
   177  		apiTimeout := uint(killTimeout.Seconds())
   178  		err = h.infinityClient.StopContainerWithContext(h.containerID, apiTimeout, ctx)
   179  	} else {
   180  		ctx, cancel := context.WithTimeout(context.Background(), killTimeout)
   181  		defer cancel()
   182  
   183  		sig, parseErr := parseSignal(runtime.GOOS, signal)
   184  		if parseErr != nil {
   185  			return fmt.Errorf("failed to parse signal: %v", parseErr)
   186  		}
   187  
   188  		if err := h.Signal(ctx, sig); err != nil {
   189  			// Container has already been removed.
   190  			if strings.Contains(err.Error(), NoSuchContainerError) {
   191  				h.logger.Debug("attempted to signal nonexistent container")
   192  				return nil
   193  			}
   194  			// Container has already been stopped.
   195  			if strings.Contains(err.Error(), ContainerNotRunningError) {
   196  				h.logger.Debug("attempted to signal a not-running container")
   197  				return nil
   198  			}
   199  
   200  			h.logger.Error("failed to signal container while killing", "error", err)
   201  			return fmt.Errorf("Failed to signal container %q while killing: %v", h.containerID, err)
   202  		}
   203  
   204  		select {
   205  		case <-h.waitCh:
   206  			return nil
   207  		case <-ctx.Done():
   208  		}
   209  
   210  		// Stop the container forcefully.
   211  		err = h.dockerClient.StopContainer(h.containerID, 0)
   212  	}
   213  
   214  	if err != nil {
   215  		// Container has already been removed.
   216  		if strings.Contains(err.Error(), NoSuchContainerError) {
   217  			h.logger.Debug("attempted to stop nonexistent container")
   218  			return nil
   219  		}
   220  		// Container has already been stopped.
   221  		if strings.Contains(err.Error(), ContainerNotRunningError) {
   222  			h.logger.Debug("attempted to stop an not-running container")
   223  			return nil
   224  		}
   225  
   226  		h.logger.Error("failed to stop container", "error", err)
   227  		return fmt.Errorf("Failed to stop container %s: %s", h.containerID, err)
   228  	}
   229  
   230  	h.logger.Info("stopped container")
   231  	return nil
   232  }
   233  
   234  func (h *taskHandle) shutdownLogger() {
   235  	if h.dlogger == nil {
   236  		return
   237  	}
   238  
   239  	if err := h.dlogger.Stop(); err != nil {
   240  		h.logger.Error("failed to stop docker logger process during StopTask",
   241  			"error", err, "logger_pid", h.dloggerPluginClient.ReattachConfig().Pid)
   242  	}
   243  	h.dloggerPluginClient.Kill()
   244  }
   245  
   246  func (h *taskHandle) run() {
   247  	defer h.shutdownLogger()
   248  
   249  	exitCode, werr := h.infinityClient.WaitContainer(h.containerID)
   250  	if werr != nil {
   251  		h.logger.Error("failed to wait for container; already terminated")
   252  	}
   253  
   254  	if exitCode != 0 {
   255  		werr = fmt.Errorf("Docker container exited with non-zero exit code: %d", exitCode)
   256  	}
   257  
   258  	container, ierr := h.dockerClient.InspectContainerWithOptions(docker.InspectContainerOptions{
   259  		ID: h.containerID,
   260  	})
   261  	oom := false
   262  	if ierr != nil {
   263  		h.logger.Error("failed to inspect container", "error", ierr)
   264  	} else if container.State.OOMKilled {
   265  		h.logger.Error("OOM Killed",
   266  			"container_id", h.containerID,
   267  			"container_image", h.containerImage,
   268  			"nomad_job_name", h.task.JobName,
   269  			"nomad_task_name", h.task.Name,
   270  			"nomad_alloc_id", h.task.AllocID)
   271  
   272  		// Note that with cgroups.v2 the cgroup OOM killer is not
   273  		// observed by docker container status. But we can't test the
   274  		// exit code, as 137 is used for any SIGKILL
   275  		oom = true
   276  		werr = fmt.Errorf("OOM Killed")
   277  	}
   278  
   279  	// Shutdown stats collection
   280  	close(h.doneCh)
   281  
   282  	// Stop the container just incase the docker daemon's wait returned
   283  	// incorrectly.
   284  	if err := h.dockerClient.StopContainer(h.containerID, 0); err != nil {
   285  		_, noSuchContainer := err.(*docker.NoSuchContainer)
   286  		_, containerNotRunning := err.(*docker.ContainerNotRunning)
   287  		if !containerNotRunning && !noSuchContainer {
   288  			h.logger.Error("error stopping container", "error", err)
   289  		}
   290  	}
   291  
   292  	// Set the result
   293  	h.exitResultLock.Lock()
   294  	h.exitResult = &drivers.ExitResult{
   295  		ExitCode:  exitCode,
   296  		Signal:    0,
   297  		OOMKilled: oom,
   298  		Err:       werr,
   299  	}
   300  	h.exitResultLock.Unlock()
   301  	close(h.waitCh)
   302  }