github.com/docker/engine@v22.0.0-20211208180946-d456264580cf+incompatible/libcontainerd/supervisor/remote_daemon.go (about)

     1  package supervisor // import "github.com/docker/docker/libcontainerd/supervisor"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io"
     7  	"os"
     8  	"os/exec"
     9  	"path/filepath"
    10  	"strconv"
    11  	"strings"
    12  	"sync"
    13  	"time"
    14  
    15  	"github.com/containerd/containerd"
    16  	"github.com/containerd/containerd/services/server/config"
    17  	"github.com/docker/docker/pkg/system"
    18  	"github.com/pelletier/go-toml"
    19  	"github.com/pkg/errors"
    20  	"github.com/sirupsen/logrus"
    21  )
    22  
const (
	// maxConnectionRetryCount is the number of consecutive failed health
	// checks after which monitorDaemon stops retrying, provided the daemon
	// process is also no longer alive.
	maxConnectionRetryCount = 3
	// healthCheckTimeout bounds a single IsServing health-check call.
	healthCheckTimeout      = 3 * time.Second
	// shutdownTimeout is not referenced in this part of the file; presumably
	// it bounds how long a daemon stop may take (TODO confirm against the
	// platform-specific stop code).
	shutdownTimeout         = 15 * time.Second
	// startupTimeout is how long Start waits for the first start result
	// before giving up.
	startupTimeout          = 15 * time.Second
	// configFile is the name of the generated containerd configuration file,
	// created inside the state directory.
	configFile              = "containerd.toml"
	// binaryName is the executable that is launched; it is resolved through
	// exec.Command's usual lookup.
	binaryName              = "containerd"
	// pidFile is the name of the file, inside the state directory, that
	// records the pid of the spawned containerd process.
	pidFile                 = "containerd.pid"
)
    32  
// remote holds the state needed to supervise a containerd process: the
// configuration that is written out for it, the pid being tracked and the
// channels used to signal start, exit and shutdown of the monitor loop.
//
// The whole struct is passed to the TOML encoder when the containerd config
// file is generated.
type remote struct {
	// NOTE(review): the mutex is not locked anywhere in the visible part of
	// this file; confirm which fields it is meant to guard.
	sync.RWMutex
	// Config is the containerd server configuration (embedded so its fields,
	// e.g. GRPC and Debug, are reachable directly on remote).
	config.Config
	// Plugins overrides `Plugins map[string]toml.Tree` in config.Config.
	Plugins map[string]interface{} `toml:"plugins"`

	// daemonPid is the pid of the containerd process currently tracked, or -1
	// when no process is tracked.
	daemonPid int
	// logger is tagged with module=libcontainerd by Start.
	logger    *logrus.Entry

	// daemonWaitCh is created each time startContainerd spawns a process and
	// is closed once that child has been reaped. It stays nil until a process
	// has been spawned by this supervisor.
	daemonWaitCh  chan struct{}
	// daemonStartCh (buffered, size 1) carries the error of a failed first
	// start, or is closed after the first successful health check.
	daemonStartCh chan error
	// daemonStopCh is closed when monitorDaemon returns.
	daemonStopCh  chan struct{}

	// rootDir and stateDir are the directories passed to Start; stateDir is
	// where the config file and the pid file are kept.
	rootDir  string
	stateDir string
}
    49  
// Daemon represents a running containerd daemon
type Daemon interface {
	// WaitTimeout waits for the daemon supervision to stop, returning an
	// error if that has not happened once the given duration has elapsed.
	WaitTimeout(time.Duration) error
	// Address returns the GRPC address the daemon is configured to use.
	Address() string
}
    55  
// DaemonOpt allows configuring parameters of the containerd daemon.
// Options are applied by Start, in order, before defaults are filled in; a
// non-nil error from any option aborts Start.
type DaemonOpt func(c *remote) error
    58  
    59  // Start starts a containerd daemon and monitors it
    60  func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Daemon, error) {
    61  	r := &remote{
    62  		rootDir:  rootDir,
    63  		stateDir: stateDir,
    64  		Config: config.Config{
    65  			Root:  filepath.Join(rootDir, "daemon"),
    66  			State: filepath.Join(stateDir, "daemon"),
    67  		},
    68  		Plugins:       make(map[string]interface{}),
    69  		daemonPid:     -1,
    70  		logger:        logrus.WithField("module", "libcontainerd"),
    71  		daemonStartCh: make(chan error, 1),
    72  		daemonStopCh:  make(chan struct{}),
    73  	}
    74  
    75  	for _, opt := range opts {
    76  		if err := opt(r); err != nil {
    77  			return nil, err
    78  		}
    79  	}
    80  	r.setDefaults()
    81  
    82  	if err := system.MkdirAll(stateDir, 0700); err != nil {
    83  		return nil, err
    84  	}
    85  
    86  	go r.monitorDaemon(ctx)
    87  
    88  	timeout := time.NewTimer(startupTimeout)
    89  	defer timeout.Stop()
    90  
    91  	select {
    92  	case <-timeout.C:
    93  		return nil, errors.New("timeout waiting for containerd to start")
    94  	case err := <-r.daemonStartCh:
    95  		if err != nil {
    96  			return nil, err
    97  		}
    98  	}
    99  
   100  	return r, nil
   101  }
   102  func (r *remote) WaitTimeout(d time.Duration) error {
   103  	timeout := time.NewTimer(d)
   104  	defer timeout.Stop()
   105  
   106  	select {
   107  	case <-timeout.C:
   108  		return errors.New("timeout waiting for containerd to stop")
   109  	case <-r.daemonStopCh:
   110  	}
   111  
   112  	return nil
   113  }
   114  
   115  func (r *remote) Address() string {
   116  	return r.GRPC.Address
   117  }
   118  func (r *remote) getContainerdPid() (int, error) {
   119  	pidFile := filepath.Join(r.stateDir, pidFile)
   120  	f, err := os.OpenFile(pidFile, os.O_RDWR, 0600)
   121  	if err != nil {
   122  		if os.IsNotExist(err) {
   123  			return -1, nil
   124  		}
   125  		return -1, err
   126  	}
   127  	defer f.Close()
   128  
   129  	b := make([]byte, 8)
   130  	n, err := f.Read(b)
   131  	if err != nil && err != io.EOF {
   132  		return -1, err
   133  	}
   134  
   135  	if n > 0 {
   136  		pid, err := strconv.ParseUint(string(b[:n]), 10, 64)
   137  		if err != nil {
   138  			return -1, err
   139  		}
   140  		if system.IsProcessAlive(int(pid)) {
   141  			return int(pid), nil
   142  		}
   143  	}
   144  
   145  	return -1, nil
   146  }
   147  
   148  func (r *remote) getContainerdConfig() (string, error) {
   149  	path := filepath.Join(r.stateDir, configFile)
   150  	f, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600)
   151  	if err != nil {
   152  		return "", errors.Wrapf(err, "failed to open containerd config file at %s", path)
   153  	}
   154  	defer f.Close()
   155  
   156  	if err := toml.NewEncoder(f).Encode(r); err != nil {
   157  		return "", errors.Wrapf(err, "failed to write containerd config file (%s)", path)
   158  	}
   159  	return path, nil
   160  }
   161  
   162  func (r *remote) startContainerd() error {
   163  	pid, err := r.getContainerdPid()
   164  	if err != nil {
   165  		return err
   166  	}
   167  
   168  	if pid != -1 {
   169  		r.daemonPid = pid
   170  		logrus.WithField("pid", pid).
   171  			Infof("libcontainerd: %s is still running", binaryName)
   172  		return nil
   173  	}
   174  
   175  	configFile, err := r.getContainerdConfig()
   176  	if err != nil {
   177  		return err
   178  	}
   179  
   180  	args := []string{"--config", configFile}
   181  
   182  	if r.Debug.Level != "" {
   183  		args = append(args, "--log-level", r.Debug.Level)
   184  	}
   185  
   186  	cmd := exec.Command(binaryName, args...)
   187  	// redirect containerd logs to docker logs
   188  	cmd.Stdout = os.Stdout
   189  	cmd.Stderr = os.Stderr
   190  	cmd.SysProcAttr = containerdSysProcAttr()
   191  	// clear the NOTIFY_SOCKET from the env when starting containerd
   192  	cmd.Env = nil
   193  	for _, e := range os.Environ() {
   194  		if !strings.HasPrefix(e, "NOTIFY_SOCKET") {
   195  			cmd.Env = append(cmd.Env, e)
   196  		}
   197  	}
   198  	if err := cmd.Start(); err != nil {
   199  		return err
   200  	}
   201  
   202  	r.daemonWaitCh = make(chan struct{})
   203  	go func() {
   204  		// Reap our child when needed
   205  		if err := cmd.Wait(); err != nil {
   206  			r.logger.WithError(err).Errorf("containerd did not exit successfully")
   207  		}
   208  		close(r.daemonWaitCh)
   209  	}()
   210  
   211  	r.daemonPid = cmd.Process.Pid
   212  
   213  	err = os.WriteFile(filepath.Join(r.stateDir, pidFile), []byte(fmt.Sprintf("%d", r.daemonPid)), 0660)
   214  	if err != nil {
   215  		system.KillProcess(r.daemonPid)
   216  		return errors.Wrap(err, "libcontainerd: failed to save daemon pid to disk")
   217  	}
   218  
   219  	logrus.WithField("pid", r.daemonPid).
   220  		Infof("libcontainerd: started new %s process", binaryName)
   221  
   222  	return nil
   223  }
   224  
// monitorDaemon is the supervision loop started by Start in its own
// goroutine. It runs until ctx is cancelled or the very first start attempt
// fails.
//
// Each iteration waits for the current delay, then:
//   - if no pid is tracked, waits for any previously spawned child to be
//     reaped, removes the GRPC address path, starts (or adopts) containerd
//     and creates a monitoring client;
//   - if a client exists, performs a health check bounded by
//     healthCheckTimeout; on success it signals startup (once) and blocks
//     until the child exits or ctx is cancelled; on failure it retries with
//     a growing delay while the failure count is below
//     maxConnectionRetryCount or the process is still alive;
//   - otherwise kills the process if it is still alive and resets the state
//     so that the next iteration starts containerd again.
//
// The result of the first start is reported through r.daemonStartCh: a start
// error is sent on it, and it is closed after the first successful health
// check. On return the daemon is stopped if a pid is tracked, the pid file is
// removed, platform cleanup runs and r.daemonStopCh is closed.
func (r *remote) monitorDaemon(ctx context.Context) {
	var (
		transientFailureCount = 0
		client                *containerd.Client
		err                   error
		delay                 time.Duration
		timer                 = time.NewTimer(0)
		started               bool
	)

	defer func() {
		if r.daemonPid != -1 {
			r.stopDaemon()
		}

		// cleanup some files
		os.Remove(filepath.Join(r.stateDir, pidFile))

		r.platformCleanup()

		// Unblocks WaitTimeout callers.
		close(r.daemonStopCh)
		timer.Stop()
	}()

	// ensure no races on sending to timer.C even though there is a 0 duration.
	if !timer.Stop() {
		<-timer.C
	}

	for {
		// The timer was stopped/drained above or has fired in the previous
		// iteration, so it is safe to re-arm it here.
		timer.Reset(delay)

		select {
		case <-ctx.Done():
			r.logger.Info("stopping healthcheck following graceful shutdown")
			if client != nil {
				client.Close()
			}
			return
		case <-timer.C:
		}

		// No process is tracked: (re)start containerd.
		if r.daemonPid == -1 {
			if r.daemonWaitCh != nil {
				// Wait until the previously spawned child has been reaped
				// before starting a new one.
				select {
				case <-ctx.Done():
					r.logger.Info("stopping containerd startup following graceful shutdown")
					return
				case <-r.daemonWaitCh:
				}
			}

			// Remove whatever is left at the GRPC address path before starting.
			os.RemoveAll(r.GRPC.Address)
			if err := r.startContainerd(); err != nil {
				if !started {
					// First start failed: report it to Start and give up.
					// daemonStartCh is buffered, so this send does not block.
					r.daemonStartCh <- err
					return
				}
				r.logger.WithError(err).Error("failed restarting containerd")
				delay = 50 * time.Millisecond
				continue
			}

			client, err = containerd.New(r.GRPC.Address, containerd.WithTimeout(60*time.Second))
			if err != nil {
				// daemonPid is set at this point, so the next iteration skips
				// the start branch above.
				r.logger.WithError(err).Error("failed connecting to containerd")
				delay = 100 * time.Millisecond
				continue
			}
			logrus.WithField("address", r.GRPC.Address).Debug("Created containerd monitoring client")
		}

		if client != nil {
			tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout)
			_, err := client.IsServing(tctx)
			cancel()
			if err == nil {
				if !started {
					// Closing the channel makes Start receive a nil error.
					close(r.daemonStartCh)
					started = true
				}

				transientFailureCount = 0

				// Block until the spawned child exits or ctx is cancelled.
				// NOTE(review): daemonWaitCh is only assigned when
				// startContainerd spawns a process; when an already-running
				// containerd was adopted from the pid file it appears to still
				// be nil, in which case only ctx cancellation unblocks this
				// select. Confirm that this is intended.
				select {
				case <-r.daemonWaitCh:
				case <-ctx.Done():
				}

				// Set a small delay in case there is a recurring failure (or bug in this code)
				// to ensure we don't end up in a super tight loop.
				delay = 500 * time.Millisecond
				continue
			}

			r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding")

			transientFailureCount++
			// Keep retrying while the failure budget is not exhausted or the
			// process is still alive; the delay grows with each failure.
			if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) {
				delay = time.Duration(transientFailureCount) * 200 * time.Millisecond
				continue
			}
			client.Close()
			client = nil
		}

		if system.IsProcessAlive(r.daemonPid) {
			r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd")
			r.killDaemon()
		}

		// Forget the process so the next iteration goes through the start
		// branch again, immediately.
		r.daemonPid = -1
		delay = 0
		transientFailureCount = 0
	}
}