gopkg.in/docker/docker.v20@v20.10.27/libcontainerd/supervisor/remote_daemon.go (about)

     1  package supervisor // import "github.com/docker/docker/libcontainerd/supervisor"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io"
     7  	"os"
     8  	"os/exec"
     9  	"path/filepath"
    10  	"strconv"
    11  	"strings"
    12  	"sync"
    13  	"time"
    14  
    15  	"github.com/BurntSushi/toml"
    16  	"github.com/containerd/containerd"
    17  	"github.com/containerd/containerd/services/server/config"
    18  	"github.com/docker/docker/pkg/system"
    19  	"github.com/pkg/errors"
    20  	"github.com/sirupsen/logrus"
    21  )
    22  
    23  const (
    24  	maxConnectionRetryCount = 3
    25  	healthCheckTimeout      = 3 * time.Second
    26  	shutdownTimeout         = 15 * time.Second
    27  	startupTimeout          = 15 * time.Second
    28  	configFile              = "containerd.toml"
    29  	binaryName              = "containerd"
    30  	pidFile                 = "containerd.pid"
    31  )
    32  
// pluginConfigs wraps the per-plugin settings so that they are TOML-encoded
// under a top-level "plugins" key when the containerd configuration file is
// generated.
type pluginConfigs struct {
	// Plugins maps a plugin identifier to its configuration value.
	Plugins map[string]interface{} `toml:"plugins"`
}
    36  
// remote holds the state needed to start, monitor and stop a containerd
// daemon. It is the Daemon implementation returned by Start.
type remote struct {
	// NOTE(review): the embedded mutex is not locked anywhere in this file;
	// confirm whether other files of the package rely on it.
	sync.RWMutex
	// Config is the containerd configuration that is TOML-encoded into the
	// generated config file; its GRPC.Address is what Address returns.
	config.Config

	// daemonPid is the PID of the supervised containerd process, or -1 when
	// no process is currently tracked.
	daemonPid int
	// logger is the entry used for this supervisor's log messages.
	logger *logrus.Entry

	// daemonWaitCh is closed once a containerd process spawned by
	// startContainerd has been reaped. It stays nil until a process is spawned.
	daemonWaitCh chan struct{}
	// daemonStartCh carries the startup result to Start: an error on failure,
	// or a close after the first successful health check.
	daemonStartCh chan error
	// daemonStopCh is closed when monitorDaemon returns; WaitTimeout waits on it.
	daemonStopCh chan struct{}

	// rootDir and stateDir are the directories passed to Start. The PID and
	// config files are stored under stateDir.
	rootDir  string
	stateDir string
	// pluginConfs holds the plugin settings appended to the generated config.
	pluginConfs pluginConfigs
}
    52  
// Daemon represents a running containerd daemon.
type Daemon interface {
	// WaitTimeout waits for the daemon supervision to finish. It returns an
	// error if that does not happen within the given duration.
	WaitTimeout(time.Duration) error
	// Address returns the address of the daemon's GRPC endpoint.
	Address() string
}
    58  
// DaemonOpt is a functional option that configures the supervised containerd
// daemon. Start applies the options in order, before defaults are filled in,
// and aborts with the first error an option returns.
type DaemonOpt func(c *remote) error
    61  
    62  // Start starts a containerd daemon and monitors it
    63  func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Daemon, error) {
    64  	r := &remote{
    65  		rootDir:  rootDir,
    66  		stateDir: stateDir,
    67  		Config: config.Config{
    68  			Root:  filepath.Join(rootDir, "daemon"),
    69  			State: filepath.Join(stateDir, "daemon"),
    70  		},
    71  		pluginConfs:   pluginConfigs{make(map[string]interface{})},
    72  		daemonPid:     -1,
    73  		logger:        logrus.WithField("module", "libcontainerd"),
    74  		daemonStartCh: make(chan error, 1),
    75  		daemonStopCh:  make(chan struct{}),
    76  	}
    77  
    78  	for _, opt := range opts {
    79  		if err := opt(r); err != nil {
    80  			return nil, err
    81  		}
    82  	}
    83  	r.setDefaults()
    84  
    85  	if err := system.MkdirAll(stateDir, 0700); err != nil {
    86  		return nil, err
    87  	}
    88  
    89  	go r.monitorDaemon(ctx)
    90  
    91  	timeout := time.NewTimer(startupTimeout)
    92  	defer timeout.Stop()
    93  
    94  	select {
    95  	case <-timeout.C:
    96  		return nil, errors.New("timeout waiting for containerd to start")
    97  	case err := <-r.daemonStartCh:
    98  		if err != nil {
    99  			return nil, err
   100  		}
   101  	}
   102  
   103  	return r, nil
   104  }
   105  func (r *remote) WaitTimeout(d time.Duration) error {
   106  	timeout := time.NewTimer(d)
   107  	defer timeout.Stop()
   108  
   109  	select {
   110  	case <-timeout.C:
   111  		return errors.New("timeout waiting for containerd to stop")
   112  	case <-r.daemonStopCh:
   113  	}
   114  
   115  	return nil
   116  }
   117  
   118  func (r *remote) Address() string {
   119  	return r.GRPC.Address
   120  }
   121  func (r *remote) getContainerdPid() (int, error) {
   122  	pidFile := filepath.Join(r.stateDir, pidFile)
   123  	f, err := os.OpenFile(pidFile, os.O_RDWR, 0600)
   124  	if err != nil {
   125  		if os.IsNotExist(err) {
   126  			return -1, nil
   127  		}
   128  		return -1, err
   129  	}
   130  	defer f.Close()
   131  
   132  	b := make([]byte, 8)
   133  	n, err := f.Read(b)
   134  	if err != nil && err != io.EOF {
   135  		return -1, err
   136  	}
   137  
   138  	if n > 0 {
   139  		pid, err := strconv.ParseUint(string(b[:n]), 10, 64)
   140  		if err != nil {
   141  			return -1, err
   142  		}
   143  		if system.IsProcessAlive(int(pid)) {
   144  			return int(pid), nil
   145  		}
   146  	}
   147  
   148  	return -1, nil
   149  }
   150  
   151  func (r *remote) getContainerdConfig() (string, error) {
   152  	path := filepath.Join(r.stateDir, configFile)
   153  	f, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600)
   154  	if err != nil {
   155  		return "", errors.Wrapf(err, "failed to open containerd config file at %s", path)
   156  	}
   157  	defer f.Close()
   158  
   159  	enc := toml.NewEncoder(f)
   160  	if err = enc.Encode(r.Config); err != nil {
   161  		return "", errors.Wrapf(err, "failed to encode general config")
   162  	}
   163  	if err = enc.Encode(r.pluginConfs); err != nil {
   164  		return "", errors.Wrapf(err, "failed to encode plugin configs")
   165  	}
   166  
   167  	return path, nil
   168  }
   169  
   170  func (r *remote) startContainerd() error {
   171  	pid, err := r.getContainerdPid()
   172  	if err != nil {
   173  		return err
   174  	}
   175  
   176  	if pid != -1 {
   177  		r.daemonPid = pid
   178  		logrus.WithField("pid", pid).
   179  			Infof("libcontainerd: %s is still running", binaryName)
   180  		return nil
   181  	}
   182  
   183  	configFile, err := r.getContainerdConfig()
   184  	if err != nil {
   185  		return err
   186  	}
   187  
   188  	args := []string{"--config", configFile}
   189  
   190  	if r.Debug.Level != "" {
   191  		args = append(args, "--log-level", r.Debug.Level)
   192  	}
   193  
   194  	cmd := exec.Command(binaryName, args...)
   195  	// redirect containerd logs to docker logs
   196  	cmd.Stdout = os.Stdout
   197  	cmd.Stderr = os.Stderr
   198  	cmd.SysProcAttr = containerdSysProcAttr()
   199  	// clear the NOTIFY_SOCKET from the env when starting containerd
   200  	cmd.Env = nil
   201  	for _, e := range os.Environ() {
   202  		if !strings.HasPrefix(e, "NOTIFY_SOCKET") {
   203  			cmd.Env = append(cmd.Env, e)
   204  		}
   205  	}
   206  	if err := cmd.Start(); err != nil {
   207  		return err
   208  	}
   209  
   210  	r.daemonWaitCh = make(chan struct{})
   211  	go func() {
   212  		// Reap our child when needed
   213  		if err := cmd.Wait(); err != nil {
   214  			r.logger.WithError(err).Errorf("containerd did not exit successfully")
   215  		}
   216  		close(r.daemonWaitCh)
   217  	}()
   218  
   219  	r.daemonPid = cmd.Process.Pid
   220  
   221  	err = os.WriteFile(filepath.Join(r.stateDir, pidFile), []byte(fmt.Sprintf("%d", r.daemonPid)), 0660)
   222  	if err != nil {
   223  		system.KillProcess(r.daemonPid)
   224  		return errors.Wrap(err, "libcontainerd: failed to save daemon pid to disk")
   225  	}
   226  
   227  	logrus.WithField("pid", r.daemonPid).
   228  		Infof("libcontainerd: started new %s process", binaryName)
   229  
   230  	return nil
   231  }
   232  
// monitorDaemon is the supervision loop started by Start in its own
// goroutine. It starts containerd when no daemon is tracked, health-checks it
// through a containerd client, and kills/restarts it when it stops responding
// and its process is gone. The loop runs until ctx is cancelled, or until the
// very first start attempt fails.
//
// The startup result is reported through r.daemonStartCh: an error is sent if
// the first start fails, and the channel is closed after the first successful
// health check. On return, the deferred cleanup stops the daemon (if one is
// tracked), removes the PID file, runs the platform cleanup and closes
// r.daemonStopCh.
func (r *remote) monitorDaemon(ctx context.Context) {
	var (
		transientFailureCount = 0
		client                *containerd.Client
		err                   error
		delay                 time.Duration
		timer                 = time.NewTimer(0)
		started               bool
	)

	defer func() {
		if r.daemonPid != -1 {
			r.stopDaemon()
		}

		// cleanup some files
		os.Remove(filepath.Join(r.stateDir, pidFile))

		r.platformCleanup()

		// Unblocks WaitTimeout callers.
		close(r.daemonStopCh)
		timer.Stop()
	}()

	// ensure no races on sending to timer.C even though there is a 0 duration.
	if !timer.Stop() {
		<-timer.C
	}

	for {
		// Wait for the delay chosen by the previous iteration (zero on the
		// first pass), unless the context is cancelled first.
		timer.Reset(delay)

		select {
		case <-ctx.Done():
			r.logger.Info("stopping healthcheck following graceful shutdown")
			if client != nil {
				client.Close()
			}
			return
		case <-timer.C:
		}

		// No daemon is tracked: (re)start one.
		if r.daemonPid == -1 {
			// If a process was spawned earlier, wait until it has been reaped
			// before starting a replacement.
			if r.daemonWaitCh != nil {
				select {
				case <-ctx.Done():
					r.logger.Info("stopping containerd startup following graceful shutdown")
					return
				case <-r.daemonWaitCh:
				}
			}

			// Remove whatever is left at the GRPC address path before starting.
			os.RemoveAll(r.GRPC.Address)
			// Note: this err shadows the one declared at the top of the function.
			if err := r.startContainerd(); err != nil {
				if !started {
					// The first start failed: hand the error to Start and give up.
					r.daemonStartCh <- err
					return
				}
				r.logger.WithError(err).Error("failed restarting containerd")
				delay = 50 * time.Millisecond
				continue
			}

			client, err = containerd.New(r.GRPC.Address, containerd.WithTimeout(60*time.Second))
			if err != nil {
				r.logger.WithError(err).Error("failed connecting to containerd")
				// On the next pass daemonPid is already set, so this start
				// block is skipped.
				delay = 100 * time.Millisecond
				continue
			}
			logrus.WithField("address", r.GRPC.Address).Debug("Created containerd monitoring client")
		}

		if client != nil {
			// Health check, bounded by healthCheckTimeout. This err also
			// shadows the outer variable.
			tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout)
			_, err := client.IsServing(tctx)
			cancel()
			if err == nil {
				if !started {
					// First successful check: signal Start that the daemon is up.
					close(r.daemonStartCh)
					started = true
				}

				transientFailureCount = 0

				// Block until the spawned process exits or the context is
				// cancelled. If daemonWaitCh is nil (no process was spawned by
				// this supervisor), only the context can wake this select.
				select {
				case <-r.daemonWaitCh:
				case <-ctx.Done():
				}

				// Set a small delay in case there is a recurring failure (or bug in this code)
				// to ensure we don't end up in a super tight loop.
				delay = 500 * time.Millisecond
				continue
			}

			r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding")

			// Keep retrying with a growing delay while the retry budget is
			// not exhausted or the process is still alive.
			transientFailureCount++
			if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) {
				delay = time.Duration(transientFailureCount) * 200 * time.Millisecond
				continue
			}
			client.Close()
			client = nil
		}

		// Reached when there is no usable client: kill the process if it is
		// still alive.
		if system.IsProcessAlive(r.daemonPid) {
			r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd")
			r.killDaemon()
		}

		// Reset the state so the next iteration starts a fresh daemon immediately.
		r.daemonPid = -1
		delay = 0
		transientFailureCount = 0
	}
}