github.com/adityamillind98/moby@v23.0.0-rc.4+incompatible/libcontainerd/supervisor/remote_daemon.go (about)

     1  package supervisor // import "github.com/docker/docker/libcontainerd/supervisor"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io"
     7  	"os"
     8  	"os/exec"
     9  	"path/filepath"
    10  	"strconv"
    11  	"strings"
    12  	"sync"
    13  	"time"
    14  
    15  	"github.com/containerd/containerd"
    16  	"github.com/containerd/containerd/services/server/config"
    17  	"github.com/docker/docker/pkg/system"
    18  	"github.com/pelletier/go-toml"
    19  	"github.com/pkg/errors"
    20  	"github.com/sirupsen/logrus"
    21  )
    22  
const (
	// maxConnectionRetryCount is the number of consecutive failed health
	// checks monitorDaemon tolerates before it checks whether the containerd
	// process is still alive and, if it is not, restarts it.
	maxConnectionRetryCount = 3
	// healthCheckTimeout bounds each IsServing health check issued by
	// monitorDaemon.
	healthCheckTimeout      = 3 * time.Second
	// shutdownTimeout is not referenced in this part of the file.
	// NOTE(review): presumably the grace period used when stopping
	// containerd — confirm against the platform-specific stopDaemon.
	shutdownTimeout         = 15 * time.Second
	// startupTimeout is how long Start waits for the first successful health
	// check (or a startup error) before giving up.
	startupTimeout          = 15 * time.Second
	// configFile is the name of the generated containerd configuration file,
	// created inside the state directory.
	configFile              = "containerd.toml"
	// binaryName is the executable started as the containerd daemon; it is
	// passed to exec.Command as-is.
	binaryName              = "containerd"
	// pidFile is the name of the file, inside the state directory, that
	// records the PID of the containerd process.
	pidFile                 = "containerd.pid"
)
    32  
// remote holds the configuration and runtime state of a supervised
// containerd daemon. It is the Daemon implementation returned by Start and
// is also the value that gets TOML-encoded into the containerd config file.
type remote struct {
	// NOTE(review): no locking is visible in this part of the file; confirm
	// what this mutex is meant to guard.
	sync.RWMutex
	// Config is the containerd server configuration written to the config
	// file; its GRPC.Address is what Address reports.
	config.Config
	// Plugins overrides `Plugins map[string]toml.Tree` in config.Config.
	Plugins map[string]interface{} `toml:"plugins"`

	// daemonPid is the PID of the containerd process being supervised, or -1
	// when no process is currently tracked.
	daemonPid int
	// logger is the logrus entry tagged with module=libcontainerd.
	logger    *logrus.Entry

	// daemonWaitCh is closed once a containerd child process started by
	// startContainerd has exited. It is nil until a child has been started
	// (for example when an already-running daemon was adopted).
	daemonWaitCh  chan struct{}
	// daemonStartCh reports the outcome of the initial startup to Start: it
	// receives the startup error, or is closed after the first successful
	// health check.
	daemonStartCh chan error
	// daemonStopCh is closed when monitorDaemon returns.
	daemonStopCh  chan struct{}

	// rootDir is the root directory passed to Start; Config.Root is derived
	// from it.
	rootDir  string
	// stateDir is the state directory passed to Start; the config file and
	// pid file live directly inside it.
	stateDir string
}
    49  
// Daemon represents a running containerd daemon
type Daemon interface {
	// WaitTimeout blocks until the daemon's supervision has stopped or the
	// given duration has elapsed; in the latter case it returns an error.
	WaitTimeout(time.Duration) error
	// Address returns the GRPC address the daemon is configured to listen on.
	Address() string
}
    55  
// DaemonOpt is an option that customizes the daemon's parameters. Start
// applies each option, in order, before defaults are filled in and the daemon
// is launched; an option returning a non-nil error aborts Start.
type DaemonOpt func(c *remote) error
    58  
    59  // Start starts a containerd daemon and monitors it
    60  func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Daemon, error) {
    61  	r := &remote{
    62  		rootDir:  rootDir,
    63  		stateDir: stateDir,
    64  		Config: config.Config{
    65  			Version: 2,
    66  			Root:    filepath.Join(rootDir, "daemon"),
    67  			State:   filepath.Join(stateDir, "daemon"),
    68  		},
    69  		Plugins:       make(map[string]interface{}),
    70  		daemonPid:     -1,
    71  		logger:        logrus.WithField("module", "libcontainerd"),
    72  		daemonStartCh: make(chan error, 1),
    73  		daemonStopCh:  make(chan struct{}),
    74  	}
    75  
    76  	for _, opt := range opts {
    77  		if err := opt(r); err != nil {
    78  			return nil, err
    79  		}
    80  	}
    81  	r.setDefaults()
    82  
    83  	if err := system.MkdirAll(stateDir, 0700); err != nil {
    84  		return nil, err
    85  	}
    86  
    87  	go r.monitorDaemon(ctx)
    88  
    89  	timeout := time.NewTimer(startupTimeout)
    90  	defer timeout.Stop()
    91  
    92  	select {
    93  	case <-timeout.C:
    94  		return nil, errors.New("timeout waiting for containerd to start")
    95  	case err := <-r.daemonStartCh:
    96  		if err != nil {
    97  			return nil, err
    98  		}
    99  	}
   100  
   101  	return r, nil
   102  }
   103  func (r *remote) WaitTimeout(d time.Duration) error {
   104  	timeout := time.NewTimer(d)
   105  	defer timeout.Stop()
   106  
   107  	select {
   108  	case <-timeout.C:
   109  		return errors.New("timeout waiting for containerd to stop")
   110  	case <-r.daemonStopCh:
   111  	}
   112  
   113  	return nil
   114  }
   115  
   116  func (r *remote) Address() string {
   117  	return r.GRPC.Address
   118  }
   119  func (r *remote) getContainerdPid() (int, error) {
   120  	pidFile := filepath.Join(r.stateDir, pidFile)
   121  	f, err := os.OpenFile(pidFile, os.O_RDWR, 0600)
   122  	if err != nil {
   123  		if os.IsNotExist(err) {
   124  			return -1, nil
   125  		}
   126  		return -1, err
   127  	}
   128  	defer f.Close()
   129  
   130  	b := make([]byte, 8)
   131  	n, err := f.Read(b)
   132  	if err != nil && err != io.EOF {
   133  		return -1, err
   134  	}
   135  
   136  	if n > 0 {
   137  		pid, err := strconv.ParseUint(string(b[:n]), 10, 64)
   138  		if err != nil {
   139  			return -1, err
   140  		}
   141  		if system.IsProcessAlive(int(pid)) {
   142  			return int(pid), nil
   143  		}
   144  	}
   145  
   146  	return -1, nil
   147  }
   148  
   149  func (r *remote) getContainerdConfig() (string, error) {
   150  	path := filepath.Join(r.stateDir, configFile)
   151  	f, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600)
   152  	if err != nil {
   153  		return "", errors.Wrapf(err, "failed to open containerd config file at %s", path)
   154  	}
   155  	defer f.Close()
   156  
   157  	if err := toml.NewEncoder(f).Encode(r); err != nil {
   158  		return "", errors.Wrapf(err, "failed to write containerd config file (%s)", path)
   159  	}
   160  	return path, nil
   161  }
   162  
   163  func (r *remote) startContainerd() error {
   164  	pid, err := r.getContainerdPid()
   165  	if err != nil {
   166  		return err
   167  	}
   168  
   169  	if pid != -1 {
   170  		r.daemonPid = pid
   171  		logrus.WithField("pid", pid).
   172  			Infof("libcontainerd: %s is still running", binaryName)
   173  		return nil
   174  	}
   175  
   176  	configFile, err := r.getContainerdConfig()
   177  	if err != nil {
   178  		return err
   179  	}
   180  
   181  	args := []string{"--config", configFile}
   182  
   183  	if r.Debug.Level != "" {
   184  		args = append(args, "--log-level", r.Debug.Level)
   185  	}
   186  
   187  	cmd := exec.Command(binaryName, args...)
   188  	// redirect containerd logs to docker logs
   189  	cmd.Stdout = os.Stdout
   190  	cmd.Stderr = os.Stderr
   191  	cmd.SysProcAttr = containerdSysProcAttr()
   192  	// clear the NOTIFY_SOCKET from the env when starting containerd
   193  	cmd.Env = nil
   194  	for _, e := range os.Environ() {
   195  		if !strings.HasPrefix(e, "NOTIFY_SOCKET") {
   196  			cmd.Env = append(cmd.Env, e)
   197  		}
   198  	}
   199  	if err := cmd.Start(); err != nil {
   200  		return err
   201  	}
   202  
   203  	r.daemonWaitCh = make(chan struct{})
   204  	go func() {
   205  		// Reap our child when needed
   206  		if err := cmd.Wait(); err != nil {
   207  			r.logger.WithError(err).Errorf("containerd did not exit successfully")
   208  		}
   209  		close(r.daemonWaitCh)
   210  	}()
   211  
   212  	r.daemonPid = cmd.Process.Pid
   213  
   214  	err = os.WriteFile(filepath.Join(r.stateDir, pidFile), []byte(fmt.Sprintf("%d", r.daemonPid)), 0660)
   215  	if err != nil {
   216  		system.KillProcess(r.daemonPid)
   217  		return errors.Wrap(err, "libcontainerd: failed to save daemon pid to disk")
   218  	}
   219  
   220  	logrus.WithField("pid", r.daemonPid).
   221  		Infof("libcontainerd: started new %s process", binaryName)
   222  
   223  	return nil
   224  }
   225  
// monitorDaemon is the supervision loop for the containerd daemon. It is run
// in its own goroutine by Start and keeps going until ctx is cancelled or the
// very first attempt to start containerd fails.
//
// Each iteration waits for the current delay, then:
//   - if no daemon is tracked (r.daemonPid == -1), waits for any previous
//     child to be reaped, removes the GRPC address path, starts containerd
//     and creates a monitoring client;
//   - if a client exists, health-checks the daemon. On success it reports
//     startup completion (once) and blocks until the child exits or ctx is
//     cancelled. On failure it retries with a growing delay, and drops the
//     client only once maxConnectionRetryCount failures have accumulated and
//     the process is no longer alive;
//   - otherwise kills the daemon if it is still alive and marks it as not
//     tracked, so the next iteration starts it again.
//
// On return, the deferred cleanup stops the daemon if one is tracked, removes
// the pid file, runs the platform cleanup and closes r.daemonStopCh, which
// unblocks WaitTimeout.
func (r *remote) monitorDaemon(ctx context.Context) {
	var (
		transientFailureCount = 0
		client                *containerd.Client
		err                   error
		delay                 time.Duration
		timer                 = time.NewTimer(0)
		started               bool
	)

	defer func() {
		if r.daemonPid != -1 {
			r.stopDaemon()
		}

		// Remove the pid file; the result is deliberately not checked.
		os.Remove(filepath.Join(r.stateDir, pidFile))

		r.platformCleanup()

		// Signal waiters (see WaitTimeout) that supervision has ended.
		close(r.daemonStopCh)
		timer.Stop()
	}()

	// ensure no races on sending to timer.C even though there is a 0 duration.
	if !timer.Stop() {
		<-timer.C
	}

	for {
		// The timer is always stopped or drained at this point, so Reset is
		// safe. delay is zero on the first pass.
		timer.Reset(delay)

		select {
		case <-ctx.Done():
			r.logger.Info("stopping healthcheck following graceful shutdown")
			if client != nil {
				client.Close()
			}
			return
		case <-timer.C:
		}

		if r.daemonPid == -1 {
			// No daemon is tracked: (re)start one. If we previously started a
			// child ourselves, wait until it has been reaped first.
			if r.daemonWaitCh != nil {
				select {
				case <-ctx.Done():
					r.logger.Info("stopping containerd startup following graceful shutdown")
					return
				case <-r.daemonWaitCh:
				}
			}

			// Remove whatever is left at the GRPC address path before the
			// new daemon starts.
			os.RemoveAll(r.GRPC.Address)
			if err := r.startContainerd(); err != nil {
				if !started {
					// The initial start failed: report it to Start and give
					// up. The channel is buffered, so this never blocks.
					r.daemonStartCh <- err
					return
				}
				r.logger.WithError(err).Error("failed restarting containerd")
				delay = 50 * time.Millisecond
				continue
			}

			client, err = containerd.New(r.GRPC.Address, containerd.WithTimeout(60*time.Second))
			if err != nil {
				r.logger.WithError(err).Error("failed connecting to containerd")
				delay = 100 * time.Millisecond
				continue
			}
			logrus.WithField("address", r.GRPC.Address).Debug("Created containerd monitoring client")
		}

		if client != nil {
			// Health check, bounded by healthCheckTimeout.
			tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout)
			_, err := client.IsServing(tctx)
			cancel()
			if err == nil {
				if !started {
					// First successful health check: closing the channel
					// makes Start's receive yield a nil error.
					close(r.daemonStartCh)
					started = true
				}

				transientFailureCount = 0

				// The daemon is healthy; sleep until our child exits or we
				// are asked to shut down. For an adopted daemon daemonWaitCh
				// is nil, so only ctx cancellation wakes this up.
				select {
				case <-r.daemonWaitCh:
				case <-ctx.Done():
				}

				// Set a small delay in case there is a recurring failure (or bug in this code)
				// to ensure we don't end up in a super tight loop.
				delay = 500 * time.Millisecond
				continue
			}

			r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding")

			transientFailureCount++
			// Keep retrying, with a linearly growing delay, while there have
			// been few failures or the process is still alive. The client is
			// dropped only after enough failures AND a dead process.
			if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) {
				delay = time.Duration(transientFailureCount) * 200 * time.Millisecond
				continue
			}
			client.Close()
			client = nil
		}

		// Reached when there is no usable client. Make sure no old process
		// lingers before marking the daemon as not tracked.
		if system.IsProcessAlive(r.daemonPid) {
			r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd")
			r.killDaemon()
		}

		// Mark the daemon as not tracked so the next iteration restarts it
		// immediately.
		r.daemonPid = -1
		delay = 0
		transientFailureCount = 0
	}
}