github.com/docker/docker@v299999999.0.0-20200612211812-aaf470eca7b5+incompatible/libcontainerd/supervisor/remote_daemon.go (about)

     1  package supervisor // import "github.com/docker/docker/libcontainerd/supervisor"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io"
     7  	"io/ioutil"
     8  	"os"
     9  	"os/exec"
    10  	"path/filepath"
    11  	"strconv"
    12  	"strings"
    13  	"sync"
    14  	"time"
    15  
    16  	"github.com/BurntSushi/toml"
    17  	"github.com/containerd/containerd"
    18  	"github.com/containerd/containerd/services/server/config"
    19  	"github.com/docker/docker/pkg/system"
    20  	"github.com/pkg/errors"
    21  	"github.com/sirupsen/logrus"
    22  )
    23  
// Tunables and file names used by the containerd supervisor:
//
//   - maxConnectionRetryCount: number of consecutive failed health checks
//     after which monitorDaemon stops retrying once the process is no longer
//     alive.
//   - healthCheckTimeout: deadline applied to each IsServing health check.
//   - shutdownTimeout: not referenced in this file; presumably the grace
//     period used when stopping the daemon (confirm against stopDaemon).
//   - startupTimeout: how long Start waits for the first start report.
//   - configFile: name of the generated TOML config inside the state dir.
//   - binaryName: executable launched to run the daemon; also used in logs.
//   - pidFile: name of the file inside the state dir holding the daemon PID.
const (
	maxConnectionRetryCount = 3
	healthCheckTimeout      = 3 * time.Second
	shutdownTimeout         = 15 * time.Second
	startupTimeout          = 15 * time.Second
	configFile              = "containerd.toml"
	binaryName              = "containerd"
	pidFile                 = "containerd.pid"
)
    33  
// pluginConfigs wraps the plugin configuration map so that it is encoded
// under the "plugins" TOML key. getContainerdConfig writes it to the config
// file right after the general config.
type pluginConfigs struct {
	Plugins map[string]interface{} `toml:"plugins"`
}
    37  
// remote holds the state of a supervised containerd daemon and is the Daemon
// implementation returned by Start.
type remote struct {
	// NOTE(review): the embedded mutex is not used anywhere in this file;
	// confirm what it guards in the platform-specific files.
	sync.RWMutex
	// Embedded general containerd configuration; it is TOML-encoded into the
	// config file and provides GRPC.Address and Debug.Level.
	config.Config

	// daemonPid is the PID of the supervised process, or -1 when there is
	// none. logger is tagged with module=libcontainerd by Start.
	daemonPid int
	logger    *logrus.Entry

	// daemonWaitCh is closed once a child started by startContainerd has been
	// reaped. daemonStartCh carries the initial start error, or is closed on
	// the first successful health check. daemonStopCh is closed when
	// monitorDaemon exits.
	daemonWaitCh  chan struct{}
	daemonStartCh chan error
	daemonStopCh  chan struct{}

	// rootDir and stateDir are the directories passed to Start; the pid and
	// config files live in stateDir. pluginConfs is encoded into the config
	// file after the general config.
	rootDir     string
	stateDir    string
	pluginConfs pluginConfigs
}
    53  
// Daemon represents a running containerd daemon.
//
// WaitTimeout blocks until the daemon supervisor has stopped, returning an
// error if that does not happen within the given duration. Address returns
// the GRPC address of the daemon.
type Daemon interface {
	WaitTimeout(time.Duration) error
	Address() string
}
    59  
// DaemonOpt configures parameters of the containerd daemon. Start applies
// every option to the remote before launching the daemon and aborts with the
// first error an option returns.
type DaemonOpt func(c *remote) error
    62  
    63  // Start starts a containerd daemon and monitors it
    64  func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Daemon, error) {
    65  	r := &remote{
    66  		rootDir:  rootDir,
    67  		stateDir: stateDir,
    68  		Config: config.Config{
    69  			Root:  filepath.Join(rootDir, "daemon"),
    70  			State: filepath.Join(stateDir, "daemon"),
    71  		},
    72  		pluginConfs:   pluginConfigs{make(map[string]interface{})},
    73  		daemonPid:     -1,
    74  		logger:        logrus.WithField("module", "libcontainerd"),
    75  		daemonStartCh: make(chan error, 1),
    76  		daemonStopCh:  make(chan struct{}),
    77  	}
    78  
    79  	for _, opt := range opts {
    80  		if err := opt(r); err != nil {
    81  			return nil, err
    82  		}
    83  	}
    84  	r.setDefaults()
    85  
    86  	if err := system.MkdirAll(stateDir, 0700); err != nil {
    87  		return nil, err
    88  	}
    89  
    90  	go r.monitorDaemon(ctx)
    91  
    92  	timeout := time.NewTimer(startupTimeout)
    93  	defer timeout.Stop()
    94  
    95  	select {
    96  	case <-timeout.C:
    97  		return nil, errors.New("timeout waiting for containerd to start")
    98  	case err := <-r.daemonStartCh:
    99  		if err != nil {
   100  			return nil, err
   101  		}
   102  	}
   103  
   104  	return r, nil
   105  }
   106  func (r *remote) WaitTimeout(d time.Duration) error {
   107  	timeout := time.NewTimer(d)
   108  	defer timeout.Stop()
   109  
   110  	select {
   111  	case <-timeout.C:
   112  		return errors.New("timeout waiting for containerd to stop")
   113  	case <-r.daemonStopCh:
   114  	}
   115  
   116  	return nil
   117  }
   118  
   119  func (r *remote) Address() string {
   120  	return r.GRPC.Address
   121  }
   122  func (r *remote) getContainerdPid() (int, error) {
   123  	pidFile := filepath.Join(r.stateDir, pidFile)
   124  	f, err := os.OpenFile(pidFile, os.O_RDWR, 0600)
   125  	if err != nil {
   126  		if os.IsNotExist(err) {
   127  			return -1, nil
   128  		}
   129  		return -1, err
   130  	}
   131  	defer f.Close()
   132  
   133  	b := make([]byte, 8)
   134  	n, err := f.Read(b)
   135  	if err != nil && err != io.EOF {
   136  		return -1, err
   137  	}
   138  
   139  	if n > 0 {
   140  		pid, err := strconv.ParseUint(string(b[:n]), 10, 64)
   141  		if err != nil {
   142  			return -1, err
   143  		}
   144  		if system.IsProcessAlive(int(pid)) {
   145  			return int(pid), nil
   146  		}
   147  	}
   148  
   149  	return -1, nil
   150  }
   151  
   152  func (r *remote) getContainerdConfig() (string, error) {
   153  	path := filepath.Join(r.stateDir, configFile)
   154  	f, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600)
   155  	if err != nil {
   156  		return "", errors.Wrapf(err, "failed to open containerd config file at %s", path)
   157  	}
   158  	defer f.Close()
   159  
   160  	enc := toml.NewEncoder(f)
   161  	if err = enc.Encode(r.Config); err != nil {
   162  		return "", errors.Wrapf(err, "failed to encode general config")
   163  	}
   164  	if err = enc.Encode(r.pluginConfs); err != nil {
   165  		return "", errors.Wrapf(err, "failed to encode plugin configs")
   166  	}
   167  
   168  	return path, nil
   169  }
   170  
   171  func (r *remote) startContainerd() error {
   172  	pid, err := r.getContainerdPid()
   173  	if err != nil {
   174  		return err
   175  	}
   176  
   177  	if pid != -1 {
   178  		r.daemonPid = pid
   179  		logrus.WithField("pid", pid).
   180  			Infof("libcontainerd: %s is still running", binaryName)
   181  		return nil
   182  	}
   183  
   184  	configFile, err := r.getContainerdConfig()
   185  	if err != nil {
   186  		return err
   187  	}
   188  
   189  	args := []string{"--config", configFile}
   190  
   191  	if r.Debug.Level != "" {
   192  		args = append(args, "--log-level", r.Debug.Level)
   193  	}
   194  
   195  	cmd := exec.Command(binaryName, args...)
   196  	// redirect containerd logs to docker logs
   197  	cmd.Stdout = os.Stdout
   198  	cmd.Stderr = os.Stderr
   199  	cmd.SysProcAttr = containerdSysProcAttr()
   200  	// clear the NOTIFY_SOCKET from the env when starting containerd
   201  	cmd.Env = nil
   202  	for _, e := range os.Environ() {
   203  		if !strings.HasPrefix(e, "NOTIFY_SOCKET") {
   204  			cmd.Env = append(cmd.Env, e)
   205  		}
   206  	}
   207  	if err := cmd.Start(); err != nil {
   208  		return err
   209  	}
   210  
   211  	r.daemonWaitCh = make(chan struct{})
   212  	go func() {
   213  		// Reap our child when needed
   214  		if err := cmd.Wait(); err != nil {
   215  			r.logger.WithError(err).Errorf("containerd did not exit successfully")
   216  		}
   217  		close(r.daemonWaitCh)
   218  	}()
   219  
   220  	r.daemonPid = cmd.Process.Pid
   221  
   222  	err = ioutil.WriteFile(filepath.Join(r.stateDir, pidFile), []byte(fmt.Sprintf("%d", r.daemonPid)), 0660)
   223  	if err != nil {
   224  		system.KillProcess(r.daemonPid)
   225  		return errors.Wrap(err, "libcontainerd: failed to save daemon pid to disk")
   226  	}
   227  
   228  	logrus.WithField("pid", r.daemonPid).
   229  		Infof("libcontainerd: started new %s process", binaryName)
   230  
   231  	return nil
   232  }
   233  
// monitorDaemon is the supervision loop for the containerd daemon. It runs
// until ctx is cancelled or the very first start attempt fails.
//
// Each iteration waits for the current delay and then:
//   - if no daemon is tracked (daemonPid == -1), starts one and opens a
//     monitoring client for it;
//   - if a client exists, health-checks the daemon with IsServing; while
//     healthy it blocks until the child exits or ctx is cancelled;
//   - once the daemon is given up on, kills it if it is still alive and
//     resets daemonPid to -1 so the next iteration starts a new one.
//
// The outcome of the first start is reported through daemonStartCh: the
// start error is sent on failure, and the channel is closed on the first
// successful health check. On exit the daemon is stopped (if one is
// tracked), the pid file is removed, platform cleanup runs and daemonStopCh
// is closed.
func (r *remote) monitorDaemon(ctx context.Context) {
	var (
		transientFailureCount = 0
		client                *containerd.Client
		err                   error
		delay                 time.Duration
		timer                 = time.NewTimer(0)
		started               bool
	)

	defer func() {
		if r.daemonPid != -1 {
			r.stopDaemon()
		}

		// cleanup some files
		os.Remove(filepath.Join(r.stateDir, pidFile))

		r.platformCleanup()

		// Closing daemonStopCh is what unblocks WaitTimeout.
		close(r.daemonStopCh)
		timer.Stop()
	}()

	// ensure no races on sending to timer.C even though there is a 0 duration.
	// The timer is stopped and drained here so that the Reset at the top of
	// the loop always starts from a clean state.
	if !timer.Stop() {
		<-timer.C
	}

	for {
		timer.Reset(delay)

		// Wait out the delay, unless shutdown is requested first.
		select {
		case <-ctx.Done():
			r.logger.Info("stopping healthcheck following graceful shutdown")
			if client != nil {
				client.Close()
			}
			return
		case <-timer.C:
		}

		if r.daemonPid == -1 {
			// No daemon is tracked. If we spawned a child earlier, wait for
			// it to be reaped before starting a replacement.
			if r.daemonWaitCh != nil {
				select {
				case <-ctx.Done():
					r.logger.Info("stopping containerd startup following graceful shutdown")
					return
				case <-r.daemonWaitCh:
				}
			}

			// Remove whatever is left at the GRPC address before starting.
			os.RemoveAll(r.GRPC.Address)
			if err := r.startContainerd(); err != nil {
				if !started {
					// The initial start failed: report it to Start and give up.
					r.daemonStartCh <- err
					return
				}
				r.logger.WithError(err).Error("failed restarting containerd")
				delay = 50 * time.Millisecond
				continue
			}

			client, err = containerd.New(r.GRPC.Address, containerd.WithTimeout(60*time.Second))
			if err != nil {
				// daemonPid is set by now, so the next iteration skips the
				// health check (no client) and falls through to the
				// kill-and-restart step at the bottom of the loop.
				r.logger.WithError(err).Error("failed connecting to containerd")
				delay = 100 * time.Millisecond
				continue
			}
			logrus.WithField("address", r.GRPC.Address).Debug("Created containerd monitoring client")
		}

		if client != nil {
			tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout)
			_, err := client.IsServing(tctx)
			cancel()
			if err == nil {
				if !started {
					// First successful health check: unblock Start.
					close(r.daemonStartCh)
					started = true
				}

				transientFailureCount = 0

				// Block until the child we started exits or shutdown is
				// requested. When daemonWaitCh is nil (an already running
				// process was adopted) only ctx cancellation wakes this up.
				select {
				case <-r.daemonWaitCh:
				case <-ctx.Done():
				}

				// Set a small delay in case there is a recurring failure (or bug in this code)
				// to ensure we don't end up in a super tight loop.
				delay = 500 * time.Millisecond
				continue
			}

			r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding")

			// Keep retrying with a growing delay while the failure count is
			// below the limit or the process is still alive; only give up
			// when the limit is reached and the process is gone.
			transientFailureCount++
			if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) {
				delay = time.Duration(transientFailureCount) * 200 * time.Millisecond
				continue
			}
			client.Close()
			client = nil
		}

		if system.IsProcessAlive(r.daemonPid) {
			r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd")
			r.killDaemon()
		}

		// Forget the daemon so the next iteration starts a new one right away.
		r.daemonPid = -1
		delay = 0
		transientFailureCount = 0
	}
}