github.com/MaximeAubanel/moby@v1.13.1/libcontainerd/remote_unix.go (about)

     1  // +build linux solaris
     2  
     3  package libcontainerd
     4  
     5  import (
     6  	"fmt"
     7  	"io"
     8  	"io/ioutil"
     9  	"log"
    10  	"net"
    11  	"os"
    12  	"os/exec"
    13  	"path/filepath"
    14  	goruntime "runtime"
    15  	"strconv"
    16  	"strings"
    17  	"sync"
    18  	"syscall"
    19  	"time"
    20  
    21  	"github.com/Sirupsen/logrus"
    22  	containerd "github.com/docker/containerd/api/grpc/types"
    23  	"github.com/docker/docker/pkg/locker"
    24  	sysinfo "github.com/docker/docker/pkg/system"
    25  	"github.com/docker/docker/utils"
    26  	"github.com/golang/protobuf/ptypes"
    27  	"github.com/golang/protobuf/ptypes/timestamp"
    28  	"golang.org/x/net/context"
    29  	"google.golang.org/grpc"
    30  	"google.golang.org/grpc/grpclog"
    31  	"google.golang.org/grpc/health/grpc_health_v1"
    32  	"google.golang.org/grpc/transport"
    33  )
    34  
    35  const (
    36  	maxConnectionRetryCount      = 3
    37  	containerdHealthCheckTimeout = 3 * time.Second
    38  	containerdShutdownTimeout    = 15 * time.Second
    39  	containerdBinary             = "docker-containerd"
    40  	containerdPidFilename        = "docker-containerd.pid"
    41  	containerdSockFilename       = "docker-containerd.sock"
    42  	containerdStateDir           = "containerd"
    43  	eventTimestampFilename       = "event.ts"
    44  )
    45  
    46  type remote struct {
    47  	sync.RWMutex
    48  	apiClient            containerd.APIClient
    49  	daemonPid            int
    50  	stateDir             string
    51  	rpcAddr              string
    52  	startDaemon          bool
    53  	closeManually        bool
    54  	debugLog             bool
    55  	rpcConn              *grpc.ClientConn
    56  	clients              []*client
    57  	eventTsPath          string
    58  	runtime              string
    59  	runtimeArgs          []string
    60  	daemonWaitCh         chan struct{}
    61  	liveRestore          bool
    62  	oomScore             int
    63  	restoreFromTimestamp *timestamp.Timestamp
    64  }
    65  
    66  // New creates a fresh instance of libcontainerd remote.
    67  func New(stateDir string, options ...RemoteOption) (_ Remote, err error) {
    68  	defer func() {
    69  		if err != nil {
    70  			err = fmt.Errorf("Failed to connect to containerd. Please make sure containerd is installed in your PATH or you have specified the correct address. Got error: %v", err)
    71  		}
    72  	}()
    73  	r := &remote{
    74  		stateDir:    stateDir,
    75  		daemonPid:   -1,
    76  		eventTsPath: filepath.Join(stateDir, eventTimestampFilename),
    77  	}
    78  	for _, option := range options {
    79  		if err := option.Apply(r); err != nil {
    80  			return nil, err
    81  		}
    82  	}
    83  
    84  	if err := sysinfo.MkdirAll(stateDir, 0700); err != nil {
    85  		return nil, err
    86  	}
    87  
    88  	if r.rpcAddr == "" {
    89  		r.rpcAddr = filepath.Join(stateDir, containerdSockFilename)
    90  	}
    91  
    92  	if r.startDaemon {
    93  		if err := r.runContainerdDaemon(); err != nil {
    94  			return nil, err
    95  		}
    96  	}
    97  
    98  	// don't output the grpc reconnect logging
    99  	grpclog.SetLogger(log.New(ioutil.Discard, "", log.LstdFlags))
   100  	dialOpts := append([]grpc.DialOption{grpc.WithInsecure()},
   101  		grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
   102  			return net.DialTimeout("unix", addr, timeout)
   103  		}),
   104  	)
   105  	conn, err := grpc.Dial(r.rpcAddr, dialOpts...)
   106  	if err != nil {
   107  		return nil, fmt.Errorf("error connecting to containerd: %v", err)
   108  	}
   109  
   110  	r.rpcConn = conn
   111  	r.apiClient = containerd.NewAPIClient(conn)
   112  
   113  	// Get the timestamp to restore from
   114  	t := r.getLastEventTimestamp()
   115  	tsp, err := ptypes.TimestampProto(t)
   116  	if err != nil {
   117  		logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err)
   118  	}
   119  	r.restoreFromTimestamp = tsp
   120  
   121  	go r.handleConnectionChange()
   122  
   123  	if err := r.startEventsMonitor(); err != nil {
   124  		return nil, err
   125  	}
   126  
   127  	return r, nil
   128  }
   129  
   130  func (r *remote) UpdateOptions(options ...RemoteOption) error {
   131  	for _, option := range options {
   132  		if err := option.Apply(r); err != nil {
   133  			return err
   134  		}
   135  	}
   136  	return nil
   137  }
   138  
   139  func (r *remote) handleConnectionChange() {
   140  	var transientFailureCount = 0
   141  
   142  	ticker := time.NewTicker(500 * time.Millisecond)
   143  	defer ticker.Stop()
   144  	healthClient := grpc_health_v1.NewHealthClient(r.rpcConn)
   145  
   146  	for {
   147  		<-ticker.C
   148  		ctx, cancel := context.WithTimeout(context.Background(), containerdHealthCheckTimeout)
   149  		_, err := healthClient.Check(ctx, &grpc_health_v1.HealthCheckRequest{})
   150  		cancel()
   151  		if err == nil {
   152  			continue
   153  		}
   154  
   155  		logrus.Debugf("libcontainerd: containerd health check returned error: %v", err)
   156  
   157  		if r.daemonPid != -1 {
   158  			if strings.Contains(err.Error(), "is closing") {
   159  				// Well, we asked for it to stop, just return
   160  				return
   161  			}
   162  			// all other errors are transient
   163  			// Reset state to be notified of next failure
   164  			transientFailureCount++
   165  			if transientFailureCount >= maxConnectionRetryCount {
   166  				transientFailureCount = 0
   167  				if utils.IsProcessAlive(r.daemonPid) {
   168  					utils.KillProcess(r.daemonPid)
   169  				}
   170  				<-r.daemonWaitCh
   171  				if err := r.runContainerdDaemon(); err != nil { //FIXME: Handle error
   172  					logrus.Errorf("libcontainerd: error restarting containerd: %v", err)
   173  				}
   174  				continue
   175  			}
   176  		}
   177  	}
   178  }
   179  
   180  func (r *remote) Cleanup() {
   181  	if r.daemonPid == -1 {
   182  		return
   183  	}
   184  	r.closeManually = true
   185  	r.rpcConn.Close()
   186  	// Ask the daemon to quit
   187  	syscall.Kill(r.daemonPid, syscall.SIGTERM)
   188  
   189  	// Wait up to 15secs for it to stop
   190  	for i := time.Duration(0); i < containerdShutdownTimeout; i += time.Second {
   191  		if !utils.IsProcessAlive(r.daemonPid) {
   192  			break
   193  		}
   194  		time.Sleep(time.Second)
   195  	}
   196  
   197  	if utils.IsProcessAlive(r.daemonPid) {
   198  		logrus.Warnf("libcontainerd: containerd (%d) didn't stop within 15 secs, killing it\n", r.daemonPid)
   199  		syscall.Kill(r.daemonPid, syscall.SIGKILL)
   200  	}
   201  
   202  	// cleanup some files
   203  	os.Remove(filepath.Join(r.stateDir, containerdPidFilename))
   204  	os.Remove(filepath.Join(r.stateDir, containerdSockFilename))
   205  }
   206  
   207  func (r *remote) Client(b Backend) (Client, error) {
   208  	c := &client{
   209  		clientCommon: clientCommon{
   210  			backend:    b,
   211  			containers: make(map[string]*container),
   212  			locker:     locker.New(),
   213  		},
   214  		remote:        r,
   215  		exitNotifiers: make(map[string]*exitNotifier),
   216  		liveRestore:   r.liveRestore,
   217  	}
   218  
   219  	r.Lock()
   220  	r.clients = append(r.clients, c)
   221  	r.Unlock()
   222  	return c, nil
   223  }
   224  
   225  func (r *remote) updateEventTimestamp(t time.Time) {
   226  	f, err := os.OpenFile(r.eventTsPath, syscall.O_CREAT|syscall.O_WRONLY|syscall.O_TRUNC, 0600)
   227  	if err != nil {
   228  		logrus.Warnf("libcontainerd: failed to open event timestamp file: %v", err)
   229  		return
   230  	}
   231  	defer f.Close()
   232  
   233  	b, err := t.MarshalText()
   234  	if err != nil {
   235  		logrus.Warnf("libcontainerd: failed to encode timestamp: %v", err)
   236  		return
   237  	}
   238  
   239  	n, err := f.Write(b)
   240  	if err != nil || n != len(b) {
   241  		logrus.Warnf("libcontainerd: failed to update event timestamp file: %v", err)
   242  		f.Truncate(0)
   243  		return
   244  	}
   245  }
   246  
   247  func (r *remote) getLastEventTimestamp() time.Time {
   248  	t := time.Now()
   249  
   250  	fi, err := os.Stat(r.eventTsPath)
   251  	if os.IsNotExist(err) || fi.Size() == 0 {
   252  		return t
   253  	}
   254  
   255  	f, err := os.Open(r.eventTsPath)
   256  	if err != nil {
   257  		logrus.Warnf("libcontainerd: Unable to access last event ts: %v", err)
   258  		return t
   259  	}
   260  	defer f.Close()
   261  
   262  	b := make([]byte, fi.Size())
   263  	n, err := f.Read(b)
   264  	if err != nil || n != len(b) {
   265  		logrus.Warnf("libcontainerd: Unable to read last event ts: %v", err)
   266  		return t
   267  	}
   268  
   269  	t.UnmarshalText(b)
   270  
   271  	return t
   272  }
   273  
   274  func (r *remote) startEventsMonitor() error {
   275  	// First, get past events
   276  	t := r.getLastEventTimestamp()
   277  	tsp, err := ptypes.TimestampProto(t)
   278  	if err != nil {
   279  		logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err)
   280  	}
   281  	er := &containerd.EventsRequest{
   282  		Timestamp: tsp,
   283  	}
   284  	events, err := r.apiClient.Events(context.Background(), er, grpc.FailFast(false))
   285  	if err != nil {
   286  		return err
   287  	}
   288  	go r.handleEventStream(events)
   289  	return nil
   290  }
   291  
   292  func (r *remote) handleEventStream(events containerd.API_EventsClient) {
   293  	for {
   294  		e, err := events.Recv()
   295  		if err != nil {
   296  			if grpc.ErrorDesc(err) == transport.ErrConnClosing.Desc &&
   297  				r.closeManually {
   298  				// ignore error if grpc remote connection is closed manually
   299  				return
   300  			}
   301  			logrus.Errorf("libcontainerd: failed to receive event from containerd: %v", err)
   302  			go r.startEventsMonitor()
   303  			return
   304  		}
   305  
   306  		logrus.Debugf("libcontainerd: received containerd event: %#v", e)
   307  
   308  		var container *container
   309  		var c *client
   310  		r.RLock()
   311  		for _, c = range r.clients {
   312  			container, err = c.getContainer(e.Id)
   313  			if err == nil {
   314  				break
   315  			}
   316  		}
   317  		r.RUnlock()
   318  		if container == nil {
   319  			logrus.Warnf("libcontainerd: unknown container %s", e.Id)
   320  			continue
   321  		}
   322  
   323  		if err := container.handleEvent(e); err != nil {
   324  			logrus.Errorf("libcontainerd: error processing state change for %s: %v", e.Id, err)
   325  		}
   326  
   327  		tsp, err := ptypes.Timestamp(e.Timestamp)
   328  		if err != nil {
   329  			logrus.Errorf("libcontainerd: failed to convert event timestamp: %q", err)
   330  			continue
   331  		}
   332  
   333  		r.updateEventTimestamp(tsp)
   334  	}
   335  }
   336  
   337  func (r *remote) runContainerdDaemon() error {
   338  	pidFilename := filepath.Join(r.stateDir, containerdPidFilename)
   339  	f, err := os.OpenFile(pidFilename, os.O_RDWR|os.O_CREATE, 0600)
   340  	if err != nil {
   341  		return err
   342  	}
   343  	defer f.Close()
   344  
   345  	// File exist, check if the daemon is alive
   346  	b := make([]byte, 8)
   347  	n, err := f.Read(b)
   348  	if err != nil && err != io.EOF {
   349  		return err
   350  	}
   351  
   352  	if n > 0 {
   353  		pid, err := strconv.ParseUint(string(b[:n]), 10, 64)
   354  		if err != nil {
   355  			return err
   356  		}
   357  		if utils.IsProcessAlive(int(pid)) {
   358  			logrus.Infof("libcontainerd: previous instance of containerd still alive (%d)", pid)
   359  			r.daemonPid = int(pid)
   360  			return nil
   361  		}
   362  	}
   363  
   364  	// rewind the file
   365  	_, err = f.Seek(0, os.SEEK_SET)
   366  	if err != nil {
   367  		return err
   368  	}
   369  
   370  	// Truncate it
   371  	err = f.Truncate(0)
   372  	if err != nil {
   373  		return err
   374  	}
   375  
   376  	// Start a new instance
   377  	args := []string{
   378  		"-l", fmt.Sprintf("unix://%s", r.rpcAddr),
   379  		"--metrics-interval=0",
   380  		"--start-timeout", "2m",
   381  		"--state-dir", filepath.Join(r.stateDir, containerdStateDir),
   382  	}
   383  	if goruntime.GOOS == "solaris" {
   384  		args = append(args, "--shim", "containerd-shim", "--runtime", "runc")
   385  	} else {
   386  		args = append(args, "--shim", "docker-containerd-shim")
   387  		if r.runtime != "" {
   388  			args = append(args, "--runtime")
   389  			args = append(args, r.runtime)
   390  		}
   391  	}
   392  	if r.debugLog {
   393  		args = append(args, "--debug")
   394  	}
   395  	if len(r.runtimeArgs) > 0 {
   396  		for _, v := range r.runtimeArgs {
   397  			args = append(args, "--runtime-args")
   398  			args = append(args, v)
   399  		}
   400  		logrus.Debugf("libcontainerd: runContainerdDaemon: runtimeArgs: %s", args)
   401  	}
   402  
   403  	cmd := exec.Command(containerdBinary, args...)
   404  	// redirect containerd logs to docker logs
   405  	cmd.Stdout = os.Stdout
   406  	cmd.Stderr = os.Stderr
   407  	cmd.SysProcAttr = setSysProcAttr(true)
   408  	cmd.Env = nil
   409  	// clear the NOTIFY_SOCKET from the env when starting containerd
   410  	for _, e := range os.Environ() {
   411  		if !strings.HasPrefix(e, "NOTIFY_SOCKET") {
   412  			cmd.Env = append(cmd.Env, e)
   413  		}
   414  	}
   415  	if err := cmd.Start(); err != nil {
   416  		return err
   417  	}
   418  	logrus.Infof("libcontainerd: new containerd process, pid: %d", cmd.Process.Pid)
   419  	if err := setOOMScore(cmd.Process.Pid, r.oomScore); err != nil {
   420  		utils.KillProcess(cmd.Process.Pid)
   421  		return err
   422  	}
   423  	if _, err := f.WriteString(fmt.Sprintf("%d", cmd.Process.Pid)); err != nil {
   424  		utils.KillProcess(cmd.Process.Pid)
   425  		return err
   426  	}
   427  
   428  	r.daemonWaitCh = make(chan struct{})
   429  	go func() {
   430  		cmd.Wait()
   431  		close(r.daemonWaitCh)
   432  	}() // Reap our child when needed
   433  	r.daemonPid = cmd.Process.Pid
   434  	return nil
   435  }
   436  
   437  // WithRemoteAddr sets the external containerd socket to connect to.
   438  func WithRemoteAddr(addr string) RemoteOption {
   439  	return rpcAddr(addr)
   440  }
   441  
   442  type rpcAddr string
   443  
   444  func (a rpcAddr) Apply(r Remote) error {
   445  	if remote, ok := r.(*remote); ok {
   446  		remote.rpcAddr = string(a)
   447  		return nil
   448  	}
   449  	return fmt.Errorf("WithRemoteAddr option not supported for this remote")
   450  }
   451  
   452  // WithRuntimePath sets the path of the runtime to be used as the
   453  // default by containerd
   454  func WithRuntimePath(rt string) RemoteOption {
   455  	return runtimePath(rt)
   456  }
   457  
   458  type runtimePath string
   459  
   460  func (rt runtimePath) Apply(r Remote) error {
   461  	if remote, ok := r.(*remote); ok {
   462  		remote.runtime = string(rt)
   463  		return nil
   464  	}
   465  	return fmt.Errorf("WithRuntime option not supported for this remote")
   466  }
   467  
   468  // WithRuntimeArgs sets the list of runtime args passed to containerd
   469  func WithRuntimeArgs(args []string) RemoteOption {
   470  	return runtimeArgs(args)
   471  }
   472  
   473  type runtimeArgs []string
   474  
   475  func (rt runtimeArgs) Apply(r Remote) error {
   476  	if remote, ok := r.(*remote); ok {
   477  		remote.runtimeArgs = rt
   478  		return nil
   479  	}
   480  	return fmt.Errorf("WithRuntimeArgs option not supported for this remote")
   481  }
   482  
   483  // WithStartDaemon defines if libcontainerd should also run containerd daemon.
   484  func WithStartDaemon(start bool) RemoteOption {
   485  	return startDaemon(start)
   486  }
   487  
   488  type startDaemon bool
   489  
   490  func (s startDaemon) Apply(r Remote) error {
   491  	if remote, ok := r.(*remote); ok {
   492  		remote.startDaemon = bool(s)
   493  		return nil
   494  	}
   495  	return fmt.Errorf("WithStartDaemon option not supported for this remote")
   496  }
   497  
   498  // WithDebugLog defines if containerd debug logs will be enabled for daemon.
   499  func WithDebugLog(debug bool) RemoteOption {
   500  	return debugLog(debug)
   501  }
   502  
   503  type debugLog bool
   504  
   505  func (d debugLog) Apply(r Remote) error {
   506  	if remote, ok := r.(*remote); ok {
   507  		remote.debugLog = bool(d)
   508  		return nil
   509  	}
   510  	return fmt.Errorf("WithDebugLog option not supported for this remote")
   511  }
   512  
   513  // WithLiveRestore defines if containers are stopped on shutdown or restored.
   514  func WithLiveRestore(v bool) RemoteOption {
   515  	return liveRestore(v)
   516  }
   517  
   518  type liveRestore bool
   519  
   520  func (l liveRestore) Apply(r Remote) error {
   521  	if remote, ok := r.(*remote); ok {
   522  		remote.liveRestore = bool(l)
   523  		for _, c := range remote.clients {
   524  			c.liveRestore = bool(l)
   525  		}
   526  		return nil
   527  	}
   528  	return fmt.Errorf("WithLiveRestore option not supported for this remote")
   529  }
   530  
   531  // WithOOMScore defines the oom_score_adj to set for the containerd process.
   532  func WithOOMScore(score int) RemoteOption {
   533  	return oomScore(score)
   534  }
   535  
   536  type oomScore int
   537  
   538  func (o oomScore) Apply(r Remote) error {
   539  	if remote, ok := r.(*remote); ok {
   540  		remote.oomScore = int(o)
   541  		return nil
   542  	}
   543  	return fmt.Errorf("WithOOMScore option not supported for this remote")
   544  }