github.com/go/docker@v1.12.0-rc2/libcontainerd/remote_linux.go (about)

     1  package libcontainerd
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"io/ioutil"
     7  	"log"
     8  	"net"
     9  	"os"
    10  	"os/exec"
    11  	"path/filepath"
    12  	"strconv"
    13  	"strings"
    14  	"sync"
    15  	"syscall"
    16  	"time"
    17  
    18  	"github.com/Sirupsen/logrus"
    19  	containerd "github.com/docker/containerd/api/grpc/types"
    20  	"github.com/docker/docker/pkg/locker"
    21  	sysinfo "github.com/docker/docker/pkg/system"
    22  	"github.com/docker/docker/utils"
    23  	"golang.org/x/net/context"
    24  	"google.golang.org/grpc"
    25  	"google.golang.org/grpc/grpclog"
    26  	"google.golang.org/grpc/transport"
    27  )
    28  
    29  const (
    30  	maxConnectionRetryCount   = 3
    31  	connectionRetryDelay      = 3 * time.Second
    32  	containerdShutdownTimeout = 15 * time.Second
    33  	containerdBinary          = "docker-containerd"
    34  	containerdPidFilename     = "docker-containerd.pid"
    35  	containerdSockFilename    = "docker-containerd.sock"
    36  	containerdStateDir        = "containerd"
    37  	eventTimestampFilename    = "event.ts"
    38  )
    39  
    40  type remote struct {
    41  	sync.RWMutex
    42  	apiClient     containerd.APIClient
    43  	daemonPid     int
    44  	stateDir      string
    45  	rpcAddr       string
    46  	startDaemon   bool
    47  	closeManually bool
    48  	debugLog      bool
    49  	rpcConn       *grpc.ClientConn
    50  	clients       []*client
    51  	eventTsPath   string
    52  	pastEvents    map[string]*containerd.Event
    53  	runtime       string
    54  	runtimeArgs   []string
    55  	daemonWaitCh  chan struct{}
    56  	liveRestore   bool
    57  }
    58  
    59  // New creates a fresh instance of libcontainerd remote.
    60  func New(stateDir string, options ...RemoteOption) (_ Remote, err error) {
    61  	defer func() {
    62  		if err != nil {
    63  			err = fmt.Errorf("Failed to connect to containerd. Please make sure containerd is installed in your PATH or you have specificed the correct address. Got error: %v", err)
    64  		}
    65  	}()
    66  	r := &remote{
    67  		stateDir:    stateDir,
    68  		daemonPid:   -1,
    69  		eventTsPath: filepath.Join(stateDir, eventTimestampFilename),
    70  		pastEvents:  make(map[string]*containerd.Event),
    71  	}
    72  	for _, option := range options {
    73  		if err := option.Apply(r); err != nil {
    74  			return nil, err
    75  		}
    76  	}
    77  
    78  	if err := sysinfo.MkdirAll(stateDir, 0700); err != nil {
    79  		return nil, err
    80  	}
    81  
    82  	if r.rpcAddr == "" {
    83  		r.rpcAddr = filepath.Join(stateDir, containerdSockFilename)
    84  	}
    85  
    86  	if r.startDaemon {
    87  		if err := r.runContainerdDaemon(); err != nil {
    88  			return nil, err
    89  		}
    90  	}
    91  
    92  	// don't output the grpc reconnect logging
    93  	grpclog.SetLogger(log.New(ioutil.Discard, "", log.LstdFlags))
    94  	dialOpts := append([]grpc.DialOption{grpc.WithInsecure()},
    95  		grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
    96  			return net.DialTimeout("unix", addr, timeout)
    97  		}),
    98  	)
    99  	conn, err := grpc.Dial(r.rpcAddr, dialOpts...)
   100  	if err != nil {
   101  		return nil, fmt.Errorf("error connecting to containerd: %v", err)
   102  	}
   103  
   104  	r.rpcConn = conn
   105  	r.apiClient = containerd.NewAPIClient(conn)
   106  
   107  	go r.handleConnectionChange()
   108  
   109  	if err := r.startEventsMonitor(); err != nil {
   110  		return nil, err
   111  	}
   112  
   113  	return r, nil
   114  }
   115  
   116  func (r *remote) UpdateOptions(options ...RemoteOption) error {
   117  	for _, option := range options {
   118  		if err := option.Apply(r); err != nil {
   119  			return err
   120  		}
   121  	}
   122  	return nil
   123  }
   124  
   125  func (r *remote) handleConnectionChange() {
   126  	var transientFailureCount = 0
   127  	state := grpc.Idle
   128  	for {
   129  		s, err := r.rpcConn.WaitForStateChange(context.Background(), state)
   130  		if err != nil {
   131  			break
   132  		}
   133  		state = s
   134  		logrus.Debugf("containerd connection state change: %v", s)
   135  
   136  		if r.daemonPid != -1 {
   137  			switch state {
   138  			case grpc.TransientFailure:
   139  				// Reset state to be notified of next failure
   140  				transientFailureCount++
   141  				if transientFailureCount >= maxConnectionRetryCount {
   142  					transientFailureCount = 0
   143  					if utils.IsProcessAlive(r.daemonPid) {
   144  						utils.KillProcess(r.daemonPid)
   145  						<-r.daemonWaitCh
   146  					}
   147  					if err := r.runContainerdDaemon(); err != nil { //FIXME: Handle error
   148  						logrus.Errorf("error restarting containerd: %v", err)
   149  					}
   150  				} else {
   151  					state = grpc.Idle
   152  					time.Sleep(connectionRetryDelay)
   153  				}
   154  			case grpc.Shutdown:
   155  				// Well, we asked for it to stop, just return
   156  				return
   157  			}
   158  		}
   159  	}
   160  }
   161  
   162  func (r *remote) Cleanup() {
   163  	if r.daemonPid == -1 {
   164  		return
   165  	}
   166  	r.closeManually = true
   167  	r.rpcConn.Close()
   168  	// Ask the daemon to quit
   169  	syscall.Kill(r.daemonPid, syscall.SIGTERM)
   170  
   171  	// Wait up to 15secs for it to stop
   172  	for i := time.Duration(0); i < containerdShutdownTimeout; i += time.Second {
   173  		if !utils.IsProcessAlive(r.daemonPid) {
   174  			break
   175  		}
   176  		time.Sleep(time.Second)
   177  	}
   178  
   179  	if utils.IsProcessAlive(r.daemonPid) {
   180  		logrus.Warnf("libcontainerd: containerd (%d) didn't stop within 15 secs, killing it\n", r.daemonPid)
   181  		syscall.Kill(r.daemonPid, syscall.SIGKILL)
   182  	}
   183  
   184  	// cleanup some files
   185  	os.Remove(filepath.Join(r.stateDir, containerdPidFilename))
   186  	os.Remove(filepath.Join(r.stateDir, containerdSockFilename))
   187  }
   188  
   189  func (r *remote) Client(b Backend) (Client, error) {
   190  	c := &client{
   191  		clientCommon: clientCommon{
   192  			backend:    b,
   193  			containers: make(map[string]*container),
   194  			locker:     locker.New(),
   195  		},
   196  		remote:        r,
   197  		exitNotifiers: make(map[string]*exitNotifier),
   198  		liveRestore:   r.liveRestore,
   199  	}
   200  
   201  	r.Lock()
   202  	r.clients = append(r.clients, c)
   203  	r.Unlock()
   204  	return c, nil
   205  }
   206  
   207  func (r *remote) updateEventTimestamp(t time.Time) {
   208  	f, err := os.OpenFile(r.eventTsPath, syscall.O_CREAT|syscall.O_WRONLY|syscall.O_TRUNC, 0600)
   209  	defer f.Close()
   210  	if err != nil {
   211  		logrus.Warnf("libcontainerd: failed to open event timestamp file: %v", err)
   212  		return
   213  	}
   214  
   215  	b, err := t.MarshalText()
   216  	if err != nil {
   217  		logrus.Warnf("libcontainerd: failed to encode timestamp: %v", err)
   218  		return
   219  	}
   220  
   221  	n, err := f.Write(b)
   222  	if err != nil || n != len(b) {
   223  		logrus.Warnf("libcontainerd: failed to update event timestamp file: %v", err)
   224  		f.Truncate(0)
   225  		return
   226  	}
   227  
   228  }
   229  
   230  func (r *remote) getLastEventTimestamp() int64 {
   231  	t := time.Now()
   232  
   233  	fi, err := os.Stat(r.eventTsPath)
   234  	if os.IsNotExist(err) || fi.Size() == 0 {
   235  		return t.Unix()
   236  	}
   237  
   238  	f, err := os.Open(r.eventTsPath)
   239  	defer f.Close()
   240  	if err != nil {
   241  		logrus.Warnf("libcontainerd: Unable to access last event ts: %v", err)
   242  		return t.Unix()
   243  	}
   244  
   245  	b := make([]byte, fi.Size())
   246  	n, err := f.Read(b)
   247  	if err != nil || n != len(b) {
   248  		logrus.Warnf("libcontainerd: Unable to read last event ts: %v", err)
   249  		return t.Unix()
   250  	}
   251  
   252  	t.UnmarshalText(b)
   253  
   254  	return t.Unix()
   255  }
   256  
   257  func (r *remote) startEventsMonitor() error {
   258  	// First, get past events
   259  	er := &containerd.EventsRequest{
   260  		Timestamp: uint64(r.getLastEventTimestamp()),
   261  	}
   262  	events, err := r.apiClient.Events(context.Background(), er)
   263  	if err != nil {
   264  		return err
   265  	}
   266  	go r.handleEventStream(events)
   267  	return nil
   268  }
   269  
   270  func (r *remote) handleEventStream(events containerd.API_EventsClient) {
   271  	live := false
   272  	for {
   273  		e, err := events.Recv()
   274  		if err != nil {
   275  			if grpc.ErrorDesc(err) == transport.ErrConnClosing.Desc &&
   276  				r.closeManually {
   277  				// ignore error if grpc remote connection is closed manually
   278  				return
   279  			}
   280  			logrus.Errorf("failed to receive event from containerd: %v", err)
   281  			go r.startEventsMonitor()
   282  			return
   283  		}
   284  
   285  		if live == false {
   286  			logrus.Debugf("received past containerd event: %#v", e)
   287  
   288  			// Pause/Resume events should never happens after exit one
   289  			switch e.Type {
   290  			case StateExit:
   291  				r.pastEvents[e.Id] = e
   292  			case StatePause:
   293  				r.pastEvents[e.Id] = e
   294  			case StateResume:
   295  				r.pastEvents[e.Id] = e
   296  			case stateLive:
   297  				live = true
   298  				r.updateEventTimestamp(time.Unix(int64(e.Timestamp), 0))
   299  			}
   300  		} else {
   301  			logrus.Debugf("received containerd event: %#v", e)
   302  
   303  			var container *container
   304  			var c *client
   305  			r.RLock()
   306  			for _, c = range r.clients {
   307  				container, err = c.getContainer(e.Id)
   308  				if err == nil {
   309  					break
   310  				}
   311  			}
   312  			r.RUnlock()
   313  			if container == nil {
   314  				logrus.Errorf("no state for container: %q", err)
   315  				continue
   316  			}
   317  
   318  			if err := container.handleEvent(e); err != nil {
   319  				logrus.Errorf("error processing state change for %s: %v", e.Id, err)
   320  			}
   321  
   322  			r.updateEventTimestamp(time.Unix(int64(e.Timestamp), 0))
   323  		}
   324  	}
   325  }
   326  
   327  func (r *remote) runContainerdDaemon() error {
   328  	pidFilename := filepath.Join(r.stateDir, containerdPidFilename)
   329  	f, err := os.OpenFile(pidFilename, os.O_RDWR|os.O_CREATE, 0600)
   330  	defer f.Close()
   331  	if err != nil {
   332  		return err
   333  	}
   334  
   335  	// File exist, check if the daemon is alive
   336  	b := make([]byte, 8)
   337  	n, err := f.Read(b)
   338  	if err != nil && err != io.EOF {
   339  		return err
   340  	}
   341  
   342  	if n > 0 {
   343  		pid, err := strconv.ParseUint(string(b[:n]), 10, 64)
   344  		if err != nil {
   345  			return err
   346  		}
   347  		if utils.IsProcessAlive(int(pid)) {
   348  			logrus.Infof("previous instance of containerd still alive (%d)", pid)
   349  			r.daemonPid = int(pid)
   350  			return nil
   351  		}
   352  	}
   353  
   354  	// rewind the file
   355  	_, err = f.Seek(0, os.SEEK_SET)
   356  	if err != nil {
   357  		return err
   358  	}
   359  
   360  	// Truncate it
   361  	err = f.Truncate(0)
   362  	if err != nil {
   363  		return err
   364  	}
   365  
   366  	// Start a new instance
   367  	args := []string{
   368  		"-l", fmt.Sprintf("unix://%s", r.rpcAddr),
   369  		"--shim", "docker-containerd-shim",
   370  		"--metrics-interval=0",
   371  		"--start-timeout", "2m",
   372  		"--state-dir", filepath.Join(r.stateDir, containerdStateDir),
   373  	}
   374  	if r.runtime != "" {
   375  		args = append(args, "--runtime")
   376  		args = append(args, r.runtime)
   377  	}
   378  	if r.debugLog {
   379  		args = append(args, "--debug")
   380  	}
   381  	if len(r.runtimeArgs) > 0 {
   382  		for _, v := range r.runtimeArgs {
   383  			args = append(args, "--runtime-args")
   384  			args = append(args, v)
   385  		}
   386  		logrus.Debugf("runContainerdDaemon: runtimeArgs: %s", args)
   387  	}
   388  
   389  	cmd := exec.Command(containerdBinary, args...)
   390  	// redirect containerd logs to docker logs
   391  	cmd.Stdout = os.Stdout
   392  	cmd.Stderr = os.Stderr
   393  	cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true, Pdeathsig: syscall.SIGKILL}
   394  	cmd.Env = nil
   395  	// clear the NOTIFY_SOCKET from the env when starting containerd
   396  	for _, e := range os.Environ() {
   397  		if !strings.HasPrefix(e, "NOTIFY_SOCKET") {
   398  			cmd.Env = append(cmd.Env, e)
   399  		}
   400  	}
   401  	if err := cmd.Start(); err != nil {
   402  		return err
   403  	}
   404  	logrus.Infof("New containerd process, pid: %d", cmd.Process.Pid)
   405  
   406  	if _, err := f.WriteString(fmt.Sprintf("%d", cmd.Process.Pid)); err != nil {
   407  		utils.KillProcess(cmd.Process.Pid)
   408  		return err
   409  	}
   410  
   411  	r.daemonWaitCh = make(chan struct{})
   412  	go func() {
   413  		cmd.Wait()
   414  		close(r.daemonWaitCh)
   415  	}() // Reap our child when needed
   416  	r.daemonPid = cmd.Process.Pid
   417  	return nil
   418  }
   419  
   420  // WithRemoteAddr sets the external containerd socket to connect to.
   421  func WithRemoteAddr(addr string) RemoteOption {
   422  	return rpcAddr(addr)
   423  }
   424  
   425  type rpcAddr string
   426  
   427  func (a rpcAddr) Apply(r Remote) error {
   428  	if remote, ok := r.(*remote); ok {
   429  		remote.rpcAddr = string(a)
   430  		return nil
   431  	}
   432  	return fmt.Errorf("WithRemoteAddr option not supported for this remote")
   433  }
   434  
   435  // WithRuntimePath sets the path of the runtime to be used as the
   436  // default by containerd
   437  func WithRuntimePath(rt string) RemoteOption {
   438  	return runtimePath(rt)
   439  }
   440  
   441  type runtimePath string
   442  
   443  func (rt runtimePath) Apply(r Remote) error {
   444  	if remote, ok := r.(*remote); ok {
   445  		remote.runtime = string(rt)
   446  		return nil
   447  	}
   448  	return fmt.Errorf("WithRuntime option not supported for this remote")
   449  }
   450  
   451  // WithRuntimeArgs sets the list of runtime args passed to containerd
   452  func WithRuntimeArgs(args []string) RemoteOption {
   453  	return runtimeArgs(args)
   454  }
   455  
   456  type runtimeArgs []string
   457  
   458  func (rt runtimeArgs) Apply(r Remote) error {
   459  	if remote, ok := r.(*remote); ok {
   460  		remote.runtimeArgs = rt
   461  		return nil
   462  	}
   463  	return fmt.Errorf("WithRuntimeArgs option not supported for this remote")
   464  }
   465  
   466  // WithStartDaemon defines if libcontainerd should also run containerd daemon.
   467  func WithStartDaemon(start bool) RemoteOption {
   468  	return startDaemon(start)
   469  }
   470  
   471  type startDaemon bool
   472  
   473  func (s startDaemon) Apply(r Remote) error {
   474  	if remote, ok := r.(*remote); ok {
   475  		remote.startDaemon = bool(s)
   476  		return nil
   477  	}
   478  	return fmt.Errorf("WithStartDaemon option not supported for this remote")
   479  }
   480  
   481  // WithDebugLog defines if containerd debug logs will be enabled for daemon.
   482  func WithDebugLog(debug bool) RemoteOption {
   483  	return debugLog(debug)
   484  }
   485  
   486  type debugLog bool
   487  
   488  func (d debugLog) Apply(r Remote) error {
   489  	if remote, ok := r.(*remote); ok {
   490  		remote.debugLog = bool(d)
   491  		return nil
   492  	}
   493  	return fmt.Errorf("WithDebugLog option not supported for this remote")
   494  }
   495  
   496  // WithLiveRestore defines if containers are stopped on shutdown or restored.
   497  func WithLiveRestore(v bool) RemoteOption {
   498  	return liveRestore(v)
   499  }
   500  
   501  type liveRestore bool
   502  
   503  func (l liveRestore) Apply(r Remote) error {
   504  	if remote, ok := r.(*remote); ok {
   505  		remote.liveRestore = bool(l)
   506  		for _, c := range remote.clients {
   507  			c.liveRestore = bool(l)
   508  		}
   509  		return nil
   510  	}
   511  	return fmt.Errorf("WithLiveRestore option not supported for this remote")
   512  }