github.com/fabiokung/docker@v0.11.2-0.20170222101415-4534dcd49497/libcontainerd/remote_unix.go (about)

     1  // +build linux solaris
     2  
     3  package libcontainerd
     4  
     5  import (
     6  	"fmt"
     7  	"io"
     8  	"io/ioutil"
     9  	"log"
    10  	"net"
    11  	"os"
    12  	"os/exec"
    13  	"path/filepath"
    14  	goruntime "runtime"
    15  	"strconv"
    16  	"strings"
    17  	"sync"
    18  	"syscall"
    19  	"time"
    20  
    21  	"github.com/Sirupsen/logrus"
    22  	containerd "github.com/docker/containerd/api/grpc/types"
    23  	"github.com/docker/docker/pkg/locker"
    24  	"github.com/docker/docker/pkg/system"
    25  	"github.com/golang/protobuf/ptypes"
    26  	"github.com/golang/protobuf/ptypes/timestamp"
    27  	"golang.org/x/net/context"
    28  	"google.golang.org/grpc"
    29  	"google.golang.org/grpc/grpclog"
    30  	"google.golang.org/grpc/health/grpc_health_v1"
    31  	"google.golang.org/grpc/transport"
    32  )
    33  
    34  const (
    35  	maxConnectionRetryCount      = 3
    36  	containerdHealthCheckTimeout = 3 * time.Second
    37  	containerdShutdownTimeout    = 15 * time.Second
    38  	containerdBinary             = "docker-containerd"
    39  	containerdPidFilename        = "docker-containerd.pid"
    40  	containerdSockFilename       = "docker-containerd.sock"
    41  	containerdStateDir           = "containerd"
    42  	eventTimestampFilename       = "event.ts"
    43  )
    44  
// remote holds the connection to a containerd daemon and, when startDaemon
// is set, the state needed to supervise that daemon's process.
type remote struct {
	// The embedded lock is taken around accesses to clients in Client and
	// handleEventStream.
	sync.RWMutex
	// apiClient is the containerd API client built on top of rpcConn.
	apiClient            containerd.APIClient
	// daemonPid is the pid of the containerd process; it is -1 until
	// runContainerdDaemon starts or adopts a daemon.
	daemonPid            int
	// stateDir holds the pid file, the default socket and the event
	// timestamp file.
	stateDir             string
	// rpcAddr is the containerd socket path; New defaults it to
	// containerdSockFilename inside stateDir.
	rpcAddr              string
	// startDaemon makes New launch containerd itself.
	startDaemon          bool
	// closeManually is set by Cleanup so handleEventStream can ignore the
	// resulting "connection closing" error.
	closeManually        bool
	// debugLog adds --debug to the containerd command line.
	debugLog             bool
	// rpcConn is the grpc connection to containerd.
	rpcConn              *grpc.ClientConn
	// clients lists every client created through Client.
	clients              []*client
	// eventTsPath is the file recording the timestamp of the last event.
	eventTsPath          string
	// runtime, when non-empty, is passed to containerd as --runtime.
	runtime              string
	// runtimeArgs are passed to containerd as repeated --runtime-args.
	runtimeArgs          []string
	// daemonWaitCh is closed once a daemon spawned by runContainerdDaemon
	// has been reaped; it stays nil when an existing daemon was adopted.
	daemonWaitCh         chan struct{}
	// liveRestore is copied into every client.
	liveRestore          bool
	// oomScore is handed to setOOMScore for a newly spawned daemon.
	oomScore             int
	// restoreFromTimestamp is the last event timestamp as computed by New.
	restoreFromTimestamp *timestamp.Timestamp
}
    64  
    65  // New creates a fresh instance of libcontainerd remote.
    66  func New(stateDir string, options ...RemoteOption) (_ Remote, err error) {
    67  	defer func() {
    68  		if err != nil {
    69  			err = fmt.Errorf("Failed to connect to containerd. Please make sure containerd is installed in your PATH or you have specified the correct address. Got error: %v", err)
    70  		}
    71  	}()
    72  	r := &remote{
    73  		stateDir:    stateDir,
    74  		daemonPid:   -1,
    75  		eventTsPath: filepath.Join(stateDir, eventTimestampFilename),
    76  	}
    77  	for _, option := range options {
    78  		if err := option.Apply(r); err != nil {
    79  			return nil, err
    80  		}
    81  	}
    82  
    83  	if err := system.MkdirAll(stateDir, 0700); err != nil {
    84  		return nil, err
    85  	}
    86  
    87  	if r.rpcAddr == "" {
    88  		r.rpcAddr = filepath.Join(stateDir, containerdSockFilename)
    89  	}
    90  
    91  	if r.startDaemon {
    92  		if err := r.runContainerdDaemon(); err != nil {
    93  			return nil, err
    94  		}
    95  	}
    96  
    97  	// don't output the grpc reconnect logging
    98  	grpclog.SetLogger(log.New(ioutil.Discard, "", log.LstdFlags))
    99  	dialOpts := append([]grpc.DialOption{grpc.WithInsecure()},
   100  		grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
   101  			return net.DialTimeout("unix", addr, timeout)
   102  		}),
   103  	)
   104  	conn, err := grpc.Dial(r.rpcAddr, dialOpts...)
   105  	if err != nil {
   106  		return nil, fmt.Errorf("error connecting to containerd: %v", err)
   107  	}
   108  
   109  	r.rpcConn = conn
   110  	r.apiClient = containerd.NewAPIClient(conn)
   111  
   112  	// Get the timestamp to restore from
   113  	t := r.getLastEventTimestamp()
   114  	tsp, err := ptypes.TimestampProto(t)
   115  	if err != nil {
   116  		logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err)
   117  	}
   118  	r.restoreFromTimestamp = tsp
   119  
   120  	go r.handleConnectionChange()
   121  
   122  	if err := r.startEventsMonitor(); err != nil {
   123  		return nil, err
   124  	}
   125  
   126  	return r, nil
   127  }
   128  
   129  func (r *remote) UpdateOptions(options ...RemoteOption) error {
   130  	for _, option := range options {
   131  		if err := option.Apply(r); err != nil {
   132  			return err
   133  		}
   134  	}
   135  	return nil
   136  }
   137  
   138  func (r *remote) handleConnectionChange() {
   139  	var transientFailureCount = 0
   140  
   141  	ticker := time.NewTicker(500 * time.Millisecond)
   142  	defer ticker.Stop()
   143  	healthClient := grpc_health_v1.NewHealthClient(r.rpcConn)
   144  
   145  	for {
   146  		<-ticker.C
   147  		ctx, cancel := context.WithTimeout(context.Background(), containerdHealthCheckTimeout)
   148  		_, err := healthClient.Check(ctx, &grpc_health_v1.HealthCheckRequest{})
   149  		cancel()
   150  		if err == nil {
   151  			continue
   152  		}
   153  
   154  		logrus.Debugf("libcontainerd: containerd health check returned error: %v", err)
   155  
   156  		if r.daemonPid != -1 {
   157  			if strings.Contains(err.Error(), "is closing") {
   158  				// Well, we asked for it to stop, just return
   159  				return
   160  			}
   161  			// all other errors are transient
   162  			// Reset state to be notified of next failure
   163  			transientFailureCount++
   164  			if transientFailureCount >= maxConnectionRetryCount {
   165  				transientFailureCount = 0
   166  				if system.IsProcessAlive(r.daemonPid) {
   167  					system.KillProcess(r.daemonPid)
   168  				}
   169  				<-r.daemonWaitCh
   170  				if err := r.runContainerdDaemon(); err != nil { //FIXME: Handle error
   171  					logrus.Errorf("libcontainerd: error restarting containerd: %v", err)
   172  				}
   173  				continue
   174  			}
   175  		}
   176  	}
   177  }
   178  
   179  func (r *remote) Cleanup() {
   180  	if r.daemonPid == -1 {
   181  		return
   182  	}
   183  	r.closeManually = true
   184  	r.rpcConn.Close()
   185  	// Ask the daemon to quit
   186  	syscall.Kill(r.daemonPid, syscall.SIGTERM)
   187  
   188  	// Wait up to 15secs for it to stop
   189  	for i := time.Duration(0); i < containerdShutdownTimeout; i += time.Second {
   190  		if !system.IsProcessAlive(r.daemonPid) {
   191  			break
   192  		}
   193  		time.Sleep(time.Second)
   194  	}
   195  
   196  	if system.IsProcessAlive(r.daemonPid) {
   197  		logrus.Warnf("libcontainerd: containerd (%d) didn't stop within 15 secs, killing it\n", r.daemonPid)
   198  		syscall.Kill(r.daemonPid, syscall.SIGKILL)
   199  	}
   200  
   201  	// cleanup some files
   202  	os.Remove(filepath.Join(r.stateDir, containerdPidFilename))
   203  	os.Remove(filepath.Join(r.stateDir, containerdSockFilename))
   204  }
   205  
   206  func (r *remote) Client(b Backend) (Client, error) {
   207  	c := &client{
   208  		clientCommon: clientCommon{
   209  			backend:    b,
   210  			containers: make(map[string]*container),
   211  			locker:     locker.New(),
   212  		},
   213  		remote:        r,
   214  		exitNotifiers: make(map[string]*exitNotifier),
   215  		liveRestore:   r.liveRestore,
   216  	}
   217  
   218  	r.Lock()
   219  	r.clients = append(r.clients, c)
   220  	r.Unlock()
   221  	return c, nil
   222  }
   223  
   224  func (r *remote) updateEventTimestamp(t time.Time) {
   225  	f, err := os.OpenFile(r.eventTsPath, syscall.O_CREAT|syscall.O_WRONLY|syscall.O_TRUNC, 0600)
   226  	if err != nil {
   227  		logrus.Warnf("libcontainerd: failed to open event timestamp file: %v", err)
   228  		return
   229  	}
   230  	defer f.Close()
   231  
   232  	b, err := t.MarshalText()
   233  	if err != nil {
   234  		logrus.Warnf("libcontainerd: failed to encode timestamp: %v", err)
   235  		return
   236  	}
   237  
   238  	n, err := f.Write(b)
   239  	if err != nil || n != len(b) {
   240  		logrus.Warnf("libcontainerd: failed to update event timestamp file: %v", err)
   241  		f.Truncate(0)
   242  		return
   243  	}
   244  }
   245  
   246  func (r *remote) getLastEventTimestamp() time.Time {
   247  	t := time.Now()
   248  
   249  	fi, err := os.Stat(r.eventTsPath)
   250  	if os.IsNotExist(err) || fi.Size() == 0 {
   251  		return t
   252  	}
   253  
   254  	f, err := os.Open(r.eventTsPath)
   255  	if err != nil {
   256  		logrus.Warnf("libcontainerd: Unable to access last event ts: %v", err)
   257  		return t
   258  	}
   259  	defer f.Close()
   260  
   261  	b := make([]byte, fi.Size())
   262  	n, err := f.Read(b)
   263  	if err != nil || n != len(b) {
   264  		logrus.Warnf("libcontainerd: Unable to read last event ts: %v", err)
   265  		return t
   266  	}
   267  
   268  	t.UnmarshalText(b)
   269  
   270  	return t
   271  }
   272  
   273  func (r *remote) startEventsMonitor() error {
   274  	// First, get past events
   275  	t := r.getLastEventTimestamp()
   276  	tsp, err := ptypes.TimestampProto(t)
   277  	if err != nil {
   278  		logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err)
   279  	}
   280  	er := &containerd.EventsRequest{
   281  		Timestamp: tsp,
   282  	}
   283  	events, err := r.apiClient.Events(context.Background(), er, grpc.FailFast(false))
   284  	if err != nil {
   285  		return err
   286  	}
   287  	go r.handleEventStream(events)
   288  	return nil
   289  }
   290  
   291  func (r *remote) handleEventStream(events containerd.API_EventsClient) {
   292  	for {
   293  		e, err := events.Recv()
   294  		if err != nil {
   295  			if grpc.ErrorDesc(err) == transport.ErrConnClosing.Desc &&
   296  				r.closeManually {
   297  				// ignore error if grpc remote connection is closed manually
   298  				return
   299  			}
   300  			logrus.Errorf("libcontainerd: failed to receive event from containerd: %v", err)
   301  			go r.startEventsMonitor()
   302  			return
   303  		}
   304  
   305  		logrus.Debugf("libcontainerd: received containerd event: %#v", e)
   306  
   307  		var container *container
   308  		var c *client
   309  		r.RLock()
   310  		for _, c = range r.clients {
   311  			container, err = c.getContainer(e.Id)
   312  			if err == nil {
   313  				break
   314  			}
   315  		}
   316  		r.RUnlock()
   317  		if container == nil {
   318  			logrus.Warnf("libcontainerd: unknown container %s", e.Id)
   319  			continue
   320  		}
   321  
   322  		if err := container.handleEvent(e); err != nil {
   323  			logrus.Errorf("libcontainerd: error processing state change for %s: %v", e.Id, err)
   324  		}
   325  
   326  		tsp, err := ptypes.Timestamp(e.Timestamp)
   327  		if err != nil {
   328  			logrus.Errorf("libcontainerd: failed to convert event timestamp: %q", err)
   329  			continue
   330  		}
   331  
   332  		r.updateEventTimestamp(tsp)
   333  	}
   334  }
   335  
   336  func (r *remote) runContainerdDaemon() error {
   337  	pidFilename := filepath.Join(r.stateDir, containerdPidFilename)
   338  	f, err := os.OpenFile(pidFilename, os.O_RDWR|os.O_CREATE, 0600)
   339  	if err != nil {
   340  		return err
   341  	}
   342  	defer f.Close()
   343  
   344  	// File exist, check if the daemon is alive
   345  	b := make([]byte, 8)
   346  	n, err := f.Read(b)
   347  	if err != nil && err != io.EOF {
   348  		return err
   349  	}
   350  
   351  	if n > 0 {
   352  		pid, err := strconv.ParseUint(string(b[:n]), 10, 64)
   353  		if err != nil {
   354  			return err
   355  		}
   356  		if system.IsProcessAlive(int(pid)) {
   357  			logrus.Infof("libcontainerd: previous instance of containerd still alive (%d)", pid)
   358  			r.daemonPid = int(pid)
   359  			return nil
   360  		}
   361  	}
   362  
   363  	// rewind the file
   364  	_, err = f.Seek(0, os.SEEK_SET)
   365  	if err != nil {
   366  		return err
   367  	}
   368  
   369  	// Truncate it
   370  	err = f.Truncate(0)
   371  	if err != nil {
   372  		return err
   373  	}
   374  
   375  	// Start a new instance
   376  	args := []string{
   377  		"-l", fmt.Sprintf("unix://%s", r.rpcAddr),
   378  		"--metrics-interval=0",
   379  		"--start-timeout", "2m",
   380  		"--state-dir", filepath.Join(r.stateDir, containerdStateDir),
   381  	}
   382  	if goruntime.GOOS == "solaris" {
   383  		args = append(args, "--shim", "containerd-shim", "--runtime", "runc")
   384  	} else {
   385  		args = append(args, "--shim", "docker-containerd-shim")
   386  		if r.runtime != "" {
   387  			args = append(args, "--runtime")
   388  			args = append(args, r.runtime)
   389  		}
   390  	}
   391  	if r.debugLog {
   392  		args = append(args, "--debug")
   393  	}
   394  	if len(r.runtimeArgs) > 0 {
   395  		for _, v := range r.runtimeArgs {
   396  			args = append(args, "--runtime-args")
   397  			args = append(args, v)
   398  		}
   399  		logrus.Debugf("libcontainerd: runContainerdDaemon: runtimeArgs: %s", args)
   400  	}
   401  
   402  	cmd := exec.Command(containerdBinary, args...)
   403  	// redirect containerd logs to docker logs
   404  	cmd.Stdout = os.Stdout
   405  	cmd.Stderr = os.Stderr
   406  	cmd.SysProcAttr = setSysProcAttr(true)
   407  	cmd.Env = nil
   408  	// clear the NOTIFY_SOCKET from the env when starting containerd
   409  	for _, e := range os.Environ() {
   410  		if !strings.HasPrefix(e, "NOTIFY_SOCKET") {
   411  			cmd.Env = append(cmd.Env, e)
   412  		}
   413  	}
   414  	if err := cmd.Start(); err != nil {
   415  		return err
   416  	}
   417  	logrus.Infof("libcontainerd: new containerd process, pid: %d", cmd.Process.Pid)
   418  	if err := setOOMScore(cmd.Process.Pid, r.oomScore); err != nil {
   419  		system.KillProcess(cmd.Process.Pid)
   420  		return err
   421  	}
   422  	if _, err := f.WriteString(fmt.Sprintf("%d", cmd.Process.Pid)); err != nil {
   423  		system.KillProcess(cmd.Process.Pid)
   424  		return err
   425  	}
   426  
   427  	r.daemonWaitCh = make(chan struct{})
   428  	go func() {
   429  		cmd.Wait()
   430  		close(r.daemonWaitCh)
   431  	}() // Reap our child when needed
   432  	r.daemonPid = cmd.Process.Pid
   433  	return nil
   434  }
   435  
   436  // WithRemoteAddr sets the external containerd socket to connect to.
   437  func WithRemoteAddr(addr string) RemoteOption {
   438  	return rpcAddr(addr)
   439  }
   440  
   441  type rpcAddr string
   442  
   443  func (a rpcAddr) Apply(r Remote) error {
   444  	if remote, ok := r.(*remote); ok {
   445  		remote.rpcAddr = string(a)
   446  		return nil
   447  	}
   448  	return fmt.Errorf("WithRemoteAddr option not supported for this remote")
   449  }
   450  
   451  // WithRuntimePath sets the path of the runtime to be used as the
   452  // default by containerd
   453  func WithRuntimePath(rt string) RemoteOption {
   454  	return runtimePath(rt)
   455  }
   456  
   457  type runtimePath string
   458  
   459  func (rt runtimePath) Apply(r Remote) error {
   460  	if remote, ok := r.(*remote); ok {
   461  		remote.runtime = string(rt)
   462  		return nil
   463  	}
   464  	return fmt.Errorf("WithRuntime option not supported for this remote")
   465  }
   466  
   467  // WithRuntimeArgs sets the list of runtime args passed to containerd
   468  func WithRuntimeArgs(args []string) RemoteOption {
   469  	return runtimeArgs(args)
   470  }
   471  
   472  type runtimeArgs []string
   473  
   474  func (rt runtimeArgs) Apply(r Remote) error {
   475  	if remote, ok := r.(*remote); ok {
   476  		remote.runtimeArgs = rt
   477  		return nil
   478  	}
   479  	return fmt.Errorf("WithRuntimeArgs option not supported for this remote")
   480  }
   481  
   482  // WithStartDaemon defines if libcontainerd should also run containerd daemon.
   483  func WithStartDaemon(start bool) RemoteOption {
   484  	return startDaemon(start)
   485  }
   486  
   487  type startDaemon bool
   488  
   489  func (s startDaemon) Apply(r Remote) error {
   490  	if remote, ok := r.(*remote); ok {
   491  		remote.startDaemon = bool(s)
   492  		return nil
   493  	}
   494  	return fmt.Errorf("WithStartDaemon option not supported for this remote")
   495  }
   496  
   497  // WithDebugLog defines if containerd debug logs will be enabled for daemon.
   498  func WithDebugLog(debug bool) RemoteOption {
   499  	return debugLog(debug)
   500  }
   501  
   502  type debugLog bool
   503  
   504  func (d debugLog) Apply(r Remote) error {
   505  	if remote, ok := r.(*remote); ok {
   506  		remote.debugLog = bool(d)
   507  		return nil
   508  	}
   509  	return fmt.Errorf("WithDebugLog option not supported for this remote")
   510  }
   511  
   512  // WithLiveRestore defines if containers are stopped on shutdown or restored.
   513  func WithLiveRestore(v bool) RemoteOption {
   514  	return liveRestore(v)
   515  }
   516  
   517  type liveRestore bool
   518  
   519  func (l liveRestore) Apply(r Remote) error {
   520  	if remote, ok := r.(*remote); ok {
   521  		remote.liveRestore = bool(l)
   522  		for _, c := range remote.clients {
   523  			c.liveRestore = bool(l)
   524  		}
   525  		return nil
   526  	}
   527  	return fmt.Errorf("WithLiveRestore option not supported for this remote")
   528  }
   529  
   530  // WithOOMScore defines the oom_score_adj to set for the containerd process.
   531  func WithOOMScore(score int) RemoteOption {
   532  	return oomScore(score)
   533  }
   534  
   535  type oomScore int
   536  
   537  func (o oomScore) Apply(r Remote) error {
   538  	if remote, ok := r.(*remote); ok {
   539  		remote.oomScore = int(o)
   540  		return nil
   541  	}
   542  	return fmt.Errorf("WithOOMScore option not supported for this remote")
   543  }