github.com/go/docker@v1.12.0-rc2/libcontainerd/remote_linux.go (about) 1 package libcontainerd 2 3 import ( 4 "fmt" 5 "io" 6 "io/ioutil" 7 "log" 8 "net" 9 "os" 10 "os/exec" 11 "path/filepath" 12 "strconv" 13 "strings" 14 "sync" 15 "syscall" 16 "time" 17 18 "github.com/Sirupsen/logrus" 19 containerd "github.com/docker/containerd/api/grpc/types" 20 "github.com/docker/docker/pkg/locker" 21 sysinfo "github.com/docker/docker/pkg/system" 22 "github.com/docker/docker/utils" 23 "golang.org/x/net/context" 24 "google.golang.org/grpc" 25 "google.golang.org/grpc/grpclog" 26 "google.golang.org/grpc/transport" 27 ) 28 29 const ( 30 maxConnectionRetryCount = 3 31 connectionRetryDelay = 3 * time.Second 32 containerdShutdownTimeout = 15 * time.Second 33 containerdBinary = "docker-containerd" 34 containerdPidFilename = "docker-containerd.pid" 35 containerdSockFilename = "docker-containerd.sock" 36 containerdStateDir = "containerd" 37 eventTimestampFilename = "event.ts" 38 ) 39 40 type remote struct { 41 sync.RWMutex 42 apiClient containerd.APIClient 43 daemonPid int 44 stateDir string 45 rpcAddr string 46 startDaemon bool 47 closeManually bool 48 debugLog bool 49 rpcConn *grpc.ClientConn 50 clients []*client 51 eventTsPath string 52 pastEvents map[string]*containerd.Event 53 runtime string 54 runtimeArgs []string 55 daemonWaitCh chan struct{} 56 liveRestore bool 57 } 58 59 // New creates a fresh instance of libcontainerd remote. 60 func New(stateDir string, options ...RemoteOption) (_ Remote, err error) { 61 defer func() { 62 if err != nil { 63 err = fmt.Errorf("Failed to connect to containerd. Please make sure containerd is installed in your PATH or you have specificed the correct address. Got error: %v", err) 64 } 65 }() 66 r := &remote{ 67 stateDir: stateDir, 68 daemonPid: -1, 69 eventTsPath: filepath.Join(stateDir, eventTimestampFilename), 70 pastEvents: make(map[string]*containerd.Event), 71 } 72 for _, option := range options { 73 if err := option.Apply(r); err != nil { 74 return nil, err 75 } 76 } 77 78 if err := sysinfo.MkdirAll(stateDir, 0700); err != nil { 79 return nil, err 80 } 81 82 if r.rpcAddr == "" { 83 r.rpcAddr = filepath.Join(stateDir, containerdSockFilename) 84 } 85 86 if r.startDaemon { 87 if err := r.runContainerdDaemon(); err != nil { 88 return nil, err 89 } 90 } 91 92 // don't output the grpc reconnect logging 93 grpclog.SetLogger(log.New(ioutil.Discard, "", log.LstdFlags)) 94 dialOpts := append([]grpc.DialOption{grpc.WithInsecure()}, 95 grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { 96 return net.DialTimeout("unix", addr, timeout) 97 }), 98 ) 99 conn, err := grpc.Dial(r.rpcAddr, dialOpts...) 100 if err != nil { 101 return nil, fmt.Errorf("error connecting to containerd: %v", err) 102 } 103 104 r.rpcConn = conn 105 r.apiClient = containerd.NewAPIClient(conn) 106 107 go r.handleConnectionChange() 108 109 if err := r.startEventsMonitor(); err != nil { 110 return nil, err 111 } 112 113 return r, nil 114 } 115 116 func (r *remote) UpdateOptions(options ...RemoteOption) error { 117 for _, option := range options { 118 if err := option.Apply(r); err != nil { 119 return err 120 } 121 } 122 return nil 123 } 124 125 func (r *remote) handleConnectionChange() { 126 var transientFailureCount = 0 127 state := grpc.Idle 128 for { 129 s, err := r.rpcConn.WaitForStateChange(context.Background(), state) 130 if err != nil { 131 break 132 } 133 state = s 134 logrus.Debugf("containerd connection state change: %v", s) 135 136 if r.daemonPid != -1 { 137 switch state { 138 case grpc.TransientFailure: 139 // Reset state to be notified of next failure 140 transientFailureCount++ 141 if transientFailureCount >= maxConnectionRetryCount { 142 transientFailureCount = 0 143 if utils.IsProcessAlive(r.daemonPid) { 144 utils.KillProcess(r.daemonPid) 145 <-r.daemonWaitCh 146 } 147 if err := r.runContainerdDaemon(); err != nil { //FIXME: Handle error 148 logrus.Errorf("error restarting containerd: %v", err) 149 } 150 } else { 151 state = grpc.Idle 152 time.Sleep(connectionRetryDelay) 153 } 154 case grpc.Shutdown: 155 // Well, we asked for it to stop, just return 156 return 157 } 158 } 159 } 160 } 161 162 func (r *remote) Cleanup() { 163 if r.daemonPid == -1 { 164 return 165 } 166 r.closeManually = true 167 r.rpcConn.Close() 168 // Ask the daemon to quit 169 syscall.Kill(r.daemonPid, syscall.SIGTERM) 170 171 // Wait up to 15secs for it to stop 172 for i := time.Duration(0); i < containerdShutdownTimeout; i += time.Second { 173 if !utils.IsProcessAlive(r.daemonPid) { 174 break 175 } 176 time.Sleep(time.Second) 177 } 178 179 if utils.IsProcessAlive(r.daemonPid) { 180 logrus.Warnf("libcontainerd: containerd (%d) didn't stop within 15 secs, killing it\n", r.daemonPid) 181 syscall.Kill(r.daemonPid, syscall.SIGKILL) 182 } 183 184 // cleanup some files 185 os.Remove(filepath.Join(r.stateDir, containerdPidFilename)) 186 os.Remove(filepath.Join(r.stateDir, containerdSockFilename)) 187 } 188 189 func (r *remote) Client(b Backend) (Client, error) { 190 c := &client{ 191 clientCommon: clientCommon{ 192 backend: b, 193 containers: make(map[string]*container), 194 locker: locker.New(), 195 }, 196 remote: r, 197 exitNotifiers: make(map[string]*exitNotifier), 198 liveRestore: r.liveRestore, 199 } 200 201 r.Lock() 202 r.clients = append(r.clients, c) 203 r.Unlock() 204 return c, nil 205 } 206 207 func (r *remote) updateEventTimestamp(t time.Time) { 208 f, err := os.OpenFile(r.eventTsPath, syscall.O_CREAT|syscall.O_WRONLY|syscall.O_TRUNC, 0600) 209 defer f.Close() 210 if err != nil { 211 logrus.Warnf("libcontainerd: failed to open event timestamp file: %v", err) 212 return 213 } 214 215 b, err := t.MarshalText() 216 if err != nil { 217 logrus.Warnf("libcontainerd: failed to encode timestamp: %v", err) 218 return 219 } 220 221 n, err := f.Write(b) 222 if err != nil || n != len(b) { 223 logrus.Warnf("libcontainerd: failed to update event timestamp file: %v", err) 224 f.Truncate(0) 225 return 226 } 227 228 } 229 230 func (r *remote) getLastEventTimestamp() int64 { 231 t := time.Now() 232 233 fi, err := os.Stat(r.eventTsPath) 234 if os.IsNotExist(err) || fi.Size() == 0 { 235 return t.Unix() 236 } 237 238 f, err := os.Open(r.eventTsPath) 239 defer f.Close() 240 if err != nil { 241 logrus.Warnf("libcontainerd: Unable to access last event ts: %v", err) 242 return t.Unix() 243 } 244 245 b := make([]byte, fi.Size()) 246 n, err := f.Read(b) 247 if err != nil || n != len(b) { 248 logrus.Warnf("libcontainerd: Unable to read last event ts: %v", err) 249 return t.Unix() 250 } 251 252 t.UnmarshalText(b) 253 254 return t.Unix() 255 } 256 257 func (r *remote) startEventsMonitor() error { 258 // First, get past events 259 er := &containerd.EventsRequest{ 260 Timestamp: uint64(r.getLastEventTimestamp()), 261 } 262 events, err := r.apiClient.Events(context.Background(), er) 263 if err != nil { 264 return err 265 } 266 go r.handleEventStream(events) 267 return nil 268 } 269 270 func (r *remote) handleEventStream(events containerd.API_EventsClient) { 271 live := false 272 for { 273 e, err := events.Recv() 274 if err != nil { 275 if grpc.ErrorDesc(err) == transport.ErrConnClosing.Desc && 276 r.closeManually { 277 // ignore error if grpc remote connection is closed manually 278 return 279 } 280 logrus.Errorf("failed to receive event from containerd: %v", err) 281 go r.startEventsMonitor() 282 return 283 } 284 285 if live == false { 286 logrus.Debugf("received past containerd event: %#v", e) 287 288 // Pause/Resume events should never happens after exit one 289 switch e.Type { 290 case StateExit: 291 r.pastEvents[e.Id] = e 292 case StatePause: 293 r.pastEvents[e.Id] = e 294 case StateResume: 295 r.pastEvents[e.Id] = e 296 case stateLive: 297 live = true 298 r.updateEventTimestamp(time.Unix(int64(e.Timestamp), 0)) 299 } 300 } else { 301 logrus.Debugf("received containerd event: %#v", e) 302 303 var container *container 304 var c *client 305 r.RLock() 306 for _, c = range r.clients { 307 container, err = c.getContainer(e.Id) 308 if err == nil { 309 break 310 } 311 } 312 r.RUnlock() 313 if container == nil { 314 logrus.Errorf("no state for container: %q", err) 315 continue 316 } 317 318 if err := container.handleEvent(e); err != nil { 319 logrus.Errorf("error processing state change for %s: %v", e.Id, err) 320 } 321 322 r.updateEventTimestamp(time.Unix(int64(e.Timestamp), 0)) 323 } 324 } 325 } 326 327 func (r *remote) runContainerdDaemon() error { 328 pidFilename := filepath.Join(r.stateDir, containerdPidFilename) 329 f, err := os.OpenFile(pidFilename, os.O_RDWR|os.O_CREATE, 0600) 330 defer f.Close() 331 if err != nil { 332 return err 333 } 334 335 // File exist, check if the daemon is alive 336 b := make([]byte, 8) 337 n, err := f.Read(b) 338 if err != nil && err != io.EOF { 339 return err 340 } 341 342 if n > 0 { 343 pid, err := strconv.ParseUint(string(b[:n]), 10, 64) 344 if err != nil { 345 return err 346 } 347 if utils.IsProcessAlive(int(pid)) { 348 logrus.Infof("previous instance of containerd still alive (%d)", pid) 349 r.daemonPid = int(pid) 350 return nil 351 } 352 } 353 354 // rewind the file 355 _, err = f.Seek(0, os.SEEK_SET) 356 if err != nil { 357 return err 358 } 359 360 // Truncate it 361 err = f.Truncate(0) 362 if err != nil { 363 return err 364 } 365 366 // Start a new instance 367 args := []string{ 368 "-l", fmt.Sprintf("unix://%s", r.rpcAddr), 369 "--shim", "docker-containerd-shim", 370 "--metrics-interval=0", 371 "--start-timeout", "2m", 372 "--state-dir", filepath.Join(r.stateDir, containerdStateDir), 373 } 374 if r.runtime != "" { 375 args = append(args, "--runtime") 376 args = append(args, r.runtime) 377 } 378 if r.debugLog { 379 args = append(args, "--debug") 380 } 381 if len(r.runtimeArgs) > 0 { 382 for _, v := range r.runtimeArgs { 383 args = append(args, "--runtime-args") 384 args = append(args, v) 385 } 386 logrus.Debugf("runContainerdDaemon: runtimeArgs: %s", args) 387 } 388 389 cmd := exec.Command(containerdBinary, args...) 390 // redirect containerd logs to docker logs 391 cmd.Stdout = os.Stdout 392 cmd.Stderr = os.Stderr 393 cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true, Pdeathsig: syscall.SIGKILL} 394 cmd.Env = nil 395 // clear the NOTIFY_SOCKET from the env when starting containerd 396 for _, e := range os.Environ() { 397 if !strings.HasPrefix(e, "NOTIFY_SOCKET") { 398 cmd.Env = append(cmd.Env, e) 399 } 400 } 401 if err := cmd.Start(); err != nil { 402 return err 403 } 404 logrus.Infof("New containerd process, pid: %d", cmd.Process.Pid) 405 406 if _, err := f.WriteString(fmt.Sprintf("%d", cmd.Process.Pid)); err != nil { 407 utils.KillProcess(cmd.Process.Pid) 408 return err 409 } 410 411 r.daemonWaitCh = make(chan struct{}) 412 go func() { 413 cmd.Wait() 414 close(r.daemonWaitCh) 415 }() // Reap our child when needed 416 r.daemonPid = cmd.Process.Pid 417 return nil 418 } 419 420 // WithRemoteAddr sets the external containerd socket to connect to. 421 func WithRemoteAddr(addr string) RemoteOption { 422 return rpcAddr(addr) 423 } 424 425 type rpcAddr string 426 427 func (a rpcAddr) Apply(r Remote) error { 428 if remote, ok := r.(*remote); ok { 429 remote.rpcAddr = string(a) 430 return nil 431 } 432 return fmt.Errorf("WithRemoteAddr option not supported for this remote") 433 } 434 435 // WithRuntimePath sets the path of the runtime to be used as the 436 // default by containerd 437 func WithRuntimePath(rt string) RemoteOption { 438 return runtimePath(rt) 439 } 440 441 type runtimePath string 442 443 func (rt runtimePath) Apply(r Remote) error { 444 if remote, ok := r.(*remote); ok { 445 remote.runtime = string(rt) 446 return nil 447 } 448 return fmt.Errorf("WithRuntime option not supported for this remote") 449 } 450 451 // WithRuntimeArgs sets the list of runtime args passed to containerd 452 func WithRuntimeArgs(args []string) RemoteOption { 453 return runtimeArgs(args) 454 } 455 456 type runtimeArgs []string 457 458 func (rt runtimeArgs) Apply(r Remote) error { 459 if remote, ok := r.(*remote); ok { 460 remote.runtimeArgs = rt 461 return nil 462 } 463 return fmt.Errorf("WithRuntimeArgs option not supported for this remote") 464 } 465 466 // WithStartDaemon defines if libcontainerd should also run containerd daemon. 467 func WithStartDaemon(start bool) RemoteOption { 468 return startDaemon(start) 469 } 470 471 type startDaemon bool 472 473 func (s startDaemon) Apply(r Remote) error { 474 if remote, ok := r.(*remote); ok { 475 remote.startDaemon = bool(s) 476 return nil 477 } 478 return fmt.Errorf("WithStartDaemon option not supported for this remote") 479 } 480 481 // WithDebugLog defines if containerd debug logs will be enabled for daemon. 482 func WithDebugLog(debug bool) RemoteOption { 483 return debugLog(debug) 484 } 485 486 type debugLog bool 487 488 func (d debugLog) Apply(r Remote) error { 489 if remote, ok := r.(*remote); ok { 490 remote.debugLog = bool(d) 491 return nil 492 } 493 return fmt.Errorf("WithDebugLog option not supported for this remote") 494 } 495 496 // WithLiveRestore defines if containers are stopped on shutdown or restored. 497 func WithLiveRestore(v bool) RemoteOption { 498 return liveRestore(v) 499 } 500 501 type liveRestore bool 502 503 func (l liveRestore) Apply(r Remote) error { 504 if remote, ok := r.(*remote); ok { 505 remote.liveRestore = bool(l) 506 for _, c := range remote.clients { 507 c.liveRestore = bool(l) 508 } 509 return nil 510 } 511 return fmt.Errorf("WithLiveRestore option not supported for this remote") 512 }