github.com/fabiokung/docker@v0.11.2-0.20170222101415-4534dcd49497/libcontainerd/remote_unix.go (about) 1 // +build linux solaris 2 3 package libcontainerd 4 5 import ( 6 "fmt" 7 "io" 8 "io/ioutil" 9 "log" 10 "net" 11 "os" 12 "os/exec" 13 "path/filepath" 14 goruntime "runtime" 15 "strconv" 16 "strings" 17 "sync" 18 "syscall" 19 "time" 20 21 "github.com/Sirupsen/logrus" 22 containerd "github.com/docker/containerd/api/grpc/types" 23 "github.com/docker/docker/pkg/locker" 24 "github.com/docker/docker/pkg/system" 25 "github.com/golang/protobuf/ptypes" 26 "github.com/golang/protobuf/ptypes/timestamp" 27 "golang.org/x/net/context" 28 "google.golang.org/grpc" 29 "google.golang.org/grpc/grpclog" 30 "google.golang.org/grpc/health/grpc_health_v1" 31 "google.golang.org/grpc/transport" 32 ) 33 34 const ( 35 maxConnectionRetryCount = 3 36 containerdHealthCheckTimeout = 3 * time.Second 37 containerdShutdownTimeout = 15 * time.Second 38 containerdBinary = "docker-containerd" 39 containerdPidFilename = "docker-containerd.pid" 40 containerdSockFilename = "docker-containerd.sock" 41 containerdStateDir = "containerd" 42 eventTimestampFilename = "event.ts" 43 ) 44 45 type remote struct { 46 sync.RWMutex 47 apiClient containerd.APIClient 48 daemonPid int 49 stateDir string 50 rpcAddr string 51 startDaemon bool 52 closeManually bool 53 debugLog bool 54 rpcConn *grpc.ClientConn 55 clients []*client 56 eventTsPath string 57 runtime string 58 runtimeArgs []string 59 daemonWaitCh chan struct{} 60 liveRestore bool 61 oomScore int 62 restoreFromTimestamp *timestamp.Timestamp 63 } 64 65 // New creates a fresh instance of libcontainerd remote. 66 func New(stateDir string, options ...RemoteOption) (_ Remote, err error) { 67 defer func() { 68 if err != nil { 69 err = fmt.Errorf("Failed to connect to containerd. Please make sure containerd is installed in your PATH or you have specified the correct address. Got error: %v", err) 70 } 71 }() 72 r := &remote{ 73 stateDir: stateDir, 74 daemonPid: -1, 75 eventTsPath: filepath.Join(stateDir, eventTimestampFilename), 76 } 77 for _, option := range options { 78 if err := option.Apply(r); err != nil { 79 return nil, err 80 } 81 } 82 83 if err := system.MkdirAll(stateDir, 0700); err != nil { 84 return nil, err 85 } 86 87 if r.rpcAddr == "" { 88 r.rpcAddr = filepath.Join(stateDir, containerdSockFilename) 89 } 90 91 if r.startDaemon { 92 if err := r.runContainerdDaemon(); err != nil { 93 return nil, err 94 } 95 } 96 97 // don't output the grpc reconnect logging 98 grpclog.SetLogger(log.New(ioutil.Discard, "", log.LstdFlags)) 99 dialOpts := append([]grpc.DialOption{grpc.WithInsecure()}, 100 grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { 101 return net.DialTimeout("unix", addr, timeout) 102 }), 103 ) 104 conn, err := grpc.Dial(r.rpcAddr, dialOpts...) 105 if err != nil { 106 return nil, fmt.Errorf("error connecting to containerd: %v", err) 107 } 108 109 r.rpcConn = conn 110 r.apiClient = containerd.NewAPIClient(conn) 111 112 // Get the timestamp to restore from 113 t := r.getLastEventTimestamp() 114 tsp, err := ptypes.TimestampProto(t) 115 if err != nil { 116 logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err) 117 } 118 r.restoreFromTimestamp = tsp 119 120 go r.handleConnectionChange() 121 122 if err := r.startEventsMonitor(); err != nil { 123 return nil, err 124 } 125 126 return r, nil 127 } 128 129 func (r *remote) UpdateOptions(options ...RemoteOption) error { 130 for _, option := range options { 131 if err := option.Apply(r); err != nil { 132 return err 133 } 134 } 135 return nil 136 } 137 138 func (r *remote) handleConnectionChange() { 139 var transientFailureCount = 0 140 141 ticker := time.NewTicker(500 * time.Millisecond) 142 defer ticker.Stop() 143 healthClient := grpc_health_v1.NewHealthClient(r.rpcConn) 144 145 for { 146 <-ticker.C 147 ctx, cancel := context.WithTimeout(context.Background(), containerdHealthCheckTimeout) 148 _, err := healthClient.Check(ctx, &grpc_health_v1.HealthCheckRequest{}) 149 cancel() 150 if err == nil { 151 continue 152 } 153 154 logrus.Debugf("libcontainerd: containerd health check returned error: %v", err) 155 156 if r.daemonPid != -1 { 157 if strings.Contains(err.Error(), "is closing") { 158 // Well, we asked for it to stop, just return 159 return 160 } 161 // all other errors are transient 162 // Reset state to be notified of next failure 163 transientFailureCount++ 164 if transientFailureCount >= maxConnectionRetryCount { 165 transientFailureCount = 0 166 if system.IsProcessAlive(r.daemonPid) { 167 system.KillProcess(r.daemonPid) 168 } 169 <-r.daemonWaitCh 170 if err := r.runContainerdDaemon(); err != nil { //FIXME: Handle error 171 logrus.Errorf("libcontainerd: error restarting containerd: %v", err) 172 } 173 continue 174 } 175 } 176 } 177 } 178 179 func (r *remote) Cleanup() { 180 if r.daemonPid == -1 { 181 return 182 } 183 r.closeManually = true 184 r.rpcConn.Close() 185 // Ask the daemon to quit 186 syscall.Kill(r.daemonPid, syscall.SIGTERM) 187 188 // Wait up to 15secs for it to stop 189 for i := time.Duration(0); i < containerdShutdownTimeout; i += time.Second { 190 if !system.IsProcessAlive(r.daemonPid) { 191 break 192 } 193 time.Sleep(time.Second) 194 } 195 196 if system.IsProcessAlive(r.daemonPid) { 197 logrus.Warnf("libcontainerd: containerd (%d) didn't stop within 15 secs, killing it\n", r.daemonPid) 198 syscall.Kill(r.daemonPid, syscall.SIGKILL) 199 } 200 201 // cleanup some files 202 os.Remove(filepath.Join(r.stateDir, containerdPidFilename)) 203 os.Remove(filepath.Join(r.stateDir, containerdSockFilename)) 204 } 205 206 func (r *remote) Client(b Backend) (Client, error) { 207 c := &client{ 208 clientCommon: clientCommon{ 209 backend: b, 210 containers: make(map[string]*container), 211 locker: locker.New(), 212 }, 213 remote: r, 214 exitNotifiers: make(map[string]*exitNotifier), 215 liveRestore: r.liveRestore, 216 } 217 218 r.Lock() 219 r.clients = append(r.clients, c) 220 r.Unlock() 221 return c, nil 222 } 223 224 func (r *remote) updateEventTimestamp(t time.Time) { 225 f, err := os.OpenFile(r.eventTsPath, syscall.O_CREAT|syscall.O_WRONLY|syscall.O_TRUNC, 0600) 226 if err != nil { 227 logrus.Warnf("libcontainerd: failed to open event timestamp file: %v", err) 228 return 229 } 230 defer f.Close() 231 232 b, err := t.MarshalText() 233 if err != nil { 234 logrus.Warnf("libcontainerd: failed to encode timestamp: %v", err) 235 return 236 } 237 238 n, err := f.Write(b) 239 if err != nil || n != len(b) { 240 logrus.Warnf("libcontainerd: failed to update event timestamp file: %v", err) 241 f.Truncate(0) 242 return 243 } 244 } 245 246 func (r *remote) getLastEventTimestamp() time.Time { 247 t := time.Now() 248 249 fi, err := os.Stat(r.eventTsPath) 250 if os.IsNotExist(err) || fi.Size() == 0 { 251 return t 252 } 253 254 f, err := os.Open(r.eventTsPath) 255 if err != nil { 256 logrus.Warnf("libcontainerd: Unable to access last event ts: %v", err) 257 return t 258 } 259 defer f.Close() 260 261 b := make([]byte, fi.Size()) 262 n, err := f.Read(b) 263 if err != nil || n != len(b) { 264 logrus.Warnf("libcontainerd: Unable to read last event ts: %v", err) 265 return t 266 } 267 268 t.UnmarshalText(b) 269 270 return t 271 } 272 273 func (r *remote) startEventsMonitor() error { 274 // First, get past events 275 t := r.getLastEventTimestamp() 276 tsp, err := ptypes.TimestampProto(t) 277 if err != nil { 278 logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err) 279 } 280 er := &containerd.EventsRequest{ 281 Timestamp: tsp, 282 } 283 events, err := r.apiClient.Events(context.Background(), er, grpc.FailFast(false)) 284 if err != nil { 285 return err 286 } 287 go r.handleEventStream(events) 288 return nil 289 } 290 291 func (r *remote) handleEventStream(events containerd.API_EventsClient) { 292 for { 293 e, err := events.Recv() 294 if err != nil { 295 if grpc.ErrorDesc(err) == transport.ErrConnClosing.Desc && 296 r.closeManually { 297 // ignore error if grpc remote connection is closed manually 298 return 299 } 300 logrus.Errorf("libcontainerd: failed to receive event from containerd: %v", err) 301 go r.startEventsMonitor() 302 return 303 } 304 305 logrus.Debugf("libcontainerd: received containerd event: %#v", e) 306 307 var container *container 308 var c *client 309 r.RLock() 310 for _, c = range r.clients { 311 container, err = c.getContainer(e.Id) 312 if err == nil { 313 break 314 } 315 } 316 r.RUnlock() 317 if container == nil { 318 logrus.Warnf("libcontainerd: unknown container %s", e.Id) 319 continue 320 } 321 322 if err := container.handleEvent(e); err != nil { 323 logrus.Errorf("libcontainerd: error processing state change for %s: %v", e.Id, err) 324 } 325 326 tsp, err := ptypes.Timestamp(e.Timestamp) 327 if err != nil { 328 logrus.Errorf("libcontainerd: failed to convert event timestamp: %q", err) 329 continue 330 } 331 332 r.updateEventTimestamp(tsp) 333 } 334 } 335 336 func (r *remote) runContainerdDaemon() error { 337 pidFilename := filepath.Join(r.stateDir, containerdPidFilename) 338 f, err := os.OpenFile(pidFilename, os.O_RDWR|os.O_CREATE, 0600) 339 if err != nil { 340 return err 341 } 342 defer f.Close() 343 344 // File exist, check if the daemon is alive 345 b := make([]byte, 8) 346 n, err := f.Read(b) 347 if err != nil && err != io.EOF { 348 return err 349 } 350 351 if n > 0 { 352 pid, err := strconv.ParseUint(string(b[:n]), 10, 64) 353 if err != nil { 354 return err 355 } 356 if system.IsProcessAlive(int(pid)) { 357 logrus.Infof("libcontainerd: previous instance of containerd still alive (%d)", pid) 358 r.daemonPid = int(pid) 359 return nil 360 } 361 } 362 363 // rewind the file 364 _, err = f.Seek(0, os.SEEK_SET) 365 if err != nil { 366 return err 367 } 368 369 // Truncate it 370 err = f.Truncate(0) 371 if err != nil { 372 return err 373 } 374 375 // Start a new instance 376 args := []string{ 377 "-l", fmt.Sprintf("unix://%s", r.rpcAddr), 378 "--metrics-interval=0", 379 "--start-timeout", "2m", 380 "--state-dir", filepath.Join(r.stateDir, containerdStateDir), 381 } 382 if goruntime.GOOS == "solaris" { 383 args = append(args, "--shim", "containerd-shim", "--runtime", "runc") 384 } else { 385 args = append(args, "--shim", "docker-containerd-shim") 386 if r.runtime != "" { 387 args = append(args, "--runtime") 388 args = append(args, r.runtime) 389 } 390 } 391 if r.debugLog { 392 args = append(args, "--debug") 393 } 394 if len(r.runtimeArgs) > 0 { 395 for _, v := range r.runtimeArgs { 396 args = append(args, "--runtime-args") 397 args = append(args, v) 398 } 399 logrus.Debugf("libcontainerd: runContainerdDaemon: runtimeArgs: %s", args) 400 } 401 402 cmd := exec.Command(containerdBinary, args...) 403 // redirect containerd logs to docker logs 404 cmd.Stdout = os.Stdout 405 cmd.Stderr = os.Stderr 406 cmd.SysProcAttr = setSysProcAttr(true) 407 cmd.Env = nil 408 // clear the NOTIFY_SOCKET from the env when starting containerd 409 for _, e := range os.Environ() { 410 if !strings.HasPrefix(e, "NOTIFY_SOCKET") { 411 cmd.Env = append(cmd.Env, e) 412 } 413 } 414 if err := cmd.Start(); err != nil { 415 return err 416 } 417 logrus.Infof("libcontainerd: new containerd process, pid: %d", cmd.Process.Pid) 418 if err := setOOMScore(cmd.Process.Pid, r.oomScore); err != nil { 419 system.KillProcess(cmd.Process.Pid) 420 return err 421 } 422 if _, err := f.WriteString(fmt.Sprintf("%d", cmd.Process.Pid)); err != nil { 423 system.KillProcess(cmd.Process.Pid) 424 return err 425 } 426 427 r.daemonWaitCh = make(chan struct{}) 428 go func() { 429 cmd.Wait() 430 close(r.daemonWaitCh) 431 }() // Reap our child when needed 432 r.daemonPid = cmd.Process.Pid 433 return nil 434 } 435 436 // WithRemoteAddr sets the external containerd socket to connect to. 437 func WithRemoteAddr(addr string) RemoteOption { 438 return rpcAddr(addr) 439 } 440 441 type rpcAddr string 442 443 func (a rpcAddr) Apply(r Remote) error { 444 if remote, ok := r.(*remote); ok { 445 remote.rpcAddr = string(a) 446 return nil 447 } 448 return fmt.Errorf("WithRemoteAddr option not supported for this remote") 449 } 450 451 // WithRuntimePath sets the path of the runtime to be used as the 452 // default by containerd 453 func WithRuntimePath(rt string) RemoteOption { 454 return runtimePath(rt) 455 } 456 457 type runtimePath string 458 459 func (rt runtimePath) Apply(r Remote) error { 460 if remote, ok := r.(*remote); ok { 461 remote.runtime = string(rt) 462 return nil 463 } 464 return fmt.Errorf("WithRuntime option not supported for this remote") 465 } 466 467 // WithRuntimeArgs sets the list of runtime args passed to containerd 468 func WithRuntimeArgs(args []string) RemoteOption { 469 return runtimeArgs(args) 470 } 471 472 type runtimeArgs []string 473 474 func (rt runtimeArgs) Apply(r Remote) error { 475 if remote, ok := r.(*remote); ok { 476 remote.runtimeArgs = rt 477 return nil 478 } 479 return fmt.Errorf("WithRuntimeArgs option not supported for this remote") 480 } 481 482 // WithStartDaemon defines if libcontainerd should also run containerd daemon. 483 func WithStartDaemon(start bool) RemoteOption { 484 return startDaemon(start) 485 } 486 487 type startDaemon bool 488 489 func (s startDaemon) Apply(r Remote) error { 490 if remote, ok := r.(*remote); ok { 491 remote.startDaemon = bool(s) 492 return nil 493 } 494 return fmt.Errorf("WithStartDaemon option not supported for this remote") 495 } 496 497 // WithDebugLog defines if containerd debug logs will be enabled for daemon. 498 func WithDebugLog(debug bool) RemoteOption { 499 return debugLog(debug) 500 } 501 502 type debugLog bool 503 504 func (d debugLog) Apply(r Remote) error { 505 if remote, ok := r.(*remote); ok { 506 remote.debugLog = bool(d) 507 return nil 508 } 509 return fmt.Errorf("WithDebugLog option not supported for this remote") 510 } 511 512 // WithLiveRestore defines if containers are stopped on shutdown or restored. 513 func WithLiveRestore(v bool) RemoteOption { 514 return liveRestore(v) 515 } 516 517 type liveRestore bool 518 519 func (l liveRestore) Apply(r Remote) error { 520 if remote, ok := r.(*remote); ok { 521 remote.liveRestore = bool(l) 522 for _, c := range remote.clients { 523 c.liveRestore = bool(l) 524 } 525 return nil 526 } 527 return fmt.Errorf("WithLiveRestore option not supported for this remote") 528 } 529 530 // WithOOMScore defines the oom_score_adj to set for the containerd process. 531 func WithOOMScore(score int) RemoteOption { 532 return oomScore(score) 533 } 534 535 type oomScore int 536 537 func (o oomScore) Apply(r Remote) error { 538 if remote, ok := r.(*remote); ok { 539 remote.oomScore = int(o) 540 return nil 541 } 542 return fmt.Errorf("WithOOMScore option not supported for this remote") 543 }