gitee.com/bomy/docker.git@v1.13.1/libcontainerd/remote_unix.go (about) 1 // +build linux solaris 2 3 package libcontainerd 4 5 import ( 6 "fmt" 7 "io" 8 "io/ioutil" 9 "log" 10 "net" 11 "os" 12 "os/exec" 13 "path/filepath" 14 goruntime "runtime" 15 "strconv" 16 "strings" 17 "sync" 18 "syscall" 19 "time" 20 21 "github.com/Sirupsen/logrus" 22 containerd "github.com/docker/containerd/api/grpc/types" 23 "github.com/docker/docker/pkg/locker" 24 sysinfo "github.com/docker/docker/pkg/system" 25 "github.com/docker/docker/utils" 26 "github.com/golang/protobuf/ptypes" 27 "github.com/golang/protobuf/ptypes/timestamp" 28 "golang.org/x/net/context" 29 "google.golang.org/grpc" 30 "google.golang.org/grpc/grpclog" 31 "google.golang.org/grpc/health/grpc_health_v1" 32 "google.golang.org/grpc/transport" 33 ) 34 35 const ( 36 maxConnectionRetryCount = 3 37 containerdHealthCheckTimeout = 3 * time.Second 38 containerdShutdownTimeout = 15 * time.Second 39 containerdBinary = "docker-containerd" 40 containerdPidFilename = "docker-containerd.pid" 41 containerdSockFilename = "docker-containerd.sock" 42 containerdStateDir = "containerd" 43 eventTimestampFilename = "event.ts" 44 ) 45 46 type remote struct { 47 sync.RWMutex 48 apiClient containerd.APIClient 49 daemonPid int 50 stateDir string 51 rpcAddr string 52 startDaemon bool 53 closeManually bool 54 debugLog bool 55 rpcConn *grpc.ClientConn 56 clients []*client 57 eventTsPath string 58 runtime string 59 runtimeArgs []string 60 daemonWaitCh chan struct{} 61 liveRestore bool 62 oomScore int 63 restoreFromTimestamp *timestamp.Timestamp 64 } 65 66 // New creates a fresh instance of libcontainerd remote. 67 func New(stateDir string, options ...RemoteOption) (_ Remote, err error) { 68 defer func() { 69 if err != nil { 70 err = fmt.Errorf("Failed to connect to containerd. Please make sure containerd is installed in your PATH or you have specified the correct address. Got error: %v", err) 71 } 72 }() 73 r := &remote{ 74 stateDir: stateDir, 75 daemonPid: -1, 76 eventTsPath: filepath.Join(stateDir, eventTimestampFilename), 77 } 78 for _, option := range options { 79 if err := option.Apply(r); err != nil { 80 return nil, err 81 } 82 } 83 84 if err := sysinfo.MkdirAll(stateDir, 0700); err != nil { 85 return nil, err 86 } 87 88 if r.rpcAddr == "" { 89 r.rpcAddr = filepath.Join(stateDir, containerdSockFilename) 90 } 91 92 if r.startDaemon { 93 if err := r.runContainerdDaemon(); err != nil { 94 return nil, err 95 } 96 } 97 98 // don't output the grpc reconnect logging 99 grpclog.SetLogger(log.New(ioutil.Discard, "", log.LstdFlags)) 100 dialOpts := append([]grpc.DialOption{grpc.WithInsecure()}, 101 grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) { 102 return net.DialTimeout("unix", addr, timeout) 103 }), 104 ) 105 conn, err := grpc.Dial(r.rpcAddr, dialOpts...) 106 if err != nil { 107 return nil, fmt.Errorf("error connecting to containerd: %v", err) 108 } 109 110 r.rpcConn = conn 111 r.apiClient = containerd.NewAPIClient(conn) 112 113 // Get the timestamp to restore from 114 t := r.getLastEventTimestamp() 115 tsp, err := ptypes.TimestampProto(t) 116 if err != nil { 117 logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err) 118 } 119 r.restoreFromTimestamp = tsp 120 121 go r.handleConnectionChange() 122 123 if err := r.startEventsMonitor(); err != nil { 124 return nil, err 125 } 126 127 return r, nil 128 } 129 130 func (r *remote) UpdateOptions(options ...RemoteOption) error { 131 for _, option := range options { 132 if err := option.Apply(r); err != nil { 133 return err 134 } 135 } 136 return nil 137 } 138 139 func (r *remote) handleConnectionChange() { 140 var transientFailureCount = 0 141 142 ticker := time.NewTicker(500 * time.Millisecond) 143 defer ticker.Stop() 144 healthClient := grpc_health_v1.NewHealthClient(r.rpcConn) 145 146 for { 147 <-ticker.C 148 ctx, cancel := context.WithTimeout(context.Background(), containerdHealthCheckTimeout) 149 _, err := healthClient.Check(ctx, &grpc_health_v1.HealthCheckRequest{}) 150 cancel() 151 if err == nil { 152 continue 153 } 154 155 logrus.Debugf("libcontainerd: containerd health check returned error: %v", err) 156 157 if r.daemonPid != -1 { 158 if strings.Contains(err.Error(), "is closing") { 159 // Well, we asked for it to stop, just return 160 return 161 } 162 // all other errors are transient 163 // Reset state to be notified of next failure 164 transientFailureCount++ 165 if transientFailureCount >= maxConnectionRetryCount { 166 transientFailureCount = 0 167 if utils.IsProcessAlive(r.daemonPid) { 168 utils.KillProcess(r.daemonPid) 169 } 170 <-r.daemonWaitCh 171 if err := r.runContainerdDaemon(); err != nil { //FIXME: Handle error 172 logrus.Errorf("libcontainerd: error restarting containerd: %v", err) 173 } 174 continue 175 } 176 } 177 } 178 } 179 180 func (r *remote) Cleanup() { 181 if r.daemonPid == -1 { 182 return 183 } 184 r.closeManually = true 185 r.rpcConn.Close() 186 // Ask the daemon to quit 187 syscall.Kill(r.daemonPid, syscall.SIGTERM) 188 189 // Wait up to 15secs for it to stop 190 for i := time.Duration(0); i < containerdShutdownTimeout; i += time.Second { 191 if !utils.IsProcessAlive(r.daemonPid) { 192 break 193 } 194 time.Sleep(time.Second) 195 } 196 197 if utils.IsProcessAlive(r.daemonPid) { 198 logrus.Warnf("libcontainerd: containerd (%d) didn't stop within 15 secs, killing it\n", r.daemonPid) 199 syscall.Kill(r.daemonPid, syscall.SIGKILL) 200 } 201 202 // cleanup some files 203 os.Remove(filepath.Join(r.stateDir, containerdPidFilename)) 204 os.Remove(filepath.Join(r.stateDir, containerdSockFilename)) 205 } 206 207 func (r *remote) Client(b Backend) (Client, error) { 208 c := &client{ 209 clientCommon: clientCommon{ 210 backend: b, 211 containers: make(map[string]*container), 212 locker: locker.New(), 213 }, 214 remote: r, 215 exitNotifiers: make(map[string]*exitNotifier), 216 liveRestore: r.liveRestore, 217 } 218 219 r.Lock() 220 r.clients = append(r.clients, c) 221 r.Unlock() 222 return c, nil 223 } 224 225 func (r *remote) updateEventTimestamp(t time.Time) { 226 f, err := os.OpenFile(r.eventTsPath, syscall.O_CREAT|syscall.O_WRONLY|syscall.O_TRUNC, 0600) 227 if err != nil { 228 logrus.Warnf("libcontainerd: failed to open event timestamp file: %v", err) 229 return 230 } 231 defer f.Close() 232 233 b, err := t.MarshalText() 234 if err != nil { 235 logrus.Warnf("libcontainerd: failed to encode timestamp: %v", err) 236 return 237 } 238 239 n, err := f.Write(b) 240 if err != nil || n != len(b) { 241 logrus.Warnf("libcontainerd: failed to update event timestamp file: %v", err) 242 f.Truncate(0) 243 return 244 } 245 } 246 247 func (r *remote) getLastEventTimestamp() time.Time { 248 t := time.Now() 249 250 fi, err := os.Stat(r.eventTsPath) 251 if os.IsNotExist(err) || fi.Size() == 0 { 252 return t 253 } 254 255 f, err := os.Open(r.eventTsPath) 256 if err != nil { 257 logrus.Warnf("libcontainerd: Unable to access last event ts: %v", err) 258 return t 259 } 260 defer f.Close() 261 262 b := make([]byte, fi.Size()) 263 n, err := f.Read(b) 264 if err != nil || n != len(b) { 265 logrus.Warnf("libcontainerd: Unable to read last event ts: %v", err) 266 return t 267 } 268 269 t.UnmarshalText(b) 270 271 return t 272 } 273 274 func (r *remote) startEventsMonitor() error { 275 // First, get past events 276 t := r.getLastEventTimestamp() 277 tsp, err := ptypes.TimestampProto(t) 278 if err != nil { 279 logrus.Errorf("libcontainerd: failed to convert timestamp: %q", err) 280 } 281 er := &containerd.EventsRequest{ 282 Timestamp: tsp, 283 } 284 events, err := r.apiClient.Events(context.Background(), er, grpc.FailFast(false)) 285 if err != nil { 286 return err 287 } 288 go r.handleEventStream(events) 289 return nil 290 } 291 292 func (r *remote) handleEventStream(events containerd.API_EventsClient) { 293 for { 294 e, err := events.Recv() 295 if err != nil { 296 if grpc.ErrorDesc(err) == transport.ErrConnClosing.Desc && 297 r.closeManually { 298 // ignore error if grpc remote connection is closed manually 299 return 300 } 301 logrus.Errorf("libcontainerd: failed to receive event from containerd: %v", err) 302 go r.startEventsMonitor() 303 return 304 } 305 306 logrus.Debugf("libcontainerd: received containerd event: %#v", e) 307 308 var container *container 309 var c *client 310 r.RLock() 311 for _, c = range r.clients { 312 container, err = c.getContainer(e.Id) 313 if err == nil { 314 break 315 } 316 } 317 r.RUnlock() 318 if container == nil { 319 logrus.Warnf("libcontainerd: unknown container %s", e.Id) 320 continue 321 } 322 323 if err := container.handleEvent(e); err != nil { 324 logrus.Errorf("libcontainerd: error processing state change for %s: %v", e.Id, err) 325 } 326 327 tsp, err := ptypes.Timestamp(e.Timestamp) 328 if err != nil { 329 logrus.Errorf("libcontainerd: failed to convert event timestamp: %q", err) 330 continue 331 } 332 333 r.updateEventTimestamp(tsp) 334 } 335 } 336 337 func (r *remote) runContainerdDaemon() error { 338 pidFilename := filepath.Join(r.stateDir, containerdPidFilename) 339 f, err := os.OpenFile(pidFilename, os.O_RDWR|os.O_CREATE, 0600) 340 if err != nil { 341 return err 342 } 343 defer f.Close() 344 345 // File exist, check if the daemon is alive 346 b := make([]byte, 8) 347 n, err := f.Read(b) 348 if err != nil && err != io.EOF { 349 return err 350 } 351 352 if n > 0 { 353 pid, err := strconv.ParseUint(string(b[:n]), 10, 64) 354 if err != nil { 355 return err 356 } 357 if utils.IsProcessAlive(int(pid)) { 358 logrus.Infof("libcontainerd: previous instance of containerd still alive (%d)", pid) 359 r.daemonPid = int(pid) 360 return nil 361 } 362 } 363 364 // rewind the file 365 _, err = f.Seek(0, os.SEEK_SET) 366 if err != nil { 367 return err 368 } 369 370 // Truncate it 371 err = f.Truncate(0) 372 if err != nil { 373 return err 374 } 375 376 // Start a new instance 377 args := []string{ 378 "-l", fmt.Sprintf("unix://%s", r.rpcAddr), 379 "--metrics-interval=0", 380 "--start-timeout", "2m", 381 "--state-dir", filepath.Join(r.stateDir, containerdStateDir), 382 } 383 if goruntime.GOOS == "solaris" { 384 args = append(args, "--shim", "containerd-shim", "--runtime", "runc") 385 } else { 386 args = append(args, "--shim", "docker-containerd-shim") 387 if r.runtime != "" { 388 args = append(args, "--runtime") 389 args = append(args, r.runtime) 390 } 391 } 392 if r.debugLog { 393 args = append(args, "--debug") 394 } 395 if len(r.runtimeArgs) > 0 { 396 for _, v := range r.runtimeArgs { 397 args = append(args, "--runtime-args") 398 args = append(args, v) 399 } 400 logrus.Debugf("libcontainerd: runContainerdDaemon: runtimeArgs: %s", args) 401 } 402 403 cmd := exec.Command(containerdBinary, args...) 404 // redirect containerd logs to docker logs 405 cmd.Stdout = os.Stdout 406 cmd.Stderr = os.Stderr 407 cmd.SysProcAttr = setSysProcAttr(true) 408 cmd.Env = nil 409 // clear the NOTIFY_SOCKET from the env when starting containerd 410 for _, e := range os.Environ() { 411 if !strings.HasPrefix(e, "NOTIFY_SOCKET") { 412 cmd.Env = append(cmd.Env, e) 413 } 414 } 415 if err := cmd.Start(); err != nil { 416 return err 417 } 418 logrus.Infof("libcontainerd: new containerd process, pid: %d", cmd.Process.Pid) 419 if err := setOOMScore(cmd.Process.Pid, r.oomScore); err != nil { 420 utils.KillProcess(cmd.Process.Pid) 421 return err 422 } 423 if _, err := f.WriteString(fmt.Sprintf("%d", cmd.Process.Pid)); err != nil { 424 utils.KillProcess(cmd.Process.Pid) 425 return err 426 } 427 428 r.daemonWaitCh = make(chan struct{}) 429 go func() { 430 cmd.Wait() 431 close(r.daemonWaitCh) 432 }() // Reap our child when needed 433 r.daemonPid = cmd.Process.Pid 434 return nil 435 } 436 437 // WithRemoteAddr sets the external containerd socket to connect to. 438 func WithRemoteAddr(addr string) RemoteOption { 439 return rpcAddr(addr) 440 } 441 442 type rpcAddr string 443 444 func (a rpcAddr) Apply(r Remote) error { 445 if remote, ok := r.(*remote); ok { 446 remote.rpcAddr = string(a) 447 return nil 448 } 449 return fmt.Errorf("WithRemoteAddr option not supported for this remote") 450 } 451 452 // WithRuntimePath sets the path of the runtime to be used as the 453 // default by containerd 454 func WithRuntimePath(rt string) RemoteOption { 455 return runtimePath(rt) 456 } 457 458 type runtimePath string 459 460 func (rt runtimePath) Apply(r Remote) error { 461 if remote, ok := r.(*remote); ok { 462 remote.runtime = string(rt) 463 return nil 464 } 465 return fmt.Errorf("WithRuntime option not supported for this remote") 466 } 467 468 // WithRuntimeArgs sets the list of runtime args passed to containerd 469 func WithRuntimeArgs(args []string) RemoteOption { 470 return runtimeArgs(args) 471 } 472 473 type runtimeArgs []string 474 475 func (rt runtimeArgs) Apply(r Remote) error { 476 if remote, ok := r.(*remote); ok { 477 remote.runtimeArgs = rt 478 return nil 479 } 480 return fmt.Errorf("WithRuntimeArgs option not supported for this remote") 481 } 482 483 // WithStartDaemon defines if libcontainerd should also run containerd daemon. 484 func WithStartDaemon(start bool) RemoteOption { 485 return startDaemon(start) 486 } 487 488 type startDaemon bool 489 490 func (s startDaemon) Apply(r Remote) error { 491 if remote, ok := r.(*remote); ok { 492 remote.startDaemon = bool(s) 493 return nil 494 } 495 return fmt.Errorf("WithStartDaemon option not supported for this remote") 496 } 497 498 // WithDebugLog defines if containerd debug logs will be enabled for daemon. 499 func WithDebugLog(debug bool) RemoteOption { 500 return debugLog(debug) 501 } 502 503 type debugLog bool 504 505 func (d debugLog) Apply(r Remote) error { 506 if remote, ok := r.(*remote); ok { 507 remote.debugLog = bool(d) 508 return nil 509 } 510 return fmt.Errorf("WithDebugLog option not supported for this remote") 511 } 512 513 // WithLiveRestore defines if containers are stopped on shutdown or restored. 514 func WithLiveRestore(v bool) RemoteOption { 515 return liveRestore(v) 516 } 517 518 type liveRestore bool 519 520 func (l liveRestore) Apply(r Remote) error { 521 if remote, ok := r.(*remote); ok { 522 remote.liveRestore = bool(l) 523 for _, c := range remote.clients { 524 c.liveRestore = bool(l) 525 } 526 return nil 527 } 528 return fmt.Errorf("WithLiveRestore option not supported for this remote") 529 } 530 531 // WithOOMScore defines the oom_score_adj to set for the containerd process. 532 func WithOOMScore(score int) RemoteOption { 533 return oomScore(score) 534 } 535 536 type oomScore int 537 538 func (o oomScore) Apply(r Remote) error { 539 if remote, ok := r.(*remote); ok { 540 remote.oomScore = int(o) 541 return nil 542 } 543 return fmt.Errorf("WithOOMScore option not supported for this remote") 544 }