github.com/docker/docker@v299999999.0.0-20200612211812-aaf470eca7b5+incompatible/libcontainerd/supervisor/remote_daemon.go (about) 1 package supervisor // import "github.com/docker/docker/libcontainerd/supervisor" 2 3 import ( 4 "context" 5 "fmt" 6 "io" 7 "io/ioutil" 8 "os" 9 "os/exec" 10 "path/filepath" 11 "strconv" 12 "strings" 13 "sync" 14 "time" 15 16 "github.com/BurntSushi/toml" 17 "github.com/containerd/containerd" 18 "github.com/containerd/containerd/services/server/config" 19 "github.com/docker/docker/pkg/system" 20 "github.com/pkg/errors" 21 "github.com/sirupsen/logrus" 22 ) 23 24 const ( 25 maxConnectionRetryCount = 3 26 healthCheckTimeout = 3 * time.Second 27 shutdownTimeout = 15 * time.Second 28 startupTimeout = 15 * time.Second 29 configFile = "containerd.toml" 30 binaryName = "containerd" 31 pidFile = "containerd.pid" 32 ) 33 34 type pluginConfigs struct { 35 Plugins map[string]interface{} `toml:"plugins"` 36 } 37 38 type remote struct { 39 sync.RWMutex 40 config.Config 41 42 daemonPid int 43 logger *logrus.Entry 44 45 daemonWaitCh chan struct{} 46 daemonStartCh chan error 47 daemonStopCh chan struct{} 48 49 rootDir string 50 stateDir string 51 pluginConfs pluginConfigs 52 } 53 54 // Daemon represents a running containerd daemon 55 type Daemon interface { 56 WaitTimeout(time.Duration) error 57 Address() string 58 } 59 60 // DaemonOpt allows to configure parameters of container daemons 61 type DaemonOpt func(c *remote) error 62 63 // Start starts a containerd daemon and monitors it 64 func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Daemon, error) { 65 r := &remote{ 66 rootDir: rootDir, 67 stateDir: stateDir, 68 Config: config.Config{ 69 Root: filepath.Join(rootDir, "daemon"), 70 State: filepath.Join(stateDir, "daemon"), 71 }, 72 pluginConfs: pluginConfigs{make(map[string]interface{})}, 73 daemonPid: -1, 74 logger: logrus.WithField("module", "libcontainerd"), 75 daemonStartCh: make(chan error, 1), 76 daemonStopCh: make(chan struct{}), 77 } 78 79 for _, opt := range opts { 80 if err := opt(r); err != nil { 81 return nil, err 82 } 83 } 84 r.setDefaults() 85 86 if err := system.MkdirAll(stateDir, 0700); err != nil { 87 return nil, err 88 } 89 90 go r.monitorDaemon(ctx) 91 92 timeout := time.NewTimer(startupTimeout) 93 defer timeout.Stop() 94 95 select { 96 case <-timeout.C: 97 return nil, errors.New("timeout waiting for containerd to start") 98 case err := <-r.daemonStartCh: 99 if err != nil { 100 return nil, err 101 } 102 } 103 104 return r, nil 105 } 106 func (r *remote) WaitTimeout(d time.Duration) error { 107 timeout := time.NewTimer(d) 108 defer timeout.Stop() 109 110 select { 111 case <-timeout.C: 112 return errors.New("timeout waiting for containerd to stop") 113 case <-r.daemonStopCh: 114 } 115 116 return nil 117 } 118 119 func (r *remote) Address() string { 120 return r.GRPC.Address 121 } 122 func (r *remote) getContainerdPid() (int, error) { 123 pidFile := filepath.Join(r.stateDir, pidFile) 124 f, err := os.OpenFile(pidFile, os.O_RDWR, 0600) 125 if err != nil { 126 if os.IsNotExist(err) { 127 return -1, nil 128 } 129 return -1, err 130 } 131 defer f.Close() 132 133 b := make([]byte, 8) 134 n, err := f.Read(b) 135 if err != nil && err != io.EOF { 136 return -1, err 137 } 138 139 if n > 0 { 140 pid, err := strconv.ParseUint(string(b[:n]), 10, 64) 141 if err != nil { 142 return -1, err 143 } 144 if system.IsProcessAlive(int(pid)) { 145 return int(pid), nil 146 } 147 } 148 149 return -1, nil 150 } 151 152 func (r *remote) getContainerdConfig() (string, error) { 153 path := filepath.Join(r.stateDir, configFile) 154 f, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600) 155 if err != nil { 156 return "", errors.Wrapf(err, "failed to open containerd config file at %s", path) 157 } 158 defer f.Close() 159 160 enc := toml.NewEncoder(f) 161 if err = enc.Encode(r.Config); err != nil { 162 return "", errors.Wrapf(err, "failed to encode general config") 163 } 164 if err = enc.Encode(r.pluginConfs); err != nil { 165 return "", errors.Wrapf(err, "failed to encode plugin configs") 166 } 167 168 return path, nil 169 } 170 171 func (r *remote) startContainerd() error { 172 pid, err := r.getContainerdPid() 173 if err != nil { 174 return err 175 } 176 177 if pid != -1 { 178 r.daemonPid = pid 179 logrus.WithField("pid", pid). 180 Infof("libcontainerd: %s is still running", binaryName) 181 return nil 182 } 183 184 configFile, err := r.getContainerdConfig() 185 if err != nil { 186 return err 187 } 188 189 args := []string{"--config", configFile} 190 191 if r.Debug.Level != "" { 192 args = append(args, "--log-level", r.Debug.Level) 193 } 194 195 cmd := exec.Command(binaryName, args...) 196 // redirect containerd logs to docker logs 197 cmd.Stdout = os.Stdout 198 cmd.Stderr = os.Stderr 199 cmd.SysProcAttr = containerdSysProcAttr() 200 // clear the NOTIFY_SOCKET from the env when starting containerd 201 cmd.Env = nil 202 for _, e := range os.Environ() { 203 if !strings.HasPrefix(e, "NOTIFY_SOCKET") { 204 cmd.Env = append(cmd.Env, e) 205 } 206 } 207 if err := cmd.Start(); err != nil { 208 return err 209 } 210 211 r.daemonWaitCh = make(chan struct{}) 212 go func() { 213 // Reap our child when needed 214 if err := cmd.Wait(); err != nil { 215 r.logger.WithError(err).Errorf("containerd did not exit successfully") 216 } 217 close(r.daemonWaitCh) 218 }() 219 220 r.daemonPid = cmd.Process.Pid 221 222 err = ioutil.WriteFile(filepath.Join(r.stateDir, pidFile), []byte(fmt.Sprintf("%d", r.daemonPid)), 0660) 223 if err != nil { 224 system.KillProcess(r.daemonPid) 225 return errors.Wrap(err, "libcontainerd: failed to save daemon pid to disk") 226 } 227 228 logrus.WithField("pid", r.daemonPid). 229 Infof("libcontainerd: started new %s process", binaryName) 230 231 return nil 232 } 233 234 func (r *remote) monitorDaemon(ctx context.Context) { 235 var ( 236 transientFailureCount = 0 237 client *containerd.Client 238 err error 239 delay time.Duration 240 timer = time.NewTimer(0) 241 started bool 242 ) 243 244 defer func() { 245 if r.daemonPid != -1 { 246 r.stopDaemon() 247 } 248 249 // cleanup some files 250 os.Remove(filepath.Join(r.stateDir, pidFile)) 251 252 r.platformCleanup() 253 254 close(r.daemonStopCh) 255 timer.Stop() 256 }() 257 258 // ensure no races on sending to timer.C even though there is a 0 duration. 259 if !timer.Stop() { 260 <-timer.C 261 } 262 263 for { 264 timer.Reset(delay) 265 266 select { 267 case <-ctx.Done(): 268 r.logger.Info("stopping healthcheck following graceful shutdown") 269 if client != nil { 270 client.Close() 271 } 272 return 273 case <-timer.C: 274 } 275 276 if r.daemonPid == -1 { 277 if r.daemonWaitCh != nil { 278 select { 279 case <-ctx.Done(): 280 r.logger.Info("stopping containerd startup following graceful shutdown") 281 return 282 case <-r.daemonWaitCh: 283 } 284 } 285 286 os.RemoveAll(r.GRPC.Address) 287 if err := r.startContainerd(); err != nil { 288 if !started { 289 r.daemonStartCh <- err 290 return 291 } 292 r.logger.WithError(err).Error("failed restarting containerd") 293 delay = 50 * time.Millisecond 294 continue 295 } 296 297 client, err = containerd.New(r.GRPC.Address, containerd.WithTimeout(60*time.Second)) 298 if err != nil { 299 r.logger.WithError(err).Error("failed connecting to containerd") 300 delay = 100 * time.Millisecond 301 continue 302 } 303 logrus.WithField("address", r.GRPC.Address).Debug("Created containerd monitoring client") 304 } 305 306 if client != nil { 307 tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout) 308 _, err := client.IsServing(tctx) 309 cancel() 310 if err == nil { 311 if !started { 312 close(r.daemonStartCh) 313 started = true 314 } 315 316 transientFailureCount = 0 317 318 select { 319 case <-r.daemonWaitCh: 320 case <-ctx.Done(): 321 } 322 323 // Set a small delay in case there is a recurring failure (or bug in this code) 324 // to ensure we don't end up in a super tight loop. 325 delay = 500 * time.Millisecond 326 continue 327 } 328 329 r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding") 330 331 transientFailureCount++ 332 if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) { 333 delay = time.Duration(transientFailureCount) * 200 * time.Millisecond 334 continue 335 } 336 client.Close() 337 client = nil 338 } 339 340 if system.IsProcessAlive(r.daemonPid) { 341 r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd") 342 r.killDaemon() 343 } 344 345 r.daemonPid = -1 346 delay = 0 347 transientFailureCount = 0 348 } 349 }