github.com/docker/engine@v22.0.0-20211208180946-d456264580cf+incompatible/libcontainerd/supervisor/remote_daemon.go (about) 1 package supervisor // import "github.com/docker/docker/libcontainerd/supervisor" 2 3 import ( 4 "context" 5 "fmt" 6 "io" 7 "os" 8 "os/exec" 9 "path/filepath" 10 "strconv" 11 "strings" 12 "sync" 13 "time" 14 15 "github.com/containerd/containerd" 16 "github.com/containerd/containerd/services/server/config" 17 "github.com/docker/docker/pkg/system" 18 "github.com/pelletier/go-toml" 19 "github.com/pkg/errors" 20 "github.com/sirupsen/logrus" 21 ) 22 23 const ( 24 maxConnectionRetryCount = 3 25 healthCheckTimeout = 3 * time.Second 26 shutdownTimeout = 15 * time.Second 27 startupTimeout = 15 * time.Second 28 configFile = "containerd.toml" 29 binaryName = "containerd" 30 pidFile = "containerd.pid" 31 ) 32 33 type remote struct { 34 sync.RWMutex 35 config.Config 36 // Plugins overrides `Plugins map[string]toml.Tree` in config config. 37 Plugins map[string]interface{} `toml:"plugins"` 38 39 daemonPid int 40 logger *logrus.Entry 41 42 daemonWaitCh chan struct{} 43 daemonStartCh chan error 44 daemonStopCh chan struct{} 45 46 rootDir string 47 stateDir string 48 } 49 50 // Daemon represents a running containerd daemon 51 type Daemon interface { 52 WaitTimeout(time.Duration) error 53 Address() string 54 } 55 56 // DaemonOpt allows to configure parameters of container daemons 57 type DaemonOpt func(c *remote) error 58 59 // Start starts a containerd daemon and monitors it 60 func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Daemon, error) { 61 r := &remote{ 62 rootDir: rootDir, 63 stateDir: stateDir, 64 Config: config.Config{ 65 Root: filepath.Join(rootDir, "daemon"), 66 State: filepath.Join(stateDir, "daemon"), 67 }, 68 Plugins: make(map[string]interface{}), 69 daemonPid: -1, 70 logger: logrus.WithField("module", "libcontainerd"), 71 daemonStartCh: make(chan error, 1), 72 daemonStopCh: make(chan struct{}), 73 } 74 75 for _, opt := range opts { 76 if err := opt(r); err != nil { 77 return nil, err 78 } 79 } 80 r.setDefaults() 81 82 if err := system.MkdirAll(stateDir, 0700); err != nil { 83 return nil, err 84 } 85 86 go r.monitorDaemon(ctx) 87 88 timeout := time.NewTimer(startupTimeout) 89 defer timeout.Stop() 90 91 select { 92 case <-timeout.C: 93 return nil, errors.New("timeout waiting for containerd to start") 94 case err := <-r.daemonStartCh: 95 if err != nil { 96 return nil, err 97 } 98 } 99 100 return r, nil 101 } 102 func (r *remote) WaitTimeout(d time.Duration) error { 103 timeout := time.NewTimer(d) 104 defer timeout.Stop() 105 106 select { 107 case <-timeout.C: 108 return errors.New("timeout waiting for containerd to stop") 109 case <-r.daemonStopCh: 110 } 111 112 return nil 113 } 114 115 func (r *remote) Address() string { 116 return r.GRPC.Address 117 } 118 func (r *remote) getContainerdPid() (int, error) { 119 pidFile := filepath.Join(r.stateDir, pidFile) 120 f, err := os.OpenFile(pidFile, os.O_RDWR, 0600) 121 if err != nil { 122 if os.IsNotExist(err) { 123 return -1, nil 124 } 125 return -1, err 126 } 127 defer f.Close() 128 129 b := make([]byte, 8) 130 n, err := f.Read(b) 131 if err != nil && err != io.EOF { 132 return -1, err 133 } 134 135 if n > 0 { 136 pid, err := strconv.ParseUint(string(b[:n]), 10, 64) 137 if err != nil { 138 return -1, err 139 } 140 if system.IsProcessAlive(int(pid)) { 141 return int(pid), nil 142 } 143 } 144 145 return -1, nil 146 } 147 148 func (r *remote) getContainerdConfig() (string, error) { 149 path := filepath.Join(r.stateDir, configFile) 150 f, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600) 151 if err != nil { 152 return "", errors.Wrapf(err, "failed to open containerd config file at %s", path) 153 } 154 defer f.Close() 155 156 if err := toml.NewEncoder(f).Encode(r); err != nil { 157 return "", errors.Wrapf(err, "failed to write containerd config file (%s)", path) 158 } 159 return path, nil 160 } 161 162 func (r *remote) startContainerd() error { 163 pid, err := r.getContainerdPid() 164 if err != nil { 165 return err 166 } 167 168 if pid != -1 { 169 r.daemonPid = pid 170 logrus.WithField("pid", pid). 171 Infof("libcontainerd: %s is still running", binaryName) 172 return nil 173 } 174 175 configFile, err := r.getContainerdConfig() 176 if err != nil { 177 return err 178 } 179 180 args := []string{"--config", configFile} 181 182 if r.Debug.Level != "" { 183 args = append(args, "--log-level", r.Debug.Level) 184 } 185 186 cmd := exec.Command(binaryName, args...) 187 // redirect containerd logs to docker logs 188 cmd.Stdout = os.Stdout 189 cmd.Stderr = os.Stderr 190 cmd.SysProcAttr = containerdSysProcAttr() 191 // clear the NOTIFY_SOCKET from the env when starting containerd 192 cmd.Env = nil 193 for _, e := range os.Environ() { 194 if !strings.HasPrefix(e, "NOTIFY_SOCKET") { 195 cmd.Env = append(cmd.Env, e) 196 } 197 } 198 if err := cmd.Start(); err != nil { 199 return err 200 } 201 202 r.daemonWaitCh = make(chan struct{}) 203 go func() { 204 // Reap our child when needed 205 if err := cmd.Wait(); err != nil { 206 r.logger.WithError(err).Errorf("containerd did not exit successfully") 207 } 208 close(r.daemonWaitCh) 209 }() 210 211 r.daemonPid = cmd.Process.Pid 212 213 err = os.WriteFile(filepath.Join(r.stateDir, pidFile), []byte(fmt.Sprintf("%d", r.daemonPid)), 0660) 214 if err != nil { 215 system.KillProcess(r.daemonPid) 216 return errors.Wrap(err, "libcontainerd: failed to save daemon pid to disk") 217 } 218 219 logrus.WithField("pid", r.daemonPid). 220 Infof("libcontainerd: started new %s process", binaryName) 221 222 return nil 223 } 224 225 func (r *remote) monitorDaemon(ctx context.Context) { 226 var ( 227 transientFailureCount = 0 228 client *containerd.Client 229 err error 230 delay time.Duration 231 timer = time.NewTimer(0) 232 started bool 233 ) 234 235 defer func() { 236 if r.daemonPid != -1 { 237 r.stopDaemon() 238 } 239 240 // cleanup some files 241 os.Remove(filepath.Join(r.stateDir, pidFile)) 242 243 r.platformCleanup() 244 245 close(r.daemonStopCh) 246 timer.Stop() 247 }() 248 249 // ensure no races on sending to timer.C even though there is a 0 duration. 250 if !timer.Stop() { 251 <-timer.C 252 } 253 254 for { 255 timer.Reset(delay) 256 257 select { 258 case <-ctx.Done(): 259 r.logger.Info("stopping healthcheck following graceful shutdown") 260 if client != nil { 261 client.Close() 262 } 263 return 264 case <-timer.C: 265 } 266 267 if r.daemonPid == -1 { 268 if r.daemonWaitCh != nil { 269 select { 270 case <-ctx.Done(): 271 r.logger.Info("stopping containerd startup following graceful shutdown") 272 return 273 case <-r.daemonWaitCh: 274 } 275 } 276 277 os.RemoveAll(r.GRPC.Address) 278 if err := r.startContainerd(); err != nil { 279 if !started { 280 r.daemonStartCh <- err 281 return 282 } 283 r.logger.WithError(err).Error("failed restarting containerd") 284 delay = 50 * time.Millisecond 285 continue 286 } 287 288 client, err = containerd.New(r.GRPC.Address, containerd.WithTimeout(60*time.Second)) 289 if err != nil { 290 r.logger.WithError(err).Error("failed connecting to containerd") 291 delay = 100 * time.Millisecond 292 continue 293 } 294 logrus.WithField("address", r.GRPC.Address).Debug("Created containerd monitoring client") 295 } 296 297 if client != nil { 298 tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout) 299 _, err := client.IsServing(tctx) 300 cancel() 301 if err == nil { 302 if !started { 303 close(r.daemonStartCh) 304 started = true 305 } 306 307 transientFailureCount = 0 308 309 select { 310 case <-r.daemonWaitCh: 311 case <-ctx.Done(): 312 } 313 314 // Set a small delay in case there is a recurring failure (or bug in this code) 315 // to ensure we don't end up in a super tight loop. 316 delay = 500 * time.Millisecond 317 continue 318 } 319 320 r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding") 321 322 transientFailureCount++ 323 if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) { 324 delay = time.Duration(transientFailureCount) * 200 * time.Millisecond 325 continue 326 } 327 client.Close() 328 client = nil 329 } 330 331 if system.IsProcessAlive(r.daemonPid) { 332 r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd") 333 r.killDaemon() 334 } 335 336 r.daemonPid = -1 337 delay = 0 338 transientFailureCount = 0 339 } 340 }