github.com/adityamillind98/moby@v23.0.0-rc.4+incompatible/libcontainerd/supervisor/remote_daemon.go (about) 1 package supervisor // import "github.com/docker/docker/libcontainerd/supervisor" 2 3 import ( 4 "context" 5 "fmt" 6 "io" 7 "os" 8 "os/exec" 9 "path/filepath" 10 "strconv" 11 "strings" 12 "sync" 13 "time" 14 15 "github.com/containerd/containerd" 16 "github.com/containerd/containerd/services/server/config" 17 "github.com/docker/docker/pkg/system" 18 "github.com/pelletier/go-toml" 19 "github.com/pkg/errors" 20 "github.com/sirupsen/logrus" 21 ) 22 23 const ( 24 maxConnectionRetryCount = 3 25 healthCheckTimeout = 3 * time.Second 26 shutdownTimeout = 15 * time.Second 27 startupTimeout = 15 * time.Second 28 configFile = "containerd.toml" 29 binaryName = "containerd" 30 pidFile = "containerd.pid" 31 ) 32 33 type remote struct { 34 sync.RWMutex 35 config.Config 36 // Plugins overrides `Plugins map[string]toml.Tree` in config config. 37 Plugins map[string]interface{} `toml:"plugins"` 38 39 daemonPid int 40 logger *logrus.Entry 41 42 daemonWaitCh chan struct{} 43 daemonStartCh chan error 44 daemonStopCh chan struct{} 45 46 rootDir string 47 stateDir string 48 } 49 50 // Daemon represents a running containerd daemon 51 type Daemon interface { 52 WaitTimeout(time.Duration) error 53 Address() string 54 } 55 56 // DaemonOpt allows to configure parameters of container daemons 57 type DaemonOpt func(c *remote) error 58 59 // Start starts a containerd daemon and monitors it 60 func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Daemon, error) { 61 r := &remote{ 62 rootDir: rootDir, 63 stateDir: stateDir, 64 Config: config.Config{ 65 Version: 2, 66 Root: filepath.Join(rootDir, "daemon"), 67 State: filepath.Join(stateDir, "daemon"), 68 }, 69 Plugins: make(map[string]interface{}), 70 daemonPid: -1, 71 logger: logrus.WithField("module", "libcontainerd"), 72 daemonStartCh: make(chan error, 1), 73 daemonStopCh: make(chan struct{}), 74 } 75 76 for _, opt := range opts { 77 if err := opt(r); err != nil { 78 return nil, err 79 } 80 } 81 r.setDefaults() 82 83 if err := system.MkdirAll(stateDir, 0700); err != nil { 84 return nil, err 85 } 86 87 go r.monitorDaemon(ctx) 88 89 timeout := time.NewTimer(startupTimeout) 90 defer timeout.Stop() 91 92 select { 93 case <-timeout.C: 94 return nil, errors.New("timeout waiting for containerd to start") 95 case err := <-r.daemonStartCh: 96 if err != nil { 97 return nil, err 98 } 99 } 100 101 return r, nil 102 } 103 func (r *remote) WaitTimeout(d time.Duration) error { 104 timeout := time.NewTimer(d) 105 defer timeout.Stop() 106 107 select { 108 case <-timeout.C: 109 return errors.New("timeout waiting for containerd to stop") 110 case <-r.daemonStopCh: 111 } 112 113 return nil 114 } 115 116 func (r *remote) Address() string { 117 return r.GRPC.Address 118 } 119 func (r *remote) getContainerdPid() (int, error) { 120 pidFile := filepath.Join(r.stateDir, pidFile) 121 f, err := os.OpenFile(pidFile, os.O_RDWR, 0600) 122 if err != nil { 123 if os.IsNotExist(err) { 124 return -1, nil 125 } 126 return -1, err 127 } 128 defer f.Close() 129 130 b := make([]byte, 8) 131 n, err := f.Read(b) 132 if err != nil && err != io.EOF { 133 return -1, err 134 } 135 136 if n > 0 { 137 pid, err := strconv.ParseUint(string(b[:n]), 10, 64) 138 if err != nil { 139 return -1, err 140 } 141 if system.IsProcessAlive(int(pid)) { 142 return int(pid), nil 143 } 144 } 145 146 return -1, nil 147 } 148 149 func (r *remote) getContainerdConfig() (string, error) { 150 path := filepath.Join(r.stateDir, configFile) 151 f, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600) 152 if err != nil { 153 return "", errors.Wrapf(err, "failed to open containerd config file at %s", path) 154 } 155 defer f.Close() 156 157 if err := toml.NewEncoder(f).Encode(r); err != nil { 158 return "", errors.Wrapf(err, "failed to write containerd config file (%s)", path) 159 } 160 return path, nil 161 } 162 163 func (r *remote) startContainerd() error { 164 pid, err := r.getContainerdPid() 165 if err != nil { 166 return err 167 } 168 169 if pid != -1 { 170 r.daemonPid = pid 171 logrus.WithField("pid", pid). 172 Infof("libcontainerd: %s is still running", binaryName) 173 return nil 174 } 175 176 configFile, err := r.getContainerdConfig() 177 if err != nil { 178 return err 179 } 180 181 args := []string{"--config", configFile} 182 183 if r.Debug.Level != "" { 184 args = append(args, "--log-level", r.Debug.Level) 185 } 186 187 cmd := exec.Command(binaryName, args...) 188 // redirect containerd logs to docker logs 189 cmd.Stdout = os.Stdout 190 cmd.Stderr = os.Stderr 191 cmd.SysProcAttr = containerdSysProcAttr() 192 // clear the NOTIFY_SOCKET from the env when starting containerd 193 cmd.Env = nil 194 for _, e := range os.Environ() { 195 if !strings.HasPrefix(e, "NOTIFY_SOCKET") { 196 cmd.Env = append(cmd.Env, e) 197 } 198 } 199 if err := cmd.Start(); err != nil { 200 return err 201 } 202 203 r.daemonWaitCh = make(chan struct{}) 204 go func() { 205 // Reap our child when needed 206 if err := cmd.Wait(); err != nil { 207 r.logger.WithError(err).Errorf("containerd did not exit successfully") 208 } 209 close(r.daemonWaitCh) 210 }() 211 212 r.daemonPid = cmd.Process.Pid 213 214 err = os.WriteFile(filepath.Join(r.stateDir, pidFile), []byte(fmt.Sprintf("%d", r.daemonPid)), 0660) 215 if err != nil { 216 system.KillProcess(r.daemonPid) 217 return errors.Wrap(err, "libcontainerd: failed to save daemon pid to disk") 218 } 219 220 logrus.WithField("pid", r.daemonPid). 221 Infof("libcontainerd: started new %s process", binaryName) 222 223 return nil 224 } 225 226 func (r *remote) monitorDaemon(ctx context.Context) { 227 var ( 228 transientFailureCount = 0 229 client *containerd.Client 230 err error 231 delay time.Duration 232 timer = time.NewTimer(0) 233 started bool 234 ) 235 236 defer func() { 237 if r.daemonPid != -1 { 238 r.stopDaemon() 239 } 240 241 // cleanup some files 242 os.Remove(filepath.Join(r.stateDir, pidFile)) 243 244 r.platformCleanup() 245 246 close(r.daemonStopCh) 247 timer.Stop() 248 }() 249 250 // ensure no races on sending to timer.C even though there is a 0 duration. 251 if !timer.Stop() { 252 <-timer.C 253 } 254 255 for { 256 timer.Reset(delay) 257 258 select { 259 case <-ctx.Done(): 260 r.logger.Info("stopping healthcheck following graceful shutdown") 261 if client != nil { 262 client.Close() 263 } 264 return 265 case <-timer.C: 266 } 267 268 if r.daemonPid == -1 { 269 if r.daemonWaitCh != nil { 270 select { 271 case <-ctx.Done(): 272 r.logger.Info("stopping containerd startup following graceful shutdown") 273 return 274 case <-r.daemonWaitCh: 275 } 276 } 277 278 os.RemoveAll(r.GRPC.Address) 279 if err := r.startContainerd(); err != nil { 280 if !started { 281 r.daemonStartCh <- err 282 return 283 } 284 r.logger.WithError(err).Error("failed restarting containerd") 285 delay = 50 * time.Millisecond 286 continue 287 } 288 289 client, err = containerd.New(r.GRPC.Address, containerd.WithTimeout(60*time.Second)) 290 if err != nil { 291 r.logger.WithError(err).Error("failed connecting to containerd") 292 delay = 100 * time.Millisecond 293 continue 294 } 295 logrus.WithField("address", r.GRPC.Address).Debug("Created containerd monitoring client") 296 } 297 298 if client != nil { 299 tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout) 300 _, err := client.IsServing(tctx) 301 cancel() 302 if err == nil { 303 if !started { 304 close(r.daemonStartCh) 305 started = true 306 } 307 308 transientFailureCount = 0 309 310 select { 311 case <-r.daemonWaitCh: 312 case <-ctx.Done(): 313 } 314 315 // Set a small delay in case there is a recurring failure (or bug in this code) 316 // to ensure we don't end up in a super tight loop. 317 delay = 500 * time.Millisecond 318 continue 319 } 320 321 r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding") 322 323 transientFailureCount++ 324 if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) { 325 delay = time.Duration(transientFailureCount) * 200 * time.Millisecond 326 continue 327 } 328 client.Close() 329 client = nil 330 } 331 332 if system.IsProcessAlive(r.daemonPid) { 333 r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd") 334 r.killDaemon() 335 } 336 337 r.daemonPid = -1 338 delay = 0 339 transientFailureCount = 0 340 } 341 }