github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/engine/libcontainerd/supervisor/remote_daemon.go (about) 1 package supervisor // import "github.com/docker/docker/libcontainerd/supervisor" 2 3 import ( 4 "context" 5 "fmt" 6 "io" 7 "os" 8 "os/exec" 9 "path/filepath" 10 "strconv" 11 "strings" 12 "sync" 13 "time" 14 15 "github.com/BurntSushi/toml" 16 "github.com/containerd/containerd" 17 "github.com/containerd/containerd/services/server/config" 18 "github.com/docker/docker/pkg/system" 19 "github.com/pkg/errors" 20 "github.com/sirupsen/logrus" 21 ) 22 23 const ( 24 maxConnectionRetryCount = 3 25 healthCheckTimeout = 3 * time.Second 26 shutdownTimeout = 15 * time.Second 27 startupTimeout = 15 * time.Second 28 configFile = "containerd.toml" 29 binaryName = "containerd" 30 pidFile = "containerd.pid" 31 ) 32 33 type pluginConfigs struct { 34 Plugins map[string]interface{} `toml:"plugins"` 35 } 36 37 type remote struct { 38 sync.RWMutex 39 config.Config 40 41 daemonPid int 42 logger *logrus.Entry 43 44 daemonWaitCh chan struct{} 45 daemonStartCh chan error 46 daemonStopCh chan struct{} 47 48 rootDir string 49 stateDir string 50 pluginConfs pluginConfigs 51 } 52 53 // Daemon represents a running containerd daemon 54 type Daemon interface { 55 WaitTimeout(time.Duration) error 56 Address() string 57 } 58 59 // DaemonOpt allows to configure parameters of container daemons 60 type DaemonOpt func(c *remote) error 61 62 // Start starts a containerd daemon and monitors it 63 func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Daemon, error) { 64 r := &remote{ 65 rootDir: rootDir, 66 stateDir: stateDir, 67 Config: config.Config{ 68 Root: filepath.Join(rootDir, "daemon"), 69 State: filepath.Join(stateDir, "daemon"), 70 }, 71 pluginConfs: pluginConfigs{make(map[string]interface{})}, 72 daemonPid: -1, 73 logger: logrus.WithField("module", "libcontainerd"), 74 daemonStartCh: make(chan error, 1), 75 daemonStopCh: make(chan struct{}), 76 } 77 78 for _, opt := range opts { 79 if err := opt(r); err != nil { 80 return nil, err 81 } 82 } 83 r.setDefaults() 84 85 if err := system.MkdirAll(stateDir, 0700); err != nil { 86 return nil, err 87 } 88 89 go r.monitorDaemon(ctx) 90 91 timeout := time.NewTimer(startupTimeout) 92 defer timeout.Stop() 93 94 select { 95 case <-timeout.C: 96 return nil, errors.New("timeout waiting for containerd to start") 97 case err := <-r.daemonStartCh: 98 if err != nil { 99 return nil, err 100 } 101 } 102 103 return r, nil 104 } 105 func (r *remote) WaitTimeout(d time.Duration) error { 106 timeout := time.NewTimer(d) 107 defer timeout.Stop() 108 109 select { 110 case <-timeout.C: 111 return errors.New("timeout waiting for containerd to stop") 112 case <-r.daemonStopCh: 113 } 114 115 return nil 116 } 117 118 func (r *remote) Address() string { 119 return r.GRPC.Address 120 } 121 func (r *remote) getContainerdPid() (int, error) { 122 pidFile := filepath.Join(r.stateDir, pidFile) 123 f, err := os.OpenFile(pidFile, os.O_RDWR, 0600) 124 if err != nil { 125 if os.IsNotExist(err) { 126 return -1, nil 127 } 128 return -1, err 129 } 130 defer f.Close() 131 132 b := make([]byte, 8) 133 n, err := f.Read(b) 134 if err != nil && err != io.EOF { 135 return -1, err 136 } 137 138 if n > 0 { 139 pid, err := strconv.ParseUint(string(b[:n]), 10, 64) 140 if err != nil { 141 return -1, err 142 } 143 if system.IsProcessAlive(int(pid)) { 144 return int(pid), nil 145 } 146 } 147 148 return -1, nil 149 } 150 151 func (r *remote) getContainerdConfig() (string, error) { 152 path := filepath.Join(r.stateDir, configFile) 153 f, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600) 154 if err != nil { 155 return "", errors.Wrapf(err, "failed to open containerd config file at %s", path) 156 } 157 defer f.Close() 158 159 enc := toml.NewEncoder(f) 160 if err = enc.Encode(r.Config); err != nil { 161 return "", errors.Wrapf(err, "failed to encode general config") 162 } 163 if err = enc.Encode(r.pluginConfs); err != nil { 164 return "", errors.Wrapf(err, "failed to encode plugin configs") 165 } 166 167 return path, nil 168 } 169 170 func (r *remote) startContainerd() error { 171 pid, err := r.getContainerdPid() 172 if err != nil { 173 return err 174 } 175 176 if pid != -1 { 177 r.daemonPid = pid 178 logrus.WithField("pid", pid). 179 Infof("libcontainerd: %s is still running", binaryName) 180 return nil 181 } 182 183 configFile, err := r.getContainerdConfig() 184 if err != nil { 185 return err 186 } 187 188 args := []string{"--config", configFile} 189 190 if r.Debug.Level != "" { 191 args = append(args, "--log-level", r.Debug.Level) 192 } 193 194 cmd := exec.Command(binaryName, args...) 195 // redirect containerd logs to docker logs 196 cmd.Stdout = os.Stdout 197 cmd.Stderr = os.Stderr 198 cmd.SysProcAttr = containerdSysProcAttr() 199 // clear the NOTIFY_SOCKET from the env when starting containerd 200 cmd.Env = nil 201 for _, e := range os.Environ() { 202 if !strings.HasPrefix(e, "NOTIFY_SOCKET") { 203 cmd.Env = append(cmd.Env, e) 204 } 205 } 206 if err := cmd.Start(); err != nil { 207 return err 208 } 209 210 r.daemonWaitCh = make(chan struct{}) 211 go func() { 212 // Reap our child when needed 213 if err := cmd.Wait(); err != nil { 214 r.logger.WithError(err).Errorf("containerd did not exit successfully") 215 } 216 close(r.daemonWaitCh) 217 }() 218 219 r.daemonPid = cmd.Process.Pid 220 221 err = os.WriteFile(filepath.Join(r.stateDir, pidFile), []byte(fmt.Sprintf("%d", r.daemonPid)), 0660) 222 if err != nil { 223 system.KillProcess(r.daemonPid) 224 return errors.Wrap(err, "libcontainerd: failed to save daemon pid to disk") 225 } 226 227 logrus.WithField("pid", r.daemonPid). 228 Infof("libcontainerd: started new %s process", binaryName) 229 230 return nil 231 } 232 233 func (r *remote) monitorDaemon(ctx context.Context) { 234 var ( 235 transientFailureCount = 0 236 client *containerd.Client 237 err error 238 delay time.Duration 239 timer = time.NewTimer(0) 240 started bool 241 ) 242 243 defer func() { 244 if r.daemonPid != -1 { 245 r.stopDaemon() 246 } 247 248 // cleanup some files 249 os.Remove(filepath.Join(r.stateDir, pidFile)) 250 251 r.platformCleanup() 252 253 close(r.daemonStopCh) 254 timer.Stop() 255 }() 256 257 // ensure no races on sending to timer.C even though there is a 0 duration. 258 if !timer.Stop() { 259 <-timer.C 260 } 261 262 for { 263 timer.Reset(delay) 264 265 select { 266 case <-ctx.Done(): 267 r.logger.Info("stopping healthcheck following graceful shutdown") 268 if client != nil { 269 client.Close() 270 } 271 return 272 case <-timer.C: 273 } 274 275 if r.daemonPid == -1 { 276 if r.daemonWaitCh != nil { 277 select { 278 case <-ctx.Done(): 279 r.logger.Info("stopping containerd startup following graceful shutdown") 280 return 281 case <-r.daemonWaitCh: 282 } 283 } 284 285 os.RemoveAll(r.GRPC.Address) 286 if err := r.startContainerd(); err != nil { 287 if !started { 288 r.daemonStartCh <- err 289 return 290 } 291 r.logger.WithError(err).Error("failed restarting containerd") 292 delay = 50 * time.Millisecond 293 continue 294 } 295 296 client, err = containerd.New(r.GRPC.Address, containerd.WithTimeout(60*time.Second)) 297 if err != nil { 298 r.logger.WithError(err).Error("failed connecting to containerd") 299 delay = 100 * time.Millisecond 300 continue 301 } 302 logrus.WithField("address", r.GRPC.Address).Debug("Created containerd monitoring client") 303 } 304 305 if client != nil { 306 tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout) 307 _, err := client.IsServing(tctx) 308 cancel() 309 if err == nil { 310 if !started { 311 close(r.daemonStartCh) 312 started = true 313 } 314 315 transientFailureCount = 0 316 317 select { 318 case <-r.daemonWaitCh: 319 case <-ctx.Done(): 320 } 321 322 // Set a small delay in case there is a recurring failure (or bug in this code) 323 // to ensure we don't end up in a super tight loop. 324 delay = 500 * time.Millisecond 325 continue 326 } 327 328 r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding") 329 330 transientFailureCount++ 331 if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) { 332 delay = time.Duration(transientFailureCount) * 200 * time.Millisecond 333 continue 334 } 335 client.Close() 336 client = nil 337 } 338 339 if system.IsProcessAlive(r.daemonPid) { 340 r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd") 341 r.killDaemon() 342 } 343 344 r.daemonPid = -1 345 delay = 0 346 transientFailureCount = 0 347 } 348 }