github.com/Heebron/moby@v0.0.0-20221111184709-6eab4f55faf7/libcontainerd/supervisor/remote_daemon.go (about) 1 package supervisor // import "github.com/docker/docker/libcontainerd/supervisor" 2 3 import ( 4 "context" 5 "os" 6 "os/exec" 7 "path/filepath" 8 "runtime" 9 "strings" 10 "time" 11 12 "github.com/containerd/containerd" 13 "github.com/containerd/containerd/services/server/config" 14 "github.com/containerd/containerd/sys" 15 "github.com/docker/docker/pkg/pidfile" 16 "github.com/docker/docker/pkg/process" 17 "github.com/docker/docker/pkg/system" 18 "github.com/pelletier/go-toml" 19 "github.com/pkg/errors" 20 "github.com/sirupsen/logrus" 21 ) 22 23 const ( 24 maxConnectionRetryCount = 3 25 healthCheckTimeout = 3 * time.Second 26 shutdownTimeout = 15 * time.Second 27 startupTimeout = 15 * time.Second 28 configFile = "containerd.toml" 29 binaryName = "containerd" 30 pidFile = "containerd.pid" 31 ) 32 33 type remote struct { 34 config.Config 35 36 // configFile is the location where the generated containerd configuration 37 // file is saved. 38 configFile string 39 40 daemonPid int 41 pidFile string 42 logger *logrus.Entry 43 44 daemonWaitCh chan struct{} 45 daemonStartCh chan error 46 daemonStopCh chan struct{} 47 48 stateDir string 49 50 // oomScore adjusts the OOM score for the containerd process. 51 oomScore int 52 53 // logLevel overrides the containerd logging-level through the --log-level 54 // command-line option. 55 logLevel string 56 } 57 58 // Daemon represents a running containerd daemon 59 type Daemon interface { 60 WaitTimeout(time.Duration) error 61 Address() string 62 } 63 64 // DaemonOpt allows to configure parameters of container daemons 65 type DaemonOpt func(c *remote) error 66 67 // Start starts a containerd daemon and monitors it 68 func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Daemon, error) { 69 r := &remote{ 70 stateDir: stateDir, 71 Config: config.Config{ 72 Version: 2, 73 Root: filepath.Join(rootDir, "daemon"), 74 State: filepath.Join(stateDir, "daemon"), 75 }, 76 configFile: filepath.Join(stateDir, configFile), 77 daemonPid: -1, 78 pidFile: filepath.Join(stateDir, pidFile), 79 logger: logrus.WithField("module", "libcontainerd"), 80 daemonStartCh: make(chan error, 1), 81 daemonStopCh: make(chan struct{}), 82 } 83 84 for _, opt := range opts { 85 if err := opt(r); err != nil { 86 return nil, err 87 } 88 } 89 r.setDefaults() 90 91 if err := system.MkdirAll(stateDir, 0700); err != nil { 92 return nil, err 93 } 94 95 go r.monitorDaemon(ctx) 96 97 timeout := time.NewTimer(startupTimeout) 98 defer timeout.Stop() 99 100 select { 101 case <-timeout.C: 102 return nil, errors.New("timeout waiting for containerd to start") 103 case err := <-r.daemonStartCh: 104 if err != nil { 105 return nil, err 106 } 107 } 108 109 return r, nil 110 } 111 func (r *remote) WaitTimeout(d time.Duration) error { 112 timeout := time.NewTimer(d) 113 defer timeout.Stop() 114 115 select { 116 case <-timeout.C: 117 return errors.New("timeout waiting for containerd to stop") 118 case <-r.daemonStopCh: 119 } 120 121 return nil 122 } 123 124 func (r *remote) Address() string { 125 return r.GRPC.Address 126 } 127 128 func (r *remote) getContainerdConfig() (string, error) { 129 f, err := os.OpenFile(r.configFile, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600) 130 if err != nil { 131 return "", errors.Wrapf(err, "failed to open containerd config file (%s)", r.configFile) 132 } 133 defer f.Close() 134 135 if err := toml.NewEncoder(f).Encode(r); err != nil { 136 return "", errors.Wrapf(err, "failed to write containerd config file (%s)", r.configFile) 137 } 138 return r.configFile, nil 139 } 140 141 func (r *remote) startContainerd() error { 142 pid, err := pidfile.Read(r.pidFile) 143 if err != nil && !errors.Is(err, os.ErrNotExist) { 144 return err 145 } 146 147 if pid > 0 { 148 r.daemonPid = pid 149 r.logger.WithField("pid", pid).Infof("%s is still running", binaryName) 150 return nil 151 } 152 153 cfgFile, err := r.getContainerdConfig() 154 if err != nil { 155 return err 156 } 157 args := []string{"--config", cfgFile} 158 159 if r.logLevel != "" { 160 args = append(args, "--log-level", r.logLevel) 161 } 162 163 cmd := exec.Command(binaryName, args...) 164 // redirect containerd logs to docker logs 165 cmd.Stdout = os.Stdout 166 cmd.Stderr = os.Stderr 167 cmd.SysProcAttr = containerdSysProcAttr() 168 // clear the NOTIFY_SOCKET from the env when starting containerd 169 cmd.Env = nil 170 for _, e := range os.Environ() { 171 if !strings.HasPrefix(e, "NOTIFY_SOCKET") { 172 cmd.Env = append(cmd.Env, e) 173 } 174 } 175 176 startedCh := make(chan error) 177 go func() { 178 // On Linux, when cmd.SysProcAttr.Pdeathsig is set, 179 // the signal is sent to the subprocess when the creating thread 180 // terminates. The runtime terminates a thread if a goroutine 181 // exits while locked to it. Prevent the containerd process 182 // from getting killed prematurely by ensuring that the thread 183 // used to start it remains alive until it or the daemon process 184 // exits. See https://go.dev/issue/27505 for more details. 185 runtime.LockOSThread() 186 defer runtime.UnlockOSThread() 187 err := cmd.Start() 188 startedCh <- err 189 if err != nil { 190 return 191 } 192 193 r.daemonWaitCh = make(chan struct{}) 194 // Reap our child when needed 195 if err := cmd.Wait(); err != nil { 196 r.logger.WithError(err).Errorf("containerd did not exit successfully") 197 } 198 close(r.daemonWaitCh) 199 }() 200 if err := <-startedCh; err != nil { 201 return err 202 } 203 204 r.daemonPid = cmd.Process.Pid 205 206 if err := r.adjustOOMScore(); err != nil { 207 r.logger.WithError(err).Warn("failed to adjust OOM score") 208 } 209 210 err = pidfile.Write(r.pidFile, r.daemonPid) 211 if err != nil { 212 process.Kill(r.daemonPid) 213 return errors.Wrap(err, "libcontainerd: failed to save daemon pid to disk") 214 } 215 216 r.logger.WithField("pid", r.daemonPid).WithField("address", r.Address()).Infof("started new %s process", binaryName) 217 218 return nil 219 } 220 221 func (r *remote) adjustOOMScore() error { 222 if r.oomScore == 0 || r.daemonPid <= 1 { 223 // no score configured, or daemonPid contains an invalid PID (we don't 224 // expect containerd to be running as PID 1 :)). 225 return nil 226 } 227 if err := sys.SetOOMScore(r.daemonPid, r.oomScore); err != nil { 228 return errors.Wrap(err, "failed to adjust OOM score for containerd process") 229 } 230 return nil 231 } 232 233 func (r *remote) monitorDaemon(ctx context.Context) { 234 var ( 235 transientFailureCount = 0 236 client *containerd.Client 237 err error 238 delay time.Duration 239 timer = time.NewTimer(0) 240 started bool 241 ) 242 243 defer func() { 244 if r.daemonPid != -1 { 245 r.stopDaemon() 246 } 247 248 // cleanup some files 249 _ = os.Remove(r.pidFile) 250 251 r.platformCleanup() 252 253 close(r.daemonStopCh) 254 timer.Stop() 255 }() 256 257 // ensure no races on sending to timer.C even though there is a 0 duration. 258 if !timer.Stop() { 259 <-timer.C 260 } 261 262 for { 263 timer.Reset(delay) 264 265 select { 266 case <-ctx.Done(): 267 r.logger.Info("stopping healthcheck following graceful shutdown") 268 if client != nil { 269 client.Close() 270 } 271 return 272 case <-timer.C: 273 } 274 275 if r.daemonPid == -1 { 276 if r.daemonWaitCh != nil { 277 select { 278 case <-ctx.Done(): 279 r.logger.Info("stopping containerd startup following graceful shutdown") 280 return 281 case <-r.daemonWaitCh: 282 } 283 } 284 285 os.RemoveAll(r.GRPC.Address) 286 if err := r.startContainerd(); err != nil { 287 if !started { 288 r.daemonStartCh <- err 289 return 290 } 291 r.logger.WithError(err).Error("failed restarting containerd") 292 delay = 50 * time.Millisecond 293 continue 294 } 295 296 client, err = containerd.New(r.GRPC.Address, containerd.WithTimeout(60*time.Second)) 297 if err != nil { 298 r.logger.WithError(err).Error("failed connecting to containerd") 299 delay = 100 * time.Millisecond 300 continue 301 } 302 r.logger.WithField("address", r.GRPC.Address).Debug("created containerd monitoring client") 303 } 304 305 if client != nil { 306 tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout) 307 _, err := client.IsServing(tctx) 308 cancel() 309 if err == nil { 310 if !started { 311 close(r.daemonStartCh) 312 started = true 313 } 314 315 transientFailureCount = 0 316 317 select { 318 case <-r.daemonWaitCh: 319 case <-ctx.Done(): 320 } 321 322 // Set a small delay in case there is a recurring failure (or bug in this code) 323 // to ensure we don't end up in a super tight loop. 324 delay = 500 * time.Millisecond 325 continue 326 } 327 328 r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding") 329 330 transientFailureCount++ 331 if transientFailureCount < maxConnectionRetryCount || process.Alive(r.daemonPid) { 332 delay = time.Duration(transientFailureCount) * 200 * time.Millisecond 333 continue 334 } 335 client.Close() 336 client = nil 337 } 338 339 if process.Alive(r.daemonPid) { 340 r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd") 341 r.killDaemon() 342 } 343 344 r.daemonPid = -1 345 delay = 0 346 transientFailureCount = 0 347 } 348 }