github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/shim/service.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package shim implements the Containerd Shim v2 interface.
package shim

import (
	"context"
	"fmt"
	"io"
	"os"
	"os/exec"
	"path/filepath"
	"strings"
	"sync"
	"time"

	"github.com/BurntSushi/toml"
	"github.com/MerlinKodo/gvisor/pkg/cleanup"
	v14 "github.com/MerlinKodo/gvisor/pkg/shim/runtimeoptions/v14"
	"github.com/containerd/cgroups"
	cgroupsstats "github.com/containerd/cgroups/stats/v1"
	cgroupsv2 "github.com/containerd/cgroups/v2"
	"github.com/containerd/console"
	"github.com/containerd/containerd/api/events"
	"github.com/containerd/containerd/api/types/task"
	"github.com/containerd/containerd/errdefs"
	"github.com/containerd/containerd/log"
	"github.com/containerd/containerd/mount"
	"github.com/containerd/containerd/namespaces"
	"github.com/containerd/containerd/pkg/process"
	"github.com/containerd/containerd/pkg/stdio"
	"github.com/containerd/containerd/runtime"
	"github.com/containerd/containerd/runtime/linux/runctypes"
	"github.com/containerd/containerd/runtime/v2/shim"
	taskAPI "github.com/containerd/containerd/runtime/v2/task"
	"github.com/containerd/containerd/sys/reaper"
	"github.com/containerd/typeurl"
	"github.com/gogo/protobuf/types"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"

	"github.com/MerlinKodo/gvisor/pkg/shim/proc"
	"github.com/MerlinKodo/gvisor/pkg/shim/runsc"
	"github.com/MerlinKodo/gvisor/pkg/shim/runtimeoptions"
	"github.com/MerlinKodo/gvisor/pkg/shim/utils"
	"github.com/MerlinKodo/gvisor/runsc/specutils"
)

var (
	empty   = &types.Empty{}
	bufPool = sync.Pool{
		New: func() any {
			buffer := make([]byte, 32<<10)
			return &buffer
		},
	}
)

const (
	// configFile is the default config file name. For containerd 1.2,
	// we assume that a config.toml should exist in the runtime root.
	configFile = "config.toml"

	// shimAddressPath is the relative path to a file that contains the address
	// to the shim UDS. See service.shimAddress.
	shimAddressPath = "address"

	cgroupParentAnnotation = "dev.gvisor.spec.cgroup-parent"
)

type oomPoller interface {
	io.Closer
	// add adds the `cg` cgroup to the oom poller. `cg` is cgroups.Cgroup in v1
	// and `cgroupsv2.Manager` in v2.
	add(id string, cg any) error
	// run monitors OOM events and notifies the shim about them.
	run(ctx context.Context)
}

// New returns a new shim service that can be used via GRPC.
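// It also starts the OOM poller and the goroutines that forward process exits
// and task events to containerd.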
func New(ctx context.Context, id string, publisher shim.Publisher, cancel func()) (shim.Shim, error) {
	var opts shim.Opts
	if ctxOpts := ctx.Value(shim.OptsKey{}); ctxOpts != nil {
		opts = ctxOpts.(shim.Opts)
	}

	var (
		ep  oomPoller
		err error
	)
	if cgroups.Mode() == cgroups.Unified {
		ep, err = newOOMv2Poller(publisher)
	} else {
		ep, err = newOOMEpoller(publisher)
	}
	if err != nil {
		return nil, err
	}
	go ep.run(ctx)
	s := &service{
		id:             id,
		processes:      make(map[string]process.Process),
		events:         make(chan any, 128),
		ec:             proc.ExitCh,
		oomPoller:      ep,
		cancel:         cancel,
		genericOptions: opts,
	}
	go s.processExits(ctx)
	runsc.Monitor = &runsc.LogMonitor{Next: reaper.Default}
	if err := s.initPlatform(); err != nil {
		cancel()
		return nil, fmt.Errorf("failed to initialize platform behavior: %w", err)
	}
	go s.forward(ctx, publisher)

	if address, err := shim.ReadAddress(shimAddressPath); err == nil {
		s.shimAddress = address
	}

	return s, nil
}

// service is the shim implementation of a remote shim over GRPC. It runs in 2
// different modes:
//  1. Service: process runs for the lifetime of the container and receives
//     calls described in the shimapi.TaskService interface.
//  2. Tool: process is short-lived and runs only to perform the requested
//     operations and then exits. It implements the direct functions in the
//     shim.Shim interface.
//
// When the service is running, it saves a json file with state information so
// that commands sent to the tool can load the state and perform the operation.
type service struct {
	mu sync.Mutex

	// id is the container ID.
	id string

	// bundle is a path provided by the caller on container creation. Stored
	// because it's needed in commands that don't receive the bundle in the
	// request.
	bundle string

	// task is the main process that is running the container.
	task *proc.Init

	// processes maps ExecId to processes running through exec.
	processes map[string]process.Process

	// events is the channel used to forward task events to containerd.
	events chan any

	// platform handles operations related to the console.
	platform stdio.Platform

	// genericOptions are options that come from the shim interface and are
	// common to all shims.
	genericOptions shim.Opts

	// opts are configuration options specific for this shim.
	opts options

	// ec gets notified whenever the container init process or an exec'd
	// process exits from inside the sandbox.
	ec chan proc.Exit

	// oomPoller monitors the sandbox's cgroup for OOM notifications.
	oomPoller oomPoller

	// cancel is a function that needs to be called before the shim stops. The
	// function is provided by the caller to New().
	cancel func()

	// shimAddress is the location of the UDS used to communicate to containerd.
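	// It is read back from the file at shimAddressPath so that shutdown() can
	// remove the socket when the shim exits.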
	shimAddress string
}

var _ shim.Shim = (*service)(nil)

func (s *service) newCommand(ctx context.Context, containerdBinary, containerdAddress string) (*exec.Cmd, error) {
	ns, err := namespaces.NamespaceRequired(ctx)
	if err != nil {
		return nil, err
	}
	self, err := os.Executable()
	if err != nil {
		return nil, err
	}
	cwd, err := os.Getwd()
	if err != nil {
		return nil, err
	}
	args := []string{
		"-namespace", ns,
		"-address", containerdAddress,
		"-publish-binary", containerdBinary,
	}
	if s.genericOptions.Debug {
		args = append(args, "-debug")
	}
	cmd := exec.Command(self, args...)
	cmd.Dir = cwd
	cmd.Env = append(os.Environ(), "GOMAXPROCS=2")
	cmd.SysProcAttr = &unix.SysProcAttr{
		Setpgid: true,
	}
	return cmd, nil
}

func (s *service) StartShim(ctx context.Context, id, containerdBinary, containerdAddress, containerdTTRPCAddress string) (string, error) {
	log.L.Debugf("StartShim, id: %s, binary: %q, address: %q", id, containerdBinary, containerdAddress)

	cmd, err := s.newCommand(ctx, containerdBinary, containerdAddress)
	if err != nil {
		return "", err
	}
	address, err := shim.SocketAddress(ctx, containerdAddress, id)
	if err != nil {
		return "", err
	}
	socket, err := shim.NewSocket(address)
	if err != nil {
		// The only time where this would happen is if there is a bug and the
		// socket was not cleaned up in the cleanup method of the shim or we
		// are using the grouping functionality where the new process should
		// be run with the same shim as an existing container.
		if !shim.SocketEaddrinuse(err) {
			return "", fmt.Errorf("create new shim socket: %w", err)
		}
		if shim.CanConnect(address) {
			if err := shim.WriteAddress(shimAddressPath, address); err != nil {
				return "", fmt.Errorf("write existing socket for shim: %w", err)
			}
			return address, nil
		}
		if err := shim.RemoveSocket(address); err != nil {
			return "", fmt.Errorf("remove pre-existing socket: %w", err)
		}
		if socket, err = shim.NewSocket(address); err != nil {
			return "", fmt.Errorf("try create new shim socket 2x: %w", err)
		}
	}
	cu := cleanup.Make(func() {
		socket.Close()
		_ = shim.RemoveSocket(address)
	})
	defer cu.Clean()

	f, err := socket.File()
	if err != nil {
		return "", err
	}

	cmd.ExtraFiles = append(cmd.ExtraFiles, f)

	log.L.Debugf("Executing: %q %s", cmd.Path, cmd.Args)
	if err := cmd.Start(); err != nil {
		f.Close()
		return "", err
	}
	cu.Add(func() { cmd.Process.Kill() })

	// make sure to wait after start
	go cmd.Wait()
	if err := shim.WritePidFile("shim.pid", cmd.Process.Pid); err != nil {
		return "", err
	}
	if err := shim.WriteAddress(shimAddressPath, address); err != nil {
		return "", err
	}
	if err := shim.SetScore(cmd.Process.Pid); err != nil {
		return "", fmt.Errorf("failed to set OOM Score on shim: %w", err)
	}
	cu.Release()
	return address, nil
}

// Cleanup is called from another process (need to reload state) to stop the
// container and undo all operations done in Create().
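// Since it runs in a separate process, the state saved by create() is loaded
// from the bundle directory (the current working directory) to find the
// rootfs mounts and runsc options to tear down.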
func (s *service) Cleanup(ctx context.Context) (*taskAPI.DeleteResponse, error) {
	log.L.Debugf("Cleanup")

	path, err := os.Getwd()
	if err != nil {
		return nil, err
	}
	ns, err := namespaces.NamespaceRequired(ctx)
	if err != nil {
		return nil, err
	}
	var st state
	if err := st.load(path); err != nil {
		return nil, err
	}
	r := proc.NewRunsc(s.opts.Root, path, ns, st.Options.BinaryName, nil, nil)

	if err := r.Delete(ctx, s.id, &runsc.DeleteOpts{
		Force: true,
	}); err != nil {
		log.L.Infof("failed to remove runc container: %v", err)
	}
	if err := mount.UnmountAll(st.Rootfs, 0); err != nil {
		log.L.Infof("failed to cleanup rootfs mount: %v", err)
	}
	return &taskAPI.DeleteResponse{
		ExitedAt:   time.Now(),
		ExitStatus: 128 + uint32(unix.SIGKILL),
	}, nil
}

// Create creates a new initial process and container with the underlying OCI
// runtime.
func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (*taskAPI.CreateTaskResponse, error) {
	resp, err := s.create(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) create(ctx context.Context, r *taskAPI.CreateTaskRequest) (*taskAPI.CreateTaskResponse, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Save the main task id and bundle to the shim for additional requests.
	s.id = r.ID
	s.bundle = r.Bundle

	ns, err := namespaces.NamespaceRequired(ctx)
	if err != nil {
		return nil, fmt.Errorf("create namespace: %w", err)
	}

	// Read from root for now.
	if r.Options != nil {
		v, err := typeurl.UnmarshalAny(r.Options)
		if err != nil {
			return nil, err
		}
		var path string
		switch o := v.(type) {
		case *runctypes.CreateOptions: // containerd 1.2.x
			s.opts.IoUID = o.IoUid
			s.opts.IoGID = o.IoGid
			s.opts.ShimCgroup = o.ShimCgroup
		case *runctypes.RuncOptions: // containerd 1.2.x
			root := proc.RunscRoot
			if o.RuntimeRoot != "" {
				root = o.RuntimeRoot
			}

			s.opts.BinaryName = o.Runtime

			path = filepath.Join(root, configFile)
			if _, err := os.Stat(path); err != nil {
				if !os.IsNotExist(err) {
					return nil, fmt.Errorf("stat config file %q: %w", path, err)
				}
				// A config file in runtime root is not required.
				path = ""
			}
		case *runtimeoptions.Options: // containerd 1.5+
			if o.ConfigPath == "" {
				break
			}
			if o.TypeUrl != optionsType {
				return nil, fmt.Errorf("unsupported option type %q", o.TypeUrl)
			}
			path = o.ConfigPath
		case *v14.Options: // containerd 1.4-
			if o.ConfigPath == "" {
				break
			}
			if o.TypeUrl != optionsType {
				return nil, fmt.Errorf("unsupported option type %q", o.TypeUrl)
			}
			path = o.ConfigPath
		default:
			return nil, fmt.Errorf("unsupported option type %q", r.Options.TypeUrl)
		}
		if path != "" {
			if _, err = toml.DecodeFile(path, &s.opts); err != nil {
				return nil, fmt.Errorf("decode config file %q: %w", path, err)
			}
		}
	}

	if len(s.opts.LogLevel) != 0 {
		lvl, err := logrus.ParseLevel(s.opts.LogLevel)
		if err != nil {
			return nil, err
		}
		logrus.SetLevel(lvl)
	}
	for _, emittedPath := range runsc.EmittedPaths(s.id, s.opts.RunscConfig) {
		if err := os.MkdirAll(filepath.Dir(emittedPath), 0777); err != nil {
			return nil, fmt.Errorf("failed to create parent directories for file %v: %w", emittedPath, err)
		}
	}
	if len(s.opts.LogPath) != 0 {
		logPath := runsc.FormatShimLogPath(s.opts.LogPath, s.id)
		if err := os.MkdirAll(filepath.Dir(logPath), 0777); err != nil {
			return nil, fmt.Errorf("failed to create log dir: %w", err)
		}
		logFile, err := os.Create(logPath)
		if err != nil {
			return nil, fmt.Errorf("failed to create log file: %w", err)
		}
		log.L.Debugf("Starting mirror log at %q", logPath)
		std := logrus.StandardLogger()
		std.SetOutput(io.MultiWriter(std.Out, logFile))

		log.L.Debugf("Create shim")
		log.L.Debugf("***************************")
		log.L.Debugf("Args: %s", os.Args)
		log.L.Debugf("PID: %d", os.Getpid())
		log.L.Debugf("ID: %s", s.id)
		log.L.Debugf("Options: %+v", s.opts)
		log.L.Debugf("Bundle: %s", r.Bundle)
		log.L.Debugf("Terminal: %t", r.Terminal)
		log.L.Debugf("stdin: %s", r.Stdin)
		log.L.Debugf("stdout: %s", r.Stdout)
		log.L.Debugf("stderr: %s", r.Stderr)
		log.L.Debugf("***************************")
		if log.L.Logger.IsLevelEnabled(logrus.DebugLevel) {
			setDebugSigHandler()
		}
	}

	// Save state before any action is taken to ensure Cleanup() will have all
	// the information it needs to undo the operations.
	st := state{
		Rootfs:  filepath.Join(r.Bundle, "rootfs"),
		Options: s.opts,
	}
	if err := st.save(r.Bundle); err != nil {
		return nil, err
	}

	if err := os.Mkdir(st.Rootfs, 0711); err != nil && !os.IsExist(err) {
		return nil, err
	}

	// Convert from types.Mount to proc.Mount.
	var mounts []proc.Mount
	for _, m := range r.Rootfs {
		mounts = append(mounts, proc.Mount{
			Type:    m.Type,
			Source:  m.Source,
			Target:  m.Target,
			Options: m.Options,
		})
	}

	// Cleans up all mounts in case of failure.
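	// The cleanup is only released once the sandbox has been created and the
	// OOM poller registered below.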
	cu := cleanup.Make(func() {
		if err := mount.UnmountAll(st.Rootfs, 0); err != nil {
			log.L.Infof("failed to cleanup rootfs mount: %v", err)
		}
	})
	defer cu.Clean()
	for _, rm := range mounts {
		m := &mount.Mount{
			Type:    rm.Type,
			Source:  rm.Source,
			Options: rm.Options,
		}
		if err := m.Mount(st.Rootfs); err != nil {
			return nil, fmt.Errorf("failed to mount rootfs component %v: %w", m, err)
		}
	}

	config := &proc.CreateConfig{
		ID:       r.ID,
		Bundle:   r.Bundle,
		Runtime:  s.opts.BinaryName,
		Rootfs:   mounts,
		Terminal: r.Terminal,
		Stdin:    r.Stdin,
		Stdout:   r.Stdout,
		Stderr:   r.Stderr,
	}
	process, err := newInit(r.Bundle, filepath.Join(r.Bundle, "work"), ns, s.platform, config, &s.opts, st.Rootfs)
	if err != nil {
		return nil, err
	}
	if err := process.Create(ctx, config); err != nil {
		return nil, err
	}

	// Set up OOM notification on the sandbox's cgroup. This is done on
	// sandbox create since the sandbox process will be created here.
	pid := process.Pid()
	if pid > 0 {
		var (
			cg  any
			err error
		)
		if cgroups.Mode() == cgroups.Unified {
			var cgPath string
			cgPath, err = cgroupsv2.PidGroupPath(pid)
			if err == nil {
				cg, err = cgroupsv2.LoadManager("/sys/fs/cgroup", cgPath)
			}
		} else {
			cg, err = cgroups.Load(cgroups.V1, cgroups.PidPath(pid))
		}
		if err != nil {
			return nil, fmt.Errorf("loading cgroup for %d: %w", pid, err)
		}
		if err := s.oomPoller.add(s.id, cg); err != nil {
			return nil, fmt.Errorf("add cg to OOM monitor: %w", err)
		}
	}

	// Success
	cu.Release()
	s.task = process
	return &taskAPI.CreateTaskResponse{
		Pid: uint32(process.Pid()),
	}, nil
}

// Start starts a process.
func (s *service) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.StartResponse, error) {
	resp, err := s.start(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.StartResponse, error) {
	log.L.Debugf("Start, id: %s, execID: %s", r.ID, r.ExecID)

	p, err := s.getProcess(r.ExecID)
	if err != nil {
		return nil, err
	}
	if err := p.Start(ctx); err != nil {
		return nil, err
	}
	// TODO: Set the cgroup and oom notifications on restore.
	// https://github.com/google/gvisor-containerd-shim/issues/58
	return &taskAPI.StartResponse{
		Pid: uint32(p.Pid()),
	}, nil
}

// Delete deletes the initial process and container.
func (s *service) Delete(ctx context.Context, r *taskAPI.DeleteRequest) (*taskAPI.DeleteResponse, error) {
	resp, err := s.delete(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) delete(ctx context.Context, r *taskAPI.DeleteRequest) (*taskAPI.DeleteResponse, error) {
	log.L.Debugf("Delete, id: %s, execID: %s", r.ID, r.ExecID)

	p, err := s.getProcess(r.ExecID)
	if err != nil {
		return nil, err
	}
	if err := p.Delete(ctx); err != nil {
		return nil, err
	}
	if len(r.ExecID) != 0 {
		s.mu.Lock()
		delete(s.processes, r.ExecID)
		s.mu.Unlock()
	} else if s.platform != nil {
		s.platform.Close()
	}
	return &taskAPI.DeleteResponse{
		ExitStatus: uint32(p.ExitStatus()),
		ExitedAt:   p.ExitedAt(),
		Pid:        uint32(p.Pid()),
	}, nil
}

// Exec spawns an additional process inside the container.
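// It returns an AlreadyExists error if the exec ID is already in use, and a
// FailedPrecondition error if the container has not been created yet.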
func (s *service) Exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*types.Empty, error) {
	resp, err := s.exec(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*types.Empty, error) {
	log.L.Debugf("Exec, id: %s, execID: %s", r.ID, r.ExecID)

	s.mu.Lock()
	p := s.processes[r.ExecID]
	s.mu.Unlock()
	if p != nil {
		return nil, errdefs.ToGRPCf(errdefs.ErrAlreadyExists, "id %s", r.ExecID)
	}
	if s.task == nil {
		return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
	}
	process, err := s.task.Exec(ctx, s.bundle, &proc.ExecConfig{
		ID:       r.ExecID,
		Terminal: r.Terminal,
		Stdin:    r.Stdin,
		Stdout:   r.Stdout,
		Stderr:   r.Stderr,
		Spec:     r.Spec,
	})
	if err != nil {
		return nil, err
	}
	s.mu.Lock()
	s.processes[r.ExecID] = process
	s.mu.Unlock()
	return empty, nil
}

// ResizePty resizes the terminal of a process.
func (s *service) ResizePty(ctx context.Context, r *taskAPI.ResizePtyRequest) (*types.Empty, error) {
	resp, err := s.resizePty(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) resizePty(ctx context.Context, r *taskAPI.ResizePtyRequest) (*types.Empty, error) {
	log.L.Debugf("ResizePty, id: %s, execID: %s, dimension: %dx%d", r.ID, r.ExecID, r.Height, r.Width)

	p, err := s.getProcess(r.ExecID)
	if err != nil {
		return nil, err
	}
	ws := console.WinSize{
		Width:  uint16(r.Width),
		Height: uint16(r.Height),
	}
	if err := p.Resize(ws); err != nil {
		return nil, err
	}
	return empty, nil
}

// State returns runtime state information for a process.
func (s *service) State(ctx context.Context, r *taskAPI.StateRequest) (*taskAPI.StateResponse, error) {
	resp, err := s.state(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) state(ctx context.Context, r *taskAPI.StateRequest) (*taskAPI.StateResponse, error) {
	log.L.Debugf("State, id: %s, execID: %s", r.ID, r.ExecID)

	p, err := s.getProcess(r.ExecID)
	if err != nil {
		log.L.Debugf("State failed to find process: %v", err)
		return nil, err
	}
	st, err := p.Status(ctx)
	if err != nil {
		log.L.Debugf("State failed: %v", err)
		return nil, err
	}
	status := task.StatusUnknown
	switch st {
	case "created":
		status = task.StatusCreated
	case "running":
		status = task.StatusRunning
	case "stopped":
		status = task.StatusStopped
	}
	sio := p.Stdio()
	res := &taskAPI.StateResponse{
		ID:         p.ID(),
		Bundle:     s.bundle,
		Pid:        uint32(p.Pid()),
		Status:     status,
		Stdin:      sio.Stdin,
		Stdout:     sio.Stdout,
		Stderr:     sio.Stderr,
		Terminal:   sio.Terminal,
		ExitStatus: uint32(p.ExitStatus()),
		ExitedAt:   p.ExitedAt(),
	}
	log.L.Debugf("State succeeded, response: %+v", res)
	return res, nil
}

// Pause the container.
func (s *service) Pause(ctx context.Context, r *taskAPI.PauseRequest) (*types.Empty, error) {
	resp, err := s.pause(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) pause(ctx context.Context, r *taskAPI.PauseRequest) (*types.Empty, error) {
	log.L.Debugf("Pause, id: %s", r.ID)
	if s.task == nil {
		log.L.Debugf("Pause error, id: %s: container not created", r.ID)
		return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
	}
	err := s.task.Runtime().Pause(ctx, r.ID)
	if err != nil {
		return nil, err
	}
	return empty, nil
}

// Resume the container.
func (s *service) Resume(ctx context.Context, r *taskAPI.ResumeRequest) (*types.Empty, error) {
	resp, err := s.resume(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) resume(ctx context.Context, r *taskAPI.ResumeRequest) (*types.Empty, error) {
	log.L.Debugf("Resume, id: %s", r.ID)
	if s.task == nil {
		log.L.Debugf("Resume error, id: %s: container not created", r.ID)
		return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
	}
	err := s.task.Runtime().Resume(ctx, r.ID)
	if err != nil {
		return nil, err
	}
	return empty, nil
}

// Kill a process with the provided signal.
func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (*types.Empty, error) {
	resp, err := s.kill(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) kill(ctx context.Context, r *taskAPI.KillRequest) (*types.Empty, error) {
	log.L.Debugf("Kill, id: %s, execID: %s, signal: %d, all: %t", r.ID, r.ExecID, r.Signal, r.All)

	p, err := s.getProcess(r.ExecID)
	if err != nil {
		return nil, err
	}
	if err := p.Kill(ctx, r.Signal, r.All); err != nil {
		log.L.Debugf("Kill failed: %v", err)
		return nil, err
	}
	log.L.Debugf("Kill succeeded")
	return empty, nil
}

// Pids returns all pids inside the container.
func (s *service) Pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.PidsResponse, error) {
	resp, err := s.pids(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.PidsResponse, error) {
	log.L.Debugf("Pids, id: %s", r.ID)

	pids, err := s.getContainerPids(ctx, r.ID)
	if err != nil {
		return nil, err
	}
	var processes []*task.ProcessInfo
	for _, pid := range pids {
		pInfo := task.ProcessInfo{
			Pid: pid,
		}
		for _, p := range s.processes {
			if p.Pid() == int(pid) {
				d := &runctypes.ProcessDetails{
					ExecID: p.ID(),
				}
				a, err := typeurl.MarshalAny(d)
				if err != nil {
					return nil, fmt.Errorf("failed to marshal process %d info: %w", pid, err)
				}
				pInfo.Info = a
				break
			}
		}
		processes = append(processes, &pInfo)
	}
	return &taskAPI.PidsResponse{
		Processes: processes,
	}, nil
}

// CloseIO closes the I/O context of a process.
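// Note that only stdin is closed; the request's Stdin flag is logged but not
// otherwise consulted.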
func (s *service) CloseIO(ctx context.Context, r *taskAPI.CloseIORequest) (*types.Empty, error) {
	resp, err := s.closeIO(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) closeIO(ctx context.Context, r *taskAPI.CloseIORequest) (*types.Empty, error) {
	log.L.Debugf("CloseIO, id: %s, execID: %s, stdin: %t", r.ID, r.ExecID, r.Stdin)

	p, err := s.getProcess(r.ExecID)
	if err != nil {
		return nil, err
	}
	if stdin := p.Stdin(); stdin != nil {
		if err := stdin.Close(); err != nil {
			return nil, fmt.Errorf("close stdin: %w", err)
		}
	}
	return empty, nil
}

// Checkpoint checkpoints the container.
func (s *service) Checkpoint(ctx context.Context, r *taskAPI.CheckpointTaskRequest) (*types.Empty, error) {
	log.L.Debugf("Checkpoint, id: %s", r.ID)
	return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented)
}

// Connect returns shim information such as the shim's pid.
func (s *service) Connect(ctx context.Context, r *taskAPI.ConnectRequest) (*taskAPI.ConnectResponse, error) {
	resp, err := s.connect(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) connect(ctx context.Context, r *taskAPI.ConnectRequest) (*taskAPI.ConnectResponse, error) {
	log.L.Debugf("Connect, id: %s", r.ID)

	var pid int
	if s.task != nil {
		pid = s.task.Pid()
	}
	return &taskAPI.ConnectResponse{
		ShimPid: uint32(os.Getpid()),
		TaskPid: uint32(pid),
	}, nil
}

func (s *service) Shutdown(ctx context.Context, r *taskAPI.ShutdownRequest) (*types.Empty, error) {
	resp, err := s.shutdown(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) shutdown(ctx context.Context, r *taskAPI.ShutdownRequest) (*types.Empty, error) {
	log.L.Debugf("Shutdown, id: %s", r.ID)
	s.cancel()
	if s.shimAddress != "" {
		_ = shim.RemoveSocket(s.shimAddress)
	}
	os.Exit(0)
	panic("Should not get here")
}

func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) {
	resp, err := s.stats(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) {
	log.L.Debugf("Stats, id: %s", r.ID)
	if s.task == nil {
		log.L.Debugf("Stats error, id: %s: container not created", r.ID)
		return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
	}
	stats, err := s.task.Stats(ctx, s.id)
	if err != nil {
		log.L.Debugf("Stats error, id: %s: %v", r.ID, err)
		return nil, err
	}

	// gvisor currently (as of 2020-03-03) only returns the total memory
	// usage and current PID value[0]. However, we copy the common fields here
	// so that future updates will propagate correct information. We're
	// using the cgroups.Metrics structure so we're returning the same type
	// as runc.
	//
	// [0]: https://github.com/google/gvisor/blob/277a0d5a1fbe8272d4729c01ee4c6e374d047ebc/runsc/boot/events.go#L61-L81
	metrics := &cgroupsstats.Metrics{
		CPU: &cgroupsstats.CPUStat{
			Usage: &cgroupsstats.CPUUsage{
				Total:  stats.Cpu.Usage.Total,
				Kernel: stats.Cpu.Usage.Kernel,
				User:   stats.Cpu.Usage.User,
				PerCPU: stats.Cpu.Usage.Percpu,
			},
			Throttling: &cgroupsstats.Throttle{
				Periods:          stats.Cpu.Throttling.Periods,
				ThrottledPeriods: stats.Cpu.Throttling.ThrottledPeriods,
				ThrottledTime:    stats.Cpu.Throttling.ThrottledTime,
			},
		},
		Memory: &cgroupsstats.MemoryStat{
			Cache: stats.Memory.Cache,
			Usage: &cgroupsstats.MemoryEntry{
				Limit:   stats.Memory.Usage.Limit,
				Usage:   stats.Memory.Usage.Usage,
				Max:     stats.Memory.Usage.Max,
				Failcnt: stats.Memory.Usage.Failcnt,
			},
			Swap: &cgroupsstats.MemoryEntry{
				Limit:   stats.Memory.Swap.Limit,
				Usage:   stats.Memory.Swap.Usage,
				Max:     stats.Memory.Swap.Max,
				Failcnt: stats.Memory.Swap.Failcnt,
			},
			Kernel: &cgroupsstats.MemoryEntry{
				Limit:   stats.Memory.Kernel.Limit,
				Usage:   stats.Memory.Kernel.Usage,
				Max:     stats.Memory.Kernel.Max,
				Failcnt: stats.Memory.Kernel.Failcnt,
			},
			KernelTCP: &cgroupsstats.MemoryEntry{
				Limit:   stats.Memory.KernelTCP.Limit,
				Usage:   stats.Memory.KernelTCP.Usage,
				Max:     stats.Memory.KernelTCP.Max,
				Failcnt: stats.Memory.KernelTCP.Failcnt,
			},
		},
		Pids: &cgroupsstats.PidsStat{
			Current: stats.Pids.Current,
			Limit:   stats.Pids.Limit,
		},
	}
	data, err := typeurl.MarshalAny(metrics)
	if err != nil {
		log.L.Debugf("Stats error, id: %s: %v", r.ID, err)
		return nil, err
	}
	log.L.Debugf("Stats success, id: %s: %+v", r.ID, data)
	return &taskAPI.StatsResponse{
		Stats: data,
	}, nil
}

// Update updates a running container.
func (s *service) Update(ctx context.Context, r *taskAPI.UpdateTaskRequest) (*types.Empty, error) {
	return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented)
}

// Wait waits for a process to exit.
func (s *service) Wait(ctx context.Context, r *taskAPI.WaitRequest) (*taskAPI.WaitResponse, error) {
	resp, err := s.wait(ctx, r)
	return resp, errdefs.ToGRPC(err)
}

func (s *service) wait(ctx context.Context, r *taskAPI.WaitRequest) (*taskAPI.WaitResponse, error) {
	log.L.Debugf("Wait, id: %s, execID: %s", r.ID, r.ExecID)

	p, err := s.getProcess(r.ExecID)
	if err != nil {
		log.L.Debugf("Wait failed to find process: %v", err)
		return nil, err
	}
	p.Wait()

	res := &taskAPI.WaitResponse{
		ExitStatus: uint32(p.ExitStatus()),
		ExitedAt:   p.ExitedAt(),
	}
	log.L.Debugf("Wait succeeded, response: %+v", res)
	return res, nil
}

func (s *service) processExits(ctx context.Context) {
	for e := range s.ec {
		s.checkProcesses(ctx, e)
	}
}

func (s *service) checkProcesses(ctx context.Context, e proc.Exit) {
	// TODO(random-liu): Add `shouldKillAll` logic if container pid
	// namespace is supported.
	for _, p := range s.allProcesses() {
		if p.ID() == e.ID {
			if ip, ok := p.(*proc.Init); ok {
				// Ensure all children are killed.
				log.L.Debugf("Container init process exited, killing all container processes")
				ip.KillAll(ctx)
			}
			p.SetExited(e.Status)
			s.events <- &events.TaskExit{
				ContainerID: s.id,
				ID:          p.ID(),
				Pid:         uint32(p.Pid()),
				ExitStatus:  uint32(e.Status),
				ExitedAt:    p.ExitedAt(),
			}
			return
		}
	}
}

func (s *service) allProcesses() (o []process.Process) {
	s.mu.Lock()
	defer s.mu.Unlock()
	for _, p := range s.processes {
		o = append(o, p)
	}
	if s.task != nil {
		o = append(o, s.task)
	}
	return o
}

func (s *service) getContainerPids(ctx context.Context, id string) ([]uint32, error) {
	s.mu.Lock()
	p := s.task
	s.mu.Unlock()
	if p == nil {
		return nil, fmt.Errorf("container must be created: %w", errdefs.ErrFailedPrecondition)
	}
	ps, err := p.Runtime().Ps(ctx, id)
	if err != nil {
		return nil, err
	}
	pids := make([]uint32, 0, len(ps))
	for _, pid := range ps {
		pids = append(pids, uint32(pid))
	}
	return pids, nil
}

func (s *service) forward(ctx context.Context, publisher shim.Publisher) {
	for e := range s.events {
		err := publisher.Publish(ctx, getTopic(e), e)
		if err != nil {
			// Should not happen.
			panic(fmt.Errorf("post event: %w", err))
		}
	}
}

func (s *service) getProcess(execID string) (process.Process, error) {
	s.mu.Lock()
	defer s.mu.Unlock()

	if execID == "" {
		if s.task == nil {
			return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
		}
		return s.task, nil
	}

	p := s.processes[execID]
	if p == nil {
		return nil, errdefs.ToGRPCf(errdefs.ErrNotFound, "process does not exist %s", execID)
	}
	return p, nil
}

func getTopic(e any) string {
	switch e.(type) {
	case *events.TaskCreate:
		return runtime.TaskCreateEventTopic
	case *events.TaskStart:
		return runtime.TaskStartEventTopic
	case *events.TaskOOM:
		return runtime.TaskOOMEventTopic
	case *events.TaskExit:
		return runtime.TaskExitEventTopic
	case *events.TaskDelete:
		return runtime.TaskDeleteEventTopic
	case *events.TaskExecAdded:
		return runtime.TaskExecAddedEventTopic
	case *events.TaskExecStarted:
		return runtime.TaskExecStartedEventTopic
	default:
		log.L.Infof("no topic for type %#v", e)
	}
	return runtime.TaskUnknownTopic
}

func newInit(path, workDir, namespace string, platform stdio.Platform, r *proc.CreateConfig, options *options, rootfs string) (*proc.Init, error) {
	spec, err := utils.ReadSpec(r.Bundle)
	if err != nil {
		return nil, fmt.Errorf("read oci spec: %w", err)
	}

	updated, err := utils.UpdateVolumeAnnotations(spec)
	if err != nil {
		return nil, fmt.Errorf("update volume annotations: %w", err)
	}
	updated = setPodCgroup(spec) || updated

	if updated {
		if err := utils.WriteSpec(r.Bundle, spec); err != nil {
			return nil, err
		}
	}

	runsc.FormatRunscPaths(r.ID, options.RunscConfig)
	runtime := proc.NewRunsc(options.Root, path, namespace, options.BinaryName, options.RunscConfig, spec)
	p := proc.New(r.ID, runtime, stdio.Stdio{
		Stdin:    r.Stdin,
		Stdout:   r.Stdout,
		Stderr:   r.Stderr,
		Terminal: r.Terminal,
	})
	p.Bundle = r.Bundle
	p.Platform = platform
	p.Rootfs = rootfs
	p.WorkDir = workDir
	p.IoUID = int(options.IoUID)
	p.IoGID = int(options.IoGID)
	p.Sandbox = specutils.SpecContainerType(spec) == specutils.ContainerTypeSandbox
	p.UserLog = utils.UserLogPath(spec)
	p.Monitor = reaper.Default
	return p, nil
}

// setPodCgroup searches for the pod cgroup path inside the container's cgroup
// path. If found, it's set as an annotation in the spec. This is done so that
// the sandbox joins the pod cgroup. Otherwise, the sandbox would join the pause
// container cgroup. Returns true if the spec was modified. Ex.:
// /kubepods/burstable/pod123/container123 => /kubepods/burstable/pod123
func setPodCgroup(spec *specs.Spec) bool {
	if !utils.IsSandbox(spec) {
		return false
	}
	if spec.Linux == nil || len(spec.Linux.CgroupsPath) == 0 {
		return false
	}

	// Search backwards for the pod cgroup path to make the sandbox use it,
	// instead of the pause container's cgroup.
	parts := strings.Split(spec.Linux.CgroupsPath, string(filepath.Separator))
	for i := len(parts) - 1; i >= 0; i-- {
		if strings.HasPrefix(parts[i], "pod") {
			var path string
			for j := 0; j <= i; j++ {
				path = filepath.Join(path, parts[j])
			}
			// Add back the initial '/' that may have been lost above.
			if filepath.IsAbs(spec.Linux.CgroupsPath) {
				path = string(filepath.Separator) + path
			}
			if spec.Linux.CgroupsPath == path {
				return false
			}
			if spec.Annotations == nil {
				spec.Annotations = make(map[string]string)
			}
			spec.Annotations[cgroupParentAnnotation] = path
			return true
		}
	}
	return false
}