github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/shim/service.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     https://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package shim implements the Containerd Shim v2 interface.
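        //
        // A containerd installation typically selects this shim through a runtime
        // entry similar to the following sketch (the section names and the runtime
        // type are assumptions based on the upstream gVisor documentation, not taken
        // from this repository):
        //
        //	[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runsc]
        //	  runtime_type = "io.containerd.runsc.v1"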
    16  package shim
    17  
    18  import (
    19  	"context"
    20  	"fmt"
    21  	"io"
    22  	"os"
    23  	"os/exec"
    24  	"path/filepath"
    25  	"strings"
    26  	"sync"
    27  	"time"
    28  
    29  	"github.com/BurntSushi/toml"
    30  	"github.com/containerd/cgroups"
    31  	cgroupsstats "github.com/containerd/cgroups/stats/v1"
    32  	cgroupsv2 "github.com/containerd/cgroups/v2"
    33  	"github.com/containerd/console"
    34  	"github.com/containerd/containerd/api/events"
    35  	"github.com/containerd/containerd/api/types/task"
    36  	"github.com/containerd/containerd/errdefs"
    37  	"github.com/containerd/containerd/log"
    38  	"github.com/containerd/containerd/mount"
    39  	"github.com/containerd/containerd/namespaces"
    40  	"github.com/containerd/containerd/pkg/process"
    41  	"github.com/containerd/containerd/pkg/stdio"
    42  	"github.com/containerd/containerd/runtime"
    43  	"github.com/containerd/containerd/runtime/linux/runctypes"
    44  	"github.com/containerd/containerd/runtime/v2/shim"
    45  	taskAPI "github.com/containerd/containerd/runtime/v2/task"
    46  	"github.com/containerd/containerd/sys/reaper"
    47  	"github.com/containerd/typeurl"
    48  	"github.com/gogo/protobuf/types"
    49  	specs "github.com/opencontainers/runtime-spec/specs-go"
    50  	"github.com/sirupsen/logrus"
    51  	"github.com/ttpreport/gvisor-ligolo/pkg/cleanup"
    53  	v14 "github.com/ttpreport/gvisor-ligolo/pkg/shim/runtimeoptions/v14"
    54  	"golang.org/x/sys/unix"
    55  
    56  	"github.com/ttpreport/gvisor-ligolo/pkg/shim/proc"
    57  	"github.com/ttpreport/gvisor-ligolo/pkg/shim/runsc"
    58  	"github.com/ttpreport/gvisor-ligolo/pkg/shim/runtimeoptions"
    59  	"github.com/ttpreport/gvisor-ligolo/pkg/shim/utils"
    60  	"github.com/ttpreport/gvisor-ligolo/runsc/specutils"
    61  )
    62  
    63  var (
    64  	empty   = &types.Empty{}
    65  	bufPool = sync.Pool{
    66  		New: func() any {
    67  			buffer := make([]byte, 32<<10)
    68  			return &buffer
    69  		},
    70  	}
    71  )
    72  
    73  const (
    74  	// configFile is the default config file name. For containerd 1.2,
    75  	// we assume that a config.toml should exist in the runtime root.
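        	// A minimal sketch of such a file, assuming the TOML keys declared on the
        	// options struct elsewhere in this package (log_path, log_level and the
        	// runsc_config table are assumptions based on that struct, not guaranteed
        	// names):
        	//
        	//	log_path = "/var/log/runsc/%ID%/shim.log"
        	//	log_level = "debug"
        	//	[runsc_config]
        	//	  debug = "true"
        	//	  debug-log = "/var/log/runsc/%ID%/gvisor.%COMMAND%.log"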
    76  	configFile = "config.toml"
    77  
    78  	// shimAddressPath is the relative path to a file that contains the address
    79  	// to the shim UDS. See service.shimAddress.
    80  	shimAddressPath = "address"
    81  
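        	// cgroupParentAnnotation is written into the container spec by
        	// setPodCgroup so that the sandbox joins the pod cgroup instead of the
        	// pause container's cgroup.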
    82  	cgroupParentAnnotation = "dev.gvisor.spec.cgroup-parent"
    83  )
    84  
    85  type oomPoller interface {
    86  	io.Closer
    87  	// add adds the `cg` cgroup to the OOM poller. `cg` is a cgroups.Cgroup in
    88  	// v1 and a `cgroupsv2.Manager` in v2.
    89  	add(id string, cg any) error
    90  	// run monitors OOM events and notifies the shim about them.
    91  	run(ctx context.Context)
    92  }
    93  
    94  // New returns a new shim service that can be used via GRPC.
    95  func New(ctx context.Context, id string, publisher shim.Publisher, cancel func()) (shim.Shim, error) {
    96  	var opts shim.Opts
    97  	if ctxOpts := ctx.Value(shim.OptsKey{}); ctxOpts != nil {
    98  		opts = ctxOpts.(shim.Opts)
    99  	}
   100  
   101  	var (
   102  		ep  oomPoller
   103  		err error
   104  	)
   105  	if cgroups.Mode() == cgroups.Unified {
   106  		ep, err = newOOMv2Poller(publisher)
   107  	} else {
   108  		ep, err = newOOMEpoller(publisher)
   109  	}
   110  	if err != nil {
   111  		return nil, err
   112  	}
   113  	go ep.run(ctx)
   114  	s := &service{
   115  		id:             id,
   116  		processes:      make(map[string]process.Process),
   117  		events:         make(chan any, 128),
   118  		ec:             proc.ExitCh,
   119  		oomPoller:      ep,
   120  		cancel:         cancel,
   121  		genericOptions: opts,
   122  	}
   123  	go s.processExits(ctx)
   124  	runsc.Monitor = &runsc.LogMonitor{Next: reaper.Default}
   125  	if err := s.initPlatform(); err != nil {
   126  		cancel()
   127  		return nil, fmt.Errorf("failed to initialize platform behavior: %w", err)
   128  	}
   129  	go s.forward(ctx, publisher)
   130  
   131  	if address, err := shim.ReadAddress(shimAddressPath); err == nil {
   132  		s.shimAddress = address
   133  	}
   134  
   135  	return s, nil
   136  }
   137  
   138  // service is the shim implementation of a remote shim over GRPC. It runs in 2
   139  // different modes:
   140  //  1. Service: process runs for the lifetime of the container and receives
   141  //     calls described in shimapi.TaskService interface.
   142  //  2. Tool: process is short lived and runs only to perform the requested
   143  //     operations and then exits. It implements the direct functions in
   144  //     shim.Shim interface.
   145  //
   146  // When the service is running, it saves a json file with state information so
   147  // that commands sent to the tool can load the state and perform the operation.
   148  type service struct {
   149  	mu sync.Mutex
   150  
   151  	// id is the container ID.
   152  	id string
   153  
   154  	// bundle is a path provided by the caller on container creation. It is kept
   155  	// because commands that don't receive the bundle in the request need it.
   156  	bundle string
   157  
   158  	// task is the main process that is running the container.
   159  	task *proc.Init
   160  
   161  	// processes maps ExecID to processes started via exec.
   162  	processes map[string]process.Process
   163  
   164  	events chan any
   165  
   166  	// platform handles operations related to the console.
   167  	platform stdio.Platform
   168  
   169  	// genericOptions are options that come from the shim interface and are common
   170  	// to all shims.
   171  	genericOptions shim.Opts
   172  
   173  	// opts are configuration options specific for this shim.
   174  	opts options
   175  
   176  	// ec gets notified whenever the container init process or an exec'd process
   177  	// exits from inside the sandbox.
   178  	ec chan proc.Exit
   179  
   180  	// oomPoller monitors the sandbox's cgroup for OOM notifications.
   181  	oomPoller oomPoller
   182  
   183  	// cancel is a function that needs to be called before the shim stops. The
   184  	// function is provided by the caller to New().
   185  	cancel func()
   186  
   187  	// shimAddress is the location of the UDS used to communicate to containerd.
   188  	shimAddress string
   189  }
   190  
   191  var _ shim.Shim = (*service)(nil)
   192  
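        // newCommand builds the exec.Cmd used to re-exec this binary as a detached
        // shim server in its own process group.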
   193  func (s *service) newCommand(ctx context.Context, containerdBinary, containerdAddress string) (*exec.Cmd, error) {
   194  	ns, err := namespaces.NamespaceRequired(ctx)
   195  	if err != nil {
   196  		return nil, err
   197  	}
   198  	self, err := os.Executable()
   199  	if err != nil {
   200  		return nil, err
   201  	}
   202  	cwd, err := os.Getwd()
   203  	if err != nil {
   204  		return nil, err
   205  	}
   206  	args := []string{
   207  		"-namespace", ns,
   208  		"-address", containerdAddress,
   209  		"-publish-binary", containerdBinary,
   210  	}
   211  	if s.genericOptions.Debug {
   212  		args = append(args, "-debug")
   213  	}
   214  	cmd := exec.Command(self, args...)
   215  	cmd.Dir = cwd
   216  	cmd.Env = append(os.Environ(), "GOMAXPROCS=2")
   217  	cmd.SysProcAttr = &unix.SysProcAttr{
   218  		Setpgid: true,
   219  	}
   220  	return cmd, nil
   221  }
   222  
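        // StartShim is called by containerd to launch a new shim server. It re-execs
        // this binary, creates the task service socket (or reuses an existing one
        // when grouping containers under the same shim), and returns the socket
        // address that containerd should connect to.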
   223  func (s *service) StartShim(ctx context.Context, id, containerdBinary, containerdAddress, containerdTTRPCAddress string) (string, error) {
   224  	log.L.Debugf("StartShim, id: %s, binary: %q, address: %q", id, containerdBinary, containerdAddress)
   225  
   226  	cmd, err := s.newCommand(ctx, containerdBinary, containerdAddress)
   227  	if err != nil {
   228  		return "", err
   229  	}
   230  	address, err := shim.SocketAddress(ctx, containerdAddress, id)
   231  	if err != nil {
   232  		return "", err
   233  	}
   234  	socket, err := shim.NewSocket(address)
   235  	if err != nil {
   236  		// The only time this can happen is if there is a bug and the socket was
   237  		// not cleaned up in the shim's cleanup method, or we are using the
   238  		// grouping functionality where the new process should run with the same
   239  		// shim as an existing container.
   240  		if !shim.SocketEaddrinuse(err) {
   241  			return "", fmt.Errorf("create new shim socket: %w", err)
   242  		}
   243  		if shim.CanConnect(address) {
   244  			if err := shim.WriteAddress(shimAddressPath, address); err != nil {
   245  				return "", fmt.Errorf("write existing socket for shim: %w", err)
   246  			}
   247  			return address, nil
   248  		}
   249  		if err := shim.RemoveSocket(address); err != nil {
   250  			return "", fmt.Errorf("remove pre-existing socket: %w", err)
   251  		}
   252  		if socket, err = shim.NewSocket(address); err != nil {
   253  			return "", fmt.Errorf("try create new shim socket 2x: %w", err)
   254  		}
   255  	}
   256  	cu := cleanup.Make(func() {
   257  		socket.Close()
   258  		_ = shim.RemoveSocket(address)
   259  	})
   260  	defer cu.Clean()
   261  
   262  	f, err := socket.File()
   263  	if err != nil {
   264  		return "", err
   265  	}
   266  
   267  	cmd.ExtraFiles = append(cmd.ExtraFiles, f)
   268  
   269  	log.L.Debugf("Executing: %q %s", cmd.Path, cmd.Args)
   270  	if err := cmd.Start(); err != nil {
   271  		f.Close()
   272  		return "", err
   273  	}
   274  	cu.Add(func() { cmd.Process.Kill() })
   275  
   276  	// make sure to wait after start
   277  	go cmd.Wait()
   278  	if err := shim.WritePidFile("shim.pid", cmd.Process.Pid); err != nil {
   279  		return "", err
   280  	}
   281  	if err := shim.WriteAddress(shimAddressPath, address); err != nil {
   282  		return "", err
   283  	}
   284  	if err := shim.SetScore(cmd.Process.Pid); err != nil {
   285  		return "", fmt.Errorf("failed to set OOM Score on shim: %w", err)
   286  	}
   287  	cu.Release()
   288  	return address, nil
   289  }
   290  
   291  // Cleanup is called from another process (so it needs to reload state) to stop
   292  // the container and undo all operations done in Create().
   293  func (s *service) Cleanup(ctx context.Context) (*taskAPI.DeleteResponse, error) {
   294  	log.L.Debugf("Cleanup")
   295  
   296  	path, err := os.Getwd()
   297  	if err != nil {
   298  		return nil, err
   299  	}
   300  	ns, err := namespaces.NamespaceRequired(ctx)
   301  	if err != nil {
   302  		return nil, err
   303  	}
   304  	var st state
   305  	if err := st.load(path); err != nil {
   306  		return nil, err
   307  	}
   308  	r := proc.NewRunsc(s.opts.Root, path, ns, st.Options.BinaryName, nil, nil)
   309  
   310  	if err := r.Delete(ctx, s.id, &runsc.DeleteOpts{
   311  		Force: true,
   312  	}); err != nil {
   313  		log.L.Infof("failed to remove runc container: %v", err)
   314  	}
   315  	if err := mount.UnmountAll(st.Rootfs, 0); err != nil {
   316  		log.L.Infof("failed to cleanup rootfs mount: %v", err)
   317  	}
   318  	return &taskAPI.DeleteResponse{
   319  		ExitedAt:   time.Now(),
   320  		ExitStatus: 128 + uint32(unix.SIGKILL),
   321  	}, nil
   322  }
   323  
   324  // Create creates a new initial process and container with the underlying OCI
   325  // runtime.
   326  func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (*taskAPI.CreateTaskResponse, error) {
   327  	resp, err := s.create(ctx, r)
   328  	return resp, errdefs.ToGRPC(err)
   329  }
   330  
   331  func (s *service) create(ctx context.Context, r *taskAPI.CreateTaskRequest) (*taskAPI.CreateTaskResponse, error) {
   332  	s.mu.Lock()
   333  	defer s.mu.Unlock()
   334  
   335  	// Save the main task id and bundle to the shim for additional requests.
   336  	s.id = r.ID
   337  	s.bundle = r.Bundle
   338  
   339  	ns, err := namespaces.NamespaceRequired(ctx)
   340  	if err != nil {
   341  		return nil, fmt.Errorf("create namespace: %w", err)
   342  	}
   343  
   344  	// Read from root for now.
   345  	if r.Options != nil {
   346  		v, err := typeurl.UnmarshalAny(r.Options)
   347  		if err != nil {
   348  			return nil, err
   349  		}
   350  		var path string
   351  		switch o := v.(type) {
   352  		case *runctypes.CreateOptions: // containerd 1.2.x
   353  			s.opts.IoUID = o.IoUid
   354  			s.opts.IoGID = o.IoGid
   355  			s.opts.ShimCgroup = o.ShimCgroup
   356  		case *runctypes.RuncOptions: // containerd 1.2.x
   357  			root := proc.RunscRoot
   358  			if o.RuntimeRoot != "" {
   359  				root = o.RuntimeRoot
   360  			}
   361  
   362  			s.opts.BinaryName = o.Runtime
   363  
   364  			path = filepath.Join(root, configFile)
   365  			if _, err := os.Stat(path); err != nil {
   366  				if !os.IsNotExist(err) {
   367  					return nil, fmt.Errorf("stat config file %q: %w", path, err)
   368  				}
   369  				// A config file in runtime root is not required.
   370  				path = ""
   371  			}
   372  		case *runtimeoptions.Options: // containerd 1.5+
   373  			if o.ConfigPath == "" {
   374  				break
   375  			}
   376  			if o.TypeUrl != optionsType {
   377  				return nil, fmt.Errorf("unsupported option type %q", o.TypeUrl)
   378  			}
   379  			path = o.ConfigPath
   380  		case *v14.Options: // containerd 1.4-
   381  			if o.ConfigPath == "" {
   382  				break
   383  			}
   384  			if o.TypeUrl != optionsType {
   385  				return nil, fmt.Errorf("unsupported option type %q", o.TypeUrl)
   386  			}
   387  			path = o.ConfigPath
   388  		default:
   389  			return nil, fmt.Errorf("unsupported option type %q", r.Options.TypeUrl)
   390  		}
   391  		if path != "" {
   392  			if _, err = toml.DecodeFile(path, &s.opts); err != nil {
   393  				return nil, fmt.Errorf("decode config file %q: %w", path, err)
   394  			}
   395  		}
   396  	}
   397  
   398  	if len(s.opts.LogLevel) != 0 {
   399  		lvl, err := logrus.ParseLevel(s.opts.LogLevel)
   400  		if err != nil {
   401  			return nil, err
   402  		}
   403  		logrus.SetLevel(lvl)
   404  	}
   405  	for _, emittedPath := range runsc.EmittedPaths(s.id, s.opts.RunscConfig) {
   406  		if err := os.MkdirAll(filepath.Dir(emittedPath), 0777); err != nil {
   407  			return nil, fmt.Errorf("failed to create parent directories for file %v: %w", emittedPath, err)
   408  		}
   409  	}
   410  	if len(s.opts.LogPath) != 0 {
   411  		logPath := runsc.FormatShimLogPath(s.opts.LogPath, s.id)
   412  		if err := os.MkdirAll(filepath.Dir(logPath), 0777); err != nil {
   413  			return nil, fmt.Errorf("failed to create log dir: %w", err)
   414  		}
   415  		logFile, err := os.Create(logPath)
   416  		if err != nil {
   417  			return nil, fmt.Errorf("failed to create log file: %w", err)
   418  		}
   419  		log.L.Debugf("Starting mirror log at %q", logPath)
   420  		std := logrus.StandardLogger()
   421  		std.SetOutput(io.MultiWriter(std.Out, logFile))
   422  
   423  		log.L.Debugf("Create shim")
   424  		log.L.Debugf("***************************")
   425  		log.L.Debugf("Args: %s", os.Args)
   426  		log.L.Debugf("PID: %d", os.Getpid())
   427  		log.L.Debugf("ID: %s", s.id)
   428  		log.L.Debugf("Options: %+v", s.opts)
   429  		log.L.Debugf("Bundle: %s", r.Bundle)
   430  		log.L.Debugf("Terminal: %t", r.Terminal)
   431  		log.L.Debugf("stdin: %s", r.Stdin)
   432  		log.L.Debugf("stdout: %s", r.Stdout)
   433  		log.L.Debugf("stderr: %s", r.Stderr)
   434  		log.L.Debugf("***************************")
   435  		if log.L.Logger.IsLevelEnabled(logrus.DebugLevel) {
   436  			setDebugSigHandler()
   437  		}
   438  	}
   439  
   440  	// Save state before any action is taken to ensure Cleanup() will have all
   441  	// the information it needs to undo the operations.
   442  	st := state{
   443  		Rootfs:  filepath.Join(r.Bundle, "rootfs"),
   444  		Options: s.opts,
   445  	}
   446  	if err := st.save(r.Bundle); err != nil {
   447  		return nil, err
   448  	}
   449  
   450  	if err := os.Mkdir(st.Rootfs, 0711); err != nil && !os.IsExist(err) {
   451  		return nil, err
   452  	}
   453  
   454  	// Convert from types.Mount to proc.Mount.
   455  	var mounts []proc.Mount
   456  	for _, m := range r.Rootfs {
   457  		mounts = append(mounts, proc.Mount{
   458  			Type:    m.Type,
   459  			Source:  m.Source,
   460  			Target:  m.Target,
   461  			Options: m.Options,
   462  		})
   463  	}
   464  
   465  	// Cleans up all mounts in case of failure.
   466  	cu := cleanup.Make(func() {
   467  		if err := mount.UnmountAll(st.Rootfs, 0); err != nil {
   468  			log.L.Infof("failed to cleanup rootfs mount: %v", err)
   469  		}
   470  	})
   471  	defer cu.Clean()
   472  	for _, rm := range mounts {
   473  		m := &mount.Mount{
   474  			Type:    rm.Type,
   475  			Source:  rm.Source,
   476  			Options: rm.Options,
   477  		}
   478  		if err := m.Mount(st.Rootfs); err != nil {
   479  			return nil, fmt.Errorf("failed to mount rootfs component %v: %w", m, err)
   480  		}
   481  	}
   482  
   483  	config := &proc.CreateConfig{
   484  		ID:       r.ID,
   485  		Bundle:   r.Bundle,
   486  		Runtime:  s.opts.BinaryName,
   487  		Rootfs:   mounts,
   488  		Terminal: r.Terminal,
   489  		Stdin:    r.Stdin,
   490  		Stdout:   r.Stdout,
   491  		Stderr:   r.Stderr,
   492  	}
   493  	process, err := newInit(r.Bundle, filepath.Join(r.Bundle, "work"), ns, s.platform, config, &s.opts, st.Rootfs)
   494  	if err != nil {
   495  		return nil, err
   496  	}
   497  	if err := process.Create(ctx, config); err != nil {
   498  		return nil, err
   499  	}
   500  
   501  	// Set up OOM notification on the sandbox's cgroup. This is done on
   502  	// sandbox create since the sandbox process will be created here.
   503  	pid := process.Pid()
   504  	if pid > 0 {
   505  		var (
   506  			cg  any
   507  			err error
   508  		)
   509  		if cgroups.Mode() == cgroups.Unified {
   510  			var cgPath string
   511  			cgPath, err = cgroupsv2.PidGroupPath(pid)
   512  			if err == nil {
   513  				cg, err = cgroupsv2.LoadManager("/sys/fs/cgroup", cgPath)
   514  			}
   515  		} else {
   516  			cg, err = cgroups.Load(cgroups.V1, cgroups.PidPath(pid))
   517  		}
   518  		if err != nil {
   519  			return nil, fmt.Errorf("loading cgroup for %d: %w", pid, err)
   520  		}
   521  		if err := s.oomPoller.add(s.id, cg); err != nil {
   522  			return nil, fmt.Errorf("add cg to OOM monitor: %w", err)
   523  		}
   524  	}
   525  
   526  	// Success
   527  	cu.Release()
   528  	s.task = process
   529  	return &taskAPI.CreateTaskResponse{
   530  		Pid: uint32(process.Pid()),
   531  	}, nil
   532  }
   533  
   534  // Start starts a process.
   535  func (s *service) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.StartResponse, error) {
   536  	resp, err := s.start(ctx, r)
   537  	return resp, errdefs.ToGRPC(err)
   538  }
   539  
   540  func (s *service) start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.StartResponse, error) {
   541  	log.L.Debugf("Start, id: %s, execID: %s", r.ID, r.ExecID)
   542  
   543  	p, err := s.getProcess(r.ExecID)
   544  	if err != nil {
   545  		return nil, err
   546  	}
   547  	if err := p.Start(ctx); err != nil {
   548  		return nil, err
   549  	}
   550  	// TODO: Set the cgroup and oom notifications on restore.
   551  	// https://github.com/google/gvisor-containerd-shim/issues/58
   552  	return &taskAPI.StartResponse{
   553  		Pid: uint32(p.Pid()),
   554  	}, nil
   555  }
   556  
   557  // Delete deletes the initial process and container.
   558  func (s *service) Delete(ctx context.Context, r *taskAPI.DeleteRequest) (*taskAPI.DeleteResponse, error) {
   559  	resp, err := s.delete(ctx, r)
   560  	return resp, errdefs.ToGRPC(err)
   561  }
   562  
   563  func (s *service) delete(ctx context.Context, r *taskAPI.DeleteRequest) (*taskAPI.DeleteResponse, error) {
   564  	log.L.Debugf("Delete, id: %s, execID: %s", r.ID, r.ExecID)
   565  
   566  	p, err := s.getProcess(r.ExecID)
   567  	if err != nil {
   568  		return nil, err
   569  	}
   570  	if err := p.Delete(ctx); err != nil {
   571  		return nil, err
   572  	}
   573  	if len(r.ExecID) != 0 {
   574  		s.mu.Lock()
   575  		delete(s.processes, r.ExecID)
   576  		s.mu.Unlock()
   577  	} else if s.platform != nil {
   578  		s.platform.Close()
   579  	}
   580  	return &taskAPI.DeleteResponse{
   581  		ExitStatus: uint32(p.ExitStatus()),
   582  		ExitedAt:   p.ExitedAt(),
   583  		Pid:        uint32(p.Pid()),
   584  	}, nil
   585  }
   586  
   587  // Exec spawns an additional process inside the container.
   588  func (s *service) Exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*types.Empty, error) {
   589  	resp, err := s.exec(ctx, r)
   590  	return resp, errdefs.ToGRPC(err)
   591  }
   592  
   593  func (s *service) exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*types.Empty, error) {
   594  	log.L.Debugf("Exec, id: %s, execID: %s", r.ID, r.ExecID)
   595  
   596  	s.mu.Lock()
   597  	p := s.processes[r.ExecID]
   598  	s.mu.Unlock()
   599  	if p != nil {
   600  		return nil, errdefs.ToGRPCf(errdefs.ErrAlreadyExists, "id %s", r.ExecID)
   601  	}
   602  	if s.task == nil {
   603  		return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
   604  	}
   605  	process, err := s.task.Exec(ctx, s.bundle, &proc.ExecConfig{
   606  		ID:       r.ExecID,
   607  		Terminal: r.Terminal,
   608  		Stdin:    r.Stdin,
   609  		Stdout:   r.Stdout,
   610  		Stderr:   r.Stderr,
   611  		Spec:     r.Spec,
   612  	})
   613  	if err != nil {
   614  		return nil, err
   615  	}
   616  	s.mu.Lock()
   617  	s.processes[r.ExecID] = process
   618  	s.mu.Unlock()
   619  	return empty, nil
   620  }
   621  
   622  // ResizePty resizes the terminal of a process.
   623  func (s *service) ResizePty(ctx context.Context, r *taskAPI.ResizePtyRequest) (*types.Empty, error) {
   624  	resp, err := s.resizePty(ctx, r)
   625  	return resp, errdefs.ToGRPC(err)
   626  }
   627  
   628  func (s *service) resizePty(ctx context.Context, r *taskAPI.ResizePtyRequest) (*types.Empty, error) {
   629  	log.L.Debugf("ResizePty, id: %s, execID: %s, dimension: %dx%d", r.ID, r.ExecID, r.Height, r.Width)
   630  
   631  	p, err := s.getProcess(r.ExecID)
   632  	if err != nil {
   633  		return nil, err
   634  	}
   635  	ws := console.WinSize{
   636  		Width:  uint16(r.Width),
   637  		Height: uint16(r.Height),
   638  	}
   639  	if err := p.Resize(ws); err != nil {
   640  		return nil, err
   641  	}
   642  	return empty, nil
   643  }
   644  
   645  // State returns runtime state information for a process.
   646  func (s *service) State(ctx context.Context, r *taskAPI.StateRequest) (*taskAPI.StateResponse, error) {
   647  	resp, err := s.state(ctx, r)
   648  	return resp, errdefs.ToGRPC(err)
   649  }
   650  
   651  func (s *service) state(ctx context.Context, r *taskAPI.StateRequest) (*taskAPI.StateResponse, error) {
   652  	log.L.Debugf("State, id: %s, execID: %s", r.ID, r.ExecID)
   653  
   654  	p, err := s.getProcess(r.ExecID)
   655  	if err != nil {
   656  		log.L.Debugf("State failed to find process: %v", err)
   657  		return nil, err
   658  	}
   659  	st, err := p.Status(ctx)
   660  	if err != nil {
   661  		log.L.Debugf("State failed: %v", err)
   662  		return nil, err
   663  	}
   664  	status := task.StatusUnknown
   665  	switch st {
   666  	case "created":
   667  		status = task.StatusCreated
   668  	case "running":
   669  		status = task.StatusRunning
   670  	case "stopped":
   671  		status = task.StatusStopped
   672  	}
   673  	sio := p.Stdio()
   674  	res := &taskAPI.StateResponse{
   675  		ID:         p.ID(),
   676  		Bundle:     s.bundle,
   677  		Pid:        uint32(p.Pid()),
   678  		Status:     status,
   679  		Stdin:      sio.Stdin,
   680  		Stdout:     sio.Stdout,
   681  		Stderr:     sio.Stderr,
   682  		Terminal:   sio.Terminal,
   683  		ExitStatus: uint32(p.ExitStatus()),
   684  		ExitedAt:   p.ExitedAt(),
   685  	}
   686  	log.L.Debugf("State succeeded, response: %+v", res)
   687  	return res, nil
   688  }
   689  
   690  // Pause the container.
   691  func (s *service) Pause(ctx context.Context, r *taskAPI.PauseRequest) (*types.Empty, error) {
   692  	resp, err := s.pause(ctx, r)
   693  	return resp, errdefs.ToGRPC(err)
   694  }
   695  
   696  func (s *service) pause(ctx context.Context, r *taskAPI.PauseRequest) (*types.Empty, error) {
   697  	log.L.Debugf("Pause, id: %s", r.ID)
   698  	if s.task == nil {
   699  		log.L.Debugf("Pause error, id: %s: container not created", r.ID)
   700  		return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
   701  	}
   702  	err := s.task.Runtime().Pause(ctx, r.ID)
   703  	if err != nil {
   704  		return nil, err
   705  	}
   706  	return empty, nil
   707  }
   708  
   709  // Resume the container.
   710  func (s *service) Resume(ctx context.Context, r *taskAPI.ResumeRequest) (*types.Empty, error) {
   711  	resp, err := s.resume(ctx, r)
   712  	return resp, errdefs.ToGRPC(err)
   713  }
   714  
   715  func (s *service) resume(ctx context.Context, r *taskAPI.ResumeRequest) (*types.Empty, error) {
   716  	log.L.Debugf("Resume, id: %s", r.ID)
   717  	if s.task == nil {
   718  		log.L.Debugf("Resume error, id: %s: container not created", r.ID)
   719  		return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
   720  	}
   721  	err := s.task.Runtime().Resume(ctx, r.ID)
   722  	if err != nil {
   723  		return nil, err
   724  	}
   725  	return empty, nil
   726  }
   727  
   728  // Kill a process with the provided signal.
   729  func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (*types.Empty, error) {
   730  	resp, err := s.kill(ctx, r)
   731  	return resp, errdefs.ToGRPC(err)
   732  }
   733  
   734  func (s *service) kill(ctx context.Context, r *taskAPI.KillRequest) (*types.Empty, error) {
   735  	log.L.Debugf("Kill, id: %s, execID: %s, signal: %d, all: %t", r.ID, r.ExecID, r.Signal, r.All)
   736  
   737  	p, err := s.getProcess(r.ExecID)
   738  	if err != nil {
   739  		return nil, err
   740  	}
   741  	if err := p.Kill(ctx, r.Signal, r.All); err != nil {
   742  		log.L.Debugf("Kill failed: %v", err)
   743  		return nil, err
   744  	}
   745  	log.L.Debugf("Kill succeeded")
   746  	return empty, nil
   747  }
   748  
   749  // Pids returns all pids inside the container.
   750  func (s *service) Pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.PidsResponse, error) {
   751  	resp, err := s.pids(ctx, r)
   752  	return resp, errdefs.ToGRPC(err)
   753  }
   754  
   755  func (s *service) pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.PidsResponse, error) {
   756  	log.L.Debugf("Pids, id: %s", r.ID)
   757  
   758  	pids, err := s.getContainerPids(ctx, r.ID)
   759  	if err != nil {
   760  		return nil, err
   761  	}
   762  	var processes []*task.ProcessInfo
   763  	for _, pid := range pids {
   764  		pInfo := task.ProcessInfo{
   765  			Pid: pid,
   766  		}
   767  		for _, p := range s.processes {
   768  			if p.Pid() == int(pid) {
   769  				d := &runctypes.ProcessDetails{
   770  					ExecID: p.ID(),
   771  				}
   772  				a, err := typeurl.MarshalAny(d)
   773  				if err != nil {
   774  					return nil, fmt.Errorf("failed to marshal process %d info: %w", pid, err)
   775  				}
   776  				pInfo.Info = a
   777  				break
   778  			}
   779  		}
   780  		processes = append(processes, &pInfo)
   781  	}
   782  	return &taskAPI.PidsResponse{
   783  		Processes: processes,
   784  	}, nil
   785  }
   786  
   787  // CloseIO closes the I/O context of a process.
   788  func (s *service) CloseIO(ctx context.Context, r *taskAPI.CloseIORequest) (*types.Empty, error) {
   789  	resp, err := s.closeIO(ctx, r)
   790  	return resp, errdefs.ToGRPC(err)
   791  }
   792  
   793  func (s *service) closeIO(ctx context.Context, r *taskAPI.CloseIORequest) (*types.Empty, error) {
   794  	log.L.Debugf("CloseIO, id: %s, execID: %s, stdin: %t", r.ID, r.ExecID, r.Stdin)
   795  
   796  	p, err := s.getProcess(r.ExecID)
   797  	if err != nil {
   798  		return nil, err
   799  	}
   800  	if stdin := p.Stdin(); stdin != nil {
   801  		if err := stdin.Close(); err != nil {
   802  			return nil, fmt.Errorf("close stdin: %w", err)
   803  		}
   804  	}
   805  	return empty, nil
   806  }
   807  
   808  // Checkpoint checkpoints the container.
   809  func (s *service) Checkpoint(ctx context.Context, r *taskAPI.CheckpointTaskRequest) (*types.Empty, error) {
   810  	log.L.Debugf("Checkpoint, id: %s", r.ID)
   811  	return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented)
   812  }
   813  
   814  // Connect returns shim information such as the shim's pid.
   815  func (s *service) Connect(ctx context.Context, r *taskAPI.ConnectRequest) (*taskAPI.ConnectResponse, error) {
   816  	resp, err := s.connect(ctx, r)
   817  	return resp, errdefs.ToGRPC(err)
   818  }
   819  
   820  func (s *service) connect(ctx context.Context, r *taskAPI.ConnectRequest) (*taskAPI.ConnectResponse, error) {
   821  	log.L.Debugf("Connect, id: %s", r.ID)
   822  
   823  	var pid int
   824  	if s.task != nil {
   825  		pid = s.task.Pid()
   826  	}
   827  	return &taskAPI.ConnectResponse{
   828  		ShimPid: uint32(os.Getpid()),
   829  		TaskPid: uint32(pid),
   830  	}, nil
   831  }
   832  
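        // Shutdown stops the shim: it calls the cancel function provided to New,
        // removes the shim socket if one was created, and exits the process.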
   833  func (s *service) Shutdown(ctx context.Context, r *taskAPI.ShutdownRequest) (*types.Empty, error) {
   834  	resp, err := s.shutdown(ctx, r)
   835  	return resp, errdefs.ToGRPC(err)
   836  }
   837  
   838  func (s *service) shutdown(ctx context.Context, r *taskAPI.ShutdownRequest) (*types.Empty, error) {
   839  	log.L.Debugf("Shutdown, id: %s", r.ID)
   840  	s.cancel()
   841  	if s.shimAddress != "" {
   842  		_ = shim.RemoveSocket(s.shimAddress)
   843  	}
   844  	os.Exit(0)
   845  	panic("Should not get here")
   846  }
   847  
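        // Stats returns statistics about the sandbox, converted to the same cgroups
        // Metrics type that runc returns.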
   848  func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) {
   849  	resp, err := s.stats(ctx, r)
   850  	return resp, errdefs.ToGRPC(err)
   851  }
   852  
   853  func (s *service) stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) {
   854  	log.L.Debugf("Stats, id: %s", r.ID)
   855  	if s.task == nil {
   856  		log.L.Debugf("Stats error, id: %s: container not created", r.ID)
   857  		return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
   858  	}
   859  	stats, err := s.task.Stats(ctx, s.id)
   860  	if err != nil {
   861  		log.L.Debugf("Stats error, id: %s: %v", r.ID, err)
   862  		return nil, err
   863  	}
   864  
   865  	// gvisor currently (as of 2020-03-03) only returns the total memory
   866  	// usage and current PID value[0]. However, we copy the common fields here
   867  	// so that future updates will propagate correct information.  We're
   868  	// using the cgroups.Metrics structure so we're returning the same type
   869  	// as runc.
   870  	//
   871  	// [0]: https://github.com/google/gvisor/blob/277a0d5a1fbe8272d4729c01ee4c6e374d047ebc/runsc/boot/events.go#L61-L81
   872  	metrics := &cgroupsstats.Metrics{
   873  		CPU: &cgroupsstats.CPUStat{
   874  			Usage: &cgroupsstats.CPUUsage{
   875  				Total:  stats.Cpu.Usage.Total,
   876  				Kernel: stats.Cpu.Usage.Kernel,
   877  				User:   stats.Cpu.Usage.User,
   878  				PerCPU: stats.Cpu.Usage.Percpu,
   879  			},
   880  			Throttling: &cgroupsstats.Throttle{
   881  				Periods:          stats.Cpu.Throttling.Periods,
   882  				ThrottledPeriods: stats.Cpu.Throttling.ThrottledPeriods,
   883  				ThrottledTime:    stats.Cpu.Throttling.ThrottledTime,
   884  			},
   885  		},
   886  		Memory: &cgroupsstats.MemoryStat{
   887  			Cache: stats.Memory.Cache,
   888  			Usage: &cgroupsstats.MemoryEntry{
   889  				Limit:   stats.Memory.Usage.Limit,
   890  				Usage:   stats.Memory.Usage.Usage,
   891  				Max:     stats.Memory.Usage.Max,
   892  				Failcnt: stats.Memory.Usage.Failcnt,
   893  			},
   894  			Swap: &cgroupsstats.MemoryEntry{
   895  				Limit:   stats.Memory.Swap.Limit,
   896  				Usage:   stats.Memory.Swap.Usage,
   897  				Max:     stats.Memory.Swap.Max,
   898  				Failcnt: stats.Memory.Swap.Failcnt,
   899  			},
   900  			Kernel: &cgroupsstats.MemoryEntry{
   901  				Limit:   stats.Memory.Kernel.Limit,
   902  				Usage:   stats.Memory.Kernel.Usage,
   903  				Max:     stats.Memory.Kernel.Max,
   904  				Failcnt: stats.Memory.Kernel.Failcnt,
   905  			},
   906  			KernelTCP: &cgroupsstats.MemoryEntry{
   907  				Limit:   stats.Memory.KernelTCP.Limit,
   908  				Usage:   stats.Memory.KernelTCP.Usage,
   909  				Max:     stats.Memory.KernelTCP.Max,
   910  				Failcnt: stats.Memory.KernelTCP.Failcnt,
   911  			},
   912  		},
   913  		Pids: &cgroupsstats.PidsStat{
   914  			Current: stats.Pids.Current,
   915  			Limit:   stats.Pids.Limit,
   916  		},
   917  	}
   918  	data, err := typeurl.MarshalAny(metrics)
   919  	if err != nil {
   920  		log.L.Debugf("Stats error, id: %s: %v", r.ID, err)
   921  		return nil, err
   922  	}
   923  	log.L.Debugf("Stats success, id: %s: %+v", r.ID, data)
   924  	return &taskAPI.StatsResponse{
   925  		Stats: data,
   926  	}, nil
   927  }
   928  
   929  // Update updates a running container.
   930  func (s *service) Update(ctx context.Context, r *taskAPI.UpdateTaskRequest) (*types.Empty, error) {
   931  	return empty, errdefs.ToGRPC(errdefs.ErrNotImplemented)
   932  }
   933  
   934  // Wait waits for a process to exit.
   935  func (s *service) Wait(ctx context.Context, r *taskAPI.WaitRequest) (*taskAPI.WaitResponse, error) {
   936  	resp, err := s.wait(ctx, r)
   937  	return resp, errdefs.ToGRPC(err)
   938  }
   939  
   940  func (s *service) wait(ctx context.Context, r *taskAPI.WaitRequest) (*taskAPI.WaitResponse, error) {
   941  	log.L.Debugf("Wait, id: %s, execID: %s", r.ID, r.ExecID)
   942  
   943  	p, err := s.getProcess(r.ExecID)
   944  	if err != nil {
   945  		log.L.Debugf("Wait failed to find process: %v", err)
   946  		return nil, err
   947  	}
   948  	p.Wait()
   949  
   950  	res := &taskAPI.WaitResponse{
   951  		ExitStatus: uint32(p.ExitStatus()),
   952  		ExitedAt:   p.ExitedAt(),
   953  	}
   954  	log.L.Debugf("Wait succeeded, response: %+v", res)
   955  	return res, nil
   956  }
   957  
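        // processExits drains exit notifications coming from inside the sandbox and
        // turns them into TaskExit events.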
   958  func (s *service) processExits(ctx context.Context) {
   959  	for e := range s.ec {
   960  		s.checkProcesses(ctx, e)
   961  	}
   962  }
   963  
   964  func (s *service) checkProcesses(ctx context.Context, e proc.Exit) {
   965  	// TODO(random-liu): Add `shouldKillAll` logic if container pid
   966  	// namespace is supported.
   967  	for _, p := range s.allProcesses() {
   968  		if p.ID() == e.ID {
   969  			if ip, ok := p.(*proc.Init); ok {
   970  				// Ensure all children are killed.
   971  				log.L.Debugf("Container init process exited, killing all container processes")
   972  				ip.KillAll(ctx)
   973  			}
   974  			p.SetExited(e.Status)
   975  			s.events <- &events.TaskExit{
   976  				ContainerID: s.id,
   977  				ID:          p.ID(),
   978  				Pid:         uint32(p.Pid()),
   979  				ExitStatus:  uint32(e.Status),
   980  				ExitedAt:    p.ExitedAt(),
   981  			}
   982  			return
   983  		}
   984  	}
   985  }
   986  
   987  func (s *service) allProcesses() (o []process.Process) {
   988  	s.mu.Lock()
   989  	defer s.mu.Unlock()
   990  	for _, p := range s.processes {
   991  		o = append(o, p)
   992  	}
   993  	if s.task != nil {
   994  		o = append(o, s.task)
   995  	}
   996  	return o
   997  }
   998  
   999  func (s *service) getContainerPids(ctx context.Context, id string) ([]uint32, error) {
  1000  	s.mu.Lock()
  1001  	p := s.task
  1002  	s.mu.Unlock()
  1003  	if p == nil {
  1004  		return nil, fmt.Errorf("container must be created: %w", errdefs.ErrFailedPrecondition)
  1005  	}
  1006  	ps, err := p.Runtime().Ps(ctx, id)
  1007  	if err != nil {
  1008  		return nil, err
  1009  	}
  1010  	pids := make([]uint32, 0, len(ps))
  1011  	for _, pid := range ps {
  1012  		pids = append(pids, uint32(pid))
  1013  	}
  1014  	return pids, nil
  1015  }
  1016  
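        // forward publishes queued task events to containerd.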
  1017  func (s *service) forward(ctx context.Context, publisher shim.Publisher) {
  1018  	for e := range s.events {
  1019  		err := publisher.Publish(ctx, getTopic(e), e)
  1020  		if err != nil {
  1021  			// Should not happen.
  1022  			panic(fmt.Errorf("post event: %w", err))
  1023  		}
  1024  	}
  1025  }
  1026  
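        // getProcess returns the container init process when execID is empty, or the
        // matching exec'd process otherwise.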
  1027  func (s *service) getProcess(execID string) (process.Process, error) {
  1028  	s.mu.Lock()
  1029  	defer s.mu.Unlock()
  1030  
  1031  	if execID == "" {
  1032  		if s.task == nil {
  1033  			return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
  1034  		}
  1035  		return s.task, nil
  1036  	}
  1037  
  1038  	p := s.processes[execID]
  1039  	if p == nil {
  1040  		return nil, errdefs.ToGRPCf(errdefs.ErrNotFound, "process does not exist %s", execID)
  1041  	}
  1042  	return p, nil
  1043  }
  1044  
  1045  func getTopic(e any) string {
  1046  	switch e.(type) {
  1047  	case *events.TaskCreate:
  1048  		return runtime.TaskCreateEventTopic
  1049  	case *events.TaskStart:
  1050  		return runtime.TaskStartEventTopic
  1051  	case *events.TaskOOM:
  1052  		return runtime.TaskOOMEventTopic
  1053  	case *events.TaskExit:
  1054  		return runtime.TaskExitEventTopic
  1055  	case *events.TaskDelete:
  1056  		return runtime.TaskDeleteEventTopic
  1057  	case *events.TaskExecAdded:
  1058  		return runtime.TaskExecAddedEventTopic
  1059  	case *events.TaskExecStarted:
  1060  		return runtime.TaskExecStartedEventTopic
  1061  	default:
  1062  		log.L.Infof("no topic for type %#v", e)
  1063  	}
  1064  	return runtime.TaskUnknownTopic
  1065  }
  1066  
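        // newInit builds the proc.Init for the container's init process: it reads the
        // OCI spec from the bundle, updates volume and pod-cgroup annotations if
        // needed, and wires the process to the runsc runtime.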
  1067  func newInit(path, workDir, namespace string, platform stdio.Platform, r *proc.CreateConfig, options *options, rootfs string) (*proc.Init, error) {
  1068  	spec, err := utils.ReadSpec(r.Bundle)
  1069  	if err != nil {
  1070  		return nil, fmt.Errorf("read oci spec: %w", err)
  1071  	}
  1072  
  1073  	updated, err := utils.UpdateVolumeAnnotations(spec)
  1074  	if err != nil {
  1075  		return nil, fmt.Errorf("update volume annotations: %w", err)
  1076  	}
  1077  	updated = setPodCgroup(spec) || updated
  1078  
  1079  	if updated {
  1080  		if err := utils.WriteSpec(r.Bundle, spec); err != nil {
  1081  			return nil, err
  1082  		}
  1083  	}
  1084  
  1085  	runsc.FormatRunscPaths(r.ID, options.RunscConfig)
  1086  	runtime := proc.NewRunsc(options.Root, path, namespace, options.BinaryName, options.RunscConfig, spec)
  1087  	p := proc.New(r.ID, runtime, stdio.Stdio{
  1088  		Stdin:    r.Stdin,
  1089  		Stdout:   r.Stdout,
  1090  		Stderr:   r.Stderr,
  1091  		Terminal: r.Terminal,
  1092  	})
  1093  	p.Bundle = r.Bundle
  1094  	p.Platform = platform
  1095  	p.Rootfs = rootfs
  1096  	p.WorkDir = workDir
  1097  	p.IoUID = int(options.IoUID)
  1098  	p.IoGID = int(options.IoGID)
  1099  	p.Sandbox = specutils.SpecContainerType(spec) == specutils.ContainerTypeSandbox
  1100  	p.UserLog = utils.UserLogPath(spec)
  1101  	p.Monitor = reaper.Default
  1102  	return p, nil
  1103  }
  1104  
  1105  // setPodCgroup searches for the pod cgroup path inside the container's cgroup
  1106  // path. If found, it's set as an annotation in the spec. This is done so that
  1107  // the sandbox joins the pod cgroup. Otherwise, the sandbox would join the pause
  1108  // container cgroup. Returns true if the spec was modified. Ex.:
  1109  // /kubepods/burstable/pod123/container123 => kubepods/burstable/pod123
  1110  func setPodCgroup(spec *specs.Spec) bool {
  1111  	if !utils.IsSandbox(spec) {
  1112  		return false
  1113  	}
  1114  	if spec.Linux == nil || len(spec.Linux.CgroupsPath) == 0 {
  1115  		return false
  1116  	}
  1117  
  1118  	// Search backwards for the pod cgroup path to make the sandbox use it,
  1119  	// instead of the pause container's cgroup.
  1120  	parts := strings.Split(spec.Linux.CgroupsPath, string(filepath.Separator))
  1121  	for i := len(parts) - 1; i >= 0; i-- {
  1122  		if strings.HasPrefix(parts[i], "pod") {
  1123  			var path string
  1124  			for j := 0; j <= i; j++ {
  1125  				path = filepath.Join(path, parts[j])
  1126  			}
  1127  			// Add back the initial '/' that may have been lost above.
  1128  			if filepath.IsAbs(spec.Linux.CgroupsPath) {
  1129  				path = string(filepath.Separator) + path
  1130  			}
  1131  			if spec.Linux.CgroupsPath == path {
  1132  				return false
  1133  			}
  1134  			if spec.Annotations == nil {
  1135  				spec.Annotations = make(map[string]string)
  1136  			}
  1137  			spec.Annotations[cgroupParentAnnotation] = path
  1138  			return true
  1139  		}
  1140  	}
  1141  	return false
  1142  }