github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/process_linux.go (about)

     1  package libcontainer
     2  
     3  import (
     4  	"bytes"
     5  	"context"
     6  	"encoding/json"
     7  	"errors"
     8  	"fmt"
     9  	"io"
    10  	"io/fs"
    11  	"net"
    12  	"os"
    13  	"os/exec"
    14  	"path/filepath"
    15  	"runtime"
    16  	"strconv"
    17  	"sync"
    18  	"time"
    19  
    20  	"github.com/opencontainers/runtime-spec/specs-go"
    21  	"github.com/sirupsen/logrus"
    22  	"golang.org/x/sys/unix"
    23  
    24  	"github.com/opencontainers/runc/libcontainer/cgroups"
    25  	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
    26  	"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
    27  	"github.com/opencontainers/runc/libcontainer/configs"
    28  	"github.com/opencontainers/runc/libcontainer/intelrdt"
    29  	"github.com/opencontainers/runc/libcontainer/logs"
    30  	"github.com/opencontainers/runc/libcontainer/system"
    31  	"github.com/opencontainers/runc/libcontainer/system/kernelparam"
    32  	"github.com/opencontainers/runc/libcontainer/userns"
    33  	"github.com/opencontainers/runc/libcontainer/utils"
    34  )
    35  
// parentProcess is the set of operations used to drive and observe a
// process started on behalf of a container. Both setnsProcess and
// initProcess in this file provide these methods.
type parentProcess interface {
	// pid returns the pid for the running process.
	pid() int

	// start starts the process execution.
	start() error

	// terminate sends a SIGKILL to the process and waits for it to exit.
	terminate() error

	// wait waits on the process returning the process state.
	wait() (*os.ProcessState, error)

	// startTime returns the process start time.
	startTime() (uint64, error)

	// signal delivers the given signal to the process.
	signal(os.Signal) error

	// externalDescriptors returns the descriptor names previously stored
	// with setExternalDescriptors.
	externalDescriptors() []string

	// setExternalDescriptors replaces the stored descriptor names.
	setExternalDescriptors(fds []string)

	// forwardChildLogs starts forwarding the child's log output and returns
	// a channel of type chan error.
	// NOTE(review): presumably the channel reports when forwarding has
	// finished; confirm against logs.ForwardLogs.
	forwardChildLogs() chan error
}
    56  
// processComm bundles the parent and child ends of every channel that is
// shared between this process and "runc init". It is created by
// newProcessComm; closeChild and closeParent release the respective ends.
type processComm struct {
	// Used to send initial configuration to "runc init" and for "runc init" to
	// indicate that it is ready.
	initSockParent *os.File
	initSockChild  *os.File
	// Used for control messages between parent and "runc init".
	syncSockParent *syncSocket
	syncSockChild  *syncSocket
	// Used for log forwarding from "runc init" to the parent.
	logPipeParent *os.File
	logPipeChild  *os.File
}
    69  
    70  func newProcessComm() (*processComm, error) {
    71  	var (
    72  		comm processComm
    73  		err  error
    74  	)
    75  	comm.initSockParent, comm.initSockChild, err = utils.NewSockPair("init")
    76  	if err != nil {
    77  		return nil, fmt.Errorf("unable to create init pipe: %w", err)
    78  	}
    79  	comm.syncSockParent, comm.syncSockChild, err = newSyncSockpair("sync")
    80  	if err != nil {
    81  		return nil, fmt.Errorf("unable to create sync pipe: %w", err)
    82  	}
    83  	comm.logPipeParent, comm.logPipeChild, err = os.Pipe()
    84  	if err != nil {
    85  		return nil, fmt.Errorf("unable to create log pipe: %w", err)
    86  	}
    87  	return &comm, nil
    88  }
    89  
    90  func (c *processComm) closeChild() {
    91  	_ = c.initSockChild.Close()
    92  	_ = c.syncSockChild.Close()
    93  	_ = c.logPipeChild.Close()
    94  }
    95  
    96  func (c *processComm) closeParent() {
    97  	_ = c.initSockParent.Close()
    98  	_ = c.syncSockParent.Close()
    99  	// c.logPipeParent is kept alive for ForwardLogs
   100  }
   101  
// setnsProcess describes a process that is started inside an already
// existing container.
//
// Field usage as visible in this file:
//   - cmd: the command launched by start; after execSetns its Process field
//     refers to the final child reported over the init socket.
//   - comm: parent/child communication channels.
//   - cgroupPaths: cgroup directories the child pid is written to in start.
//   - rootlessCgroups: when true, failures to join cgroupPaths are ignored.
//   - manager: cgroup manager, used for OOM kill counts, cgroup lookup and
//     CPU affinity fix-up.
//   - intelRdtPath: if non-empty and existing, the child pid is added to the
//     Intel RDT tasks at this path.
//   - config: configuration written as JSON to the init socket.
//   - fds: external descriptor names (see externalDescriptors).
//   - process: the Process this setnsProcess serves; its ops field is
//     pointed back at the setnsProcess by execSetns.
//   - bootstrapData: if non-nil, copied to the init socket right after the
//     command is started.
//   - initProcessPid: pid of the container init process, used for the
//     cgroup v2 fallback and reported in the seccomp container state.
type setnsProcess struct {
	cmd             *exec.Cmd
	comm            *processComm
	cgroupPaths     map[string]string
	rootlessCgroups bool
	manager         cgroups.Manager
	intelRdtPath    string
	config          *initConfig
	fds             []string
	process         *Process
	bootstrapData   io.Reader
	initProcessPid  int
}
   115  
   116  func (p *setnsProcess) startTime() (uint64, error) {
   117  	stat, err := system.Stat(p.pid())
   118  	return stat.StartTime, err
   119  }
   120  
   121  func (p *setnsProcess) signal(sig os.Signal) error {
   122  	s, ok := sig.(unix.Signal)
   123  	if !ok {
   124  		return errors.New("os: unsupported signal type")
   125  	}
   126  	return unix.Kill(p.pid(), s)
   127  }
   128  
   129  func (p *setnsProcess) start() (retErr error) {
   130  	defer p.comm.closeParent()
   131  
   132  	if p.process.IOPriority != nil {
   133  		if err := setIOPriority(p.process.IOPriority); err != nil {
   134  			return err
   135  		}
   136  	}
   137  
   138  	// get the "before" value of oom kill count
   139  	oom, _ := p.manager.OOMKillCount()
   140  
   141  	// When greater or equal to zero, it will set a temporary single CPU
   142  	// affinity before cgroup cpuset transition, this handles a corner
   143  	// case when joining a container having all the processes running
   144  	// exclusively on isolated CPU cores to force the kernel to schedule
   145  	// runc process on the first CPU core within the cgroups cpuset.
   146  	// The introduction of the kernel commit 46a87b3851f0d6eb05e6d83d5c5a30df0eca8f76
   147  	// in 5.7 has affected this deterministic scheduling behavior by
   148  	// distributing tasks across CPU cores within the cgroups cpuset.
   149  	// Some intensive real-time application are relying on this
   150  	// deterministic behavior and use the first CPU core to run a slow
   151  	// thread while other CPU cores are fully used by real-time threads
   152  	// with SCHED_FIFO policy. Such applications prevent runc process
   153  	// from joining a container when the runc process is randomly
   154  	// scheduled on a CPU core owned by a real-time thread.
   155  	cpuAffinity := -1
   156  	resetCPUAffinity := true
   157  
   158  	if len(p.manager.GetPaths()) > 0 {
   159  		// Get the target container cgroup.
   160  		if cg, err := p.manager.GetCgroups(); err != nil {
   161  			// Close the pipe to not be blocked in the parent.
   162  			p.comm.closeChild()
   163  			return fmt.Errorf("getting container cgroups: %w", err)
   164  		} else if cg.CpusetCpus != "" {
   165  			definitive := false
   166  
   167  			_, annotations := utils.Annotations(p.config.Config.Labels)
   168  			cpuAffinity, definitive, err = isolatedCPUAffinityTransition(
   169  				os.DirFS("/"),
   170  				cg.CpusetCpus,
   171  				annotations,
   172  			)
   173  			if err != nil {
   174  				// Close the pipe to not be blocked in the parent.
   175  				p.comm.closeChild()
   176  				return fmt.Errorf("getting CPU affinity: %w", err)
   177  			} else if definitive {
   178  				resetCPUAffinity = false
   179  			}
   180  		}
   181  	}
   182  
   183  	var err error
   184  
   185  	if cpuAffinity < 0 {
   186  		err = p.cmd.Start()
   187  	} else {
   188  		err = startCommandWithCPUAffinity(p.cmd, cpuAffinity)
   189  	}
   190  
   191  	// Close the write-side of the pipes (controlled by child).
   192  	p.comm.closeChild()
   193  	if err != nil {
   194  		return fmt.Errorf("error starting setns process: %w", err)
   195  	}
   196  
   197  	waitInit := initWaiter(p.comm.initSockParent)
   198  	defer func() {
   199  		if retErr != nil {
   200  			if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom {
   201  				// Someone in this cgroup was killed, this _might_ be us.
   202  				retErr = fmt.Errorf("%w (possibly OOM-killed)", retErr)
   203  			}
   204  			werr := <-waitInit
   205  			if werr != nil {
   206  				logrus.WithError(werr).Warn()
   207  			}
   208  			err := ignoreTerminateErrors(p.terminate())
   209  			if err != nil {
   210  				logrus.WithError(err).Warn("unable to terminate setnsProcess")
   211  			}
   212  		}
   213  	}()
   214  
   215  	if p.bootstrapData != nil {
   216  		if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil {
   217  			return fmt.Errorf("error copying bootstrap data to pipe: %w", err)
   218  		}
   219  	}
   220  	err = <-waitInit
   221  	if err != nil {
   222  		return err
   223  	}
   224  	if err := p.execSetns(); err != nil {
   225  		return fmt.Errorf("error executing setns process: %w", err)
   226  	}
   227  	for _, path := range p.cgroupPaths {
   228  		if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups {
   229  			// On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY.
   230  			// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
   231  			// Try to join the cgroup of InitProcessPid.
   232  			if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 {
   233  				initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
   234  				initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
   235  				if initCgErr == nil {
   236  					if initCgPath, ok := initCg[""]; ok {
   237  						initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath)
   238  						logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)",
   239  							p.pid(), p.cgroupPaths, err, initCg, initCgDirpath)
   240  						// NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container.
   241  						err = cgroups.WriteCgroupProc(initCgDirpath, p.pid())
   242  					}
   243  				}
   244  			}
   245  			if err != nil {
   246  				return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
   247  			}
   248  		}
   249  	}
   250  
   251  	if resetCPUAffinity {
   252  		// Fix the container process CPU affinity to match container cgroup cpuset,
   253  		// since kernel 6.2, the runc CPU affinity might affect the container process
   254  		// CPU affinity after cgroup cpuset transition, by example if runc is running
   255  		// with CPU affinity 0-1 and container process has cpuset.cpus set to 1-2, the
   256  		// resulting container process CPU affinity will be 1 instead of 1-2.
   257  		if err := fixProcessCPUAffinity(p.pid(), p.manager); err != nil {
   258  			return fmt.Errorf("error resetting container process CPU affinity: %w", err)
   259  		}
   260  	}
   261  
   262  	if p.intelRdtPath != "" {
   263  		// if Intel RDT "resource control" filesystem path exists
   264  		_, err := os.Stat(p.intelRdtPath)
   265  		if err == nil {
   266  			if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
   267  				return fmt.Errorf("error adding pid %d to Intel RDT: %w", p.pid(), err)
   268  			}
   269  		}
   270  	}
   271  
   272  	if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil {
   273  		return fmt.Errorf("error writing config to pipe: %w", err)
   274  	}
   275  
   276  	var seenProcReady bool
   277  	ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error {
   278  		switch sync.Type {
   279  		case procReady:
   280  			seenProcReady = true
   281  			// Set rlimits, this has to be done here because we lose permissions
   282  			// to raise the limits once we enter a user-namespace
   283  			if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
   284  				return fmt.Errorf("error setting rlimits for ready process: %w", err)
   285  			}
   286  
   287  			// Sync with child.
   288  			if err := writeSync(p.comm.syncSockParent, procRun); err != nil {
   289  				return err
   290  			}
   291  		case procHooks:
   292  			// This shouldn't happen.
   293  			panic("unexpected procHooks in setns")
   294  		case procMountPlease:
   295  			// This shouldn't happen.
   296  			panic("unexpected procMountPlease in setns")
   297  		case procSeccomp:
   298  			if p.config.Config.Seccomp.ListenerPath == "" {
   299  				return errors.New("seccomp listenerPath is not set")
   300  			}
   301  			if sync.Arg == nil {
   302  				return fmt.Errorf("sync %q is missing an argument", sync.Type)
   303  			}
   304  			var srcFd int
   305  			if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil {
   306  				return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err)
   307  			}
   308  			seccompFd, err := pidGetFd(p.pid(), srcFd)
   309  			if err != nil {
   310  				return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err)
   311  			}
   312  			defer seccompFd.Close()
   313  			// We have a copy, the child can keep working. We don't need to
   314  			// wait for the seccomp notify listener to get the fd before we
   315  			// permit the child to continue because the child will happily wait
   316  			// for the listener if it hits SCMP_ACT_NOTIFY.
   317  			if err := writeSync(p.comm.syncSockParent, procSeccompDone); err != nil {
   318  				return err
   319  			}
   320  
   321  			bundle, annotations := utils.Annotations(p.config.Config.Labels)
   322  			containerProcessState := &specs.ContainerProcessState{
   323  				Version:  specs.Version,
   324  				Fds:      []string{specs.SeccompFdName},
   325  				Pid:      p.cmd.Process.Pid,
   326  				Metadata: p.config.Config.Seccomp.ListenerMetadata,
   327  				State: specs.State{
   328  					Version:     specs.Version,
   329  					ID:          p.config.ContainerID,
   330  					Status:      specs.StateRunning,
   331  					Pid:         p.initProcessPid,
   332  					Bundle:      bundle,
   333  					Annotations: annotations,
   334  				},
   335  			}
   336  			if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
   337  				containerProcessState, seccompFd); err != nil {
   338  				return err
   339  			}
   340  		default:
   341  			return errors.New("invalid JSON payload from child")
   342  		}
   343  		return nil
   344  	})
   345  
   346  	if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil {
   347  		return err
   348  	}
   349  	if !seenProcReady && ierr == nil {
   350  		ierr = errors.New("procReady not received")
   351  	}
   352  	// Must be done after Shutdown so the child will exit and we can wait for it.
   353  	if ierr != nil {
   354  		_, _ = p.wait()
   355  		return ierr
   356  	}
   357  	return nil
   358  }
   359  
   360  // execSetns runs the process that executes C code to perform the setns calls
   361  // because setns support requires the C process to fork off a child and perform the setns
   362  // before the go runtime boots, we wait on the process to die and receive the child's pid
   363  // over the provided pipe.
   364  func (p *setnsProcess) execSetns() error {
   365  	status, err := p.cmd.Process.Wait()
   366  	if err != nil {
   367  		_ = p.cmd.Wait()
   368  		return fmt.Errorf("error waiting on setns process to finish: %w", err)
   369  	}
   370  	if !status.Success() {
   371  		_ = p.cmd.Wait()
   372  		return &exec.ExitError{ProcessState: status}
   373  	}
   374  	var pid *pid
   375  	if err := json.NewDecoder(p.comm.initSockParent).Decode(&pid); err != nil {
   376  		_ = p.cmd.Wait()
   377  		return fmt.Errorf("error reading pid from init pipe: %w", err)
   378  	}
   379  
   380  	// Clean up the zombie parent process
   381  	// On Unix systems FindProcess always succeeds.
   382  	firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
   383  
   384  	// Ignore the error in case the child has already been reaped for any reason
   385  	_, _ = firstChildProcess.Wait()
   386  
   387  	process, err := os.FindProcess(pid.Pid)
   388  	if err != nil {
   389  		return err
   390  	}
   391  	p.cmd.Process = process
   392  	p.process.ops = p
   393  	return nil
   394  }
   395  
   396  // terminate sends a SIGKILL to the forked process for the setns routine then waits to
   397  // avoid the process becoming a zombie.
   398  func (p *setnsProcess) terminate() error {
   399  	if p.cmd.Process == nil {
   400  		return nil
   401  	}
   402  	err := p.cmd.Process.Kill()
   403  	if _, werr := p.wait(); err == nil {
   404  		err = werr
   405  	}
   406  	return err
   407  }
   408  
   409  func (p *setnsProcess) wait() (*os.ProcessState, error) {
   410  	err := p.cmd.Wait()
   411  
   412  	// Return actual ProcessState even on Wait error
   413  	return p.cmd.ProcessState, err
   414  }
   415  
   416  func (p *setnsProcess) pid() int {
   417  	return p.cmd.Process.Pid
   418  }
   419  
   420  func (p *setnsProcess) externalDescriptors() []string {
   421  	return p.fds
   422  }
   423  
   424  func (p *setnsProcess) setExternalDescriptors(newFds []string) {
   425  	p.fds = newFds
   426  }
   427  
   428  func (p *setnsProcess) forwardChildLogs() chan error {
   429  	return logs.ForwardLogs(p.comm.logPipeParent)
   430  }
   431  
// initProcess describes the init process of a container being created.
//
// Field usage as visible in this file:
//   - cmd: the command that is started; after waitForChildExit its Process
//     field refers to the final child.
//   - comm: parent/child communication channels.
//   - config: configuration written as JSON to the init socket.
//   - manager: cgroup manager applied to the process.
//   - intelRdtManager: optional Intel RDT manager; nil when unused.
//   - container: the container this process belongs to.
//   - fds: external descriptor names (see externalDescriptors).
//   - process: the Process this initProcess serves; its ops field is pointed
//     back at the initProcess.
//   - bootstrapData: data copied to the init socket after the command starts.
type initProcess struct {
	cmd             *exec.Cmd
	comm            *processComm
	config          *initConfig
	manager         cgroups.Manager
	intelRdtManager *intelrdt.Manager
	container       *Container
	fds             []string
	process         *Process
	bootstrapData   io.Reader
}
   443  
   444  func (p *initProcess) pid() int {
   445  	return p.cmd.Process.Pid
   446  }
   447  
   448  func (p *initProcess) externalDescriptors() []string {
   449  	return p.fds
   450  }
   451  
   452  // getChildPid receives the final child's pid over the provided pipe.
   453  func (p *initProcess) getChildPid() (int, error) {
   454  	var pid pid
   455  	if err := json.NewDecoder(p.comm.initSockParent).Decode(&pid); err != nil {
   456  		_ = p.cmd.Wait()
   457  		return -1, err
   458  	}
   459  
   460  	// Clean up the zombie parent process
   461  	// On Unix systems FindProcess always succeeds.
   462  	firstChildProcess, _ := os.FindProcess(pid.PidFirstChild)
   463  
   464  	// Ignore the error in case the child has already been reaped for any reason
   465  	_, _ = firstChildProcess.Wait()
   466  
   467  	return pid.Pid, nil
   468  }
   469  
   470  func (p *initProcess) waitForChildExit(childPid int) error {
   471  	status, err := p.cmd.Process.Wait()
   472  	if err != nil {
   473  		_ = p.cmd.Wait()
   474  		return err
   475  	}
   476  	if !status.Success() {
   477  		_ = p.cmd.Wait()
   478  		return &exec.ExitError{ProcessState: status}
   479  	}
   480  
   481  	process, err := os.FindProcess(childPid)
   482  	if err != nil {
   483  		return err
   484  	}
   485  	p.cmd.Process = process
   486  	p.process.ops = p
   487  	return nil
   488  }
   489  
// mountSourceRequestFn asks for a mount source to be created for the given
// mount configuration and returns it, or an error if that was not possible.
type mountSourceRequestFn func(*configs.Mount) (*mountSource, error)
   491  
   492  // goCreateMountSources spawns a goroutine which creates open_tree(2)-style
   493  // mountfds based on the requested configs.Mount configuration. The returned
   494  // requestFn and cancelFn are used to interact with the goroutine.
   495  //
   496  // The caller of the returned mountSourceRequestFn is responsible for closing
   497  // the returned file.
   498  func (p *initProcess) goCreateMountSources(ctx context.Context) (mountSourceRequestFn, context.CancelFunc, error) {
   499  	type response struct {
   500  		src *mountSource
   501  		err error
   502  	}
   503  
   504  	errCh := make(chan error, 1)
   505  	requestCh := make(chan *configs.Mount)
   506  	responseCh := make(chan response)
   507  
   508  	ctx, cancelFn := context.WithTimeout(ctx, 1*time.Minute)
   509  	go func() {
   510  		// We lock this thread because we need to setns(2) here. There is no
   511  		// UnlockOSThread() here, to ensure that the Go runtime will kill this
   512  		// thread once this goroutine returns (ensuring no other goroutines run
   513  		// in this context).
   514  		runtime.LockOSThread()
   515  
   516  		// Detach from the shared fs of the rest of the Go process in order to
   517  		// be able to CLONE_NEWNS.
   518  		if err := unix.Unshare(unix.CLONE_FS); err != nil {
   519  			err = os.NewSyscallError("unshare(CLONE_FS)", err)
   520  			errCh <- fmt.Errorf("mount source thread: %w", err)
   521  			return
   522  		}
   523  
   524  		// Attach to the container's mount namespace.
   525  		nsFd, err := os.Open(fmt.Sprintf("/proc/%d/ns/mnt", p.pid()))
   526  		if err != nil {
   527  			errCh <- fmt.Errorf("mount source thread: open container mntns: %w", err)
   528  			return
   529  		}
   530  		defer nsFd.Close()
   531  		if err := unix.Setns(int(nsFd.Fd()), unix.CLONE_NEWNS); err != nil {
   532  			err = os.NewSyscallError("setns", err)
   533  			errCh <- fmt.Errorf("mount source thread: join container mntns: %w", err)
   534  			return
   535  		}
   536  
   537  		// No errors during setup!
   538  		close(errCh)
   539  		logrus.Debugf("mount source thread: successfully running in container mntns")
   540  
   541  		nsHandles := new(userns.Handles)
   542  		defer nsHandles.Release()
   543  	loop:
   544  		for {
   545  			select {
   546  			case m, ok := <-requestCh:
   547  				if !ok {
   548  					break loop
   549  				}
   550  				src, err := mountFd(nsHandles, m)
   551  				logrus.Debugf("mount source thread: handling request for %q: %v %v", m.Source, src, err)
   552  				responseCh <- response{
   553  					src: src,
   554  					err: err,
   555  				}
   556  			case <-ctx.Done():
   557  				break loop
   558  			}
   559  		}
   560  		logrus.Debugf("mount source thread: closing thread: %v", ctx.Err())
   561  		close(responseCh)
   562  	}()
   563  
   564  	// Check for setup errors.
   565  	err := <-errCh
   566  	if err != nil {
   567  		cancelFn()
   568  		return nil, nil, err
   569  	}
   570  
   571  	// TODO: Switch to context.AfterFunc when we switch to Go 1.21.
   572  	var requestChCloseOnce sync.Once
   573  	requestFn := func(m *configs.Mount) (*mountSource, error) {
   574  		var err error
   575  		select {
   576  		case requestCh <- m:
   577  			select {
   578  			case resp, ok := <-responseCh:
   579  				if ok {
   580  					return resp.src, resp.err
   581  				}
   582  			case <-ctx.Done():
   583  				err = fmt.Errorf("receive mount source context cancelled: %w", ctx.Err())
   584  			}
   585  		case <-ctx.Done():
   586  			err = fmt.Errorf("send mount request cancelled: %w", ctx.Err())
   587  		}
   588  		requestChCloseOnce.Do(func() { close(requestCh) })
   589  		return nil, err
   590  	}
   591  	return requestFn, cancelFn, nil
   592  }
   593  
   594  func (p *initProcess) start() (retErr error) {
   595  	defer p.comm.closeParent()
   596  	err := p.cmd.Start()
   597  	p.process.ops = p
   598  	// close the child-side of the pipes (controlled by child)
   599  	p.comm.closeChild()
   600  	if err != nil {
   601  		p.process.ops = nil
   602  		return fmt.Errorf("unable to start init: %w", err)
   603  	}
   604  
   605  	waitInit := initWaiter(p.comm.initSockParent)
   606  	defer func() {
   607  		if retErr != nil {
   608  			// Find out if init is killed by the kernel's OOM killer.
   609  			// Get the count before killing init as otherwise cgroup
   610  			// might be removed by systemd.
   611  			oom, err := p.manager.OOMKillCount()
   612  			if err != nil {
   613  				logrus.WithError(err).Warn("unable to get oom kill count")
   614  			} else if oom > 0 {
   615  				// Does not matter what the particular error was,
   616  				// its cause is most probably OOM, so report that.
   617  				const oomError = "container init was OOM-killed (memory limit too low?)"
   618  
   619  				if logrus.GetLevel() >= logrus.DebugLevel {
   620  					// Only show the original error if debug is set,
   621  					// as it is not generally very useful.
   622  					retErr = fmt.Errorf(oomError+": %w", retErr)
   623  				} else {
   624  					retErr = errors.New(oomError)
   625  				}
   626  			}
   627  
   628  			werr := <-waitInit
   629  			if werr != nil {
   630  				logrus.WithError(werr).Warn()
   631  			}
   632  
   633  			// Terminate the process to ensure we can remove cgroups.
   634  			if err := ignoreTerminateErrors(p.terminate()); err != nil {
   635  				logrus.WithError(err).Warn("unable to terminate initProcess")
   636  			}
   637  
   638  			_ = p.manager.Destroy()
   639  			if p.intelRdtManager != nil {
   640  				_ = p.intelRdtManager.Destroy()
   641  			}
   642  		}
   643  	}()
   644  
   645  	// Do this before syncing with child so that no children can escape the
   646  	// cgroup. We don't need to worry about not doing this and not being root
   647  	// because we'd be using the rootless cgroup manager in that case.
   648  	if err := p.manager.Apply(p.pid()); err != nil {
   649  		return fmt.Errorf("unable to apply cgroup configuration: %w", err)
   650  	}
   651  	if p.intelRdtManager != nil {
   652  		if err := p.intelRdtManager.Apply(p.pid()); err != nil {
   653  			return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)
   654  		}
   655  	}
   656  	if _, err := io.Copy(p.comm.initSockParent, p.bootstrapData); err != nil {
   657  		return fmt.Errorf("can't copy bootstrap data to pipe: %w", err)
   658  	}
   659  	err = <-waitInit
   660  	if err != nil {
   661  		return err
   662  	}
   663  
   664  	childPid, err := p.getChildPid()
   665  	if err != nil {
   666  		return fmt.Errorf("can't get final child's PID from pipe: %w", err)
   667  	}
   668  
   669  	// Save the standard descriptor names before the container process
   670  	// can potentially move them (e.g., via dup2()).  If we don't do this now,
   671  	// we won't know at checkpoint time which file descriptor to look up.
   672  	fds, err := getPipeFds(childPid)
   673  	if err != nil {
   674  		return fmt.Errorf("error getting pipe fds for pid %d: %w", childPid, err)
   675  	}
   676  	p.setExternalDescriptors(fds)
   677  
   678  	// Wait for our first child to exit
   679  	if err := p.waitForChildExit(childPid); err != nil {
   680  		return fmt.Errorf("error waiting for our first child to exit: %w", err)
   681  	}
   682  
   683  	// Spin up a goroutine to handle remapping mount requests by runc init.
   684  	// There is no point doing this for rootless containers because they cannot
   685  	// configure MOUNT_ATTR_IDMAP, nor do OPEN_TREE_CLONE. We could just
   686  	// service plain-open requests for plain bind-mounts but there's no need
   687  	// (rootless containers will never have permission issues on a source mount
   688  	// that the parent process can help with -- they are the same user).
   689  	var mountRequest mountSourceRequestFn
   690  	if !p.container.config.RootlessEUID {
   691  		request, cancel, err := p.goCreateMountSources(context.Background())
   692  		if err != nil {
   693  			return fmt.Errorf("error spawning mount remapping thread: %w", err)
   694  		}
   695  		defer cancel()
   696  		mountRequest = request
   697  	}
   698  
   699  	if err := p.createNetworkInterfaces(); err != nil {
   700  		return fmt.Errorf("error creating network interfaces: %w", err)
   701  	}
   702  	if err := p.updateSpecState(); err != nil {
   703  		return fmt.Errorf("error updating spec state: %w", err)
   704  	}
   705  	if err := utils.WriteJSON(p.comm.initSockParent, p.config); err != nil {
   706  		return fmt.Errorf("error sending config to init process: %w", err)
   707  	}
   708  
   709  	var seenProcReady bool
   710  	ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error {
   711  		switch sync.Type {
   712  		case procMountPlease:
   713  			if mountRequest == nil {
   714  				return fmt.Errorf("cannot fulfil mount requests as a rootless user")
   715  			}
   716  			var m *configs.Mount
   717  			if sync.Arg == nil {
   718  				return fmt.Errorf("sync %q is missing an argument", sync.Type)
   719  			}
   720  			if err := json.Unmarshal(*sync.Arg, &m); err != nil {
   721  				return fmt.Errorf("sync %q passed invalid mount arg: %w", sync.Type, err)
   722  			}
   723  			mnt, err := mountRequest(m)
   724  			if err != nil {
   725  				return fmt.Errorf("failed to fulfil mount request: %w", err)
   726  			}
   727  			defer mnt.file.Close()
   728  
   729  			arg, err := json.Marshal(mnt)
   730  			if err != nil {
   731  				return fmt.Errorf("sync %q failed to marshal mountSource: %w", sync.Type, err)
   732  			}
   733  			argMsg := json.RawMessage(arg)
   734  			if err := doWriteSync(p.comm.syncSockParent, syncT{
   735  				Type: procMountFd,
   736  				Arg:  &argMsg,
   737  				File: mnt.file,
   738  			}); err != nil {
   739  				return err
   740  			}
   741  		case procSeccomp:
   742  			if p.config.Config.Seccomp.ListenerPath == "" {
   743  				return errors.New("seccomp listenerPath is not set")
   744  			}
   745  			var srcFd int
   746  			if sync.Arg == nil {
   747  				return fmt.Errorf("sync %q is missing an argument", sync.Type)
   748  			}
   749  			if err := json.Unmarshal(*sync.Arg, &srcFd); err != nil {
   750  				return fmt.Errorf("sync %q passed invalid fd arg: %w", sync.Type, err)
   751  			}
   752  			seccompFd, err := pidGetFd(p.pid(), srcFd)
   753  			if err != nil {
   754  				return fmt.Errorf("sync %q get fd %d from child failed: %w", sync.Type, srcFd, err)
   755  			}
   756  			defer seccompFd.Close()
   757  			// We have a copy, the child can keep working. We don't need to
   758  			// wait for the seccomp notify listener to get the fd before we
   759  			// permit the child to continue because the child will happily wait
   760  			// for the listener if it hits SCMP_ACT_NOTIFY.
   761  			if err := writeSync(p.comm.syncSockParent, procSeccompDone); err != nil {
   762  				return err
   763  			}
   764  
   765  			s, err := p.container.currentOCIState()
   766  			if err != nil {
   767  				return err
   768  			}
   769  
   770  			// initProcessStartTime hasn't been set yet.
   771  			s.Pid = p.cmd.Process.Pid
   772  			s.Status = specs.StateCreating
   773  			containerProcessState := &specs.ContainerProcessState{
   774  				Version:  specs.Version,
   775  				Fds:      []string{specs.SeccompFdName},
   776  				Pid:      s.Pid,
   777  				Metadata: p.config.Config.Seccomp.ListenerMetadata,
   778  				State:    *s,
   779  			}
   780  			if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
   781  				containerProcessState, seccompFd); err != nil {
   782  				return err
   783  			}
   784  		case procReady:
   785  			seenProcReady = true
   786  			// Set rlimits, this has to be done here because we lose permissions
   787  			// to raise the limits once we enter a user-namespace
   788  			if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
   789  				return fmt.Errorf("error setting rlimits for ready process: %w", err)
   790  			}
   791  
   792  			// generate a timestamp indicating when the container was started
   793  			p.container.created = time.Now().UTC()
   794  			p.container.state = &createdState{
   795  				c: p.container,
   796  			}
   797  
   798  			// NOTE: If the procRun state has been synced and the
   799  			// runc-create process has been killed for some reason,
   800  			// the runc-init[2:stage] process will be leaky. And
   801  			// the runc command also fails to parse root directory
   802  			// because the container doesn't have state.json.
   803  			//
   804  			// In order to cleanup the runc-init[2:stage] by
   805  			// runc-delete/stop, we should store the status before
   806  			// procRun sync.
   807  			state, uerr := p.container.updateState(p)
   808  			if uerr != nil {
   809  				return fmt.Errorf("unable to store init state: %w", uerr)
   810  			}
   811  			p.container.initProcessStartTime = state.InitProcessStartTime
   812  
   813  			// Sync with child.
   814  			if err := writeSync(p.comm.syncSockParent, procRun); err != nil {
   815  				return err
   816  			}
   817  		case procHooks:
   818  			// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
   819  			if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
   820  				return fmt.Errorf("error setting cgroup config for procHooks process: %w", err)
   821  			}
   822  			// Reset container process CPU affinity to match container cgroup cpuset,
   823  			// since kernel 6.2, the runc CPU affinity might affect the container process
   824  			// CPU affinity after cgroup cpuset transition, by example if runc is running
   825  			// with CPU affinity 0-1 and container process has cpuset.cpus set to 1-2, the
   826  			// resulting container process CPU affinity will be 1 instead of 1-2.
   827  			if err := fixProcessCPUAffinity(p.pid(), p.manager); err != nil {
   828  				return fmt.Errorf("error resetting container process CPU affinity: %w", err)
   829  			}
   830  			if p.intelRdtManager != nil {
   831  				if err := p.intelRdtManager.Set(p.config.Config); err != nil {
   832  					return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err)
   833  				}
   834  			}
   835  			if len(p.config.Config.Hooks) != 0 {
   836  				s, err := p.container.currentOCIState()
   837  				if err != nil {
   838  					return err
   839  				}
   840  				// initProcessStartTime hasn't been set yet.
   841  				s.Pid = p.cmd.Process.Pid
   842  				s.Status = specs.StateCreating
   843  				hooks := p.config.Config.Hooks
   844  
   845  				if err := hooks.Run(configs.Prestart, s); err != nil {
   846  					return err
   847  				}
   848  				if err := hooks.Run(configs.CreateRuntime, s); err != nil {
   849  					return err
   850  				}
   851  			}
   852  			// Sync with child.
   853  			if err := writeSync(p.comm.syncSockParent, procHooksDone); err != nil {
   854  				return err
   855  			}
   856  		default:
   857  			return errors.New("invalid JSON payload from child")
   858  		}
   859  		return nil
   860  	})
   861  
   862  	if err := p.comm.syncSockParent.Shutdown(unix.SHUT_WR); err != nil && ierr == nil {
   863  		return err
   864  	}
   865  	if !seenProcReady && ierr == nil {
   866  		ierr = errors.New("procReady not received")
   867  	}
   868  	if ierr != nil {
   869  		return fmt.Errorf("error during container init: %w", ierr)
   870  	}
   871  	return nil
   872  }
   873  
   874  func (p *initProcess) wait() (*os.ProcessState, error) {
   875  	err := p.cmd.Wait()
   876  	return p.cmd.ProcessState, err
   877  }
   878  
   879  func (p *initProcess) terminate() error {
   880  	if p.cmd.Process == nil {
   881  		return nil
   882  	}
   883  	err := p.cmd.Process.Kill()
   884  	if _, werr := p.wait(); err == nil {
   885  		err = werr
   886  	}
   887  	return err
   888  }
   889  
   890  func (p *initProcess) startTime() (uint64, error) {
   891  	stat, err := system.Stat(p.pid())
   892  	return stat.StartTime, err
   893  }
   894  
   895  func (p *initProcess) updateSpecState() error {
   896  	s, err := p.container.currentOCIState()
   897  	if err != nil {
   898  		return err
   899  	}
   900  
   901  	p.config.SpecState = s
   902  	return nil
   903  }
   904  
   905  func (p *initProcess) createNetworkInterfaces() error {
   906  	for _, config := range p.config.Config.Networks {
   907  		strategy, err := getStrategy(config.Type)
   908  		if err != nil {
   909  			return err
   910  		}
   911  		n := &network{
   912  			Network: *config,
   913  		}
   914  		if err := strategy.create(n, p.pid()); err != nil {
   915  			return err
   916  		}
   917  		p.config.Networks = append(p.config.Networks, n)
   918  	}
   919  	return nil
   920  }
   921  
   922  func (p *initProcess) signal(sig os.Signal) error {
   923  	s, ok := sig.(unix.Signal)
   924  	if !ok {
   925  		return errors.New("os: unsupported signal type")
   926  	}
   927  	return unix.Kill(p.pid(), s)
   928  }
   929  
   930  func (p *initProcess) setExternalDescriptors(newFds []string) {
   931  	p.fds = newFds
   932  }
   933  
   934  func (p *initProcess) forwardChildLogs() chan error {
   935  	return logs.ForwardLogs(p.comm.logPipeParent)
   936  }
   937  
   938  func pidGetFd(pid, srcFd int) (*os.File, error) {
   939  	pidFd, err := unix.PidfdOpen(pid, 0)
   940  	if err != nil {
   941  		return nil, os.NewSyscallError("pidfd_open", err)
   942  	}
   943  	defer unix.Close(pidFd)
   944  	fd, err := unix.PidfdGetfd(pidFd, srcFd, 0)
   945  	if err != nil {
   946  		return nil, os.NewSyscallError("pidfd_getfd", err)
   947  	}
   948  	return os.NewFile(uintptr(fd), "[pidfd_getfd]"), nil
   949  }
   950  
   951  func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, file *os.File) error {
   952  	conn, err := net.Dial("unix", listenerPath)
   953  	if err != nil {
   954  		return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err)
   955  	}
   956  
   957  	socket, err := conn.(*net.UnixConn).File()
   958  	if err != nil {
   959  		return fmt.Errorf("cannot get seccomp socket: %w", err)
   960  	}
   961  	defer socket.Close()
   962  
   963  	b, err := json.Marshal(state)
   964  	if err != nil {
   965  		return fmt.Errorf("cannot marshall seccomp state: %w", err)
   966  	}
   967  
   968  	if err := utils.SendRawFd(socket, string(b), file.Fd()); err != nil {
   969  		return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err)
   970  	}
   971  	runtime.KeepAlive(file)
   972  	return nil
   973  }
   974  
   975  func getPipeFds(pid int) ([]string, error) {
   976  	fds := make([]string, 3)
   977  
   978  	dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
   979  	for i := 0; i < 3; i++ {
   980  		// XXX: This breaks if the path is not a valid symlink (which can
   981  		//      happen in certain particularly unlucky mount namespace setups).
   982  		f := filepath.Join(dirPath, strconv.Itoa(i))
   983  		target, err := os.Readlink(f)
   984  		if err != nil {
   985  			// Ignore permission errors, for rootless containers and other
   986  			// non-dumpable processes. if we can't get the fd for a particular
   987  			// file, there's not much we can do.
   988  			if os.IsPermission(err) {
   989  				continue
   990  			}
   991  			return fds, err
   992  		}
   993  		fds[i] = target
   994  	}
   995  	return fds, nil
   996  }
   997  
   998  // InitializeIO creates pipes for use with the process's stdio and returns the
   999  // opposite side for each. Do not use this if you want to have a pseudoterminal
  1000  // set up for you by libcontainer (TODO: fix that too).
  1001  // TODO: This is mostly unnecessary, and should be handled by clients.
  1002  func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
  1003  	var fds []uintptr
  1004  	i = &IO{}
  1005  	// cleanup in case of an error
  1006  	defer func() {
  1007  		if err != nil {
  1008  			for _, fd := range fds {
  1009  				_ = unix.Close(int(fd))
  1010  			}
  1011  		}
  1012  	}()
  1013  	// STDIN
  1014  	r, w, err := os.Pipe()
  1015  	if err != nil {
  1016  		return nil, err
  1017  	}
  1018  	fds = append(fds, r.Fd(), w.Fd())
  1019  	p.Stdin, i.Stdin = r, w
  1020  	// STDOUT
  1021  	if r, w, err = os.Pipe(); err != nil {
  1022  		return nil, err
  1023  	}
  1024  	fds = append(fds, r.Fd(), w.Fd())
  1025  	p.Stdout, i.Stdout = w, r
  1026  	// STDERR
  1027  	if r, w, err = os.Pipe(); err != nil {
  1028  		return nil, err
  1029  	}
  1030  	fds = append(fds, r.Fd(), w.Fd())
  1031  	p.Stderr, i.Stderr = w, r
  1032  	// change ownership of the pipes in case we are in a user namespace
  1033  	for _, fd := range fds {
  1034  		if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
  1035  			return nil, &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
  1036  		}
  1037  	}
  1038  	return i, nil
  1039  }
  1040  
  1041  // initWaiter returns a channel to wait on for making sure
  1042  // runc init has finished the initial setup.
  1043  func initWaiter(r io.Reader) chan error {
  1044  	ch := make(chan error, 1)
  1045  	go func() {
  1046  		defer close(ch)
  1047  
  1048  		inited := make([]byte, 1)
  1049  		n, err := r.Read(inited)
  1050  		if err == nil {
  1051  			if n < 1 {
  1052  				err = errors.New("short read")
  1053  			} else if inited[0] != 0 {
  1054  				err = fmt.Errorf("unexpected %d != 0", inited[0])
  1055  			} else {
  1056  				ch <- nil
  1057  				return
  1058  			}
  1059  		}
  1060  		ch <- fmt.Errorf("waiting for init preliminary setup: %w", err)
  1061  	}()
  1062  
  1063  	return ch
  1064  }
  1065  
  1066  func setIOPriority(ioprio *configs.IOPriority) error {
  1067  	const ioprioWhoPgrp = 1
  1068  
  1069  	class, ok := configs.IOPrioClassMapping[ioprio.Class]
  1070  	if !ok {
  1071  		return fmt.Errorf("invalid io priority class: %s", ioprio.Class)
  1072  	}
  1073  
  1074  	// Combine class and priority into a single value
  1075  	// https://github.com/torvalds/linux/blob/v5.18/include/uapi/linux/ioprio.h#L5-L17
  1076  	iop := (class << 13) | ioprio.Priority
  1077  	_, _, errno := unix.RawSyscall(unix.SYS_IOPRIO_SET, ioprioWhoPgrp, 0, uintptr(iop))
  1078  	if errno != 0 {
  1079  		return fmt.Errorf("failed to set io priority: %w", errno)
  1080  	}
  1081  
  1082  	return nil
  1083  }
  1084  
  1085  // isolatedCPUAffinityTransition returns a CPU affinity if necessary based on heuristics
  1086  // and org.opencontainers.runc.exec.isolated-cpu-affinity-transition annotation value.
  1087  func isolatedCPUAffinityTransition(rootFS fs.FS, cpusetList string, annotations map[string]string) (int, bool, error) {
  1088  	const (
  1089  		isolatedCPUAffinityTransitionAnnotation = "org.opencontainers.runc.exec.isolated-cpu-affinity-transition"
  1090  		nohzFullParam                           = "nohz_full"
  1091  	)
  1092  
  1093  	definitive := false
  1094  
  1095  	transition := annotations[isolatedCPUAffinityTransitionAnnotation]
  1096  	switch transition {
  1097  	case "temporary":
  1098  	case "definitive":
  1099  		definitive = true
  1100  	default:
  1101  		if transition != "" {
  1102  			return -1, false, fmt.Errorf(
  1103  				"unknown transition value %q for annotation %s",
  1104  				transition, isolatedCPUAffinityTransitionAnnotation,
  1105  			)
  1106  		}
  1107  		return -1, false, nil
  1108  	}
  1109  
  1110  	kernelParams, err := kernelparam.LookupKernelBootParameters(
  1111  		rootFS,
  1112  		nohzFullParam,
  1113  	)
  1114  	if err != nil {
  1115  		// If /proc/cmdline does not exist or isn't readable, continue to read
  1116  		// nohz_full from sysfs below.
  1117  		if !errors.Is(err, os.ErrNotExist) && !errors.Is(err, os.ErrPermission) {
  1118  			return -1, false, err
  1119  		}
  1120  	}
  1121  
  1122  	// First get nohz_full value from kernel boot params, if not
  1123  	// present, get the value from sysfs, to cover the case where
  1124  	// CONFIG_NO_HZ_FULL_ALL is set, it also makes the integration
  1125  	// tests not dependent on /sys/devices/system/cpu/nohz_full.
  1126  	isolatedList := kernelParams[nohzFullParam]
  1127  	if isolatedList == "" {
  1128  		// Get the isolated CPU list, the error is not checked here because
  1129  		// no matter what the error is, it returns without error the same way
  1130  		// as with empty data.
  1131  		isolatedData, _ := fs.ReadFile(rootFS, "sys/devices/system/cpu/nohz_full")
  1132  		isolatedList = string(bytes.TrimSpace(isolatedData))
  1133  		if isolatedList == "" || isolatedList == "(null)" {
  1134  			return -1, false, nil
  1135  		}
  1136  	}
  1137  
  1138  	cpu, err := getEligibleCPU(cpusetList, isolatedList)
  1139  	if err != nil {
  1140  		return -1, false, fmt.Errorf("getting eligible cpu: %w", err)
  1141  	} else if cpu == -1 {
  1142  		definitive = false
  1143  	}
  1144  
  1145  	return cpu, definitive, nil
  1146  }
  1147  
  1148  // getEligibleCPU returns the first eligible CPU for CPU affinity before
  1149  // entering in a cgroup cpuset:
  1150  //   - when there is not cpuset cores: no eligible CPU (-1)
  1151  //   - when there is not isolated cores: no eligible CPU (-1)
  1152  //   - when cpuset cores are not in isolated cores: no eligible CPU (-1)
  1153  //   - when cpuset cores are all isolated cores: return the first CPU of the cpuset
  1154  //   - when cpuset cores are mixed between housekeeping/isolated cores: return the
  1155  //     first housekeeping CPU not in isolated CPUs.
  1156  func getEligibleCPU(cpusetList, isolatedList string) (int, error) {
  1157  	if isolatedList == "" || cpusetList == "" {
  1158  		return -1, nil
  1159  	}
  1160  
  1161  	// The target container has a cgroup cpuset, get the bit range.
  1162  	cpusetBits, err := systemd.RangeToBits(cpusetList)
  1163  	if err != nil {
  1164  		return -1, fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err)
  1165  	}
  1166  
  1167  	isolatedBits, err := systemd.RangeToBits(isolatedList)
  1168  	if err != nil {
  1169  		return -1, fmt.Errorf("parsing isolated cpus list %s: %w", isolatedList, err)
  1170  	}
  1171  
  1172  	eligibleCore := -1
  1173  	isolatedCores := 0
  1174  
  1175  	// Start from cpu core #0.
  1176  	currentCore := 0
  1177  	// Handle mixed sets.
  1178  	mixed := false
  1179  
  1180  	// CPU core start from the first slice element and bits are read
  1181  	// from the least to the most significant bit.
  1182  	for byteRange := 0; byteRange < len(cpusetBits); byteRange++ {
  1183  		if byteRange >= len(isolatedBits) {
  1184  			// No more isolated cores.
  1185  			break
  1186  		}
  1187  		for bit := 0; bit < 8; bit++ {
  1188  			if cpusetBits[byteRange]&(1<<bit) != 0 {
  1189  				// Mark the first core of the cgroup cpuset as eligible.
  1190  				if eligibleCore < 0 {
  1191  					eligibleCore = currentCore
  1192  				}
  1193  
  1194  				// Isolated cores count.
  1195  				if isolatedBits[byteRange]&(1<<bit) != 0 {
  1196  					isolatedCores++
  1197  				} else if !mixed {
  1198  					// Not an isolated core, mark the current core as eligible once.
  1199  					mixed = true
  1200  					eligibleCore = currentCore
  1201  				}
  1202  				if mixed && isolatedCores > 0 {
  1203  					return eligibleCore, nil
  1204  				}
  1205  			}
  1206  			currentCore++
  1207  		}
  1208  	}
  1209  
  1210  	// We have an eligible CPU if there is at least one isolated CPU in the cpuset.
  1211  	if isolatedCores == 0 {
  1212  		return -1, nil
  1213  	}
  1214  
  1215  	return eligibleCore, nil
  1216  }
  1217  
  1218  // startCommandWithCPUAffinity starts a command on a specific CPU if set.
  1219  func startCommandWithCPUAffinity(cmd *exec.Cmd, cpuAffinity int) error {
  1220  	errCh := make(chan error)
  1221  	defer close(errCh)
  1222  
  1223  	// Use a goroutine to dedicate an OS thread.
  1224  	go func() {
  1225  		cpuSet := new(unix.CPUSet)
  1226  		cpuSet.Zero()
  1227  		cpuSet.Set(cpuAffinity)
  1228  
  1229  		// Don't call runtime.UnlockOSThread to terminate the OS thread
  1230  		// when goroutine exits.
  1231  		runtime.LockOSThread()
  1232  
  1233  		// Command inherits the CPU affinity.
  1234  		if err := unix.SchedSetaffinity(unix.Gettid(), cpuSet); err != nil {
  1235  			errCh <- fmt.Errorf("setting os thread CPU affinity: %w", err)
  1236  			return
  1237  		}
  1238  
  1239  		errCh <- cmd.Start()
  1240  	}()
  1241  
  1242  	return <-errCh
  1243  }
  1244  
  1245  // fixProcessCPUAffinity sets the CPU affinity of a container process
  1246  // to all CPUs allowed by container cgroup cpuset.
  1247  func fixProcessCPUAffinity(pid int, manager cgroups.Manager) error {
  1248  	cpusetList := manager.GetEffectiveCPUs()
  1249  	if cpusetList == "" {
  1250  		// If the cgroup cpuset is not present, the container will inherit
  1251  		// this process CPU affinity, so it can return without further actions.
  1252  		return nil
  1253  	}
  1254  
  1255  	cpusetBits, err := systemd.RangeToBits(cpusetList)
  1256  	if err != nil {
  1257  		return fmt.Errorf("parsing cpuset cpus list %s: %w", cpusetList, err)
  1258  	}
  1259  
  1260  	processCPUSet := new(unix.CPUSet)
  1261  
  1262  	for byteRange := 0; byteRange < len(cpusetBits); byteRange++ {
  1263  		for bit := 0; bit < 8; bit++ {
  1264  			processCPUSet.Set(byteRange*8 + bit)
  1265  		}
  1266  	}
  1267  
  1268  	if err := unix.SchedSetaffinity(pid, processCPUSet); err != nil {
  1269  		return fmt.Errorf("setting process PID %d CPU affinity: %w", pid, err)
  1270  	}
  1271  
  1272  	return nil
  1273  }