github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/system/linux.go (about)

     1  //go:build linux
     2  // +build linux
     3  
     4  package system
     5  
     6  import (
     7  	"fmt"
     8  	"io"
     9  	"os"
    10  	"strconv"
    11  	"sync/atomic"
    12  	"syscall"
    13  	"unsafe"
    14  
    15  	"github.com/sirupsen/logrus"
    16  	"golang.org/x/sys/unix"
    17  )
    18  
// syscallOrigRlimitNofile is linked (via go:linkname) to the unexported
// variable origRlimitNofile in the standard library's syscall package, so that
// this package can reset it.
//
//go:linkname syscallOrigRlimitNofile syscall.origRlimitNofile
var syscallOrigRlimitNofile atomic.Pointer[syscall.Rlimit]

// ClearRlimitNofileCache clears the Go runtime's cached RLIMIT_NOFILE value by
// storing nil into the linked syscall.origRlimitNofile pointer.
func ClearRlimitNofileCache() {
	// As reported in issue #4195, newer versions of the Go runtime (since
	// 1.19) cache rlimit-nofile. Before executing execve, the rlimit-nofile
	// of the process is restored from that cache. In runc, this causes the
	// rlimit-nofile value set by the parent process for the container to be
	// overridden. Clearing the cache avoids that. Unfortunately, the Go
	// stdlib doesn't provide a function for this, so we have to link to the
	// private variable `origRlimitNofile` in package syscall as a hack.
	syscallOrigRlimitNofile.Store(nil)
}
    33  
// ParentDeathSignal is a parent-death signal number as used with the
// PR_SET_PDEATHSIG / PR_GET_PDEATHSIG prctl operations in this file.
type ParentDeathSignal int
    35  
    36  func (p ParentDeathSignal) Restore() error {
    37  	if p == 0 {
    38  		return nil
    39  	}
    40  	current, err := GetParentDeathSignal()
    41  	if err != nil {
    42  		return err
    43  	}
    44  	if p == current {
    45  		return nil
    46  	}
    47  	return p.Set()
    48  }
    49  
    50  func (p ParentDeathSignal) Set() error {
    51  	return SetParentDeathSignal(uintptr(p))
    52  }
    53  
    54  func Exec(cmd string, args []string, env []string) error {
    55  	for {
    56  		err := unix.Exec(cmd, args, env)
    57  		if err != unix.EINTR {
    58  			return &os.PathError{Op: "exec", Path: cmd, Err: err}
    59  		}
    60  	}
    61  }
    62  
    63  func execveat(fd uintptr, pathname string, args []string, env []string, flags int) error {
    64  	pathnamep, err := syscall.BytePtrFromString(pathname)
    65  	if err != nil {
    66  		return err
    67  	}
    68  
    69  	argvp, err := syscall.SlicePtrFromStrings(args)
    70  	if err != nil {
    71  		return err
    72  	}
    73  
    74  	envp, err := syscall.SlicePtrFromStrings(env)
    75  	if err != nil {
    76  		return err
    77  	}
    78  
    79  	_, _, errno := syscall.Syscall6(
    80  		unix.SYS_EXECVEAT,
    81  		fd,
    82  		uintptr(unsafe.Pointer(pathnamep)),
    83  		uintptr(unsafe.Pointer(&argvp[0])),
    84  		uintptr(unsafe.Pointer(&envp[0])),
    85  		uintptr(flags),
    86  		0,
    87  	)
    88  	return errno
    89  }
    90  
    91  func Fexecve(fd uintptr, args []string, env []string) error {
    92  	var err error
    93  	for {
    94  		err = execveat(fd, "", args, env, unix.AT_EMPTY_PATH)
    95  		if err != unix.EINTR { // nolint:errorlint // unix errors are bare
    96  			break
    97  		}
    98  	}
    99  	if err == unix.ENOSYS { // nolint:errorlint // unix errors are bare
   100  		// Fallback to classic /proc/self/fd/... exec.
   101  		return Exec("/proc/self/fd/"+strconv.Itoa(int(fd)), args, env)
   102  	}
   103  	return os.NewSyscallError("execveat", err)
   104  }
   105  
   106  func SetParentDeathSignal(sig uintptr) error {
   107  	if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
   108  		return err
   109  	}
   110  	return nil
   111  }
   112  
   113  func GetParentDeathSignal() (ParentDeathSignal, error) {
   114  	var sig int
   115  	if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil {
   116  		return -1, err
   117  	}
   118  	return ParentDeathSignal(sig), nil
   119  }
   120  
   121  func SetKeepCaps() error {
   122  	if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil {
   123  		return err
   124  	}
   125  
   126  	return nil
   127  }
   128  
   129  func ClearKeepCaps() error {
   130  	if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil {
   131  		return err
   132  	}
   133  
   134  	return nil
   135  }
   136  
   137  func Setctty() error {
   138  	if err := unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil {
   139  		return err
   140  	}
   141  	return nil
   142  }
   143  
   144  // SetSubreaper sets the value i as the subreaper setting for the calling process
   145  func SetSubreaper(i int) error {
   146  	return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
   147  }
   148  
   149  // GetSubreaper returns the subreaper setting for the calling process
   150  func GetSubreaper() (int, error) {
   151  	var i uintptr
   152  
   153  	if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
   154  		return -1, err
   155  	}
   156  
   157  	return int(i), nil
   158  }
   159  
   160  func ExecutableMemfd(comment string, flags int) (*os.File, error) {
   161  	// Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this
   162  	// flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an
   163  	// executable memfd. For vm.memfd_noexec=2 this is a bit more complicated.
   164  	// The original vm.memfd_noexec=2 implementation incorrectly silently
   165  	// allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer
   166  	// kernels, we will get -EACCES if we try to use MFD_EXEC with
   167  	// vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value).
   168  	//
   169  	// The upshot is we only need to retry without MFD_EXEC on -EINVAL because
   170  	// it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on
   171  	// kernels where -EINVAL is actually a security denial.
   172  	memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC)
   173  	if err == unix.EINVAL {
   174  		memfd, err = unix.MemfdCreate(comment, flags)
   175  	}
   176  	if err != nil {
   177  		if err == unix.EACCES {
   178  			logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE")
   179  		}
   180  		err := os.NewSyscallError("memfd_create", err)
   181  		return nil, fmt.Errorf("failed to create executable memfd: %w", err)
   182  	}
   183  	return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil
   184  }
   185  
   186  // Copy is like io.Copy except it uses sendfile(2) if the source and sink are
   187  // both (*os.File) as an optimisation to make copies faster.
   188  func Copy(dst io.Writer, src io.Reader) (copied int64, err error) {
   189  	dstFile, _ := dst.(*os.File)
   190  	srcFile, _ := src.(*os.File)
   191  
   192  	if dstFile != nil && srcFile != nil {
   193  		fi, err := srcFile.Stat()
   194  		if err != nil {
   195  			goto fallback
   196  		}
   197  		size := fi.Size()
   198  		for size > 0 {
   199  			n, err := unix.Sendfile(int(dstFile.Fd()), int(srcFile.Fd()), nil, int(size))
   200  			if n > 0 {
   201  				size -= int64(n)
   202  				copied += int64(n)
   203  			}
   204  			if err == unix.EINTR {
   205  				continue
   206  			}
   207  			if err != nil {
   208  				if copied == 0 {
   209  					// If we haven't copied anything so far, we can safely just
   210  					// fallback to io.Copy. We could always do the fallback but
   211  					// it's safer to error out in the case of a partial copy
   212  					// followed by an error (which should never happen).
   213  					goto fallback
   214  				}
   215  				return copied, fmt.Errorf("partial sendfile copy: %w", err)
   216  			}
   217  		}
   218  		return copied, nil
   219  	}
   220  
   221  fallback:
   222  	return io.Copy(dst, src)
   223  }
   224  
   225  // SetLinuxPersonality sets the Linux execution personality. For more information see the personality syscall documentation.
   226  // checkout getLinuxPersonalityFromStr() from libcontainer/specconv/spec_linux.go for type conversion.
   227  func SetLinuxPersonality(personality int) error {
   228  	_, _, errno := unix.Syscall(unix.SYS_PERSONALITY, uintptr(personality), 0, 0)
   229  	if errno != 0 {
   230  		return &os.SyscallError{Syscall: "set_personality", Err: errno}
   231  	}
   232  	return nil
   233  }