github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/system/linux.go (about) 1 //go:build linux 2 // +build linux 3 4 package system 5 6 import ( 7 "fmt" 8 "io" 9 "os" 10 "strconv" 11 "sync/atomic" 12 "syscall" 13 "unsafe" 14 15 "github.com/sirupsen/logrus" 16 "golang.org/x/sys/unix" 17 ) 18 19 //go:linkname syscallOrigRlimitNofile syscall.origRlimitNofile 20 var syscallOrigRlimitNofile atomic.Pointer[syscall.Rlimit] 21 22 // ClearRlimitNofileCache is to clear go runtime's nofile rlimit cache. 23 func ClearRlimitNofileCache() { 24 // As reported in issue #4195, the new version of go runtime(since 1.19) 25 // will cache rlimit-nofile. Before executing execve, the rlimit-nofile 26 // of the process will be restored with the cache. In runc, this will 27 // cause the rlimit-nofile setting by the parent process for the container 28 // to become invalid. It can be solved by clearing this cache. But 29 // unfortunately, go stdlib doesn't provide such function, so we need to 30 // link to the private var `origRlimitNofile` in package syscall to hack. 31 syscallOrigRlimitNofile.Store(nil) 32 } 33 34 type ParentDeathSignal int 35 36 func (p ParentDeathSignal) Restore() error { 37 if p == 0 { 38 return nil 39 } 40 current, err := GetParentDeathSignal() 41 if err != nil { 42 return err 43 } 44 if p == current { 45 return nil 46 } 47 return p.Set() 48 } 49 50 func (p ParentDeathSignal) Set() error { 51 return SetParentDeathSignal(uintptr(p)) 52 } 53 54 func Exec(cmd string, args []string, env []string) error { 55 for { 56 err := unix.Exec(cmd, args, env) 57 if err != unix.EINTR { 58 return &os.PathError{Op: "exec", Path: cmd, Err: err} 59 } 60 } 61 } 62 63 func execveat(fd uintptr, pathname string, args []string, env []string, flags int) error { 64 pathnamep, err := syscall.BytePtrFromString(pathname) 65 if err != nil { 66 return err 67 } 68 69 argvp, err := syscall.SlicePtrFromStrings(args) 70 if err != nil { 71 return err 72 } 73 74 envp, err := syscall.SlicePtrFromStrings(env) 75 if err != nil { 76 return err 77 } 78 79 _, _, errno := syscall.Syscall6( 80 unix.SYS_EXECVEAT, 81 fd, 82 uintptr(unsafe.Pointer(pathnamep)), 83 uintptr(unsafe.Pointer(&argvp[0])), 84 uintptr(unsafe.Pointer(&envp[0])), 85 uintptr(flags), 86 0, 87 ) 88 return errno 89 } 90 91 func Fexecve(fd uintptr, args []string, env []string) error { 92 var err error 93 for { 94 err = execveat(fd, "", args, env, unix.AT_EMPTY_PATH) 95 if err != unix.EINTR { // nolint:errorlint // unix errors are bare 96 break 97 } 98 } 99 if err == unix.ENOSYS { // nolint:errorlint // unix errors are bare 100 // Fallback to classic /proc/self/fd/... exec. 101 return Exec("/proc/self/fd/"+strconv.Itoa(int(fd)), args, env) 102 } 103 return os.NewSyscallError("execveat", err) 104 } 105 106 func SetParentDeathSignal(sig uintptr) error { 107 if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil { 108 return err 109 } 110 return nil 111 } 112 113 func GetParentDeathSignal() (ParentDeathSignal, error) { 114 var sig int 115 if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil { 116 return -1, err 117 } 118 return ParentDeathSignal(sig), nil 119 } 120 121 func SetKeepCaps() error { 122 if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil { 123 return err 124 } 125 126 return nil 127 } 128 129 func ClearKeepCaps() error { 130 if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil { 131 return err 132 } 133 134 return nil 135 } 136 137 func Setctty() error { 138 if err := unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil { 139 return err 140 } 141 return nil 142 } 143 144 // SetSubreaper sets the value i as the subreaper setting for the calling process 145 func SetSubreaper(i int) error { 146 return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) 147 } 148 149 // GetSubreaper returns the subreaper setting for the calling process 150 func GetSubreaper() (int, error) { 151 var i uintptr 152 153 if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil { 154 return -1, err 155 } 156 157 return int(i), nil 158 } 159 160 func ExecutableMemfd(comment string, flags int) (*os.File, error) { 161 // Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this 162 // flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an 163 // executable memfd. For vm.memfd_noexec=2 this is a bit more complicated. 164 // The original vm.memfd_noexec=2 implementation incorrectly silently 165 // allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer 166 // kernels, we will get -EACCES if we try to use MFD_EXEC with 167 // vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value). 168 // 169 // The upshot is we only need to retry without MFD_EXEC on -EINVAL because 170 // it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on 171 // kernels where -EINVAL is actually a security denial. 172 memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC) 173 if err == unix.EINVAL { 174 memfd, err = unix.MemfdCreate(comment, flags) 175 } 176 if err != nil { 177 if err == unix.EACCES { 178 logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE") 179 } 180 err := os.NewSyscallError("memfd_create", err) 181 return nil, fmt.Errorf("failed to create executable memfd: %w", err) 182 } 183 return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil 184 } 185 186 // Copy is like io.Copy except it uses sendfile(2) if the source and sink are 187 // both (*os.File) as an optimisation to make copies faster. 188 func Copy(dst io.Writer, src io.Reader) (copied int64, err error) { 189 dstFile, _ := dst.(*os.File) 190 srcFile, _ := src.(*os.File) 191 192 if dstFile != nil && srcFile != nil { 193 fi, err := srcFile.Stat() 194 if err != nil { 195 goto fallback 196 } 197 size := fi.Size() 198 for size > 0 { 199 n, err := unix.Sendfile(int(dstFile.Fd()), int(srcFile.Fd()), nil, int(size)) 200 if n > 0 { 201 size -= int64(n) 202 copied += int64(n) 203 } 204 if err == unix.EINTR { 205 continue 206 } 207 if err != nil { 208 if copied == 0 { 209 // If we haven't copied anything so far, we can safely just 210 // fallback to io.Copy. We could always do the fallback but 211 // it's safer to error out in the case of a partial copy 212 // followed by an error (which should never happen). 213 goto fallback 214 } 215 return copied, fmt.Errorf("partial sendfile copy: %w", err) 216 } 217 } 218 return copied, nil 219 } 220 221 fallback: 222 return io.Copy(dst, src) 223 } 224 225 // SetLinuxPersonality sets the Linux execution personality. For more information see the personality syscall documentation. 226 // checkout getLinuxPersonalityFromStr() from libcontainer/specconv/spec_linux.go for type conversion. 227 func SetLinuxPersonality(personality int) error { 228 _, _, errno := unix.Syscall(unix.SYS_PERSONALITY, uintptr(personality), 0, 0) 229 if errno != 0 { 230 return &os.SyscallError{Syscall: "set_personality", Err: errno} 231 } 232 return nil 233 }