github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/utils/utils_unix.go (about) 1 //go:build !windows 2 // +build !windows 3 4 package utils 5 6 import ( 7 "fmt" 8 "math" 9 "os" 10 "path/filepath" 11 "runtime" 12 "strconv" 13 "sync" 14 _ "unsafe" // for go:linkname 15 16 securejoin "github.com/cyphar/filepath-securejoin" 17 "github.com/sirupsen/logrus" 18 "golang.org/x/sys/unix" 19 ) 20 21 // EnsureProcHandle returns whether or not the given file handle is on procfs. 22 func EnsureProcHandle(fh *os.File) error { 23 var buf unix.Statfs_t 24 if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil { 25 return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err) 26 } 27 if buf.Type != unix.PROC_SUPER_MAGIC { 28 return fmt.Errorf("%s is not on procfs", fh.Name()) 29 } 30 return nil 31 } 32 33 var ( 34 haveCloseRangeCloexecBool bool 35 haveCloseRangeCloexecOnce sync.Once 36 ) 37 38 func haveCloseRangeCloexec() bool { 39 haveCloseRangeCloexecOnce.Do(func() { 40 // Make sure we're not closing a random file descriptor. 41 tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0) 42 if err != nil { 43 return 44 } 45 defer unix.Close(tmpFd) 46 47 err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC) 48 // Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC). 49 // -ENOSYS and -EINVAL ultimately mean we don't have support, but any 50 // other potential error would imply that even the most basic close 51 // operation wouldn't work. 52 haveCloseRangeCloexecBool = err == nil 53 }) 54 return haveCloseRangeCloexecBool 55 } 56 57 type fdFunc func(fd int) 58 59 // fdRangeFrom calls the passed fdFunc for each file descriptor that is open in 60 // the current process. 61 func fdRangeFrom(minFd int, fn fdFunc) error { 62 procSelfFd, closer := ProcThreadSelf("fd") 63 defer closer() 64 65 fdDir, err := os.Open(procSelfFd) 66 if err != nil { 67 return err 68 } 69 defer fdDir.Close() 70 71 if err := EnsureProcHandle(fdDir); err != nil { 72 return err 73 } 74 75 fdList, err := fdDir.Readdirnames(-1) 76 if err != nil { 77 return err 78 } 79 for _, fdStr := range fdList { 80 fd, err := strconv.Atoi(fdStr) 81 // Ignore non-numeric file names. 82 if err != nil { 83 continue 84 } 85 // Ignore descriptors lower than our specified minimum. 86 if fd < minFd { 87 continue 88 } 89 // Ignore the file descriptor we used for readdir, as it will be closed 90 // when we return. 91 if uintptr(fd) == fdDir.Fd() { 92 continue 93 } 94 // Run the closure. 95 fn(fd) 96 } 97 return nil 98 } 99 100 // CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or 101 // equal to minFd in the current process. 102 func CloseExecFrom(minFd int) error { 103 // Use close_range(CLOSE_RANGE_CLOEXEC) if possible. 104 if haveCloseRangeCloexec() { 105 err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC) 106 return os.NewSyscallError("close_range", err) 107 } 108 // Otherwise, fall back to the standard loop. 109 return fdRangeFrom(minFd, unix.CloseOnExec) 110 } 111 112 //go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor 113 114 // In order to make sure we do not close the internal epoll descriptors the Go 115 // runtime uses, we need to ensure that we skip descriptors that match 116 // "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing, 117 // unfortunately there's no other way to be sure we're only keeping the file 118 // descriptors the Go runtime needs. Hopefully nothing blows up doing this... 119 func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive 120 121 // UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the 122 // current process, except for those critical to Go's runtime (such as the 123 // netpoll management descriptors). 124 // 125 // NOTE: That this function is incredibly dangerous to use in most Go code, as 126 // closing file descriptors from underneath *os.File handles can lead to very 127 // bad behaviour (the closed file descriptor can be re-used and then any 128 // *os.File operations would apply to the wrong file). This function is only 129 // intended to be called from the last stage of runc init. 130 func UnsafeCloseFrom(minFd int) error { 131 // We cannot use close_range(2) even if it is available, because we must 132 // not close some file descriptors. 133 return fdRangeFrom(minFd, func(fd int) { 134 if runtime_IsPollDescriptor(uintptr(fd)) { 135 // These are the Go runtimes internal netpoll file descriptors. 136 // These file descriptors are operated on deep in the Go scheduler, 137 // and closing those files from underneath Go can result in panics. 138 // There is no issue with keeping them because they are not 139 // executable and are not useful to an attacker anyway. Also we 140 // don't have any choice. 141 return 142 } 143 // There's nothing we can do about errors from close(2), and the 144 // only likely error to be seen is EBADF which indicates the fd was 145 // already closed (in which case, we got what we wanted). 146 _ = unix.Close(fd) 147 }) 148 } 149 150 // NewSockPair returns a new SOCK_STREAM unix socket pair. 151 func NewSockPair(name string) (parent, child *os.File, err error) { 152 fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) 153 if err != nil { 154 return nil, nil, err 155 } 156 return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil 157 } 158 159 // WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) 160 // corresponding to the unsafePath resolved within the root. Before passing the 161 // fd, this path is verified to have been inside the root -- so operating on it 162 // through the passed fdpath should be safe. Do not access this path through 163 // the original path strings, and do not attempt to use the pathname outside of 164 // the passed closure (the file handle will be freed once the closure returns). 165 func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { 166 // Remove the root then forcefully resolve inside the root. 167 unsafePath = stripRoot(root, unsafePath) 168 path, err := securejoin.SecureJoin(root, unsafePath) 169 if err != nil { 170 return fmt.Errorf("resolving path inside rootfs failed: %w", err) 171 } 172 173 procSelfFd, closer := ProcThreadSelf("fd/") 174 defer closer() 175 176 // Open the target path. 177 fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) 178 if err != nil { 179 return fmt.Errorf("open o_path procfd: %w", err) 180 } 181 defer fh.Close() 182 183 procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd()))) 184 // Double-check the path is the one we expected. 185 if realpath, err := os.Readlink(procfd); err != nil { 186 return fmt.Errorf("procfd verification failed: %w", err) 187 } else if realpath != path { 188 return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) 189 } 190 191 return fn(procfd) 192 } 193 194 type ProcThreadSelfCloser func() 195 196 var ( 197 haveProcThreadSelf bool 198 haveProcThreadSelfOnce sync.Once 199 ) 200 201 // ProcThreadSelf returns a string that is equivalent to 202 // /proc/thread-self/<subpath>, with a graceful fallback on older kernels where 203 // /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin, 204 // meaning that the passed string needs to be trusted. The caller _must_ call 205 // the returned procThreadSelfCloser function (which is runtime.UnlockOSThread) 206 // *only once* after it has finished using the returned path string. 207 func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) { 208 haveProcThreadSelfOnce.Do(func() { 209 if _, err := os.Stat("/proc/thread-self/"); err == nil { 210 haveProcThreadSelf = true 211 } else { 212 logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err) 213 } 214 }) 215 216 // We need to lock our thread until the caller is done with the path string 217 // because any non-atomic operation on the path (such as opening a file, 218 // then reading it) could be interrupted by the Go runtime where the 219 // underlying thread is swapped out and the original thread is killed, 220 // resulting in pull-your-hair-out-hard-to-debug issues in the caller. In 221 // addition, the pre-3.17 fallback makes everything non-atomic because the 222 // same thing could happen between unix.Gettid() and the path operations. 223 // 224 // In theory, we don't need to lock in the atomic user case when using 225 // /proc/thread-self/, but it's better to be safe than sorry (and there are 226 // only one or two truly atomic users of /proc/thread-self/). 227 runtime.LockOSThread() 228 229 threadSelf := "/proc/thread-self/" 230 if !haveProcThreadSelf { 231 // Pre-3.17 kernels did not have /proc/thread-self, so do it manually. 232 threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/" 233 if _, err := os.Stat(threadSelf); err != nil { 234 // Unfortunately, this code is called from rootfs_linux.go where we 235 // are running inside the pid namespace of the container but /proc 236 // is the host's procfs. Unfortunately there is no real way to get 237 // the correct tid to use here (the kernel age means we cannot do 238 // things like set up a private fsopen("proc") -- even scanning 239 // NSpid in all of the tasks in /proc/self/task/*/status requires 240 // Linux 4.1). 241 // 242 // So, we just have to assume that /proc/self is acceptable in this 243 // one specific case. 244 if os.Getpid() == 1 { 245 logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err) 246 } else { 247 // This should never happen, but the fallback should work in most cases... 248 logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err) 249 } 250 threadSelf = "/proc/self/" 251 } 252 } 253 return threadSelf + subpath, runtime.UnlockOSThread 254 } 255 256 // ProcThreadSelfFd is small wrapper around ProcThreadSelf to make it easier to 257 // create a /proc/thread-self handle for given file descriptor. 258 // 259 // It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but 260 // without using fmt.Sprintf to avoid unneeded overhead. 261 func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) { 262 return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10)) 263 }