github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/utils/utils_unix.go (about)

     1  //go:build !windows
     2  // +build !windows
     3  
     4  package utils
     5  
     6  import (
     7  	"fmt"
     8  	"math"
     9  	"os"
    10  	"path/filepath"
    11  	"runtime"
    12  	"strconv"
    13  	"sync"
    14  	_ "unsafe" // for go:linkname
    15  
    16  	securejoin "github.com/cyphar/filepath-securejoin"
    17  	"github.com/sirupsen/logrus"
    18  	"golang.org/x/sys/unix"
    19  )
    20  
    21  // EnsureProcHandle returns whether or not the given file handle is on procfs.
    22  func EnsureProcHandle(fh *os.File) error {
    23  	var buf unix.Statfs_t
    24  	if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil {
    25  		return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err)
    26  	}
    27  	if buf.Type != unix.PROC_SUPER_MAGIC {
    28  		return fmt.Errorf("%s is not on procfs", fh.Name())
    29  	}
    30  	return nil
    31  }
    32  
    33  var (
    34  	haveCloseRangeCloexecBool bool
    35  	haveCloseRangeCloexecOnce sync.Once
    36  )
    37  
    38  func haveCloseRangeCloexec() bool {
    39  	haveCloseRangeCloexecOnce.Do(func() {
    40  		// Make sure we're not closing a random file descriptor.
    41  		tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
    42  		if err != nil {
    43  			return
    44  		}
    45  		defer unix.Close(tmpFd)
    46  
    47  		err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
    48  		// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
    49  		// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
    50  		// other potential error would imply that even the most basic close
    51  		// operation wouldn't work.
    52  		haveCloseRangeCloexecBool = err == nil
    53  	})
    54  	return haveCloseRangeCloexecBool
    55  }
    56  
    57  type fdFunc func(fd int)
    58  
    59  // fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
    60  // the current process.
    61  func fdRangeFrom(minFd int, fn fdFunc) error {
    62  	procSelfFd, closer := ProcThreadSelf("fd")
    63  	defer closer()
    64  
    65  	fdDir, err := os.Open(procSelfFd)
    66  	if err != nil {
    67  		return err
    68  	}
    69  	defer fdDir.Close()
    70  
    71  	if err := EnsureProcHandle(fdDir); err != nil {
    72  		return err
    73  	}
    74  
    75  	fdList, err := fdDir.Readdirnames(-1)
    76  	if err != nil {
    77  		return err
    78  	}
    79  	for _, fdStr := range fdList {
    80  		fd, err := strconv.Atoi(fdStr)
    81  		// Ignore non-numeric file names.
    82  		if err != nil {
    83  			continue
    84  		}
    85  		// Ignore descriptors lower than our specified minimum.
    86  		if fd < minFd {
    87  			continue
    88  		}
    89  		// Ignore the file descriptor we used for readdir, as it will be closed
    90  		// when we return.
    91  		if uintptr(fd) == fdDir.Fd() {
    92  			continue
    93  		}
    94  		// Run the closure.
    95  		fn(fd)
    96  	}
    97  	return nil
    98  }
    99  
   100  // CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
   101  // equal to minFd in the current process.
   102  func CloseExecFrom(minFd int) error {
   103  	// Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
   104  	if haveCloseRangeCloexec() {
   105  		err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
   106  		return os.NewSyscallError("close_range", err)
   107  	}
   108  	// Otherwise, fall back to the standard loop.
   109  	return fdRangeFrom(minFd, unix.CloseOnExec)
   110  }
   111  
   112  //go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
   113  
   114  // In order to make sure we do not close the internal epoll descriptors the Go
   115  // runtime uses, we need to ensure that we skip descriptors that match
   116  // "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
   117  // unfortunately there's no other way to be sure we're only keeping the file
   118  // descriptors the Go runtime needs. Hopefully nothing blows up doing this...
   119  func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
   120  
   121  // UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the
   122  // current process, except for those critical to Go's runtime (such as the
   123  // netpoll management descriptors).
   124  //
   125  // NOTE: That this function is incredibly dangerous to use in most Go code, as
   126  // closing file descriptors from underneath *os.File handles can lead to very
   127  // bad behaviour (the closed file descriptor can be re-used and then any
   128  // *os.File operations would apply to the wrong file). This function is only
   129  // intended to be called from the last stage of runc init.
   130  func UnsafeCloseFrom(minFd int) error {
   131  	// We cannot use close_range(2) even if it is available, because we must
   132  	// not close some file descriptors.
   133  	return fdRangeFrom(minFd, func(fd int) {
   134  		if runtime_IsPollDescriptor(uintptr(fd)) {
   135  			// These are the Go runtimes internal netpoll file descriptors.
   136  			// These file descriptors are operated on deep in the Go scheduler,
   137  			// and closing those files from underneath Go can result in panics.
   138  			// There is no issue with keeping them because they are not
   139  			// executable and are not useful to an attacker anyway. Also we
   140  			// don't have any choice.
   141  			return
   142  		}
   143  		// There's nothing we can do about errors from close(2), and the
   144  		// only likely error to be seen is EBADF which indicates the fd was
   145  		// already closed (in which case, we got what we wanted).
   146  		_ = unix.Close(fd)
   147  	})
   148  }
   149  
   150  // NewSockPair returns a new SOCK_STREAM unix socket pair.
   151  func NewSockPair(name string) (parent, child *os.File, err error) {
   152  	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
   153  	if err != nil {
   154  		return nil, nil, err
   155  	}
   156  	return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
   157  }
   158  
   159  // WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
   160  // corresponding to the unsafePath resolved within the root. Before passing the
   161  // fd, this path is verified to have been inside the root -- so operating on it
   162  // through the passed fdpath should be safe. Do not access this path through
   163  // the original path strings, and do not attempt to use the pathname outside of
   164  // the passed closure (the file handle will be freed once the closure returns).
   165  func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
   166  	// Remove the root then forcefully resolve inside the root.
   167  	unsafePath = stripRoot(root, unsafePath)
   168  	path, err := securejoin.SecureJoin(root, unsafePath)
   169  	if err != nil {
   170  		return fmt.Errorf("resolving path inside rootfs failed: %w", err)
   171  	}
   172  
   173  	procSelfFd, closer := ProcThreadSelf("fd/")
   174  	defer closer()
   175  
   176  	// Open the target path.
   177  	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
   178  	if err != nil {
   179  		return fmt.Errorf("open o_path procfd: %w", err)
   180  	}
   181  	defer fh.Close()
   182  
   183  	procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
   184  	// Double-check the path is the one we expected.
   185  	if realpath, err := os.Readlink(procfd); err != nil {
   186  		return fmt.Errorf("procfd verification failed: %w", err)
   187  	} else if realpath != path {
   188  		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
   189  	}
   190  
   191  	return fn(procfd)
   192  }
   193  
   194  type ProcThreadSelfCloser func()
   195  
   196  var (
   197  	haveProcThreadSelf     bool
   198  	haveProcThreadSelfOnce sync.Once
   199  )
   200  
   201  // ProcThreadSelf returns a string that is equivalent to
   202  // /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
   203  // /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
   204  // meaning that the passed string needs to be trusted. The caller _must_ call
   205  // the returned procThreadSelfCloser function (which is runtime.UnlockOSThread)
   206  // *only once* after it has finished using the returned path string.
   207  func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
   208  	haveProcThreadSelfOnce.Do(func() {
   209  		if _, err := os.Stat("/proc/thread-self/"); err == nil {
   210  			haveProcThreadSelf = true
   211  		} else {
   212  			logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
   213  		}
   214  	})
   215  
   216  	// We need to lock our thread until the caller is done with the path string
   217  	// because any non-atomic operation on the path (such as opening a file,
   218  	// then reading it) could be interrupted by the Go runtime where the
   219  	// underlying thread is swapped out and the original thread is killed,
   220  	// resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
   221  	// addition, the pre-3.17 fallback makes everything non-atomic because the
   222  	// same thing could happen between unix.Gettid() and the path operations.
   223  	//
   224  	// In theory, we don't need to lock in the atomic user case when using
   225  	// /proc/thread-self/, but it's better to be safe than sorry (and there are
   226  	// only one or two truly atomic users of /proc/thread-self/).
   227  	runtime.LockOSThread()
   228  
   229  	threadSelf := "/proc/thread-self/"
   230  	if !haveProcThreadSelf {
   231  		// Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
   232  		threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
   233  		if _, err := os.Stat(threadSelf); err != nil {
   234  			// Unfortunately, this code is called from rootfs_linux.go where we
   235  			// are running inside the pid namespace of the container but /proc
   236  			// is the host's procfs. Unfortunately there is no real way to get
   237  			// the correct tid to use here (the kernel age means we cannot do
   238  			// things like set up a private fsopen("proc") -- even scanning
   239  			// NSpid in all of the tasks in /proc/self/task/*/status requires
   240  			// Linux 4.1).
   241  			//
   242  			// So, we just have to assume that /proc/self is acceptable in this
   243  			// one specific case.
   244  			if os.Getpid() == 1 {
   245  				logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
   246  			} else {
   247  				// This should never happen, but the fallback should work in most cases...
   248  				logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
   249  			}
   250  			threadSelf = "/proc/self/"
   251  		}
   252  	}
   253  	return threadSelf + subpath, runtime.UnlockOSThread
   254  }
   255  
   256  // ProcThreadSelfFd is small wrapper around ProcThreadSelf to make it easier to
   257  // create a /proc/thread-self handle for given file descriptor.
   258  //
   259  // It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
   260  // without using fmt.Sprintf to avoid unneeded overhead.
   261  func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
   262  	return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
   263  }