github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/syscalls/linux/vfs2/poll.go

// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs2

import (
	"fmt"
	"time"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/sentry/arch"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
	"github.com/SagerNet/gvisor/pkg/sentry/limits"
	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
	"github.com/SagerNet/gvisor/pkg/syserror"
	"github.com/SagerNet/gvisor/pkg/waiter"

	"github.com/SagerNet/gvisor/pkg/hostarch"
)

// fileCap is the maximum allowable files for poll & select. This has no
// equivalent in Linux; it exists in gVisor since allocation failure in Go is
// unrecoverable.
const fileCap = 1024 * 1024

// Masks for "readable", "writable", and "exceptional" events as defined by
// select(2).
const (
	// selectReadEvents is analogous to the Linux kernel's
	// fs/select.c:POLLIN_SET.
	selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR

	// selectWriteEvents is analogous to the Linux kernel's
	// fs/select.c:POLLOUT_SET.
	selectWriteEvents = linux.POLLOUT | linux.POLLERR

	// selectExceptEvents is analogous to the Linux kernel's
	// fs/select.c:POLLEX_SET.
	selectExceptEvents = linux.POLLPRI
)
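
// Note: readability under select(2) is broader than plain POLLIN. Because
// POLLHUP and POLLERR are part of the read set (mirroring Linux's POLLIN_SET),
// a socket whose peer has hung up is reported as readable even though a
// subsequent read returns 0 or an error, which is the cue select callers use
// to detect EOF.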

// pollState tracks the associated file description and waiter of a PollFD.
type pollState struct {
	file   *vfs.FileDescription
	waiter waiter.Entry
}

// initReadiness gets the current ready mask for the file represented by the FD
// stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is
// used to register with the file for event notifications, and a reference to
// the file is stored in "state".
func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) {
	if pfd.FD < 0 {
		pfd.REvents = 0
		return
	}

	file := t.GetFileVFS2(pfd.FD)
	if file == nil {
		pfd.REvents = linux.POLLNVAL
		return
	}

	if ch == nil {
		defer file.DecRef(t)
	} else {
		state.file = file
		state.waiter, _ = waiter.NewChannelEntry(ch)
		file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events)))
	}

	r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events)))
	pfd.REvents = int16(r.ToLinux()) & pfd.Events
}

// releaseState releases all the pollState in "state".
func releaseState(t *kernel.Task, state []pollState) {
	for i := range state {
		if state[i].file != nil {
			state[i].file.EventUnregister(&state[i].waiter)
			state[i].file.DecRef(t)
		}
	}
}

// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout"
// when "timeout" is greater than zero.
//
// pollBlock returns the remaining timeout (always 0 if the timeout expired, and
// 0 or positive if the wait was interrupted by a signal), the number of ready
// PollFDs, and an error.
func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) {
	var ch chan struct{}
	if timeout != 0 {
		ch = make(chan struct{}, 1)
	}

	// Register for event notification in the files involved if we may
	// block (timeout not zero). Once we find a file that has a non-zero
	// result, we stop registering for events but still go through all files
	// to get their ready masks.
	state := make([]pollState, len(pfd))
	defer releaseState(t, state)
	n := uintptr(0)
	for i := range pfd {
		initReadiness(t, &pfd[i], &state[i], ch)
		if pfd[i].REvents != 0 {
			n++
			ch = nil
		}
	}

	if timeout == 0 {
		return timeout, n, nil
	}

	haveTimeout := timeout >= 0

	for n == 0 {
		var err error
		// Wait for a notification.
		timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout)
		if err != nil {
			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
				err = nil
			}
			return timeout, 0, err
		}

		// We got notified, count how many files are ready. If none,
		// then this was a spurious notification, and we just go back
		// to sleep with the remaining timeout.
		for i := range state {
			if state[i].file == nil {
				continue
			}

			r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events)))
			rl := int16(r.ToLinux()) & pfd[i].Events
			if rl != 0 {
				pfd[i].REvents = rl
				n++
			}
		}
	}

	return timeout, n, nil
}
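
// Illustrative sketch (fd here is a hypothetical descriptor, not part of the
// original file): a caller waiting for a single FD to become readable with a
// one-second budget could use pollBlock like
//
//	pfds := []linux.PollFD{{FD: fd, Events: linux.POLLIN}}
//	remaining, n, err := pollBlock(t, pfds, time.Second)
//
// On expiry, remaining is 0 and err is nil (ETIMEDOUT is swallowed above); on
// interruption, err is non-nil and remaining holds the unused budget, which
// doPoll below converts to EINTR and poll uses to restart the syscall.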

// copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max.
func copyInPollFDs(t *kernel.Task, addr hostarch.Addr, nfds uint) ([]linux.PollFD, error) {
	if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) {
		return nil, linuxerr.EINVAL
	}

	pfd := make([]linux.PollFD, nfds)
	if nfds > 0 {
		if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil {
			return nil, err
		}
	}

	return pfd, nil
}

func doPoll(t *kernel.Task, addr hostarch.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) {
	pfd, err := copyInPollFDs(t, addr, nfds)
	if err != nil {
		return timeout, 0, err
	}

	// Compatibility warning: Linux adds POLLHUP and POLLERR just before
	// polling, in fs/select.c:do_pollfd(). Since pfd is copied out after
	// polling, changing event masks here is an application-visible difference.
	// (Linux also doesn't copy out event masks at all, only revents.)
	for i := range pfd {
		pfd[i].Events |= linux.POLLHUP | linux.POLLERR
	}
	remainingTimeout, n, err := pollBlock(t, pfd, timeout)
	err = syserror.ConvertIntr(err, syserror.EINTR)

	// The poll entries are copied out regardless of whether
	// any are set or not. This aligns with the Linux behavior.
	if nfds > 0 && err == nil {
		if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil {
			return remainingTimeout, 0, err
		}
	}

	return remainingTimeout, n, err
}

// CopyInFDSet copies an fd set from select(2)/pselect(2).
func CopyInFDSet(t *kernel.Task, addr hostarch.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) {
	set := make([]byte, nBytes)

	if addr != 0 {
		if _, err := t.CopyInBytes(addr, set); err != nil {
			return nil, err
		}
		// If we only use part of the last byte, mask out the extraneous bits.
		//
		// N.B. This only works on little-endian architectures.
		if nBitsInLastPartialByte != 0 {
			set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte
		}
	}
	return set, nil
}
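
// Worked example: with nfds = 10, doSelect computes nBytes = 2 and
// nBitsInLastPartialByte = 2, so the mask above becomes
// set[1] &^= 0xff << 2, clearing bits 2 through 7 of the final byte and
// keeping only the two bits that correspond to fds 8 and 9.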

func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Addr, timeout time.Duration) (uintptr, error) {
	if nfds < 0 || nfds > fileCap {
		return 0, linuxerr.EINVAL
	}

	// Calculate the size of the fd sets (one bit per fd).
	nBytes := (nfds + 7) / 8
	nBitsInLastPartialByte := nfds % 8

	// Capture all the provided input vectors.
	r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
	if err != nil {
		return 0, err
	}
	w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte)
	if err != nil {
		return 0, err
	}
	e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte)
	if err != nil {
		return 0, err
	}

	// Count how many FDs are actually being requested so that we can build
	// a PollFD array.
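	// The inner loop below uses the v &= (v - 1) trick, which clears the
	// lowest set bit on each pass; e.g. v = 0b0101_0010 takes exactly three
	// iterations, one per requested fd in that byte.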
	fdCount := 0
	for i := 0; i < nBytes; i++ {
		v := r[i] | w[i] | e[i]
		for v != 0 {
			v &= (v - 1)
			fdCount++
		}
	}

	// Build the PollFD array.
	pfd := make([]linux.PollFD, 0, fdCount)
	var fd int32
	for i := 0; i < nBytes; i++ {
		rV, wV, eV := r[i], w[i], e[i]
		v := rV | wV | eV
		m := byte(1)
		for j := 0; j < 8; j++ {
			if (v & m) != 0 {
				// Make sure the fd is valid and decrement the reference
				// immediately to ensure we don't leak. Note, another thread
				// might be about to close fd. This is racy, but that's
				// OK. Linux is racy in the same way.
				file := t.GetFileVFS2(fd)
				if file == nil {
					return 0, linuxerr.EBADF
				}
				file.DecRef(t)

				var mask int16
				if (rV & m) != 0 {
					mask |= selectReadEvents
				}

				if (wV & m) != 0 {
					mask |= selectWriteEvents
				}

				if (eV & m) != 0 {
					mask |= selectExceptEvents
				}

				pfd = append(pfd, linux.PollFD{
					FD:     fd,
					Events: mask,
				})
			}

			fd++
			m <<= 1
		}
	}

	// Do the syscall, then count the number of bits set.
	if _, _, err = pollBlock(t, pfd, timeout); err != nil {
		return 0, syserror.ConvertIntr(err, syserror.EINTR)
	}

	// r, w, and e are currently event mask bitsets; unset bits corresponding
	// to events that *didn't* occur.
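	//
	// For example, if fd 5 was requested in both the read and write sets and
	// pollBlock reported only POLLIN for it, the bit 1<<5 stays set in r[0]
	// but is cleared from w[0]. Note that bitSetCount counts surviving bits,
	// so an fd that is ready for both reading and writing contributes 2 to
	// the return value, matching Linux.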
	bitSetCount := uintptr(0)
	for idx := range pfd {
		events := pfd[idx].REvents
		i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8)
		m := byte(1) << j
		if r[i]&m != 0 {
			if (events & selectReadEvents) != 0 {
				bitSetCount++
			} else {
				r[i] &^= m
			}
		}
		if w[i]&m != 0 {
			if (events & selectWriteEvents) != 0 {
				bitSetCount++
			} else {
				w[i] &^= m
			}
		}
		if e[i]&m != 0 {
			if (events & selectExceptEvents) != 0 {
				bitSetCount++
			} else {
				e[i] &^= m
			}
		}
	}

	// Copy updated vectors back.
	if readFDs != 0 {
		if _, err := t.CopyOutBytes(readFDs, r); err != nil {
			return 0, err
		}
	}

	if writeFDs != 0 {
		if _, err := t.CopyOutBytes(writeFDs, w); err != nil {
			return 0, err
		}
	}

	if exceptFDs != 0 {
		if _, err := t.CopyOutBytes(exceptFDs, e); err != nil {
			return 0, err
		}
	}

	return bitSetCount, nil
}

// timeoutRemaining returns the amount of time remaining for the specified
// timeout or 0 if it has elapsed.
//
// startNs must be from CLOCK_MONOTONIC.
func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration {
	now := t.Kernel().MonotonicClock().Now()
	remaining := timeout - now.Sub(startNs)
	if remaining < 0 {
		remaining = 0
	}
	return remaining
}

// copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr.
//
// startNs must be from CLOCK_MONOTONIC.
func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr hostarch.Addr) error {
	if timeout <= 0 {
		return nil
	}
	remaining := timeoutRemaining(t, startNs, timeout)
	tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds())
	_, err := tsRemaining.CopyOut(t, timespecAddr)
	return err
}

// copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr.
//
// startNs must be from CLOCK_MONOTONIC.
func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr hostarch.Addr) error {
	if timeout <= 0 {
		return nil
	}
	remaining := timeoutRemaining(t, startNs, timeout)
	tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds())
	_, err := tvRemaining.CopyOut(t, timevalAddr)
	return err
}

// pollRestartBlock encapsulates the state required to restart poll(2) via
// restart_syscall(2).
//
// +stateify savable
type pollRestartBlock struct {
	pfdAddr hostarch.Addr
	nfds    uint
	timeout time.Duration
}

// Restart implements kernel.SyscallRestartBlock.Restart.
func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
	return poll(t, p.pfdAddr, p.nfds, p.timeout)
}

func poll(t *kernel.Task, pfdAddr hostarch.Addr, nfds uint, timeout time.Duration) (uintptr, error) {
	remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout)
	// On an interrupt, poll(2) is restarted with the remaining timeout.
	if linuxerr.Equals(linuxerr.EINTR, err) {
		t.SetSyscallRestartBlock(&pollRestartBlock{
			pfdAddr: pfdAddr,
			nfds:    nfds,
			timeout: remainingTimeout,
		})
		return 0, syserror.ERESTART_RESTARTBLOCK
	}
	return n, err
}
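
// Restart note: when poll is interrupted by a signal, the block registered
// above is saved on the task and ERESTART_RESTARTBLOCK is returned, so a
// subsequent restart_syscall(2) re-enters poll via pollRestartBlock.Restart
// with only the remaining timeout rather than the original one.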

// Poll implements linux syscall poll(2).
func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pfdAddr := args[0].Pointer()
	nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
	timeout := time.Duration(args[2].Int()) * time.Millisecond
	n, err := poll(t, pfdAddr, nfds, timeout)
	return n, nil, err
}
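
// Timeout mapping: poll(2) takes milliseconds, so a userspace call of
// poll(fds, n, 0) yields timeout == 0 and pollBlock returns without sleeping,
// while poll(fds, n, -1) yields a negative Duration, which pollBlock treats as
// "no timeout" and blocks until an event or a signal.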

// Ppoll implements linux syscall ppoll(2).
func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pfdAddr := args[0].Pointer()
	nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
	timespecAddr := args[2].Pointer()
	maskAddr := args[3].Pointer()
	maskSize := uint(args[4].Uint())

	timeout, err := copyTimespecInToDuration(t, timespecAddr)
	if err != nil {
		return 0, nil, err
	}

	var startNs ktime.Time
	if timeout > 0 {
		startNs = t.Kernel().MonotonicClock().Now()
	}

	if err := setTempSignalSet(t, maskAddr, maskSize); err != nil {
		return 0, nil, err
	}

	_, n, err := doPoll(t, pfdAddr, nfds, timeout)
	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
	// doPoll returns EINTR if interrupted, but ppoll is normally restartable
	// if interrupted by something other than a signal handled by the
	// application (i.e. returns ERESTARTNOHAND). However, if
	// copyOutTimespecRemaining failed, then the restarted ppoll would use the
	// wrong timeout, so the error should be left as EINTR.
	//
	// Note that this means that if err is nil but copyErr is not, copyErr is
	// ignored. This is consistent with Linux.
	if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil {
		err = syserror.ERESTARTNOHAND
	}
	return n, nil, err
}
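
// Usage note: the temporary signal mask is what makes ppoll(2) useful for
// race-free signal handling. A program that blocks SIGTERM, checks a "please
// exit" flag, and then calls ppoll with a mask that unblocks SIGTERM cannot
// miss a signal arriving between the check and the wait, because the mask is
// only swapped in (via setTempSignalSet above) for the duration of the poll.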

// Select implements linux syscall select(2).
func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	nfds := int(args[0].Int()) // select(2) uses an int.
	readFDs := args[1].Pointer()
	writeFDs := args[2].Pointer()
	exceptFDs := args[3].Pointer()
	timevalAddr := args[4].Pointer()

	// Use a negative Duration to indicate "no timeout".
	timeout := time.Duration(-1)
	if timevalAddr != 0 {
		var timeval linux.Timeval
		if _, err := timeval.CopyIn(t, timevalAddr); err != nil {
			return 0, nil, err
		}
		if timeval.Sec < 0 || timeval.Usec < 0 {
			return 0, nil, linuxerr.EINVAL
		}
		timeout = time.Duration(timeval.ToNsecCapped())
	}
	startNs := t.Kernel().MonotonicClock().Now()
	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
	copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
	// See comment in Ppoll.
	if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil {
		err = syserror.ERESTARTNOHAND
	}
	return n, nil, err
}

// +marshal
type sigSetWithSize struct {
	sigsetAddr   uint64
	sizeofSigset uint64
}
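
// ABI note: pselect6(2) cannot pass both a sigset pointer and its size in the
// remaining registers, so Linux defines the sixth argument as a pointer to a
// {sigset address, sigset size} pair; sigSetWithSize mirrors that layout and
// is unpacked in Pselect below.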

// Pselect implements linux syscall pselect(2).
func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	nfds := int(args[0].Int()) // select(2) uses an int.
	readFDs := args[1].Pointer()
	writeFDs := args[2].Pointer()
	exceptFDs := args[3].Pointer()
	timespecAddr := args[4].Pointer()
	maskWithSizeAddr := args[5].Pointer()

	timeout, err := copyTimespecInToDuration(t, timespecAddr)
	if err != nil {
		return 0, nil, err
	}

	var startNs ktime.Time
	if timeout > 0 {
		startNs = t.Kernel().MonotonicClock().Now()
	}

	if maskWithSizeAddr != 0 {
		if t.Arch().Width() != 8 {
			panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width()))
		}
		var maskStruct sigSetWithSize
		if _, err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil {
			return 0, nil, err
		}
		if err := setTempSignalSet(t, hostarch.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil {
			return 0, nil, err
		}
	}

	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
	// See comment in Ppoll.
	if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil {
		err = syserror.ERESTARTNOHAND
	}
	return n, nil, err
}

// copyTimespecInToDuration copies a Timespec from the untrusted app range,
// validates it and converts it to a Duration.
//
// If the Timespec is larger than what can be represented in a Duration, the
// returned value is the maximum that Duration will allow.
//
// If timespecAddr is NULL, the returned value is negative.
func copyTimespecInToDuration(t *kernel.Task, timespecAddr hostarch.Addr) (time.Duration, error) {
	// Use a negative Duration to indicate "no timeout".
	timeout := time.Duration(-1)
	if timespecAddr != 0 {
		var timespec linux.Timespec
		if _, err := timespec.CopyIn(t, timespecAddr); err != nil {
			return 0, err
		}
		if !timespec.Valid() {
			return 0, linuxerr.EINVAL
		}
		timeout = time.Duration(timespec.ToNsecCapped())
	}
	return timeout, nil
}
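
// Examples: a NULL timespecAddr yields -1 ("block forever"); {Sec: 0, Nsec: 0}
// yields 0 (non-blocking); {Sec: 1, Nsec: 5e8} yields 1.5s. A timespec with a
// negative Sec or with Nsec outside [0, 1e9) fails Valid() and the syscall
// returns EINVAL.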

func setTempSignalSet(t *kernel.Task, maskAddr hostarch.Addr, maskSize uint) error {
	if maskAddr == 0 {
		return nil
	}
	if maskSize != linux.SignalSetSize {
		return linuxerr.EINVAL
	}
	var mask linux.SignalSet
	if _, err := mask.CopyIn(t, maskAddr); err != nil {
		return err
	}
	mask &^= kernel.UnblockableSignals
	oldmask := t.SignalMask()
	t.SetSignalMask(mask)
	t.SetSavedSignalMask(oldmask)
	return nil
}
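
// Note (partly an assumption about kernel code outside this file): the
// unblockable signals (SIGKILL and SIGSTOP) are stripped from the requested
// mask, the new mask takes effect immediately via SetSignalMask, and the
// previous mask is recorded with SetSavedSignalMask so the task's
// syscall-exit/signal-delivery path can restore it once the poll or select
// completes.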