github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/syscalls/linux/vfs2/poll.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package vfs2 16 17 import ( 18 "fmt" 19 "time" 20 21 "github.com/SagerNet/gvisor/pkg/abi/linux" 22 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 23 "github.com/SagerNet/gvisor/pkg/sentry/arch" 24 "github.com/SagerNet/gvisor/pkg/sentry/kernel" 25 ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time" 26 "github.com/SagerNet/gvisor/pkg/sentry/limits" 27 "github.com/SagerNet/gvisor/pkg/sentry/vfs" 28 "github.com/SagerNet/gvisor/pkg/syserror" 29 "github.com/SagerNet/gvisor/pkg/waiter" 30 31 "github.com/SagerNet/gvisor/pkg/hostarch" 32 ) 33 34 // fileCap is the maximum allowable files for poll & select. This has no 35 // equivalent in Linux; it exists in gVisor since allocation failure in Go is 36 // unrecoverable. 37 const fileCap = 1024 * 1024 38 39 // Masks for "readable", "writable", and "exceptional" events as defined by 40 // select(2). 41 const ( 42 // selectReadEvents is analogous to the Linux kernel's 43 // fs/select.c:POLLIN_SET. 44 selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR 45 46 // selectWriteEvents is analogous to the Linux kernel's 47 // fs/select.c:POLLOUT_SET. 48 selectWriteEvents = linux.POLLOUT | linux.POLLERR 49 50 // selectExceptEvents is analogous to the Linux kernel's 51 // fs/select.c:POLLEX_SET. 52 selectExceptEvents = linux.POLLPRI 53 ) 54 55 // pollState tracks the associated file description and waiter of a PollFD. 56 type pollState struct { 57 file *vfs.FileDescription 58 waiter waiter.Entry 59 } 60 61 // initReadiness gets the current ready mask for the file represented by the FD 62 // stored in pfd.FD. If a channel is passed in, the waiter entry in "state" is 63 // used to register with the file for event notifications, and a reference to 64 // the file is stored in "state". 65 func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) { 66 if pfd.FD < 0 { 67 pfd.REvents = 0 68 return 69 } 70 71 file := t.GetFileVFS2(pfd.FD) 72 if file == nil { 73 pfd.REvents = linux.POLLNVAL 74 return 75 } 76 77 if ch == nil { 78 defer file.DecRef(t) 79 } else { 80 state.file = file 81 state.waiter, _ = waiter.NewChannelEntry(ch) 82 file.EventRegister(&state.waiter, waiter.EventMaskFromLinux(uint32(pfd.Events))) 83 } 84 85 r := file.Readiness(waiter.EventMaskFromLinux(uint32(pfd.Events))) 86 pfd.REvents = int16(r.ToLinux()) & pfd.Events 87 } 88 89 // releaseState releases all the pollState in "state". 90 func releaseState(t *kernel.Task, state []pollState) { 91 for i := range state { 92 if state[i].file != nil { 93 state[i].file.EventUnregister(&state[i].waiter) 94 state[i].file.DecRef(t) 95 } 96 } 97 } 98 99 // pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout" 100 // when "timeout" is greater than zero. 101 // 102 // pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or 103 // positive if interrupted by a signal. 104 func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) { 105 var ch chan struct{} 106 if timeout != 0 { 107 ch = make(chan struct{}, 1) 108 } 109 110 // Register for event notification in the files involved if we may 111 // block (timeout not zero). Once we find a file that has a non-zero 112 // result, we stop registering for events but still go through all files 113 // to get their ready masks. 114 state := make([]pollState, len(pfd)) 115 defer releaseState(t, state) 116 n := uintptr(0) 117 for i := range pfd { 118 initReadiness(t, &pfd[i], &state[i], ch) 119 if pfd[i].REvents != 0 { 120 n++ 121 ch = nil 122 } 123 } 124 125 if timeout == 0 { 126 return timeout, n, nil 127 } 128 129 haveTimeout := timeout >= 0 130 131 for n == 0 { 132 var err error 133 // Wait for a notification. 134 timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout) 135 if err != nil { 136 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 137 err = nil 138 } 139 return timeout, 0, err 140 } 141 142 // We got notified, count how many files are ready. If none, 143 // then this was a spurious notification, and we just go back 144 // to sleep with the remaining timeout. 145 for i := range state { 146 if state[i].file == nil { 147 continue 148 } 149 150 r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events))) 151 rl := int16(r.ToLinux()) & pfd[i].Events 152 if rl != 0 { 153 pfd[i].REvents = rl 154 n++ 155 } 156 } 157 } 158 159 return timeout, n, nil 160 } 161 162 // copyInPollFDs copies an array of struct pollfd unless nfds exceeds the max. 163 func copyInPollFDs(t *kernel.Task, addr hostarch.Addr, nfds uint) ([]linux.PollFD, error) { 164 if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) { 165 return nil, linuxerr.EINVAL 166 } 167 168 pfd := make([]linux.PollFD, nfds) 169 if nfds > 0 { 170 if _, err := linux.CopyPollFDSliceIn(t, addr, pfd); err != nil { 171 return nil, err 172 } 173 } 174 175 return pfd, nil 176 } 177 178 func doPoll(t *kernel.Task, addr hostarch.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) { 179 pfd, err := copyInPollFDs(t, addr, nfds) 180 if err != nil { 181 return timeout, 0, err 182 } 183 184 // Compatibility warning: Linux adds POLLHUP and POLLERR just before 185 // polling, in fs/select.c:do_pollfd(). Since pfd is copied out after 186 // polling, changing event masks here is an application-visible difference. 187 // (Linux also doesn't copy out event masks at all, only revents.) 188 for i := range pfd { 189 pfd[i].Events |= linux.POLLHUP | linux.POLLERR 190 } 191 remainingTimeout, n, err := pollBlock(t, pfd, timeout) 192 err = syserror.ConvertIntr(err, syserror.EINTR) 193 194 // The poll entries are copied out regardless of whether 195 // any are set or not. This aligns with the Linux behavior. 196 if nfds > 0 && err == nil { 197 if _, err := linux.CopyPollFDSliceOut(t, addr, pfd); err != nil { 198 return remainingTimeout, 0, err 199 } 200 } 201 202 return remainingTimeout, n, err 203 } 204 205 // CopyInFDSet copies an fd set from select(2)/pselect(2). 206 func CopyInFDSet(t *kernel.Task, addr hostarch.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) { 207 set := make([]byte, nBytes) 208 209 if addr != 0 { 210 if _, err := t.CopyInBytes(addr, set); err != nil { 211 return nil, err 212 } 213 // If we only use part of the last byte, mask out the extraneous bits. 214 // 215 // N.B. This only works on little-endian architectures. 216 if nBitsInLastPartialByte != 0 { 217 set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte 218 } 219 } 220 return set, nil 221 } 222 223 func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs hostarch.Addr, timeout time.Duration) (uintptr, error) { 224 if nfds < 0 || nfds > fileCap { 225 return 0, linuxerr.EINVAL 226 } 227 228 // Calculate the size of the fd sets (one bit per fd). 229 nBytes := (nfds + 7) / 8 230 nBitsInLastPartialByte := nfds % 8 231 232 // Capture all the provided input vectors. 233 r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte) 234 if err != nil { 235 return 0, err 236 } 237 w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte) 238 if err != nil { 239 return 0, err 240 } 241 e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte) 242 if err != nil { 243 return 0, err 244 } 245 246 // Count how many FDs are actually being requested so that we can build 247 // a PollFD array. 248 fdCount := 0 249 for i := 0; i < nBytes; i++ { 250 v := r[i] | w[i] | e[i] 251 for v != 0 { 252 v &= (v - 1) 253 fdCount++ 254 } 255 } 256 257 // Build the PollFD array. 258 pfd := make([]linux.PollFD, 0, fdCount) 259 var fd int32 260 for i := 0; i < nBytes; i++ { 261 rV, wV, eV := r[i], w[i], e[i] 262 v := rV | wV | eV 263 m := byte(1) 264 for j := 0; j < 8; j++ { 265 if (v & m) != 0 { 266 // Make sure the fd is valid and decrement the reference 267 // immediately to ensure we don't leak. Note, another thread 268 // might be about to close fd. This is racy, but that's 269 // OK. Linux is racy in the same way. 270 file := t.GetFileVFS2(fd) 271 if file == nil { 272 return 0, linuxerr.EBADF 273 } 274 file.DecRef(t) 275 276 var mask int16 277 if (rV & m) != 0 { 278 mask |= selectReadEvents 279 } 280 281 if (wV & m) != 0 { 282 mask |= selectWriteEvents 283 } 284 285 if (eV & m) != 0 { 286 mask |= selectExceptEvents 287 } 288 289 pfd = append(pfd, linux.PollFD{ 290 FD: fd, 291 Events: mask, 292 }) 293 } 294 295 fd++ 296 m <<= 1 297 } 298 } 299 300 // Do the syscall, then count the number of bits set. 301 if _, _, err = pollBlock(t, pfd, timeout); err != nil { 302 return 0, syserror.ConvertIntr(err, syserror.EINTR) 303 } 304 305 // r, w, and e are currently event mask bitsets; unset bits corresponding 306 // to events that *didn't* occur. 307 bitSetCount := uintptr(0) 308 for idx := range pfd { 309 events := pfd[idx].REvents 310 i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8) 311 m := byte(1) << j 312 if r[i]&m != 0 { 313 if (events & selectReadEvents) != 0 { 314 bitSetCount++ 315 } else { 316 r[i] &^= m 317 } 318 } 319 if w[i]&m != 0 { 320 if (events & selectWriteEvents) != 0 { 321 bitSetCount++ 322 } else { 323 w[i] &^= m 324 } 325 } 326 if e[i]&m != 0 { 327 if (events & selectExceptEvents) != 0 { 328 bitSetCount++ 329 } else { 330 e[i] &^= m 331 } 332 } 333 } 334 335 // Copy updated vectors back. 336 if readFDs != 0 { 337 if _, err := t.CopyOutBytes(readFDs, r); err != nil { 338 return 0, err 339 } 340 } 341 342 if writeFDs != 0 { 343 if _, err := t.CopyOutBytes(writeFDs, w); err != nil { 344 return 0, err 345 } 346 } 347 348 if exceptFDs != 0 { 349 if _, err := t.CopyOutBytes(exceptFDs, e); err != nil { 350 return 0, err 351 } 352 } 353 354 return bitSetCount, nil 355 } 356 357 // timeoutRemaining returns the amount of time remaining for the specified 358 // timeout or 0 if it has elapsed. 359 // 360 // startNs must be from CLOCK_MONOTONIC. 361 func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration { 362 now := t.Kernel().MonotonicClock().Now() 363 remaining := timeout - now.Sub(startNs) 364 if remaining < 0 { 365 remaining = 0 366 } 367 return remaining 368 } 369 370 // copyOutTimespecRemaining copies the time remaining in timeout to timespecAddr. 371 // 372 // startNs must be from CLOCK_MONOTONIC. 373 func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr hostarch.Addr) error { 374 if timeout <= 0 { 375 return nil 376 } 377 remaining := timeoutRemaining(t, startNs, timeout) 378 tsRemaining := linux.NsecToTimespec(remaining.Nanoseconds()) 379 _, err := tsRemaining.CopyOut(t, timespecAddr) 380 return err 381 } 382 383 // copyOutTimevalRemaining copies the time remaining in timeout to timevalAddr. 384 // 385 // startNs must be from CLOCK_MONOTONIC. 386 func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr hostarch.Addr) error { 387 if timeout <= 0 { 388 return nil 389 } 390 remaining := timeoutRemaining(t, startNs, timeout) 391 tvRemaining := linux.NsecToTimeval(remaining.Nanoseconds()) 392 _, err := tvRemaining.CopyOut(t, timevalAddr) 393 return err 394 } 395 396 // pollRestartBlock encapsulates the state required to restart poll(2) via 397 // restart_syscall(2). 398 // 399 // +stateify savable 400 type pollRestartBlock struct { 401 pfdAddr hostarch.Addr 402 nfds uint 403 timeout time.Duration 404 } 405 406 // Restart implements kernel.SyscallRestartBlock.Restart. 407 func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) { 408 return poll(t, p.pfdAddr, p.nfds, p.timeout) 409 } 410 411 func poll(t *kernel.Task, pfdAddr hostarch.Addr, nfds uint, timeout time.Duration) (uintptr, error) { 412 remainingTimeout, n, err := doPoll(t, pfdAddr, nfds, timeout) 413 // On an interrupt poll(2) is restarted with the remaining timeout. 414 if linuxerr.Equals(linuxerr.EINTR, err) { 415 t.SetSyscallRestartBlock(&pollRestartBlock{ 416 pfdAddr: pfdAddr, 417 nfds: nfds, 418 timeout: remainingTimeout, 419 }) 420 return 0, syserror.ERESTART_RESTARTBLOCK 421 } 422 return n, err 423 } 424 425 // Poll implements linux syscall poll(2). 426 func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 427 pfdAddr := args[0].Pointer() 428 nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. 429 timeout := time.Duration(args[2].Int()) * time.Millisecond 430 n, err := poll(t, pfdAddr, nfds, timeout) 431 return n, nil, err 432 } 433 434 // Ppoll implements linux syscall ppoll(2). 435 func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 436 pfdAddr := args[0].Pointer() 437 nfds := uint(args[1].Uint()) // poll(2) uses unsigned long. 438 timespecAddr := args[2].Pointer() 439 maskAddr := args[3].Pointer() 440 maskSize := uint(args[4].Uint()) 441 442 timeout, err := copyTimespecInToDuration(t, timespecAddr) 443 if err != nil { 444 return 0, nil, err 445 } 446 447 var startNs ktime.Time 448 if timeout > 0 { 449 startNs = t.Kernel().MonotonicClock().Now() 450 } 451 452 if err := setTempSignalSet(t, maskAddr, maskSize); err != nil { 453 return 0, nil, err 454 } 455 456 _, n, err := doPoll(t, pfdAddr, nfds, timeout) 457 copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) 458 // doPoll returns EINTR if interrupted, but ppoll is normally restartable 459 // if interrupted by something other than a signal handled by the 460 // application (i.e. returns ERESTARTNOHAND). However, if 461 // copyOutTimespecRemaining failed, then the restarted ppoll would use the 462 // wrong timeout, so the error should be left as EINTR. 463 // 464 // Note that this means that if err is nil but copyErr is not, copyErr is 465 // ignored. This is consistent with Linux. 466 if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { 467 err = syserror.ERESTARTNOHAND 468 } 469 return n, nil, err 470 } 471 472 // Select implements linux syscall select(2). 473 func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 474 nfds := int(args[0].Int()) // select(2) uses an int. 475 readFDs := args[1].Pointer() 476 writeFDs := args[2].Pointer() 477 exceptFDs := args[3].Pointer() 478 timevalAddr := args[4].Pointer() 479 480 // Use a negative Duration to indicate "no timeout". 481 timeout := time.Duration(-1) 482 if timevalAddr != 0 { 483 var timeval linux.Timeval 484 if _, err := timeval.CopyIn(t, timevalAddr); err != nil { 485 return 0, nil, err 486 } 487 if timeval.Sec < 0 || timeval.Usec < 0 { 488 return 0, nil, linuxerr.EINVAL 489 } 490 timeout = time.Duration(timeval.ToNsecCapped()) 491 } 492 startNs := t.Kernel().MonotonicClock().Now() 493 n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) 494 copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr) 495 // See comment in Ppoll. 496 if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { 497 err = syserror.ERESTARTNOHAND 498 } 499 return n, nil, err 500 } 501 502 // +marshal 503 type sigSetWithSize struct { 504 sigsetAddr uint64 505 sizeofSigset uint64 506 } 507 508 // Pselect implements linux syscall pselect(2). 509 func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 510 nfds := int(args[0].Int()) // select(2) uses an int. 511 readFDs := args[1].Pointer() 512 writeFDs := args[2].Pointer() 513 exceptFDs := args[3].Pointer() 514 timespecAddr := args[4].Pointer() 515 maskWithSizeAddr := args[5].Pointer() 516 517 timeout, err := copyTimespecInToDuration(t, timespecAddr) 518 if err != nil { 519 return 0, nil, err 520 } 521 522 var startNs ktime.Time 523 if timeout > 0 { 524 startNs = t.Kernel().MonotonicClock().Now() 525 } 526 527 if maskWithSizeAddr != 0 { 528 if t.Arch().Width() != 8 { 529 panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width())) 530 } 531 var maskStruct sigSetWithSize 532 if _, err := maskStruct.CopyIn(t, maskWithSizeAddr); err != nil { 533 return 0, nil, err 534 } 535 if err := setTempSignalSet(t, hostarch.Addr(maskStruct.sigsetAddr), uint(maskStruct.sizeofSigset)); err != nil { 536 return 0, nil, err 537 } 538 } 539 540 n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout) 541 copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr) 542 // See comment in Ppoll. 543 if linuxerr.Equals(linuxerr.EINTR, err) && copyErr == nil { 544 err = syserror.ERESTARTNOHAND 545 } 546 return n, nil, err 547 } 548 549 // copyTimespecInToDuration copies a Timespec from the untrusted app range, 550 // validates it and converts it to a Duration. 551 // 552 // If the Timespec is larger than what can be represented in a Duration, the 553 // returned value is the maximum that Duration will allow. 554 // 555 // If timespecAddr is NULL, the returned value is negative. 556 func copyTimespecInToDuration(t *kernel.Task, timespecAddr hostarch.Addr) (time.Duration, error) { 557 // Use a negative Duration to indicate "no timeout". 558 timeout := time.Duration(-1) 559 if timespecAddr != 0 { 560 var timespec linux.Timespec 561 if _, err := timespec.CopyIn(t, timespecAddr); err != nil { 562 return 0, err 563 } 564 if !timespec.Valid() { 565 return 0, linuxerr.EINVAL 566 } 567 timeout = time.Duration(timespec.ToNsecCapped()) 568 } 569 return timeout, nil 570 } 571 572 func setTempSignalSet(t *kernel.Task, maskAddr hostarch.Addr, maskSize uint) error { 573 if maskAddr == 0 { 574 return nil 575 } 576 if maskSize != linux.SignalSetSize { 577 return linuxerr.EINVAL 578 } 579 var mask linux.SignalSet 580 if _, err := mask.CopyIn(t, maskAddr); err != nil { 581 return err 582 } 583 mask &^= kernel.UnblockableSignals 584 oldmask := t.SignalMask() 585 t.SetSignalMask(mask) 586 t.SetSavedSignalMask(oldmask) 587 return nil 588 }