github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/syscalls/linux/vfs2/read_write.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package vfs2 16 17 import ( 18 "time" 19 20 "github.com/SagerNet/gvisor/pkg/abi/linux" 21 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 22 "github.com/SagerNet/gvisor/pkg/sentry/arch" 23 "github.com/SagerNet/gvisor/pkg/sentry/kernel" 24 ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time" 25 "github.com/SagerNet/gvisor/pkg/sentry/socket" 26 slinux "github.com/SagerNet/gvisor/pkg/sentry/syscalls/linux" 27 "github.com/SagerNet/gvisor/pkg/sentry/vfs" 28 "github.com/SagerNet/gvisor/pkg/syserror" 29 "github.com/SagerNet/gvisor/pkg/usermem" 30 "github.com/SagerNet/gvisor/pkg/waiter" 31 ) 32 33 const ( 34 eventMaskRead = waiter.EventRdNorm | waiter.EventIn | waiter.EventHUp | waiter.EventErr 35 eventMaskWrite = waiter.EventWrNorm | waiter.EventOut | waiter.EventHUp | waiter.EventErr 36 ) 37 38 // Read implements Linux syscall read(2). 39 func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 40 fd := args[0].Int() 41 addr := args[1].Pointer() 42 size := args[2].SizeT() 43 44 file := t.GetFileVFS2(fd) 45 if file == nil { 46 return 0, nil, linuxerr.EBADF 47 } 48 defer file.DecRef(t) 49 50 // Check that the size is legitimate. 51 si := int(size) 52 if si < 0 { 53 return 0, nil, linuxerr.EINVAL 54 } 55 56 // Get the destination of the read. 57 dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ 58 AddressSpaceActive: true, 59 }) 60 if err != nil { 61 return 0, nil, err 62 } 63 64 n, err := read(t, file, dst, vfs.ReadOptions{}) 65 t.IOUsage().AccountReadSyscall(n) 66 return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "read", file) 67 } 68 69 // Readv implements Linux syscall readv(2). 70 func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 71 fd := args[0].Int() 72 addr := args[1].Pointer() 73 iovcnt := int(args[2].Int()) 74 75 file := t.GetFileVFS2(fd) 76 if file == nil { 77 return 0, nil, linuxerr.EBADF 78 } 79 defer file.DecRef(t) 80 81 // Get the destination of the read. 82 dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ 83 AddressSpaceActive: true, 84 }) 85 if err != nil { 86 return 0, nil, err 87 } 88 89 n, err := read(t, file, dst, vfs.ReadOptions{}) 90 t.IOUsage().AccountReadSyscall(n) 91 return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "readv", file) 92 } 93 94 func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 95 n, err := file.Read(t, dst, opts) 96 if err != syserror.ErrWouldBlock { 97 return n, err 98 } 99 100 allowBlock, deadline, hasDeadline := blockPolicy(t, file) 101 if !allowBlock { 102 return n, err 103 } 104 105 // Register for notifications. 106 w, ch := waiter.NewChannelEntry(nil) 107 file.EventRegister(&w, eventMaskRead) 108 109 total := n 110 for { 111 // Shorten dst to reflect bytes previously read. 112 dst = dst.DropFirst(int(n)) 113 114 // Issue the request and break out if it completes with anything other than 115 // "would block". 116 n, err = file.Read(t, dst, opts) 117 total += n 118 if err != syserror.ErrWouldBlock { 119 break 120 } 121 122 // Wait for a notification that we should retry. 123 if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { 124 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 125 err = syserror.ErrWouldBlock 126 } 127 break 128 } 129 } 130 file.EventUnregister(&w) 131 132 return total, err 133 } 134 135 // Pread64 implements Linux syscall pread64(2). 136 func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 137 fd := args[0].Int() 138 addr := args[1].Pointer() 139 size := args[2].SizeT() 140 offset := args[3].Int64() 141 142 file := t.GetFileVFS2(fd) 143 if file == nil { 144 return 0, nil, linuxerr.EBADF 145 } 146 defer file.DecRef(t) 147 148 // Check that the offset is legitimate and does not overflow. 149 if offset < 0 || offset+int64(size) < 0 { 150 return 0, nil, linuxerr.EINVAL 151 } 152 153 // Check that the size is legitimate. 154 si := int(size) 155 if si < 0 { 156 return 0, nil, linuxerr.EINVAL 157 } 158 159 // Get the destination of the read. 160 dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ 161 AddressSpaceActive: true, 162 }) 163 if err != nil { 164 return 0, nil, err 165 } 166 167 n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) 168 t.IOUsage().AccountReadSyscall(n) 169 return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pread64", file) 170 } 171 172 // Preadv implements Linux syscall preadv(2). 173 func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 174 fd := args[0].Int() 175 addr := args[1].Pointer() 176 iovcnt := int(args[2].Int()) 177 offset := args[3].Int64() 178 179 file := t.GetFileVFS2(fd) 180 if file == nil { 181 return 0, nil, linuxerr.EBADF 182 } 183 defer file.DecRef(t) 184 185 // Check that the offset is legitimate. 186 if offset < 0 { 187 return 0, nil, linuxerr.EINVAL 188 } 189 190 // Get the destination of the read. 191 dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ 192 AddressSpaceActive: true, 193 }) 194 if err != nil { 195 return 0, nil, err 196 } 197 198 n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) 199 t.IOUsage().AccountReadSyscall(n) 200 return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv", file) 201 } 202 203 // Preadv2 implements Linux syscall preadv2(2). 204 func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 205 // While the glibc signature is 206 // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) 207 // the actual syscall 208 // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142) 209 // splits the offset argument into a high/low value for compatibility with 210 // 32-bit architectures. The flags argument is the 6th argument (index 5). 211 fd := args[0].Int() 212 addr := args[1].Pointer() 213 iovcnt := int(args[2].Int()) 214 offset := args[3].Int64() 215 flags := args[5].Int() 216 217 file := t.GetFileVFS2(fd) 218 if file == nil { 219 return 0, nil, linuxerr.EBADF 220 } 221 defer file.DecRef(t) 222 223 // Check that the offset is legitimate. 224 if offset < -1 { 225 return 0, nil, linuxerr.EINVAL 226 } 227 228 // Get the destination of the read. 229 dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ 230 AddressSpaceActive: true, 231 }) 232 if err != nil { 233 return 0, nil, err 234 } 235 236 opts := vfs.ReadOptions{ 237 Flags: uint32(flags), 238 } 239 var n int64 240 if offset == -1 { 241 n, err = read(t, file, dst, opts) 242 } else { 243 n, err = pread(t, file, dst, offset, opts) 244 } 245 t.IOUsage().AccountReadSyscall(n) 246 return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "preadv2", file) 247 } 248 249 func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { 250 n, err := file.PRead(t, dst, offset, opts) 251 if err != syserror.ErrWouldBlock { 252 return n, err 253 } 254 255 allowBlock, deadline, hasDeadline := blockPolicy(t, file) 256 if !allowBlock { 257 return n, err 258 } 259 260 // Register for notifications. 261 w, ch := waiter.NewChannelEntry(nil) 262 file.EventRegister(&w, eventMaskRead) 263 264 total := n 265 for { 266 // Shorten dst to reflect bytes previously read. 267 dst = dst.DropFirst(int(n)) 268 269 // Issue the request and break out if it completes with anything other than 270 // "would block". 271 n, err = file.PRead(t, dst, offset+total, opts) 272 total += n 273 if err != syserror.ErrWouldBlock { 274 break 275 } 276 277 // Wait for a notification that we should retry. 278 if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { 279 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 280 err = syserror.ErrWouldBlock 281 } 282 break 283 } 284 } 285 file.EventUnregister(&w) 286 return total, err 287 } 288 289 // Write implements Linux syscall write(2). 290 func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 291 fd := args[0].Int() 292 addr := args[1].Pointer() 293 size := args[2].SizeT() 294 295 file := t.GetFileVFS2(fd) 296 if file == nil { 297 return 0, nil, linuxerr.EBADF 298 } 299 defer file.DecRef(t) 300 301 // Check that the size is legitimate. 302 si := int(size) 303 if si < 0 { 304 return 0, nil, linuxerr.EINVAL 305 } 306 307 // Get the source of the write. 308 src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ 309 AddressSpaceActive: true, 310 }) 311 if err != nil { 312 return 0, nil, err 313 } 314 315 n, err := write(t, file, src, vfs.WriteOptions{}) 316 t.IOUsage().AccountWriteSyscall(n) 317 return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "write", file) 318 } 319 320 // Writev implements Linux syscall writev(2). 321 func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 322 fd := args[0].Int() 323 addr := args[1].Pointer() 324 iovcnt := int(args[2].Int()) 325 326 file := t.GetFileVFS2(fd) 327 if file == nil { 328 return 0, nil, linuxerr.EBADF 329 } 330 defer file.DecRef(t) 331 332 // Get the source of the write. 333 src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ 334 AddressSpaceActive: true, 335 }) 336 if err != nil { 337 return 0, nil, err 338 } 339 340 n, err := write(t, file, src, vfs.WriteOptions{}) 341 t.IOUsage().AccountWriteSyscall(n) 342 return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "writev", file) 343 } 344 345 func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { 346 n, err := file.Write(t, src, opts) 347 if err != syserror.ErrWouldBlock { 348 return n, err 349 } 350 351 allowBlock, deadline, hasDeadline := blockPolicy(t, file) 352 if !allowBlock { 353 return n, err 354 } 355 356 // Register for notifications. 357 w, ch := waiter.NewChannelEntry(nil) 358 file.EventRegister(&w, eventMaskWrite) 359 360 total := n 361 for { 362 // Shorten src to reflect bytes previously written. 363 src = src.DropFirst(int(n)) 364 365 // Issue the request and break out if it completes with anything other than 366 // "would block". 367 n, err = file.Write(t, src, opts) 368 total += n 369 if err != syserror.ErrWouldBlock { 370 break 371 } 372 373 // Wait for a notification that we should retry. 374 if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { 375 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 376 err = syserror.ErrWouldBlock 377 } 378 break 379 } 380 } 381 file.EventUnregister(&w) 382 return total, err 383 } 384 385 // Pwrite64 implements Linux syscall pwrite64(2). 386 func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 387 fd := args[0].Int() 388 addr := args[1].Pointer() 389 size := args[2].SizeT() 390 offset := args[3].Int64() 391 392 file := t.GetFileVFS2(fd) 393 if file == nil { 394 return 0, nil, linuxerr.EBADF 395 } 396 defer file.DecRef(t) 397 398 // Check that the offset is legitimate and does not overflow. 399 if offset < 0 || offset+int64(size) < 0 { 400 return 0, nil, linuxerr.EINVAL 401 } 402 403 // Check that the size is legitimate. 404 si := int(size) 405 if si < 0 { 406 return 0, nil, linuxerr.EINVAL 407 } 408 409 // Get the source of the write. 410 src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ 411 AddressSpaceActive: true, 412 }) 413 if err != nil { 414 return 0, nil, err 415 } 416 417 n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) 418 t.IOUsage().AccountWriteSyscall(n) 419 return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwrite64", file) 420 } 421 422 // Pwritev implements Linux syscall pwritev(2). 423 func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 424 fd := args[0].Int() 425 addr := args[1].Pointer() 426 iovcnt := int(args[2].Int()) 427 offset := args[3].Int64() 428 429 file := t.GetFileVFS2(fd) 430 if file == nil { 431 return 0, nil, linuxerr.EBADF 432 } 433 defer file.DecRef(t) 434 435 // Check that the offset is legitimate. 436 if offset < 0 { 437 return 0, nil, linuxerr.EINVAL 438 } 439 440 // Get the source of the write. 441 src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ 442 AddressSpaceActive: true, 443 }) 444 if err != nil { 445 return 0, nil, err 446 } 447 448 n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) 449 t.IOUsage().AccountReadSyscall(n) 450 return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev", file) 451 } 452 453 // Pwritev2 implements Linux syscall pwritev2(2). 454 func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 455 // While the glibc signature is 456 // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) 457 // the actual syscall 458 // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162) 459 // splits the offset argument into a high/low value for compatibility with 460 // 32-bit architectures. The flags argument is the 6th argument (index 5). 461 fd := args[0].Int() 462 addr := args[1].Pointer() 463 iovcnt := int(args[2].Int()) 464 offset := args[3].Int64() 465 flags := args[5].Int() 466 467 file := t.GetFileVFS2(fd) 468 if file == nil { 469 return 0, nil, linuxerr.EBADF 470 } 471 defer file.DecRef(t) 472 473 // Check that the offset is legitimate. 474 if offset < -1 { 475 return 0, nil, linuxerr.EINVAL 476 } 477 478 // Get the source of the write. 479 src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ 480 AddressSpaceActive: true, 481 }) 482 if err != nil { 483 return 0, nil, err 484 } 485 486 opts := vfs.WriteOptions{ 487 Flags: uint32(flags), 488 } 489 var n int64 490 if offset == -1 { 491 n, err = write(t, file, src, opts) 492 } else { 493 n, err = pwrite(t, file, src, offset, opts) 494 } 495 t.IOUsage().AccountWriteSyscall(n) 496 return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "pwritev2", file) 497 } 498 499 func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { 500 n, err := file.PWrite(t, src, offset, opts) 501 if err != syserror.ErrWouldBlock { 502 return n, err 503 } 504 505 allowBlock, deadline, hasDeadline := blockPolicy(t, file) 506 if !allowBlock { 507 return n, err 508 } 509 510 // Register for notifications. 511 w, ch := waiter.NewChannelEntry(nil) 512 file.EventRegister(&w, eventMaskWrite) 513 514 total := n 515 for { 516 // Shorten src to reflect bytes previously written. 517 src = src.DropFirst(int(n)) 518 519 // Issue the request and break out if it completes with anything other than 520 // "would block". 521 n, err = file.PWrite(t, src, offset+total, opts) 522 total += n 523 if err != syserror.ErrWouldBlock { 524 break 525 } 526 527 // Wait for a notification that we should retry. 528 if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { 529 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 530 err = syserror.ErrWouldBlock 531 } 532 break 533 } 534 } 535 file.EventUnregister(&w) 536 return total, err 537 } 538 539 func blockPolicy(t *kernel.Task, file *vfs.FileDescription) (allowBlock bool, deadline ktime.Time, hasDeadline bool) { 540 if file.StatusFlags()&linux.O_NONBLOCK != 0 { 541 return false, ktime.Time{}, false 542 } 543 // Sockets support read/write timeouts. 544 if s, ok := file.Impl().(socket.SocketVFS2); ok { 545 dl := s.RecvTimeout() 546 if dl < 0 { 547 return false, ktime.Time{}, false 548 } 549 if dl > 0 { 550 return true, t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond), true 551 } 552 } 553 return true, ktime.Time{}, false 554 } 555 556 // Lseek implements Linux syscall lseek(2). 557 func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 558 fd := args[0].Int() 559 offset := args[1].Int64() 560 whence := args[2].Int() 561 562 file := t.GetFileVFS2(fd) 563 if file == nil { 564 return 0, nil, linuxerr.EBADF 565 } 566 defer file.DecRef(t) 567 568 newoff, err := file.Seek(t, offset, whence) 569 return uintptr(newoff), nil, err 570 } 571 572 // Readahead implements readahead(2). 573 func Readahead(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 574 fd := args[0].Int() 575 offset := args[1].Int64() 576 size := args[2].SizeT() 577 578 file := t.GetFileVFS2(fd) 579 if file == nil { 580 return 0, nil, linuxerr.EBADF 581 } 582 defer file.DecRef(t) 583 584 // Check that the file is readable. 585 if !file.IsReadable() { 586 return 0, nil, linuxerr.EBADF 587 } 588 589 // Check that the size is valid. 590 if int(size) < 0 { 591 return 0, nil, linuxerr.EINVAL 592 } 593 594 // Check that the offset is legitimate and does not overflow. 595 if offset < 0 || offset+int64(size) < 0 { 596 return 0, nil, linuxerr.EINVAL 597 } 598 599 // Return EINVAL; if the underlying file type does not support readahead, 600 // then Linux will return EINVAL to indicate as much. In the future, we 601 // may extend this function to actually support readahead hints. 602 return 0, nil, linuxerr.EINVAL 603 }