github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/syscalls/linux/sys_read_write.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package linux 16 17 import ( 18 "time" 19 20 "github.com/metacubex/gvisor/pkg/abi/linux" 21 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 22 "github.com/metacubex/gvisor/pkg/sentry/arch" 23 "github.com/metacubex/gvisor/pkg/sentry/kernel" 24 ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time" 25 "github.com/metacubex/gvisor/pkg/sentry/socket" 26 "github.com/metacubex/gvisor/pkg/sentry/vfs" 27 "github.com/metacubex/gvisor/pkg/usermem" 28 "github.com/metacubex/gvisor/pkg/waiter" 29 ) 30 31 const ( 32 eventMaskRead = waiter.EventRdNorm | waiter.EventIn | waiter.EventHUp | waiter.EventErr | waiter.EventRdHUp 33 eventMaskWrite = waiter.EventWrNorm | waiter.EventOut | waiter.EventHUp | waiter.EventErr 34 ) 35 36 // Read implements Linux syscall read(2). 37 func Read(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 38 fd := args[0].Int() 39 addr := args[1].Pointer() 40 size := args[2].SizeT() 41 42 file := t.GetFile(fd) 43 if file == nil { 44 return 0, nil, linuxerr.EBADF 45 } 46 defer file.DecRef(t) 47 48 // Check that the size is legitimate. 49 si := int(size) 50 if si < 0 { 51 return 0, nil, linuxerr.EINVAL 52 } 53 54 // Get the destination of the read. 55 dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ 56 AddressSpaceActive: true, 57 }) 58 if err != nil { 59 return 0, nil, err 60 } 61 62 n, err := read(t, file, dst, vfs.ReadOptions{}) 63 t.IOUsage().AccountReadSyscall(n) 64 return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "read", file) 65 } 66 67 // Readv implements Linux syscall readv(2). 68 func Readv(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 69 fd := args[0].Int() 70 addr := args[1].Pointer() 71 iovcnt := int(args[2].Int()) 72 73 file := t.GetFile(fd) 74 if file == nil { 75 return 0, nil, linuxerr.EBADF 76 } 77 defer file.DecRef(t) 78 79 // Get the destination of the read. 80 dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ 81 AddressSpaceActive: true, 82 }) 83 if err != nil { 84 return 0, nil, err 85 } 86 87 n, err := read(t, file, dst, vfs.ReadOptions{}) 88 t.IOUsage().AccountReadSyscall(n) 89 return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "readv", file) 90 } 91 92 func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) { 93 n, err := file.Read(t, dst, opts) 94 if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { 95 return n, err 96 } 97 98 allowBlock, deadline, hasDeadline := blockPolicy(t, file) 99 if !allowBlock { 100 return n, err 101 } 102 103 // Register for notifications. 104 w, ch := waiter.NewChannelEntry(eventMaskRead) 105 if err := file.EventRegister(&w); err != nil { 106 return n, err 107 } 108 109 total := n 110 for { 111 // Shorten dst to reflect bytes previously read. 112 dst = dst.DropFirst(int(n)) 113 114 // Issue the request and break out if it completes with anything other than 115 // "would block". 116 n, err = file.Read(t, dst, opts) 117 total += n 118 if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { 119 break 120 } 121 122 // Wait for a notification that we should retry. 123 if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { 124 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 125 err = linuxerr.ErrWouldBlock 126 } 127 break 128 } 129 } 130 file.EventUnregister(&w) 131 132 return total, err 133 } 134 135 // Pread64 implements Linux syscall pread64(2). 136 func Pread64(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 137 fd := args[0].Int() 138 addr := args[1].Pointer() 139 size := args[2].SizeT() 140 offset := args[3].Int64() 141 142 file := t.GetFile(fd) 143 if file == nil { 144 return 0, nil, linuxerr.EBADF 145 } 146 defer file.DecRef(t) 147 148 // Check that the offset is legitimate and does not overflow. 149 if offset < 0 || offset+int64(size) < 0 { 150 return 0, nil, linuxerr.EINVAL 151 } 152 153 // Check that the size is legitimate. 154 si := int(size) 155 if si < 0 { 156 return 0, nil, linuxerr.EINVAL 157 } 158 159 // Get the destination of the read. 160 dst, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ 161 AddressSpaceActive: true, 162 }) 163 if err != nil { 164 return 0, nil, err 165 } 166 167 n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) 168 t.IOUsage().AccountReadSyscall(n) 169 return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pread64", file) 170 } 171 172 // Preadv implements Linux syscall preadv(2). 173 func Preadv(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 174 fd := args[0].Int() 175 addr := args[1].Pointer() 176 iovcnt := int(args[2].Int()) 177 offset := args[3].Int64() 178 179 file := t.GetFile(fd) 180 if file == nil { 181 return 0, nil, linuxerr.EBADF 182 } 183 defer file.DecRef(t) 184 185 // Check that the offset is legitimate. 186 if offset < 0 { 187 return 0, nil, linuxerr.EINVAL 188 } 189 190 // Get the destination of the read. 191 dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ 192 AddressSpaceActive: true, 193 }) 194 if err != nil { 195 return 0, nil, err 196 } 197 198 n, err := pread(t, file, dst, offset, vfs.ReadOptions{}) 199 t.IOUsage().AccountReadSyscall(n) 200 return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv", file) 201 } 202 203 // Preadv2 implements Linux syscall preadv2(2). 204 func Preadv2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 205 // While the glibc signature is 206 // preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) 207 // the actual syscall 208 // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142) 209 // splits the offset argument into a high/low value for compatibility with 210 // 32-bit architectures. The flags argument is the 6th argument (index 5). 211 fd := args[0].Int() 212 addr := args[1].Pointer() 213 iovcnt := int(args[2].Int()) 214 offset := args[3].Int64() 215 flags := args[5].Int() 216 217 file := t.GetFile(fd) 218 if file == nil { 219 return 0, nil, linuxerr.EBADF 220 } 221 defer file.DecRef(t) 222 223 // Check that the offset is legitimate. 224 if offset < -1 { 225 return 0, nil, linuxerr.EINVAL 226 } 227 228 // Get the destination of the read. 229 dst, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ 230 AddressSpaceActive: true, 231 }) 232 if err != nil { 233 return 0, nil, err 234 } 235 236 opts := vfs.ReadOptions{ 237 Flags: uint32(flags), 238 } 239 var n int64 240 if offset == -1 { 241 n, err = read(t, file, dst, opts) 242 } else { 243 n, err = pread(t, file, dst, offset, opts) 244 } 245 t.IOUsage().AccountReadSyscall(n) 246 return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "preadv2", file) 247 } 248 249 func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) { 250 n, err := file.PRead(t, dst, offset, opts) 251 if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { 252 return n, err 253 } 254 255 allowBlock, deadline, hasDeadline := blockPolicy(t, file) 256 if !allowBlock { 257 return n, err 258 } 259 260 // Register for notifications. 261 w, ch := waiter.NewChannelEntry(eventMaskRead) 262 if err := file.EventRegister(&w); err != nil { 263 return n, err 264 } 265 total := n 266 for { 267 // Shorten dst to reflect bytes previously read. 268 dst = dst.DropFirst(int(n)) 269 270 // Issue the request and break out if it completes with anything other than 271 // "would block". 272 n, err = file.PRead(t, dst, offset+total, opts) 273 total += n 274 if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { 275 break 276 } 277 278 // Wait for a notification that we should retry. 279 if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { 280 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 281 err = linuxerr.ErrWouldBlock 282 } 283 break 284 } 285 } 286 file.EventUnregister(&w) 287 return total, err 288 } 289 290 // Write implements Linux syscall write(2). 291 func Write(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 292 fd := args[0].Int() 293 addr := args[1].Pointer() 294 size := args[2].SizeT() 295 296 file := t.GetFile(fd) 297 if file == nil { 298 return 0, nil, linuxerr.EBADF 299 } 300 defer file.DecRef(t) 301 302 // Check that the size is legitimate. 303 si := int(size) 304 if si < 0 { 305 return 0, nil, linuxerr.EINVAL 306 } 307 308 // Get the source of the write. 309 src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ 310 AddressSpaceActive: true, 311 }) 312 if err != nil { 313 return 0, nil, err 314 } 315 316 n, err := write(t, file, src, vfs.WriteOptions{}) 317 t.IOUsage().AccountWriteSyscall(n) 318 return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "write", file) 319 } 320 321 // Writev implements Linux syscall writev(2). 322 func Writev(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 323 fd := args[0].Int() 324 addr := args[1].Pointer() 325 iovcnt := int(args[2].Int()) 326 327 file := t.GetFile(fd) 328 if file == nil { 329 return 0, nil, linuxerr.EBADF 330 } 331 defer file.DecRef(t) 332 333 // Get the source of the write. 334 src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ 335 AddressSpaceActive: true, 336 }) 337 if err != nil { 338 return 0, nil, err 339 } 340 341 n, err := write(t, file, src, vfs.WriteOptions{}) 342 t.IOUsage().AccountWriteSyscall(n) 343 return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "writev", file) 344 } 345 346 func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) { 347 n, err := file.Write(t, src, opts) 348 if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { 349 return n, err 350 } 351 352 allowBlock, deadline, hasDeadline := blockPolicy(t, file) 353 if !allowBlock { 354 return n, err 355 } 356 357 // Register for notifications. 358 w, ch := waiter.NewChannelEntry(eventMaskWrite) 359 if err := file.EventRegister(&w); err != nil { 360 return n, err 361 } 362 363 total := n 364 for { 365 // Shorten src to reflect bytes previously written. 366 src = src.DropFirst(int(n)) 367 368 // Issue the request and break out if it completes with anything other than 369 // "would block". 370 n, err = file.Write(t, src, opts) 371 total += n 372 if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { 373 break 374 } 375 376 // Wait for a notification that we should retry. 377 if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { 378 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 379 err = linuxerr.ErrWouldBlock 380 } 381 break 382 } 383 } 384 file.EventUnregister(&w) 385 return total, err 386 } 387 388 // Pwrite64 implements Linux syscall pwrite64(2). 389 func Pwrite64(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 390 fd := args[0].Int() 391 addr := args[1].Pointer() 392 size := args[2].SizeT() 393 offset := args[3].Int64() 394 395 file := t.GetFile(fd) 396 if file == nil { 397 return 0, nil, linuxerr.EBADF 398 } 399 defer file.DecRef(t) 400 401 // Check that the offset is legitimate and does not overflow. 402 if offset < 0 || offset+int64(size) < 0 { 403 return 0, nil, linuxerr.EINVAL 404 } 405 406 // Check that the size is legitimate. 407 si := int(size) 408 if si < 0 { 409 return 0, nil, linuxerr.EINVAL 410 } 411 412 // Get the source of the write. 413 src, err := t.SingleIOSequence(addr, si, usermem.IOOpts{ 414 AddressSpaceActive: true, 415 }) 416 if err != nil { 417 return 0, nil, err 418 } 419 420 n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) 421 t.IOUsage().AccountWriteSyscall(n) 422 return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwrite64", file) 423 } 424 425 // Pwritev implements Linux syscall pwritev(2). 426 func Pwritev(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 427 fd := args[0].Int() 428 addr := args[1].Pointer() 429 iovcnt := int(args[2].Int()) 430 offset := args[3].Int64() 431 432 file := t.GetFile(fd) 433 if file == nil { 434 return 0, nil, linuxerr.EBADF 435 } 436 defer file.DecRef(t) 437 438 // Check that the offset is legitimate. 439 if offset < 0 { 440 return 0, nil, linuxerr.EINVAL 441 } 442 443 // Get the source of the write. 444 src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ 445 AddressSpaceActive: true, 446 }) 447 if err != nil { 448 return 0, nil, err 449 } 450 451 n, err := pwrite(t, file, src, offset, vfs.WriteOptions{}) 452 t.IOUsage().AccountReadSyscall(n) 453 return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev", file) 454 } 455 456 // Pwritev2 implements Linux syscall pwritev2(2). 457 func Pwritev2(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 458 // While the glibc signature is 459 // pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags) 460 // the actual syscall 461 // (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162) 462 // splits the offset argument into a high/low value for compatibility with 463 // 32-bit architectures. The flags argument is the 6th argument (index 5). 464 fd := args[0].Int() 465 addr := args[1].Pointer() 466 iovcnt := int(args[2].Int()) 467 offset := args[3].Int64() 468 flags := args[5].Int() 469 470 file := t.GetFile(fd) 471 if file == nil { 472 return 0, nil, linuxerr.EBADF 473 } 474 defer file.DecRef(t) 475 476 // Check that the offset is legitimate. 477 if offset < -1 { 478 return 0, nil, linuxerr.EINVAL 479 } 480 481 // Get the source of the write. 482 src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{ 483 AddressSpaceActive: true, 484 }) 485 if err != nil { 486 return 0, nil, err 487 } 488 489 opts := vfs.WriteOptions{ 490 Flags: uint32(flags), 491 } 492 var n int64 493 if offset == -1 { 494 n, err = write(t, file, src, opts) 495 } else { 496 n, err = pwrite(t, file, src, offset, opts) 497 } 498 t.IOUsage().AccountWriteSyscall(n) 499 return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "pwritev2", file) 500 } 501 502 func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) { 503 n, err := file.PWrite(t, src, offset, opts) 504 if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { 505 return n, err 506 } 507 508 allowBlock, deadline, hasDeadline := blockPolicy(t, file) 509 if !allowBlock { 510 return n, err 511 } 512 513 // Register for notifications. 514 w, ch := waiter.NewChannelEntry(eventMaskWrite) 515 if err := file.EventRegister(&w); err != nil { 516 return n, err 517 } 518 519 total := n 520 for { 521 // Shorten src to reflect bytes previously written. 522 src = src.DropFirst(int(n)) 523 524 // Issue the request and break out if it completes with anything other than 525 // "would block". 526 n, err = file.PWrite(t, src, offset+total, opts) 527 total += n 528 if !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { 529 break 530 } 531 532 // Wait for a notification that we should retry. 533 if err = t.BlockWithDeadline(ch, hasDeadline, deadline); err != nil { 534 if linuxerr.Equals(linuxerr.ETIMEDOUT, err) { 535 err = linuxerr.ErrWouldBlock 536 } 537 break 538 } 539 } 540 file.EventUnregister(&w) 541 return total, err 542 } 543 544 func blockPolicy(t *kernel.Task, file *vfs.FileDescription) (allowBlock bool, deadline ktime.Time, hasDeadline bool) { 545 if file.StatusFlags()&linux.O_NONBLOCK != 0 { 546 return false, ktime.Time{}, false 547 } 548 // Sockets support read/write timeouts. 549 if s, ok := file.Impl().(socket.Socket); ok { 550 dl := s.RecvTimeout() 551 if dl < 0 { 552 return false, ktime.Time{}, false 553 } 554 if dl > 0 { 555 return true, t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond), true 556 } 557 } 558 return true, ktime.Time{}, false 559 } 560 561 // Lseek implements Linux syscall lseek(2). 562 func Lseek(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 563 fd := args[0].Int() 564 offset := args[1].Int64() 565 whence := args[2].Int() 566 567 file := t.GetFile(fd) 568 if file == nil { 569 return 0, nil, linuxerr.EBADF 570 } 571 defer file.DecRef(t) 572 573 newoff, err := file.Seek(t, offset, whence) 574 return uintptr(newoff), nil, err 575 } 576 577 // Readahead implements readahead(2). 578 func Readahead(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 579 fd := args[0].Int() 580 offset := args[1].Int64() 581 size := args[2].SizeT() 582 583 file := t.GetFile(fd) 584 if file == nil { 585 return 0, nil, linuxerr.EBADF 586 } 587 defer file.DecRef(t) 588 589 // Check that the file is readable. 590 if !file.IsReadable() { 591 return 0, nil, linuxerr.EBADF 592 } 593 594 // Check that the size is valid. 595 if int(size) < 0 { 596 return 0, nil, linuxerr.EINVAL 597 } 598 599 // Check that the offset is legitimate and does not overflow. 600 if offset < 0 || offset+int64(size) < 0 { 601 return 0, nil, linuxerr.EINVAL 602 } 603 604 // Return EINVAL; if the underlying file type does not support readahead, 605 // then Linux will return EINVAL to indicate as much. In the future, we 606 // may extend this function to actually support readahead hints. 607 return 0, nil, linuxerr.EINVAL 608 }