// github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/syscalls/linux/sys_splice.go (about)

// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package linux

import (
	"io"

	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/log"
	"github.com/metacubex/gvisor/pkg/marshal/primitive"
	"github.com/metacubex/gvisor/pkg/sentry/arch"
	"github.com/metacubex/gvisor/pkg/sentry/kernel"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/pipe"
	"github.com/metacubex/gvisor/pkg/sentry/vfs"
	"github.com/metacubex/gvisor/pkg/usermem"
	"github.com/metacubex/gvisor/pkg/waiter"
)

// Splice implements Linux syscall splice(2).
//
// The syscall arguments are, in order: the input FD, a pointer to the input
// offset (may be null), the output FD, a pointer to the output offset (may be
// null), the byte count, and the SPLICE_F_* flags.
//
// At least one of the two file descriptions must be a pipe, otherwise EINVAL
// is returned. An offset pointer may only be supplied for the non-pipe end;
// supplying one for a pipe end fails with ESPIPE. Any offset that was supplied
// is written back to user memory after the transfer, advanced by the number
// of bytes moved.
//
// The returned uintptr is the number of bytes moved.
func Splice(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	inFD := args[0].Int()
	inOffsetPtr := args[1].Pointer()
	outFD := args[2].Int()
	outOffsetPtr := args[3].Pointer()
	count := int64(args[4].SizeT())
	flags := args[5].Int()

	// A zero-length request succeeds immediately, before the flags or the
	// FDs are validated.
	if count == 0 {
		return 0, nil, nil
	}
	// Oversized requests are clamped rather than rejected.
	if count > int64(kernel.MAX_RW_COUNT) {
		count = int64(kernel.MAX_RW_COUNT)
	}
	// NOTE(review): presumably SizeT is unsigned, so a negative value here
	// means the requested size did not fit in an int64 — confirm.
	if count < 0 {
		return 0, nil, linuxerr.EINVAL
	}

	// Check for invalid flags.
	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	// Get file descriptions.
	inFile := t.GetFile(inFD)
	if inFile == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer inFile.DecRef(t)
	outFile := t.GetFile(outFD)
	if outFile == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer outFile.DecRef(t)

	// Check that both files support the required directionality.
	if !inFile.IsReadable() || !outFile.IsWritable() {
		return 0, nil, linuxerr.EBADF
	}
	if outFile.Options().DenySpliceIn {
		return 0, nil, linuxerr.EINVAL
	}

	// The operation is non-blocking if anything is non-blocking.
	//
	// N.B. This is a rather simplistic heuristic that avoids some
	// poor edge case behavior since the exact semantics here are
	// underspecified and vary between versions of Linux itself.
	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)

	// At least one file description must represent a pipe.
	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
	if !inIsPipe && !outIsPipe {
		return 0, nil, linuxerr.EINVAL
	}

	// Copy in offsets. An offset of -1 means "no explicit offset was given";
	// it is passed through to the pipe helpers below as-is.
	inOffset := int64(-1)
	if inOffsetPtr != 0 {
		if inIsPipe {
			return 0, nil, linuxerr.ESPIPE
		}
		if inFile.Options().DenyPRead {
			return 0, nil, linuxerr.EINVAL
		}
		if _, err := primitive.CopyInt64In(t, inOffsetPtr, &inOffset); err != nil {
			return 0, nil, err
		}
		if inOffset < 0 {
			return 0, nil, linuxerr.EINVAL
		}
	}
	outOffset := int64(-1)
	if outOffsetPtr != 0 {
		if outIsPipe {
			return 0, nil, linuxerr.ESPIPE
		}
		if outFile.Options().DenyPWrite {
			return 0, nil, linuxerr.EINVAL
		}
		if _, err := primitive.CopyInt64In(t, outOffsetPtr, &outOffset); err != nil {
			return 0, nil, err
		}
		if outOffset < 0 {
			return 0, nil, linuxerr.EINVAL
		}
	}

	// Move data.
	var (
		n   int64
		err error
	)
	dw := dualWaiter{
		inFile:  inFile,
		outFile: outFile,
	}
	defer dw.destroy()
	for {
		// If both input and output are pipes, delegate to the pipe
		// implementation. Otherwise, exactly one end is a pipe, which
		// we ensure is consistently ordered after the non-pipe FD's
		// locks by passing the pipe FD as usermem.IO to the non-pipe
		// end.
		switch {
		case inIsPipe && outIsPipe:
			n, err = pipe.Splice(t, outPipeFD, inPipeFD, count)
		case inIsPipe:
			n, err = inPipeFD.SpliceToNonPipe(t, outFile, outOffset, count)
			if outOffset != -1 {
				outOffset += n
			}
		case outIsPipe:
			n, err = outPipeFD.SpliceFromNonPipe(t, inFile, inOffset, count)
			if inOffset != -1 {
				inOffset += n
			}
		default:
			// Unreachable: the !inIsPipe && !outIsPipe case was rejected
			// with EINVAL above.
			panic("at least one end of splice must be a pipe")
		}

		// Stop as soon as any data moved, on any error other than
		// would-block, or when the operation is non-blocking.
		if n != 0 || !linuxerr.Equals(linuxerr.ErrWouldBlock, err) || nonBlock {
			break
		}
		if err = dw.waitForBoth(t); err != nil {
			break
		}
	}

	// Copy updated offsets out. This happens whether or not the transfer
	// above reported an error; a failed copy-out replaces the result.
	if inOffsetPtr != 0 {
		if _, err := primitive.CopyInt64Out(t, inOffsetPtr, inOffset); err != nil {
			return 0, nil, err
		}
	}
	if outOffsetPtr != 0 {
		if _, err := primitive.CopyInt64Out(t, outOffsetPtr, outOffset); err != nil {
			return 0, nil, err
		}
	}

	// We can only pass a single file to HandleIOError, so pick outFile arbitrarily.
	// This is used only for debugging purposes.
	return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "splice", outFile)
}

// Tee implements Linux syscall tee(2).
//
// The syscall arguments are, in order: the input FD, the output FD, the byte
// count, and the SPLICE_F_* flags. Both file descriptions must be pipes,
// otherwise EINVAL is returned. The transfer itself is delegated to pipe.Tee.
//
// The returned uintptr is the byte count reported by pipe.Tee.
func Tee(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	inFD := args[0].Int()
	outFD := args[1].Int()
	count := int64(args[2].SizeT())
	flags := args[3].Int()

	// A zero-length request succeeds immediately, before the flags or the
	// FDs are validated.
	if count == 0 {
		return 0, nil, nil
	}
	// Oversized requests are clamped rather than rejected.
	if count > int64(kernel.MAX_RW_COUNT) {
		count = int64(kernel.MAX_RW_COUNT)
	}
	// NOTE(review): presumably SizeT is unsigned, so a negative value here
	// means the requested size did not fit in an int64 — confirm.
	if count < 0 {
		return 0, nil, linuxerr.EINVAL
	}

	// Check for invalid flags.
	if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 {
		return 0, nil, linuxerr.EINVAL
	}

	// Get file descriptions.
	inFile := t.GetFile(inFD)
	if inFile == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer inFile.DecRef(t)
	outFile := t.GetFile(outFD)
	if outFile == nil {
		return 0, nil, linuxerr.EBADF
	}
	defer outFile.DecRef(t)

	// Check that both files support the required directionality.
	if !inFile.IsReadable() || !outFile.IsWritable() {
		return 0, nil, linuxerr.EBADF
	}
	if outFile.Options().DenySpliceIn {
		return 0, nil, linuxerr.EINVAL
	}

	// The operation is non-blocking if anything is non-blocking.
	//
	// N.B. This is a rather simplistic heuristic that avoids some
	// poor edge case behavior since the exact semantics here are
	// underspecified and vary between versions of Linux itself.
	nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0)

	// Both file descriptions must represent pipes.
	inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD)
	outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD)
	if !inIsPipe || !outIsPipe {
		return 0, nil, linuxerr.EINVAL
	}

	// Copy data.
	var (
		n   int64
		err error
	)
	dw := dualWaiter{
		inFile:  inFile,
		outFile: outFile,
	}
	defer dw.destroy()
	for {
		n, err = pipe.Tee(t, outPipeFD, inPipeFD, count)
		// Stop as soon as any data was copied, on any error other than
		// would-block, or when the operation is non-blocking.
		if n != 0 || !linuxerr.Equals(linuxerr.ErrWouldBlock, err) || nonBlock {
			break
		}
		if err = dw.waitForBoth(t); err != nil {
			break
		}
	}

	if n != 0 {
		// If a partial write is completed, the error is dropped. Log it here.
		if err != nil && err != io.EOF && !linuxerr.Equals(linuxerr.ErrWouldBlock, err) {
			log.Debugf("tee completed a partial write with error: %v", err)
			err = nil
		}
	}

	// We can only pass a single file to HandleIOError, so pick inFile arbitrarily.
	// This is used only for debugging purposes.
	return uintptr(n), nil, HandleIOError(t, n != 0, err, linuxerr.ERESTARTSYS, "tee", inFile)
}

// Sendfile implements linux system call sendfile(2).
270 func Sendfile(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 271 outFD := args[0].Int() 272 inFD := args[1].Int() 273 offsetAddr := args[2].Pointer() 274 count := int64(args[3].SizeT()) 275 276 inFile := t.GetFile(inFD) 277 if inFile == nil { 278 return 0, nil, linuxerr.EBADF 279 } 280 defer inFile.DecRef(t) 281 if !inFile.IsReadable() { 282 return 0, nil, linuxerr.EBADF 283 } 284 285 outFile := t.GetFile(outFD) 286 if outFile == nil { 287 return 0, nil, linuxerr.EBADF 288 } 289 defer outFile.DecRef(t) 290 if !outFile.IsWritable() { 291 return 0, nil, linuxerr.EBADF 292 } 293 if outFile.Options().DenySpliceIn { 294 return 0, nil, linuxerr.EINVAL 295 } 296 297 // Verify that the outFile Append flag is not set. 298 if outFile.StatusFlags()&linux.O_APPEND != 0 { 299 return 0, nil, linuxerr.EINVAL 300 } 301 302 // Verify that inFile is a regular file or block device. This is a 303 // requirement; the same check appears in Linux 304 // (fs/splice.c:splice_direct_to_actor). 305 if stat, err := inFile.Stat(t, vfs.StatOptions{Mask: linux.STATX_TYPE}); err != nil { 306 return 0, nil, err 307 } else if stat.Mask&linux.STATX_TYPE == 0 || 308 (stat.Mode&linux.S_IFMT != linux.S_IFREG && stat.Mode&linux.S_IFMT != linux.S_IFBLK) { 309 return 0, nil, linuxerr.EINVAL 310 } 311 312 // Copy offset if it exists. 313 offset := int64(-1) 314 if offsetAddr != 0 { 315 if inFile.Options().DenyPRead { 316 return 0, nil, linuxerr.ESPIPE 317 } 318 var offsetP primitive.Int64 319 if _, err := offsetP.CopyIn(t, offsetAddr); err != nil { 320 return 0, nil, err 321 } 322 offset = int64(offsetP) 323 324 if offset < 0 { 325 return 0, nil, linuxerr.EINVAL 326 } 327 if offset+count < 0 { 328 return 0, nil, linuxerr.EINVAL 329 } 330 } 331 332 // Validate count. This must come after offset checks. 
333 if count < 0 { 334 return 0, nil, linuxerr.EINVAL 335 } 336 if count == 0 { 337 return 0, nil, nil 338 } 339 if count > int64(kernel.MAX_RW_COUNT) { 340 count = int64(kernel.MAX_RW_COUNT) 341 } 342 343 // Copy data. 344 var ( 345 total int64 346 err error 347 ) 348 dw := dualWaiter{ 349 inFile: inFile, 350 outFile: outFile, 351 } 352 defer dw.destroy() 353 outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) 354 // Reading from input file should never block, since it is regular or 355 // block device. We only need to check if writing to the output file 356 // can block. 357 nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0 358 if outIsPipe { 359 for { 360 var n int64 361 n, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count-total) 362 if offset != -1 { 363 offset += n 364 } 365 total += n 366 if total == count { 367 break 368 } 369 if err == nil && t.Interrupted() { 370 err = linuxerr.ErrInterrupted 371 break 372 } 373 if linuxerr.Equals(linuxerr.ErrWouldBlock, err) && !nonBlock { 374 err = dw.waitForBoth(t) 375 } 376 if err != nil { 377 break 378 } 379 } 380 } else { 381 // Read inFile to buffer, then write the contents to outFile. 382 // 383 // The buffer size has to be limited to avoid large memory 384 // allocations and long delays. In Linux, the buffer size is 385 // limited by a size of an internl pipe. Here, we repeat this 386 // behavior. 387 bufSize := count 388 if bufSize > pipe.MaximumPipeSize { 389 bufSize = pipe.MaximumPipeSize 390 } 391 buf := make([]byte, bufSize) 392 for { 393 if int64(len(buf)) > count-total { 394 buf = buf[:count-total] 395 } 396 var readN int64 397 if offset != -1 { 398 readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf), offset, vfs.ReadOptions{}) 399 offset += readN 400 } else { 401 readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) 402 } 403 404 // Write all of the bytes that we read. This may need 405 // multiple write calls to complete. 
406 wbuf := buf[:readN] 407 for len(wbuf) > 0 { 408 var writeN int64 409 writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{}) 410 wbuf = wbuf[writeN:] 411 if linuxerr.Equals(linuxerr.ErrWouldBlock, err) && !nonBlock { 412 err = dw.waitForOut(t) 413 } 414 if err != nil { 415 // We didn't complete the write. Only report the bytes that were actually 416 // written, and rewind offsets as needed. 417 notWritten := int64(len(wbuf)) 418 readN -= notWritten 419 if offset == -1 { 420 // We modified the offset of the input file itself during the read 421 // operation. Rewind it. 422 if _, seekErr := inFile.Seek(t, -notWritten, linux.SEEK_CUR); seekErr != nil { 423 // Log the error but don't return it, since the write has already 424 // completed successfully. 425 log.Warningf("failed to roll back input file offset: %v", seekErr) 426 } 427 } else { 428 // The sendfile call was provided an offset parameter that should be 429 // adjusted to reflect the number of bytes sent. Rewind it. 430 offset -= notWritten 431 } 432 break 433 } 434 } 435 436 total += readN 437 if total == count { 438 break 439 } 440 if err == nil && t.Interrupted() { 441 err = linuxerr.ErrInterrupted 442 break 443 } 444 if linuxerr.Equals(linuxerr.ErrWouldBlock, err) && !nonBlock { 445 err = dw.waitForBoth(t) 446 } 447 if err != nil { 448 break 449 } 450 } 451 } 452 453 if offsetAddr != 0 { 454 // Copy out the new offset. 455 offsetP := primitive.Uint64(offset) 456 if _, err := offsetP.CopyOut(t, offsetAddr); err != nil { 457 return 0, nil, err 458 } 459 } 460 461 if total != 0 { 462 if err != nil && err != io.EOF && !linuxerr.Equals(linuxerr.ErrWouldBlock, err) { 463 // If a partial write is completed, the error is dropped. Log it here. 464 log.Debugf("sendfile completed a partial write with error: %v", err) 465 err = nil 466 } 467 } 468 469 // We can only pass a single file to handleIOError, so pick inFile arbitrarily. 470 // This is used only for debugging purposes. 
471 return uintptr(total), nil, HandleIOError(t, total != 0, err, linuxerr.ERESTARTSYS, "sendfile", inFile) 472 } 473 474 // dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not 475 // thread-safe, and does not take a reference on the vfs.FileDescriptions. 476 // 477 // Users must call destroy() when finished. 478 type dualWaiter struct { 479 inFile *vfs.FileDescription 480 outFile *vfs.FileDescription 481 482 inW waiter.Entry 483 inCh chan struct{} 484 outW waiter.Entry 485 outCh chan struct{} 486 } 487 488 // waitForBoth waits for both dw.inFile and dw.outFile to be ready. 489 func (dw *dualWaiter) waitForBoth(t *kernel.Task) error { 490 if dw.inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { 491 if dw.inCh == nil { 492 dw.inW, dw.inCh = waiter.NewChannelEntry(eventMaskRead) 493 if err := dw.inFile.EventRegister(&dw.inW); err != nil { 494 return err 495 } 496 // We might be ready now. Try again before blocking. 497 return nil 498 } 499 if err := t.Block(dw.inCh); err != nil { 500 return err 501 } 502 } 503 return dw.waitForOut(t) 504 } 505 506 // waitForOut waits for dw.outfile to be read. 507 func (dw *dualWaiter) waitForOut(t *kernel.Task) error { 508 // Don't bother checking readiness of the outFile, because it's not a 509 // guarantee that it won't return EWOULDBLOCK. Both pipes and eventfds 510 // can be "ready" but will reject writes of certain sizes with 511 // EWOULDBLOCK. See b/172075629, b/170743336. 512 if dw.outCh == nil { 513 dw.outW, dw.outCh = waiter.NewChannelEntry(eventMaskWrite) 514 if err := dw.outFile.EventRegister(&dw.outW); err != nil { 515 return err 516 } 517 // We might be ready to write now. Try again before blocking. 518 return nil 519 } 520 return t.Block(dw.outCh) 521 } 522 523 // destroy cleans up resources help by dw. No more calls to wait* can occur 524 // after destroy is called. 
525 func (dw *dualWaiter) destroy() { 526 if dw.inCh != nil { 527 dw.inFile.EventUnregister(&dw.inW) 528 dw.inCh = nil 529 } 530 if dw.outCh != nil { 531 dw.outFile.EventUnregister(&dw.outW) 532 dw.outCh = nil 533 } 534 dw.inFile = nil 535 dw.outFile = nil 536 }