github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/syscalls/linux/vfs2/splice.go (about) 1 // Copyright 2020 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package vfs2 16 17 import ( 18 "io" 19 20 "github.com/SagerNet/gvisor/pkg/abi/linux" 21 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 22 "github.com/SagerNet/gvisor/pkg/log" 23 "github.com/SagerNet/gvisor/pkg/marshal/primitive" 24 "github.com/SagerNet/gvisor/pkg/sentry/arch" 25 "github.com/SagerNet/gvisor/pkg/sentry/kernel" 26 "github.com/SagerNet/gvisor/pkg/sentry/kernel/pipe" 27 slinux "github.com/SagerNet/gvisor/pkg/sentry/syscalls/linux" 28 "github.com/SagerNet/gvisor/pkg/sentry/vfs" 29 "github.com/SagerNet/gvisor/pkg/syserror" 30 "github.com/SagerNet/gvisor/pkg/usermem" 31 "github.com/SagerNet/gvisor/pkg/waiter" 32 ) 33 34 // Splice implements Linux syscall splice(2). 35 func Splice(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 36 inFD := args[0].Int() 37 inOffsetPtr := args[1].Pointer() 38 outFD := args[2].Int() 39 outOffsetPtr := args[3].Pointer() 40 count := int64(args[4].SizeT()) 41 flags := args[5].Int() 42 43 if count == 0 { 44 return 0, nil, nil 45 } 46 if count > int64(kernel.MAX_RW_COUNT) { 47 count = int64(kernel.MAX_RW_COUNT) 48 } 49 if count < 0 { 50 return 0, nil, linuxerr.EINVAL 51 } 52 53 // Check for invalid flags. 54 if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { 55 return 0, nil, linuxerr.EINVAL 56 } 57 58 // Get file descriptions. 59 inFile := t.GetFileVFS2(inFD) 60 if inFile == nil { 61 return 0, nil, linuxerr.EBADF 62 } 63 defer inFile.DecRef(t) 64 outFile := t.GetFileVFS2(outFD) 65 if outFile == nil { 66 return 0, nil, linuxerr.EBADF 67 } 68 defer outFile.DecRef(t) 69 70 // Check that both files support the required directionality. 71 if !inFile.IsReadable() || !outFile.IsWritable() { 72 return 0, nil, linuxerr.EBADF 73 } 74 75 // The operation is non-blocking if anything is non-blocking. 76 // 77 // N.B. This is a rather simplistic heuristic that avoids some 78 // poor edge case behavior since the exact semantics here are 79 // underspecified and vary between versions of Linux itself. 80 nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) 81 82 // At least one file description must represent a pipe. 83 inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) 84 outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) 85 if !inIsPipe && !outIsPipe { 86 return 0, nil, linuxerr.EINVAL 87 } 88 89 // Copy in offsets. 90 inOffset := int64(-1) 91 if inOffsetPtr != 0 { 92 if inIsPipe { 93 return 0, nil, linuxerr.ESPIPE 94 } 95 if inFile.Options().DenyPRead { 96 return 0, nil, linuxerr.EINVAL 97 } 98 if _, err := primitive.CopyInt64In(t, inOffsetPtr, &inOffset); err != nil { 99 return 0, nil, err 100 } 101 if inOffset < 0 { 102 return 0, nil, linuxerr.EINVAL 103 } 104 } 105 outOffset := int64(-1) 106 if outOffsetPtr != 0 { 107 if outIsPipe { 108 return 0, nil, linuxerr.ESPIPE 109 } 110 if outFile.Options().DenyPWrite { 111 return 0, nil, linuxerr.EINVAL 112 } 113 if _, err := primitive.CopyInt64In(t, outOffsetPtr, &outOffset); err != nil { 114 return 0, nil, err 115 } 116 if outOffset < 0 { 117 return 0, nil, linuxerr.EINVAL 118 } 119 } 120 121 // Move data. 122 var ( 123 n int64 124 err error 125 ) 126 dw := dualWaiter{ 127 inFile: inFile, 128 outFile: outFile, 129 } 130 defer dw.destroy() 131 for { 132 // If both input and output are pipes, delegate to the pipe 133 // implementation. Otherwise, exactly one end is a pipe, which 134 // we ensure is consistently ordered after the non-pipe FD's 135 // locks by passing the pipe FD as usermem.IO to the non-pipe 136 // end. 137 switch { 138 case inIsPipe && outIsPipe: 139 n, err = pipe.Splice(t, outPipeFD, inPipeFD, count) 140 case inIsPipe: 141 n, err = inPipeFD.SpliceToNonPipe(t, outFile, outOffset, count) 142 if outOffset != -1 { 143 outOffset += n 144 } 145 case outIsPipe: 146 n, err = outPipeFD.SpliceFromNonPipe(t, inFile, inOffset, count) 147 if inOffset != -1 { 148 inOffset += n 149 } 150 default: 151 panic("at least one end of splice must be a pipe") 152 } 153 154 if n != 0 || err != syserror.ErrWouldBlock || nonBlock { 155 break 156 } 157 if err = dw.waitForBoth(t); err != nil { 158 break 159 } 160 } 161 162 // Copy updated offsets out. 163 if inOffsetPtr != 0 { 164 if _, err := primitive.CopyInt64Out(t, inOffsetPtr, inOffset); err != nil { 165 return 0, nil, err 166 } 167 } 168 if outOffsetPtr != 0 { 169 if _, err := primitive.CopyInt64Out(t, outOffsetPtr, outOffset); err != nil { 170 return 0, nil, err 171 } 172 } 173 174 // We can only pass a single file to handleIOError, so pick inFile arbitrarily. 175 // This is used only for debugging purposes. 176 return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "splice", outFile) 177 } 178 179 // Tee implements Linux syscall tee(2). 180 func Tee(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 181 inFD := args[0].Int() 182 outFD := args[1].Int() 183 count := int64(args[2].SizeT()) 184 flags := args[3].Int() 185 186 if count == 0 { 187 return 0, nil, nil 188 } 189 if count > int64(kernel.MAX_RW_COUNT) { 190 count = int64(kernel.MAX_RW_COUNT) 191 } 192 if count < 0 { 193 return 0, nil, linuxerr.EINVAL 194 } 195 196 // Check for invalid flags. 197 if flags&^(linux.SPLICE_F_MOVE|linux.SPLICE_F_NONBLOCK|linux.SPLICE_F_MORE|linux.SPLICE_F_GIFT) != 0 { 198 return 0, nil, linuxerr.EINVAL 199 } 200 201 // Get file descriptions. 202 inFile := t.GetFileVFS2(inFD) 203 if inFile == nil { 204 return 0, nil, linuxerr.EBADF 205 } 206 defer inFile.DecRef(t) 207 outFile := t.GetFileVFS2(outFD) 208 if outFile == nil { 209 return 0, nil, linuxerr.EBADF 210 } 211 defer outFile.DecRef(t) 212 213 // Check that both files support the required directionality. 214 if !inFile.IsReadable() || !outFile.IsWritable() { 215 return 0, nil, linuxerr.EBADF 216 } 217 218 // The operation is non-blocking if anything is non-blocking. 219 // 220 // N.B. This is a rather simplistic heuristic that avoids some 221 // poor edge case behavior since the exact semantics here are 222 // underspecified and vary between versions of Linux itself. 223 nonBlock := ((inFile.StatusFlags()|outFile.StatusFlags())&linux.O_NONBLOCK != 0) || (flags&linux.SPLICE_F_NONBLOCK != 0) 224 225 // Both file descriptions must represent pipes. 226 inPipeFD, inIsPipe := inFile.Impl().(*pipe.VFSPipeFD) 227 outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) 228 if !inIsPipe || !outIsPipe { 229 return 0, nil, linuxerr.EINVAL 230 } 231 232 // Copy data. 233 var ( 234 n int64 235 err error 236 ) 237 dw := dualWaiter{ 238 inFile: inFile, 239 outFile: outFile, 240 } 241 defer dw.destroy() 242 for { 243 n, err = pipe.Tee(t, outPipeFD, inPipeFD, count) 244 if n != 0 || err != syserror.ErrWouldBlock || nonBlock { 245 break 246 } 247 if err = dw.waitForBoth(t); err != nil { 248 break 249 } 250 } 251 252 if n != 0 { 253 // If a partial write is completed, the error is dropped. Log it here. 254 if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { 255 log.Debugf("tee completed a partial write with error: %v", err) 256 err = nil 257 } 258 } 259 260 // We can only pass a single file to handleIOError, so pick inFile arbitrarily. 261 // This is used only for debugging purposes. 262 return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, syserror.ERESTARTSYS, "tee", inFile) 263 } 264 265 // Sendfile implements linux system call sendfile(2). 266 func Sendfile(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 267 outFD := args[0].Int() 268 inFD := args[1].Int() 269 offsetAddr := args[2].Pointer() 270 count := int64(args[3].SizeT()) 271 272 inFile := t.GetFileVFS2(inFD) 273 if inFile == nil { 274 return 0, nil, linuxerr.EBADF 275 } 276 defer inFile.DecRef(t) 277 if !inFile.IsReadable() { 278 return 0, nil, linuxerr.EBADF 279 } 280 281 outFile := t.GetFileVFS2(outFD) 282 if outFile == nil { 283 return 0, nil, linuxerr.EBADF 284 } 285 defer outFile.DecRef(t) 286 if !outFile.IsWritable() { 287 return 0, nil, linuxerr.EBADF 288 } 289 290 // Verify that the outFile Append flag is not set. 291 if outFile.StatusFlags()&linux.O_APPEND != 0 { 292 return 0, nil, linuxerr.EINVAL 293 } 294 295 // Verify that inFile is a regular file or block device. This is a 296 // requirement; the same check appears in Linux 297 // (fs/splice.c:splice_direct_to_actor). 298 if stat, err := inFile.Stat(t, vfs.StatOptions{Mask: linux.STATX_TYPE}); err != nil { 299 return 0, nil, err 300 } else if stat.Mask&linux.STATX_TYPE == 0 || 301 (stat.Mode&linux.S_IFMT != linux.S_IFREG && stat.Mode&linux.S_IFMT != linux.S_IFBLK) { 302 return 0, nil, linuxerr.EINVAL 303 } 304 305 // Copy offset if it exists. 306 offset := int64(-1) 307 if offsetAddr != 0 { 308 if inFile.Options().DenyPRead { 309 return 0, nil, linuxerr.ESPIPE 310 } 311 var offsetP primitive.Int64 312 if _, err := offsetP.CopyIn(t, offsetAddr); err != nil { 313 return 0, nil, err 314 } 315 offset = int64(offsetP) 316 317 if offset < 0 { 318 return 0, nil, linuxerr.EINVAL 319 } 320 if offset+count < 0 { 321 return 0, nil, linuxerr.EINVAL 322 } 323 } 324 325 // Validate count. This must come after offset checks. 326 if count < 0 { 327 return 0, nil, linuxerr.EINVAL 328 } 329 if count == 0 { 330 return 0, nil, nil 331 } 332 if count > int64(kernel.MAX_RW_COUNT) { 333 count = int64(kernel.MAX_RW_COUNT) 334 } 335 336 // Copy data. 337 var ( 338 total int64 339 err error 340 ) 341 dw := dualWaiter{ 342 inFile: inFile, 343 outFile: outFile, 344 } 345 defer dw.destroy() 346 outPipeFD, outIsPipe := outFile.Impl().(*pipe.VFSPipeFD) 347 // Reading from input file should never block, since it is regular or 348 // block device. We only need to check if writing to the output file 349 // can block. 350 nonBlock := outFile.StatusFlags()&linux.O_NONBLOCK != 0 351 if outIsPipe { 352 for { 353 var n int64 354 n, err = outPipeFD.SpliceFromNonPipe(t, inFile, offset, count-total) 355 if offset != -1 { 356 offset += n 357 } 358 total += n 359 if total == count { 360 break 361 } 362 if err == nil && t.Interrupted() { 363 err = syserror.ErrInterrupted 364 break 365 } 366 if err == syserror.ErrWouldBlock && !nonBlock { 367 err = dw.waitForBoth(t) 368 } 369 if err != nil { 370 break 371 } 372 } 373 } else { 374 // Read inFile to buffer, then write the contents to outFile. 375 buf := make([]byte, count) 376 for { 377 var readN int64 378 if offset != -1 { 379 readN, err = inFile.PRead(t, usermem.BytesIOSequence(buf), offset, vfs.ReadOptions{}) 380 offset += readN 381 } else { 382 readN, err = inFile.Read(t, usermem.BytesIOSequence(buf), vfs.ReadOptions{}) 383 } 384 385 // Write all of the bytes that we read. This may need 386 // multiple write calls to complete. 387 wbuf := buf[:readN] 388 for len(wbuf) > 0 { 389 var writeN int64 390 writeN, err = outFile.Write(t, usermem.BytesIOSequence(wbuf), vfs.WriteOptions{}) 391 wbuf = wbuf[writeN:] 392 if err == syserror.ErrWouldBlock && !nonBlock { 393 err = dw.waitForOut(t) 394 } 395 if err != nil { 396 // We didn't complete the write. Only report the bytes that were actually 397 // written, and rewind offsets as needed. 398 notWritten := int64(len(wbuf)) 399 readN -= notWritten 400 if offset == -1 { 401 // We modified the offset of the input file itself during the read 402 // operation. Rewind it. 403 if _, seekErr := inFile.Seek(t, -notWritten, linux.SEEK_CUR); seekErr != nil { 404 // Log the error but don't return it, since the write has already 405 // completed successfully. 406 log.Warningf("failed to roll back input file offset: %v", seekErr) 407 } 408 } else { 409 // The sendfile call was provided an offset parameter that should be 410 // adjusted to reflect the number of bytes sent. Rewind it. 411 offset -= notWritten 412 } 413 break 414 } 415 } 416 417 total += readN 418 buf = buf[readN:] 419 if total == count { 420 break 421 } 422 if err == nil && t.Interrupted() { 423 err = syserror.ErrInterrupted 424 break 425 } 426 if err == syserror.ErrWouldBlock && !nonBlock { 427 err = dw.waitForBoth(t) 428 } 429 if err != nil { 430 break 431 } 432 } 433 } 434 435 if offsetAddr != 0 { 436 // Copy out the new offset. 437 offsetP := primitive.Uint64(offset) 438 if _, err := offsetP.CopyOut(t, offsetAddr); err != nil { 439 return 0, nil, err 440 } 441 } 442 443 if total != 0 { 444 if err != nil && err != io.EOF && err != syserror.ErrWouldBlock { 445 // If a partial write is completed, the error is dropped. Log it here. 446 log.Debugf("sendfile completed a partial write with error: %v", err) 447 err = nil 448 } 449 } 450 451 // We can only pass a single file to handleIOError, so pick inFile arbitrarily. 452 // This is used only for debugging purposes. 453 return uintptr(total), nil, slinux.HandleIOErrorVFS2(t, total != 0, err, syserror.ERESTARTSYS, "sendfile", inFile) 454 } 455 456 // dualWaiter is used to wait on one or both vfs.FileDescriptions. It is not 457 // thread-safe, and does not take a reference on the vfs.FileDescriptions. 458 // 459 // Users must call destroy() when finished. 460 type dualWaiter struct { 461 inFile *vfs.FileDescription 462 outFile *vfs.FileDescription 463 464 inW waiter.Entry 465 inCh chan struct{} 466 outW waiter.Entry 467 outCh chan struct{} 468 } 469 470 // waitForBoth waits for both dw.inFile and dw.outFile to be ready. 471 func (dw *dualWaiter) waitForBoth(t *kernel.Task) error { 472 if dw.inFile.Readiness(eventMaskRead)&eventMaskRead == 0 { 473 if dw.inCh == nil { 474 dw.inW, dw.inCh = waiter.NewChannelEntry(nil) 475 dw.inFile.EventRegister(&dw.inW, eventMaskRead) 476 // We might be ready now. Try again before blocking. 477 return nil 478 } 479 if err := t.Block(dw.inCh); err != nil { 480 return err 481 } 482 } 483 return dw.waitForOut(t) 484 } 485 486 // waitForOut waits for dw.outfile to be read. 487 func (dw *dualWaiter) waitForOut(t *kernel.Task) error { 488 // Don't bother checking readiness of the outFile, because it's not a 489 // guarantee that it won't return EWOULDBLOCK. Both pipes and eventfds 490 // can be "ready" but will reject writes of certain sizes with 491 // EWOULDBLOCK. See b/172075629, b/170743336. 492 if dw.outCh == nil { 493 dw.outW, dw.outCh = waiter.NewChannelEntry(nil) 494 dw.outFile.EventRegister(&dw.outW, eventMaskWrite) 495 // We might be ready to write now. Try again before blocking. 496 return nil 497 } 498 return t.Block(dw.outCh) 499 } 500 501 // destroy cleans up resources help by dw. No more calls to wait* can occur 502 // after destroy is called. 503 func (dw *dualWaiter) destroy() { 504 if dw.inCh != nil { 505 dw.inFile.EventUnregister(&dw.inW) 506 dw.inCh = nil 507 } 508 if dw.outCh != nil { 509 dw.outFile.EventUnregister(&dw.outW) 510 dw.outCh = nil 511 } 512 dw.inFile = nil 513 dw.outFile = nil 514 }