github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/syscalls/linux/sys_socket.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package linux 16 17 import ( 18 "fmt" 19 "time" 20 21 "golang.org/x/sys/unix" 22 "github.com/metacubex/gvisor/pkg/abi/linux" 23 "github.com/metacubex/gvisor/pkg/context" 24 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 25 "github.com/metacubex/gvisor/pkg/hostarch" 26 "github.com/metacubex/gvisor/pkg/marshal" 27 "github.com/metacubex/gvisor/pkg/marshal/primitive" 28 "github.com/metacubex/gvisor/pkg/sentry/arch" 29 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/host" 30 "github.com/metacubex/gvisor/pkg/sentry/kernel" 31 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 32 ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time" 33 "github.com/metacubex/gvisor/pkg/sentry/socket" 34 "github.com/metacubex/gvisor/pkg/sentry/socket/control" 35 "github.com/metacubex/gvisor/pkg/sentry/socket/unix/transport" 36 "github.com/metacubex/gvisor/pkg/sentry/vfs" 37 "github.com/metacubex/gvisor/pkg/syserr" 38 "github.com/metacubex/gvisor/pkg/usermem" 39 ) 40 41 // maxAddrLen is the maximum socket address length we're willing to accept. 42 const maxAddrLen = 200 43 44 // maxOptLen is the maximum sockopt parameter length we're willing to accept. 45 const maxOptLen = 1024 * 8 46 47 // maxControlLen is the maximum length of the msghdr.msg_control buffer we're 48 // willing to accept. Note that this limit is smaller than Linux, which allows 49 // buffers upto INT_MAX. 50 const maxControlLen = 10 * 1024 * 1024 51 52 // maxListenBacklog is the maximum limit of listen backlog supported. 53 const maxListenBacklog = 1024 54 55 // nameLenOffset is the offset from the start of the MessageHeader64 struct to 56 // the NameLen field. 57 const nameLenOffset = 8 58 59 // controlLenOffset is the offset form the start of the MessageHeader64 struct 60 // to the ControlLen field. 61 const controlLenOffset = 40 62 63 // flagsOffset is the offset form the start of the MessageHeader64 struct 64 // to the Flags field. 65 const flagsOffset = 48 66 67 const sizeOfInt32 = 4 68 69 // messageHeader64Len is the length of a MessageHeader64 struct. 70 var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes()) 71 72 // multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct. 73 var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes()) 74 75 // baseRecvFlags are the flags that are accepted across recvmsg(2), 76 // recvmmsg(2), and recvfrom(2). 77 const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC 78 79 // MessageHeader64 is the 64-bit representation of the msghdr struct used in 80 // the recvmsg and sendmsg syscalls. 81 // 82 // +marshal 83 type MessageHeader64 struct { 84 // Name is the optional pointer to a network address buffer. 85 Name uint64 86 87 // NameLen is the length of the buffer pointed to by Name. 88 NameLen uint32 89 _ uint32 90 91 // Iov is a pointer to an array of io vectors that describe the memory 92 // locations involved in the io operation. 93 Iov uint64 94 95 // IovLen is the length of the array pointed to by Iov. 96 IovLen uint64 97 98 // Control is the optional pointer to ancillary control data. 99 Control uint64 100 101 // ControlLen is the length of the data pointed to by Control. 102 ControlLen uint64 103 104 // Flags on the sent/received message. 105 Flags int32 106 _ int32 107 } 108 109 // multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in 110 // the recvmmsg and sendmmsg syscalls. 111 // 112 // +marshal 113 type multipleMessageHeader64 struct { 114 msgHdr MessageHeader64 115 msgLen uint32 116 _ int32 117 } 118 119 // CaptureAddress allocates memory for and copies a socket address structure 120 // from the untrusted address space range. 121 func CaptureAddress(t *kernel.Task, addr hostarch.Addr, addrlen uint32) ([]byte, error) { 122 if addrlen > maxAddrLen { 123 return nil, linuxerr.EINVAL 124 } 125 126 addrBuf := make([]byte, addrlen) 127 if _, err := t.CopyInBytes(addr, addrBuf); err != nil { 128 return nil, err 129 } 130 131 return addrBuf, nil 132 } 133 134 // writeAddress writes a sockaddr structure and its length to an output buffer 135 // in the unstrusted address space range. If the address is bigger than the 136 // buffer, it is truncated. 137 func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr hostarch.Addr, addrLenPtr hostarch.Addr) error { 138 // Get the buffer length. 139 var bufLen uint32 140 if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil { 141 return err 142 } 143 144 if int32(bufLen) < 0 { 145 return linuxerr.EINVAL 146 } 147 148 // Write the length unconditionally. 149 if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil { 150 return err 151 } 152 153 if addr == nil { 154 return nil 155 } 156 157 if bufLen > addrLen { 158 bufLen = addrLen 159 } 160 161 // Copy as much of the address as will fit in the buffer. 162 encodedAddr := t.CopyScratchBuffer(addr.SizeBytes()) 163 addr.MarshalUnsafe(encodedAddr) 164 if bufLen > uint32(len(encodedAddr)) { 165 bufLen = uint32(len(encodedAddr)) 166 } 167 _, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)]) 168 return err 169 } 170 171 // Socket implements the linux syscall socket(2). 172 func Socket(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 173 domain := int(args[0].Int()) 174 stype := args[1].Int() 175 protocol := int(args[2].Int()) 176 177 // Check and initialize the flags. 178 if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { 179 return 0, nil, linuxerr.EINVAL 180 } 181 182 // Create the new socket. 183 s, e := socket.New(t, domain, linux.SockType(stype&0xf), protocol) 184 if e != nil { 185 return 0, nil, e.ToError() 186 } 187 defer s.DecRef(t) 188 189 if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil { 190 return 0, nil, err 191 } 192 193 fd, err := t.NewFDFrom(0, s, kernel.FDFlags{ 194 CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, 195 }) 196 if err != nil { 197 return 0, nil, err 198 } 199 200 return uintptr(fd), nil, nil 201 } 202 203 // SocketPair implements the linux syscall socketpair(2). 204 func SocketPair(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 205 domain := int(args[0].Int()) 206 stype := args[1].Int() 207 protocol := int(args[2].Int()) 208 addr := args[3].Pointer() 209 210 // Check and initialize the flags. 211 if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { 212 return 0, nil, linuxerr.EINVAL 213 } 214 215 // Create the socket pair. 216 s1, s2, e := socket.Pair(t, domain, linux.SockType(stype&0xf), protocol) 217 if e != nil { 218 return 0, nil, e.ToError() 219 } 220 // Adding to the FD table will cause an extra reference to be acquired. 221 defer s1.DecRef(t) 222 defer s2.DecRef(t) 223 224 nonblocking := uint32(stype & linux.SOCK_NONBLOCK) 225 if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { 226 return 0, nil, err 227 } 228 if err := s2.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil { 229 return 0, nil, err 230 } 231 232 // Create the FDs for the sockets. 233 flags := kernel.FDFlags{ 234 CloseOnExec: stype&linux.SOCK_CLOEXEC != 0, 235 } 236 fds, err := t.NewFDs(0, []*vfs.FileDescription{s1, s2}, flags) 237 if err != nil { 238 return 0, nil, err 239 } 240 241 if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil { 242 for _, fd := range fds { 243 if file := t.FDTable().Remove(t, fd); file != nil { 244 file.DecRef(t) 245 } 246 } 247 return 0, nil, err 248 } 249 250 return 0, nil, nil 251 } 252 253 // Connect implements the linux syscall connect(2). 254 func Connect(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 255 fd := args[0].Int() 256 addr := args[1].Pointer() 257 addrlen := args[2].Uint() 258 259 // Get socket from the file descriptor. 260 file := t.GetFile(fd) 261 if file == nil { 262 return 0, nil, linuxerr.EBADF 263 } 264 defer file.DecRef(t) 265 266 // Extract the socket. 267 s, ok := file.Impl().(socket.Socket) 268 if !ok { 269 return 0, nil, linuxerr.ENOTSOCK 270 } 271 272 // Capture address and call syscall implementation. 273 a, err := CaptureAddress(t, addr, addrlen) 274 if err != nil { 275 return 0, nil, err 276 } 277 278 blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0 279 return 0, nil, linuxerr.ConvertIntr(s.Connect(t, a, blocking).ToError(), linuxerr.ERESTARTSYS) 280 } 281 282 // accept is the implementation of the accept syscall. It is called by accept 283 // and accept4 syscall handlers. 284 func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, flags int) (uintptr, error) { 285 // Check that no unsupported flags are passed in. 286 if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 { 287 return 0, linuxerr.EINVAL 288 } 289 290 // Get socket from the file descriptor. 291 file := t.GetFile(fd) 292 if file == nil { 293 return 0, linuxerr.EBADF 294 } 295 defer file.DecRef(t) 296 297 // Extract the socket. 298 s, ok := file.Impl().(socket.Socket) 299 if !ok { 300 return 0, linuxerr.ENOTSOCK 301 } 302 303 // Call the syscall implementation for this socket, then copy the 304 // output address if one is specified. 305 blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0 306 307 peerRequested := addrLen != 0 308 nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking) 309 if e != nil { 310 return 0, linuxerr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) 311 } 312 if peerRequested { 313 // NOTE(magi): Linux does not give you an error if it can't 314 // write the data back out so neither do we. 315 if err := writeAddress(t, peer, peerLen, addr, addrLen); linuxerr.Equals(linuxerr.EINVAL, err) { 316 return 0, err 317 } 318 } 319 return uintptr(nfd), nil 320 } 321 322 // Accept4 implements the linux syscall accept4(2). 323 func Accept4(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 324 fd := args[0].Int() 325 addr := args[1].Pointer() 326 addrlen := args[2].Pointer() 327 flags := int(args[3].Int()) 328 329 n, err := accept(t, fd, addr, addrlen, flags) 330 return n, nil, err 331 } 332 333 // Accept implements the linux syscall accept(2). 334 func Accept(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 335 fd := args[0].Int() 336 addr := args[1].Pointer() 337 addrlen := args[2].Pointer() 338 339 n, err := accept(t, fd, addr, addrlen, 0) 340 return n, nil, err 341 } 342 343 // Bind implements the linux syscall bind(2). 344 func Bind(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 345 fd := args[0].Int() 346 addr := args[1].Pointer() 347 addrlen := args[2].Uint() 348 349 // Get socket from the file descriptor. 350 file := t.GetFile(fd) 351 if file == nil { 352 return 0, nil, linuxerr.EBADF 353 } 354 defer file.DecRef(t) 355 356 // Extract the socket. 357 s, ok := file.Impl().(socket.Socket) 358 if !ok { 359 return 0, nil, linuxerr.ENOTSOCK 360 } 361 362 // Capture address and call syscall implementation. 363 a, err := CaptureAddress(t, addr, addrlen) 364 if err != nil { 365 return 0, nil, err 366 } 367 368 return 0, nil, s.Bind(t, a).ToError() 369 } 370 371 // Listen implements the linux syscall listen(2). 372 func Listen(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 373 fd := args[0].Int() 374 backlog := args[1].Uint() 375 376 // Get socket from the file descriptor. 377 file := t.GetFile(fd) 378 if file == nil { 379 return 0, nil, linuxerr.EBADF 380 } 381 defer file.DecRef(t) 382 383 // Extract the socket. 384 s, ok := file.Impl().(socket.Socket) 385 if !ok { 386 return 0, nil, linuxerr.ENOTSOCK 387 } 388 389 if backlog > maxListenBacklog { 390 // Linux treats incoming backlog as uint with a limit defined by 391 // sysctl_somaxconn. 392 // https://github.com/torvalds/linux/blob/7acac4b3196/net/socket.c#L1666 393 backlog = maxListenBacklog 394 } 395 396 // Accept one more than the configured listen backlog to keep in parity with 397 // Linux. Ref, because of missing equality check here: 398 // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/sock.h#L937 399 // 400 // In case of unix domain sockets, the following check 401 // https://github.com/torvalds/linux/blob/7d6beb71da3/net/unix/af_unix.c#L1293 402 // will allow 1 connect through since it checks for a receive queue len > 403 // backlog and not >=. 404 backlog++ 405 406 return 0, nil, s.Listen(t, int(backlog)).ToError() 407 } 408 409 // Shutdown implements the linux syscall shutdown(2). 410 func Shutdown(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 411 fd := args[0].Int() 412 how := args[1].Int() 413 414 // Get socket from the file descriptor. 415 file := t.GetFile(fd) 416 if file == nil { 417 return 0, nil, linuxerr.EBADF 418 } 419 defer file.DecRef(t) 420 421 // Extract the socket. 422 s, ok := file.Impl().(socket.Socket) 423 if !ok { 424 return 0, nil, linuxerr.ENOTSOCK 425 } 426 427 // Validate how, then call syscall implementation. 428 switch how { 429 case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR: 430 default: 431 return 0, nil, linuxerr.EINVAL 432 } 433 434 return 0, nil, s.Shutdown(t, int(how)).ToError() 435 } 436 437 // GetSockOpt implements the linux syscall getsockopt(2). 438 func GetSockOpt(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 439 fd := args[0].Int() 440 level := args[1].Int() 441 name := args[2].Int() 442 optValAddr := args[3].Pointer() 443 optLenAddr := args[4].Pointer() 444 445 // Get socket from the file descriptor. 446 file := t.GetFile(fd) 447 if file == nil { 448 return 0, nil, linuxerr.EBADF 449 } 450 defer file.DecRef(t) 451 452 // Extract the socket. 453 s, ok := file.Impl().(socket.Socket) 454 if !ok { 455 return 0, nil, linuxerr.ENOTSOCK 456 } 457 458 // Read the length. Reject negative values. 459 var optLen int32 460 if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil { 461 return 0, nil, err 462 } 463 if optLen < 0 { 464 return 0, nil, linuxerr.EINVAL 465 } 466 467 // Call syscall implementation then copy both value and value len out. 468 v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen)) 469 if e != nil { 470 return 0, nil, e.ToError() 471 } 472 473 if _, err := primitive.CopyInt32Out(t, optLenAddr, int32(v.SizeBytes())); err != nil { 474 return 0, nil, err 475 } 476 477 if v != nil { 478 if _, err := v.CopyOut(t, optValAddr); err != nil { 479 return 0, nil, err 480 } 481 } 482 483 return 0, nil, nil 484 } 485 486 // getSockOpt tries to handle common socket options, or dispatches to a specific 487 // socket implementation. 488 func getSockOpt(t *kernel.Task, s socket.Socket, level, name int, optValAddr hostarch.Addr, len int) (marshal.Marshallable, *syserr.Error) { 489 if level == linux.SOL_SOCKET { 490 switch name { 491 case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL: 492 if len < sizeOfInt32 { 493 return nil, syserr.ErrInvalidArgument 494 } 495 } 496 497 switch name { 498 case linux.SO_TYPE: 499 _, skType, _ := s.Type() 500 v := primitive.Int32(skType) 501 return &v, nil 502 case linux.SO_DOMAIN: 503 family, _, _ := s.Type() 504 v := primitive.Int32(family) 505 return &v, nil 506 case linux.SO_PROTOCOL: 507 _, _, protocol := s.Type() 508 v := primitive.Int32(protocol) 509 return &v, nil 510 } 511 } 512 513 return s.GetSockOpt(t, level, name, optValAddr, len) 514 } 515 516 // SetSockOpt implements the linux syscall setsockopt(2). 517 // 518 // Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket. 519 func SetSockOpt(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 520 fd := args[0].Int() 521 level := args[1].Int() 522 name := args[2].Int() 523 optValAddr := args[3].Pointer() 524 optLen := args[4].Int() 525 526 // Get socket from the file descriptor. 527 file := t.GetFile(fd) 528 if file == nil { 529 return 0, nil, linuxerr.EBADF 530 } 531 defer file.DecRef(t) 532 533 // Extract the socket. 534 s, ok := file.Impl().(socket.Socket) 535 if !ok { 536 return 0, nil, linuxerr.ENOTSOCK 537 } 538 539 if optLen < 0 { 540 return 0, nil, linuxerr.EINVAL 541 } 542 if optLen > maxOptLen { 543 return 0, nil, linuxerr.EINVAL 544 } 545 buf := t.CopyScratchBuffer(int(optLen)) 546 if _, err := t.CopyInBytes(optValAddr, buf); err != nil { 547 return 0, nil, err 548 } 549 550 // Call syscall implementation. 551 if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil { 552 return 0, nil, err.ToError() 553 } 554 555 return 0, nil, nil 556 } 557 558 // GetSockName implements the linux syscall getsockname(2). 559 func GetSockName(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 560 fd := args[0].Int() 561 addr := args[1].Pointer() 562 addrlen := args[2].Pointer() 563 564 // Get socket from the file descriptor. 565 file := t.GetFile(fd) 566 if file == nil { 567 return 0, nil, linuxerr.EBADF 568 } 569 defer file.DecRef(t) 570 571 // Extract the socket. 572 s, ok := file.Impl().(socket.Socket) 573 if !ok { 574 return 0, nil, linuxerr.ENOTSOCK 575 } 576 577 // Get the socket name and copy it to the caller. 578 v, vl, err := s.GetSockName(t) 579 if err != nil { 580 return 0, nil, err.ToError() 581 } 582 583 return 0, nil, writeAddress(t, v, vl, addr, addrlen) 584 } 585 586 // GetPeerName implements the linux syscall getpeername(2). 587 func GetPeerName(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 588 fd := args[0].Int() 589 addr := args[1].Pointer() 590 addrlen := args[2].Pointer() 591 592 // Get socket from the file descriptor. 593 file := t.GetFile(fd) 594 if file == nil { 595 return 0, nil, linuxerr.EBADF 596 } 597 defer file.DecRef(t) 598 599 // Extract the socket. 600 s, ok := file.Impl().(socket.Socket) 601 if !ok { 602 return 0, nil, linuxerr.ENOTSOCK 603 } 604 605 // Get the socket peer name and copy it to the caller. 606 v, vl, err := s.GetPeerName(t) 607 if err != nil { 608 return 0, nil, err.ToError() 609 } 610 611 return 0, nil, writeAddress(t, v, vl, addr, addrlen) 612 } 613 614 // RecvMsg implements the linux syscall recvmsg(2). 615 func RecvMsg(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 616 fd := args[0].Int() 617 msgPtr := args[1].Pointer() 618 flags := args[2].Int() 619 620 if t.Arch().Width() != 8 { 621 // We only handle 64-bit for now. 622 return 0, nil, linuxerr.EINVAL 623 } 624 625 // Get socket from the file descriptor. 626 file := t.GetFile(fd) 627 if file == nil { 628 return 0, nil, linuxerr.EBADF 629 } 630 defer file.DecRef(t) 631 632 // Extract the socket. 633 s, ok := file.Impl().(socket.Socket) 634 if !ok { 635 return 0, nil, linuxerr.ENOTSOCK 636 } 637 638 // Reject flags that we don't handle yet. 639 if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { 640 return 0, nil, linuxerr.EINVAL 641 } 642 643 if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { 644 flags |= linux.MSG_DONTWAIT 645 } 646 647 var haveDeadline bool 648 var deadline ktime.Time 649 if dl := s.RecvTimeout(); dl > 0 { 650 deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) 651 haveDeadline = true 652 } else if dl < 0 { 653 flags |= linux.MSG_DONTWAIT 654 } 655 656 n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline) 657 return n, nil, err 658 } 659 660 // RecvMMsg implements the linux syscall recvmmsg(2). 661 func RecvMMsg(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 662 fd := args[0].Int() 663 msgPtr := args[1].Pointer() 664 vlen := args[2].Uint() 665 flags := args[3].Int() 666 toPtr := args[4].Pointer() 667 668 if t.Arch().Width() != 8 { 669 // We only handle 64-bit for now. 670 return 0, nil, linuxerr.EINVAL 671 } 672 673 if vlen > linux.UIO_MAXIOV { 674 vlen = linux.UIO_MAXIOV 675 } 676 677 // Reject flags that we don't handle yet. 678 if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 { 679 return 0, nil, linuxerr.EINVAL 680 } 681 682 // Get socket from the file descriptor. 683 file := t.GetFile(fd) 684 if file == nil { 685 return 0, nil, linuxerr.EBADF 686 } 687 defer file.DecRef(t) 688 689 // Extract the socket. 690 s, ok := file.Impl().(socket.Socket) 691 if !ok { 692 return 0, nil, linuxerr.ENOTSOCK 693 } 694 695 if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { 696 flags |= linux.MSG_DONTWAIT 697 } 698 699 var haveDeadline bool 700 var deadline ktime.Time 701 if toPtr != 0 { 702 var ts linux.Timespec 703 if _, err := ts.CopyIn(t, toPtr); err != nil { 704 return 0, nil, err 705 } 706 if !ts.Valid() { 707 return 0, nil, linuxerr.EINVAL 708 } 709 deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration()) 710 haveDeadline = true 711 } 712 713 if !haveDeadline { 714 if dl := s.RecvTimeout(); dl > 0 { 715 deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) 716 haveDeadline = true 717 } else if dl < 0 { 718 flags |= linux.MSG_DONTWAIT 719 } 720 } 721 722 var count uint32 723 var err error 724 for i := uint64(0); i < uint64(vlen); i++ { 725 mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) 726 if !ok { 727 return 0, nil, linuxerr.EFAULT 728 } 729 var n uintptr 730 if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil { 731 break 732 } 733 734 // Copy the received length to the caller. 735 lp, ok := mp.AddLength(messageHeader64Len) 736 if !ok { 737 return 0, nil, linuxerr.EFAULT 738 } 739 if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { 740 break 741 } 742 count++ 743 } 744 745 if count == 0 { 746 return 0, nil, err 747 } 748 return uintptr(count), nil, nil 749 } 750 751 func getSCMRights(t *kernel.Task, rights transport.RightsControlMessage) control.SCMRights { 752 switch v := rights.(type) { 753 case control.SCMRights: 754 return v 755 case *transport.SCMRights: 756 rf := control.RightsFiles(fdsToHostFiles(t, v.FDs)) 757 return &rf 758 default: 759 panic(fmt.Sprintf("rights of type %T must be *transport.SCMRights or implement SCMRights", rights)) 760 } 761 } 762 763 // If an error is encountered, only files created before the error will be 764 // returned. This is what Linux does. 765 func fdsToHostFiles(ctx context.Context, fds []int) []*vfs.FileDescription { 766 files := make([]*vfs.FileDescription, 0, len(fds)) 767 for _, fd := range fds { 768 // Get flags. We do it here because they may be modified 769 // by subsequent functions. 770 fileFlags, _, errno := unix.Syscall(unix.SYS_FCNTL, uintptr(fd), unix.F_GETFL, 0) 771 if errno != 0 { 772 ctx.Warningf("Error retrieving host FD flags: %v", error(errno)) 773 break 774 } 775 776 // Create the file backed by hostFD. 777 file, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), fd, &host.NewFDOptions{}) 778 if err != nil { 779 ctx.Warningf("Error creating file from host FD: %v", err) 780 break 781 } 782 783 if err := file.SetStatusFlags(ctx, auth.CredentialsFromContext(ctx), uint32(fileFlags&linux.O_NONBLOCK)); err != nil { 784 ctx.Warningf("Error setting flags on host FD file: %v", err) 785 break 786 } 787 788 files = append(files, file) 789 } 790 return files 791 } 792 793 func recvSingleMsg(t *kernel.Task, s socket.Socket, msgPtr hostarch.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) { 794 // Capture the message header and io vectors. 795 var msg MessageHeader64 796 if _, err := msg.CopyIn(t, msgPtr); err != nil { 797 return 0, err 798 } 799 800 if msg.IovLen > linux.UIO_MAXIOV { 801 return 0, linuxerr.EMSGSIZE 802 } 803 dst, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ 804 AddressSpaceActive: true, 805 }) 806 if err != nil { 807 return 0, err 808 } 809 810 // Fast path when no control message nor name buffers are provided. 811 if msg.ControlLen == 0 && msg.NameLen == 0 { 812 n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0) 813 if err != nil { 814 return 0, linuxerr.ConvertIntr(err.ToError(), linuxerr.ERESTARTSYS) 815 } 816 if !cms.Unix.Empty() { 817 mflags |= linux.MSG_CTRUNC 818 cms.Release(t) 819 } 820 821 if int(msg.Flags) != mflags { 822 // Copy out the flags to the caller. 823 if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { 824 return 0, err 825 } 826 } 827 828 return uintptr(n), nil 829 } 830 831 if msg.ControlLen > maxControlLen { 832 return 0, linuxerr.ENOBUFS 833 } 834 n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen) 835 if e != nil { 836 return 0, linuxerr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) 837 } 838 defer cms.Release(t) 839 840 controlData := make([]byte, 0, msg.ControlLen) 841 controlData = control.PackControlMessages(t, cms, controlData) 842 843 if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() { 844 creds, _ := cms.Unix.Credentials.(control.SCMCredentials) 845 controlData, mflags = control.PackCredentials(t, creds, controlData, mflags) 846 } 847 848 if cms.Unix.Rights != nil { 849 cms.Unix.Rights = getSCMRights(t, cms.Unix.Rights) 850 controlData, mflags = control.PackRights(t, cms.Unix.Rights.(control.SCMRights), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags) 851 } 852 853 // Copy the address to the caller. 854 if msg.NameLen != 0 { 855 if err := writeAddress(t, sender, senderLen, hostarch.Addr(msg.Name), hostarch.Addr(msgPtr+nameLenOffset)); err != nil { 856 return 0, err 857 } 858 } 859 860 // Copy the control data to the caller. 861 if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil { 862 return 0, err 863 } 864 if len(controlData) > 0 { 865 if _, err := t.CopyOutBytes(hostarch.Addr(msg.Control), controlData); err != nil { 866 return 0, err 867 } 868 } 869 870 // Copy out the flags to the caller. 871 if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil { 872 return 0, err 873 } 874 875 return uintptr(n), nil 876 } 877 878 // recvFrom is the implementation of the recvfrom syscall. It is called by 879 // recvfrom and recv syscall handlers. 880 func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLenPtr hostarch.Addr) (uintptr, error) { 881 if int(bufLen) < 0 { 882 return 0, linuxerr.EINVAL 883 } 884 885 // Reject flags that we don't handle yet. 886 if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 { 887 return 0, linuxerr.EINVAL 888 } 889 890 // Get socket from the file descriptor. 891 file := t.GetFile(fd) 892 if file == nil { 893 return 0, linuxerr.EBADF 894 } 895 defer file.DecRef(t) 896 897 // Extract the socket. 898 s, ok := file.Impl().(socket.Socket) 899 if !ok { 900 return 0, linuxerr.ENOTSOCK 901 } 902 903 if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { 904 flags |= linux.MSG_DONTWAIT 905 } 906 907 dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{ 908 AddressSpaceActive: true, 909 }) 910 if err != nil { 911 return 0, err 912 } 913 914 var haveDeadline bool 915 var deadline ktime.Time 916 if dl := s.RecvTimeout(); dl > 0 { 917 deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) 918 haveDeadline = true 919 } else if dl < 0 { 920 flags |= linux.MSG_DONTWAIT 921 } 922 923 n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0) 924 cm.Release(t) 925 if e != nil { 926 return 0, linuxerr.ConvertIntr(e.ToError(), linuxerr.ERESTARTSYS) 927 } 928 929 // Copy the address to the caller. 930 if nameLenPtr != 0 { 931 if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil { 932 return 0, err 933 } 934 } 935 936 return uintptr(n), nil 937 } 938 939 // RecvFrom implements the linux syscall recvfrom(2). 940 func RecvFrom(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 941 fd := args[0].Int() 942 bufPtr := args[1].Pointer() 943 bufLen := args[2].Uint64() 944 flags := args[3].Int() 945 namePtr := args[4].Pointer() 946 nameLenPtr := args[5].Pointer() 947 948 n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr) 949 return n, nil, err 950 } 951 952 // SendMsg implements the linux syscall sendmsg(2). 953 func SendMsg(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 954 fd := args[0].Int() 955 msgPtr := args[1].Pointer() 956 flags := args[2].Int() 957 958 if t.Arch().Width() != 8 { 959 // We only handle 64-bit for now. 960 return 0, nil, linuxerr.EINVAL 961 } 962 963 // Get socket from the file descriptor. 964 file := t.GetFile(fd) 965 if file == nil { 966 return 0, nil, linuxerr.EBADF 967 } 968 defer file.DecRef(t) 969 970 // Extract the socket. 971 s, ok := file.Impl().(socket.Socket) 972 if !ok { 973 return 0, nil, linuxerr.ENOTSOCK 974 } 975 976 // Reject flags that we don't handle yet. 977 if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { 978 return 0, nil, linuxerr.EINVAL 979 } 980 981 if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { 982 flags |= linux.MSG_DONTWAIT 983 } 984 985 n, err := sendSingleMsg(t, s, file, msgPtr, flags) 986 return n, nil, err 987 } 988 989 // SendMMsg implements the linux syscall sendmmsg(2). 990 func SendMMsg(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 991 fd := args[0].Int() 992 msgPtr := args[1].Pointer() 993 vlen := args[2].Uint() 994 flags := args[3].Int() 995 996 if t.Arch().Width() != 8 { 997 // We only handle 64-bit for now. 998 return 0, nil, linuxerr.EINVAL 999 } 1000 1001 if vlen > linux.UIO_MAXIOV { 1002 vlen = linux.UIO_MAXIOV 1003 } 1004 1005 // Get socket from the file descriptor. 1006 file := t.GetFile(fd) 1007 if file == nil { 1008 return 0, nil, linuxerr.EBADF 1009 } 1010 defer file.DecRef(t) 1011 1012 // Extract the socket. 1013 s, ok := file.Impl().(socket.Socket) 1014 if !ok { 1015 return 0, nil, linuxerr.ENOTSOCK 1016 } 1017 1018 // Reject flags that we don't handle yet. 1019 if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 { 1020 return 0, nil, linuxerr.EINVAL 1021 } 1022 1023 if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { 1024 flags |= linux.MSG_DONTWAIT 1025 } 1026 1027 var count uint32 1028 var err error 1029 for i := uint64(0); i < uint64(vlen); i++ { 1030 mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len) 1031 if !ok { 1032 return 0, nil, linuxerr.EFAULT 1033 } 1034 var n uintptr 1035 if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil { 1036 break 1037 } 1038 1039 // Copy the received length to the caller. 1040 lp, ok := mp.AddLength(messageHeader64Len) 1041 if !ok { 1042 return 0, nil, linuxerr.EFAULT 1043 } 1044 if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil { 1045 break 1046 } 1047 count++ 1048 } 1049 1050 if count == 0 { 1051 return 0, nil, err 1052 } 1053 return uintptr(count), nil, nil 1054 } 1055 1056 func sendSingleMsg(t *kernel.Task, s socket.Socket, file *vfs.FileDescription, msgPtr hostarch.Addr, flags int32) (uintptr, error) { 1057 // Capture the message header. 1058 var msg MessageHeader64 1059 if _, err := msg.CopyIn(t, msgPtr); err != nil { 1060 return 0, err 1061 } 1062 1063 var controlData []byte 1064 if msg.ControlLen > 0 { 1065 // Put an upper bound to prevent large allocations. 1066 if msg.ControlLen > maxControlLen { 1067 return 0, linuxerr.ENOBUFS 1068 } 1069 controlData = make([]byte, msg.ControlLen) 1070 if _, err := t.CopyInBytes(hostarch.Addr(msg.Control), controlData); err != nil { 1071 return 0, err 1072 } 1073 } 1074 1075 // Read the destination address if one is specified. 1076 var to []byte 1077 if msg.NameLen != 0 { 1078 var err error 1079 to, err = CaptureAddress(t, hostarch.Addr(msg.Name), msg.NameLen) 1080 if err != nil { 1081 return 0, err 1082 } 1083 } 1084 1085 // Read data then call the sendmsg implementation. 1086 if msg.IovLen > linux.UIO_MAXIOV { 1087 return 0, linuxerr.EMSGSIZE 1088 } 1089 src, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{ 1090 AddressSpaceActive: true, 1091 }) 1092 if err != nil { 1093 return 0, err 1094 } 1095 1096 controlMessages, err := control.Parse(t, s, controlData, t.Arch().Width()) 1097 if err != nil { 1098 return 0, err 1099 } 1100 1101 var haveDeadline bool 1102 var deadline ktime.Time 1103 if dl := s.SendTimeout(); dl > 0 { 1104 deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) 1105 haveDeadline = true 1106 } else if dl < 0 { 1107 flags |= linux.MSG_DONTWAIT 1108 } 1109 1110 // Call the syscall implementation. 1111 n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages) 1112 err = HandleIOError(t, n != 0, e.ToError(), linuxerr.ERESTARTSYS, "sendmsg", file) 1113 // Control messages should be released on error as well as for zero-length 1114 // messages, which are discarded by the receiver. 1115 if n == 0 || err != nil { 1116 controlMessages.Release(t) 1117 } 1118 return uintptr(n), err 1119 } 1120 1121 // sendTo is the implementation of the sendto syscall. It is called by sendto 1122 // and send syscall handlers. 1123 func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLen uint32) (uintptr, error) { 1124 bl := int(bufLen) 1125 if bl < 0 { 1126 return 0, linuxerr.EINVAL 1127 } 1128 1129 // Get socket from the file descriptor. 1130 file := t.GetFile(fd) 1131 if file == nil { 1132 return 0, linuxerr.EBADF 1133 } 1134 defer file.DecRef(t) 1135 1136 // Extract the socket. 1137 s, ok := file.Impl().(socket.Socket) 1138 if !ok { 1139 return 0, linuxerr.ENOTSOCK 1140 } 1141 1142 if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 { 1143 flags |= linux.MSG_DONTWAIT 1144 } 1145 1146 // Read the destination address if one is specified. 1147 var to []byte 1148 var err error 1149 if namePtr != 0 { 1150 to, err = CaptureAddress(t, namePtr, nameLen) 1151 if err != nil { 1152 return 0, err 1153 } 1154 } 1155 1156 src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{ 1157 AddressSpaceActive: true, 1158 }) 1159 if err != nil { 1160 return 0, err 1161 } 1162 1163 var haveDeadline bool 1164 var deadline ktime.Time 1165 if dl := s.SendTimeout(); dl > 0 { 1166 deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond) 1167 haveDeadline = true 1168 } else if dl < 0 { 1169 flags |= linux.MSG_DONTWAIT 1170 } 1171 1172 // Call the syscall implementation. 1173 n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s)}) 1174 return uintptr(n), HandleIOError(t, n != 0, e.ToError(), linuxerr.ERESTARTSYS, "sendto", file) 1175 } 1176 1177 // SendTo implements the linux syscall sendto(2). 1178 func SendTo(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { 1179 fd := args[0].Int() 1180 bufPtr := args[1].Pointer() 1181 bufLen := args[2].Uint64() 1182 flags := args[3].Int() 1183 namePtr := args[4].Pointer() 1184 nameLen := args[5].Uint() 1185 1186 n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen) 1187 return n, nil, err 1188 }