github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/syscalls/linux/vfs2/socket.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package vfs2
    16  
    17  import (
    18  	"time"
    19  
    20  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    21  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    22  	"github.com/SagerNet/gvisor/pkg/marshal"
    23  	"github.com/SagerNet/gvisor/pkg/marshal/primitive"
    24  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    26  	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/socket"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/socket/control"
    29  	"github.com/SagerNet/gvisor/pkg/sentry/socket/unix/transport"
    30  	slinux "github.com/SagerNet/gvisor/pkg/sentry/syscalls/linux"
    31  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    32  	"github.com/SagerNet/gvisor/pkg/syserr"
    33  	"github.com/SagerNet/gvisor/pkg/syserror"
    34  	"github.com/SagerNet/gvisor/pkg/usermem"
    35  
    36  	"github.com/SagerNet/gvisor/pkg/hostarch"
    37  )
    38  
    39  // maxAddrLen is the maximum socket address length we're willing to accept.
    40  const maxAddrLen = 200
    41  
    42  // maxOptLen is the maximum sockopt parameter length we're willing to accept.
    43  const maxOptLen = 1024 * 8
    44  
    45  // maxControlLen is the maximum length of the msghdr.msg_control buffer we're
    46  // willing to accept. Note that this limit is smaller than Linux, which allows
    47  // buffers upto INT_MAX.
    48  const maxControlLen = 10 * 1024 * 1024
    49  
    50  // maxListenBacklog is the maximum limit of listen backlog supported.
    51  const maxListenBacklog = 1024
    52  
    53  // nameLenOffset is the offset from the start of the MessageHeader64 struct to
    54  // the NameLen field.
    55  const nameLenOffset = 8
    56  
    57  // controlLenOffset is the offset form the start of the MessageHeader64 struct
    58  // to the ControlLen field.
    59  const controlLenOffset = 40
    60  
    61  // flagsOffset is the offset form the start of the MessageHeader64 struct
    62  // to the Flags field.
    63  const flagsOffset = 48
    64  
    65  const sizeOfInt32 = 4
    66  
    67  // messageHeader64Len is the length of a MessageHeader64 struct.
    68  var messageHeader64Len = uint64((*MessageHeader64)(nil).SizeBytes())
    69  
    70  // multipleMessageHeader64Len is the length of a multipeMessageHeader64 struct.
    71  var multipleMessageHeader64Len = uint64((*multipleMessageHeader64)(nil).SizeBytes())
    72  
    73  // baseRecvFlags are the flags that are accepted across recvmsg(2),
    74  // recvmmsg(2), and recvfrom(2).
    75  const baseRecvFlags = linux.MSG_OOB | linux.MSG_DONTROUTE | linux.MSG_DONTWAIT | linux.MSG_NOSIGNAL | linux.MSG_WAITALL | linux.MSG_TRUNC | linux.MSG_CTRUNC
    76  
    77  // MessageHeader64 is the 64-bit representation of the msghdr struct used in
    78  // the recvmsg and sendmsg syscalls.
    79  //
    80  // +marshal
    81  type MessageHeader64 struct {
    82  	// Name is the optional pointer to a network address buffer.
    83  	Name uint64
    84  
    85  	// NameLen is the length of the buffer pointed to by Name (see nameLenOffset).
    86  	NameLen uint32
    87  	_       uint32 // Padding; keeps Iov at offset 16.
    88  
    89  	// Iov is a pointer to an array of io vectors that describe the memory
    90  	// locations involved in the io operation.
    91  	Iov uint64
    92  
    93  	// IovLen is the number of entries in the array pointed to by Iov.
    94  	IovLen uint64
    95  
    96  	// Control is the optional pointer to ancillary control data.
    97  	Control uint64
    98  
    99  	// ControlLen is the length of the data pointed to by Control (see controlLenOffset).
   100  	ControlLen uint64
   101  
   102  	// Flags on the sent/received message (see flagsOffset).
   103  	Flags int32
   104  	_     int32 // Trailing padding.
   105  }
   106  
   107  // multipleMessageHeader64 is the 64-bit representation of the mmsghdr struct used in
   108  // the recvmmsg and sendmmsg syscalls.
   109  //
   110  // +marshal
   111  type multipleMessageHeader64 struct {
   112  	msgHdr MessageHeader64 // Header describing a single message.
   113  	msgLen uint32          // RecvMMsg writes the received length here, right after msgHdr.
   114  	_      int32           // Trailing padding.
   115  }
   116  
   117  // CaptureAddress allocates memory for and copies a socket address structure
   118  // from the untrusted address space range.
   119  func CaptureAddress(t *kernel.Task, addr hostarch.Addr, addrlen uint32) ([]byte, error) {
   120  	if addrlen > maxAddrLen {
   121  		return nil, linuxerr.EINVAL
   122  	}
   123  
   124  	addrBuf := make([]byte, addrlen)
   125  	if _, err := t.CopyInBytes(addr, addrBuf); err != nil {
   126  		return nil, err
   127  	}
   128  
   129  	return addrBuf, nil
   130  }
   131  
   132  // writeAddress writes a sockaddr structure and its length to an output buffer
   133  // in the unstrusted address space range. If the address is bigger than the
   134  // buffer, it is truncated.
   135  func writeAddress(t *kernel.Task, addr linux.SockAddr, addrLen uint32, addrPtr hostarch.Addr, addrLenPtr hostarch.Addr) error {
   136  	// Get the buffer length.
   137  	var bufLen uint32
   138  	if _, err := primitive.CopyUint32In(t, addrLenPtr, &bufLen); err != nil {
   139  		return err
   140  	}
   141  
   142  	if int32(bufLen) < 0 {
   143  		return linuxerr.EINVAL
   144  	}
   145  
   146  	// Write the length unconditionally.
   147  	if _, err := primitive.CopyUint32Out(t, addrLenPtr, addrLen); err != nil {
   148  		return err
   149  	}
   150  
   151  	if addr == nil {
   152  		return nil
   153  	}
   154  
   155  	if bufLen > addrLen {
   156  		bufLen = addrLen
   157  	}
   158  
   159  	// Copy as much of the address as will fit in the buffer.
   160  	encodedAddr := t.CopyScratchBuffer(addr.SizeBytes())
   161  	addr.MarshalUnsafe(encodedAddr)
   162  	if bufLen > uint32(len(encodedAddr)) {
   163  		bufLen = uint32(len(encodedAddr))
   164  	}
   165  	_, err := t.CopyOutBytes(addrPtr, encodedAddr[:int(bufLen)])
   166  	return err
   167  }
   168  
   169  // Socket implements the linux syscall socket(2).
   170  func Socket(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   171  	domain := int(args[0].Int())
   172  	stype := args[1].Int()
   173  	protocol := int(args[2].Int())
   174  
   175  	// Check and initialize the flags.
   176  	if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
   177  		return 0, nil, linuxerr.EINVAL
   178  	}
   179  
   180  	// Create the new socket.
   181  	s, e := socket.NewVFS2(t, domain, linux.SockType(stype&0xf), protocol)
   182  	if e != nil {
   183  		return 0, nil, e.ToError()
   184  	}
   185  	defer s.DecRef(t)
   186  
   187  	if err := s.SetStatusFlags(t, t.Credentials(), uint32(stype&linux.SOCK_NONBLOCK)); err != nil {
   188  		return 0, nil, err
   189  	}
   190  
   191  	fd, err := t.NewFDFromVFS2(0, s, kernel.FDFlags{
   192  		CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
   193  	})
   194  	if err != nil {
   195  		return 0, nil, err
   196  	}
   197  
   198  	return uintptr(fd), nil, nil
   199  }
   200  
   201  // SocketPair implements the linux syscall socketpair(2).
   202  func SocketPair(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   203  	domain := int(args[0].Int())
   204  	stype := args[1].Int()
   205  	protocol := int(args[2].Int())
   206  	addr := args[3].Pointer()
   207  
   208  	// Check and initialize the flags.
   209  	if stype & ^(0xf|linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
   210  		return 0, nil, linuxerr.EINVAL
   211  	}
   212  
   213  	// Create the socket pair.
   214  	s1, s2, e := socket.PairVFS2(t, domain, linux.SockType(stype&0xf), protocol)
   215  	if e != nil {
   216  		return 0, nil, e.ToError()
   217  	}
   218  	// Adding to the FD table will cause an extra reference to be acquired.
   219  	defer s1.DecRef(t)
   220  	defer s2.DecRef(t)
   221  
   222  	nonblocking := uint32(stype & linux.SOCK_NONBLOCK)
   223  	if err := s1.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil {
   224  		return 0, nil, err
   225  	}
   226  	if err := s2.SetStatusFlags(t, t.Credentials(), nonblocking); err != nil {
   227  		return 0, nil, err
   228  	}
   229  
   230  	// Create the FDs for the sockets.
   231  	flags := kernel.FDFlags{
   232  		CloseOnExec: stype&linux.SOCK_CLOEXEC != 0,
   233  	}
   234  	fds, err := t.NewFDsVFS2(0, []*vfs.FileDescription{s1, s2}, flags)
   235  	if err != nil {
   236  		return 0, nil, err
   237  	}
   238  
   239  	if _, err := primitive.CopyInt32SliceOut(t, addr, fds); err != nil {
   240  		for _, fd := range fds {
   241  			if _, file := t.FDTable().Remove(t, fd); file != nil {
   242  				file.DecRef(t)
   243  			}
   244  		}
   245  		return 0, nil, err
   246  	}
   247  
   248  	return 0, nil, nil
   249  }
   250  
   251  // Connect implements the linux syscall connect(2).
   252  func Connect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   253  	fd := args[0].Int()
   254  	addr := args[1].Pointer()
   255  	addrlen := args[2].Uint()
   256  
   257  	// Get socket from the file descriptor.
   258  	file := t.GetFileVFS2(fd)
   259  	if file == nil {
   260  		return 0, nil, linuxerr.EBADF
   261  	}
   262  	defer file.DecRef(t)
   263  
   264  	// Extract the socket.
   265  	s, ok := file.Impl().(socket.SocketVFS2)
   266  	if !ok {
   267  		return 0, nil, syserror.ENOTSOCK
   268  	}
   269  
   270  	// Capture address and call syscall implementation.
   271  	a, err := CaptureAddress(t, addr, addrlen)
   272  	if err != nil {
   273  		return 0, nil, err
   274  	}
   275  
   276  	blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0
   277  	return 0, nil, syserror.ConvertIntr(s.Connect(t, a, blocking).ToError(), syserror.ERESTARTSYS)
   278  }
   279  
   280  // accept is the implementation of the accept syscall. It is called by accept
   281  // and accept4 syscall handlers.
   282  func accept(t *kernel.Task, fd int32, addr hostarch.Addr, addrLen hostarch.Addr, flags int) (uintptr, error) {
   283  	// Check that no unsupported flags are passed in.
   284  	if flags & ^(linux.SOCK_NONBLOCK|linux.SOCK_CLOEXEC) != 0 {
   285  		return 0, linuxerr.EINVAL
   286  	}
   287  
   288  	// Get socket from the file descriptor.
   289  	file := t.GetFileVFS2(fd)
   290  	if file == nil {
   291  		return 0, linuxerr.EBADF
   292  	}
   293  	defer file.DecRef(t)
   294  
   295  	// Extract the socket.
   296  	s, ok := file.Impl().(socket.SocketVFS2)
   297  	if !ok {
   298  		return 0, syserror.ENOTSOCK
   299  	}
   300  
   301  	// Call the syscall implementation for this socket, then copy the
   302  	// output address if one is specified.
   303  	blocking := (file.StatusFlags() & linux.SOCK_NONBLOCK) == 0
   304  
   305  	peerRequested := addrLen != 0
   306  	nfd, peer, peerLen, e := s.Accept(t, peerRequested, flags, blocking)
   307  	if e != nil {
   308  		return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
   309  	}
   310  	if peerRequested {
   311  		// NOTE(magi): Linux does not give you an error if it can't
   312  		// write the data back out so neither do we.
   313  		if err := writeAddress(t, peer, peerLen, addr, addrLen); linuxerr.Equals(linuxerr.EINVAL, err) {
   314  			return 0, err
   315  		}
   316  	}
   317  	return uintptr(nfd), nil
   318  }
   319  
   320  // Accept4 implements the linux syscall accept4(2).
   321  func Accept4(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   322  	fd := args[0].Int()
   323  	addr := args[1].Pointer()
   324  	addrlen := args[2].Pointer()
   325  	flags := int(args[3].Int())
   326  
   327  	n, err := accept(t, fd, addr, addrlen, flags)
   328  	return n, nil, err
   329  }
   330  
   331  // Accept implements the linux syscall accept(2).
   332  func Accept(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   333  	fd := args[0].Int()
   334  	addr := args[1].Pointer()
   335  	addrlen := args[2].Pointer()
   336  
   337  	n, err := accept(t, fd, addr, addrlen, 0)
   338  	return n, nil, err
   339  }
   340  
   341  // Bind implements the linux syscall bind(2).
   342  func Bind(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   343  	fd := args[0].Int()
   344  	addr := args[1].Pointer()
   345  	addrlen := args[2].Uint()
   346  
   347  	// Get socket from the file descriptor.
   348  	file := t.GetFileVFS2(fd)
   349  	if file == nil {
   350  		return 0, nil, linuxerr.EBADF
   351  	}
   352  	defer file.DecRef(t)
   353  
   354  	// Extract the socket.
   355  	s, ok := file.Impl().(socket.SocketVFS2)
   356  	if !ok {
   357  		return 0, nil, syserror.ENOTSOCK
   358  	}
   359  
   360  	// Capture address and call syscall implementation.
   361  	a, err := CaptureAddress(t, addr, addrlen)
   362  	if err != nil {
   363  		return 0, nil, err
   364  	}
   365  
   366  	return 0, nil, s.Bind(t, a).ToError()
   367  }
   368  
   369  // Listen implements the linux syscall listen(2).
   370  func Listen(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   371  	fd := args[0].Int()
   372  	backlog := args[1].Uint()
   373  
   374  	// Get socket from the file descriptor.
   375  	file := t.GetFileVFS2(fd)
   376  	if file == nil {
   377  		return 0, nil, linuxerr.EBADF
   378  	}
   379  	defer file.DecRef(t)
   380  
   381  	// Extract the socket.
   382  	s, ok := file.Impl().(socket.SocketVFS2)
   383  	if !ok {
   384  		return 0, nil, syserror.ENOTSOCK
   385  	}
   386  
   387  	if backlog > maxListenBacklog {
   388  		// Linux treats incoming backlog as uint with a limit defined by
   389  		// sysctl_somaxconn.
   390  		// https://github.com/torvalds/linux/blob/7acac4b3196/net/socket.c#L1666
   391  		backlog = maxListenBacklog
   392  	}
   393  
   394  	// Accept one more than the configured listen backlog to keep in parity with
   395  	// Linux. Ref, because of missing equality check here:
   396  	// https://github.com/torvalds/linux/blob/7acac4b3196/include/net/sock.h#L937
   397  	//
   398  	// In case of unix domain sockets, the following check
   399  	// https://github.com/torvalds/linux/blob/7d6beb71da3/net/unix/af_unix.c#L1293
   400  	// will allow 1 connect through since it checks for a receive queue len >
   401  	// backlog and not >=.
   402  	backlog++
   403  
   404  	return 0, nil, s.Listen(t, int(backlog)).ToError()
   405  }
   406  
   407  // Shutdown implements the linux syscall shutdown(2).
   408  func Shutdown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   409  	fd := args[0].Int()
   410  	how := args[1].Int()
   411  
   412  	// Get socket from the file descriptor.
   413  	file := t.GetFileVFS2(fd)
   414  	if file == nil {
   415  		return 0, nil, linuxerr.EBADF
   416  	}
   417  	defer file.DecRef(t)
   418  
   419  	// Extract the socket.
   420  	s, ok := file.Impl().(socket.SocketVFS2)
   421  	if !ok {
   422  		return 0, nil, syserror.ENOTSOCK
   423  	}
   424  
   425  	// Validate how, then call syscall implementation.
   426  	switch how {
   427  	case linux.SHUT_RD, linux.SHUT_WR, linux.SHUT_RDWR:
   428  	default:
   429  		return 0, nil, linuxerr.EINVAL
   430  	}
   431  
   432  	return 0, nil, s.Shutdown(t, int(how)).ToError()
   433  }
   434  
   435  // GetSockOpt implements the linux syscall getsockopt(2).
   436  func GetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   437  	fd := args[0].Int()
   438  	level := args[1].Int()
   439  	name := args[2].Int()
   440  	optValAddr := args[3].Pointer()
   441  	optLenAddr := args[4].Pointer()
   442  
   443  	// Get socket from the file descriptor.
   444  	file := t.GetFileVFS2(fd)
   445  	if file == nil {
   446  		return 0, nil, linuxerr.EBADF
   447  	}
   448  	defer file.DecRef(t)
   449  
   450  	// Extract the socket.
   451  	s, ok := file.Impl().(socket.SocketVFS2)
   452  	if !ok {
   453  		return 0, nil, syserror.ENOTSOCK
   454  	}
   455  
   456  	// Read the length. Reject negative values.
   457  	var optLen int32
   458  	if _, err := primitive.CopyInt32In(t, optLenAddr, &optLen); err != nil {
   459  		return 0, nil, err
   460  	}
   461  	if optLen < 0 {
   462  		return 0, nil, linuxerr.EINVAL
   463  	}
   464  
   465  	// Call syscall implementation then copy both value and value len out.
   466  	v, e := getSockOpt(t, s, int(level), int(name), optValAddr, int(optLen))
   467  	if e != nil {
   468  		return 0, nil, e.ToError()
   469  	}
   470  
   471  	if _, err := primitive.CopyInt32Out(t, optLenAddr, int32(v.SizeBytes())); err != nil {
   472  		return 0, nil, err
   473  	}
   474  
   475  	if v != nil {
   476  		if _, err := v.CopyOut(t, optValAddr); err != nil {
   477  			return 0, nil, err
   478  		}
   479  	}
   480  
   481  	return 0, nil, nil
   482  }
   483  
   484  // getSockOpt tries to handle common socket options, or dispatches to a specific
   485  // socket implementation.
   486  func getSockOpt(t *kernel.Task, s socket.SocketVFS2, level, name int, optValAddr hostarch.Addr, len int) (marshal.Marshallable, *syserr.Error) {
   487  	if level == linux.SOL_SOCKET {
   488  		switch name {
   489  		case linux.SO_TYPE, linux.SO_DOMAIN, linux.SO_PROTOCOL:
   490  			if len < sizeOfInt32 {
   491  				return nil, syserr.ErrInvalidArgument
   492  			}
   493  		}
   494  
   495  		switch name {
   496  		case linux.SO_TYPE:
   497  			_, skType, _ := s.Type()
   498  			v := primitive.Int32(skType)
   499  			return &v, nil
   500  		case linux.SO_DOMAIN:
   501  			family, _, _ := s.Type()
   502  			v := primitive.Int32(family)
   503  			return &v, nil
   504  		case linux.SO_PROTOCOL:
   505  			_, _, protocol := s.Type()
   506  			v := primitive.Int32(protocol)
   507  			return &v, nil
   508  		}
   509  	}
   510  
   511  	return s.GetSockOpt(t, level, name, optValAddr, len)
   512  }
   513  
   514  // SetSockOpt implements the linux syscall setsockopt(2).
   515  //
   516  // Note that unlike Linux, enabling SO_PASSCRED does not autobind the socket.
   517  func SetSockOpt(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   518  	fd := args[0].Int()
   519  	level := args[1].Int()
   520  	name := args[2].Int()
   521  	optValAddr := args[3].Pointer()
   522  	optLen := args[4].Int()
   523  
   524  	// Get socket from the file descriptor.
   525  	file := t.GetFileVFS2(fd)
   526  	if file == nil {
   527  		return 0, nil, linuxerr.EBADF
   528  	}
   529  	defer file.DecRef(t)
   530  
   531  	// Extract the socket.
   532  	s, ok := file.Impl().(socket.SocketVFS2)
   533  	if !ok {
   534  		return 0, nil, syserror.ENOTSOCK
   535  	}
   536  
   537  	if optLen < 0 {
   538  		return 0, nil, linuxerr.EINVAL
   539  	}
   540  	if optLen > maxOptLen {
   541  		return 0, nil, linuxerr.EINVAL
   542  	}
   543  	buf := t.CopyScratchBuffer(int(optLen))
   544  	if _, err := t.CopyInBytes(optValAddr, buf); err != nil {
   545  		return 0, nil, err
   546  	}
   547  
   548  	// Call syscall implementation.
   549  	if err := s.SetSockOpt(t, int(level), int(name), buf); err != nil {
   550  		return 0, nil, err.ToError()
   551  	}
   552  
   553  	return 0, nil, nil
   554  }
   555  
   556  // GetSockName implements the linux syscall getsockname(2).
   557  func GetSockName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   558  	fd := args[0].Int()
   559  	addr := args[1].Pointer()
   560  	addrlen := args[2].Pointer()
   561  
   562  	// Get socket from the file descriptor.
   563  	file := t.GetFileVFS2(fd)
   564  	if file == nil {
   565  		return 0, nil, linuxerr.EBADF
   566  	}
   567  	defer file.DecRef(t)
   568  
   569  	// Extract the socket.
   570  	s, ok := file.Impl().(socket.SocketVFS2)
   571  	if !ok {
   572  		return 0, nil, syserror.ENOTSOCK
   573  	}
   574  
   575  	// Get the socket name and copy it to the caller.
   576  	v, vl, err := s.GetSockName(t)
   577  	if err != nil {
   578  		return 0, nil, err.ToError()
   579  	}
   580  
   581  	return 0, nil, writeAddress(t, v, vl, addr, addrlen)
   582  }
   583  
   584  // GetPeerName implements the linux syscall getpeername(2).
   585  func GetPeerName(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   586  	fd := args[0].Int()
   587  	addr := args[1].Pointer()
   588  	addrlen := args[2].Pointer()
   589  
   590  	// Get socket from the file descriptor.
   591  	file := t.GetFileVFS2(fd)
   592  	if file == nil {
   593  		return 0, nil, linuxerr.EBADF
   594  	}
   595  	defer file.DecRef(t)
   596  
   597  	// Extract the socket.
   598  	s, ok := file.Impl().(socket.SocketVFS2)
   599  	if !ok {
   600  		return 0, nil, syserror.ENOTSOCK
   601  	}
   602  
   603  	// Get the socket peer name and copy it to the caller.
   604  	v, vl, err := s.GetPeerName(t)
   605  	if err != nil {
   606  		return 0, nil, err.ToError()
   607  	}
   608  
   609  	return 0, nil, writeAddress(t, v, vl, addr, addrlen)
   610  }
   611  
   612  // RecvMsg implements the linux syscall recvmsg(2).
   613  func RecvMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   614  	fd := args[0].Int()
   615  	msgPtr := args[1].Pointer()
   616  	flags := args[2].Int()
   617  
   618  	if t.Arch().Width() != 8 {
   619  		// We only handle 64-bit for now.
   620  		return 0, nil, linuxerr.EINVAL
   621  	}
   622  
   623  	// Get socket from the file descriptor.
   624  	file := t.GetFileVFS2(fd)
   625  	if file == nil {
   626  		return 0, nil, linuxerr.EBADF
   627  	}
   628  	defer file.DecRef(t)
   629  
   630  	// Extract the socket.
   631  	s, ok := file.Impl().(socket.SocketVFS2)
   632  	if !ok {
   633  		return 0, nil, syserror.ENOTSOCK
   634  	}
   635  
   636  	// Reject flags that we don't handle yet.
   637  	if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
   638  		return 0, nil, linuxerr.EINVAL
   639  	}
   640  
   641  	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
   642  		flags |= linux.MSG_DONTWAIT
   643  	}
   644  
   645  	var haveDeadline bool
   646  	var deadline ktime.Time
   647  	if dl := s.RecvTimeout(); dl > 0 {
   648  		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
   649  		haveDeadline = true
   650  	} else if dl < 0 {
   651  		flags |= linux.MSG_DONTWAIT
   652  	}
   653  
   654  	n, err := recvSingleMsg(t, s, msgPtr, flags, haveDeadline, deadline)
   655  	return n, nil, err
   656  }
   657  
   658  // RecvMMsg implements the linux syscall recvmmsg(2).
   659  func RecvMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   660  	fd := args[0].Int()
   661  	msgPtr := args[1].Pointer()
   662  	vlen := args[2].Uint()
   663  	flags := args[3].Int()
   664  	toPtr := args[4].Pointer()
   665  
   666  	if t.Arch().Width() != 8 {
   667  		// We only handle 64-bit for now.
   668  		return 0, nil, linuxerr.EINVAL
   669  	}
   670  
   671  	if vlen > linux.UIO_MAXIOV {
   672  		vlen = linux.UIO_MAXIOV
   673  	}
   674  
   675  	// Reject flags that we don't handle yet.
   676  	if flags & ^(baseRecvFlags|linux.MSG_CMSG_CLOEXEC|linux.MSG_ERRQUEUE) != 0 {
   677  		return 0, nil, linuxerr.EINVAL
   678  	}
   679  
   680  	// Get socket from the file descriptor.
   681  	file := t.GetFileVFS2(fd)
   682  	if file == nil {
   683  		return 0, nil, linuxerr.EBADF
   684  	}
   685  	defer file.DecRef(t)
   686  
   687  	// Extract the socket.
   688  	s, ok := file.Impl().(socket.SocketVFS2)
   689  	if !ok {
   690  		return 0, nil, syserror.ENOTSOCK
   691  	}
   692  
   693  	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
   694  		flags |= linux.MSG_DONTWAIT
   695  	}
   696  
   697  	var haveDeadline bool
   698  	var deadline ktime.Time
   699  	if toPtr != 0 {
   700  		var ts linux.Timespec
   701  		if _, err := ts.CopyIn(t, toPtr); err != nil {
   702  			return 0, nil, err
   703  		}
   704  		if !ts.Valid() {
   705  			return 0, nil, linuxerr.EINVAL
   706  		}
   707  		deadline = t.Kernel().MonotonicClock().Now().Add(ts.ToDuration())
   708  		haveDeadline = true
   709  	}
   710  
   711  	if !haveDeadline {
   712  		if dl := s.RecvTimeout(); dl > 0 {
   713  			deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
   714  			haveDeadline = true
   715  		} else if dl < 0 {
   716  			flags |= linux.MSG_DONTWAIT
   717  		}
   718  	}
   719  
   720  	var count uint32
   721  	var err error
   722  	for i := uint64(0); i < uint64(vlen); i++ {
   723  		mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
   724  		if !ok {
   725  			return 0, nil, syserror.EFAULT
   726  		}
   727  		var n uintptr
   728  		if n, err = recvSingleMsg(t, s, mp, flags, haveDeadline, deadline); err != nil {
   729  			break
   730  		}
   731  
   732  		// Copy the received length to the caller.
   733  		lp, ok := mp.AddLength(messageHeader64Len)
   734  		if !ok {
   735  			return 0, nil, syserror.EFAULT
   736  		}
   737  		if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
   738  			break
   739  		}
   740  		count++
   741  	}
   742  
   743  	if count == 0 {
   744  		return 0, nil, err
   745  	}
   746  	return uintptr(count), nil, nil
   747  }
   748  
   749  func recvSingleMsg(t *kernel.Task, s socket.SocketVFS2, msgPtr hostarch.Addr, flags int32, haveDeadline bool, deadline ktime.Time) (uintptr, error) {
   750  	// Capture the message header and io vectors.
   751  	var msg MessageHeader64
   752  	if _, err := msg.CopyIn(t, msgPtr); err != nil {
   753  		return 0, err
   754  	}
   755  
   756  	if msg.IovLen > linux.UIO_MAXIOV {
   757  		return 0, linuxerr.EMSGSIZE
   758  	}
   759  	dst, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
   760  		AddressSpaceActive: true,
   761  	})
   762  	if err != nil {
   763  		return 0, err
   764  	}
   765  
   766  	// Fast path when no control message nor name buffers are provided.
   767  	if msg.ControlLen == 0 && msg.NameLen == 0 {
   768  		n, mflags, _, _, cms, err := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, false, 0)
   769  		if err != nil {
   770  			return 0, syserror.ConvertIntr(err.ToError(), syserror.ERESTARTSYS)
   771  		}
   772  		if !cms.Unix.Empty() {
   773  			mflags |= linux.MSG_CTRUNC
   774  			cms.Release(t)
   775  		}
   776  
   777  		if int(msg.Flags) != mflags {
   778  			// Copy out the flags to the caller.
   779  			if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
   780  				return 0, err
   781  			}
   782  		}
   783  
   784  		return uintptr(n), nil
   785  	}
   786  
   787  	if msg.ControlLen > maxControlLen {
   788  		return 0, linuxerr.ENOBUFS
   789  	}
   790  	n, mflags, sender, senderLen, cms, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, msg.NameLen != 0, msg.ControlLen)
   791  	if e != nil {
   792  		return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
   793  	}
   794  	defer cms.Release(t)
   795  
   796  	controlData := make([]byte, 0, msg.ControlLen)
   797  	controlData = control.PackControlMessages(t, cms, controlData)
   798  
   799  	if cr, ok := s.(transport.Credentialer); ok && cr.Passcred() {
   800  		creds, _ := cms.Unix.Credentials.(control.SCMCredentials)
   801  		controlData, mflags = control.PackCredentials(t, creds, controlData, mflags)
   802  	}
   803  
   804  	if cms.Unix.Rights != nil {
   805  		controlData, mflags = control.PackRightsVFS2(t, cms.Unix.Rights.(control.SCMRightsVFS2), flags&linux.MSG_CMSG_CLOEXEC != 0, controlData, mflags)
   806  	}
   807  
   808  	// Copy the address to the caller.
   809  	if msg.NameLen != 0 {
   810  		if err := writeAddress(t, sender, senderLen, hostarch.Addr(msg.Name), hostarch.Addr(msgPtr+nameLenOffset)); err != nil {
   811  			return 0, err
   812  		}
   813  	}
   814  
   815  	// Copy the control data to the caller.
   816  	if _, err := primitive.CopyUint64Out(t, msgPtr+controlLenOffset, uint64(len(controlData))); err != nil {
   817  		return 0, err
   818  	}
   819  	if len(controlData) > 0 {
   820  		if _, err := t.CopyOutBytes(hostarch.Addr(msg.Control), controlData); err != nil {
   821  			return 0, err
   822  		}
   823  	}
   824  
   825  	// Copy out the flags to the caller.
   826  	if _, err := primitive.CopyInt32Out(t, msgPtr+flagsOffset, int32(mflags)); err != nil {
   827  		return 0, err
   828  	}
   829  
   830  	return uintptr(n), nil
   831  }
   832  
   833  // recvFrom is the implementation of the recvfrom syscall. It is called by
   834  // recvfrom and recv syscall handlers.
   835  func recvFrom(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLenPtr hostarch.Addr) (uintptr, error) {
   836  	if int(bufLen) < 0 {
   837  		return 0, linuxerr.EINVAL
   838  	}
   839  
   840  	// Reject flags that we don't handle yet.
   841  	if flags & ^(baseRecvFlags|linux.MSG_PEEK|linux.MSG_CONFIRM) != 0 {
   842  		return 0, linuxerr.EINVAL
   843  	}
   844  
   845  	// Get socket from the file descriptor.
   846  	file := t.GetFileVFS2(fd)
   847  	if file == nil {
   848  		return 0, linuxerr.EBADF
   849  	}
   850  	defer file.DecRef(t)
   851  
   852  	// Extract the socket.
   853  	s, ok := file.Impl().(socket.SocketVFS2)
   854  	if !ok {
   855  		return 0, syserror.ENOTSOCK
   856  	}
   857  
   858  	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
   859  		flags |= linux.MSG_DONTWAIT
   860  	}
   861  
   862  	dst, err := t.SingleIOSequence(bufPtr, int(bufLen), usermem.IOOpts{
   863  		AddressSpaceActive: true,
   864  	})
   865  	if err != nil {
   866  		return 0, err
   867  	}
   868  
   869  	var haveDeadline bool
   870  	var deadline ktime.Time
   871  	if dl := s.RecvTimeout(); dl > 0 {
   872  		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
   873  		haveDeadline = true
   874  	} else if dl < 0 {
   875  		flags |= linux.MSG_DONTWAIT
   876  	}
   877  
   878  	n, _, sender, senderLen, cm, e := s.RecvMsg(t, dst, int(flags), haveDeadline, deadline, nameLenPtr != 0, 0)
   879  	cm.Release(t)
   880  	if e != nil {
   881  		return 0, syserror.ConvertIntr(e.ToError(), syserror.ERESTARTSYS)
   882  	}
   883  
   884  	// Copy the address to the caller.
   885  	if nameLenPtr != 0 {
   886  		if err := writeAddress(t, sender, senderLen, namePtr, nameLenPtr); err != nil {
   887  			return 0, err
   888  		}
   889  	}
   890  
   891  	return uintptr(n), nil
   892  }
   893  
   894  // RecvFrom implements the linux syscall recvfrom(2).
   895  func RecvFrom(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   896  	fd := args[0].Int()
   897  	bufPtr := args[1].Pointer()
   898  	bufLen := args[2].Uint64()
   899  	flags := args[3].Int()
   900  	namePtr := args[4].Pointer()
   901  	nameLenPtr := args[5].Pointer()
   902  
   903  	n, err := recvFrom(t, fd, bufPtr, bufLen, flags, namePtr, nameLenPtr)
   904  	return n, nil, err
   905  }
   906  
   907  // SendMsg implements the linux syscall sendmsg(2).
   908  func SendMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   909  	fd := args[0].Int()
   910  	msgPtr := args[1].Pointer()
   911  	flags := args[2].Int()
   912  
   913  	if t.Arch().Width() != 8 {
   914  		// We only handle 64-bit for now.
   915  		return 0, nil, linuxerr.EINVAL
   916  	}
   917  
   918  	// Get socket from the file descriptor.
   919  	file := t.GetFileVFS2(fd)
   920  	if file == nil {
   921  		return 0, nil, linuxerr.EBADF
   922  	}
   923  	defer file.DecRef(t)
   924  
   925  	// Extract the socket.
   926  	s, ok := file.Impl().(socket.SocketVFS2)
   927  	if !ok {
   928  		return 0, nil, syserror.ENOTSOCK
   929  	}
   930  
   931  	// Reject flags that we don't handle yet.
   932  	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
   933  		return 0, nil, linuxerr.EINVAL
   934  	}
   935  
   936  	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
   937  		flags |= linux.MSG_DONTWAIT
   938  	}
   939  
   940  	n, err := sendSingleMsg(t, s, file, msgPtr, flags)
   941  	return n, nil, err
   942  }
   943  
   944  // SendMMsg implements the linux syscall sendmmsg(2).
   945  func SendMMsg(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
   946  	fd := args[0].Int()
   947  	msgPtr := args[1].Pointer()
   948  	vlen := args[2].Uint()
   949  	flags := args[3].Int()
   950  
   951  	if t.Arch().Width() != 8 {
   952  		// We only handle 64-bit for now.
   953  		return 0, nil, linuxerr.EINVAL
   954  	}
   955  
   956  	if vlen > linux.UIO_MAXIOV {
   957  		vlen = linux.UIO_MAXIOV
   958  	}
   959  
   960  	// Get socket from the file descriptor.
   961  	file := t.GetFileVFS2(fd)
   962  	if file == nil {
   963  		return 0, nil, linuxerr.EBADF
   964  	}
   965  	defer file.DecRef(t)
   966  
   967  	// Extract the socket.
   968  	s, ok := file.Impl().(socket.SocketVFS2)
   969  	if !ok {
   970  		return 0, nil, syserror.ENOTSOCK
   971  	}
   972  
   973  	// Reject flags that we don't handle yet.
   974  	if flags & ^(linux.MSG_DONTWAIT|linux.MSG_EOR|linux.MSG_MORE|linux.MSG_NOSIGNAL) != 0 {
   975  		return 0, nil, linuxerr.EINVAL
   976  	}
   977  
   978  	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
   979  		flags |= linux.MSG_DONTWAIT
   980  	}
   981  
   982  	var count uint32
   983  	var err error
   984  	for i := uint64(0); i < uint64(vlen); i++ {
   985  		mp, ok := msgPtr.AddLength(i * multipleMessageHeader64Len)
   986  		if !ok {
   987  			return 0, nil, syserror.EFAULT
   988  		}
   989  		var n uintptr
   990  		if n, err = sendSingleMsg(t, s, file, mp, flags); err != nil {
   991  			break
   992  		}
   993  
   994  		// Copy the received length to the caller.
   995  		lp, ok := mp.AddLength(messageHeader64Len)
   996  		if !ok {
   997  			return 0, nil, syserror.EFAULT
   998  		}
   999  		if _, err = primitive.CopyUint32Out(t, lp, uint32(n)); err != nil {
  1000  			break
  1001  		}
  1002  		count++
  1003  	}
  1004  
  1005  	if count == 0 {
  1006  		return 0, nil, err
  1007  	}
  1008  	return uintptr(count), nil, nil
  1009  }
  1010  
  1011  func sendSingleMsg(t *kernel.Task, s socket.SocketVFS2, file *vfs.FileDescription, msgPtr hostarch.Addr, flags int32) (uintptr, error) {
  1012  	// Capture the message header.
  1013  	var msg MessageHeader64
  1014  	if _, err := msg.CopyIn(t, msgPtr); err != nil {
  1015  		return 0, err
  1016  	}
  1017  
  1018  	var controlData []byte
  1019  	if msg.ControlLen > 0 {
  1020  		// Put an upper bound to prevent large allocations.
  1021  		if msg.ControlLen > maxControlLen {
  1022  			return 0, linuxerr.ENOBUFS
  1023  		}
  1024  		controlData = make([]byte, msg.ControlLen)
  1025  		if _, err := t.CopyInBytes(hostarch.Addr(msg.Control), controlData); err != nil {
  1026  			return 0, err
  1027  		}
  1028  	}
  1029  
  1030  	// Read the destination address if one is specified.
  1031  	var to []byte
  1032  	if msg.NameLen != 0 {
  1033  		var err error
  1034  		to, err = CaptureAddress(t, hostarch.Addr(msg.Name), msg.NameLen)
  1035  		if err != nil {
  1036  			return 0, err
  1037  		}
  1038  	}
  1039  
  1040  	// Read data then call the sendmsg implementation.
  1041  	if msg.IovLen > linux.UIO_MAXIOV {
  1042  		return 0, linuxerr.EMSGSIZE
  1043  	}
  1044  	src, err := t.IovecsIOSequence(hostarch.Addr(msg.Iov), int(msg.IovLen), usermem.IOOpts{
  1045  		AddressSpaceActive: true,
  1046  	})
  1047  	if err != nil {
  1048  		return 0, err
  1049  	}
  1050  
  1051  	controlMessages, err := control.Parse(t, s, controlData, t.Arch().Width())
  1052  	if err != nil {
  1053  		return 0, err
  1054  	}
  1055  
  1056  	var haveDeadline bool
  1057  	var deadline ktime.Time
  1058  	if dl := s.SendTimeout(); dl > 0 {
  1059  		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
  1060  		haveDeadline = true
  1061  	} else if dl < 0 {
  1062  		flags |= linux.MSG_DONTWAIT
  1063  	}
  1064  
  1065  	// Call the syscall implementation.
  1066  	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, controlMessages)
  1067  	err = slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendmsg", file)
  1068  	// Control messages should be released on error as well as for zero-length
  1069  	// messages, which are discarded by the receiver.
  1070  	if n == 0 || err != nil {
  1071  		controlMessages.Release(t)
  1072  	}
  1073  	return uintptr(n), err
  1074  }
  1075  
  1076  // sendTo is the implementation of the sendto syscall. It is called by sendto
  1077  // and send syscall handlers.
  1078  func sendTo(t *kernel.Task, fd int32, bufPtr hostarch.Addr, bufLen uint64, flags int32, namePtr hostarch.Addr, nameLen uint32) (uintptr, error) {
  1079  	bl := int(bufLen)
  1080  	if bl < 0 {
  1081  		return 0, linuxerr.EINVAL
  1082  	}
  1083  
  1084  	// Get socket from the file descriptor.
  1085  	file := t.GetFileVFS2(fd)
  1086  	if file == nil {
  1087  		return 0, linuxerr.EBADF
  1088  	}
  1089  	defer file.DecRef(t)
  1090  
  1091  	// Extract the socket.
  1092  	s, ok := file.Impl().(socket.SocketVFS2)
  1093  	if !ok {
  1094  		return 0, syserror.ENOTSOCK
  1095  	}
  1096  
  1097  	if (file.StatusFlags() & linux.SOCK_NONBLOCK) != 0 {
  1098  		flags |= linux.MSG_DONTWAIT
  1099  	}
  1100  
  1101  	// Read the destination address if one is specified.
  1102  	var to []byte
  1103  	var err error
  1104  	if namePtr != 0 {
  1105  		to, err = CaptureAddress(t, namePtr, nameLen)
  1106  		if err != nil {
  1107  			return 0, err
  1108  		}
  1109  	}
  1110  
  1111  	src, err := t.SingleIOSequence(bufPtr, bl, usermem.IOOpts{
  1112  		AddressSpaceActive: true,
  1113  	})
  1114  	if err != nil {
  1115  		return 0, err
  1116  	}
  1117  
  1118  	var haveDeadline bool
  1119  	var deadline ktime.Time
  1120  	if dl := s.SendTimeout(); dl > 0 {
  1121  		deadline = t.Kernel().MonotonicClock().Now().Add(time.Duration(dl) * time.Nanosecond)
  1122  		haveDeadline = true
  1123  	} else if dl < 0 {
  1124  		flags |= linux.MSG_DONTWAIT
  1125  	}
  1126  
  1127  	// Call the syscall implementation.
  1128  	n, e := s.SendMsg(t, src, to, int(flags), haveDeadline, deadline, socket.ControlMessages{Unix: control.New(t, s, nil)})
  1129  	return uintptr(n), slinux.HandleIOErrorVFS2(t, n != 0, e.ToError(), syserror.ERESTARTSYS, "sendto", file)
  1130  }
  1131  
  1132  // SendTo implements the linux syscall sendto(2).
  1133  func SendTo(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
  1134  	fd := args[0].Int()
  1135  	bufPtr := args[1].Pointer()
  1136  	bufLen := args[2].Uint64()
  1137  	flags := args[3].Int()
  1138  	namePtr := args[4].Pointer()
  1139  	nameLen := args[5].Uint()
  1140  
  1141  	n, err := sendTo(t, fd, bufPtr, bufLen, flags, namePtr, nameLen)
  1142  	return n, nil, err
  1143  }