gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/socket/hostinet/socket.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package hostinet
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"golang.org/x/sys/unix"
    21  	"gvisor.dev/gvisor/pkg/abi/linux"
    22  	"gvisor.dev/gvisor/pkg/atomicbitops"
    23  	"gvisor.dev/gvisor/pkg/context"
    24  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    25  	"gvisor.dev/gvisor/pkg/fdnotifier"
    26  	"gvisor.dev/gvisor/pkg/log"
    27  	"gvisor.dev/gvisor/pkg/marshal/primitive"
    28  	"gvisor.dev/gvisor/pkg/safemem"
    29  	"gvisor.dev/gvisor/pkg/sentry/arch"
    30  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sockfs"
    31  	"gvisor.dev/gvisor/pkg/sentry/hostfd"
    32  	"gvisor.dev/gvisor/pkg/sentry/kernel"
    33  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    34  	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
    35  	"gvisor.dev/gvisor/pkg/sentry/socket"
    36  	"gvisor.dev/gvisor/pkg/sentry/socket/control"
    37  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    38  	"gvisor.dev/gvisor/pkg/syserr"
    39  	"gvisor.dev/gvisor/pkg/usermem"
    40  	"gvisor.dev/gvisor/pkg/waiter"
    41  )
    42  
    43  const (
    44  	// sizeofSockaddr is the size in bytes of the largest sockaddr type
    45  	// supported by this package.
    46  	sizeofSockaddr = unix.SizeofSockaddrInet6 // sizeof(sockaddr_in6) > sizeof(sockaddr_in)
    47  
    48  	// maxControlLen is the maximum size of a control message buffer used in a
    49  	// recvmsg or sendmsg unix.
    50  	maxControlLen = 1024
    51  )
    52  
    53  // AllowedSocketType is a tuple of socket family, type, and protocol.
    54  type AllowedSocketType struct {
    55  	Family int
    56  	Type   int
    57  
    58  	// Protocol of AllowAllProtocols indicates that all protocols are
    59  	// allowed.
    60  	Protocol int
    61  }
    62  
    63  // AllowAllProtocols indicates that all protocols are allowed by the stack and
    64  // in the syscall filters.
    65  var AllowAllProtocols = -1
    66  
    67  // AllowedSocketTypes are the socket types which are supported by hostinet.
    68  // These are used to validate the arguments to socket(), and also to generate
    69  // syscall filters.
    70  var AllowedSocketTypes = []AllowedSocketType{
    71  	// Family, Type, Protocol.
    72  	{unix.AF_INET, unix.SOCK_STREAM, unix.IPPROTO_TCP},
    73  	{unix.AF_INET, unix.SOCK_DGRAM, unix.IPPROTO_UDP},
    74  	{unix.AF_INET, unix.SOCK_DGRAM, unix.IPPROTO_ICMP},
    75  
    76  	{unix.AF_INET6, unix.SOCK_STREAM, unix.IPPROTO_TCP},
    77  	{unix.AF_INET6, unix.SOCK_DGRAM, unix.IPPROTO_UDP},
    78  	{unix.AF_INET6, unix.SOCK_DGRAM, unix.IPPROTO_ICMPV6},
    79  }
    80  
    81  // AllowedRawSocketTypes are the socket types which are supported by hostinet
    82  // with raw sockets enabled.
    83  var AllowedRawSocketTypes = []AllowedSocketType{
    84  	{unix.AF_INET, unix.SOCK_RAW, unix.IPPROTO_RAW},
    85  	{unix.AF_INET, unix.SOCK_RAW, unix.IPPROTO_TCP},
    86  	{unix.AF_INET, unix.SOCK_RAW, unix.IPPROTO_UDP},
    87  	{unix.AF_INET, unix.SOCK_RAW, unix.IPPROTO_ICMP},
    88  
    89  	{unix.AF_INET6, unix.SOCK_RAW, unix.IPPROTO_RAW},
    90  	{unix.AF_INET6, unix.SOCK_RAW, unix.IPPROTO_TCP},
    91  	{unix.AF_INET6, unix.SOCK_RAW, unix.IPPROTO_UDP},
    92  	{unix.AF_INET6, unix.SOCK_RAW, unix.IPPROTO_ICMPV6},
    93  
    94  	// AF_PACKET do not allow Write or SendMsg.
    95  	{unix.AF_PACKET, unix.SOCK_DGRAM, AllowAllProtocols},
    96  	{unix.AF_PACKET, unix.SOCK_RAW, AllowAllProtocols},
    97  }
    98  
    99  // Socket implements socket.Socket (and by extension, vfs.FileDescriptionImpl)
   100  // for host sockets.
   101  //
   102  // +stateify savable
   103  type Socket struct {
   104  	vfsfd vfs.FileDescription
   105  	vfs.FileDescriptionDefaultImpl
   106  	vfs.LockFD
   107  	// We store metadata for hostinet sockets internally. Technically, we should
   108  	// access metadata (e.g. through stat, chmod) on the host for correctness,
   109  	// but this is not very useful for inet socket fds, which do not belong to a
   110  	// concrete file anyway.
   111  	vfs.DentryMetadataFileDescriptionImpl
   112  	socket.SendReceiveTimeout
   113  
   114  	family   int            // Read-only.
   115  	stype    linux.SockType // Read-only.
   116  	protocol int            // Read-only.
   117  	queue    waiter.Queue
   118  
   119  	// fd is the host socket fd. It must have O_NONBLOCK, so that operations
   120  	// will return EWOULDBLOCK instead of blocking on the host. This allows us to
   121  	// handle blocking behavior independently in the sentry.
   122  	fd int
   123  
   124  	// recvClosed indicates that the socket has been shutdown for reading
   125  	// (SHUT_RD or SHUT_RDWR).
   126  	recvClosed atomicbitops.Bool
   127  }
   128  
   129  var _ = socket.Socket(&Socket{})
   130  
   131  func newSocket(t *kernel.Task, family int, stype linux.SockType, protocol int, fd int, flags uint32) (*vfs.FileDescription, *syserr.Error) {
   132  	mnt := t.Kernel().SocketMount()
   133  	d := sockfs.NewDentry(t, mnt)
   134  	defer d.DecRef(t)
   135  
   136  	s := &Socket{
   137  		family:   family,
   138  		stype:    stype,
   139  		protocol: protocol,
   140  		fd:       fd,
   141  	}
   142  	s.LockFD.Init(&vfs.FileLocks{})
   143  	if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil {
   144  		return nil, syserr.FromError(err)
   145  	}
   146  	vfsfd := &s.vfsfd
   147  	if err := vfsfd.Init(s, linux.O_RDWR|(flags&linux.O_NONBLOCK), mnt, d, &vfs.FileDescriptionOptions{
   148  		DenyPRead:         true,
   149  		DenyPWrite:        true,
   150  		UseDentryMetadata: true,
   151  	}); err != nil {
   152  		fdnotifier.RemoveFD(int32(s.fd))
   153  		return nil, syserr.FromError(err)
   154  	}
   155  	return vfsfd, nil
   156  }
   157  
   158  // Release implements vfs.FileDescriptionImpl.Release.
   159  func (s *Socket) Release(ctx context.Context) {
   160  	kernel.KernelFromContext(ctx).DeleteSocket(&s.vfsfd)
   161  	fdnotifier.RemoveFD(int32(s.fd))
   162  	_ = unix.Close(s.fd)
   163  }
   164  
   165  // Epollable implements FileDescriptionImpl.Epollable.
   166  func (s *Socket) Epollable() bool {
   167  	return true
   168  }
   169  
   170  // Ioctl implements vfs.FileDescriptionImpl.
   171  func (s *Socket) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
   172  	return ioctl(ctx, s.fd, uio, sysno, args)
   173  }
   174  
   175  // PRead implements vfs.FileDescriptionImpl.PRead.
   176  func (s *Socket) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   177  	return 0, linuxerr.ESPIPE
   178  }
   179  
   180  // Read implements vfs.FileDescriptionImpl.
   181  func (s *Socket) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   182  	// All flags other than RWF_NOWAIT should be ignored.
   183  	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
   184  	if opts.Flags != 0 {
   185  		return 0, linuxerr.EOPNOTSUPP
   186  	}
   187  
   188  	reader := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags)
   189  	defer hostfd.PutReadWriterAt(reader)
   190  	n, err := dst.CopyOutFrom(ctx, reader)
   191  	return int64(n), err
   192  }
   193  
   194  // PWrite implements vfs.FileDescriptionImpl.
   195  func (s *Socket) PWrite(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   196  	return 0, linuxerr.ESPIPE
   197  }
   198  
   199  // Write implements vfs.FileDescriptionImpl.
   200  func (s *Socket) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   201  	if s.family == linux.AF_PACKET {
   202  		// Don't allow Write for AF_PACKET.
   203  		return 0, linuxerr.EACCES
   204  	}
   205  
   206  	// All flags other than RWF_NOWAIT should be ignored.
   207  	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
   208  	if opts.Flags != 0 {
   209  		return 0, linuxerr.EOPNOTSUPP
   210  	}
   211  
   212  	writer := hostfd.GetReadWriterAt(int32(s.fd), -1, opts.Flags)
   213  	defer hostfd.PutReadWriterAt(writer)
   214  	n, err := src.CopyInTo(ctx, writer)
   215  	return int64(n), err
   216  }
   217  
   218  type socketProvider struct {
   219  	family int
   220  }
   221  
   222  // Socket implements socket.Provider.Socket.
   223  func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*vfs.FileDescription, *syserr.Error) {
   224  	// Check that we are using the host network stack.
   225  	netCtx := t.NetworkContext()
   226  	if netCtx == nil {
   227  		return nil, nil
   228  	}
   229  	stack, ok := netCtx.(*Stack)
   230  	if !ok {
   231  		return nil, nil
   232  	}
   233  
   234  	stype := stypeflags & linux.SOCK_TYPE_MASK
   235  
   236  	// Raw and packet sockets require CAP_NET_RAW.
   237  	if stype == linux.SOCK_RAW || p.family == linux.AF_PACKET {
   238  		if creds := auth.CredentialsFromContext(t); !creds.HasCapability(linux.CAP_NET_RAW) {
   239  			return nil, syserr.ErrNotPermitted
   240  		}
   241  	}
   242  
   243  	// Convert generic IPPROTO_IP protocol to the actual protocol depending
   244  	// on family and type.
   245  	if protocol == linux.IPPROTO_IP && (p.family == linux.AF_INET || p.family == linux.AF_INET6) {
   246  		switch stype {
   247  		case linux.SOCK_STREAM:
   248  			protocol = linux.IPPROTO_TCP
   249  		case linux.SOCK_DGRAM:
   250  			protocol = linux.IPPROTO_UDP
   251  		}
   252  	}
   253  
   254  	// Validate the socket based on family, type, and protocol.
   255  	var supported bool
   256  	for _, allowed := range stack.allowedSocketTypes {
   257  		isAllowedFamily := p.family == allowed.Family
   258  		isAllowedType := int(stype) == allowed.Type
   259  		isAllowedProtocol := protocol == allowed.Protocol || allowed.Protocol == AllowAllProtocols
   260  		if isAllowedFamily && isAllowedType && isAllowedProtocol {
   261  			supported = true
   262  			break
   263  		}
   264  	}
   265  	if !supported {
   266  		// Return nil error here to give other socket providers a
   267  		// chance to create this socket.
   268  		return nil, nil
   269  	}
   270  
   271  	// Conservatively ignore all flags specified by the application and add
   272  	// SOCK_NONBLOCK since socketOperations requires it.
   273  	st := int(stype) | unix.SOCK_NONBLOCK | unix.SOCK_CLOEXEC
   274  	fd, err := unix.Socket(p.family, st, protocol)
   275  	if err != nil {
   276  		return nil, syserr.FromError(err)
   277  	}
   278  	return newSocket(t, p.family, stype, protocol, fd, uint32(stypeflags&unix.SOCK_NONBLOCK))
   279  }
   280  
   281  // Pair implements socket.Provider.Pair.
   282  func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*vfs.FileDescription, *vfs.FileDescription, *syserr.Error) {
   283  	// Not supported by AF_INET/AF_INET6.
   284  	return nil, nil, nil
   285  }
   286  
   287  // Readiness implements waiter.Waitable.Readiness.
   288  func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask {
   289  	return fdnotifier.NonBlockingPoll(int32(s.fd), mask)
   290  }
   291  
   292  // EventRegister implements waiter.Waitable.EventRegister.
   293  func (s *Socket) EventRegister(e *waiter.Entry) error {
   294  	s.queue.EventRegister(e)
   295  	if err := fdnotifier.UpdateFD(int32(s.fd)); err != nil {
   296  		s.queue.EventUnregister(e)
   297  		return err
   298  	}
   299  	return nil
   300  }
   301  
   302  // EventUnregister implements waiter.Waitable.EventUnregister.
   303  func (s *Socket) EventUnregister(e *waiter.Entry) {
   304  	s.queue.EventUnregister(e)
   305  	if err := fdnotifier.UpdateFD(int32(s.fd)); err != nil {
   306  		panic(err)
   307  	}
   308  }
   309  
   310  // Connect implements socket.Socket.Connect.
   311  func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
   312  	if len(sockaddr) > sizeofSockaddr {
   313  		sockaddr = sockaddr[:sizeofSockaddr]
   314  	}
   315  
   316  	_, _, errno := unix.Syscall(unix.SYS_CONNECT, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr)))
   317  	if errno == 0 {
   318  		return nil
   319  	}
   320  	// The host socket is always non-blocking, so we expect connect to
   321  	// return EINPROGRESS. If we are emulating a blocking socket, we will
   322  	// wait for the connect to complete below.
   323  	// But if we are not emulating a blocking socket, or if we got some
   324  	// other error, then return it now.
   325  	if errno != unix.EINPROGRESS || !blocking {
   326  		return syserr.FromError(translateIOSyscallError(errno))
   327  	}
   328  
   329  	// "EINPROGRESS: The socket is nonblocking and the connection cannot be
   330  	// completed immediately. It is possible to select(2) or poll(2) for
   331  	// completion by selecting the socket for writing. After select(2)
   332  	// indicates writability, use getsockopt(2) to read the SO_ERROR option at
   333  	// level SOL-SOCKET to determine whether connect() completed successfully
   334  	// (SO_ERROR is zero) or unsuccessfully (SO_ERROR is one of the usual error
   335  	// codes listed here, explaining the reason for the failure)." - connect(2)
   336  	writableMask := waiter.WritableEvents
   337  	e, ch := waiter.NewChannelEntry(writableMask)
   338  	s.EventRegister(&e)
   339  	defer s.EventUnregister(&e)
   340  	if s.Readiness(writableMask)&writableMask == 0 {
   341  		if err := t.Block(ch); err != nil {
   342  			return syserr.FromError(err)
   343  		}
   344  	}
   345  
   346  	val, err := unix.GetsockoptInt(s.fd, unix.SOL_SOCKET, unix.SO_ERROR)
   347  	if err != nil {
   348  		return syserr.FromError(err)
   349  	}
   350  	if val != 0 {
   351  		return syserr.FromError(unix.Errno(uintptr(val)))
   352  	}
   353  
   354  	// It seems like we are all good now, but Linux has left the socket
   355  	// state as CONNECTING (not CONNECTED). This is a strange quirk of
   356  	// non-blocking sockets. See tcp_finish_connect() which sets tcp state
   357  	// but not socket state.
   358  	//
   359  	// Sockets in the CONNECTING state can call connect() a second time,
   360  	// whereas CONNECTED sockets will reject the second connect() call.
   361  	// Because we are emulating a blocking socket, we want a subsequent
   362  	// connect() call to fail. So we must kick Linux to update the socket
   363  	// to state CONNECTED, which we can do by calling connect() a second
   364  	// time ourselves.
   365  	_, _, errno = unix.Syscall(unix.SYS_CONNECT, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr)))
   366  	if errno != 0 && errno != unix.EALREADY {
   367  		return syserr.FromError(translateIOSyscallError(errno))
   368  	}
   369  	return nil
   370  }
   371  
   372  // Accept implements socket.Socket.Accept.
   373  func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
   374  	var peerAddr linux.SockAddr
   375  	var peerAddrBuf []byte
   376  	var peerAddrlen uint32
   377  	var peerAddrPtr *byte
   378  	var peerAddrlenPtr *uint32
   379  	if peerRequested {
   380  		peerAddrBuf = make([]byte, sizeofSockaddr)
   381  		peerAddrlen = uint32(len(peerAddrBuf))
   382  		peerAddrPtr = &peerAddrBuf[0]
   383  		peerAddrlenPtr = &peerAddrlen
   384  	}
   385  
   386  	// Conservatively ignore all flags specified by the application and add
   387  	// SOCK_NONBLOCK since socketOpsCommon requires it.
   388  	fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC)
   389  	if blocking {
   390  		var ch chan struct{}
   391  		for linuxerr.Equals(linuxerr.ErrWouldBlock, syscallErr) {
   392  			if ch != nil {
   393  				if syscallErr = t.Block(ch); syscallErr != nil {
   394  					break
   395  				}
   396  			} else {
   397  				var e waiter.Entry
   398  				e, ch = waiter.NewChannelEntry(waiter.ReadableEvents | waiter.EventHUp | waiter.EventErr)
   399  				s.EventRegister(&e)
   400  				defer s.EventUnregister(&e)
   401  			}
   402  			fd, syscallErr = accept4(s.fd, peerAddrPtr, peerAddrlenPtr, unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC)
   403  		}
   404  	}
   405  
   406  	if peerRequested {
   407  		peerAddr = socket.UnmarshalSockAddr(s.family, peerAddrBuf[:peerAddrlen])
   408  	}
   409  	if syscallErr != nil {
   410  		return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr)
   411  	}
   412  
   413  	var (
   414  		kfd  int32
   415  		kerr error
   416  	)
   417  	f, err := newSocket(t, s.family, s.stype, s.protocol, fd, uint32(flags&unix.SOCK_NONBLOCK))
   418  	if err != nil {
   419  		_ = unix.Close(fd)
   420  		return 0, nil, 0, err
   421  	}
   422  	defer f.DecRef(t)
   423  
   424  	kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{
   425  		CloseOnExec: flags&unix.SOCK_CLOEXEC != 0,
   426  	})
   427  	t.Kernel().RecordSocket(f)
   428  
   429  	return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr)
   430  }
   431  
   432  // Bind implements socket.Socket.Bind.
   433  func (s *Socket) Bind(_ *kernel.Task, sockaddr []byte) *syserr.Error {
   434  	if len(sockaddr) > sizeofSockaddr {
   435  		sockaddr = sockaddr[:sizeofSockaddr]
   436  	}
   437  
   438  	_, _, errno := unix.Syscall(unix.SYS_BIND, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr)))
   439  	if errno != 0 {
   440  		return syserr.FromError(errno)
   441  	}
   442  	return nil
   443  }
   444  
   445  // Listen implements socket.Socket.Listen.
   446  func (s *Socket) Listen(_ *kernel.Task, backlog int) *syserr.Error {
   447  	return syserr.FromError(unix.Listen(s.fd, backlog))
   448  }
   449  
   450  // Shutdown implements socket.Socket.Shutdown.
   451  func (s *Socket) Shutdown(_ *kernel.Task, how int) *syserr.Error {
   452  	switch how {
   453  	case unix.SHUT_RD, unix.SHUT_RDWR:
   454  		// Mark the socket as closed for reading.
   455  		s.recvClosed.Store(true)
   456  		fallthrough
   457  	case unix.SHUT_WR:
   458  		return syserr.FromError(unix.Shutdown(s.fd, how))
   459  	default:
   460  		return syserr.ErrInvalidArgument
   461  	}
   462  }
   463  
   464  func (s *Socket) recvMsgFromHost(iovs []unix.Iovec, flags int, senderRequested bool, controlLen uint64) (uint64, int, []byte, []byte, error) {
   465  	// We always do a non-blocking recv*().
   466  	sysflags := flags | unix.MSG_DONTWAIT
   467  
   468  	msg := unix.Msghdr{}
   469  	if len(iovs) > 0 {
   470  		msg.Iov = &iovs[0]
   471  		msg.Iovlen = uint64(len(iovs))
   472  	}
   473  	var senderAddrBuf []byte
   474  	if senderRequested {
   475  		senderAddrBuf = make([]byte, sizeofSockaddr)
   476  		msg.Name = &senderAddrBuf[0]
   477  		msg.Namelen = uint32(sizeofSockaddr)
   478  	}
   479  	var controlBuf []byte
   480  	if controlLen > 0 {
   481  		if controlLen > maxControlLen {
   482  			controlLen = maxControlLen
   483  		}
   484  		controlBuf = make([]byte, controlLen)
   485  		msg.Control = &controlBuf[0]
   486  		msg.Controllen = controlLen
   487  	}
   488  	n, err := recvmsg(s.fd, &msg, sysflags)
   489  	if err != nil {
   490  		return 0 /* n */, 0 /* mFlags */, nil /* senderAddrBuf */, nil /* controlBuf */, err
   491  	}
   492  	return n, int(msg.Flags), senderAddrBuf[:msg.Namelen], controlBuf[:msg.Controllen], err
   493  }
   494  
   495  const allowedRecvMsgFlags = unix.MSG_CTRUNC |
   496  	unix.MSG_DONTWAIT |
   497  	unix.MSG_ERRQUEUE |
   498  	unix.MSG_OOB |
   499  	unix.MSG_PEEK |
   500  	unix.MSG_TRUNC |
   501  	unix.MSG_WAITALL
   502  
   503  // RecvMsg implements socket.Socket.RecvMsg.
   504  func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
   505  	// Only allow known and safe flags.
   506  	if flags&^allowedRecvMsgFlags != 0 {
   507  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument
   508  	}
   509  
   510  	var senderAddrBuf []byte
   511  	var controlBuf []byte
   512  	var msgFlags int
   513  	copyToDst := func() (int64, error) {
   514  		var n uint64
   515  		var err error
   516  		if dst.NumBytes() == 0 {
   517  			// We want to make the recvmsg(2) call to the host even if dst is empty
   518  			// to fetch control messages, sender address or errors if any occur.
   519  			n, msgFlags, senderAddrBuf, controlBuf, err = s.recvMsgFromHost(nil, flags, senderRequested, controlLen)
   520  			return int64(n), err
   521  		}
   522  
   523  		recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
   524  			// Refuse to do anything if any part of dst.Addrs was unusable.
   525  			if uint64(dst.NumBytes()) != dsts.NumBytes() {
   526  				return 0, nil
   527  			}
   528  			if dsts.IsEmpty() {
   529  				return 0, nil
   530  			}
   531  
   532  			n, msgFlags, senderAddrBuf, controlBuf, err = s.recvMsgFromHost(safemem.IovecsFromBlockSeq(dsts), flags, senderRequested, controlLen)
   533  			return n, err
   534  		})
   535  		return dst.CopyOutFrom(t, recvmsgToBlocks)
   536  	}
   537  
   538  	var ch chan struct{}
   539  	n, err := copyToDst()
   540  
   541  	// recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT.
   542  	if flags&(unix.MSG_DONTWAIT|unix.MSG_ERRQUEUE) == 0 {
   543  		for linuxerr.Equals(linuxerr.ErrWouldBlock, err) {
   544  			// We only expect blocking to come from the actual syscall, in which
   545  			// case it can't have returned any data.
   546  			if n != 0 {
   547  				panic(fmt.Sprintf("CopyOutFrom: got (%d, %v), wanted (0, %v)", n, err, err))
   548  			}
   549  			// Are we closed for reading? No sense in trying to read if so.
   550  			if s.recvClosed.Load() {
   551  				break
   552  			}
   553  			if ch != nil {
   554  				if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
   555  					if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
   556  						err = linuxerr.ErrWouldBlock
   557  					}
   558  					break
   559  				}
   560  			} else {
   561  				var e waiter.Entry
   562  				e, ch = waiter.NewChannelEntry(waiter.ReadableEvents | waiter.EventRdHUp | waiter.EventHUp | waiter.EventErr)
   563  				s.EventRegister(&e)
   564  				defer s.EventUnregister(&e)
   565  			}
   566  			n, err = copyToDst()
   567  		}
   568  	}
   569  	if err != nil {
   570  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
   571  	}
   572  
   573  	// In some circumstances (like MSG_PEEK specified), the sender address
   574  	// field is purposefully ignored. recvMsgFromHost will return an empty
   575  	// senderAddrBuf in those cases.
   576  	var senderAddr linux.SockAddr
   577  	if senderRequested && len(senderAddrBuf) > 0 {
   578  		senderAddr = socket.UnmarshalSockAddr(s.family, senderAddrBuf)
   579  	}
   580  
   581  	unixControlMessages, err := unix.ParseSocketControlMessage(controlBuf)
   582  	if err != nil {
   583  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
   584  	}
   585  	return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), parseUnixControlMessages(unixControlMessages), nil
   586  }
   587  
   588  func parseUnixControlMessages(unixControlMessages []unix.SocketControlMessage) socket.ControlMessages {
   589  	controlMessages := socket.ControlMessages{}
   590  	for _, unixCmsg := range unixControlMessages {
   591  		switch unixCmsg.Header.Level {
   592  		case linux.SOL_SOCKET:
   593  			switch unixCmsg.Header.Type {
   594  			case linux.SO_TIMESTAMP:
   595  				controlMessages.IP.HasTimestamp = true
   596  				ts := linux.Timeval{}
   597  				ts.UnmarshalUnsafe(unixCmsg.Data)
   598  				controlMessages.IP.Timestamp = ts.ToTime()
   599  			}
   600  
   601  		case linux.SOL_IP:
   602  			switch unixCmsg.Header.Type {
   603  			case linux.IP_TOS:
   604  				controlMessages.IP.HasTOS = true
   605  				var tos primitive.Uint8
   606  				tos.UnmarshalUnsafe(unixCmsg.Data)
   607  				controlMessages.IP.TOS = uint8(tos)
   608  
   609  			case linux.IP_TTL:
   610  				controlMessages.IP.HasTTL = true
   611  				var ttl primitive.Uint32
   612  				ttl.UnmarshalUnsafe(unixCmsg.Data)
   613  				controlMessages.IP.TTL = uint32(ttl)
   614  
   615  			case linux.IP_PKTINFO:
   616  				controlMessages.IP.HasIPPacketInfo = true
   617  				var packetInfo linux.ControlMessageIPPacketInfo
   618  				packetInfo.UnmarshalUnsafe(unixCmsg.Data)
   619  				controlMessages.IP.PacketInfo = packetInfo
   620  
   621  			case linux.IP_RECVORIGDSTADDR:
   622  				var addr linux.SockAddrInet
   623  				addr.UnmarshalUnsafe(unixCmsg.Data)
   624  				controlMessages.IP.OriginalDstAddress = &addr
   625  
   626  			case unix.IP_RECVERR:
   627  				var errCmsg linux.SockErrCMsgIPv4
   628  				errCmsg.UnmarshalBytes(unixCmsg.Data)
   629  				controlMessages.IP.SockErr = &errCmsg
   630  			}
   631  
   632  		case linux.SOL_IPV6:
   633  			switch unixCmsg.Header.Type {
   634  			case linux.IPV6_TCLASS:
   635  				controlMessages.IP.HasTClass = true
   636  				var tclass primitive.Uint32
   637  				tclass.UnmarshalUnsafe(unixCmsg.Data)
   638  				controlMessages.IP.TClass = uint32(tclass)
   639  
   640  			case linux.IPV6_PKTINFO:
   641  				controlMessages.IP.HasIPv6PacketInfo = true
   642  				var packetInfo linux.ControlMessageIPv6PacketInfo
   643  				packetInfo.UnmarshalUnsafe(unixCmsg.Data)
   644  				controlMessages.IP.IPv6PacketInfo = packetInfo
   645  
   646  			case linux.IPV6_HOPLIMIT:
   647  				controlMessages.IP.HasHopLimit = true
   648  				var hoplimit primitive.Uint32
   649  				hoplimit.UnmarshalUnsafe(unixCmsg.Data)
   650  				controlMessages.IP.HopLimit = uint32(hoplimit)
   651  
   652  			case linux.IPV6_RECVORIGDSTADDR:
   653  				var addr linux.SockAddrInet6
   654  				addr.UnmarshalUnsafe(unixCmsg.Data)
   655  				controlMessages.IP.OriginalDstAddress = &addr
   656  
   657  			case unix.IPV6_RECVERR:
   658  				var errCmsg linux.SockErrCMsgIPv6
   659  				errCmsg.UnmarshalBytes(unixCmsg.Data)
   660  				controlMessages.IP.SockErr = &errCmsg
   661  			}
   662  
   663  		case linux.SOL_TCP:
   664  			switch unixCmsg.Header.Type {
   665  			case linux.TCP_INQ:
   666  				controlMessages.IP.HasInq = true
   667  				var inq primitive.Int32
   668  				inq.UnmarshalUnsafe(unixCmsg.Data)
   669  				controlMessages.IP.Inq = int32(inq)
   670  			}
   671  		}
   672  	}
   673  	return controlMessages
   674  }
   675  
   676  const allowedSendMsgFlags = unix.MSG_DONTWAIT |
   677  	unix.MSG_EOR |
   678  	unix.MSG_FASTOPEN |
   679  	unix.MSG_MORE |
   680  	unix.MSG_NOSIGNAL |
   681  	unix.MSG_OOB
   682  
   683  // SendMsg implements socket.Socket.SendMsg.
   684  func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
   685  	if s.family == linux.AF_PACKET {
   686  		// Don't allow SendMesg for AF_PACKET.
   687  		return 0, syserr.ErrPermissionDenied
   688  	}
   689  
   690  	// Only allow known and safe flags.
   691  	if flags&^allowedSendMsgFlags != 0 {
   692  		return 0, syserr.ErrInvalidArgument
   693  	}
   694  
   695  	// If the src is zero-length, call SENDTO directly with a null buffer in
   696  	// order to generate poll/epoll notifications.
   697  	if src.NumBytes() == 0 {
   698  		sysflags := flags | unix.MSG_DONTWAIT
   699  		n, _, errno := unix.Syscall6(unix.SYS_SENDTO, uintptr(s.fd), 0, 0, uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to)))
   700  		if errno != 0 {
   701  			return 0, syserr.FromError(errno)
   702  		}
   703  		return int(n), nil
   704  	}
   705  
   706  	space := uint64(control.CmsgsSpace(t, controlMessages))
   707  	if space > maxControlLen {
   708  		space = maxControlLen
   709  	}
   710  	controlBuf := make([]byte, 0, space)
   711  	// PackControlMessages will append up to space bytes to controlBuf.
   712  	controlBuf = control.PackControlMessages(t, controlMessages, controlBuf)
   713  
   714  	sendmsgFromBlocks := safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) {
   715  		// Refuse to do anything if any part of src.Addrs was unusable.
   716  		if uint64(src.NumBytes()) != srcs.NumBytes() {
   717  			return 0, nil
   718  		}
   719  		if srcs.IsEmpty() && len(controlBuf) == 0 {
   720  			return 0, nil
   721  		}
   722  
   723  		// We always do a non-blocking send*().
   724  		sysflags := flags | unix.MSG_DONTWAIT
   725  
   726  		if srcs.NumBlocks() == 1 && len(controlBuf) == 0 {
   727  			// Skip allocating []unix.Iovec.
   728  			src := srcs.Head()
   729  			n, _, errno := unix.Syscall6(unix.SYS_SENDTO, uintptr(s.fd), src.Addr(), uintptr(src.Len()), uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to)))
   730  			if errno != 0 {
   731  				return 0, translateIOSyscallError(errno)
   732  			}
   733  			return uint64(n), nil
   734  		}
   735  
   736  		iovs := safemem.IovecsFromBlockSeq(srcs)
   737  		msg := unix.Msghdr{
   738  			Iov:    &iovs[0],
   739  			Iovlen: uint64(len(iovs)),
   740  		}
   741  		if len(to) != 0 {
   742  			msg.Name = &to[0]
   743  			msg.Namelen = uint32(len(to))
   744  		}
   745  		if len(controlBuf) != 0 {
   746  			msg.Control = &controlBuf[0]
   747  			msg.Controllen = uint64(len(controlBuf))
   748  		}
   749  		return sendmsg(s.fd, &msg, sysflags)
   750  	})
   751  
   752  	var ch chan struct{}
   753  	n, err := src.CopyInTo(t, sendmsgFromBlocks)
   754  	if flags&unix.MSG_DONTWAIT == 0 {
   755  		for linuxerr.Equals(linuxerr.ErrWouldBlock, err) {
   756  			// We only expect blocking to come from the actual syscall, in which
   757  			// case it can't have returned any data.
   758  			if n != 0 {
   759  				panic(fmt.Sprintf("CopyInTo: got (%d, %v), wanted (0, %v)", n, err, err))
   760  			}
   761  			if ch != nil {
   762  				if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
   763  					if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
   764  						err = linuxerr.ErrWouldBlock
   765  					}
   766  					break
   767  				}
   768  			} else {
   769  				var e waiter.Entry
   770  				e, ch = waiter.NewChannelEntry(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
   771  				s.EventRegister(&e)
   772  				defer s.EventUnregister(&e)
   773  			}
   774  			n, err = src.CopyInTo(t, sendmsgFromBlocks)
   775  		}
   776  	}
   777  
   778  	return int(n), syserr.FromError(err)
   779  }
   780  
   781  func translateIOSyscallError(err error) error {
   782  	if err == unix.EAGAIN || err == unix.EWOULDBLOCK {
   783  		return linuxerr.ErrWouldBlock
   784  	}
   785  	return err
   786  }
   787  
   788  // State implements socket.Socket.State.
   789  func (s *Socket) State() uint32 {
   790  	info := linux.TCPInfo{}
   791  	buf := make([]byte, linux.SizeOfTCPInfo)
   792  	var err error
   793  	buf, err = getsockopt(s.fd, unix.SOL_TCP, unix.TCP_INFO, buf)
   794  	if err != nil {
   795  		if err != unix.ENOPROTOOPT {
   796  			log.Warningf("Failed to get TCP socket info from %+v: %v", s, err)
   797  		}
   798  		// For non-TCP sockets, silently ignore the failure.
   799  		return 0
   800  	}
   801  	if len(buf) != linux.SizeOfTCPInfo {
   802  		// Unmarshal below will panic if getsockopt returns a buffer of
   803  		// unexpected size.
   804  		log.Warningf("Failed to get TCP socket info from %+v: getsockopt(2) returned %d bytes, expecting %d bytes.", s, len(buf), linux.SizeOfTCPInfo)
   805  		return 0
   806  	}
   807  
   808  	info.UnmarshalUnsafe(buf[:info.SizeBytes()])
   809  	return uint32(info.State)
   810  }
   811  
   812  // Type implements socket.Socket.Type.
   813  func (s *Socket) Type() (family int, skType linux.SockType, protocol int) {
   814  	return s.family, s.stype, s.protocol
   815  }
   816  
   817  func init() {
   818  	// Register all families in AllowedSocketTypes and AllowedRawSocket
   819  	// types. If we don't allow raw sockets, they will be rejected in the
   820  	// Socket call.
   821  	registered := make(map[int]struct{})
   822  	for _, sockType := range append(AllowedSocketTypes, AllowedRawSocketTypes...) {
   823  		fam := sockType.Family
   824  		if _, ok := registered[fam]; ok {
   825  			continue
   826  		}
   827  		socket.RegisterProvider(fam, &socketProvider{fam})
   828  		registered[fam] = struct{}{}
   829  	}
   830  }