github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/socket/hostinet/socket.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package hostinet
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"golang.org/x/sys/unix"
    21  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    22  	"github.com/SagerNet/gvisor/pkg/context"
    23  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    24  	"github.com/SagerNet/gvisor/pkg/fdnotifier"
    25  	"github.com/SagerNet/gvisor/pkg/hostarch"
    26  	"github.com/SagerNet/gvisor/pkg/log"
    27  	"github.com/SagerNet/gvisor/pkg/marshal"
    28  	"github.com/SagerNet/gvisor/pkg/marshal/primitive"
    29  	"github.com/SagerNet/gvisor/pkg/safemem"
    30  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    31  	"github.com/SagerNet/gvisor/pkg/sentry/fs"
    32  	"github.com/SagerNet/gvisor/pkg/sentry/fs/fsutil"
    33  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    34  	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
    35  	"github.com/SagerNet/gvisor/pkg/sentry/socket"
    36  	"github.com/SagerNet/gvisor/pkg/sentry/socket/control"
    37  	"github.com/SagerNet/gvisor/pkg/syserr"
    38  	"github.com/SagerNet/gvisor/pkg/syserror"
    39  	"github.com/SagerNet/gvisor/pkg/usermem"
    40  	"github.com/SagerNet/gvisor/pkg/waiter"
    41  )
    42  
const (
	// sizeofInt32 is the size in bytes of an int32. It is the option length
	// used for the integer-valued socket options handled in this file.
	sizeofInt32 = 4

	// sizeofSockaddr is the size in bytes of the largest sockaddr type
	// supported by this package.
	sizeofSockaddr = unix.SizeofSockaddrInet6 // sizeof(sockaddr_in6) > sizeof(sockaddr_in)

	// maxControlLen is the maximum size of a control message buffer used in a
	// recvmsg or sendmsg syscall.
	maxControlLen = 1024
)
    54  
    55  // LINT.IfChange
    56  
// socketOperations implements fs.FileOperations and socket.Socket for a socket
// implemented using a host socket.
//
// Only Ioctl, Read and Write are defined directly on this type in this file;
// the socket operations themselves come from the embedded socketOpsCommon.
type socketOperations struct {
	// The embedded fsutil types are excluded from save/restore via the
	// `state:"nosave"` tag. NOTE(review): presumably they supply the remaining
	// fs.FileOperations methods with default behavior - see package fsutil.
	fsutil.FilePipeSeek             `state:"nosave"`
	fsutil.FileNotDirReaddir        `state:"nosave"`
	fsutil.FileNoFsync              `state:"nosave"`
	fsutil.FileNoMMap               `state:"nosave"`
	fsutil.FileNoSplice             `state:"nosave"`
	fsutil.FileNoopFlush            `state:"nosave"`
	fsutil.FileUseInodeUnstableAttr `state:"nosave"`

	// socketOpsCommon holds the host fd and the socket state shared by the
	// VFS1 and VFS2 implementations.
	socketOpsCommon
}
    70  
    71  var _ = socket.Socket(&socketOperations{})
    72  
    73  func newSocketFile(ctx context.Context, family int, stype linux.SockType, protocol int, fd int, nonblock bool) (*fs.File, *syserr.Error) {
    74  	s := &socketOperations{
    75  		socketOpsCommon: socketOpsCommon{
    76  			family:   family,
    77  			stype:    stype,
    78  			protocol: protocol,
    79  			fd:       fd,
    80  		},
    81  	}
    82  	if err := fdnotifier.AddFD(int32(fd), &s.queue); err != nil {
    83  		return nil, syserr.FromError(err)
    84  	}
    85  	dirent := socket.NewDirent(ctx, socketDevice)
    86  	defer dirent.DecRef(ctx)
    87  	return fs.NewFile(ctx, dirent, fs.FileFlags{NonBlocking: nonblock, Read: true, Write: true, NonSeekable: true}, s), nil
    88  }
    89  
    90  // Ioctl implements fs.FileOperations.Ioctl.
    91  func (s *socketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
    92  	return ioctl(ctx, s.fd, io, args)
    93  }
    94  
    95  // Read implements fs.FileOperations.Read.
    96  func (s *socketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
    97  	n, err := dst.CopyOutFrom(ctx, safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
    98  		// Refuse to do anything if any part of dst.Addrs was unusable.
    99  		if uint64(dst.NumBytes()) != dsts.NumBytes() {
   100  			return 0, nil
   101  		}
   102  		if dsts.IsEmpty() {
   103  			return 0, nil
   104  		}
   105  		if dsts.NumBlocks() == 1 {
   106  			// Skip allocating []unix.Iovec.
   107  			n, err := unix.Read(s.fd, dsts.Head().ToSlice())
   108  			if err != nil {
   109  				return 0, translateIOSyscallError(err)
   110  			}
   111  			return uint64(n), nil
   112  		}
   113  		return readv(s.fd, safemem.IovecsFromBlockSeq(dsts))
   114  	}))
   115  	return int64(n), err
   116  }
   117  
   118  // Write implements fs.FileOperations.Write.
   119  func (s *socketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
   120  	n, err := src.CopyInTo(ctx, safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) {
   121  		// Refuse to do anything if any part of src.Addrs was unusable.
   122  		if uint64(src.NumBytes()) != srcs.NumBytes() {
   123  			return 0, nil
   124  		}
   125  		if srcs.IsEmpty() {
   126  			return 0, nil
   127  		}
   128  		if srcs.NumBlocks() == 1 {
   129  			// Skip allocating []unix.Iovec.
   130  			n, err := unix.Write(s.fd, srcs.Head().ToSlice())
   131  			if err != nil {
   132  				return 0, translateIOSyscallError(err)
   133  			}
   134  			return uint64(n), nil
   135  		}
   136  		return writev(s.fd, safemem.IovecsFromBlockSeq(srcs))
   137  	}))
   138  	return int64(n), err
   139  }
   140  
   141  // Socket implements socket.Provider.Socket.
   142  func (p *socketProvider) Socket(t *kernel.Task, stypeflags linux.SockType, protocol int) (*fs.File, *syserr.Error) {
   143  	// Check that we are using the host network stack.
   144  	stack := t.NetworkContext()
   145  	if stack == nil {
   146  		return nil, nil
   147  	}
   148  	if _, ok := stack.(*Stack); !ok {
   149  		return nil, nil
   150  	}
   151  
   152  	// Only accept TCP and UDP.
   153  	stype := stypeflags & linux.SOCK_TYPE_MASK
   154  	switch stype {
   155  	case unix.SOCK_STREAM:
   156  		switch protocol {
   157  		case 0, unix.IPPROTO_TCP:
   158  			// ok
   159  		default:
   160  			return nil, nil
   161  		}
   162  	case unix.SOCK_DGRAM:
   163  		switch protocol {
   164  		case 0, unix.IPPROTO_UDP:
   165  			// ok
   166  		default:
   167  			return nil, nil
   168  		}
   169  	default:
   170  		return nil, nil
   171  	}
   172  
   173  	// Conservatively ignore all flags specified by the application and add
   174  	// SOCK_NONBLOCK since socketOperations requires it. Pass a protocol of 0
   175  	// to simplify the syscall filters, since 0 and IPPROTO_* are equivalent.
   176  	fd, err := unix.Socket(p.family, int(stype)|unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC, 0)
   177  	if err != nil {
   178  		return nil, syserr.FromError(err)
   179  	}
   180  	return newSocketFile(t, p.family, stype, protocol, fd, stypeflags&unix.SOCK_NONBLOCK != 0)
   181  }
   182  
// Pair implements socket.Provider.Pair.
//
// It always returns (nil, nil, nil): no files and no error. The arguments are
// unused.
func (p *socketProvider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
	// Not supported by AF_INET/AF_INET6.
	return nil, nil, nil
}
   188  
   189  // LINT.ThenChange(./socket_vfs2.go)
   190  
// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
//
// +stateify savable
type socketOpsCommon struct {
	socket.SendReceiveTimeout

	// family, stype and protocol are the values the socket was created with;
	// Type returns them unchanged.
	family   int            // Read-only.
	stype    linux.SockType // Read-only.
	protocol int            // Read-only.

	// queue holds the waiters registered through EventRegister. It is also
	// the queue handed to fdnotifier.AddFD when the socket file is created.
	queue waiter.Queue

	// fd is the host socket fd. It must have O_NONBLOCK, so that operations
	// will return EWOULDBLOCK instead of blocking on the host. This allows us to
	// handle blocking behavior independently in the sentry.
	fd int
}
   207  
   208  // Release implements fs.FileOperations.Release.
   209  func (s *socketOpsCommon) Release(context.Context) {
   210  	fdnotifier.RemoveFD(int32(s.fd))
   211  	unix.Close(s.fd)
   212  }
   213  
   214  // Readiness implements waiter.Waitable.Readiness.
   215  func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask {
   216  	return fdnotifier.NonBlockingPoll(int32(s.fd), mask)
   217  }
   218  
// EventRegister implements waiter.Waitable.EventRegister.
//
// The entry is added to the socket's wait queue first, and fdnotifier is then
// told to refresh its state for the host fd.
// NOTE(review): presumably UpdateFD recomputes the set of host events to
// watch from the queue's current waiters - confirm in package fdnotifier.
func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
	s.queue.EventRegister(e, mask)
	fdnotifier.UpdateFD(int32(s.fd))
}
   224  
// EventUnregister implements waiter.Waitable.EventUnregister.
//
// The entry is removed from the socket's wait queue first, and fdnotifier is
// then told to refresh its state for the host fd.
func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) {
	s.queue.EventUnregister(e)
	fdnotifier.UpdateFD(int32(s.fd))
}
   230  
   231  // Connect implements socket.Socket.Connect.
   232  func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
   233  	if len(sockaddr) > sizeofSockaddr {
   234  		sockaddr = sockaddr[:sizeofSockaddr]
   235  	}
   236  
   237  	_, _, errno := unix.Syscall(unix.SYS_CONNECT, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr)))
   238  
   239  	if errno == 0 {
   240  		return nil
   241  	}
   242  	if errno != unix.EINPROGRESS || !blocking {
   243  		return syserr.FromError(translateIOSyscallError(errno))
   244  	}
   245  
   246  	// "EINPROGRESS: The socket is nonblocking and the connection cannot be
   247  	// completed immediately. It is possible to select(2) or poll(2) for
   248  	// completion by selecting the socket for writing. After select(2)
   249  	// indicates writability, use getsockopt(2) to read the SO_ERROR option at
   250  	// level SOL-SOCKET to determine whether connect() completed successfully
   251  	// (SO_ERROR is zero) or unsuccessfully (SO_ERROR is one of the usual error
   252  	// codes listed here, explaining the reason for the failure)." - connect(2)
   253  	e, ch := waiter.NewChannelEntry(nil)
   254  	writableMask := waiter.WritableEvents
   255  	s.EventRegister(&e, writableMask)
   256  	defer s.EventUnregister(&e)
   257  	if s.Readiness(writableMask)&writableMask == 0 {
   258  		if err := t.Block(ch); err != nil {
   259  			return syserr.FromError(err)
   260  		}
   261  	}
   262  	val, err := unix.GetsockoptInt(s.fd, unix.SOL_SOCKET, unix.SO_ERROR)
   263  	if err != nil {
   264  		return syserr.FromError(err)
   265  	}
   266  	if val != 0 {
   267  		return syserr.FromError(unix.Errno(uintptr(val)))
   268  	}
   269  	return nil
   270  }
   271  
// Accept implements socket.Socket.Accept.
//
// It accepts a connection on the host socket, wraps the new host fd in a
// VFS1 or VFS2 socket file (depending on kernel.VFS2Enabled), and installs
// that file in t's FD table. It returns the new FD, the peer address and its
// length (only populated when peerRequested is set), and an error.
//
// Of the application's flags, SOCK_NONBLOCK is applied to the new file and
// SOCK_CLOEXEC to the new FD; the host accept4(2) always uses
// SOCK_NONBLOCK|SOCK_CLOEXEC. When blocking is set, the task waits for the
// listening socket to become readable while accept4 reports ErrWouldBlock.
func (s *socketOpsCommon) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
	var peerAddr linux.SockAddr
	var peerAddrBuf []byte
	var peerAddrlen uint32
	var peerAddrPtr *byte
	var peerAddrlenPtr *uint32
	if peerRequested {
		// Give the host a buffer large enough for the biggest supported
		// sockaddr; otherwise both pointers stay nil.
		peerAddrBuf = make([]byte, sizeofSockaddr)
		peerAddrlen = uint32(len(peerAddrBuf))
		peerAddrPtr = &peerAddrBuf[0]
		peerAddrlenPtr = &peerAddrlen
	}

	// Conservatively ignore all flags specified by the application and add
	// SOCK_NONBLOCK since socketOpsCommon requires it.
	fd, syscallErr := accept4(s.fd, peerAddrPtr, peerAddrlenPtr, unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC)
	if blocking {
		var ch chan struct{}
		for syscallErr == syserror.ErrWouldBlock {
			if ch != nil {
				// Already registered: wait for a readable event. A failed
				// Block replaces syscallErr and ends the loop.
				if syscallErr = t.Block(ch); syscallErr != nil {
					break
				}
			} else {
				// First ErrWouldBlock: register for readable events, then
				// retry accept4 once without blocking. The deferred
				// unregister runs when Accept returns.
				var e waiter.Entry
				e, ch = waiter.NewChannelEntry(nil)
				s.EventRegister(&e, waiter.ReadableEvents)
				defer s.EventUnregister(&e)
			}
			fd, syscallErr = accept4(s.fd, peerAddrPtr, peerAddrlenPtr, unix.SOCK_NONBLOCK|unix.SOCK_CLOEXEC)
		}
	}

	// The peer address is decoded before the error check, so it is returned
	// alongside the error when accept4 failed.
	if peerRequested {
		peerAddr = socket.UnmarshalSockAddr(s.family, peerAddrBuf[:peerAddrlen])
	}
	if syscallErr != nil {
		return 0, peerAddr, peerAddrlen, syserr.FromError(syscallErr)
	}

	var (
		kfd  int32
		kerr error
	)
	if kernel.VFS2Enabled {
		f, err := newVFS2Socket(t, s.family, s.stype, s.protocol, fd, uint32(flags&unix.SOCK_NONBLOCK))
		if err != nil {
			// The file was not created, so the host fd is still ours to close.
			unix.Close(fd)
			return 0, nil, 0, err
		}
		// Drop this function's reference on f when Accept returns.
		defer f.DecRef(t)

		kfd, kerr = t.NewFDFromVFS2(0, f, kernel.FDFlags{
			CloseOnExec: flags&unix.SOCK_CLOEXEC != 0,
		})
		// Note: the socket is recorded even if installing the FD failed.
		t.Kernel().RecordSocketVFS2(f)
	} else {
		f, err := newSocketFile(t, s.family, s.stype, s.protocol, fd, flags&unix.SOCK_NONBLOCK != 0)
		if err != nil {
			// The file was not created, so the host fd is still ours to close.
			unix.Close(fd)
			return 0, nil, 0, err
		}
		// Drop this function's reference on f when Accept returns.
		defer f.DecRef(t)

		kfd, kerr = t.NewFDFrom(0, f, kernel.FDFlags{
			CloseOnExec: flags&unix.SOCK_CLOEXEC != 0,
		})
		// Note: the socket is recorded even if installing the FD failed.
		t.Kernel().RecordSocket(f)
	}

	return kfd, peerAddr, peerAddrlen, syserr.FromError(kerr)
}
   345  
   346  // Bind implements socket.Socket.Bind.
   347  func (s *socketOpsCommon) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
   348  	if len(sockaddr) > sizeofSockaddr {
   349  		sockaddr = sockaddr[:sizeofSockaddr]
   350  	}
   351  
   352  	_, _, errno := unix.Syscall(unix.SYS_BIND, uintptr(s.fd), uintptr(firstBytePtr(sockaddr)), uintptr(len(sockaddr)))
   353  	if errno != 0 {
   354  		return syserr.FromError(errno)
   355  	}
   356  	return nil
   357  }
   358  
   359  // Listen implements socket.Socket.Listen.
   360  func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
   361  	return syserr.FromError(unix.Listen(s.fd, backlog))
   362  }
   363  
   364  // Shutdown implements socket.Socket.Shutdown.
   365  func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
   366  	switch how {
   367  	case unix.SHUT_RD, unix.SHUT_WR, unix.SHUT_RDWR:
   368  		return syserr.FromError(unix.Shutdown(s.fd, how))
   369  	default:
   370  		return syserr.ErrInvalidArgument
   371  	}
   372  }
   373  
// GetSockOpt implements socket.Socket.GetSockOpt.
//
// Only options on the allow-list below (plus whatever getSockOptLen accepts)
// are forwarded to the host. The result is returned as a primitive.ByteSlice
// holding the raw option bytes. outPtr is not used here.
//
// Errors:
//   - ErrInvalidArgument if outLen is negative or smaller than the option's
//     expected length.
//   - ErrProtocolNotAvailable (ENOPROTOOPT) if the option is not recognized.
//   - The translated host error if getsockopt fails.
func (s *socketOpsCommon) GetSockOpt(t *kernel.Task, level int, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
	if outLen < 0 {
		return nil, syserr.ErrInvalidArgument
	}

	// Only allow known and safe options.
	// getSockOptLen supplies the initial length; the switch below overrides
	// it for the options listed here. A length of 0 means "not allowed".
	// NOTE(review): getSockOptLen is defined elsewhere in the package -
	// confirm which options it covers.
	optlen := getSockOptLen(t, level, name)
	switch level {
	case linux.SOL_IP:
		switch name {
		case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO, linux.IP_RECVORIGDSTADDR, linux.IP_RECVERR:
			optlen = sizeofInt32
		}
	case linux.SOL_IPV6:
		switch name {
		case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_RECVERR, linux.IPV6_V6ONLY, linux.IPV6_RECVORIGDSTADDR:
			optlen = sizeofInt32
		}
	case linux.SOL_SOCKET:
		switch name {
		case linux.SO_ERROR, linux.SO_KEEPALIVE, linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR, linux.SO_TIMESTAMP:
			optlen = sizeofInt32
		case linux.SO_LINGER:
			optlen = unix.SizeofLinger
		}
	case linux.SOL_TCP:
		switch name {
		case linux.TCP_NODELAY:
			optlen = sizeofInt32
		case linux.TCP_INFO:
			optlen = int(linux.SizeOfTCPInfo)
		}
	}

	if optlen == 0 {
		return nil, syserr.ErrProtocolNotAvailable // ENOPROTOOPT
	}
	// The caller's buffer must be able to hold the whole option value.
	if outLen < optlen {
		return nil, syserr.ErrInvalidArgument
	}

	opt, err := getsockopt(s.fd, level, name, optlen)
	if err != nil {
		return nil, syserr.FromError(err)
	}
	optP := primitive.ByteSlice(opt)
	return &optP, nil
}
   423  
// SetSockOpt implements socket.Socket.SetSockOpt.
//
// Options on the allow-list below (plus whatever setSockOptLen accepts) are
// forwarded to the host with opt truncated to the option's expected length.
// Unrecognized options are silently accepted: nil is returned and the host is
// not called.
//
// Errors:
//   - ErrInvalidArgument if opt is shorter than the option's expected length.
//   - The translated host errno if setsockopt fails.
func (s *socketOpsCommon) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
	// Only allow known and safe options.
	// setSockOptLen supplies the initial length; the switch below overrides
	// it for the options listed here. A length of 0 means "not recognized".
	// NOTE(review): setSockOptLen is defined elsewhere in the package -
	// confirm which options it covers.
	optlen := setSockOptLen(t, level, name)
	switch level {
	case linux.SOL_IP:
		switch name {
		case linux.IP_TOS, linux.IP_RECVTOS, linux.IP_PKTINFO, linux.IP_RECVORIGDSTADDR, linux.IP_RECVERR:
			optlen = sizeofInt32
		}
	case linux.SOL_IPV6:
		switch name {
		case linux.IPV6_TCLASS, linux.IPV6_RECVTCLASS, linux.IPV6_RECVERR, linux.IPV6_V6ONLY, linux.IPV6_RECVORIGDSTADDR:
			optlen = sizeofInt32
		}
	case linux.SOL_SOCKET:
		switch name {
		case linux.SO_SNDBUF, linux.SO_RCVBUF, linux.SO_REUSEADDR, linux.SO_TIMESTAMP:
			optlen = sizeofInt32
		}
	case linux.SOL_TCP:
		switch name {
		case linux.TCP_NODELAY, linux.TCP_INQ:
			optlen = sizeofInt32
		}
	}

	if optlen == 0 {
		// Pretend to accept socket options we don't understand. This seems
		// dangerous, but it's what netstack does...
		return nil
	}
	if len(opt) < optlen {
		return syserr.ErrInvalidArgument
	}
	// Pass exactly optlen bytes to the host, dropping any excess.
	opt = opt[:optlen]

	_, _, errno := unix.Syscall6(unix.SYS_SETSOCKOPT, uintptr(s.fd), uintptr(level), uintptr(name), uintptr(firstBytePtr(opt)), uintptr(len(opt)), 0)
	if errno != 0 {
		return syserr.FromError(errno)
	}
	return nil
}
   467  
   468  func (s *socketOpsCommon) recvMsgFromHost(iovs []unix.Iovec, flags int, senderRequested bool, controlLen uint64) (uint64, int, []byte, []byte, error) {
   469  	// We always do a non-blocking recv*().
   470  	sysflags := flags | unix.MSG_DONTWAIT
   471  
   472  	msg := unix.Msghdr{}
   473  	if len(iovs) > 0 {
   474  		msg.Iov = &iovs[0]
   475  		msg.Iovlen = uint64(len(iovs))
   476  	}
   477  	var senderAddrBuf []byte
   478  	if senderRequested {
   479  		senderAddrBuf = make([]byte, sizeofSockaddr)
   480  		msg.Name = &senderAddrBuf[0]
   481  		msg.Namelen = uint32(sizeofSockaddr)
   482  	}
   483  	var controlBuf []byte
   484  	if controlLen > 0 {
   485  		if controlLen > maxControlLen {
   486  			controlLen = maxControlLen
   487  		}
   488  		controlBuf = make([]byte, controlLen)
   489  		msg.Control = &controlBuf[0]
   490  		msg.Controllen = controlLen
   491  	}
   492  	n, err := recvmsg(s.fd, &msg, sysflags)
   493  	if err != nil {
   494  		return 0 /* n */, 0 /* mFlags */, nil /* senderAddrBuf */, nil /* controlBuf */, err
   495  	}
   496  	return n, int(msg.Flags), senderAddrBuf[:msg.Namelen], controlBuf[:msg.Controllen], err
   497  }
   498  
   499  // RecvMsg implements socket.Socket.RecvMsg.
   500  func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
   501  	// Only allow known and safe flags.
   502  	if flags&^(unix.MSG_DONTWAIT|unix.MSG_PEEK|unix.MSG_TRUNC|unix.MSG_ERRQUEUE) != 0 {
   503  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrInvalidArgument
   504  	}
   505  
   506  	var senderAddrBuf []byte
   507  	var controlBuf []byte
   508  	var msgFlags int
   509  	copyToDst := func() (int64, error) {
   510  		var n uint64
   511  		var err error
   512  		if dst.NumBytes() == 0 {
   513  			// We want to make the recvmsg(2) call to the host even if dst is empty
   514  			// to fetch control messages, sender address or errors if any occur.
   515  			n, msgFlags, senderAddrBuf, controlBuf, err = s.recvMsgFromHost(nil, flags, senderRequested, controlLen)
   516  			return int64(n), err
   517  		}
   518  
   519  		recvmsgToBlocks := safemem.ReaderFunc(func(dsts safemem.BlockSeq) (uint64, error) {
   520  			// Refuse to do anything if any part of dst.Addrs was unusable.
   521  			if uint64(dst.NumBytes()) != dsts.NumBytes() {
   522  				return 0, nil
   523  			}
   524  			if dsts.IsEmpty() {
   525  				return 0, nil
   526  			}
   527  
   528  			n, msgFlags, senderAddrBuf, controlBuf, err = s.recvMsgFromHost(safemem.IovecsFromBlockSeq(dsts), flags, senderRequested, controlLen)
   529  			return n, err
   530  		})
   531  		return dst.CopyOutFrom(t, recvmsgToBlocks)
   532  	}
   533  
   534  	var ch chan struct{}
   535  	n, err := copyToDst()
   536  	// recv*(MSG_ERRQUEUE) never blocks, even without MSG_DONTWAIT.
   537  	if flags&(unix.MSG_DONTWAIT|unix.MSG_ERRQUEUE) == 0 {
   538  		for err == syserror.ErrWouldBlock {
   539  			// We only expect blocking to come from the actual syscall, in which
   540  			// case it can't have returned any data.
   541  			if n != 0 {
   542  				panic(fmt.Sprintf("CopyOutFrom: got (%d, %v), wanted (0, %v)", n, err, err))
   543  			}
   544  			if ch != nil {
   545  				if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
   546  					break
   547  				}
   548  			} else {
   549  				var e waiter.Entry
   550  				e, ch = waiter.NewChannelEntry(nil)
   551  				s.EventRegister(&e, waiter.ReadableEvents)
   552  				defer s.EventUnregister(&e)
   553  			}
   554  			n, err = copyToDst()
   555  		}
   556  	}
   557  	if err != nil {
   558  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
   559  	}
   560  
   561  	var senderAddr linux.SockAddr
   562  	if senderRequested {
   563  		senderAddr = socket.UnmarshalSockAddr(s.family, senderAddrBuf)
   564  	}
   565  
   566  	unixControlMessages, err := unix.ParseSocketControlMessage(controlBuf)
   567  	if err != nil {
   568  		return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
   569  	}
   570  	return int(n), msgFlags, senderAddr, uint32(len(senderAddrBuf)), parseUnixControlMessages(unixControlMessages), nil
   571  }
   572  
   573  func parseUnixControlMessages(unixControlMessages []unix.SocketControlMessage) socket.ControlMessages {
   574  	controlMessages := socket.ControlMessages{}
   575  	for _, unixCmsg := range unixControlMessages {
   576  		switch unixCmsg.Header.Level {
   577  		case linux.SOL_SOCKET:
   578  			switch unixCmsg.Header.Type {
   579  			case linux.SO_TIMESTAMP:
   580  				controlMessages.IP.HasTimestamp = true
   581  				ts := linux.Timeval{}
   582  				ts.UnmarshalUnsafe(unixCmsg.Data[:linux.SizeOfTimeval])
   583  				controlMessages.IP.Timestamp = ts.ToNsecCapped()
   584  			}
   585  
   586  		case linux.SOL_IP:
   587  			switch unixCmsg.Header.Type {
   588  			case linux.IP_TOS:
   589  				controlMessages.IP.HasTOS = true
   590  				var tos primitive.Uint8
   591  				tos.UnmarshalUnsafe(unixCmsg.Data[:tos.SizeBytes()])
   592  				controlMessages.IP.TOS = uint8(tos)
   593  
   594  			case linux.IP_PKTINFO:
   595  				controlMessages.IP.HasIPPacketInfo = true
   596  				var packetInfo linux.ControlMessageIPPacketInfo
   597  				packetInfo.UnmarshalUnsafe(unixCmsg.Data[:packetInfo.SizeBytes()])
   598  				controlMessages.IP.PacketInfo = packetInfo
   599  
   600  			case linux.IP_RECVORIGDSTADDR:
   601  				var addr linux.SockAddrInet
   602  				addr.UnmarshalUnsafe(unixCmsg.Data[:addr.SizeBytes()])
   603  				controlMessages.IP.OriginalDstAddress = &addr
   604  
   605  			case unix.IP_RECVERR:
   606  				var errCmsg linux.SockErrCMsgIPv4
   607  				errCmsg.UnmarshalBytes(unixCmsg.Data)
   608  				controlMessages.IP.SockErr = &errCmsg
   609  			}
   610  
   611  		case linux.SOL_IPV6:
   612  			switch unixCmsg.Header.Type {
   613  			case linux.IPV6_TCLASS:
   614  				controlMessages.IP.HasTClass = true
   615  				var tclass primitive.Uint32
   616  				tclass.UnmarshalUnsafe(unixCmsg.Data[:tclass.SizeBytes()])
   617  				controlMessages.IP.TClass = uint32(tclass)
   618  
   619  			case linux.IPV6_RECVORIGDSTADDR:
   620  				var addr linux.SockAddrInet6
   621  				addr.UnmarshalUnsafe(unixCmsg.Data[:addr.SizeBytes()])
   622  				controlMessages.IP.OriginalDstAddress = &addr
   623  
   624  			case unix.IPV6_RECVERR:
   625  				var errCmsg linux.SockErrCMsgIPv6
   626  				errCmsg.UnmarshalBytes(unixCmsg.Data)
   627  				controlMessages.IP.SockErr = &errCmsg
   628  			}
   629  
   630  		case linux.SOL_TCP:
   631  			switch unixCmsg.Header.Type {
   632  			case linux.TCP_INQ:
   633  				controlMessages.IP.HasInq = true
   634  				var inq primitive.Int32
   635  				inq.UnmarshalUnsafe(unixCmsg.Data[:linux.SizeOfControlMessageInq])
   636  				controlMessages.IP.Inq = int32(inq)
   637  			}
   638  		}
   639  	}
   640  	return controlMessages
   641  }
   642  
// SendMsg implements socket.Socket.SendMsg.
//
// It sends the data in src on the host socket, optionally to the address in
// to and with controlMessages attached, and returns the number of bytes sent.
//
// Only MSG_DONTWAIT, MSG_EOR, MSG_FASTOPEN, MSG_MORE and MSG_NOSIGNAL are
// accepted in flags; anything else yields ErrInvalidArgument. Unless
// MSG_DONTWAIT is set, the task blocks (subject to the deadline) until the
// socket is writable; an expired deadline is reported as ErrWouldBlock.
func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
	// Only allow known and safe flags.
	if flags&^(unix.MSG_DONTWAIT|unix.MSG_EOR|unix.MSG_FASTOPEN|unix.MSG_MORE|unix.MSG_NOSIGNAL) != 0 {
		return 0, syserr.ErrInvalidArgument
	}

	// If the src is zero-length, call SENDTO directly with a null buffer in
	// order to generate poll/epoll notifications.
	// Note: this path returns before controlMessages are packed and never
	// blocks in the sentry.
	if src.NumBytes() == 0 {
		sysflags := flags | unix.MSG_DONTWAIT
		n, _, errno := unix.Syscall6(unix.SYS_SENDTO, uintptr(s.fd), 0, 0, uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to)))
		if errno != 0 {
			return 0, syserr.FromError(errno)
		}
		return int(n), nil
	}

	// Serialize the control messages once, up front; the buffer capacity is
	// capped at maxControlLen.
	space := uint64(control.CmsgsSpace(t, controlMessages))
	if space > maxControlLen {
		space = maxControlLen
	}
	controlBuf := make([]byte, 0, space)
	// PackControlMessages will append up to space bytes to controlBuf.
	controlBuf = control.PackControlMessages(t, controlMessages, controlBuf)

	sendmsgFromBlocks := safemem.WriterFunc(func(srcs safemem.BlockSeq) (uint64, error) {
		// Refuse to do anything if any part of src.Addrs was unusable.
		if uint64(src.NumBytes()) != srcs.NumBytes() {
			return 0, nil
		}
		if srcs.IsEmpty() && len(controlBuf) == 0 {
			return 0, nil
		}

		// We always do a non-blocking send*().
		sysflags := flags | unix.MSG_DONTWAIT

		if srcs.NumBlocks() == 1 && len(controlBuf) == 0 {
			// Skip allocating []unix.Iovec.
			// Note: this local src shadows the src parameter within this
			// branch.
			src := srcs.Head()
			n, _, errno := unix.Syscall6(unix.SYS_SENDTO, uintptr(s.fd), src.Addr(), uintptr(src.Len()), uintptr(sysflags), uintptr(firstBytePtr(to)), uintptr(len(to)))
			if errno != 0 {
				return 0, translateIOSyscallError(errno)
			}
			return uint64(n), nil
		}

		// General path: multiple blocks and/or control data go via sendmsg.
		iovs := safemem.IovecsFromBlockSeq(srcs)
		msg := unix.Msghdr{
			Iov:    &iovs[0],
			Iovlen: uint64(len(iovs)),
		}
		if len(to) != 0 {
			msg.Name = &to[0]
			msg.Namelen = uint32(len(to))
		}
		if len(controlBuf) != 0 {
			msg.Control = &controlBuf[0]
			msg.Controllen = uint64(len(controlBuf))
		}
		return sendmsg(s.fd, &msg, sysflags)
	})

	var ch chan struct{}
	n, err := src.CopyInTo(t, sendmsgFromBlocks)
	if flags&unix.MSG_DONTWAIT == 0 {
		for err == syserror.ErrWouldBlock {
			// We only expect blocking to come from the actual syscall, in which
			// case it can't have returned any data.
			if n != 0 {
				panic(fmt.Sprintf("CopyInTo: got (%d, %v), wanted (0, %v)", n, err, err))
			}
			if ch != nil {
				if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
					// A send timeout is reported as ErrWouldBlock rather
					// than ETIMEDOUT.
					if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
						err = syserror.ErrWouldBlock
					}
					break
				}
			} else {
				// First ErrWouldBlock: register for writable events, then
				// retry once before blocking. The deferred unregister runs
				// when SendMsg returns.
				var e waiter.Entry
				e, ch = waiter.NewChannelEntry(nil)
				s.EventRegister(&e, waiter.WritableEvents)
				defer s.EventUnregister(&e)
			}
			n, err = src.CopyInTo(t, sendmsgFromBlocks)
		}
	}

	return int(n), syserr.FromError(err)
}
   735  
   736  func translateIOSyscallError(err error) error {
   737  	if err == unix.EAGAIN || err == unix.EWOULDBLOCK {
   738  		return syserror.ErrWouldBlock
   739  	}
   740  	return err
   741  }
   742  
   743  // State implements socket.Socket.State.
   744  func (s *socketOpsCommon) State() uint32 {
   745  	info := linux.TCPInfo{}
   746  	buf, err := getsockopt(s.fd, unix.SOL_TCP, unix.TCP_INFO, linux.SizeOfTCPInfo)
   747  	if err != nil {
   748  		if err != unix.ENOPROTOOPT {
   749  			log.Warningf("Failed to get TCP socket info from %+v: %v", s, err)
   750  		}
   751  		// For non-TCP sockets, silently ignore the failure.
   752  		return 0
   753  	}
   754  	if len(buf) != linux.SizeOfTCPInfo {
   755  		// Unmarshal below will panic if getsockopt returns a buffer of
   756  		// unexpected size.
   757  		log.Warningf("Failed to get TCP socket info from %+v: getsockopt(2) returned %d bytes, expecting %d bytes.", s, len(buf), linux.SizeOfTCPInfo)
   758  		return 0
   759  	}
   760  
   761  	info.UnmarshalUnsafe(buf[:info.SizeBytes()])
   762  	return uint32(info.State)
   763  }
   764  
// Type implements socket.Socket.Type.
//
// It returns the address family, socket type and protocol stored in the
// socket's family, stype and protocol fields.
func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) {
	return s.family, s.stype, s.protocol
}
   769  
// socketProvider creates host-backed sockets for one address family. Its
// Socket and Pair methods are documented as implementing socket.Provider.
type socketProvider struct {
	// family is the address family passed to the host socket(2) call and
	// recorded on every socket this provider creates.
	family int
}
   773  
   774  func init() {
   775  	for _, family := range []int{unix.AF_INET, unix.AF_INET6} {
   776  		socket.RegisterProvider(family, &socketProvider{family})
   777  		socket.RegisterProviderVFS2(family, &socketProviderVFS2{family})
   778  	}
   779  }