gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/socket/netlink/socket.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package netlink provides core functionality for netlink sockets.
    16  package netlink
    17  
    18  import (
    19  	"io"
    20  	"math"
    21  	"time"
    22  
    23  	"gvisor.dev/gvisor/pkg/abi/linux"
    24  	"gvisor.dev/gvisor/pkg/abi/linux/errno"
    25  	"gvisor.dev/gvisor/pkg/context"
    26  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    27  	"gvisor.dev/gvisor/pkg/hostarch"
    28  	"gvisor.dev/gvisor/pkg/marshal"
    29  	"gvisor.dev/gvisor/pkg/marshal/primitive"
    30  	"gvisor.dev/gvisor/pkg/sentry/arch"
    31  	"gvisor.dev/gvisor/pkg/sentry/kernel"
    32  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    33  	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
    34  	"gvisor.dev/gvisor/pkg/sentry/socket"
    35  	"gvisor.dev/gvisor/pkg/sentry/socket/netlink/nlmsg"
    36  	"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
    37  	"gvisor.dev/gvisor/pkg/sentry/socket/unix"
    38  	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
    39  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    40  	"gvisor.dev/gvisor/pkg/sync"
    41  	"gvisor.dev/gvisor/pkg/syserr"
    42  	"gvisor.dev/gvisor/pkg/usermem"
    43  	"gvisor.dev/gvisor/pkg/waiter"
    44  )
    45  
    46  const sizeOfInt32 int = 4
    47  
    48  const (
    49  	// minBufferSize is the smallest size of a send buffer.
    50  	minSendBufferSize = 4 << 10 // 4096 bytes.
    51  
    52  	// defaultSendBufferSize is the default size for the send buffer.
    53  	defaultSendBufferSize = 16 * 1024
    54  
    55  	// maxBufferSize is the largest size a send buffer can grow to.
    56  	maxSendBufferSize = 4 << 20 // 4MB
    57  )
    58  
// errNoFilter is returned (as ENOENT) when SO_DETACH_FILTER is requested on a
// socket that has no filter recorded as attached.
var errNoFilter = syserr.New("no filter attached", errno.ENOENT)
    60  
// Socket is the base socket type for netlink sockets.
//
// This implementation only supports userspace sending and receiving messages
// to/from the kernel.
//
// Socket implements socket.Socket and transport.Credentialer.
//
// +stateify savable
type Socket struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.DentryMetadataFileDescriptionImpl
	vfs.LockFD
	socket.SendReceiveTimeout

	// ports provides netlink port allocation.
	ports *port.Manager

	// protocol is the netlink protocol implementation.
	protocol Protocol

	// skType is the socket type. This is either SOCK_DGRAM or SOCK_RAW for
	// netlink sockets.
	skType linux.SockType

	// ep is a datagram unix endpoint used to buffer messages sent from the
	// kernel to userspace. RecvMsg reads messages from this endpoint.
	ep transport.Endpoint

	// connection is the kernel's connection to ep, used to write messages
	// sent to userspace.
	connection transport.ConnectedEndpoint

	// mu protects the fields below.
	mu sync.Mutex `state:"nosave"`

	// bound indicates that portID is valid, i.e. that a port has been
	// allocated for this socket.
	bound bool

	// portID is the port ID allocated for this socket. It is only
	// meaningful when bound is true.
	portID int32

	// sendBufferSize is the send buffer "size". We don't actually have a
	// fixed buffer; sendMsg only uses this value as the upper bound on the
	// size of a single message.
	sendBufferSize uint32

	// filter indicates that this socket has a BPF filter "installed".
	//
	// TODO(gvisor.dev/issue/1119): We don't actually support filtering,
	// this is just bookkeeping for tracking add/remove.
	filter bool
}

// Compile-time checks that Socket satisfies the interfaces it claims.
var _ socket.Socket = (*Socket)(nil)
var _ transport.Credentialer = (*Socket)(nil)
   116  
   117  // New creates a new Socket.
   118  func New(t *kernel.Task, skType linux.SockType, protocol Protocol) (*Socket, *syserr.Error) {
   119  	// Datagram endpoint used to buffer kernel -> user messages.
   120  	ep := transport.NewConnectionless(t)
   121  
   122  	// Bind the endpoint for good measure so we can connect to it. The
   123  	// bound address will never be exposed.
   124  	if err := ep.Bind(transport.Address{Addr: "dummy"}); err != nil {
   125  		ep.Close(t)
   126  		return nil, err
   127  	}
   128  
   129  	// Create a connection from which the kernel can write messages.
   130  	connection, err := ep.(transport.BoundEndpoint).UnidirectionalConnect(t)
   131  	if err != nil {
   132  		ep.Close(t)
   133  		return nil, err
   134  	}
   135  
   136  	fd := &Socket{
   137  		ports:          t.Kernel().NetlinkPorts(),
   138  		protocol:       protocol,
   139  		skType:         skType,
   140  		ep:             ep,
   141  		connection:     connection,
   142  		sendBufferSize: defaultSendBufferSize,
   143  	}
   144  	fd.LockFD.Init(&vfs.FileLocks{})
   145  	return fd, nil
   146  }
   147  
   148  // Release implements vfs.FileDescriptionImpl.Release.
   149  func (s *Socket) Release(ctx context.Context) {
   150  	t := kernel.TaskFromContext(ctx)
   151  	t.Kernel().DeleteSocket(&s.vfsfd)
   152  	s.connection.Release(ctx)
   153  	s.ep.Close(ctx)
   154  
   155  	if s.bound {
   156  		s.ports.Release(s.protocol.Protocol(), s.portID)
   157  	}
   158  }
   159  
   160  // Epollable implements FileDescriptionImpl.Epollable.
   161  func (s *Socket) Epollable() bool {
   162  	return true
   163  }
   164  
   165  // Ioctl implements vfs.FileDescriptionImpl.
   166  func (*Socket) Ioctl(ctx context.Context, uio usermem.IO, sysno uintptr, args arch.SyscallArguments) (uintptr, error) {
   167  	// TODO(b/68878065): no ioctls supported.
   168  	return 0, linuxerr.ENOTTY
   169  }
   170  
   171  // PRead implements vfs.FileDescriptionImpl.
   172  func (s *Socket) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   173  	return 0, linuxerr.ESPIPE
   174  }
   175  
   176  // Read implements vfs.FileDescriptionImpl.
   177  func (s *Socket) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   178  	// All flags other than RWF_NOWAIT should be ignored.
   179  	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
   180  	if opts.Flags != 0 {
   181  		return 0, linuxerr.EOPNOTSUPP
   182  	}
   183  
   184  	if dst.NumBytes() == 0 {
   185  		return 0, nil
   186  	}
   187  	r := unix.EndpointReader{
   188  		Endpoint: s.ep,
   189  	}
   190  	n, err := dst.CopyOutFrom(ctx, &r)
   191  	if r.Notify != nil {
   192  		r.Notify()
   193  	}
   194  	return n, err
   195  }
   196  
   197  // PWrite implements vfs.FileDescriptionImpl.
   198  func (s *Socket) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   199  	return 0, linuxerr.ESPIPE
   200  }
   201  
   202  // Write implements vfs.FileDescriptionImpl.
   203  func (s *Socket) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   204  	// All flags other than RWF_NOWAIT should be ignored.
   205  	// TODO(gvisor.dev/issue/2601): Support RWF_NOWAIT.
   206  	if opts.Flags != 0 {
   207  		return 0, linuxerr.EOPNOTSUPP
   208  	}
   209  
   210  	n, err := s.sendMsg(ctx, src, nil, 0, socket.ControlMessages{})
   211  	return int64(n), err.ToError()
   212  }
   213  
   214  // Readiness implements waiter.Waitable.Readiness.
   215  func (s *Socket) Readiness(mask waiter.EventMask) waiter.EventMask {
   216  	// ep holds messages to be read and thus handles EventIn readiness.
   217  	ready := s.ep.Readiness(mask)
   218  
   219  	if mask&waiter.WritableEvents != 0 {
   220  		// sendMsg handles messages synchronously and is thus always
   221  		// ready for writing.
   222  		ready |= waiter.WritableEvents
   223  	}
   224  
   225  	return ready
   226  }
   227  
   228  // EventRegister implements waiter.Waitable.EventRegister.
   229  func (s *Socket) EventRegister(e *waiter.Entry) error {
   230  	return s.ep.EventRegister(e)
   231  	// Writable readiness never changes, so no registration is needed.
   232  }
   233  
   234  // EventUnregister implements waiter.Waitable.EventUnregister.
   235  func (s *Socket) EventUnregister(e *waiter.Entry) {
   236  	s.ep.EventUnregister(e)
   237  }
   238  
   239  // Passcred implements transport.Credentialer.Passcred.
   240  func (s *Socket) Passcred() bool {
   241  	return s.ep.SocketOptions().GetPassCred()
   242  }
   243  
   244  // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred.
   245  func (s *Socket) ConnectedPasscred() bool {
   246  	// This socket is connected to the kernel, which doesn't need creds.
   247  	//
   248  	// This is arbitrary, as ConnectedPasscred on this type has no callers.
   249  	return false
   250  }
   251  
   252  // ExtractSockAddr extracts the SockAddrNetlink from b.
   253  func ExtractSockAddr(b []byte) (*linux.SockAddrNetlink, *syserr.Error) {
   254  	if len(b) < linux.SockAddrNetlinkSize {
   255  		return nil, syserr.ErrBadAddress
   256  	}
   257  
   258  	var sa linux.SockAddrNetlink
   259  	sa.UnmarshalUnsafe(b)
   260  
   261  	if sa.Family != linux.AF_NETLINK {
   262  		return nil, syserr.ErrInvalidArgument
   263  	}
   264  
   265  	return &sa, nil
   266  }
   267  
   268  // bindPort binds this socket to a port, preferring 'port' if it is available.
   269  //
   270  // port of 0 defaults to the ThreadGroup ID.
   271  //
   272  // Preconditions: mu is held.
   273  func (s *Socket) bindPort(t *kernel.Task, port int32) *syserr.Error {
   274  	if s.bound {
   275  		// Re-binding is only allowed if the port doesn't change.
   276  		if port != s.portID {
   277  			return syserr.ErrInvalidArgument
   278  		}
   279  
   280  		return nil
   281  	}
   282  
   283  	if port == 0 {
   284  		port = int32(t.ThreadGroup().ID())
   285  	}
   286  	port, ok := s.ports.Allocate(s.protocol.Protocol(), port)
   287  	if !ok {
   288  		return syserr.ErrBusy
   289  	}
   290  
   291  	s.portID = port
   292  	s.bound = true
   293  	return nil
   294  }
   295  
   296  // Bind implements socket.Socket.Bind.
   297  func (s *Socket) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
   298  	a, err := ExtractSockAddr(sockaddr)
   299  	if err != nil {
   300  		return err
   301  	}
   302  
   303  	// No support for multicast groups yet.
   304  	if a.Groups != 0 {
   305  		return syserr.ErrPermissionDenied
   306  	}
   307  
   308  	s.mu.Lock()
   309  	defer s.mu.Unlock()
   310  
   311  	return s.bindPort(t, int32(a.PortID))
   312  }
   313  
   314  // Connect implements socket.Socket.Connect.
   315  func (s *Socket) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
   316  	a, err := ExtractSockAddr(sockaddr)
   317  	if err != nil {
   318  		return err
   319  	}
   320  
   321  	// No support for multicast groups yet.
   322  	if a.Groups != 0 {
   323  		return syserr.ErrPermissionDenied
   324  	}
   325  
   326  	s.mu.Lock()
   327  	defer s.mu.Unlock()
   328  
   329  	if a.PortID == 0 {
   330  		// Netlink sockets default to connected to the kernel, but
   331  		// connecting anyways automatically binds if not already bound.
   332  		if !s.bound {
   333  			// Pass port 0 to get an auto-selected port ID.
   334  			return s.bindPort(t, 0)
   335  		}
   336  		return nil
   337  	}
   338  
   339  	// We don't support non-kernel destination ports. Linux returns EPERM
   340  	// if applications attempt to do this without NL_CFG_F_NONROOT_SEND, so
   341  	// we emulate that.
   342  	return syserr.ErrPermissionDenied
   343  }
   344  
   345  // Accept implements socket.Socket.Accept.
   346  func (s *Socket) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
   347  	// Netlink sockets never support accept.
   348  	return 0, nil, 0, syserr.ErrNotSupported
   349  }
   350  
   351  // Listen implements socket.Socket.Listen.
   352  func (s *Socket) Listen(t *kernel.Task, backlog int) *syserr.Error {
   353  	// Netlink sockets never support listen.
   354  	return syserr.ErrNotSupported
   355  }
   356  
   357  // Shutdown implements socket.Socket.Shutdown.
   358  func (s *Socket) Shutdown(t *kernel.Task, how int) *syserr.Error {
   359  	// Netlink sockets never support shutdown.
   360  	return syserr.ErrNotSupported
   361  }
   362  
   363  // GetSockOpt implements socket.Socket.GetSockOpt.
   364  func (s *Socket) GetSockOpt(t *kernel.Task, level int, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
   365  	switch level {
   366  	case linux.SOL_SOCKET:
   367  		switch name {
   368  		case linux.SO_SNDBUF:
   369  			if outLen < sizeOfInt32 {
   370  				return nil, syserr.ErrInvalidArgument
   371  			}
   372  			s.mu.Lock()
   373  			defer s.mu.Unlock()
   374  			return primitive.AllocateInt32(int32(s.sendBufferSize)), nil
   375  
   376  		case linux.SO_RCVBUF:
   377  			if outLen < sizeOfInt32 {
   378  				return nil, syserr.ErrInvalidArgument
   379  			}
   380  			// We don't have limit on receiving size.
   381  			return primitive.AllocateInt32(math.MaxInt32), nil
   382  
   383  		case linux.SO_PASSCRED:
   384  			if outLen < sizeOfInt32 {
   385  				return nil, syserr.ErrInvalidArgument
   386  			}
   387  			var passcred primitive.Int32
   388  			if s.Passcred() {
   389  				passcred = 1
   390  			}
   391  			return &passcred, nil
   392  
   393  		case linux.SO_SNDTIMEO:
   394  			if outLen < linux.SizeOfTimeval {
   395  				return nil, syserr.ErrInvalidArgument
   396  			}
   397  			sendTimeout := linux.NsecToTimeval(s.SendTimeout())
   398  			return &sendTimeout, nil
   399  
   400  		case linux.SO_RCVTIMEO:
   401  			if outLen < linux.SizeOfTimeval {
   402  				return nil, syserr.ErrInvalidArgument
   403  			}
   404  			recvTimeout := linux.NsecToTimeval(s.RecvTimeout())
   405  			return &recvTimeout, nil
   406  		}
   407  	case linux.SOL_NETLINK:
   408  		switch name {
   409  		case linux.NETLINK_BROADCAST_ERROR,
   410  			linux.NETLINK_CAP_ACK,
   411  			linux.NETLINK_DUMP_STRICT_CHK,
   412  			linux.NETLINK_EXT_ACK,
   413  			linux.NETLINK_LIST_MEMBERSHIPS,
   414  			linux.NETLINK_NO_ENOBUFS,
   415  			linux.NETLINK_PKTINFO:
   416  			// Not supported.
   417  		}
   418  	}
   419  	// TODO(b/68878065): other sockopts are not supported.
   420  	return nil, syserr.ErrProtocolNotAvailable
   421  }
   422  
   423  // SetSockOpt implements socket.Socket.SetSockOpt.
   424  func (s *Socket) SetSockOpt(t *kernel.Task, level int, name int, opt []byte) *syserr.Error {
   425  	switch level {
   426  	case linux.SOL_SOCKET:
   427  		switch name {
   428  		case linux.SO_SNDBUF:
   429  			if len(opt) < sizeOfInt32 {
   430  				return syserr.ErrInvalidArgument
   431  			}
   432  			size := hostarch.ByteOrder.Uint32(opt)
   433  			if size < minSendBufferSize {
   434  				size = minSendBufferSize
   435  			} else if size > maxSendBufferSize {
   436  				size = maxSendBufferSize
   437  			}
   438  			s.mu.Lock()
   439  			s.sendBufferSize = size
   440  			s.mu.Unlock()
   441  			return nil
   442  
   443  		case linux.SO_RCVBUF:
   444  			if len(opt) < sizeOfInt32 {
   445  				return syserr.ErrInvalidArgument
   446  			}
   447  			// We don't have limit on receiving size. So just accept anything as
   448  			// valid for compatibility.
   449  			return nil
   450  
   451  		case linux.SO_PASSCRED:
   452  			if len(opt) < sizeOfInt32 {
   453  				return syserr.ErrInvalidArgument
   454  			}
   455  			passcred := hostarch.ByteOrder.Uint32(opt)
   456  
   457  			s.ep.SocketOptions().SetPassCred(passcred != 0)
   458  			return nil
   459  
   460  		case linux.SO_ATTACH_FILTER:
   461  			// TODO(gvisor.dev/issue/1119): We don't actually
   462  			// support filtering. If this socket can't ever send
   463  			// messages, then there is nothing to filter and we can
   464  			// advertise support. Otherwise, be conservative and
   465  			// return an error.
   466  			if s.protocol.CanSend() {
   467  				return syserr.ErrProtocolNotAvailable
   468  			}
   469  
   470  			s.mu.Lock()
   471  			s.filter = true
   472  			s.mu.Unlock()
   473  			return nil
   474  
   475  		case linux.SO_DETACH_FILTER:
   476  			// TODO(gvisor.dev/issue/1119): See above.
   477  			if s.protocol.CanSend() {
   478  				return syserr.ErrProtocolNotAvailable
   479  			}
   480  
   481  			s.mu.Lock()
   482  			filter := s.filter
   483  			s.filter = false
   484  			s.mu.Unlock()
   485  
   486  			if !filter {
   487  				return errNoFilter
   488  			}
   489  
   490  			return nil
   491  
   492  		case linux.SO_SNDTIMEO:
   493  			if len(opt) < linux.SizeOfTimeval {
   494  				return syserr.ErrInvalidArgument
   495  			}
   496  
   497  			var v linux.Timeval
   498  			v.UnmarshalBytes(opt)
   499  			if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
   500  				return syserr.ErrDomain
   501  			}
   502  			s.SetSendTimeout(v.ToNsecCapped())
   503  			return nil
   504  
   505  		case linux.SO_RCVTIMEO:
   506  			if len(opt) < linux.SizeOfTimeval {
   507  				return syserr.ErrInvalidArgument
   508  			}
   509  
   510  			var v linux.Timeval
   511  			v.UnmarshalBytes(opt)
   512  			if v.Usec < 0 || v.Usec >= int64(time.Second/time.Microsecond) {
   513  				return syserr.ErrDomain
   514  			}
   515  			s.SetRecvTimeout(v.ToNsecCapped())
   516  			return nil
   517  		}
   518  	case linux.SOL_NETLINK:
   519  		switch name {
   520  		case linux.NETLINK_ADD_MEMBERSHIP,
   521  			linux.NETLINK_BROADCAST_ERROR,
   522  			linux.NETLINK_CAP_ACK,
   523  			linux.NETLINK_DROP_MEMBERSHIP,
   524  			linux.NETLINK_DUMP_STRICT_CHK,
   525  			linux.NETLINK_EXT_ACK,
   526  			linux.NETLINK_LISTEN_ALL_NSID,
   527  			linux.NETLINK_NO_ENOBUFS,
   528  			linux.NETLINK_PKTINFO:
   529  			// Not supported.
   530  		}
   531  	}
   532  
   533  	// TODO(b/68878065): other sockopts are not supported.
   534  	return syserr.ErrProtocolNotAvailable
   535  }
   536  
   537  // GetSockName implements socket.Socket.GetSockName.
   538  func (s *Socket) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
   539  	s.mu.Lock()
   540  	defer s.mu.Unlock()
   541  
   542  	sa := &linux.SockAddrNetlink{
   543  		Family: linux.AF_NETLINK,
   544  		PortID: uint32(s.portID),
   545  	}
   546  	return sa, uint32(sa.SizeBytes()), nil
   547  }
   548  
   549  // GetPeerName implements socket.Socket.GetPeerName.
   550  func (s *Socket) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
   551  	sa := &linux.SockAddrNetlink{
   552  		Family: linux.AF_NETLINK,
   553  		// TODO(b/68878065): Support non-kernel peers. For now the peer
   554  		// must be the kernel.
   555  		PortID: 0,
   556  	}
   557  	return sa, uint32(sa.SizeBytes()), nil
   558  }
   559  
   560  // RecvMsg implements socket.Socket.RecvMsg.
   561  func (s *Socket) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (int, int, linux.SockAddr, uint32, socket.ControlMessages, *syserr.Error) {
   562  	from := &linux.SockAddrNetlink{
   563  		Family: linux.AF_NETLINK,
   564  		PortID: 0,
   565  	}
   566  	fromLen := uint32(from.SizeBytes())
   567  
   568  	trunc := flags&linux.MSG_TRUNC != 0
   569  
   570  	r := unix.EndpointReader{
   571  		Ctx:      t,
   572  		Endpoint: s.ep,
   573  		Peek:     flags&linux.MSG_PEEK != 0,
   574  	}
   575  
   576  	doRead := func() (int64, error) {
   577  		return dst.CopyOutFrom(t, &r)
   578  	}
   579  
   580  	// If MSG_TRUNC is set with a zero byte destination then we still need
   581  	// to read the message and discard it, or in the case where MSG_PEEK is
   582  	// set, leave it be. In both cases the full message length must be
   583  	// returned.
   584  	if trunc && dst.Addrs.NumBytes() == 0 {
   585  		doRead = func() (int64, error) {
   586  			err := r.Truncate()
   587  			// Always return zero for bytes read since the destination size is
   588  			// zero.
   589  			return 0, err
   590  		}
   591  	}
   592  
   593  	if n, err := doRead(); err != linuxerr.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
   594  		var mflags int
   595  		if n < int64(r.MsgSize) {
   596  			mflags |= linux.MSG_TRUNC
   597  		}
   598  		if trunc {
   599  			n = int64(r.MsgSize)
   600  		}
   601  		return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err)
   602  	}
   603  
   604  	// We'll have to block. Register for notification and keep trying to
   605  	// receive all the data.
   606  	e, ch := waiter.NewChannelEntry(waiter.ReadableEvents)
   607  	if err := s.EventRegister(&e); err != nil {
   608  		return 0, 0, from, fromLen, socket.ControlMessages{}, syserr.FromError(err)
   609  	}
   610  	defer s.EventUnregister(&e)
   611  
   612  	for {
   613  		if n, err := doRead(); err != linuxerr.ErrWouldBlock {
   614  			var mflags int
   615  			if n < int64(r.MsgSize) {
   616  				mflags |= linux.MSG_TRUNC
   617  			}
   618  			if trunc {
   619  				n = int64(r.MsgSize)
   620  			}
   621  			return int(n), mflags, from, fromLen, socket.ControlMessages{}, syserr.FromError(err)
   622  		}
   623  
   624  		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
   625  			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
   626  				return 0, 0, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
   627  			}
   628  			return 0, 0, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
   629  		}
   630  	}
   631  }
   632  
   633  // kernelSCM implements control.SCMCredentials with credentials that represent
   634  // the kernel itself rather than a Task.
   635  //
   636  // +stateify savable
   637  type kernelSCM struct{}
   638  
   639  // Equals implements transport.CredentialsControlMessage.Equals.
   640  func (kernelSCM) Equals(oc transport.CredentialsControlMessage) bool {
   641  	_, ok := oc.(kernelSCM)
   642  	return ok
   643  }
   644  
   645  // Credentials implements control.SCMCredentials.Credentials.
   646  func (kernelSCM) Credentials(*kernel.Task) (kernel.ThreadID, auth.UID, auth.GID) {
   647  	return 0, auth.RootUID, auth.RootGID
   648  }
   649  
   650  // kernelCreds is the concrete version of kernelSCM used in all creds.
   651  var kernelCreds = &kernelSCM{}
   652  
   653  // sendResponse sends the response messages in ms back to userspace.
   654  func (s *Socket) sendResponse(ctx context.Context, ms *nlmsg.MessageSet) *syserr.Error {
   655  	// Linux combines multiple netlink messages into a single datagram.
   656  	bufs := make([][]byte, 0, len(ms.Messages))
   657  	for _, m := range ms.Messages {
   658  		bufs = append(bufs, m.Finalize())
   659  	}
   660  
   661  	// All messages are from the kernel.
   662  	cms := transport.ControlMessages{
   663  		Credentials: kernelCreds,
   664  	}
   665  
   666  	if len(bufs) > 0 {
   667  		// RecvMsg never receives the address, so we don't need to send
   668  		// one.
   669  		_, notify, err := s.connection.Send(ctx, bufs, cms, transport.Address{})
   670  		// If the buffer is full, we simply drop messages, just like
   671  		// Linux.
   672  		if err != nil && err != syserr.ErrWouldBlock {
   673  			return err
   674  		}
   675  		if notify {
   676  			s.connection.SendNotify()
   677  		}
   678  	}
   679  
   680  	// N.B. multi-part messages should still send NLMSG_DONE even if
   681  	// nlmsg.MessageSet contains no messages.
   682  	//
   683  	// N.B. NLMSG_DONE is always sent in a different datagram. See
   684  	// net/netlink/af_netlink.c:netlink_dump.
   685  	if ms.Multi {
   686  		m := nlmsg.NewMessage(linux.NetlinkMessageHeader{
   687  			Type:   linux.NLMSG_DONE,
   688  			Flags:  linux.NLM_F_MULTI,
   689  			Seq:    ms.Seq,
   690  			PortID: uint32(ms.PortID),
   691  		})
   692  
   693  		// Add the dump_done_errno payload.
   694  		m.Put(primitive.AllocateInt64(0))
   695  
   696  		_, notify, err := s.connection.Send(ctx, [][]byte{m.Finalize()}, cms, transport.Address{})
   697  		if err != nil && err != syserr.ErrWouldBlock {
   698  			return err
   699  		}
   700  		if notify {
   701  			s.connection.SendNotify()
   702  		}
   703  	}
   704  
   705  	return nil
   706  }
   707  
   708  func dumpErrorMessage(hdr linux.NetlinkMessageHeader, ms *nlmsg.MessageSet, err *syserr.Error) {
   709  	m := ms.AddMessage(linux.NetlinkMessageHeader{
   710  		Type: linux.NLMSG_ERROR,
   711  	})
   712  	m.Put(&linux.NetlinkErrorMessage{
   713  		Error:  int32(-err.ToLinux()),
   714  		Header: hdr,
   715  	})
   716  }
   717  
   718  func dumpAckMessage(hdr linux.NetlinkMessageHeader, ms *nlmsg.MessageSet) {
   719  	m := ms.AddMessage(linux.NetlinkMessageHeader{
   720  		Type: linux.NLMSG_ERROR,
   721  	})
   722  	m.Put(&linux.NetlinkErrorMessage{
   723  		Error:  0,
   724  		Header: hdr,
   725  	})
   726  }
   727  
   728  // processMessages handles each message in buf, passing it to the protocol
   729  // handler for final handling.
   730  func (s *Socket) processMessages(ctx context.Context, buf []byte) *syserr.Error {
   731  	for len(buf) > 0 {
   732  		msg, rest, ok := nlmsg.ParseMessage(buf)
   733  		if !ok {
   734  			// Linux ignores messages that are too short. See
   735  			// net/netlink/af_netlink.c:netlink_rcv_skb.
   736  			break
   737  		}
   738  		buf = rest
   739  		hdr := msg.Header()
   740  
   741  		// Ignore control messages.
   742  		if hdr.Type < linux.NLMSG_MIN_TYPE {
   743  			continue
   744  		}
   745  
   746  		ms := nlmsg.NewMessageSet(s.portID, hdr.Seq)
   747  		if err := s.protocol.ProcessMessage(ctx, msg, ms); err != nil {
   748  			dumpErrorMessage(hdr, ms, err)
   749  		} else if hdr.Flags&linux.NLM_F_ACK == linux.NLM_F_ACK {
   750  			dumpAckMessage(hdr, ms)
   751  		}
   752  
   753  		if err := s.sendResponse(ctx, ms); err != nil {
   754  			return err
   755  		}
   756  	}
   757  
   758  	return nil
   759  }
   760  
   761  // sendMsg is the core of message send, used for SendMsg and Write.
   762  func (s *Socket) sendMsg(ctx context.Context, src usermem.IOSequence, to []byte, flags int, controlMessages socket.ControlMessages) (int, *syserr.Error) {
   763  	dstPort := int32(0)
   764  
   765  	if len(to) != 0 {
   766  		a, err := ExtractSockAddr(to)
   767  		if err != nil {
   768  			return 0, err
   769  		}
   770  
   771  		// No support for multicast groups yet.
   772  		if a.Groups != 0 {
   773  			return 0, syserr.ErrPermissionDenied
   774  		}
   775  
   776  		dstPort = int32(a.PortID)
   777  	}
   778  
   779  	if dstPort != 0 {
   780  		// Non-kernel destinations not supported yet. Treat as if
   781  		// NL_CFG_F_NONROOT_SEND is not set.
   782  		return 0, syserr.ErrPermissionDenied
   783  	}
   784  
   785  	s.mu.Lock()
   786  	defer s.mu.Unlock()
   787  
   788  	// For simplicity, and consistency with Linux, we copy in the entire
   789  	// message up front.
   790  	if src.NumBytes() > int64(s.sendBufferSize) {
   791  		return 0, syserr.ErrMessageTooLong
   792  	}
   793  
   794  	buf := make([]byte, src.NumBytes())
   795  	n, err := src.CopyIn(ctx, buf)
   796  	// io.EOF can be only returned if src is a file, this means that
   797  	// sendMsg is called from splice and the error has to be ignored in
   798  	// this case.
   799  	if err == io.EOF {
   800  		err = nil
   801  	}
   802  	if err != nil {
   803  		// Don't partially consume messages.
   804  		return 0, syserr.FromError(err)
   805  	}
   806  
   807  	if err := s.processMessages(ctx, buf); err != nil {
   808  		return 0, err
   809  	}
   810  
   811  	return n, nil
   812  }
   813  
   814  // SendMsg implements socket.Socket.SendMsg.
   815  func (s *Socket) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
   816  	return s.sendMsg(t, src, to, flags, controlMessages)
   817  }
   818  
   819  // State implements socket.Socket.State.
   820  func (s *Socket) State() uint32 {
   821  	return s.ep.State()
   822  }
   823  
   824  // Type implements socket.Socket.Type.
   825  func (s *Socket) Type() (family int, skType linux.SockType, protocol int) {
   826  	return linux.AF_NETLINK, s.skType, s.protocol.Protocol()
   827  }