gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/socket/unix/transport/unix.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package transport contains the implementation of Unix endpoints.
    16  package transport
    17  
    18  import (
    19  	"gvisor.dev/gvisor/pkg/abi/linux"
    20  	"gvisor.dev/gvisor/pkg/context"
    21  	"gvisor.dev/gvisor/pkg/log"
    22  	"gvisor.dev/gvisor/pkg/syserr"
    23  	"gvisor.dev/gvisor/pkg/tcpip"
    24  	"gvisor.dev/gvisor/pkg/waiter"
    25  )
    26  
const (
	// The minimum size of the send/receive buffers.
	minimumBufferSize = 4 << 10 // 4 KiB (match default in linux)

	// The default size of the send/receive buffers.
	defaultBufferSize = 208 << 10 // 208 KiB (default in linux for net.core.wmem_default)

	// The maximum permitted size for the send/receive buffers.
	maxBufferSize = 4 << 20 // 4 MiB (default in linux for net.core.wmem_max)
)
    37  
// A RightsControlMessage is a control message containing FDs.
//
// +stateify savable
type RightsControlMessage interface {
	// Clone returns a copy of the RightsControlMessage.
	Clone() RightsControlMessage

	// Release releases any resources owned by the RightsControlMessage.
	Release(ctx context.Context)
}
    48  
// A CredentialsControlMessage is a control message containing Unix credentials.
type CredentialsControlMessage interface {
	// Equals returns true iff this message and the given message are equal.
	Equals(CredentialsControlMessage) bool
}
    54  
// A ControlMessages represents a collection of socket control messages.
//
// +stateify savable
type ControlMessages struct {
	// Rights is a control message containing FDs. It is nil when no rights
	// are present.
	Rights RightsControlMessage

	// Credentials is a control message containing Unix credentials. It is
	// nil when no credentials are present.
	Credentials CredentialsControlMessage
}
    65  
    66  // Empty returns true iff the ControlMessages does not contain either
    67  // credentials or rights.
    68  func (c *ControlMessages) Empty() bool {
    69  	return c.Rights == nil && c.Credentials == nil
    70  }
    71  
    72  // Clone clones both the credentials and the rights.
    73  func (c *ControlMessages) Clone() ControlMessages {
    74  	cm := ControlMessages{}
    75  	if c.Rights != nil {
    76  		cm.Rights = c.Rights.Clone()
    77  	}
    78  	cm.Credentials = c.Credentials
    79  	return cm
    80  }
    81  
    82  // Release releases both the credentials and the rights.
    83  func (c *ControlMessages) Release(ctx context.Context) {
    84  	if c.Rights != nil {
    85  		c.Rights.Release(ctx)
    86  	}
    87  	*c = ControlMessages{}
    88  }
    89  
// RecvArgs are the arguments to Endpoint.RecvMsg and Receiver.Recv.
type RecvArgs struct {
	// Creds indicates if credential control messages are requested by the
	// caller. This is useful for determining if control messages can be
	// coalesced. Creds is a hint and can be safely ignored by the
	// implementation if no coalescing is possible. It is fine to return
	// credential control messages when none were requested or to not
	// return credential control messages when they were requested.
	Creds bool

	// NumRights is the number of SCM_RIGHTS FDs requested by the caller.
	// This is useful if one must allocate a buffer to receive a SCM_RIGHTS
	// message or determine if control messages can be coalesced. NumRights
	// is a hint and can be safely ignored by the implementation if the
	// number of available SCM_RIGHTS FDs is known and no coalescing is
	// possible. It is fine for the returned number of SCM_RIGHTS FDs to be
	// either higher or lower than the requested number.
	NumRights int

	// If Peek is true, no data should be consumed from the Endpoint. Any and
	// all data returned from a peek should be available in the next call to
	// Recv or RecvMsg.
	Peek bool
}
   114  
// RecvOutput is the output from Endpoint.RecvMsg and Receiver.Recv.
type RecvOutput struct {
	// RecvLen is the number of bytes copied into the data buffers passed to
	// RecvMsg or Recv.
	RecvLen int64

	// MsgLen is the length of the read message consumed for datagram Endpoints.
	// MsgLen is always the same as RecvLen for stream Endpoints.
	MsgLen int64

	// Source is the source address we received from.
	Source Address

	// Control is the ControlMessages read.
	Control ControlMessages

	// ControlTrunc indicates that the NumRights hint was used to receive
	// fewer than the total available SCM_RIGHTS FDs. Additional truncation
	// may be required by the caller.
	ControlTrunc bool

	// UnusedRights is a slice of unused RightsControlMessage which should
	// be Release()d.
	UnusedRights []RightsControlMessage
}
   139  
// Endpoint is the interface implemented by Unix transport protocol
// implementations that expose functionality like sendmsg, recvmsg, connect,
// etc. to Unix socket implementations.
type Endpoint interface {
	Credentialer
	waiter.Waitable

	// Close puts the endpoint in a closed state and frees all resources
	// associated with it.
	Close(ctx context.Context)

	// RecvMsg reads data and a control message from the endpoint. This method
	// does not block if there is no data pending.
	//
	// The returned callback should be called if not nil.
	RecvMsg(ctx context.Context, data [][]byte, args RecvArgs) (RecvOutput, func(), *syserr.Error)

	// SendMsg writes data and a control message to the endpoint's peer.
	// This method does not block if the data cannot be written.
	//
	// SendMsg does not take ownership of any of its arguments on error.
	//
	// If not nil, the returned callback (notify) should be called after
	// SendMsg completes, without mm.activeMu held.
	// NOTE(review): the previous wording referred to "RecvMesg"; presumably
	// SendMsg was meant — confirm.
	SendMsg(context.Context, [][]byte, ControlMessages, BoundEndpoint) (int64, func(), *syserr.Error)

	// Connect connects this endpoint directly to another.
	//
	// This should be called on the client endpoint, and the (bound)
	// endpoint passed in as a parameter.
	//
	// The error codes are the same as Connect.
	Connect(ctx context.Context, server BoundEndpoint) *syserr.Error

	// Shutdown closes the read and/or write end of the endpoint connection
	// to its peer.
	Shutdown(flags tcpip.ShutdownFlags) *syserr.Error

	// Listen puts the endpoint in "listen" mode, which allows it to accept
	// new connections.
	Listen(ctx context.Context, backlog int) *syserr.Error

	// Accept returns a new endpoint if a peer has established a connection
	// to an endpoint previously set to listen mode. This method does not
	// block if no new connections are available.
	//
	// The returned Queue is the wait queue for the newly created endpoint.
	// NOTE(review): this signature returns no separate Queue; the sentence
	// above looks stale — confirm.
	//
	// peerAddr if not nil will be populated with the address of the connected
	// peer on a successful accept.
	Accept(ctx context.Context, peerAddr *Address) (Endpoint, *syserr.Error)

	// Bind binds the endpoint to the given local address.
	Bind(address Address) *syserr.Error

	// Type returns the socket type, typically either SockStream, SockDgram
	// or SockSeqpacket.
	Type() linux.SockType

	// GetLocalAddress returns the address to which the endpoint is bound.
	GetLocalAddress() (Address, tcpip.Error)

	// GetRemoteAddress returns the address to which the endpoint is
	// connected.
	GetRemoteAddress() (Address, tcpip.Error)

	// SetSockOpt sets a socket option.
	SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error

	// SetSockOptInt sets a socket option for simple cases when a value has
	// the int type.
	SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error

	// GetSockOpt gets a socket option.
	GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error

	// GetSockOptInt gets a socket option for simple cases when a return
	// value has the int type.
	GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error)

	// State returns the current state of the socket, as represented by Linux in
	// procfs.
	State() uint32

	// LastError clears and returns the last error reported by the endpoint.
	LastError() tcpip.Error

	// SocketOptions returns the structure which contains all the socket
	// level options.
	SocketOptions() *tcpip.SocketOptions
}
   232  
// A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket
// option.
type Credentialer interface {
	// Passcred returns whether or not the SO_PASSCRED socket option is
	// enabled on this end.
	Passcred() bool

	// ConnectedPasscred returns whether or not the SO_PASSCRED socket option
	// is enabled on the connected end.
	ConnectedPasscred() bool
}
   244  
// A BoundEndpoint is a unix endpoint that can be connected to.
type BoundEndpoint interface {
	// BidirectionalConnect establishes a bi-directional connection between two
	// unix endpoints in an all-or-nothing manner. If an error occurs during
	// connecting, the state of neither endpoint should be modified.
	//
	// In order for an endpoint to establish such a bidirectional connection
	// with a BoundEndpoint, the endpoint calls the BidirectionalConnect method
	// on the BoundEndpoint and sends a representation of itself (the
	// ConnectingEndpoint) and a callback (returnConnect) to receive the
	// connection information (Receiver and ConnectedEndpoint) upon a
	// successful connect. The callback should only be called on a successful
	// connect.
	//
	// For a connection attempt to be successful, the ConnectingEndpoint must
	// be unconnected and not listening and the BoundEndpoint whose
	// BidirectionalConnect method is being called must be listening.
	//
	// This method will return syserr.ErrConnectionRefused on endpoints with a
	// type that isn't SockStream or SockSeqpacket.
	BidirectionalConnect(ctx context.Context, ep ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error

	// UnidirectionalConnect establishes a write-only connection to a unix
	// endpoint.
	//
	// An endpoint which calls UnidirectionalConnect and supports it itself must
	// not hold its own lock when calling UnidirectionalConnect.
	//
	// This method will return syserr.ErrConnectionRefused on a non-SockDgram
	// endpoint.
	UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error)

	// Passcred returns whether or not the SO_PASSCRED socket option is
	// enabled on this end.
	Passcred() bool

	// Release releases any resources held by the BoundEndpoint. It must be
	// called before dropping all references to a BoundEndpoint returned by a
	// function.
	Release(ctx context.Context)
}
   286  
// HostBoundEndpoint is an interface that endpoints can implement if they
// support binding, listening and accepting connections from a bound Unix
// domain socket on the host.
type HostBoundEndpoint interface {
	// SetBoundSocketFD will be called on supporting endpoints after
	// binding a socket on the host filesystem. Implementations should
	// delegate Listen and Accept calls to the BoundSocketFD. The ownership
	// of bsFD is transferred to the endpoint.
	SetBoundSocketFD(ctx context.Context, bsFD BoundSocketFD) error

	// ResetBoundSocketFD cleans up the BoundSocketFD set by the last successful
	// SetBoundSocketFD call.
	ResetBoundSocketFD(ctx context.Context)
}
   301  
// BoundSocketFD is an interface that wraps a socket FD that was bind(2)-ed.
// It allows callers to listen and accept on that socket.
type BoundSocketFD interface {
	// Close closes the socket FD.
	Close(ctx context.Context)

	// NotificationFD is a host FD that can be used to notify when new clients
	// connect to the socket.
	NotificationFD() int32

	// Listen is analogous to listen(2).
	Listen(ctx context.Context, backlog int32) error

	// Accept is analogous to accept(2).
	Accept(ctx context.Context) (int, error)
}
   318  
// message represents a message passed over a Unix domain socket.
//
// +stateify savable
type message struct {
	messageEntry

	// Data is the message payload.
	Data []byte

	// Control is auxiliary control message data that goes along with the
	// data.
	Control ControlMessages

	// Address is the bound address of the endpoint that sent the message.
	//
	// If the endpoint that sent the message is not bound, the Address is
	// the empty string.
	Address Address
}
   338  
   339  // Length returns number of bytes stored in the message.
   340  func (m *message) Length() int64 {
   341  	return int64(len(m.Data))
   342  }
   343  
   344  // Release releases any resources held by the message.
   345  func (m *message) Release(ctx context.Context) {
   346  	m.Control.Release(ctx)
   347  }
   348  
   349  // Peek returns a copy of the message.
   350  func (m *message) Peek() *message {
   351  	return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address}
   352  }
   353  
   354  // Truncate reduces the length of the message payload to n bytes.
   355  //
   356  // Preconditions: n <= m.Length().
   357  func (m *message) Truncate(n int64) {
   358  	m.Data = m.Data[:n]
   359  }
   360  
// A Receiver can be used to receive Messages.
type Receiver interface {
	// Recv receives a single message. This method does not block.
	//
	// notify indicates if RecvNotify should be called.
	Recv(ctx context.Context, data [][]byte, args RecvArgs) (out RecvOutput, notify bool, err *syserr.Error)

	// RecvNotify notifies the Receiver of a successful Recv. This must not be
	// called while holding any endpoint locks.
	RecvNotify()

	// CloseRecv prevents the receiving of additional Messages.
	//
	// After CloseRecv is called, CloseNotify must also be called.
	CloseRecv()

	// CloseNotify notifies the Receiver of recv being closed. This must not be
	// called while holding any endpoint locks.
	CloseNotify()

	// IsRecvClosed returns true if reception of additional messages is closed.
	IsRecvClosed() bool

	// Readable returns whether receiving messages should be attempted. This
	// includes when read has been shutdown.
	Readable() bool

	// RecvQueuedSize returns the total amount of data currently receivable.
	// RecvQueuedSize should return -1 if the operation isn't supported.
	RecvQueuedSize() int64

	// RecvMaxQueueSize returns maximum value for RecvQueuedSize.
	// RecvMaxQueueSize should return -1 if the operation isn't supported.
	RecvMaxQueueSize() int64

	// Release releases any resources owned by the Receiver. It should be
	// called before dropping all references to a Receiver.
	Release(ctx context.Context)
}
   400  
// Address is a unix socket address.
//
// +stateify savable
type Address struct {
	// Addr is the address in string form.
	Addr string
}
   407  
// queueReceiver implements Receiver for datagram sockets.
//
// +stateify savable
type queueReceiver struct {
	// readQueue is the queue that messages are peeked at or dequeued from.
	readQueue *queue
}
   414  
   415  // Recv implements Receiver.Recv.
   416  func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, args RecvArgs) (RecvOutput, bool, *syserr.Error) {
   417  	var m *message
   418  	var notify bool
   419  	var err *syserr.Error
   420  	if args.Peek {
   421  		m, err = q.readQueue.Peek()
   422  	} else {
   423  		m, notify, err = q.readQueue.Dequeue()
   424  	}
   425  	if err != nil {
   426  		return RecvOutput{}, false, err
   427  	}
   428  	src := []byte(m.Data)
   429  	var copied int64
   430  	for i := 0; i < len(data) && len(src) > 0; i++ {
   431  		n := copy(data[i], src)
   432  		copied += int64(n)
   433  		src = src[n:]
   434  	}
   435  	out := RecvOutput{
   436  		RecvLen: copied,
   437  		MsgLen:  int64(len(m.Data)),
   438  		Control: m.Control,
   439  		Source:  m.Address,
   440  	}
   441  	return out, notify, nil
   442  }
   443  
   444  // RecvNotify implements Receiver.RecvNotify.
   445  func (q *queueReceiver) RecvNotify() {
   446  	q.readQueue.WriterQueue.Notify(waiter.WritableEvents)
   447  }
   448  
   449  // CloseNotify implements Receiver.CloseNotify.
   450  func (q *queueReceiver) CloseNotify() {
   451  	q.readQueue.ReaderQueue.Notify(waiter.ReadableEvents)
   452  	q.readQueue.WriterQueue.Notify(waiter.WritableEvents)
   453  }
   454  
   455  // CloseRecv implements Receiver.CloseRecv.
   456  func (q *queueReceiver) CloseRecv() {
   457  	q.readQueue.Close()
   458  }
   459  
   460  // IsRecvClosed implements Receiver.IsRecvClosed.
   461  func (q *queueReceiver) IsRecvClosed() bool {
   462  	return q.readQueue.isClosed()
   463  }
   464  
   465  // Readable implements Receiver.Readable.
   466  func (q *queueReceiver) Readable() bool {
   467  	return q.readQueue.IsReadable()
   468  }
   469  
   470  // RecvQueuedSize implements Receiver.RecvQueuedSize.
   471  func (q *queueReceiver) RecvQueuedSize() int64 {
   472  	return q.readQueue.QueuedSize()
   473  }
   474  
   475  // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize.
   476  func (q *queueReceiver) RecvMaxQueueSize() int64 {
   477  	return q.readQueue.MaxQueueSize()
   478  }
   479  
   480  // Release implements Receiver.Release.
   481  func (q *queueReceiver) Release(ctx context.Context) {
   482  	q.readQueue.DecRef(ctx)
   483  }
   484  
// streamQueueReceiver implements Receiver for stream sockets.
//
// +stateify savable
type streamQueueReceiver struct {
	queueReceiver

	// mu is held by Readable, RecvQueuedSize and Recv while they access
	// buffer; Recv also updates control and addr under it.
	mu streamQueueReceiverMutex `state:"nosave"`
	// buffer holds the not-yet-consumed payload of the most recently
	// dequeued message.
	buffer []byte
	// control holds the control messages of the most recently dequeued
	// message.
	control ControlMessages
	// addr is the sender address of the most recently dequeued message.
	addr Address
}
   496  
   497  func vecCopy(data [][]byte, buf []byte) (int64, [][]byte, []byte) {
   498  	var copied int64
   499  	for len(data) > 0 && len(buf) > 0 {
   500  		n := copy(data[0], buf)
   501  		copied += int64(n)
   502  		buf = buf[n:]
   503  		data[0] = data[0][n:]
   504  		if len(data[0]) == 0 {
   505  			data = data[1:]
   506  		}
   507  	}
   508  	return copied, data, buf
   509  }
   510  
   511  // Readable implements Receiver.Readable.
   512  func (q *streamQueueReceiver) Readable() bool {
   513  	q.mu.Lock()
   514  	bl := len(q.buffer)
   515  	r := q.readQueue.IsReadable()
   516  	q.mu.Unlock()
   517  	// We're readable if we have data in our buffer or if the queue receiver is
   518  	// readable.
   519  	return bl > 0 || r
   520  }
   521  
   522  // RecvQueuedSize implements Receiver.RecvQueuedSize.
   523  func (q *streamQueueReceiver) RecvQueuedSize() int64 {
   524  	q.mu.Lock()
   525  	bl := len(q.buffer)
   526  	qs := q.readQueue.QueuedSize()
   527  	q.mu.Unlock()
   528  	return int64(bl) + qs
   529  }
   530  
   531  // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize.
   532  func (q *streamQueueReceiver) RecvMaxQueueSize() int64 {
   533  	// The RecvMaxQueueSize() is the readQueue's MaxQueueSize() plus the largest
   534  	// message we can buffer which is also the largest message we can receive.
   535  	return 2 * q.readQueue.MaxQueueSize()
   536  }
   537  
// Recv implements Receiver.Recv.
//
// q.mu is held for the whole call. If q.buffer is empty, one message is
// dequeued from the read queue into q.buffer, q.control and q.addr. With
// args.Peek set, data is filled from q.buffer without advancing it and a clone
// of the control messages is returned. Otherwise the copied bytes are consumed
// from q.buffer and, while data still has room and no usable rights have been
// collected, further messages are dequeued and coalesced into the same read.
// Rights that the caller did not ask for (args.NumRights == 0) are handed back
// in UnusedRights with ControlTrunc set.
//
// The returned bool is true if any Dequeue call asked for a notification.
func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, args RecvArgs) (RecvOutput, bool, *syserr.Error) {
	q.mu.Lock()
	defer q.mu.Unlock()

	var notify bool

	// If we have no data in the endpoint, we need to get some.
	if len(q.buffer) == 0 {
		// Load the next message into a buffer, even if we are peeking. Peeking
		// won't consume the message, so it will be still available to be read
		// the next time Recv() is called.
		m, n, err := q.readQueue.Dequeue()
		if err != nil {
			return RecvOutput{}, false, err
		}
		notify = n
		q.buffer = []byte(m.Data)
		q.control = m.Control
		q.addr = m.Address
	}

	var copied int64
	if args.Peek {
		// Don't consume control message if we are peeking.
		c := q.control.Clone()

		// Don't consume data since we are peeking: the returned remainder of
		// the buffer is discarded, so q.buffer keeps its contents.
		copied, _, _ = vecCopy(data, q.buffer)

		out := RecvOutput{
			RecvLen: copied,
			MsgLen:  copied,
			Control: c,
			Source:  q.addr,
		}
		return out, notify, nil
	}

	// Consume data and control message since we are not peeking.
	copied, data, q.buffer = vecCopy(data, q.buffer)

	// Save the original state of q.control.
	c := q.control

	// Remove rights from q.control and leave behind just the creds.
	q.control.Rights = nil
	if !args.Creds {
		c.Credentials = nil
	}

	var out RecvOutput
	if c.Rights != nil && args.NumRights == 0 {
		// We won't use these rights.
		out.UnusedRights = append(out.UnusedRights, c.Rights)
		c.Rights = nil
		out.ControlTrunc = true
	}

	haveRights := c.Rights != nil

	// If we have more capacity for data and haven't received any usable
	// rights.
	//
	// Linux never coalesces rights control messages.
	for !haveRights && len(data) > 0 {
		// Get a message from the readQueue.
		m, n, err := q.readQueue.Dequeue()
		if err != nil {
			// We already got some data, so ignore this error. This will
			// manifest as a short read to the user, which is what Linux
			// does.
			break
		}
		notify = notify || n
		// The new message becomes the buffered state. If one of the checks
		// below breaks out of the loop, it stays buffered on q for a later
		// call.
		q.buffer = []byte(m.Data)
		q.control = m.Control
		q.addr = m.Address

		if args.Creds {
			if (q.control.Credentials == nil) != (c.Credentials == nil) {
				// One message has credentials, the other does not.
				break
			}

			if q.control.Credentials != nil && c.Credentials != nil && !q.control.Credentials.Equals(c.Credentials) {
				// Both messages have credentials, but they don't match.
				break
			}
		}

		if args.NumRights != 0 && c.Rights != nil && q.control.Rights != nil {
			// Both messages have rights.
			break
		}

		var cpd int64
		cpd, data, q.buffer = vecCopy(data, q.buffer)
		copied += cpd

		if cpd == 0 {
			// data was actually full.
			break
		}

		if q.control.Rights != nil {
			// Consume rights.
			if args.NumRights == 0 {
				out.ControlTrunc = true
				out.UnusedRights = append(out.UnusedRights, q.control.Rights)
			} else {
				c.Rights = q.control.Rights
				haveRights = true
			}
			q.control.Rights = nil
		}
	}

	// For stream receivers MsgLen and RecvLen are both the number of bytes
	// copied.
	out.MsgLen = copied
	out.RecvLen = copied
	out.Source = q.addr
	out.Control = c
	return out, notify, nil
}
   662  
   663  // Release implements Receiver.Release.
   664  func (q *streamQueueReceiver) Release(ctx context.Context) {
   665  	q.queueReceiver.Release(ctx)
   666  	q.control.Release(ctx)
   667  }
   668  
// A ConnectedEndpoint is an Endpoint that can be used to send Messages.
type ConnectedEndpoint interface {
	// Passcred implements Endpoint.Passcred.
	Passcred() bool

	// GetLocalAddress implements Endpoint.GetLocalAddress.
	GetLocalAddress() (Address, tcpip.Error)

	// Send sends a single message. This method does not block.
	//
	// notify indicates if SendNotify should be called.
	//
	// syserr.ErrWouldBlock can be returned along with a partial write if
	// the caller should block to send the rest of the data.
	Send(ctx context.Context, data [][]byte, c ControlMessages, from Address) (n int64, notify bool, err *syserr.Error)

	// SendNotify notifies the ConnectedEndpoint of a successful Send. This
	// must not be called while holding any endpoint locks.
	SendNotify()

	// CloseSend prevents the sending of additional Messages.
	//
	// After CloseSend is called, CloseNotify must also be called.
	CloseSend()

	// CloseNotify notifies the ConnectedEndpoint of send being closed. This
	// must not be called while holding any endpoint locks.
	CloseNotify()

	// IsSendClosed returns true if transmission of additional messages is closed.
	IsSendClosed() bool

	// Writable returns whether sending messages should be attempted. This
	// includes when write has been shutdown.
	Writable() bool

	// EventUpdate lets the ConnectedEndpoint know that event registrations
	// have changed.
	EventUpdate() error

	// SendQueuedSize returns the total amount of data currently queued for
	// sending. SendQueuedSize should return -1 if the operation isn't
	// supported.
	SendQueuedSize() int64

	// SendMaxQueueSize returns maximum value for SendQueuedSize.
	// SendMaxQueueSize should return -1 if the operation isn't supported.
	SendMaxQueueSize() int64

	// Release releases any resources owned by the ConnectedEndpoint. It should
	// be called before dropping all references to a ConnectedEndpoint.
	Release(ctx context.Context)

	// CloseUnread sets the fact that this end is closed with unread data to
	// the peer socket.
	CloseUnread()

	// SetSendBufferSize is called when the endpoint's send buffer size is
	// changed.
	SetSendBufferSize(v int64) (newSz int64)
}
   730  
// connectedEndpoint is a ConnectedEndpoint implementation that sends messages
// by enqueueing them onto writeQueue.
//
// +stateify savable
type connectedEndpoint struct {
	// endpoint represents the subset of the Endpoint functionality needed by
	// the connectedEndpoint. It is implemented by both connectionedEndpoint
	// and connectionlessEndpoint and allows the use of types which don't
	// fully implement Endpoint.
	endpoint interface {
		// Passcred implements Endpoint.Passcred.
		Passcred() bool

		// GetLocalAddress implements Endpoint.GetLocalAddress.
		GetLocalAddress() (Address, tcpip.Error)

		// Type implements Endpoint.Type.
		Type() linux.SockType
	}

	// writeQueue is the queue that Send enqueues messages onto.
	writeQueue *queue
}
   750  
   751  // Passcred implements ConnectedEndpoint.Passcred.
   752  func (e *connectedEndpoint) Passcred() bool {
   753  	return e.endpoint.Passcred()
   754  }
   755  
   756  // GetLocalAddress implements ConnectedEndpoint.GetLocalAddress.
   757  func (e *connectedEndpoint) GetLocalAddress() (Address, tcpip.Error) {
   758  	return e.endpoint.GetLocalAddress()
   759  }
   760  
   761  // Send implements ConnectedEndpoint.Send.
   762  func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from Address) (int64, bool, *syserr.Error) {
   763  	discardEmpty := false
   764  	truncate := false
   765  	if e.endpoint.Type() == linux.SOCK_STREAM {
   766  		// Discard empty stream packets. Since stream sockets don't
   767  		// preserve message boundaries, sending zero bytes is a no-op.
   768  		// In Linux, the receiver actually uses a zero-length receive
   769  		// as an indication that the stream was closed.
   770  		discardEmpty = true
   771  
   772  		// Since stream sockets don't preserve message boundaries, we
   773  		// can write only as much of the message as fits in the queue.
   774  		truncate = true
   775  	}
   776  
   777  	return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate)
   778  }
   779  
   780  // SendNotify implements ConnectedEndpoint.SendNotify.
   781  func (e *connectedEndpoint) SendNotify() {
   782  	e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents)
   783  }
   784  
   785  // CloseNotify implements ConnectedEndpoint.CloseNotify.
   786  func (e *connectedEndpoint) CloseNotify() {
   787  	e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents)
   788  	e.writeQueue.WriterQueue.Notify(waiter.WritableEvents)
   789  }
   790  
   791  // CloseSend implements ConnectedEndpoint.CloseSend.
   792  func (e *connectedEndpoint) CloseSend() {
   793  	e.writeQueue.Close()
   794  }
   795  
   796  // IsSendClosed implements ConnectedEndpoint.IsSendClosed.
   797  func (e *connectedEndpoint) IsSendClosed() bool {
   798  	return e.writeQueue.isClosed()
   799  }
   800  
   801  // Writable implements ConnectedEndpoint.Writable.
   802  func (e *connectedEndpoint) Writable() bool {
   803  	return e.writeQueue.IsWritable()
   804  }
   805  
   806  // EventUpdate implements ConnectedEndpoint.EventUpdate.
   807  func (*connectedEndpoint) EventUpdate() error {
   808  	return nil
   809  }
   810  
// SendQueuedSize implements ConnectedEndpoint.SendQueuedSize.
//
// It returns the write queue's QueuedSize.
func (e *connectedEndpoint) SendQueuedSize() int64 {
	return e.writeQueue.QueuedSize()
}
   815  
// SendMaxQueueSize implements ConnectedEndpoint.SendMaxQueueSize.
//
// It returns the write queue's MaxQueueSize.
func (e *connectedEndpoint) SendMaxQueueSize() int64 {
	return e.writeQueue.MaxQueueSize()
}
   820  
// Release implements ConnectedEndpoint.Release.
//
// It calls DecRef on the write queue.
func (e *connectedEndpoint) Release(ctx context.Context) {
	e.writeQueue.DecRef(ctx)
}
   825  
// CloseUnread implements ConnectedEndpoint.CloseUnread.
//
// It calls CloseUnread on the write queue.
func (e *connectedEndpoint) CloseUnread() {
	e.writeQueue.CloseUnread()
}
   830  
   831  // SetSendBufferSize implements ConnectedEndpoint.SetSendBufferSize.
   832  // SetSendBufferSize sets the send buffer size for the write queue to the
   833  // specified value.
   834  func (e *connectedEndpoint) SetSendBufferSize(v int64) (newSz int64) {
   835  	e.writeQueue.SetMaxQueueSize(v)
   836  	return v
   837  }
   838  
// baseEndpoint is an embeddable unix endpoint base used in both the connected
// and connectionless unix domain socket Endpoint implementations.
//
// It embeds a *waiter.Queue (used for event registration) and
// tcpip.DefaultSocketOptionsHandler.
//
// Not to be used on its own.
//
// +stateify savable
type baseEndpoint struct {
	*waiter.Queue
	tcpip.DefaultSocketOptionsHandler

	// endpointMutex protects the below fields. It is embedded, so it is
	// taken and released via e.Lock() and e.Unlock(). It is excluded from
	// saved state.
	//
	// See the lock ordering comment in package kernel/epoll regarding when
	// this lock can safely be held.
	endpointMutex `state:"nosave"`

	// receiver allows Messages to be received. It is nil when the endpoint
	// has nothing to receive from (see Connected and RecvMsg).
	receiver Receiver

	// connected allows messages to be sent and state information about the
	// connected endpoint to be read. It is nil when there is no connected
	// peer.
	connected ConnectedEndpoint

	// path is not empty if the endpoint has been bound,
	// or may be used if the endpoint is connected.
	path string

	// ops is used to get socket level options.
	ops tcpip.SocketOptions
}
   869  
   870  // EventRegister implements waiter.Waitable.EventRegister.
   871  func (e *baseEndpoint) EventRegister(we *waiter.Entry) error {
   872  	e.Queue.EventRegister(we)
   873  	e.Lock()
   874  	c := e.connected
   875  	e.Unlock()
   876  	if c != nil {
   877  		if err := c.EventUpdate(); err != nil {
   878  			return err
   879  		}
   880  	}
   881  	return nil
   882  }
   883  
   884  // EventUnregister implements waiter.Waitable.EventUnregister.
   885  func (e *baseEndpoint) EventUnregister(we *waiter.Entry) {
   886  	e.Queue.EventUnregister(we)
   887  	e.Lock()
   888  	c := e.connected
   889  	e.Unlock()
   890  	if c != nil {
   891  		c.EventUpdate()
   892  	}
   893  }
   894  
   895  // Passcred implements Credentialer.Passcred.
   896  func (e *baseEndpoint) Passcred() bool {
   897  	return e.SocketOptions().GetPassCred()
   898  }
   899  
   900  // ConnectedPasscred implements Credentialer.ConnectedPasscred.
   901  func (e *baseEndpoint) ConnectedPasscred() bool {
   902  	e.Lock()
   903  	defer e.Unlock()
   904  	return e.connected != nil && e.connected.Passcred()
   905  }
   906  
// Connected implements ConnectingEndpoint.Connected.
//
// It reports whether both the receiver and the connected endpoint are set.
//
// Preconditions: e's embedded endpointMutex must be held (i.e. the caller has
// called e.Lock()).
func (e *baseEndpoint) Connected() bool {
	return e.receiver != nil && e.connected != nil
}
   913  
   914  // RecvMsg reads data and a control message from the endpoint.
   915  func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, args RecvArgs) (RecvOutput, func(), *syserr.Error) {
   916  	e.Lock()
   917  	receiver := e.receiver
   918  	e.Unlock()
   919  
   920  	if receiver == nil {
   921  		return RecvOutput{}, nil, syserr.ErrNotConnected
   922  	}
   923  
   924  	out, notify, err := receiver.Recv(ctx, data, args)
   925  	if err != nil {
   926  		return RecvOutput{}, nil, err
   927  	}
   928  
   929  	if notify {
   930  		return out, receiver.RecvNotify, nil
   931  	}
   932  
   933  	return out, nil, nil
   934  }
   935  
   936  // SendMsg writes data and a control message to the endpoint's peer.
   937  // This method does not block if the data cannot be written.
   938  func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, func(), *syserr.Error) {
   939  	e.Lock()
   940  	if !e.Connected() {
   941  		e.Unlock()
   942  		return 0, nil, syserr.ErrNotConnected
   943  	}
   944  	if to != nil {
   945  		e.Unlock()
   946  		return 0, nil, syserr.ErrAlreadyConnected
   947  	}
   948  
   949  	connected := e.connected
   950  	n, notify, err := connected.Send(ctx, data, c, Address{Addr: e.path})
   951  	e.Unlock()
   952  
   953  	var notifyFn func()
   954  	if notify {
   955  		notifyFn = connected.SendNotify
   956  	}
   957  
   958  	return n, notifyFn, err
   959  }
   960  
// SetSockOpt sets a socket option.
//
// This implementation ignores opt and always returns nil.
func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
	return nil
}
   965  
// SetSockOptInt sets an integer socket option.
//
// No integer options are handled here: the request is logged as unsupported,
// v is ignored, and nil is returned.
func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
	log.Warningf("Unsupported socket option: %d", opt)
	return nil
}
   970  
   971  func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
   972  	switch opt {
   973  	case tcpip.ReceiveQueueSizeOption:
   974  		v := 0
   975  		e.Lock()
   976  		if !e.Connected() {
   977  			e.Unlock()
   978  			return -1, &tcpip.ErrNotConnected{}
   979  		}
   980  		v = int(e.receiver.RecvQueuedSize())
   981  		e.Unlock()
   982  		if v < 0 {
   983  			return -1, &tcpip.ErrQueueSizeNotSupported{}
   984  		}
   985  		return v, nil
   986  
   987  	case tcpip.SendQueueSizeOption:
   988  		e.Lock()
   989  		if !e.Connected() {
   990  			e.Unlock()
   991  			return -1, &tcpip.ErrNotConnected{}
   992  		}
   993  		v := e.connected.SendQueuedSize()
   994  		e.Unlock()
   995  		if v < 0 {
   996  			return -1, &tcpip.ErrQueueSizeNotSupported{}
   997  		}
   998  		return int(v), nil
   999  
  1000  	default:
  1001  		log.Warningf("Unsupported socket option: %d", opt)
  1002  		return -1, &tcpip.ErrUnknownProtocolOption{}
  1003  	}
  1004  }
  1005  
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
//
// No options are handled here: the request is logged with the option's type
// and ErrUnknownProtocolOption is returned.
func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error {
	log.Warningf("Unsupported socket option: %T", opt)
	return &tcpip.ErrUnknownProtocolOption{}
}
  1011  
// LastError implements Endpoint.LastError.
//
// This implementation always returns nil.
func (*baseEndpoint) LastError() tcpip.Error {
	return nil
}
  1016  
// SocketOptions implements Endpoint.SocketOptions.
//
// It returns a pointer to the endpoint's own ops field, so changes made
// through the result are visible to the endpoint.
func (e *baseEndpoint) SocketOptions() *tcpip.SocketOptions {
	return &e.ops
}
  1021  
  1022  // Shutdown closes the read and/or write end of the endpoint connection to its
  1023  // peer.
  1024  func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error {
  1025  	e.Lock()
  1026  	if !e.Connected() {
  1027  		e.Unlock()
  1028  		return syserr.ErrNotConnected
  1029  	}
  1030  
  1031  	var (
  1032  		r             = e.receiver
  1033  		c             = e.connected
  1034  		shutdownRead  = flags&tcpip.ShutdownRead != 0
  1035  		shutdownWrite = flags&tcpip.ShutdownWrite != 0
  1036  	)
  1037  	if shutdownRead {
  1038  		r.CloseRecv()
  1039  	}
  1040  	if shutdownWrite {
  1041  		c.CloseSend()
  1042  	}
  1043  	e.Unlock()
  1044  
  1045  	// Don't hold e.Mutex while calling CloseNotify.
  1046  	if shutdownRead {
  1047  		r.CloseNotify()
  1048  	}
  1049  	if shutdownWrite {
  1050  		c.CloseNotify()
  1051  	}
  1052  
  1053  	return nil
  1054  }
  1055  
  1056  // GetLocalAddress returns the bound path.
  1057  func (e *baseEndpoint) GetLocalAddress() (Address, tcpip.Error) {
  1058  	e.Lock()
  1059  	defer e.Unlock()
  1060  	return Address{Addr: e.path}, nil
  1061  }
  1062  
  1063  // GetRemoteAddress returns the local address of the connected endpoint (if
  1064  // available).
  1065  func (e *baseEndpoint) GetRemoteAddress() (Address, tcpip.Error) {
  1066  	e.Lock()
  1067  	c := e.connected
  1068  	e.Unlock()
  1069  	if c != nil {
  1070  		return c.GetLocalAddress()
  1071  	}
  1072  	return Address{}, &tcpip.ErrNotConnected{}
  1073  }
  1074  
// Release implements BoundEndpoint.Release.
//
// It is a no-op and ignores its context argument.
func (*baseEndpoint) Release(context.Context) {
	// Binding a baseEndpoint doesn't take a reference.
}
  1079  
// stackHandler is just a stub implementation of tcpip.StackHandler to provide
// when initializing socketoptions. It carries no state, and both of its
// methods panic if called.
type stackHandler struct {
}
  1084  
// Option implements tcpip.StackHandler.
//
// It is not implemented for the stub handler and always panics.
func (h *stackHandler) Option(option any) tcpip.Error {
	panic("unimplemented")
}
  1089  
// TransportProtocolOption implements tcpip.StackHandler.
//
// It is not implemented for the stub handler and always panics.
func (h *stackHandler) TransportProtocolOption(proto tcpip.TransportProtocolNumber, option tcpip.GettableTransportProtocolOption) tcpip.Error {
	panic("unimplemented")
}
  1094  
  1095  // getSendBufferLimits implements tcpip.GetSendBufferLimits.
  1096  //
  1097  // AF_UNIX sockets buffer sizes are not tied to the networking stack/namespace
  1098  // in linux but are bound by net.core.(wmem|rmem)_(max|default).
  1099  //
  1100  // In gVisor net.core sysctls today are not exposed or if exposed are currently
  1101  // tied to the networking stack in use. This makes it complicated for AF_UNIX
  1102  // when we are in a new namespace w/ no networking stack. As a result for now we
  1103  // define default/max values here in the unix socket implementation itself.
  1104  func getSendBufferLimits(tcpip.StackHandler) tcpip.SendBufferSizeOption {
  1105  	return tcpip.SendBufferSizeOption{
  1106  		Min:     minimumBufferSize,
  1107  		Default: defaultBufferSize,
  1108  		Max:     maxBufferSize,
  1109  	}
  1110  }
  1111  
  1112  // getReceiveBufferLimits implements tcpip.GetReceiveBufferLimits.
  1113  //
  1114  // We define min, max and default values for unix socket implementation. Unix
  1115  // sockets do not use receive buffer.
  1116  func getReceiveBufferLimits(tcpip.StackHandler) tcpip.ReceiveBufferSizeOption {
  1117  	return tcpip.ReceiveBufferSizeOption{
  1118  		Min:     minimumBufferSize,
  1119  		Default: defaultBufferSize,
  1120  		Max:     maxBufferSize,
  1121  	}
  1122  }