github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/socket/unix/transport/unix.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package transport contains the implementation of Unix endpoints.
    16  package transport
    17  
    18  import (
    19  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    20  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    21  	"github.com/nicocha30/gvisor-ligolo/pkg/log"
    22  	"github.com/nicocha30/gvisor-ligolo/pkg/syserr"
    23  	"github.com/nicocha30/gvisor-ligolo/pkg/tcpip"
    24  	"github.com/nicocha30/gvisor-ligolo/pkg/waiter"
    25  )
    26  
const (
	// The minimum size of the send/receive buffers.
	minimumBufferSize = 4 << 10 // 4 KiB (match default in linux)

	// The default size of the send/receive buffers.
	defaultBufferSize = 208 << 10 // 208 KiB (default in linux for net.core.wmem_default)

	// The maximum permitted size for the send/receive buffers.
	maxBufferSize = 4 << 20 // 4 MiB (default in linux for net.core.wmem_max)
)
    37  
// A RightsControlMessage is a control message containing FDs.
//
// Implementations own resources that must be given up with Release once the
// message is no longer needed.
//
// +stateify savable
type RightsControlMessage interface {
	// Clone returns a copy of the RightsControlMessage.
	Clone() RightsControlMessage

	// Release releases any resources owned by the RightsControlMessage.
	Release(ctx context.Context)
}
    48  
// A CredentialsControlMessage is a control message containing Unix credentials.
type CredentialsControlMessage interface {
	// Equals returns true iff the two messages are equal.
	Equals(CredentialsControlMessage) bool
}
    54  
// A ControlMessages represents a collection of socket control messages.
//
// Either field may be nil when the corresponding control message is absent.
//
// +stateify savable
type ControlMessages struct {
	// Rights is a control message containing FDs.
	Rights RightsControlMessage

	// Credentials is a control message containing Unix credentials.
	Credentials CredentialsControlMessage
}
    65  
    66  // Empty returns true iff the ControlMessages does not contain either
    67  // credentials or rights.
    68  func (c *ControlMessages) Empty() bool {
    69  	return c.Rights == nil && c.Credentials == nil
    70  }
    71  
    72  // Clone clones both the credentials and the rights.
    73  func (c *ControlMessages) Clone() ControlMessages {
    74  	cm := ControlMessages{}
    75  	if c.Rights != nil {
    76  		cm.Rights = c.Rights.Clone()
    77  	}
    78  	cm.Credentials = c.Credentials
    79  	return cm
    80  }
    81  
    82  // Release releases both the credentials and the rights.
    83  func (c *ControlMessages) Release(ctx context.Context) {
    84  	if c.Rights != nil {
    85  		c.Rights.Release(ctx)
    86  	}
    87  	*c = ControlMessages{}
    88  }
    89  
// Endpoint is the interface implemented by Unix transport protocol
// implementations that expose functionality like sendmsg, recvmsg, connect,
// etc. to Unix socket implementations.
type Endpoint interface {
	Credentialer
	waiter.Waitable

	// Close puts the endpoint in a closed state and frees all resources
	// associated with it.
	Close(ctx context.Context)

	// RecvMsg reads data and a control message from the endpoint. This method
	// does not block if there is no data pending.
	//
	// creds indicates if credential control messages are requested by the
	// caller. This is useful for determining if control messages can be
	// coalesced. creds is a hint and can be safely ignored by the
	// implementation if no coalescing is possible. It is fine to return
	// credential control messages when none were requested or to not return
	// credential control messages when they were requested.
	//
	// numRights is the number of SCM_RIGHTS FDs requested by the caller. This
	// is useful if one must allocate a buffer to receive a SCM_RIGHTS message
	// or determine if control messages can be coalesced. numRights is a hint
	// and can be safely ignored by the implementation if the number of
	// available SCM_RIGHTS FDs is known and no coalescing is possible. It is
	// fine for the returned number of SCM_RIGHTS FDs to be either higher or
	// lower than the requested number.
	//
	// If peek is true, no data should be consumed from the Endpoint. Any and
	// all data returned from a peek should be available in the next call to
	// RecvMsg.
	//
	// recvLen is the number of bytes copied into data.
	//
	// msgLen is the length of the read message consumed for datagram Endpoints.
	// msgLen is always the same as recvLen for stream Endpoints.
	//
	// CMTruncated indicates that the numRights hint was used to receive fewer
	// than the total available SCM_RIGHTS FDs. Additional truncation may be
	// required by the caller.
	//
	// If set, notify is a callback that should be called after RecvMsg
	// completes without mm.activeMu held.
	RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *Address) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, notify func(), err *syserr.Error)

	// SendMsg writes data and a control message to the endpoint's peer.
	// This method does not block if the data cannot be written.
	//
	// SendMsg does not take ownership of any of its arguments on error.
	//
	// If set, the returned func() is a notify callback that should be called
	// after SendMsg completes without mm.activeMu held.
	SendMsg(context.Context, [][]byte, ControlMessages, BoundEndpoint) (int64, func(), *syserr.Error)

	// Connect connects this endpoint directly to another.
	//
	// This should be called on the client endpoint, and the (bound)
	// endpoint passed in as a parameter.
	//
	// The error codes are the same as Connect.
	Connect(ctx context.Context, server BoundEndpoint) *syserr.Error

	// Shutdown closes the read and/or write end of the endpoint connection
	// to its peer.
	Shutdown(flags tcpip.ShutdownFlags) *syserr.Error

	// Listen puts the endpoint in "listen" mode, which allows it to accept
	// new connections.
	Listen(ctx context.Context, backlog int) *syserr.Error

	// Accept returns a new endpoint if a peer has established a connection
	// to an endpoint previously set to listen mode. This method does not
	// block if no new connections are available.
	//
	// NOTE(review): earlier documentation mentioned a returned wait Queue;
	// the signature returns only the new Endpoint, which itself embeds
	// waiter.Waitable.
	//
	// peerAddr if not nil will be populated with the address of the connected
	// peer on a successful accept.
	Accept(ctx context.Context, peerAddr *Address) (Endpoint, *syserr.Error)

	// Bind binds the endpoint to a specific local address and port.
	// Specifying a NIC is optional.
	Bind(address Address) *syserr.Error

	// Type returns the socket type, typically either SockStream, SockDgram
	// or SockSeqpacket.
	Type() linux.SockType

	// GetLocalAddress returns the address to which the endpoint is bound.
	GetLocalAddress() (Address, tcpip.Error)

	// GetRemoteAddress returns the address to which the endpoint is
	// connected.
	GetRemoteAddress() (Address, tcpip.Error)

	// SetSockOpt sets a socket option.
	SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error

	// SetSockOptInt sets a socket option for simple cases when a value has
	// the int type.
	SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error

	// GetSockOpt gets a socket option.
	GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error

	// GetSockOptInt gets a socket option for simple cases when a return
	// value has the int type.
	GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error)

	// State returns the current state of the socket, as represented by Linux in
	// procfs.
	State() uint32

	// LastError clears and returns the last error reported by the endpoint.
	LastError() tcpip.Error

	// SocketOptions returns the structure which contains all the socket
	// level options.
	SocketOptions() *tcpip.SocketOptions
}
   211  
// A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket
// option.
type Credentialer interface {
	// Passcred returns whether or not the SO_PASSCRED socket option is
	// enabled on this end.
	Passcred() bool

	// ConnectedPasscred returns whether or not the SO_PASSCRED socket option
	// is enabled on the connected end.
	ConnectedPasscred() bool
}
   223  
// A BoundEndpoint is a unix endpoint that can be connected to.
type BoundEndpoint interface {
	// BidirectionalConnect establishes a bi-directional connection between two
	// unix endpoints in an all-or-nothing manner. If an error occurs during
	// connecting, the state of neither endpoint should be modified.
	//
	// In order for an endpoint to establish such a bidirectional connection
	// with a BoundEndpoint, the endpoint calls the BidirectionalConnect method
	// on the BoundEndpoint and sends a representation of itself (the
	// ConnectingEndpoint) and a callback (returnConnect) to receive the
	// connection information (Receiver and ConnectedEndpoint) upon a
	// successful connect. The callback should only be called on a successful
	// connect.
	//
	// For a connection attempt to be successful, the ConnectingEndpoint must
	// be unconnected and not listening and the BoundEndpoint whose
	// BidirectionalConnect method is being called must be listening.
	//
	// This method will return syserr.ErrConnectionRefused on endpoints with a
	// type that isn't SockStream or SockSeqpacket.
	BidirectionalConnect(ctx context.Context, ep ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error

	// UnidirectionalConnect establishes a write-only connection to a unix
	// endpoint.
	//
	// An endpoint which calls UnidirectionalConnect and supports it itself must
	// not hold its own lock when calling UnidirectionalConnect.
	//
	// This method will return syserr.ErrConnectionRefused on a non-SockDgram
	// endpoint.
	UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error)

	// Passcred returns whether or not the SO_PASSCRED socket option is
	// enabled on this end.
	Passcred() bool

	// Release releases any resources held by the BoundEndpoint. It must be
	// called before dropping all references to a BoundEndpoint returned by a
	// function.
	Release(ctx context.Context)
}
   265  
// HostBoundEndpoint is an interface that endpoints can implement if they
// support binding, listening and accepting connections from a bound Unix
// domain socket on the host.
type HostBoundEndpoint interface {
	// SetBoundSocketFD will be called on supporting endpoints after
	// binding a socket on the host filesystem. Implementations should
	// delegate Listen and Accept calls to the BoundSocketFD. The ownership
	// of bsFD is transferred to the endpoint.
	SetBoundSocketFD(ctx context.Context, bsFD BoundSocketFD) error

	// ResetBoundSocketFD cleans up the BoundSocketFD set by the last successful
	// SetBoundSocketFD call.
	ResetBoundSocketFD(ctx context.Context)
}
   280  
// BoundSocketFD is an interface that wraps a socket FD that was bind(2)-ed.
// It allows callers to listen and accept on that socket.
type BoundSocketFD interface {
	// Close closes the socket FD.
	Close(ctx context.Context)

	// NotificationFD is a host FD that can be used to notify when new clients
	// connect to the socket.
	NotificationFD() int32

	// Listen is analogous to listen(2).
	Listen(ctx context.Context, backlog int32) error

	// Accept is analogous to accept(2).
	Accept(ctx context.Context) (int, error)
}
   297  
// message represents a message passed over a Unix domain socket.
//
// +stateify savable
type message struct {
	// messageEntry is embedded; its definition is outside this view.
	// NOTE(review): presumably the intrusive list linkage used by queue —
	// confirm against its declaration.
	messageEntry

	// Data is the Message payload.
	Data []byte

	// Control is auxiliary control message data that goes along with the
	// data.
	Control ControlMessages

	// Address is the bound address of the endpoint that sent the message.
	//
	// If the endpoint that sent the message is not bound, the Address is
	// the empty string.
	Address Address
}
   317  
   318  // Length returns number of bytes stored in the message.
   319  func (m *message) Length() int64 {
   320  	return int64(len(m.Data))
   321  }
   322  
   323  // Release releases any resources held by the message.
   324  func (m *message) Release(ctx context.Context) {
   325  	m.Control.Release(ctx)
   326  }
   327  
   328  // Peek returns a copy of the message.
   329  func (m *message) Peek() *message {
   330  	return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address}
   331  }
   332  
   333  // Truncate reduces the length of the message payload to n bytes.
   334  //
   335  // Preconditions: n <= m.Length().
   336  func (m *message) Truncate(n int64) {
   337  	m.Data = m.Data[:n]
   338  }
   339  
// A Receiver can be used to receive Messages.
type Receiver interface {
	// Recv receives a single message. This method does not block.
	//
	// See Endpoint.RecvMsg for documentation on shared arguments.
	//
	// source is the address associated with the received message.
	//
	// notify indicates if RecvNotify should be called.
	Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source Address, notify bool, err *syserr.Error)

	// RecvNotify notifies the Receiver of a successful Recv. This must not be
	// called while holding any endpoint locks.
	RecvNotify()

	// CloseRecv prevents the receiving of additional Messages.
	//
	// After CloseRecv is called, CloseNotify must also be called.
	CloseRecv()

	// CloseNotify notifies the Receiver of recv being closed. This must not be
	// called while holding any endpoint locks.
	CloseNotify()

	// Readable returns if messages should be attempted to be received. This
	// includes when read has been shutdown.
	Readable() bool

	// RecvQueuedSize returns the total amount of data currently receivable.
	// RecvQueuedSize should return -1 if the operation isn't supported.
	RecvQueuedSize() int64

	// RecvMaxQueueSize returns maximum value for RecvQueuedSize.
	// RecvMaxQueueSize should return -1 if the operation isn't supported.
	RecvMaxQueueSize() int64

	// Release releases any resources owned by the Receiver. It should be
	// called before dropping all references to a Receiver.
	Release(ctx context.Context)
}
   378  
// Address is a unix socket address.
//
// +stateify savable
type Address struct {
	// Addr is the address value in string form.
	Addr string
}
   385  
// queueReceiver implements Receiver for datagram sockets.
//
// +stateify savable
type queueReceiver struct {
	// readQueue is the queue from which messages are peeked or dequeued; it
	// is reference counted and dropped in Release.
	readQueue *queue
}
   392  
   393  // Recv implements Receiver.Recv.
   394  func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, Address, bool, *syserr.Error) {
   395  	var m *message
   396  	var notify bool
   397  	var err *syserr.Error
   398  	if peek {
   399  		m, err = q.readQueue.Peek()
   400  	} else {
   401  		m, notify, err = q.readQueue.Dequeue()
   402  	}
   403  	if err != nil {
   404  		return 0, 0, ControlMessages{}, false, Address{}, false, err
   405  	}
   406  	src := []byte(m.Data)
   407  	var copied int64
   408  	for i := 0; i < len(data) && len(src) > 0; i++ {
   409  		n := copy(data[i], src)
   410  		copied += int64(n)
   411  		src = src[n:]
   412  	}
   413  	return copied, int64(len(m.Data)), m.Control, false, m.Address, notify, nil
   414  }
   415  
   416  // RecvNotify implements Receiver.RecvNotify.
   417  func (q *queueReceiver) RecvNotify() {
   418  	q.readQueue.WriterQueue.Notify(waiter.WritableEvents)
   419  }
   420  
   421  // CloseNotify implements Receiver.CloseNotify.
   422  func (q *queueReceiver) CloseNotify() {
   423  	q.readQueue.ReaderQueue.Notify(waiter.ReadableEvents)
   424  	q.readQueue.WriterQueue.Notify(waiter.WritableEvents)
   425  }
   426  
   427  // CloseRecv implements Receiver.CloseRecv.
   428  func (q *queueReceiver) CloseRecv() {
   429  	q.readQueue.Close()
   430  }
   431  
   432  // Readable implements Receiver.Readable.
   433  func (q *queueReceiver) Readable() bool {
   434  	return q.readQueue.IsReadable()
   435  }
   436  
   437  // RecvQueuedSize implements Receiver.RecvQueuedSize.
   438  func (q *queueReceiver) RecvQueuedSize() int64 {
   439  	return q.readQueue.QueuedSize()
   440  }
   441  
   442  // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize.
   443  func (q *queueReceiver) RecvMaxQueueSize() int64 {
   444  	return q.readQueue.MaxQueueSize()
   445  }
   446  
   447  // Release implements Receiver.Release.
   448  func (q *queueReceiver) Release(ctx context.Context) {
   449  	q.readQueue.DecRef(ctx)
   450  }
   451  
// streamQueueReceiver implements Receiver for stream sockets.
//
// Unlike queueReceiver, it keeps the unread remainder of the most recently
// dequeued message so that a later Recv can continue from it.
//
// +stateify savable
type streamQueueReceiver struct {
	queueReceiver

	// mu is held by the methods below while they access buffer, control
	// and addr.
	mu streamQueueReceiverMutex `state:"nosave"`
	// buffer is the not-yet-consumed payload of the last dequeued message.
	buffer []byte
	// control holds the control messages of the last dequeued message.
	control ControlMessages
	// addr is the sender address of the last dequeued message.
	addr Address
}
   463  
   464  func vecCopy(data [][]byte, buf []byte) (int64, [][]byte, []byte) {
   465  	var copied int64
   466  	for len(data) > 0 && len(buf) > 0 {
   467  		n := copy(data[0], buf)
   468  		copied += int64(n)
   469  		buf = buf[n:]
   470  		data[0] = data[0][n:]
   471  		if len(data[0]) == 0 {
   472  			data = data[1:]
   473  		}
   474  	}
   475  	return copied, data, buf
   476  }
   477  
   478  // Readable implements Receiver.Readable.
   479  func (q *streamQueueReceiver) Readable() bool {
   480  	q.mu.Lock()
   481  	bl := len(q.buffer)
   482  	r := q.readQueue.IsReadable()
   483  	q.mu.Unlock()
   484  	// We're readable if we have data in our buffer or if the queue receiver is
   485  	// readable.
   486  	return bl > 0 || r
   487  }
   488  
   489  // RecvQueuedSize implements Receiver.RecvQueuedSize.
   490  func (q *streamQueueReceiver) RecvQueuedSize() int64 {
   491  	q.mu.Lock()
   492  	bl := len(q.buffer)
   493  	qs := q.readQueue.QueuedSize()
   494  	q.mu.Unlock()
   495  	return int64(bl) + qs
   496  }
   497  
   498  // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize.
   499  func (q *streamQueueReceiver) RecvMaxQueueSize() int64 {
   500  	// The RecvMaxQueueSize() is the readQueue's MaxQueueSize() plus the largest
   501  	// message we can buffer which is also the largest message we can receive.
   502  	return 2 * q.readQueue.MaxQueueSize()
   503  }
   504  
// Recv implements Receiver.Recv.
//
// Data is served from q.buffer, which is refilled from the read queue when
// empty. When not peeking, further messages are dequeued and coalesced into
// the same read while there is room in data, no usable rights have been
// collected, and (if wantCreds is set) the credentials of the messages are
// compatible. recvLen and msgLen are always equal for this receiver.
func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, Address, bool, *syserr.Error) {
	// RightsControlMessages must be released without q.mu held. We do this in a
	// defer to simplify control flow logic.
	//
	// Deferred calls run in reverse order, so this closure runs after the
	// deferred q.mu.Unlock() registered below.
	var rightsToRelease []RightsControlMessage
	defer func() {
		for _, rcm := range rightsToRelease {
			rcm.Release(ctx)
		}
	}()

	q.mu.Lock()
	defer q.mu.Unlock()

	var notify bool

	// If we have no data in the endpoint, we need to get some.
	if len(q.buffer) == 0 {
		// Load the next message into a buffer, even if we are peeking. Peeking
		// won't consume the message, so it will be still available to be read
		// the next time Recv() is called.
		m, n, err := q.readQueue.Dequeue()
		if err != nil {
			return 0, 0, ControlMessages{}, false, Address{}, false, err
		}
		notify = n
		q.buffer = []byte(m.Data)
		q.control = m.Control
		q.addr = m.Address
	}

	var copied int64
	if peek {
		// Don't consume control message if we are peeking.
		c := q.control.Clone()

		// Don't consume data since we are peeking.
		copied, _, _ = vecCopy(data, q.buffer)

		return copied, copied, c, false, q.addr, notify, nil
	}

	// Consume data and control message since we are not peeking.
	copied, data, q.buffer = vecCopy(data, q.buffer)

	// Save the original state of q.control.
	c := q.control

	// Remove rights from q.control and leave behind just the creds.
	q.control.Rights = nil
	if !wantCreds {
		c.Credentials = nil
	}

	// The caller asked for no rights: drop any that arrived and report the
	// truncation.
	var cmTruncated bool
	if c.Rights != nil && numRights == 0 {
		rightsToRelease = append(rightsToRelease, c.Rights)
		c.Rights = nil
		cmTruncated = true
	}

	haveRights := c.Rights != nil

	// If we have more capacity for data and haven't received any usable
	// rights.
	//
	// Linux never coalesces rights control messages.
	for !haveRights && len(data) > 0 {
		// Get a message from the readQueue.
		m, n, err := q.readQueue.Dequeue()
		if err != nil {
			// We already got some data, so ignore this error. This will
			// manifest as a short read to the user, which is what Linux
			// does.
			break
		}
		notify = notify || n
		// From here on the dequeued message lives in q.buffer/q.control, so
		// breaking out of the loop leaves it for the next Recv call.
		q.buffer = []byte(m.Data)
		q.control = m.Control
		q.addr = m.Address

		if wantCreds {
			if (q.control.Credentials == nil) != (c.Credentials == nil) {
				// One message has credentials, the other does not.
				break
			}

			if q.control.Credentials != nil && c.Credentials != nil && !q.control.Credentials.Equals(c.Credentials) {
				// Both messages have credentials, but they don't match.
				break
			}
		}

		if numRights != 0 && c.Rights != nil && q.control.Rights != nil {
			// Both messages have rights.
			break
		}

		var cpd int64
		cpd, data, q.buffer = vecCopy(data, q.buffer)
		copied += cpd

		if cpd == 0 {
			// data was actually full.
			break
		}

		if q.control.Rights != nil {
			// Consume rights.
			if numRights == 0 {
				cmTruncated = true
				rightsToRelease = append(rightsToRelease, q.control.Rights)
			} else {
				c.Rights = q.control.Rights
				haveRights = true
			}
			q.control.Rights = nil
		}
	}
	return copied, copied, c, cmTruncated, q.addr, notify, nil
}
   626  
   627  // Release implements Receiver.Release.
   628  func (q *streamQueueReceiver) Release(ctx context.Context) {
   629  	q.queueReceiver.Release(ctx)
   630  	q.control.Release(ctx)
   631  }
   632  
// A ConnectedEndpoint is an Endpoint that can be used to send Messages.
type ConnectedEndpoint interface {
	// Passcred implements Endpoint.Passcred.
	Passcred() bool

	// GetLocalAddress implements Endpoint.GetLocalAddress.
	GetLocalAddress() (Address, tcpip.Error)

	// Send sends a single message. This method does not block.
	//
	// n is the number of bytes that were sent.
	//
	// notify indicates if SendNotify should be called.
	//
	// syserr.ErrWouldBlock can be returned along with a partial write if
	// the caller should block to send the rest of the data.
	Send(ctx context.Context, data [][]byte, c ControlMessages, from Address) (n int64, notify bool, err *syserr.Error)

	// SendNotify notifies the ConnectedEndpoint of a successful Send. This
	// must not be called while holding any endpoint locks.
	SendNotify()

	// CloseSend prevents the sending of additional Messages.
	//
	// After CloseSend is called, CloseNotify must also be called.
	CloseSend()

	// CloseNotify notifies the ConnectedEndpoint of send being closed. This
	// must not be called while holding any endpoint locks.
	CloseNotify()

	// Writable returns if messages should be attempted to be sent. This
	// includes when write has been shutdown.
	Writable() bool

	// EventUpdate lets the ConnectedEndpoint know that event registrations
	// have changed.
	EventUpdate() error

	// SendQueuedSize returns the total amount of data currently queued for
	// sending. SendQueuedSize should return -1 if the operation isn't
	// supported.
	SendQueuedSize() int64

	// SendMaxQueueSize returns maximum value for SendQueuedSize.
	// SendMaxQueueSize should return -1 if the operation isn't supported.
	SendMaxQueueSize() int64

	// Release releases any resources owned by the ConnectedEndpoint. It should
	// be called before dropping all references to a ConnectedEndpoint.
	Release(ctx context.Context)

	// CloseUnread sets the fact that this end is closed with unread data to
	// the peer socket.
	CloseUnread()

	// SetSendBufferSize is called when the endpoint's send buffer size is
	// changed. newSz is the size reported back by the implementation.
	SetSendBufferSize(v int64) (newSz int64)
}
   691  
// connectedEndpoint is a ConnectedEndpoint implementation that delivers
// messages by enqueueing them on writeQueue.
//
// +stateify savable
type connectedEndpoint struct {
	// endpoint represents the subset of the Endpoint functionality needed by
	// the connectedEndpoint. It is implemented by both connectionedEndpoint
	// and connectionlessEndpoint and allows the use of types which don't
	// fully implement Endpoint.
	endpoint interface {
		// Passcred implements Endpoint.Passcred.
		Passcred() bool

		// GetLocalAddress implements Endpoint.GetLocalAddress.
		GetLocalAddress() (Address, tcpip.Error)

		// Type implements Endpoint.Type.
		Type() linux.SockType
	}

	// writeQueue is the queue that sent messages are enqueued on; it is
	// reference counted and dropped in Release.
	writeQueue *queue
}
   711  
   712  // Passcred implements ConnectedEndpoint.Passcred.
   713  func (e *connectedEndpoint) Passcred() bool {
   714  	return e.endpoint.Passcred()
   715  }
   716  
   717  // GetLocalAddress implements ConnectedEndpoint.GetLocalAddress.
   718  func (e *connectedEndpoint) GetLocalAddress() (Address, tcpip.Error) {
   719  	return e.endpoint.GetLocalAddress()
   720  }
   721  
   722  // Send implements ConnectedEndpoint.Send.
   723  func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from Address) (int64, bool, *syserr.Error) {
   724  	discardEmpty := false
   725  	truncate := false
   726  	if e.endpoint.Type() == linux.SOCK_STREAM {
   727  		// Discard empty stream packets. Since stream sockets don't
   728  		// preserve message boundaries, sending zero bytes is a no-op.
   729  		// In Linux, the receiver actually uses a zero-length receive
   730  		// as an indication that the stream was closed.
   731  		discardEmpty = true
   732  
   733  		// Since stream sockets don't preserve message boundaries, we
   734  		// can write only as much of the message as fits in the queue.
   735  		truncate = true
   736  	}
   737  
   738  	return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate)
   739  }
   740  
   741  // SendNotify implements ConnectedEndpoint.SendNotify.
   742  func (e *connectedEndpoint) SendNotify() {
   743  	e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents)
   744  }
   745  
   746  // CloseNotify implements ConnectedEndpoint.CloseNotify.
   747  func (e *connectedEndpoint) CloseNotify() {
   748  	e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents)
   749  	e.writeQueue.WriterQueue.Notify(waiter.WritableEvents)
   750  }
   751  
   752  // CloseSend implements ConnectedEndpoint.CloseSend.
   753  func (e *connectedEndpoint) CloseSend() {
   754  	e.writeQueue.Close()
   755  }
   756  
   757  // Writable implements ConnectedEndpoint.Writable.
   758  func (e *connectedEndpoint) Writable() bool {
   759  	return e.writeQueue.IsWritable()
   760  }
   761  
   762  // EventUpdate implements ConnectedEndpoint.EventUpdate.
   763  func (*connectedEndpoint) EventUpdate() error {
   764  	return nil
   765  }
   766  
   767  // SendQueuedSize implements ConnectedEndpoint.SendQueuedSize.
   768  func (e *connectedEndpoint) SendQueuedSize() int64 {
   769  	return e.writeQueue.QueuedSize()
   770  }
   771  
   772  // SendMaxQueueSize implements ConnectedEndpoint.SendMaxQueueSize.
   773  func (e *connectedEndpoint) SendMaxQueueSize() int64 {
   774  	return e.writeQueue.MaxQueueSize()
   775  }
   776  
   777  // Release implements ConnectedEndpoint.Release.
   778  func (e *connectedEndpoint) Release(ctx context.Context) {
   779  	e.writeQueue.DecRef(ctx)
   780  }
   781  
   782  // CloseUnread implements ConnectedEndpoint.CloseUnread.
   783  func (e *connectedEndpoint) CloseUnread() {
   784  	e.writeQueue.CloseUnread()
   785  }
   786  
   787  // SetSendBufferSize implements ConnectedEndpoint.SetSendBufferSize.
   788  // SetSendBufferSize sets the send buffer size for the write queue to the
   789  // specified value.
   790  func (e *connectedEndpoint) SetSendBufferSize(v int64) (newSz int64) {
   791  	e.writeQueue.SetMaxQueueSize(v)
   792  	return v
   793  }
   794  
// baseEndpoint is an embeddable unix endpoint base used in both the connected
// and connectionless unix domain socket Endpoint implementations.
//
// Not to be used on its own.
//
// +stateify savable
type baseEndpoint struct {
	// Queue is the waiter queue that EventRegister and EventUnregister
	// operate on.
	*waiter.Queue
	tcpip.DefaultSocketOptionsHandler

	// endpointMutex protects the fields below. It is accessed through the
	// promoted Lock and Unlock methods.
	//
	// See the lock ordering comment in package kernel/epoll regarding when
	// this lock can safely be held.
	endpointMutex `state:"nosave"`

	// receiver allows Messages to be received. It is nil when the endpoint
	// cannot receive.
	receiver Receiver

	// connected allows messages to be sent and state information about the
	// connected endpoint to be read. It is nil when there is no peer.
	connected ConnectedEndpoint

	// path is not empty if the endpoint has been bound,
	// or may be used if the endpoint is connected.
	path string

	// ops is used to get socket level options.
	ops tcpip.SocketOptions
}
   825  
   826  // EventRegister implements waiter.Waitable.EventRegister.
   827  func (e *baseEndpoint) EventRegister(we *waiter.Entry) error {
   828  	e.Queue.EventRegister(we)
   829  	e.Lock()
   830  	c := e.connected
   831  	e.Unlock()
   832  	if c != nil {
   833  		if err := c.EventUpdate(); err != nil {
   834  			return err
   835  		}
   836  	}
   837  	return nil
   838  }
   839  
   840  // EventUnregister implements waiter.Waitable.EventUnregister.
   841  func (e *baseEndpoint) EventUnregister(we *waiter.Entry) {
   842  	e.Queue.EventUnregister(we)
   843  	e.Lock()
   844  	c := e.connected
   845  	e.Unlock()
   846  	if c != nil {
   847  		c.EventUpdate()
   848  	}
   849  }
   850  
   851  // Passcred implements Credentialer.Passcred.
   852  func (e *baseEndpoint) Passcred() bool {
   853  	return e.SocketOptions().GetPassCred()
   854  }
   855  
   856  // ConnectedPasscred implements Credentialer.ConnectedPasscred.
   857  func (e *baseEndpoint) ConnectedPasscred() bool {
   858  	e.Lock()
   859  	defer e.Unlock()
   860  	return e.connected != nil && e.connected.Passcred()
   861  }
   862  
   863  // Connected implements ConnectingEndpoint.Connected.
   864  //
   865  // Preconditions: e.mu must be held.
   866  func (e *baseEndpoint) Connected() bool {
   867  	return e.receiver != nil && e.connected != nil
   868  }
   869  
   870  // RecvMsg reads data and a control message from the endpoint.
   871  func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *Address) (int64, int64, ControlMessages, bool, func(), *syserr.Error) {
   872  	e.Lock()
   873  	receiver := e.receiver
   874  	e.Unlock()
   875  
   876  	if receiver == nil {
   877  		return 0, 0, ControlMessages{}, false, nil, syserr.ErrNotConnected
   878  	}
   879  
   880  	recvLen, msgLen, cms, cmt, a, notify, err := receiver.Recv(ctx, data, creds, numRights, peek)
   881  	if err != nil {
   882  		return 0, 0, ControlMessages{}, false, nil, err
   883  	}
   884  
   885  	var notifyFn func()
   886  	if notify {
   887  		notifyFn = receiver.RecvNotify
   888  	}
   889  
   890  	if addr != nil {
   891  		*addr = a
   892  	}
   893  	return recvLen, msgLen, cms, cmt, notifyFn, nil
   894  }
   895  
   896  // SendMsg writes data and a control message to the endpoint's peer.
   897  // This method does not block if the data cannot be written.
   898  func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, func(), *syserr.Error) {
   899  	e.Lock()
   900  	if !e.Connected() {
   901  		e.Unlock()
   902  		return 0, nil, syserr.ErrNotConnected
   903  	}
   904  	if to != nil {
   905  		e.Unlock()
   906  		return 0, nil, syserr.ErrAlreadyConnected
   907  	}
   908  
   909  	connected := e.connected
   910  	n, notify, err := connected.Send(ctx, data, c, Address{Addr: e.path})
   911  	e.Unlock()
   912  
   913  	var notifyFn func()
   914  	if notify {
   915  		notifyFn = connected.SendNotify
   916  	}
   917  
   918  	return n, notifyFn, err
   919  }
   920  
// SetSockOpt sets a socket option.
//
// This implementation ignores opt and always returns nil.
func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
	return nil
}
   925  
// SetSockOptInt sets an integer-valued socket option.
//
// No integer options are handled here: the call logs a warning naming opt,
// ignores v, and still returns nil rather than an error.
func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
	log.Warningf("Unsupported socket option: %d", opt)
	return nil
}
   930  
   931  func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
   932  	switch opt {
   933  	case tcpip.ReceiveQueueSizeOption:
   934  		v := 0
   935  		e.Lock()
   936  		if !e.Connected() {
   937  			e.Unlock()
   938  			return -1, &tcpip.ErrNotConnected{}
   939  		}
   940  		v = int(e.receiver.RecvQueuedSize())
   941  		e.Unlock()
   942  		if v < 0 {
   943  			return -1, &tcpip.ErrQueueSizeNotSupported{}
   944  		}
   945  		return v, nil
   946  
   947  	case tcpip.SendQueueSizeOption:
   948  		e.Lock()
   949  		if !e.Connected() {
   950  			e.Unlock()
   951  			return -1, &tcpip.ErrNotConnected{}
   952  		}
   953  		v := e.connected.SendQueuedSize()
   954  		e.Unlock()
   955  		if v < 0 {
   956  			return -1, &tcpip.ErrQueueSizeNotSupported{}
   957  		}
   958  		return int(v), nil
   959  
   960  	default:
   961  		log.Warningf("Unsupported socket option: %d", opt)
   962  		return -1, &tcpip.ErrUnknownProtocolOption{}
   963  	}
   964  }
   965  
// GetSockOpt implements tcpip.Endpoint.GetSockOpt.
//
// No options are supported: the call logs a warning with the dynamic type of
// opt and returns ErrUnknownProtocolOption.
func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error {
	log.Warningf("Unsupported socket option: %T", opt)
	return &tcpip.ErrUnknownProtocolOption{}
}
   971  
// LastError implements Endpoint.LastError.
//
// This implementation always returns nil.
func (*baseEndpoint) LastError() tcpip.Error {
	return nil
}
   976  
// SocketOptions implements Endpoint.SocketOptions.
//
// It returns a pointer to the endpoint's own options, not a copy.
func (e *baseEndpoint) SocketOptions() *tcpip.SocketOptions {
	return &e.ops
}
   981  
   982  // Shutdown closes the read and/or write end of the endpoint connection to its
   983  // peer.
   984  func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error {
   985  	e.Lock()
   986  	if !e.Connected() {
   987  		e.Unlock()
   988  		return syserr.ErrNotConnected
   989  	}
   990  
   991  	var (
   992  		r             = e.receiver
   993  		c             = e.connected
   994  		shutdownRead  = flags&tcpip.ShutdownRead != 0
   995  		shutdownWrite = flags&tcpip.ShutdownWrite != 0
   996  	)
   997  	if shutdownRead {
   998  		r.CloseRecv()
   999  	}
  1000  	if shutdownWrite {
  1001  		c.CloseSend()
  1002  	}
  1003  	e.Unlock()
  1004  
  1005  	// Don't hold e.Mutex while calling CloseNotify.
  1006  	if shutdownRead {
  1007  		r.CloseNotify()
  1008  	}
  1009  	if shutdownWrite {
  1010  		c.CloseNotify()
  1011  	}
  1012  
  1013  	return nil
  1014  }
  1015  
  1016  // GetLocalAddress returns the bound path.
  1017  func (e *baseEndpoint) GetLocalAddress() (Address, tcpip.Error) {
  1018  	e.Lock()
  1019  	defer e.Unlock()
  1020  	return Address{Addr: e.path}, nil
  1021  }
  1022  
  1023  // GetRemoteAddress returns the local address of the connected endpoint (if
  1024  // available).
  1025  func (e *baseEndpoint) GetRemoteAddress() (Address, tcpip.Error) {
  1026  	e.Lock()
  1027  	c := e.connected
  1028  	e.Unlock()
  1029  	if c != nil {
  1030  		return c.GetLocalAddress()
  1031  	}
  1032  	return Address{}, &tcpip.ErrNotConnected{}
  1033  }
  1034  
// Release implements BoundEndpoint.Release.
//
// It is a no-op and ignores its context argument.
func (*baseEndpoint) Release(context.Context) {
	// Binding a baseEndpoint doesn't take a reference.
}
  1039  
  1040  // stackHandler is just a stub implementation of tcpip.StackHandler to provide
  1041  // when initializing socketoptions.
  1042  type stackHandler struct {
  1043  }
  1044  
// Option implements tcpip.StackHandler.
//
// It is not supported by this stub and always panics with "unimplemented".
func (h *stackHandler) Option(option any) tcpip.Error {
	panic("unimplemented")
}
  1049  
// TransportProtocolOption implements tcpip.StackHandler.
//
// It is not supported by this stub and always panics with "unimplemented".
func (h *stackHandler) TransportProtocolOption(proto tcpip.TransportProtocolNumber, option tcpip.GettableTransportProtocolOption) tcpip.Error {
	panic("unimplemented")
}
  1054  
  1055  // getSendBufferLimits implements tcpip.GetSendBufferLimits.
  1056  //
  1057  // AF_UNIX sockets buffer sizes are not tied to the networking stack/namespace
  1058  // in linux but are bound by net.core.(wmem|rmem)_(max|default).
  1059  //
  1060  // In gVisor net.core sysctls today are not exposed or if exposed are currently
  1061  // tied to the networking stack in use. This makes it complicated for AF_UNIX
  1062  // when we are in a new namespace w/ no networking stack. As a result for now we
  1063  // define default/max values here in the unix socket implementation itself.
  1064  func getSendBufferLimits(tcpip.StackHandler) tcpip.SendBufferSizeOption {
  1065  	return tcpip.SendBufferSizeOption{
  1066  		Min:     minimumBufferSize,
  1067  		Default: defaultBufferSize,
  1068  		Max:     maxBufferSize,
  1069  	}
  1070  }
  1071  
  1072  // getReceiveBufferLimits implements tcpip.GetReceiveBufferLimits.
  1073  //
  1074  // We define min, max and default values for unix socket implementation. Unix
  1075  // sockets do not use receive buffer.
  1076  func getReceiveBufferLimits(tcpip.StackHandler) tcpip.ReceiveBufferSizeOption {
  1077  	return tcpip.ReceiveBufferSizeOption{
  1078  		Min:     minimumBufferSize,
  1079  		Default: defaultBufferSize,
  1080  		Max:     maxBufferSize,
  1081  	}
  1082  }