github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/socket/unix/transport/unix.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package transport contains the implementation of Unix endpoints.
    16  package transport
    17  
    18  import (
    19  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    20  	"github.com/SagerNet/gvisor/pkg/context"
    21  	"github.com/SagerNet/gvisor/pkg/log"
    22  	"github.com/SagerNet/gvisor/pkg/sync"
    23  	"github.com/SagerNet/gvisor/pkg/syserr"
    24  	"github.com/SagerNet/gvisor/pkg/tcpip"
    25  	"github.com/SagerNet/gvisor/pkg/tcpip/buffer"
    26  	"github.com/SagerNet/gvisor/pkg/waiter"
    27  )
    28  
const (
	// The minimum size of the send/receive buffers.
	minimumBufferSize = 4 << 10 // 4 KiB (match default in linux)

	// The default size of the send/receive buffers.
	defaultBufferSize = 208 << 10 // 208 KiB (default in linux for net.core.wmem_default)

	// The maximum permitted size for the send/receive buffers.
	maxBufferSize = 4 << 20 // 4 MiB (default in linux for net.core.wmem_max)
)
    39  
// A RightsControlMessage is a control message containing FDs.
//
// It is carried in ControlMessages.Rights; a nil value there means that no
// rights are attached.
//
// +stateify savable
type RightsControlMessage interface {
	// Clone returns a copy of the RightsControlMessage.
	Clone() RightsControlMessage

	// Release releases any resources owned by the RightsControlMessage.
	Release(ctx context.Context)
}
    50  
// A CredentialsControlMessage is a control message containing Unix credentials.
//
// It is carried in ControlMessages.Credentials; a nil value there means that
// no credentials are attached.
type CredentialsControlMessage interface {
	// Equals returns true iff the two messages are equal.
	//
	// streamQueueReceiver.Recv uses it to decide whether the credentials of
	// two consecutive messages match.
	Equals(CredentialsControlMessage) bool
}
    56  
// A ControlMessages represents a collection of socket control messages.
//
// Either field may be nil. The zero value holds no control messages and
// reports true from Empty.
//
// +stateify savable
type ControlMessages struct {
	// Rights is a control message containing FDs.
	Rights RightsControlMessage

	// Credentials is a control message containing Unix credentials.
	Credentials CredentialsControlMessage
}
    67  
    68  // Empty returns true iff the ControlMessages does not contain either
    69  // credentials or rights.
    70  func (c *ControlMessages) Empty() bool {
    71  	return c.Rights == nil && c.Credentials == nil
    72  }
    73  
    74  // Clone clones both the credentials and the rights.
    75  func (c *ControlMessages) Clone() ControlMessages {
    76  	cm := ControlMessages{}
    77  	if c.Rights != nil {
    78  		cm.Rights = c.Rights.Clone()
    79  	}
    80  	cm.Credentials = c.Credentials
    81  	return cm
    82  }
    83  
    84  // Release releases both the credentials and the rights.
    85  func (c *ControlMessages) Release(ctx context.Context) {
    86  	if c.Rights != nil {
    87  		c.Rights.Release(ctx)
    88  	}
    89  	*c = ControlMessages{}
    90  }
    91  
// Endpoint is the interface implemented by Unix transport protocol
// implementations that expose functionality like sendmsg, recvmsg, connect,
// etc. to Unix socket implementations.
type Endpoint interface {
	Credentialer
	waiter.Waitable

	// Close puts the endpoint in a closed state and frees all resources
	// associated with it.
	Close(ctx context.Context)

	// RecvMsg reads data and a control message from the endpoint. This method
	// does not block if there is no data pending.
	//
	// creds indicates if credential control messages are requested by the
	// caller. This is useful for determining if control messages can be
	// coalesced. creds is a hint and can be safely ignored by the
	// implementation if no coalescing is possible. It is fine to return
	// credential control messages when none were requested or to not return
	// credential control messages when they were requested.
	//
	// numRights is the number of SCM_RIGHTS FDs requested by the caller. This
	// is useful if one must allocate a buffer to receive a SCM_RIGHTS message
	// or determine if control messages can be coalesced. numRights is a hint
	// and can be safely ignored by the implementation if the number of
	// available SCM_RIGHTS FDs is known and no coalescing is possible. It is
	// fine for the returned number of SCM_RIGHTS FDs to be either higher or
	// lower than the requested number.
	//
	// If peek is true, no data should be consumed from the Endpoint. Any and
	// all data returned from a peek should be available in the next call to
	// RecvMsg.
	//
	// addr, if not nil, is populated with the address of the sender.
	//
	// recvLen is the number of bytes copied into data.
	//
	// msgLen is the length of the read message consumed for datagram Endpoints.
	// msgLen is always the same as recvLen for stream Endpoints.
	//
	// CMTruncated indicates that the numRights hint was used to receive fewer
	// than the total available SCM_RIGHTS FDs. Additional truncation may be
	// required by the caller.
	RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, err *syserr.Error)

	// SendMsg writes data and a control message to the endpoint's peer.
	// This method does not block if the data cannot be written.
	//
	// SendMsg does not take ownership of any of its arguments on error.
	SendMsg(context.Context, [][]byte, ControlMessages, BoundEndpoint) (int64, *syserr.Error)

	// Connect connects this endpoint directly to another.
	//
	// This should be called on the client endpoint, and the (bound)
	// endpoint passed in as a parameter.
	//
	// The error codes are the same as Connect.
	// NOTE(review): the sentence above refers to Connect itself; presumably
	// it means the connect(2) system call. Confirm.
	Connect(ctx context.Context, server BoundEndpoint) *syserr.Error

	// Shutdown closes the read and/or write end of the endpoint connection
	// to its peer.
	Shutdown(flags tcpip.ShutdownFlags) *syserr.Error

	// Listen puts the endpoint in "listen" mode, which allows it to accept
	// new connections.
	Listen(backlog int) *syserr.Error

	// Accept returns a new endpoint if a peer has established a connection
	// to an endpoint previously set to listen mode. This method does not
	// block if no new connections are available.
	//
	// No separate wait queue is returned; the returned Endpoint is itself a
	// waiter.Waitable.
	//
	// peerAddr if not nil will be populated with the address of the connected
	// peer on a successful accept.
	Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error)

	// Bind binds the endpoint to a specific local address and port.
	// Specifying a NIC is optional.
	//
	// An optional commit function will be executed atomically with respect
	// to binding the endpoint. If this returns an error, the bind will not
	// occur and the error will be propagated back to the caller.
	Bind(address tcpip.FullAddress, commit func() *syserr.Error) *syserr.Error

	// Type returns the socket type, typically either SockStream, SockDgram
	// or SockSeqpacket.
	Type() linux.SockType

	// GetLocalAddress returns the address to which the endpoint is bound.
	GetLocalAddress() (tcpip.FullAddress, tcpip.Error)

	// GetRemoteAddress returns the address to which the endpoint is
	// connected.
	GetRemoteAddress() (tcpip.FullAddress, tcpip.Error)

	// SetSockOpt sets a socket option.
	SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error

	// SetSockOptInt sets a socket option for simple cases when a value has
	// the int type.
	SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error

	// GetSockOpt gets a socket option.
	GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error

	// GetSockOptInt gets a socket option for simple cases when a return
	// value has the int type.
	GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error)

	// State returns the current state of the socket, as represented by Linux in
	// procfs.
	State() uint32

	// LastError clears and returns the last error reported by the endpoint.
	LastError() tcpip.Error

	// SocketOptions returns the structure which contains all the socket
	// level options.
	SocketOptions() *tcpip.SocketOptions
}
   211  
// A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket
// option.
//
// It is embedded in Endpoint.
type Credentialer interface {
	// Passcred returns whether or not the SO_PASSCRED socket option is
	// enabled on this end.
	Passcred() bool

	// ConnectedPasscred returns whether or not the SO_PASSCRED socket option
	// is enabled on the connected end.
	ConnectedPasscred() bool
}
   223  
// A BoundEndpoint is a unix endpoint that can be connected to.
//
// It is the target type accepted by Endpoint.Connect and Endpoint.SendMsg.
type BoundEndpoint interface {
	// BidirectionalConnect establishes a bi-directional connection between two
	// unix endpoints in an all-or-nothing manner. If an error occurs during
	// connecting, the state of neither endpoint should be modified.
	//
	// In order for an endpoint to establish such a bidirectional connection
	// with a BoundEndpoint, the endpoint calls the BidirectionalConnect method
	// on the BoundEndpoint and sends a representation of itself (the
	// ConnectingEndpoint) and a callback (returnConnect) to receive the
	// connection information (Receiver and ConnectedEndpoint) upon a
	// successful connect. The callback should only be called on a successful
	// connect.
	//
	// For a connection attempt to be successful, the ConnectingEndpoint must
	// be unconnected and not listening and the BoundEndpoint whose
	// BidirectionalConnect method is being called must be listening.
	//
	// This method will return syserr.ErrConnectionRefused on endpoints with a
	// type that isn't SockStream or SockSeqpacket.
	BidirectionalConnect(ctx context.Context, ep ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error

	// UnidirectionalConnect establishes a write-only connection to a unix
	// endpoint.
	//
	// An endpoint which calls UnidirectionalConnect and supports it itself must
	// not hold its own lock when calling UnidirectionalConnect.
	//
	// This method will return syserr.ErrConnectionRefused on a non-SockDgram
	// endpoint.
	UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error)

	// Passcred returns whether or not the SO_PASSCRED socket option is
	// enabled on this end.
	Passcred() bool

	// Release releases any resources held by the BoundEndpoint. It must be
	// called before dropping all references to a BoundEndpoint returned by a
	// function.
	Release(ctx context.Context)
}
   265  
// message represents a message passed over a Unix domain socket.
//
// +stateify savable
type message struct {
	// messageEntry is declared outside this part of the file.
	// NOTE(review): presumably the list entry that links messages inside a
	// queue; confirm against its declaration.
	messageEntry

	// Data is the Message payload.
	Data buffer.View

	// Control is auxiliary control message data that goes along with the
	// data.
	Control ControlMessages

	// Address is the bound address of the endpoint that sent the message.
	//
	// If the endpoint that sent the message is not bound, the Address is
	// the empty string.
	Address tcpip.FullAddress
}
   285  
   286  // Length returns number of bytes stored in the message.
   287  func (m *message) Length() int64 {
   288  	return int64(len(m.Data))
   289  }
   290  
   291  // Release releases any resources held by the message.
   292  func (m *message) Release(ctx context.Context) {
   293  	m.Control.Release(ctx)
   294  }
   295  
   296  // Peek returns a copy of the message.
   297  func (m *message) Peek() *message {
   298  	return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address}
   299  }
   300  
   301  // Truncate reduces the length of the message payload to n bytes.
   302  //
   303  // Preconditions: n <= m.Length().
   304  func (m *message) Truncate(n int64) {
   305  	m.Data.CapLength(int(n))
   306  }
   307  
// A Receiver can be used to receive Messages.
type Receiver interface {
	// Recv receives a single message. This method does not block.
	//
	// See Endpoint.RecvMsg for documentation on shared arguments.
	//
	// source is the address recorded on the received message.
	//
	// notify indicates if RecvNotify should be called.
	Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error)

	// RecvNotify notifies the Receiver of a successful Recv. This must not be
	// called while holding any endpoint locks.
	RecvNotify()

	// CloseRecv prevents the receiving of additional Messages.
	//
	// After CloseRecv is called, CloseNotify must also be called.
	CloseRecv()

	// CloseNotify notifies the Receiver of recv being closed. This must not be
	// called while holding any endpoint locks.
	CloseNotify()

	// Readable returns if messages should be attempted to be received. This
	// includes when read has been shutdown.
	Readable() bool

	// RecvQueuedSize returns the total amount of data currently receivable.
	// RecvQueuedSize should return -1 if the operation isn't supported.
	RecvQueuedSize() int64

	// RecvMaxQueueSize returns maximum value for RecvQueuedSize.
	// RecvMaxQueueSize should return -1 if the operation isn't supported.
	RecvMaxQueueSize() int64

	// Release releases any resources owned by the Receiver. It should be
	// called before dropping all references to a Receiver.
	Release(ctx context.Context)
}
   346  
// queueReceiver implements Receiver for datagram sockets.
//
// Every method delegates to readQueue; Release drops one reference on it.
//
// +stateify savable
type queueReceiver struct {
	// readQueue is the queue that messages are peeked or dequeued from.
	readQueue *queue
}
   353  
   354  // Recv implements Receiver.Recv.
   355  func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
   356  	var m *message
   357  	var notify bool
   358  	var err *syserr.Error
   359  	if peek {
   360  		m, err = q.readQueue.Peek()
   361  	} else {
   362  		m, notify, err = q.readQueue.Dequeue()
   363  	}
   364  	if err != nil {
   365  		return 0, 0, ControlMessages{}, false, tcpip.FullAddress{}, false, err
   366  	}
   367  	src := []byte(m.Data)
   368  	var copied int64
   369  	for i := 0; i < len(data) && len(src) > 0; i++ {
   370  		n := copy(data[i], src)
   371  		copied += int64(n)
   372  		src = src[n:]
   373  	}
   374  	return copied, int64(len(m.Data)), m.Control, false, m.Address, notify, nil
   375  }
   376  
   377  // RecvNotify implements Receiver.RecvNotify.
   378  func (q *queueReceiver) RecvNotify() {
   379  	q.readQueue.WriterQueue.Notify(waiter.WritableEvents)
   380  }
   381  
   382  // CloseNotify implements Receiver.CloseNotify.
   383  func (q *queueReceiver) CloseNotify() {
   384  	q.readQueue.ReaderQueue.Notify(waiter.ReadableEvents)
   385  	q.readQueue.WriterQueue.Notify(waiter.WritableEvents)
   386  }
   387  
   388  // CloseRecv implements Receiver.CloseRecv.
   389  func (q *queueReceiver) CloseRecv() {
   390  	q.readQueue.Close()
   391  }
   392  
   393  // Readable implements Receiver.Readable.
   394  func (q *queueReceiver) Readable() bool {
   395  	return q.readQueue.IsReadable()
   396  }
   397  
   398  // RecvQueuedSize implements Receiver.RecvQueuedSize.
   399  func (q *queueReceiver) RecvQueuedSize() int64 {
   400  	return q.readQueue.QueuedSize()
   401  }
   402  
   403  // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize.
   404  func (q *queueReceiver) RecvMaxQueueSize() int64 {
   405  	return q.readQueue.MaxQueueSize()
   406  }
   407  
   408  // Release implements Receiver.Release.
   409  func (q *queueReceiver) Release(ctx context.Context) {
   410  	q.readQueue.DecRef(ctx)
   411  }
   412  
// streamQueueReceiver implements Receiver for stream sockets.
//
// On top of the embedded queueReceiver it keeps the not-yet-consumed part of
// the most recently dequeued message:
//   - buffer holds the unread payload bytes,
//   - control holds the control messages that came with that message,
//   - addr holds that message's sender address.
//
// Recv, Readable and RecvQueuedSize hold mu while accessing these fields.
//
// +stateify savable
type streamQueueReceiver struct {
	queueReceiver

	mu      sync.Mutex `state:"nosave"`
	buffer  []byte
	control ControlMessages
	addr    tcpip.FullAddress
}
   424  
   425  func vecCopy(data [][]byte, buf []byte) (int64, [][]byte, []byte) {
   426  	var copied int64
   427  	for len(data) > 0 && len(buf) > 0 {
   428  		n := copy(data[0], buf)
   429  		copied += int64(n)
   430  		buf = buf[n:]
   431  		data[0] = data[0][n:]
   432  		if len(data[0]) == 0 {
   433  			data = data[1:]
   434  		}
   435  	}
   436  	return copied, data, buf
   437  }
   438  
   439  // Readable implements Receiver.Readable.
   440  func (q *streamQueueReceiver) Readable() bool {
   441  	q.mu.Lock()
   442  	bl := len(q.buffer)
   443  	r := q.readQueue.IsReadable()
   444  	q.mu.Unlock()
   445  	// We're readable if we have data in our buffer or if the queue receiver is
   446  	// readable.
   447  	return bl > 0 || r
   448  }
   449  
   450  // RecvQueuedSize implements Receiver.RecvQueuedSize.
   451  func (q *streamQueueReceiver) RecvQueuedSize() int64 {
   452  	q.mu.Lock()
   453  	bl := len(q.buffer)
   454  	qs := q.readQueue.QueuedSize()
   455  	q.mu.Unlock()
   456  	return int64(bl) + qs
   457  }
   458  
   459  // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize.
   460  func (q *streamQueueReceiver) RecvMaxQueueSize() int64 {
   461  	// The RecvMaxQueueSize() is the readQueue's MaxQueueSize() plus the largest
   462  	// message we can buffer which is also the largest message we can receive.
   463  	return 2 * q.readQueue.MaxQueueSize()
   464  }
   465  
// Recv implements Receiver.Recv.
//
// Data is served from q.buffer, which is refilled from the read queue when
// empty. When peeking, the buffered bytes are copied out without being
// consumed and a clone of the buffered control messages is returned.
//
// When not peeking, the buffered bytes are consumed and, while data still has
// room and no usable rights have been collected, further messages are
// dequeued and appended to the same read. Coalescing stops when:
//   - the queue returns an error (reported to the caller as a short read),
//   - wantCreds is set and the next message's credentials differ from those
//     already being returned (including one side having none),
//   - both the current result and the next message carry rights,
//   - no bytes could be copied from the next message.
//
// A message dequeued but not fully consumed stays in q.buffer, q.control and
// q.addr for the next call.
//
// Returns, in order: bytes copied, message length (always equal to bytes
// copied), the control messages, whether rights were dropped because
// numRights was 0, the address of the last message loaded, whether RecvNotify
// should be called, and an error (only possible on the initial dequeue).
func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
	q.mu.Lock()
	defer q.mu.Unlock()

	var notify bool

	// If we have no data in the endpoint, we need to get some.
	if len(q.buffer) == 0 {
		// Load the next message into a buffer, even if we are peeking. Peeking
		// won't consume the message, so it will be still available to be read
		// the next time Recv() is called.
		m, n, err := q.readQueue.Dequeue()
		if err != nil {
			return 0, 0, ControlMessages{}, false, tcpip.FullAddress{}, false, err
		}
		notify = n
		q.buffer = []byte(m.Data)
		q.control = m.Control
		q.addr = m.Address
	}

	var copied int64
	if peek {
		// Don't consume control message if we are peeking.
		c := q.control.Clone()

		// Don't consume data since we are peeking.
		copied, _, _ = vecCopy(data, q.buffer)

		return copied, copied, c, false, q.addr, notify, nil
	}

	// Consume data and control message since we are not peeking.
	copied, data, q.buffer = vecCopy(data, q.buffer)

	// Save the original state of q.control.
	c := q.control

	// Remove rights from q.control and leave behind just the creds.
	q.control.Rights = nil
	if !wantCreds {
		c.Credentials = nil
	}

	// The caller asked for no rights: release the ones we hold and report
	// the truncation.
	var cmTruncated bool
	if c.Rights != nil && numRights == 0 {
		c.Rights.Release(ctx)
		c.Rights = nil
		cmTruncated = true
	}

	haveRights := c.Rights != nil

	// If we have more capacity for data and haven't received any usable
	// rights.
	//
	// Linux never coalesces rights control messages.
	for !haveRights && len(data) > 0 {
		// Get a message from the readQueue.
		m, n, err := q.readQueue.Dequeue()
		if err != nil {
			// We already got some data, so ignore this error. This will
			// manifest as a short read to the user, which is what Linux
			// does.
			break
		}
		notify = notify || n
		// From here on the dequeued message is held in q.buffer/q.control;
		// breaking out of the loop leaves it there for the next Recv.
		q.buffer = []byte(m.Data)
		q.control = m.Control
		q.addr = m.Address

		if wantCreds {
			if (q.control.Credentials == nil) != (c.Credentials == nil) {
				// One message has credentials, the other does not.
				break
			}

			if q.control.Credentials != nil && c.Credentials != nil && !q.control.Credentials.Equals(c.Credentials) {
				// Both messages have credentials, but they don't match.
				break
			}
		}

		if numRights != 0 && c.Rights != nil && q.control.Rights != nil {
			// Both messages have rights.
			break
		}

		var cpd int64
		cpd, data, q.buffer = vecCopy(data, q.buffer)
		copied += cpd

		if cpd == 0 {
			// data was actually full.
			break
		}

		if q.control.Rights != nil {
			// Consume rights.
			if numRights == 0 {
				cmTruncated = true
				q.control.Rights.Release(ctx)
			} else {
				c.Rights = q.control.Rights
				haveRights = true
			}
			q.control.Rights = nil
		}
	}
	return copied, copied, c, cmTruncated, q.addr, notify, nil
}
   578  
   579  // Release implements Receiver.Release.
   580  func (q *streamQueueReceiver) Release(ctx context.Context) {
   581  	q.queueReceiver.Release(ctx)
   582  	q.control.Release(ctx)
   583  }
   584  
// A ConnectedEndpoint is an Endpoint that can be used to send Messages.
type ConnectedEndpoint interface {
	// Passcred implements Endpoint.Passcred.
	Passcred() bool

	// GetLocalAddress implements Endpoint.GetLocalAddress.
	GetLocalAddress() (tcpip.FullAddress, tcpip.Error)

	// Send sends a single message. This method does not block.
	//
	// n is the number of bytes sent.
	//
	// notify indicates if SendNotify should be called.
	//
	// syserr.ErrWouldBlock can be returned along with a partial write if
	// the caller should block to send the rest of the data.
	Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error)

	// SendNotify notifies the ConnectedEndpoint of a successful Send. This
	// must not be called while holding any endpoint locks.
	SendNotify()

	// CloseSend prevents the sending of additional Messages.
	//
	// After CloseSend is called, CloseNotify must also be called.
	CloseSend()

	// CloseNotify notifies the ConnectedEndpoint of send being closed. This
	// must not be called while holding any endpoint locks.
	CloseNotify()

	// Writable returns if messages should be attempted to be sent. This
	// includes when write has been shutdown.
	Writable() bool

	// EventUpdate lets the ConnectedEndpoint know that event registrations
	// have changed.
	EventUpdate()

	// SendQueuedSize returns the total amount of data currently queued for
	// sending. SendQueuedSize should return -1 if the operation isn't
	// supported.
	SendQueuedSize() int64

	// SendMaxQueueSize returns maximum value for SendQueuedSize.
	// SendMaxQueueSize should return -1 if the operation isn't supported.
	SendMaxQueueSize() int64

	// Release releases any resources owned by the ConnectedEndpoint. It should
	// be called before dropping all references to a ConnectedEndpoint.
	Release(ctx context.Context)

	// CloseUnread sets the fact that this end is closed with unread data to
	// the peer socket.
	CloseUnread()

	// SetSendBufferSize is called when the endpoint's send buffer size is
	// changed. v is the requested size and newSz is the size that was
	// applied.
	SetSendBufferSize(v int64) (newSz int64)
}
   643  
// connectedEndpoint is a ConnectedEndpoint implementation that sends by
// enqueueing messages onto writeQueue.
//
// +stateify savable
type connectedEndpoint struct {
	// endpoint represents the subset of the Endpoint functionality needed by
	// the connectedEndpoint. It is implemented by both connectionedEndpoint
	// and connectionlessEndpoint and allows the use of types which don't
	// fully implement Endpoint.
	endpoint interface {
		// Passcred implements Endpoint.Passcred.
		Passcred() bool

		// GetLocalAddress implements Endpoint.GetLocalAddress.
		GetLocalAddress() (tcpip.FullAddress, tcpip.Error)

		// Type implements Endpoint.Type.
		Type() linux.SockType
	}

	// writeQueue is the queue that Send enqueues messages onto. Release
	// drops one reference on it.
	writeQueue *queue
}
   663  
   664  // Passcred implements ConnectedEndpoint.Passcred.
   665  func (e *connectedEndpoint) Passcred() bool {
   666  	return e.endpoint.Passcred()
   667  }
   668  
   669  // GetLocalAddress implements ConnectedEndpoint.GetLocalAddress.
   670  func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
   671  	return e.endpoint.GetLocalAddress()
   672  }
   673  
   674  // Send implements ConnectedEndpoint.Send.
   675  func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
   676  	discardEmpty := false
   677  	truncate := false
   678  	if e.endpoint.Type() == linux.SOCK_STREAM {
   679  		// Discard empty stream packets. Since stream sockets don't
   680  		// preserve message boundaries, sending zero bytes is a no-op.
   681  		// In Linux, the receiver actually uses a zero-length receive
   682  		// as an indication that the stream was closed.
   683  		discardEmpty = true
   684  
   685  		// Since stream sockets don't preserve message boundaries, we
   686  		// can write only as much of the message as fits in the queue.
   687  		truncate = true
   688  	}
   689  
   690  	return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate)
   691  }
   692  
   693  // SendNotify implements ConnectedEndpoint.SendNotify.
   694  func (e *connectedEndpoint) SendNotify() {
   695  	e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents)
   696  }
   697  
   698  // CloseNotify implements ConnectedEndpoint.CloseNotify.
   699  func (e *connectedEndpoint) CloseNotify() {
   700  	e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents)
   701  	e.writeQueue.WriterQueue.Notify(waiter.WritableEvents)
   702  }
   703  
   704  // CloseSend implements ConnectedEndpoint.CloseSend.
   705  func (e *connectedEndpoint) CloseSend() {
   706  	e.writeQueue.Close()
   707  }
   708  
   709  // Writable implements ConnectedEndpoint.Writable.
   710  func (e *connectedEndpoint) Writable() bool {
   711  	return e.writeQueue.IsWritable()
   712  }
   713  
// EventUpdate implements ConnectedEndpoint.EventUpdate.
//
// It is a no-op: connectedEndpoint takes no action when event registrations
// change.
func (*connectedEndpoint) EventUpdate() {}
   716  
   717  // SendQueuedSize implements ConnectedEndpoint.SendQueuedSize.
   718  func (e *connectedEndpoint) SendQueuedSize() int64 {
   719  	return e.writeQueue.QueuedSize()
   720  }
   721  
   722  // SendMaxQueueSize implements ConnectedEndpoint.SendMaxQueueSize.
   723  func (e *connectedEndpoint) SendMaxQueueSize() int64 {
   724  	return e.writeQueue.MaxQueueSize()
   725  }
   726  
   727  // Release implements ConnectedEndpoint.Release.
   728  func (e *connectedEndpoint) Release(ctx context.Context) {
   729  	e.writeQueue.DecRef(ctx)
   730  }
   731  
   732  // CloseUnread implements ConnectedEndpoint.CloseUnread.
   733  func (e *connectedEndpoint) CloseUnread() {
   734  	e.writeQueue.CloseUnread()
   735  }
   736  
   737  // SetSendBufferSize implements ConnectedEndpoint.SetSendBufferSize.
   738  // SetSendBufferSize sets the send buffer size for the write queue to the
   739  // specified value.
   740  func (e *connectedEndpoint) SetSendBufferSize(v int64) (newSz int64) {
   741  	e.writeQueue.SetMaxQueueSize(v)
   742  	return v
   743  }
   744  
// baseEndpoint is an embeddable unix endpoint base used in both the connected
// and connectionless unix domain socket Endpoint implementations.
//
// Not to be used on its own.
//
// +stateify savable
type baseEndpoint struct {
	*waiter.Queue
	tcpip.DefaultSocketOptionsHandler

	// Mutex protects the below fields.
	//
	// See the lock ordering comment in package kernel/epoll regarding when
	// this lock can safely be held.
	sync.Mutex `state:"nosave"`

	// receiver allows Messages to be received.
	//
	// It is nil when there is nothing to receive from; RecvMsg then fails
	// with syserr.ErrNotConnected.
	receiver Receiver

	// connected allows messages to be sent and state information about the
	// connected endpoint to be read.
	//
	// It is nil when the endpoint has no peer.
	connected ConnectedEndpoint

	// path is not empty if the endpoint has been bound,
	// or may be used if the endpoint is connected.
	path string

	// ops is used to get socket level options.
	ops tcpip.SocketOptions
}
   775  
   776  // EventRegister implements waiter.Waitable.EventRegister.
   777  func (e *baseEndpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) {
   778  	e.Queue.EventRegister(we, mask)
   779  	e.Lock()
   780  	c := e.connected
   781  	e.Unlock()
   782  	if c != nil {
   783  		c.EventUpdate()
   784  	}
   785  }
   786  
   787  // EventUnregister implements waiter.Waitable.EventUnregister.
   788  func (e *baseEndpoint) EventUnregister(we *waiter.Entry) {
   789  	e.Queue.EventUnregister(we)
   790  	e.Lock()
   791  	c := e.connected
   792  	e.Unlock()
   793  	if c != nil {
   794  		c.EventUpdate()
   795  	}
   796  }
   797  
   798  // Passcred implements Credentialer.Passcred.
   799  func (e *baseEndpoint) Passcred() bool {
   800  	return e.SocketOptions().GetPassCred()
   801  }
   802  
   803  // ConnectedPasscred implements Credentialer.ConnectedPasscred.
   804  func (e *baseEndpoint) ConnectedPasscred() bool {
   805  	e.Lock()
   806  	defer e.Unlock()
   807  	return e.connected != nil && e.connected.Passcred()
   808  }
   809  
   810  // Connected implements ConnectingEndpoint.Connected.
   811  func (e *baseEndpoint) Connected() bool {
   812  	return e.receiver != nil && e.connected != nil
   813  }
   814  
   815  // RecvMsg reads data and a control message from the endpoint.
   816  func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *tcpip.FullAddress) (int64, int64, ControlMessages, bool, *syserr.Error) {
   817  	e.Lock()
   818  
   819  	receiver := e.receiver
   820  	if receiver == nil {
   821  		e.Unlock()
   822  		return 0, 0, ControlMessages{}, false, syserr.ErrNotConnected
   823  	}
   824  
   825  	recvLen, msgLen, cms, cmt, a, notify, err := receiver.Recv(ctx, data, creds, numRights, peek)
   826  	e.Unlock()
   827  	if err != nil {
   828  		return 0, 0, ControlMessages{}, false, err
   829  	}
   830  
   831  	if notify {
   832  		receiver.RecvNotify()
   833  	}
   834  
   835  	if addr != nil {
   836  		*addr = a
   837  	}
   838  	return recvLen, msgLen, cms, cmt, nil
   839  }
   840  
   841  // SendMsg writes data and a control message to the endpoint's peer.
   842  // This method does not block if the data cannot be written.
   843  func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) {
   844  	e.Lock()
   845  	if !e.Connected() {
   846  		e.Unlock()
   847  		return 0, syserr.ErrNotConnected
   848  	}
   849  	if to != nil {
   850  		e.Unlock()
   851  		return 0, syserr.ErrAlreadyConnected
   852  	}
   853  
   854  	connected := e.connected
   855  	n, notify, err := connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)})
   856  	e.Unlock()
   857  
   858  	if notify {
   859  		connected.SendNotify()
   860  	}
   861  
   862  	return n, err
   863  }
   864  
   865  // SetSockOpt sets a socket option.
   866  func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
   867  	return nil
   868  }
   869  
   870  func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
   871  	log.Warningf("Unsupported socket option: %d", opt)
   872  	return nil
   873  }
   874  
   875  func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) {
   876  	switch opt {
   877  	case tcpip.ReceiveQueueSizeOption:
   878  		v := 0
   879  		e.Lock()
   880  		if !e.Connected() {
   881  			e.Unlock()
   882  			return -1, &tcpip.ErrNotConnected{}
   883  		}
   884  		v = int(e.receiver.RecvQueuedSize())
   885  		e.Unlock()
   886  		if v < 0 {
   887  			return -1, &tcpip.ErrQueueSizeNotSupported{}
   888  		}
   889  		return v, nil
   890  
   891  	case tcpip.SendQueueSizeOption:
   892  		e.Lock()
   893  		if !e.Connected() {
   894  			e.Unlock()
   895  			return -1, &tcpip.ErrNotConnected{}
   896  		}
   897  		v := e.connected.SendQueuedSize()
   898  		e.Unlock()
   899  		if v < 0 {
   900  			return -1, &tcpip.ErrQueueSizeNotSupported{}
   901  		}
   902  		return int(v), nil
   903  
   904  	default:
   905  		log.Warningf("Unsupported socket option: %d", opt)
   906  		return -1, &tcpip.ErrUnknownProtocolOption{}
   907  	}
   908  }
   909  
   910  // GetSockOpt implements tcpip.Endpoint.GetSockOpt.
   911  func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error {
   912  	log.Warningf("Unsupported socket option: %T", opt)
   913  	return &tcpip.ErrUnknownProtocolOption{}
   914  }
   915  
   916  // LastError implements Endpoint.LastError.
   917  func (*baseEndpoint) LastError() tcpip.Error {
   918  	return nil
   919  }
   920  
   921  // SocketOptions implements Endpoint.SocketOptions.
   922  func (e *baseEndpoint) SocketOptions() *tcpip.SocketOptions {
   923  	return &e.ops
   924  }
   925  
   926  // Shutdown closes the read and/or write end of the endpoint connection to its
   927  // peer.
   928  func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error {
   929  	e.Lock()
   930  	if !e.Connected() {
   931  		e.Unlock()
   932  		return syserr.ErrNotConnected
   933  	}
   934  
   935  	var (
   936  		r             = e.receiver
   937  		c             = e.connected
   938  		shutdownRead  = flags&tcpip.ShutdownRead != 0
   939  		shutdownWrite = flags&tcpip.ShutdownWrite != 0
   940  	)
   941  	if shutdownRead {
   942  		r.CloseRecv()
   943  	}
   944  	if shutdownWrite {
   945  		c.CloseSend()
   946  	}
   947  	e.Unlock()
   948  
   949  	// Don't hold e.Mutex while calling CloseNotify.
   950  	if shutdownRead {
   951  		r.CloseNotify()
   952  	}
   953  	if shutdownWrite {
   954  		c.CloseNotify()
   955  	}
   956  
   957  	return nil
   958  }
   959  
   960  // GetLocalAddress returns the bound path.
   961  func (e *baseEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
   962  	e.Lock()
   963  	defer e.Unlock()
   964  	return tcpip.FullAddress{Addr: tcpip.Address(e.path)}, nil
   965  }
   966  
   967  // GetRemoteAddress returns the local address of the connected endpoint (if
   968  // available).
   969  func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) {
   970  	e.Lock()
   971  	c := e.connected
   972  	e.Unlock()
   973  	if c != nil {
   974  		return c.GetLocalAddress()
   975  	}
   976  	return tcpip.FullAddress{}, &tcpip.ErrNotConnected{}
   977  }
   978  
   979  // Release implements BoundEndpoint.Release.
   980  func (*baseEndpoint) Release(context.Context) {
   981  	// Binding a baseEndpoint doesn't take a reference.
   982  }
   983  
   984  // stackHandler is just a stub implementation of tcpip.StackHandler to provide
   985  // when initializing socketoptions.
   986  type stackHandler struct {
   987  }
   988  
   989  // Option implements tcpip.StackHandler.
   990  func (h *stackHandler) Option(option interface{}) tcpip.Error {
   991  	panic("unimplemented")
   992  }
   993  
   994  // TransportProtocolOption implements tcpip.StackHandler.
   995  func (h *stackHandler) TransportProtocolOption(proto tcpip.TransportProtocolNumber, option tcpip.GettableTransportProtocolOption) tcpip.Error {
   996  	panic("unimplemented")
   997  }
   998  
   999  // getSendBufferLimits implements tcpip.GetSendBufferLimits.
  1000  //
  1001  // AF_UNIX sockets buffer sizes are not tied to the networking stack/namespace
  1002  // in linux but are bound by net.core.(wmem|rmem)_(max|default).
  1003  //
  1004  // In gVisor net.core sysctls today are not exposed or if exposed are currently
  1005  // tied to the networking stack in use. This makes it complicated for AF_UNIX
  1006  // when we are in a new namespace w/ no networking stack. As a result for now we
  1007  // define default/max values here in the unix socket implementation itself.
  1008  func getSendBufferLimits(tcpip.StackHandler) tcpip.SendBufferSizeOption {
  1009  	return tcpip.SendBufferSizeOption{
  1010  		Min:     minimumBufferSize,
  1011  		Default: defaultBufferSize,
  1012  		Max:     maxBufferSize,
  1013  	}
  1014  }
  1015  
  1016  // getReceiveBufferLimits implements tcpip.GetReceiveBufferLimits.
  1017  //
  1018  // We define min, max and default values for unix socket implementation. Unix
  1019  // sockets do not use receive buffer.
  1020  func getReceiveBufferLimits(tcpip.StackHandler) tcpip.ReceiveBufferSizeOption {
  1021  	return tcpip.ReceiveBufferSizeOption{
  1022  		Min:     minimumBufferSize,
  1023  		Default: defaultBufferSize,
  1024  		Max:     maxBufferSize,
  1025  	}
  1026  }