github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/socket/unix/unix.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package unix provides an implementation of the socket.Socket interface for
    16  // the AF_UNIX protocol family.
    17  package unix
    18  
    19  import (
    20  	"fmt"
    21  	"strings"
    22  
    23  	"golang.org/x/sys/unix"
    24  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    25  	"github.com/SagerNet/gvisor/pkg/context"
    26  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    27  	"github.com/SagerNet/gvisor/pkg/fspath"
    28  	"github.com/SagerNet/gvisor/pkg/hostarch"
    29  	"github.com/SagerNet/gvisor/pkg/marshal"
    30  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    31  	"github.com/SagerNet/gvisor/pkg/sentry/fs"
    32  	"github.com/SagerNet/gvisor/pkg/sentry/fs/fsutil"
    33  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    34  	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
    35  	"github.com/SagerNet/gvisor/pkg/sentry/socket"
    36  	"github.com/SagerNet/gvisor/pkg/sentry/socket/control"
    37  	"github.com/SagerNet/gvisor/pkg/sentry/socket/netstack"
    38  	"github.com/SagerNet/gvisor/pkg/sentry/socket/unix/transport"
    39  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    40  	"github.com/SagerNet/gvisor/pkg/syserr"
    41  	"github.com/SagerNet/gvisor/pkg/syserror"
    42  	"github.com/SagerNet/gvisor/pkg/tcpip"
    43  	"github.com/SagerNet/gvisor/pkg/usermem"
    44  	"github.com/SagerNet/gvisor/pkg/waiter"
    45  )
    46  
// SocketOperations is a Unix socket. It is similar to a netstack socket,
// except it is backed by a transport.Endpoint instead of a tcpip.Endpoint.
//
// It combines a set of embedded fsutil helpers with the reference counting
// in socketOperationsRefs and the socket state in socketOpsCommon.
//
// +stateify savable
type SocketOperations struct {
	// Embedded fsutil helpers. Each one is tagged `state:"nosave"`.
	fsutil.FilePipeSeek             `state:"nosave"`
	fsutil.FileNotDirReaddir        `state:"nosave"`
	fsutil.FileNoFsync              `state:"nosave"`
	fsutil.FileNoMMap               `state:"nosave"`
	fsutil.FileNoSplice             `state:"nosave"`
	fsutil.FileNoopFlush            `state:"nosave"`
	fsutil.FileUseInodeUnstableAttr `state:"nosave"`

	// socketOperationsRefs provides the reference count that is initialized
	// in NewWithDirent and dropped through DecRef.
	socketOperationsRefs
	// socketOpsCommon carries the endpoint, the socket type and the abstract
	// namespace binding.
	socketOpsCommon
}
    63  
    64  // New creates a new unix socket.
    65  func New(ctx context.Context, endpoint transport.Endpoint, stype linux.SockType) *fs.File {
    66  	dirent := socket.NewDirent(ctx, unixSocketDevice)
    67  	defer dirent.DecRef(ctx)
    68  	return NewWithDirent(ctx, dirent, endpoint, stype, fs.FileFlags{Read: true, Write: true, NonSeekable: true})
    69  }
    70  
    71  // NewWithDirent creates a new unix socket using an existing dirent.
    72  func NewWithDirent(ctx context.Context, d *fs.Dirent, ep transport.Endpoint, stype linux.SockType, flags fs.FileFlags) *fs.File {
    73  	// You can create AF_UNIX, SOCK_RAW sockets. They're the same as
    74  	// SOCK_DGRAM and don't require CAP_NET_RAW.
    75  	if stype == linux.SOCK_RAW {
    76  		stype = linux.SOCK_DGRAM
    77  	}
    78  
    79  	s := SocketOperations{
    80  		socketOpsCommon: socketOpsCommon{
    81  			ep:    ep,
    82  			stype: stype,
    83  		},
    84  	}
    85  	s.InitRefs()
    86  	return fs.NewFile(ctx, d, flags, &s)
    87  }
    88  
    89  // DecRef implements RefCounter.DecRef.
    90  func (s *SocketOperations) DecRef(ctx context.Context) {
    91  	s.socketOperationsRefs.DecRef(func() {
    92  		s.ep.Close(ctx)
    93  		if s.abstractNamespace != nil {
    94  			s.abstractNamespace.Remove(s.abstractName, s)
    95  		}
    96  	})
    97  }
    98  
// Release implements fs.FileOperations.Release.
func (s *SocketOperations) Release(ctx context.Context) {
	// Release only decrements a reference on s because s may be referenced in
	// the abstract socket namespace. The endpoint is closed by DecRef's
	// destructor rather than directly here.
	s.DecRef(ctx)
}
   105  
// socketOpsCommon contains the socket operations common to VFS1 and VFS2.
//
// +stateify savable
type socketOpsCommon struct {
	socket.SendReceiveTimeout

	// ep is the transport endpoint that backs this socket; the methods of
	// socketOpsCommon delegate to it.
	ep transport.Endpoint
	// stype is the socket type. isPacket expects it to be one of SOCK_STREAM,
	// SOCK_DGRAM or SOCK_SEQPACKET and panics otherwise.
	stype linux.SockType

	// abstractName and abstractNamespace indicate the name and namespace of the
	// socket if it is bound to an abstract socket namespace. Once the socket is
	// bound, they cannot be modified.
	abstractName      string
	abstractNamespace *kernel.AbstractSocketNamespace
}
   121  
   122  func (s *socketOpsCommon) isPacket() bool {
   123  	switch s.stype {
   124  	case linux.SOCK_DGRAM, linux.SOCK_SEQPACKET:
   125  		return true
   126  	case linux.SOCK_STREAM:
   127  		return false
   128  	default:
   129  		// We shouldn't have allowed any other socket types during creation.
   130  		panic(fmt.Sprintf("Invalid socket type %d", s.stype))
   131  	}
   132  }
   133  
   134  // Endpoint extracts the transport.Endpoint.
   135  func (s *socketOpsCommon) Endpoint() transport.Endpoint {
   136  	return s.ep
   137  }
   138  
   139  // extractPath extracts and validates the address.
   140  func extractPath(sockaddr []byte) (string, *syserr.Error) {
   141  	addr, family, err := socket.AddressAndFamily(sockaddr)
   142  	if err != nil {
   143  		if err == syserr.ErrAddressFamilyNotSupported {
   144  			err = syserr.ErrInvalidArgument
   145  		}
   146  		return "", err
   147  	}
   148  	if family != linux.AF_UNIX {
   149  		return "", syserr.ErrInvalidArgument
   150  	}
   151  
   152  	// The address is trimmed by GetAddress.
   153  	p := string(addr.Addr)
   154  	if p == "" {
   155  		// Not allowed.
   156  		return "", syserr.ErrInvalidArgument
   157  	}
   158  	if p[len(p)-1] == '/' {
   159  		// Weird, they tried to bind '/a/b/c/'?
   160  		return "", syserr.ErrIsDir
   161  	}
   162  
   163  	return p, nil
   164  }
   165  
   166  // GetPeerName implements the linux syscall getpeername(2) for sockets backed by
   167  // a transport.Endpoint.
   168  func (s *socketOpsCommon) GetPeerName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
   169  	addr, err := s.ep.GetRemoteAddress()
   170  	if err != nil {
   171  		return nil, 0, syserr.TranslateNetstackError(err)
   172  	}
   173  
   174  	a, l := socket.ConvertAddress(linux.AF_UNIX, addr)
   175  	return a, l, nil
   176  }
   177  
   178  // GetSockName implements the linux syscall getsockname(2) for sockets backed by
   179  // a transport.Endpoint.
   180  func (s *socketOpsCommon) GetSockName(t *kernel.Task) (linux.SockAddr, uint32, *syserr.Error) {
   181  	addr, err := s.ep.GetLocalAddress()
   182  	if err != nil {
   183  		return nil, 0, syserr.TranslateNetstackError(err)
   184  	}
   185  
   186  	a, l := socket.ConvertAddress(linux.AF_UNIX, addr)
   187  	return a, l, nil
   188  }
   189  
   190  // Ioctl implements fs.FileOperations.Ioctl.
   191  func (s *SocketOperations) Ioctl(ctx context.Context, _ *fs.File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
   192  	return netstack.Ioctl(ctx, s.ep, io, args)
   193  }
   194  
   195  // GetSockOpt implements the linux syscall getsockopt(2) for sockets backed by
   196  // a transport.Endpoint.
   197  func (s *SocketOperations) GetSockOpt(t *kernel.Task, level, name int, outPtr hostarch.Addr, outLen int) (marshal.Marshallable, *syserr.Error) {
   198  	return netstack.GetSockOpt(t, s, s.ep, linux.AF_UNIX, s.ep.Type(), level, name, outPtr, outLen)
   199  }
   200  
   201  // Listen implements the linux syscall listen(2) for sockets backed by
   202  // a transport.Endpoint.
   203  func (s *socketOpsCommon) Listen(t *kernel.Task, backlog int) *syserr.Error {
   204  	return s.ep.Listen(backlog)
   205  }
   206  
   207  // blockingAccept implements a blocking version of accept(2), that is, if no
   208  // connections are ready to be accept, it will block until one becomes ready.
   209  func (s *SocketOperations) blockingAccept(t *kernel.Task, peerAddr *tcpip.FullAddress) (transport.Endpoint, *syserr.Error) {
   210  	// Register for notifications.
   211  	e, ch := waiter.NewChannelEntry(nil)
   212  	s.EventRegister(&e, waiter.ReadableEvents)
   213  	defer s.EventUnregister(&e)
   214  
   215  	// Try to accept the connection; if it fails, then wait until we get a
   216  	// notification.
   217  	for {
   218  		if ep, err := s.ep.Accept(peerAddr); err != syserr.ErrWouldBlock {
   219  			return ep, err
   220  		}
   221  
   222  		if err := t.Block(ch); err != nil {
   223  			return nil, syserr.FromError(err)
   224  		}
   225  	}
   226  }
   227  
   228  // Accept implements the linux syscall accept(2) for sockets backed by
   229  // a transport.Endpoint.
   230  func (s *SocketOperations) Accept(t *kernel.Task, peerRequested bool, flags int, blocking bool) (int32, linux.SockAddr, uint32, *syserr.Error) {
   231  	var peerAddr *tcpip.FullAddress
   232  	if peerRequested {
   233  		peerAddr = &tcpip.FullAddress{}
   234  	}
   235  	ep, err := s.ep.Accept(peerAddr)
   236  	if err != nil {
   237  		if err != syserr.ErrWouldBlock || !blocking {
   238  			return 0, nil, 0, err
   239  		}
   240  
   241  		var err *syserr.Error
   242  		ep, err = s.blockingAccept(t, peerAddr)
   243  		if err != nil {
   244  			return 0, nil, 0, err
   245  		}
   246  	}
   247  
   248  	ns := New(t, ep, s.stype)
   249  	defer ns.DecRef(t)
   250  
   251  	if flags&linux.SOCK_NONBLOCK != 0 {
   252  		flags := ns.Flags()
   253  		flags.NonBlocking = true
   254  		ns.SetFlags(flags.Settable())
   255  	}
   256  
   257  	var addr linux.SockAddr
   258  	var addrLen uint32
   259  	if peerAddr != nil {
   260  		addr, addrLen = socket.ConvertAddress(linux.AF_UNIX, *peerAddr)
   261  	}
   262  
   263  	fd, e := t.NewFDFrom(0, ns, kernel.FDFlags{
   264  		CloseOnExec: flags&linux.SOCK_CLOEXEC != 0,
   265  	})
   266  	if e != nil {
   267  		return 0, nil, 0, syserr.FromError(e)
   268  	}
   269  
   270  	t.Kernel().RecordSocket(ns)
   271  
   272  	return fd, addr, addrLen, nil
   273  }
   274  
   275  // Bind implements the linux syscall bind(2) for unix sockets.
   276  func (s *SocketOperations) Bind(t *kernel.Task, sockaddr []byte) *syserr.Error {
   277  	p, e := extractPath(sockaddr)
   278  	if e != nil {
   279  		return e
   280  	}
   281  
   282  	bep, ok := s.ep.(transport.BoundEndpoint)
   283  	if !ok {
   284  		// This socket can't be bound.
   285  		return syserr.ErrInvalidArgument
   286  	}
   287  
   288  	return s.ep.Bind(tcpip.FullAddress{Addr: tcpip.Address(p)}, func() *syserr.Error {
   289  		// Is it abstract?
   290  		if p[0] == 0 {
   291  			if t.IsNetworkNamespaced() {
   292  				return syserr.ErrInvalidEndpointState
   293  			}
   294  			asn := t.AbstractSockets()
   295  			name := p[1:]
   296  			if err := asn.Bind(t, name, bep, s); err != nil {
   297  				// syserr.ErrPortInUse corresponds to EADDRINUSE.
   298  				return syserr.ErrPortInUse
   299  			}
   300  			s.abstractName = name
   301  			s.abstractNamespace = asn
   302  		} else {
   303  			// The parent and name.
   304  			var d *fs.Dirent
   305  			var name string
   306  
   307  			cwd := t.FSContext().WorkingDirectory()
   308  			defer cwd.DecRef(t)
   309  
   310  			// Is there no slash at all?
   311  			if !strings.Contains(p, "/") {
   312  				d = cwd
   313  				name = p
   314  			} else {
   315  				root := t.FSContext().RootDirectory()
   316  				defer root.DecRef(t)
   317  				// Find the last path component, we know that something follows
   318  				// that final slash, otherwise extractPath() would have failed.
   319  				lastSlash := strings.LastIndex(p, "/")
   320  				subPath := p[:lastSlash]
   321  				if subPath == "" {
   322  					// Fix up subpath in case file is in root.
   323  					subPath = "/"
   324  				}
   325  				var err error
   326  				remainingTraversals := uint(fs.DefaultTraversalLimit)
   327  				d, err = t.MountNamespace().FindInode(t, root, cwd, subPath, &remainingTraversals)
   328  				if err != nil {
   329  					// No path available.
   330  					return syserr.ErrNoSuchFile
   331  				}
   332  				defer d.DecRef(t)
   333  				name = p[lastSlash+1:]
   334  			}
   335  
   336  			// Create the socket.
   337  			//
   338  			// Note that the file permissions here are not set correctly (see
   339  			// github.com/SagerNet/issue/2324). There is no convenient way to get permissions
   340  			// on the socket referred to by s, so we will leave this discrepancy
   341  			// unresolved until VFS2 replaces this code.
   342  			childDir, err := d.Bind(t, t.FSContext().RootDirectory(), name, bep, fs.FilePermissions{User: fs.PermMask{Read: true}})
   343  			if err != nil {
   344  				return syserr.ErrPortInUse
   345  			}
   346  			childDir.DecRef(t)
   347  		}
   348  
   349  		return nil
   350  	})
   351  }
   352  
   353  // extractEndpoint retrieves the transport.BoundEndpoint associated with a Unix
   354  // socket path. The Release must be called on the transport.BoundEndpoint when
   355  // the caller is done with it.
   356  func extractEndpoint(t *kernel.Task, sockaddr []byte) (transport.BoundEndpoint, *syserr.Error) {
   357  	path, err := extractPath(sockaddr)
   358  	if err != nil {
   359  		return nil, err
   360  	}
   361  
   362  	// Is it abstract?
   363  	if path[0] == 0 {
   364  		if t.IsNetworkNamespaced() {
   365  			return nil, syserr.ErrInvalidArgument
   366  		}
   367  
   368  		ep := t.AbstractSockets().BoundEndpoint(path[1:])
   369  		if ep == nil {
   370  			// No socket found.
   371  			return nil, syserr.ErrConnectionRefused
   372  		}
   373  
   374  		return ep, nil
   375  	}
   376  
   377  	if kernel.VFS2Enabled {
   378  		p := fspath.Parse(path)
   379  		root := t.FSContext().RootDirectoryVFS2()
   380  		start := root
   381  		relPath := !p.Absolute
   382  		if relPath {
   383  			start = t.FSContext().WorkingDirectoryVFS2()
   384  		}
   385  		pop := vfs.PathOperation{
   386  			Root:               root,
   387  			Start:              start,
   388  			Path:               p,
   389  			FollowFinalSymlink: true,
   390  		}
   391  		ep, e := t.Kernel().VFS().BoundEndpointAt(t, t.Credentials(), &pop, &vfs.BoundEndpointOptions{path})
   392  		root.DecRef(t)
   393  		if relPath {
   394  			start.DecRef(t)
   395  		}
   396  		if e != nil {
   397  			return nil, syserr.FromError(e)
   398  		}
   399  		return ep, nil
   400  	}
   401  
   402  	// Find the node in the filesystem.
   403  	root := t.FSContext().RootDirectory()
   404  	cwd := t.FSContext().WorkingDirectory()
   405  	remainingTraversals := uint(fs.DefaultTraversalLimit)
   406  	d, e := t.MountNamespace().FindInode(t, root, cwd, path, &remainingTraversals)
   407  	cwd.DecRef(t)
   408  	root.DecRef(t)
   409  	if e != nil {
   410  		return nil, syserr.FromError(e)
   411  	}
   412  
   413  	// Extract the endpoint if one is there.
   414  	ep := d.Inode.BoundEndpoint(path)
   415  	d.DecRef(t)
   416  	if ep == nil {
   417  		// No socket!
   418  		return nil, syserr.ErrConnectionRefused
   419  	}
   420  	return ep, nil
   421  }
   422  
   423  // Connect implements the linux syscall connect(2) for unix sockets.
   424  func (s *socketOpsCommon) Connect(t *kernel.Task, sockaddr []byte, blocking bool) *syserr.Error {
   425  	ep, err := extractEndpoint(t, sockaddr)
   426  	if err != nil {
   427  		return err
   428  	}
   429  	defer ep.Release(t)
   430  
   431  	// Connect the server endpoint.
   432  	err = s.ep.Connect(t, ep)
   433  
   434  	if err == syserr.ErrWrongProtocolForSocket {
   435  		// Linux for abstract sockets returns ErrConnectionRefused
   436  		// instead of ErrWrongProtocolForSocket.
   437  		path, _ := extractPath(sockaddr)
   438  		if len(path) > 0 && path[0] == 0 {
   439  			err = syserr.ErrConnectionRefused
   440  		}
   441  	}
   442  
   443  	return err
   444  }
   445  
   446  // Write implements fs.FileOperations.Write.
   447  func (s *SocketOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
   448  	t := kernel.TaskFromContext(ctx)
   449  	ctrl := control.New(t, s.ep, nil)
   450  
   451  	if src.NumBytes() == 0 {
   452  		nInt, err := s.ep.SendMsg(ctx, [][]byte{}, ctrl, nil)
   453  		return int64(nInt), err.ToError()
   454  	}
   455  
   456  	return src.CopyInTo(ctx, &EndpointWriter{
   457  		Ctx:      ctx,
   458  		Endpoint: s.ep,
   459  		Control:  ctrl,
   460  		To:       nil,
   461  	})
   462  }
   463  
   464  // SendMsg implements the linux syscall sendmsg(2) for unix sockets backed by
   465  // a transport.Endpoint.
   466  func (s *socketOpsCommon) SendMsg(t *kernel.Task, src usermem.IOSequence, to []byte, flags int, haveDeadline bool, deadline ktime.Time, controlMessages socket.ControlMessages) (int, *syserr.Error) {
   467  	w := EndpointWriter{
   468  		Ctx:      t,
   469  		Endpoint: s.ep,
   470  		Control:  controlMessages.Unix,
   471  		To:       nil,
   472  	}
   473  	if len(to) > 0 {
   474  		switch s.stype {
   475  		case linux.SOCK_SEQPACKET:
   476  			// to is ignored.
   477  		case linux.SOCK_STREAM:
   478  			if s.State() == linux.SS_CONNECTED {
   479  				return 0, syserr.ErrAlreadyConnected
   480  			}
   481  			return 0, syserr.ErrNotSupported
   482  		default:
   483  			ep, err := extractEndpoint(t, to)
   484  			if err != nil {
   485  				return 0, err
   486  			}
   487  			defer ep.Release(t)
   488  			w.To = ep
   489  
   490  			if ep.Passcred() && w.Control.Credentials == nil {
   491  				w.Control.Credentials = control.MakeCreds(t)
   492  			}
   493  		}
   494  	}
   495  
   496  	n, err := src.CopyInTo(t, &w)
   497  	if err != syserror.ErrWouldBlock || flags&linux.MSG_DONTWAIT != 0 {
   498  		return int(n), syserr.FromError(err)
   499  	}
   500  
   501  	// Only send SCM Rights once (see net/unix/af_unix.c:unix_stream_sendmsg).
   502  	w.Control.Rights = nil
   503  
   504  	// We'll have to block. Register for notification and keep trying to
   505  	// send all the data.
   506  	e, ch := waiter.NewChannelEntry(nil)
   507  	s.EventRegister(&e, waiter.WritableEvents)
   508  	defer s.EventUnregister(&e)
   509  
   510  	total := n
   511  	for {
   512  		// Shorten src to reflect bytes previously written.
   513  		src = src.DropFirst64(n)
   514  
   515  		n, err = src.CopyInTo(t, &w)
   516  		total += n
   517  		if err != syserror.ErrWouldBlock {
   518  			break
   519  		}
   520  
   521  		if err = t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
   522  			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
   523  				err = syserror.ErrWouldBlock
   524  			}
   525  			break
   526  		}
   527  	}
   528  
   529  	return int(total), syserr.FromError(err)
   530  }
   531  
   532  // Passcred implements transport.Credentialer.Passcred.
   533  func (s *socketOpsCommon) Passcred() bool {
   534  	return s.ep.Passcred()
   535  }
   536  
   537  // ConnectedPasscred implements transport.Credentialer.ConnectedPasscred.
   538  func (s *socketOpsCommon) ConnectedPasscred() bool {
   539  	return s.ep.ConnectedPasscred()
   540  }
   541  
   542  // Readiness implements waiter.Waitable.Readiness.
   543  func (s *socketOpsCommon) Readiness(mask waiter.EventMask) waiter.EventMask {
   544  	return s.ep.Readiness(mask)
   545  }
   546  
   547  // EventRegister implements waiter.Waitable.EventRegister.
   548  func (s *socketOpsCommon) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
   549  	s.ep.EventRegister(e, mask)
   550  }
   551  
   552  // EventUnregister implements waiter.Waitable.EventUnregister.
   553  func (s *socketOpsCommon) EventUnregister(e *waiter.Entry) {
   554  	s.ep.EventUnregister(e)
   555  }
   556  
   557  // SetSockOpt implements the linux syscall setsockopt(2) for sockets backed by
   558  // a transport.Endpoint.
   559  func (s *SocketOperations) SetSockOpt(t *kernel.Task, level int, name int, optVal []byte) *syserr.Error {
   560  	return netstack.SetSockOpt(t, s, s.ep, level, name, optVal)
   561  }
   562  
   563  // Shutdown implements the linux syscall shutdown(2) for sockets backed by
   564  // a transport.Endpoint.
   565  func (s *socketOpsCommon) Shutdown(t *kernel.Task, how int) *syserr.Error {
   566  	f, err := netstack.ConvertShutdown(how)
   567  	if err != nil {
   568  		return err
   569  	}
   570  
   571  	// Issue shutdown request.
   572  	return s.ep.Shutdown(f)
   573  }
   574  
   575  // Read implements fs.FileOperations.Read.
   576  func (s *SocketOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
   577  	if dst.NumBytes() == 0 {
   578  		return 0, nil
   579  	}
   580  	r := &EndpointReader{
   581  		Ctx:       ctx,
   582  		Endpoint:  s.ep,
   583  		NumRights: 0,
   584  		Peek:      false,
   585  		From:      nil,
   586  	}
   587  	n, err := dst.CopyOutFrom(ctx, r)
   588  	// Drop control messages.
   589  	r.Control.Release(ctx)
   590  	return n, err
   591  }
   592  
   593  // RecvMsg implements the linux syscall recvmsg(2) for sockets backed by
   594  // a transport.Endpoint.
   595  func (s *socketOpsCommon) RecvMsg(t *kernel.Task, dst usermem.IOSequence, flags int, haveDeadline bool, deadline ktime.Time, senderRequested bool, controlDataLen uint64) (n int, msgFlags int, senderAddr linux.SockAddr, senderAddrLen uint32, controlMessages socket.ControlMessages, err *syserr.Error) {
   596  	trunc := flags&linux.MSG_TRUNC != 0
   597  	peek := flags&linux.MSG_PEEK != 0
   598  	dontWait := flags&linux.MSG_DONTWAIT != 0
   599  	waitAll := flags&linux.MSG_WAITALL != 0
   600  	isPacket := s.isPacket()
   601  
   602  	// Calculate the number of FDs for which we have space and if we are
   603  	// requesting credentials.
   604  	var wantCreds bool
   605  	rightsLen := int(controlDataLen) - unix.SizeofCmsghdr
   606  	if s.Passcred() {
   607  		// Credentials take priority if they are enabled and there is space.
   608  		wantCreds = rightsLen > 0
   609  		if !wantCreds {
   610  			msgFlags |= linux.MSG_CTRUNC
   611  		}
   612  		credLen := unix.CmsgSpace(unix.SizeofUcred)
   613  		rightsLen -= credLen
   614  	}
   615  	// FDs are 32 bit (4 byte) ints.
   616  	numRights := rightsLen / 4
   617  	if numRights < 0 {
   618  		numRights = 0
   619  	}
   620  
   621  	r := EndpointReader{
   622  		Ctx:       t,
   623  		Endpoint:  s.ep,
   624  		Creds:     wantCreds,
   625  		NumRights: numRights,
   626  		Peek:      peek,
   627  	}
   628  	if senderRequested {
   629  		r.From = &tcpip.FullAddress{}
   630  	}
   631  
   632  	doRead := func() (int64, error) {
   633  		return dst.CopyOutFrom(t, &r)
   634  	}
   635  
   636  	// If MSG_TRUNC is set with a zero byte destination then we still need
   637  	// to read the message and discard it, or in the case where MSG_PEEK is
   638  	// set, leave it be. In both cases the full message length must be
   639  	// returned.
   640  	if trunc && dst.Addrs.NumBytes() == 0 {
   641  		doRead = func() (int64, error) {
   642  			err := r.Truncate()
   643  			// Always return zero for bytes read since the destination size is
   644  			// zero.
   645  			return 0, err
   646  		}
   647  
   648  	}
   649  
   650  	var total int64
   651  	if n, err := doRead(); err != syserror.ErrWouldBlock || dontWait {
   652  		var from linux.SockAddr
   653  		var fromLen uint32
   654  		if r.From != nil && len([]byte(r.From.Addr)) != 0 {
   655  			from, fromLen = socket.ConvertAddress(linux.AF_UNIX, *r.From)
   656  		}
   657  
   658  		if r.ControlTrunc {
   659  			msgFlags |= linux.MSG_CTRUNC
   660  		}
   661  
   662  		if err != nil || dontWait || !waitAll || isPacket || n >= dst.NumBytes() {
   663  			if isPacket && n < int64(r.MsgSize) {
   664  				msgFlags |= linux.MSG_TRUNC
   665  			}
   666  
   667  			if trunc {
   668  				n = int64(r.MsgSize)
   669  			}
   670  
   671  			return int(n), msgFlags, from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err)
   672  		}
   673  
   674  		// Don't overwrite any data we received.
   675  		dst = dst.DropFirst64(n)
   676  		total += n
   677  	}
   678  
   679  	// We'll have to block. Register for notification and keep trying to
   680  	// send all the data.
   681  	e, ch := waiter.NewChannelEntry(nil)
   682  	s.EventRegister(&e, waiter.ReadableEvents)
   683  	defer s.EventUnregister(&e)
   684  
   685  	for {
   686  		if n, err := doRead(); err != syserror.ErrWouldBlock {
   687  			var from linux.SockAddr
   688  			var fromLen uint32
   689  			if r.From != nil {
   690  				from, fromLen = socket.ConvertAddress(linux.AF_UNIX, *r.From)
   691  			}
   692  
   693  			if r.ControlTrunc {
   694  				msgFlags |= linux.MSG_CTRUNC
   695  			}
   696  
   697  			if trunc {
   698  				// n and r.MsgSize are the same for streams.
   699  				total += int64(r.MsgSize)
   700  			} else {
   701  				total += n
   702  			}
   703  
   704  			streamPeerClosed := s.stype == linux.SOCK_STREAM && n == 0 && err == nil
   705  			if err != nil || !waitAll || isPacket || n >= dst.NumBytes() || streamPeerClosed {
   706  				if total > 0 {
   707  					err = nil
   708  				}
   709  				if isPacket && n < int64(r.MsgSize) {
   710  					msgFlags |= linux.MSG_TRUNC
   711  				}
   712  				return int(total), msgFlags, from, fromLen, socket.ControlMessages{Unix: r.Control}, syserr.FromError(err)
   713  			}
   714  
   715  			// Don't overwrite any data we received.
   716  			dst = dst.DropFirst64(n)
   717  		}
   718  
   719  		if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
   720  			if total > 0 {
   721  				err = nil
   722  			}
   723  			if linuxerr.Equals(linuxerr.ETIMEDOUT, err) {
   724  				return int(total), msgFlags, nil, 0, socket.ControlMessages{}, syserr.ErrTryAgain
   725  			}
   726  			return int(total), msgFlags, nil, 0, socket.ControlMessages{}, syserr.FromError(err)
   727  		}
   728  	}
   729  }
   730  
   731  // State implements socket.Socket.State.
   732  func (s *socketOpsCommon) State() uint32 {
   733  	return s.ep.State()
   734  }
   735  
   736  // Type implements socket.Socket.Type.
   737  func (s *socketOpsCommon) Type() (family int, skType linux.SockType, protocol int) {
   738  	// Unix domain sockets always have a protocol of 0.
   739  	return linux.AF_UNIX, s.stype, 0
   740  }
   741  
// provider is a unix domain socket provider. It is stateless; its Socket and
// Pair methods create AF_UNIX sockets, and init registers it for linux.AF_UNIX.
type provider struct{}
   744  
   745  // Socket returns a new unix domain socket.
   746  func (*provider) Socket(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *syserr.Error) {
   747  	// Check arguments.
   748  	if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ {
   749  		return nil, syserr.ErrProtocolNotSupported
   750  	}
   751  
   752  	// Create the endpoint and socket.
   753  	var ep transport.Endpoint
   754  	switch stype {
   755  	case linux.SOCK_DGRAM, linux.SOCK_RAW:
   756  		ep = transport.NewConnectionless(t)
   757  	case linux.SOCK_SEQPACKET, linux.SOCK_STREAM:
   758  		ep = transport.NewConnectioned(t, stype, t.Kernel())
   759  	default:
   760  		return nil, syserr.ErrInvalidArgument
   761  	}
   762  
   763  	return New(t, ep, stype), nil
   764  }
   765  
   766  // Pair creates a new pair of AF_UNIX connected sockets.
   767  func (*provider) Pair(t *kernel.Task, stype linux.SockType, protocol int) (*fs.File, *fs.File, *syserr.Error) {
   768  	// Check arguments.
   769  	if protocol != 0 && protocol != linux.AF_UNIX /* PF_UNIX */ {
   770  		return nil, nil, syserr.ErrProtocolNotSupported
   771  	}
   772  
   773  	switch stype {
   774  	case linux.SOCK_STREAM, linux.SOCK_DGRAM, linux.SOCK_SEQPACKET, linux.SOCK_RAW:
   775  		// Ok
   776  	default:
   777  		return nil, nil, syserr.ErrInvalidArgument
   778  	}
   779  
   780  	// Create the endpoints and sockets.
   781  	ep1, ep2 := transport.NewPair(t, stype, t.Kernel())
   782  	s1 := New(t, ep1, stype)
   783  	s2 := New(t, ep2, stype)
   784  
   785  	return s1, s2, nil
   786  }
   787  
// init registers the unix domain socket providers for linux.AF_UNIX: provider
// through socket.RegisterProvider and providerVFS2 through
// socket.RegisterProviderVFS2.
func init() {
	socket.RegisterProvider(linux.AF_UNIX, &provider{})
	socket.RegisterProviderVFS2(linux.AF_UNIX, &providerVFS2{})
}