// Source: github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/socket/unix/transport/unix.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package transport contains the implementation of Unix endpoints.
package transport

import (
	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/log"
	"github.com/metacubex/gvisor/pkg/syserr"
	"github.com/metacubex/gvisor/pkg/tcpip"
	"github.com/metacubex/gvisor/pkg/waiter"
)

const (
	// The minimum size of the send/receive buffers.
	minimumBufferSize = 4 << 10 // 4 KiB (match default in linux)

	// The default size of the send/receive buffers.
	defaultBufferSize = 208 << 10 // 208 KiB (default in linux for net.core.wmem_default)

	// The maximum permitted size for the send/receive buffers.
	maxBufferSize = 4 << 20 // 4 MiB (default in linux for net.core.wmem_max)
)

// A RightsControlMessage is a control message containing FDs.
//
// +stateify savable
type RightsControlMessage interface {
	// Clone returns a copy of the RightsControlMessage.
	Clone() RightsControlMessage

	// Release releases any resources owned by the RightsControlMessage.
	Release(ctx context.Context)
}

// A CredentialsControlMessage is a control message containing Unix credentials.
50 type CredentialsControlMessage interface { 51 // Equals returns true iff the two messages are equal. 52 Equals(CredentialsControlMessage) bool 53 } 54 55 // A ControlMessages represents a collection of socket control messages. 56 // 57 // +stateify savable 58 type ControlMessages struct { 59 // Rights is a control message containing FDs. 60 Rights RightsControlMessage 61 62 // Credentials is a control message containing Unix credentials. 63 Credentials CredentialsControlMessage 64 } 65 66 // Empty returns true iff the ControlMessages does not contain either 67 // credentials or rights. 68 func (c *ControlMessages) Empty() bool { 69 return c.Rights == nil && c.Credentials == nil 70 } 71 72 // Clone clones both the credentials and the rights. 73 func (c *ControlMessages) Clone() ControlMessages { 74 cm := ControlMessages{} 75 if c.Rights != nil { 76 cm.Rights = c.Rights.Clone() 77 } 78 cm.Credentials = c.Credentials 79 return cm 80 } 81 82 // Release releases both the credentials and the rights. 83 func (c *ControlMessages) Release(ctx context.Context) { 84 if c.Rights != nil { 85 c.Rights.Release(ctx) 86 } 87 *c = ControlMessages{} 88 } 89 90 // RecvArgs are the arguments to Endpoint.RecvMsg and Receiver.Recv. 91 type RecvArgs struct { 92 // Creds indicates if credential control messages are requested by the 93 // caller. This is useful for determining if control messages can be 94 // coalesced. Creds is a hint and can be safely ignored by the 95 // implementation if no coalescing is possible. It is fine to return 96 // credential control messages when none were requested or to not 97 // return credential control messages when they were requested. 98 Creds bool 99 100 // NumRights is the number of SCM_RIGHTS FDs requested by the caller. 101 // This is useful if one must allocate a buffer to receive a SCM_RIGHTS 102 // message or determine if control messages can be coalesced. 
numRights 103 // is a hint and can be safely ignored by the implementation if the 104 // number of available SCM_RIGHTS FDs is known and no coalescing is 105 // possible. It is fine for the returned number of SCM_RIGHTS FDs to be 106 // either higher or lower than the requested number. 107 NumRights int 108 109 // If Peek is true, no data should be consumed from the Endpoint. Any and 110 // all data returned from a peek should be available in the next call to 111 // Recv or RecvMsg. 112 Peek bool 113 } 114 115 // RecvOutput is the output from Endpoint.RecvMsg and Receiver.Recv. 116 type RecvOutput struct { 117 // RecvLen is the number of bytes copied into RecvArgs.Data. 118 RecvLen int64 119 120 // MsgLen is the length of the read message consumed for datagram Endpoints. 121 // MsgLen is always the same as RecvLen for stream Endpoints. 122 MsgLen int64 123 124 // Source is the source address we received from. 125 Source Address 126 127 // Control is the ControlMessages read. 128 Control ControlMessages 129 130 // ControlTrunc indicates that the NumRights hint was used to receive 131 // fewer than the total available SCM_RIGHTS FDs. Additional truncation 132 // may be required by the caller. 133 ControlTrunc bool 134 135 // UnusedRights is a slice of unused RightsControlMessage which should 136 // be Release()d. 137 UnusedRights []RightsControlMessage 138 } 139 140 // Endpoint is the interface implemented by Unix transport protocol 141 // implementations that expose functionality like sendmsg, recvmsg, connect, 142 // etc. to Unix socket implementations. 143 type Endpoint interface { 144 Credentialer 145 waiter.Waitable 146 147 // Close puts the endpoint in a closed state and frees all resources 148 // associated with it. 149 Close(ctx context.Context) 150 151 // RecvMsg reads data and a control message from the endpoint. This method 152 // does not block if there is no data pending. 153 // 154 // The returned callback should be called if not nil. 
155 RecvMsg(ctx context.Context, data [][]byte, args RecvArgs) (RecvOutput, func(), *syserr.Error) 156 157 // SendMsg writes data and a control message to the endpoint's peer. 158 // This method does not block if the data cannot be written. 159 // 160 // SendMsg does not take ownership of any of its arguments on error. 161 // 162 // If set, notify is a callback that should be called after RecvMesg 163 // completes without mm.activeMu held. 164 SendMsg(context.Context, [][]byte, ControlMessages, BoundEndpoint) (int64, func(), *syserr.Error) 165 166 // Connect connects this endpoint directly to another. 167 // 168 // This should be called on the client endpoint, and the (bound) 169 // endpoint passed in as a parameter. 170 // 171 // The error codes are the same as Connect. 172 Connect(ctx context.Context, server BoundEndpoint) *syserr.Error 173 174 // Shutdown closes the read and/or write end of the endpoint connection 175 // to its peer. 176 Shutdown(flags tcpip.ShutdownFlags) *syserr.Error 177 178 // Listen puts the endpoint in "listen" mode, which allows it to accept 179 // new connections. 180 Listen(ctx context.Context, backlog int) *syserr.Error 181 182 // Accept returns a new endpoint if a peer has established a connection 183 // to an endpoint previously set to listen mode. This method does not 184 // block if no new connections are available. 185 // 186 // The returned Queue is the wait queue for the newly created endpoint. 187 // 188 // peerAddr if not nil will be populated with the address of the connected 189 // peer on a successful accept. 190 Accept(ctx context.Context, peerAddr *Address) (Endpoint, *syserr.Error) 191 192 // Bind binds the endpoint to a specific local address and port. 193 // Specifying a NIC is optional. 194 Bind(address Address) *syserr.Error 195 196 // Type return the socket type, typically either SockStream, SockDgram 197 // or SockSeqpacket. 
198 Type() linux.SockType 199 200 // GetLocalAddress returns the address to which the endpoint is bound. 201 GetLocalAddress() (Address, tcpip.Error) 202 203 // GetRemoteAddress returns the address to which the endpoint is 204 // connected. 205 GetRemoteAddress() (Address, tcpip.Error) 206 207 // SetSockOpt sets a socket option. 208 SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error 209 210 // SetSockOptInt sets a socket option for simple cases when a value has 211 // the int type. 212 SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error 213 214 // GetSockOpt gets a socket option. 215 GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error 216 217 // GetSockOptInt gets a socket option for simple cases when a return 218 // value has the int type. 219 GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) 220 221 // State returns the current state of the socket, as represented by Linux in 222 // procfs. 223 State() uint32 224 225 // LastError clears and returns the last error reported by the endpoint. 226 LastError() tcpip.Error 227 228 // SocketOptions returns the structure which contains all the socket 229 // level options. 230 SocketOptions() *tcpip.SocketOptions 231 } 232 233 // A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket 234 // option. 235 type Credentialer interface { 236 // Passcred returns whether or not the SO_PASSCRED socket option is 237 // enabled on this end. 238 Passcred() bool 239 240 // ConnectedPasscred returns whether or not the SO_PASSCRED socket option 241 // is enabled on the connected end. 242 ConnectedPasscred() bool 243 } 244 245 // A BoundEndpoint is a unix endpoint that can be connected to. 246 type BoundEndpoint interface { 247 // BidirectionalConnect establishes a bi-directional connection between two 248 // unix endpoints in an all-or-nothing manner. If an error occurs during 249 // connecting, the state of neither endpoint should be modified. 
250 // 251 // In order for an endpoint to establish such a bidirectional connection 252 // with a BoundEndpoint, the endpoint calls the BidirectionalConnect method 253 // on the BoundEndpoint and sends a representation of itself (the 254 // ConnectingEndpoint) and a callback (returnConnect) to receive the 255 // connection information (Receiver and ConnectedEndpoint) upon a 256 // successful connect. The callback should only be called on a successful 257 // connect. 258 // 259 // For a connection attempt to be successful, the ConnectingEndpoint must 260 // be unconnected and not listening and the BoundEndpoint whose 261 // BidirectionalConnect method is being called must be listening. 262 // 263 // This method will return syserr.ErrConnectionRefused on endpoints with a 264 // type that isn't SockStream or SockSeqpacket. 265 BidirectionalConnect(ctx context.Context, ep ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error 266 267 // UnidirectionalConnect establishes a write-only connection to a unix 268 // endpoint. 269 // 270 // An endpoint which calls UnidirectionalConnect and supports it itself must 271 // not hold its own lock when calling UnidirectionalConnect. 272 // 273 // This method will return syserr.ErrConnectionRefused on a non-SockDgram 274 // endpoint. 275 UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error) 276 277 // Passcred returns whether or not the SO_PASSCRED socket option is 278 // enabled on this end. 279 Passcred() bool 280 281 // Release releases any resources held by the BoundEndpoint. It must be 282 // called before dropping all references to a BoundEndpoint returned by a 283 // function. 284 Release(ctx context.Context) 285 } 286 287 // HostBoundEndpoint is an interface that endpoints can implement if they support 288 // binding listening and accepting connections from a bound Unix domain socket 289 // on the host. 
290 type HostBoundEndpoint interface { 291 // SetBoundSocketFD will be called on supporting endpoints after 292 // binding a socket on the host filesystem. Implementations should 293 // delegate Listen and Accept calls to the BoundSocketFD. The ownership 294 // of bsFD is transferred to the endpoint. 295 SetBoundSocketFD(ctx context.Context, bsFD BoundSocketFD) error 296 297 // ResetBoundSocketFD cleans up the BoundSocketFD set by the last successful 298 // SetBoundSocketFD call. 299 ResetBoundSocketFD(ctx context.Context) 300 } 301 302 // BoundSocketFD is an interface that wraps a socket FD that was bind(2)-ed. 303 // It allows to listen and accept on that socket. 304 type BoundSocketFD interface { 305 // Close closes the socket FD. 306 Close(ctx context.Context) 307 308 // NotificationFD is a host FD that can be used to notify when new clients 309 // connect to the socket. 310 NotificationFD() int32 311 312 // Listen is analogous to listen(2). 313 Listen(ctx context.Context, backlog int32) error 314 315 // Accept is analogous to accept(2). 316 Accept(ctx context.Context) (int, error) 317 } 318 319 // message represents a message passed over a Unix domain socket. 320 // 321 // +stateify savable 322 type message struct { 323 messageEntry 324 325 // Data is the Message payload. 326 Data []byte 327 328 // Control is auxiliary control message data that goes along with the 329 // data. 330 Control ControlMessages 331 332 // Address is the bound address of the endpoint that sent the message. 333 // 334 // If the endpoint that sent the message is not bound, the Address is 335 // the empty string. 336 Address Address 337 } 338 339 // Length returns number of bytes stored in the message. 340 func (m *message) Length() int64 { 341 return int64(len(m.Data)) 342 } 343 344 // Release releases any resources held by the message. 345 func (m *message) Release(ctx context.Context) { 346 m.Control.Release(ctx) 347 } 348 349 // Peek returns a copy of the message. 
350 func (m *message) Peek() *message { 351 return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address} 352 } 353 354 // Truncate reduces the length of the message payload to n bytes. 355 // 356 // Preconditions: n <= m.Length(). 357 func (m *message) Truncate(n int64) { 358 m.Data = m.Data[:n] 359 } 360 361 // A Receiver can be used to receive Messages. 362 type Receiver interface { 363 // Recv receives a single message. This method does not block. 364 // 365 // notify indicates if RecvNotify should be called. 366 Recv(ctx context.Context, data [][]byte, args RecvArgs) (out RecvOutput, notify bool, err *syserr.Error) 367 368 // RecvNotify notifies the Receiver of a successful Recv. This must not be 369 // called while holding any endpoint locks. 370 RecvNotify() 371 372 // CloseRecv prevents the receiving of additional Messages. 373 // 374 // After CloseRecv is called, CloseNotify must also be called. 375 CloseRecv() 376 377 // CloseNotify notifies the Receiver of recv being closed. This must not be 378 // called while holding any endpoint locks. 379 CloseNotify() 380 381 // IsRecvClosed returns true if reception of additional messages is closed. 382 IsRecvClosed() bool 383 384 // Readable returns if messages should be attempted to be received. This 385 // includes when read has been shutdown. 386 Readable() bool 387 388 // RecvQueuedSize returns the total amount of data currently receivable. 389 // RecvQueuedSize should return -1 if the operation isn't supported. 390 RecvQueuedSize() int64 391 392 // RecvMaxQueueSize returns maximum value for RecvQueuedSize. 393 // RecvMaxQueueSize should return -1 if the operation isn't supported. 394 RecvMaxQueueSize() int64 395 396 // Release releases any resources owned by the Receiver. It should be 397 // called before dropping all references to a Receiver. 398 Release(ctx context.Context) 399 } 400 401 // Address is a unix socket address. 
402 // 403 // +stateify savable 404 type Address struct { 405 Addr string 406 } 407 408 // queueReceiver implements Receiver for datagram sockets. 409 // 410 // +stateify savable 411 type queueReceiver struct { 412 readQueue *queue 413 } 414 415 // Recv implements Receiver.Recv. 416 func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, args RecvArgs) (RecvOutput, bool, *syserr.Error) { 417 var m *message 418 var notify bool 419 var err *syserr.Error 420 if args.Peek { 421 m, err = q.readQueue.Peek() 422 } else { 423 m, notify, err = q.readQueue.Dequeue() 424 } 425 if err != nil { 426 return RecvOutput{}, false, err 427 } 428 src := []byte(m.Data) 429 var copied int64 430 for i := 0; i < len(data) && len(src) > 0; i++ { 431 n := copy(data[i], src) 432 copied += int64(n) 433 src = src[n:] 434 } 435 out := RecvOutput{ 436 RecvLen: copied, 437 MsgLen: int64(len(m.Data)), 438 Control: m.Control, 439 Source: m.Address, 440 } 441 return out, notify, nil 442 } 443 444 // RecvNotify implements Receiver.RecvNotify. 445 func (q *queueReceiver) RecvNotify() { 446 q.readQueue.WriterQueue.Notify(waiter.WritableEvents) 447 } 448 449 // CloseNotify implements Receiver.CloseNotify. 450 func (q *queueReceiver) CloseNotify() { 451 q.readQueue.ReaderQueue.Notify(waiter.ReadableEvents) 452 q.readQueue.WriterQueue.Notify(waiter.WritableEvents) 453 } 454 455 // CloseRecv implements Receiver.CloseRecv. 456 func (q *queueReceiver) CloseRecv() { 457 q.readQueue.Close() 458 } 459 460 // IsRecvClosed implements Receiver.IsRecvClosed. 461 func (q *queueReceiver) IsRecvClosed() bool { 462 return q.readQueue.isClosed() 463 } 464 465 // Readable implements Receiver.Readable. 466 func (q *queueReceiver) Readable() bool { 467 return q.readQueue.IsReadable() 468 } 469 470 // RecvQueuedSize implements Receiver.RecvQueuedSize. 471 func (q *queueReceiver) RecvQueuedSize() int64 { 472 return q.readQueue.QueuedSize() 473 } 474 475 // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. 
476 func (q *queueReceiver) RecvMaxQueueSize() int64 { 477 return q.readQueue.MaxQueueSize() 478 } 479 480 // Release implements Receiver.Release. 481 func (q *queueReceiver) Release(ctx context.Context) { 482 q.readQueue.DecRef(ctx) 483 } 484 485 // streamQueueReceiver implements Receiver for stream sockets. 486 // 487 // +stateify savable 488 type streamQueueReceiver struct { 489 queueReceiver 490 491 mu streamQueueReceiverMutex `state:"nosave"` 492 buffer []byte 493 control ControlMessages 494 addr Address 495 } 496 497 func vecCopy(data [][]byte, buf []byte) (int64, [][]byte, []byte) { 498 var copied int64 499 for len(data) > 0 && len(buf) > 0 { 500 n := copy(data[0], buf) 501 copied += int64(n) 502 buf = buf[n:] 503 data[0] = data[0][n:] 504 if len(data[0]) == 0 { 505 data = data[1:] 506 } 507 } 508 return copied, data, buf 509 } 510 511 // Readable implements Receiver.Readable. 512 func (q *streamQueueReceiver) Readable() bool { 513 q.mu.Lock() 514 bl := len(q.buffer) 515 r := q.readQueue.IsReadable() 516 q.mu.Unlock() 517 // We're readable if we have data in our buffer or if the queue receiver is 518 // readable. 519 return bl > 0 || r 520 } 521 522 // RecvQueuedSize implements Receiver.RecvQueuedSize. 523 func (q *streamQueueReceiver) RecvQueuedSize() int64 { 524 q.mu.Lock() 525 bl := len(q.buffer) 526 qs := q.readQueue.QueuedSize() 527 q.mu.Unlock() 528 return int64(bl) + qs 529 } 530 531 // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. 532 func (q *streamQueueReceiver) RecvMaxQueueSize() int64 { 533 // The RecvMaxQueueSize() is the readQueue's MaxQueueSize() plus the largest 534 // message we can buffer which is also the largest message we can receive. 535 return 2 * q.readQueue.MaxQueueSize() 536 } 537 538 // Recv implements Receiver.Recv. 
//
// Data is served from q.buffer, which is refilled from the read queue when
// empty. A peek copies from the buffer without consuming it and returns a
// clone of the buffered control messages. A normal receive consumes the
// buffer and then keeps dequeuing further messages while data still has room
// and no usable rights have been received, stopping early when credentials
// differ between messages or nothing more can be copied. Rights that cannot
// be delivered because args.NumRights is zero are returned in UnusedRights
// with ControlTrunc set.
func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, args RecvArgs) (RecvOutput, bool, *syserr.Error) {
	q.mu.Lock()
	defer q.mu.Unlock()

	var notify bool

	// If we have no data in the endpoint, we need to get some.
	if len(q.buffer) == 0 {
		// Load the next message into a buffer, even if we are peeking. Peeking
		// won't consume the message, so it will be still available to be read
		// the next time Recv() is called.
		m, n, err := q.readQueue.Dequeue()
		if err != nil {
			return RecvOutput{}, false, err
		}
		notify = n
		q.buffer = []byte(m.Data)
		q.control = m.Control
		q.addr = m.Address
	}

	var copied int64
	if args.Peek {
		// Don't consume control message if we are peeking.
		c := q.control.Clone()

		// Don't consume data since we are peeking. The remaining-buffer
		// results of vecCopy are discarded so q.buffer is left intact.
		copied, _, _ = vecCopy(data, q.buffer)

		out := RecvOutput{
			RecvLen: copied,
			MsgLen:  copied,
			Control: c,
			Source:  q.addr,
		}
		return out, notify, nil
	}

	// Consume data and control message since we are not peeking.
	copied, data, q.buffer = vecCopy(data, q.buffer)

	// Save the original state of q.control.
	c := q.control

	// Remove rights from q.control and leave behind just the creds.
	q.control.Rights = nil
	if !args.Creds {
		c.Credentials = nil
	}

	var out RecvOutput
	if c.Rights != nil && args.NumRights == 0 {
		// We won't use these rights.
		out.UnusedRights = append(out.UnusedRights, c.Rights)
		c.Rights = nil
		out.ControlTrunc = true
	}

	haveRights := c.Rights != nil

	// If we have more capacity for data and haven't received any usable
	// rights.
	//
	// Linux never coalesces rights control messages.
	for !haveRights && len(data) > 0 {
		// Get a message from the readQueue.
		m, n, err := q.readQueue.Dequeue()
		if err != nil {
			// We already got some data, so ignore this error. This will
			// manifest as a short read to the user, which is what Linux
			// does.
			break
		}
		notify = notify || n
		// The dequeued message becomes the buffered message. If one of the
		// checks below breaks out of the loop, it stays buffered for the
		// next call.
		q.buffer = []byte(m.Data)
		q.control = m.Control
		q.addr = m.Address

		if args.Creds {
			if (q.control.Credentials == nil) != (c.Credentials == nil) {
				// One message has credentials, the other does not.
				break
			}

			if q.control.Credentials != nil && c.Credentials != nil && !q.control.Credentials.Equals(c.Credentials) {
				// Both messages have credentials, but they don't match.
				break
			}
		}

		if args.NumRights != 0 && c.Rights != nil && q.control.Rights != nil {
			// Both messages have rights.
			break
		}

		var cpd int64
		cpd, data, q.buffer = vecCopy(data, q.buffer)
		copied += cpd

		if cpd == 0 {
			// data was actually full.
			break
		}

		if q.control.Rights != nil {
			// Consume rights.
			if args.NumRights == 0 {
				out.ControlTrunc = true
				out.UnusedRights = append(out.UnusedRights, q.control.Rights)
			} else {
				c.Rights = q.control.Rights
				haveRights = true
			}
			q.control.Rights = nil
		}
	}

	// For stream receivers MsgLen and RecvLen are both the total number of
	// bytes copied.
	out.MsgLen = copied
	out.RecvLen = copied
	out.Source = q.addr
	out.Control = c
	return out, notify, nil
}

// Release implements Receiver.Release.
//
// It releases the embedded queueReceiver and then the control messages that
// are still buffered in q.control.
func (q *streamQueueReceiver) Release(ctx context.Context) {
	q.queueReceiver.Release(ctx)
	q.control.Release(ctx)
}

// A ConnectedEndpoint is an Endpoint that can be used to send Messages.
type ConnectedEndpoint interface {
	// Passcred implements Endpoint.Passcred.
	Passcred() bool

	// GetLocalAddress implements Endpoint.GetLocalAddress.
	GetLocalAddress() (Address, tcpip.Error)

	// Send sends a single message. This method does not block.
	//
	// notify indicates if SendNotify should be called.
	//
	// syserr.ErrWouldBlock can be returned along with a partial write if
	// the caller should block to send the rest of the data.
	Send(ctx context.Context, data [][]byte, c ControlMessages, from Address) (n int64, notify bool, err *syserr.Error)

	// SendNotify notifies the ConnectedEndpoint of a successful Send. This
	// must not be called while holding any endpoint locks.
	SendNotify()

	// CloseSend prevents the sending of additional Messages.
	//
	// After CloseSend is called, CloseNotify must also be called.
	CloseSend()

	// CloseNotify notifies the ConnectedEndpoint of send being closed. This
	// must not be called while holding any endpoint locks.
	CloseNotify()

	// IsSendClosed returns true if transmission of additional messages is closed.
	IsSendClosed() bool

	// Writable returns if messages should be attempted to be sent. This
	// includes when write has been shutdown.
	Writable() bool

	// EventUpdate lets the ConnectedEndpoint know that event registrations
	// have changed.
	EventUpdate() error

	// SendQueuedSize returns the total amount of data currently queued for
	// sending. SendQueuedSize should return -1 if the operation isn't
	// supported.
	SendQueuedSize() int64

	// SendMaxQueueSize returns maximum value for SendQueuedSize.
	// SendMaxQueueSize should return -1 if the operation isn't supported.
	SendMaxQueueSize() int64

	// Release releases any resources owned by the ConnectedEndpoint. It should
	// be called before dropping all references to a ConnectedEndpoint.
	Release(ctx context.Context)

	// CloseUnread sets the fact that this end is closed with unread data to
	// the peer socket.
	CloseUnread()

	// SetSendBufferSize is called when the endpoint's send buffer size is
	// changed.
	SetSendBufferSize(v int64) (newSz int64)
}

// connectedEndpoint is a ConnectedEndpoint that sends messages by enqueueing
// them on writeQueue.
//
// +stateify savable
type connectedEndpoint struct {
	// endpoint represents the subset of the Endpoint functionality needed by
	// the connectedEndpoint. It is implemented by both connectionedEndpoint
	// and connectionlessEndpoint and allows the use of types which don't
	// fully implement Endpoint.
	endpoint interface {
		// Passcred implements Endpoint.Passcred.
		Passcred() bool

		// GetLocalAddress implements Endpoint.GetLocalAddress.
		GetLocalAddress() (Address, tcpip.Error)

		// Type implements Endpoint.Type.
		Type() linux.SockType
	}

	// writeQueue is the queue that Send enqueues messages on.
	writeQueue *queue
}

// Passcred implements ConnectedEndpoint.Passcred by delegating to the
// wrapped endpoint.
func (e *connectedEndpoint) Passcred() bool {
	return e.endpoint.Passcred()
}

// GetLocalAddress implements ConnectedEndpoint.GetLocalAddress by delegating
// to the wrapped endpoint.
func (e *connectedEndpoint) GetLocalAddress() (Address, tcpip.Error) {
	return e.endpoint.GetLocalAddress()
}

// Send implements ConnectedEndpoint.Send.
//
// The message is enqueued on the write queue. For SOCK_STREAM endpoints the
// enqueue is asked to discard empty messages and to truncate messages that
// do not fit; for all other socket types neither is requested.
func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from Address) (int64, bool, *syserr.Error) {
	discardEmpty := false
	truncate := false
	if e.endpoint.Type() == linux.SOCK_STREAM {
		// Discard empty stream packets. Since stream sockets don't
		// preserve message boundaries, sending zero bytes is a no-op.
		// In Linux, the receiver actually uses a zero-length receive
		// as an indication that the stream was closed.
		discardEmpty = true

		// Since stream sockets don't preserve message boundaries, we
		// can write only as much of the message as fits in the queue.
		truncate = true
	}

	return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate)
}

// SendNotify implements ConnectedEndpoint.SendNotify.
781 func (e *connectedEndpoint) SendNotify() { 782 e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents) 783 } 784 785 // CloseNotify implements ConnectedEndpoint.CloseNotify. 786 func (e *connectedEndpoint) CloseNotify() { 787 e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents) 788 e.writeQueue.WriterQueue.Notify(waiter.WritableEvents) 789 } 790 791 // CloseSend implements ConnectedEndpoint.CloseSend. 792 func (e *connectedEndpoint) CloseSend() { 793 e.writeQueue.Close() 794 } 795 796 // IsSendClosed implements ConnectedEndpoint.IsSendClosed. 797 func (e *connectedEndpoint) IsSendClosed() bool { 798 return e.writeQueue.isClosed() 799 } 800 801 // Writable implements ConnectedEndpoint.Writable. 802 func (e *connectedEndpoint) Writable() bool { 803 return e.writeQueue.IsWritable() 804 } 805 806 // EventUpdate implements ConnectedEndpoint.EventUpdate. 807 func (*connectedEndpoint) EventUpdate() error { 808 return nil 809 } 810 811 // SendQueuedSize implements ConnectedEndpoint.SendQueuedSize. 812 func (e *connectedEndpoint) SendQueuedSize() int64 { 813 return e.writeQueue.QueuedSize() 814 } 815 816 // SendMaxQueueSize implements ConnectedEndpoint.SendMaxQueueSize. 817 func (e *connectedEndpoint) SendMaxQueueSize() int64 { 818 return e.writeQueue.MaxQueueSize() 819 } 820 821 // Release implements ConnectedEndpoint.Release. 822 func (e *connectedEndpoint) Release(ctx context.Context) { 823 e.writeQueue.DecRef(ctx) 824 } 825 826 // CloseUnread implements ConnectedEndpoint.CloseUnread. 827 func (e *connectedEndpoint) CloseUnread() { 828 e.writeQueue.CloseUnread() 829 } 830 831 // SetSendBufferSize implements ConnectedEndpoint.SetSendBufferSize. 832 // SetSendBufferSize sets the send buffer size for the write queue to the 833 // specified value. 
834 func (e *connectedEndpoint) SetSendBufferSize(v int64) (newSz int64) { 835 e.writeQueue.SetMaxQueueSize(v) 836 return v 837 } 838 839 // baseEndpoint is an embeddable unix endpoint base used in both the connected 840 // and connectionless unix domain socket Endpoint implementations. 841 // 842 // Not to be used on its own. 843 // 844 // +stateify savable 845 type baseEndpoint struct { 846 *waiter.Queue 847 tcpip.DefaultSocketOptionsHandler 848 849 // Mutex protects the below fields. 850 // 851 // See the lock ordering comment in package kernel/epoll regarding when 852 // this lock can safely be held. 853 endpointMutex `state:"nosave"` 854 855 // receiver allows Messages to be received. 856 receiver Receiver 857 858 // connected allows messages to be sent and state information about the 859 // connected endpoint to be read. 860 connected ConnectedEndpoint 861 862 // path is not empty if the endpoint has been bound, 863 // or may be used if the endpoint is connected. 864 path string 865 866 // ops is used to get socket level options. 867 ops tcpip.SocketOptions 868 } 869 870 // EventRegister implements waiter.Waitable.EventRegister. 871 func (e *baseEndpoint) EventRegister(we *waiter.Entry) error { 872 e.Queue.EventRegister(we) 873 e.Lock() 874 c := e.connected 875 e.Unlock() 876 if c != nil { 877 if err := c.EventUpdate(); err != nil { 878 return err 879 } 880 } 881 return nil 882 } 883 884 // EventUnregister implements waiter.Waitable.EventUnregister. 885 func (e *baseEndpoint) EventUnregister(we *waiter.Entry) { 886 e.Queue.EventUnregister(we) 887 e.Lock() 888 c := e.connected 889 e.Unlock() 890 if c != nil { 891 c.EventUpdate() 892 } 893 } 894 895 // Passcred implements Credentialer.Passcred. 896 func (e *baseEndpoint) Passcred() bool { 897 return e.SocketOptions().GetPassCred() 898 } 899 900 // ConnectedPasscred implements Credentialer.ConnectedPasscred. 
901 func (e *baseEndpoint) ConnectedPasscred() bool { 902 e.Lock() 903 defer e.Unlock() 904 return e.connected != nil && e.connected.Passcred() 905 } 906 907 // Connected implements ConnectingEndpoint.Connected. 908 // 909 // Preconditions: e.mu must be held. 910 func (e *baseEndpoint) Connected() bool { 911 return e.receiver != nil && e.connected != nil 912 } 913 914 // RecvMsg reads data and a control message from the endpoint. 915 func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, args RecvArgs) (RecvOutput, func(), *syserr.Error) { 916 e.Lock() 917 receiver := e.receiver 918 e.Unlock() 919 920 if receiver == nil { 921 return RecvOutput{}, nil, syserr.ErrNotConnected 922 } 923 924 out, notify, err := receiver.Recv(ctx, data, args) 925 if err != nil { 926 return RecvOutput{}, nil, err 927 } 928 929 if notify { 930 return out, receiver.RecvNotify, nil 931 } 932 933 return out, nil, nil 934 } 935 936 // SendMsg writes data and a control message to the endpoint's peer. 937 // This method does not block if the data cannot be written. 938 func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, func(), *syserr.Error) { 939 e.Lock() 940 if !e.Connected() { 941 e.Unlock() 942 return 0, nil, syserr.ErrNotConnected 943 } 944 if to != nil { 945 e.Unlock() 946 return 0, nil, syserr.ErrAlreadyConnected 947 } 948 949 connected := e.connected 950 n, notify, err := connected.Send(ctx, data, c, Address{Addr: e.path}) 951 e.Unlock() 952 953 var notifyFn func() 954 if notify { 955 notifyFn = connected.SendNotify 956 } 957 958 return n, notifyFn, err 959 } 960 961 // SetSockOpt sets a socket option. 
962 func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { 963 return nil 964 } 965 966 func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { 967 log.Warningf("Unsupported socket option: %d", opt) 968 return nil 969 } 970 971 func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { 972 switch opt { 973 case tcpip.ReceiveQueueSizeOption: 974 v := 0 975 e.Lock() 976 if !e.Connected() { 977 e.Unlock() 978 return -1, &tcpip.ErrNotConnected{} 979 } 980 v = int(e.receiver.RecvQueuedSize()) 981 e.Unlock() 982 if v < 0 { 983 return -1, &tcpip.ErrQueueSizeNotSupported{} 984 } 985 return v, nil 986 987 case tcpip.SendQueueSizeOption: 988 e.Lock() 989 if !e.Connected() { 990 e.Unlock() 991 return -1, &tcpip.ErrNotConnected{} 992 } 993 v := e.connected.SendQueuedSize() 994 e.Unlock() 995 if v < 0 { 996 return -1, &tcpip.ErrQueueSizeNotSupported{} 997 } 998 return int(v), nil 999 1000 default: 1001 log.Warningf("Unsupported socket option: %d", opt) 1002 return -1, &tcpip.ErrUnknownProtocolOption{} 1003 } 1004 } 1005 1006 // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 1007 func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { 1008 log.Warningf("Unsupported socket option: %T", opt) 1009 return &tcpip.ErrUnknownProtocolOption{} 1010 } 1011 1012 // LastError implements Endpoint.LastError. 1013 func (*baseEndpoint) LastError() tcpip.Error { 1014 return nil 1015 } 1016 1017 // SocketOptions implements Endpoint.SocketOptions. 1018 func (e *baseEndpoint) SocketOptions() *tcpip.SocketOptions { 1019 return &e.ops 1020 } 1021 1022 // Shutdown closes the read and/or write end of the endpoint connection to its 1023 // peer. 
1024 func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error { 1025 e.Lock() 1026 if !e.Connected() { 1027 e.Unlock() 1028 return syserr.ErrNotConnected 1029 } 1030 1031 var ( 1032 r = e.receiver 1033 c = e.connected 1034 shutdownRead = flags&tcpip.ShutdownRead != 0 1035 shutdownWrite = flags&tcpip.ShutdownWrite != 0 1036 ) 1037 if shutdownRead { 1038 r.CloseRecv() 1039 } 1040 if shutdownWrite { 1041 c.CloseSend() 1042 } 1043 e.Unlock() 1044 1045 // Don't hold e.Mutex while calling CloseNotify. 1046 if shutdownRead { 1047 r.CloseNotify() 1048 } 1049 if shutdownWrite { 1050 c.CloseNotify() 1051 } 1052 1053 return nil 1054 } 1055 1056 // GetLocalAddress returns the bound path. 1057 func (e *baseEndpoint) GetLocalAddress() (Address, tcpip.Error) { 1058 e.Lock() 1059 defer e.Unlock() 1060 return Address{Addr: e.path}, nil 1061 } 1062 1063 // GetRemoteAddress returns the local address of the connected endpoint (if 1064 // available). 1065 func (e *baseEndpoint) GetRemoteAddress() (Address, tcpip.Error) { 1066 e.Lock() 1067 c := e.connected 1068 e.Unlock() 1069 if c != nil { 1070 return c.GetLocalAddress() 1071 } 1072 return Address{}, &tcpip.ErrNotConnected{} 1073 } 1074 1075 // Release implements BoundEndpoint.Release. 1076 func (*baseEndpoint) Release(context.Context) { 1077 // Binding a baseEndpoint doesn't take a reference. 1078 } 1079 1080 // stackHandler is just a stub implementation of tcpip.StackHandler to provide 1081 // when initializing socketoptions. 1082 type stackHandler struct { 1083 } 1084 1085 // Option implements tcpip.StackHandler. 1086 func (h *stackHandler) Option(option any) tcpip.Error { 1087 panic("unimplemented") 1088 } 1089 1090 // TransportProtocolOption implements tcpip.StackHandler. 
1091 func (h *stackHandler) TransportProtocolOption(proto tcpip.TransportProtocolNumber, option tcpip.GettableTransportProtocolOption) tcpip.Error { 1092 panic("unimplemented") 1093 } 1094 1095 // getSendBufferLimits implements tcpip.GetSendBufferLimits. 1096 // 1097 // AF_UNIX sockets buffer sizes are not tied to the networking stack/namespace 1098 // in linux but are bound by net.core.(wmem|rmem)_(max|default). 1099 // 1100 // In gVisor net.core sysctls today are not exposed or if exposed are currently 1101 // tied to the networking stack in use. This makes it complicated for AF_UNIX 1102 // when we are in a new namespace w/ no networking stack. As a result for now we 1103 // define default/max values here in the unix socket implementation itself. 1104 func getSendBufferLimits(tcpip.StackHandler) tcpip.SendBufferSizeOption { 1105 return tcpip.SendBufferSizeOption{ 1106 Min: minimumBufferSize, 1107 Default: defaultBufferSize, 1108 Max: maxBufferSize, 1109 } 1110 } 1111 1112 // getReceiveBufferLimits implements tcpip.GetReceiveBufferLimits. 1113 // 1114 // We define min, max and default values for unix socket implementation. Unix 1115 // sockets do not use receive buffer. 1116 func getReceiveBufferLimits(tcpip.StackHandler) tcpip.ReceiveBufferSizeOption { 1117 return tcpip.ReceiveBufferSizeOption{ 1118 Min: minimumBufferSize, 1119 Default: defaultBufferSize, 1120 Max: maxBufferSize, 1121 } 1122 }