// Source: github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/pkg/sentry/socket/unix/transport/unix.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package transport contains the implementation of Unix endpoints.
package transport

import (
	"github.com/ttpreport/gvisor-ligolo/pkg/abi/linux"
	"github.com/ttpreport/gvisor-ligolo/pkg/context"
	"github.com/ttpreport/gvisor-ligolo/pkg/log"
	"github.com/ttpreport/gvisor-ligolo/pkg/syserr"
	"github.com/ttpreport/gvisor-ligolo/pkg/tcpip"
	"github.com/ttpreport/gvisor-ligolo/pkg/waiter"
)

const (
	// The minimum size of the send/receive buffers.
	minimumBufferSize = 4 << 10 // 4 KiB (match default in linux)

	// The default size of the send/receive buffers.
	defaultBufferSize = 208 << 10 // 208 KiB (default in linux for net.core.wmem_default)

	// The maximum permitted size for the send/receive buffers.
	maxBufferSize = 4 << 20 // 4 MiB (default in linux for net.core.wmem_max)
)

// A RightsControlMessage is a control message containing FDs.
//
// +stateify savable
type RightsControlMessage interface {
	// Clone returns a copy of the RightsControlMessage.
	Clone() RightsControlMessage

	// Release releases any resources owned by the RightsControlMessage.
	Release(ctx context.Context)
}

// A CredentialsControlMessage is a control message containing Unix credentials.
type CredentialsControlMessage interface {
	// Equals returns true iff the two messages are equal.
	Equals(CredentialsControlMessage) bool
}

// A ControlMessages represents a collection of socket control messages.
//
// +stateify savable
type ControlMessages struct {
	// Rights is a control message containing FDs.
	Rights RightsControlMessage

	// Credentials is a control message containing Unix credentials.
	Credentials CredentialsControlMessage
}

// Empty returns true iff the ControlMessages does not contain either
// credentials or rights.
func (c *ControlMessages) Empty() bool {
	return c.Rights == nil && c.Credentials == nil
}

// Clone returns a copy of the ControlMessages. Rights, when present, are
// duplicated through Rights.Clone; the Credentials interface value is copied
// as-is, so the clone refers to the same credentials object.
func (c *ControlMessages) Clone() ControlMessages {
	cm := ControlMessages{}
	if c.Rights != nil {
		cm.Rights = c.Rights.Clone()
	}
	cm.Credentials = c.Credentials
	return cm
}

// Release releases the rights, if any, and then resets c to the empty
// ControlMessages, which also drops the reference to the credentials.
func (c *ControlMessages) Release(ctx context.Context) {
	if c.Rights != nil {
		c.Rights.Release(ctx)
	}
	*c = ControlMessages{}
}

// Endpoint is the interface implemented by Unix transport protocol
// implementations that expose functionality like sendmsg, recvmsg, connect,
// etc. to Unix socket implementations.
type Endpoint interface {
	Credentialer
	waiter.Waitable

	// Close puts the endpoint in a closed state and frees all resources
	// associated with it.
	Close(ctx context.Context)

	// RecvMsg reads data and a control message from the endpoint. This method
	// does not block if there is no data pending.
	//
	// creds indicates if credential control messages are requested by the
	// caller. This is useful for determining if control messages can be
	// coalesced. creds is a hint and can be safely ignored by the
	// implementation if no coalescing is possible. It is fine to return
	// credential control messages when none were requested or to not return
	// credential control messages when they were requested.
	//
	// numRights is the number of SCM_RIGHTS FDs requested by the caller. This
	// is useful if one must allocate a buffer to receive a SCM_RIGHTS message
	// or determine if control messages can be coalesced. numRights is a hint
	// and can be safely ignored by the implementation if the number of
	// available SCM_RIGHTS FDs is known and no coalescing is possible. It is
	// fine for the returned number of SCM_RIGHTS FDs to be either higher or
	// lower than the requested number.
	//
	// If peek is true, no data should be consumed from the Endpoint. Any and
	// all data returned from a peek should be available in the next call to
	// RecvMsg.
	//
	// recvLen is the number of bytes copied into data.
	//
	// msgLen is the length of the read message consumed for datagram Endpoints.
	// msgLen is always the same as recvLen for stream Endpoints.
	//
	// CMTruncated indicates that the numRights hint was used to receive fewer
	// than the total available SCM_RIGHTS FDs. Additional truncation may be
	// required by the caller.
	//
	// If set, notify is a callback that should be called after RecvMsg
	// completes without mm.activeMu held.
	RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *Address) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, notify func(), err *syserr.Error)

	// SendMsg writes data and a control message to the endpoint's peer.
	// This method does not block if the data cannot be written.
	//
	// SendMsg does not take ownership of any of its arguments on error.
	//
	// If set, notify is a callback that should be called after SendMsg
	// completes without mm.activeMu held.
	SendMsg(context.Context, [][]byte, ControlMessages, BoundEndpoint) (int64, func(), *syserr.Error)

	// Connect connects this endpoint directly to another.
	//
	// This should be called on the client endpoint, and the (bound)
	// endpoint passed in as a parameter.
	//
	// The error codes are the same as Connect.
	Connect(ctx context.Context, server BoundEndpoint) *syserr.Error

	// Shutdown closes the read and/or write end of the endpoint connection
	// to its peer.
	Shutdown(flags tcpip.ShutdownFlags) *syserr.Error

	// Listen puts the endpoint in "listen" mode, which allows it to accept
	// new connections.
	Listen(ctx context.Context, backlog int) *syserr.Error

	// Accept returns a new endpoint if a peer has established a connection
	// to an endpoint previously set to listen mode. This method does not
	// block if no new connections are available.
	//
	// peerAddr if not nil will be populated with the address of the connected
	// peer on a successful accept.
	Accept(ctx context.Context, peerAddr *Address) (Endpoint, *syserr.Error)

	// Bind binds the endpoint to a specific local address and port.
	// Specifying a NIC is optional.
	Bind(address Address) *syserr.Error

	// Type returns the socket type, typically either SockStream, SockDgram
	// or SockSeqpacket.
	Type() linux.SockType

	// GetLocalAddress returns the address to which the endpoint is bound.
	GetLocalAddress() (Address, tcpip.Error)

	// GetRemoteAddress returns the address to which the endpoint is
	// connected.
	GetRemoteAddress() (Address, tcpip.Error)

	// SetSockOpt sets a socket option.
	SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error

	// SetSockOptInt sets a socket option for simple cases when a value has
	// the int type.
	SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error

	// GetSockOpt gets a socket option.
	GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error

	// GetSockOptInt gets a socket option for simple cases when a return
	// value has the int type.
	GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error)

	// State returns the current state of the socket, as represented by Linux in
	// procfs.
	State() uint32

	// LastError clears and returns the last error reported by the endpoint.
	LastError() tcpip.Error

	// SocketOptions returns the structure which contains all the socket
	// level options.
	SocketOptions() *tcpip.SocketOptions
}

// A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket
// option.
type Credentialer interface {
	// Passcred returns whether or not the SO_PASSCRED socket option is
	// enabled on this end.
	Passcred() bool

	// ConnectedPasscred returns whether or not the SO_PASSCRED socket option
	// is enabled on the connected end.
	ConnectedPasscred() bool
}

// A BoundEndpoint is a unix endpoint that can be connected to.
type BoundEndpoint interface {
	// BidirectionalConnect establishes a bi-directional connection between two
	// unix endpoints in an all-or-nothing manner. If an error occurs during
	// connecting, the state of neither endpoint should be modified.
	//
	// In order for an endpoint to establish such a bidirectional connection
	// with a BoundEndpoint, the endpoint calls the BidirectionalConnect method
	// on the BoundEndpoint and sends a representation of itself (the
	// ConnectingEndpoint) and a callback (returnConnect) to receive the
	// connection information (Receiver and ConnectedEndpoint) upon a
	// successful connect. The callback should only be called on a successful
	// connect.
	//
	// For a connection attempt to be successful, the ConnectingEndpoint must
	// be unconnected and not listening and the BoundEndpoint whose
	// BidirectionalConnect method is being called must be listening.
	//
	// This method will return syserr.ErrConnectionRefused on endpoints with a
	// type that isn't SockStream or SockSeqpacket.
	BidirectionalConnect(ctx context.Context, ep ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error

	// UnidirectionalConnect establishes a write-only connection to a unix
	// endpoint.
	//
	// An endpoint which calls UnidirectionalConnect and supports it itself must
	// not hold its own lock when calling UnidirectionalConnect.
	//
	// This method will return syserr.ErrConnectionRefused on a non-SockDgram
	// endpoint.
	UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error)

	// Passcred returns whether or not the SO_PASSCRED socket option is
	// enabled on this end.
	Passcred() bool

	// Release releases any resources held by the BoundEndpoint. It must be
	// called before dropping all references to a BoundEndpoint returned by a
	// function.
	Release(ctx context.Context)
}

// HostBoundEndpoint is an interface that endpoints can implement if they
// support binding, listening, and accepting connections from a bound Unix
// domain socket on the host.
type HostBoundEndpoint interface {
	// SetBoundSocketFD will be called on supporting endpoints after
	// binding a socket on the host filesystem. Implementations should
	// delegate Listen and Accept calls to the BoundSocketFD. The ownership
	// of bsFD is transferred to the endpoint.
	SetBoundSocketFD(ctx context.Context, bsFD BoundSocketFD) error

	// ResetBoundSocketFD cleans up the BoundSocketFD set by the last successful
	// SetBoundSocketFD call.
	ResetBoundSocketFD(ctx context.Context)
}

// BoundSocketFD is an interface that wraps a socket FD that was bind(2)-ed.
// It allows listening and accepting on that socket.
type BoundSocketFD interface {
	// Close closes the socket FD.
	Close(ctx context.Context)

	// NotificationFD is a host FD that can be used to notify when new clients
	// connect to the socket.
	NotificationFD() int32

	// Listen is analogous to listen(2).
	Listen(ctx context.Context, backlog int32) error

	// Accept is analogous to accept(2).
	Accept(ctx context.Context) (int, error)
}

// message represents a message passed over a Unix domain socket.
//
// +stateify savable
type message struct {
	messageEntry

	// Data is the Message payload.
	Data []byte

	// Control is auxiliary control message data that goes along with the
	// data.
	Control ControlMessages

	// Address is the bound address of the endpoint that sent the message.
	//
	// If the endpoint that sent the message is not bound, the Address is
	// the empty string.
	Address Address
}

// Length returns the number of payload bytes stored in the message, i.e.
// len(m.Data). Control messages do not count towards the length.
func (m *message) Length() int64 {
	return int64(len(m.Data))
}

// Release releases any resources held by the message by releasing its
// control messages.
func (m *message) Release(ctx context.Context) {
	m.Control.Release(ctx)
}

// Peek returns a copy of the message.
329 func (m *message) Peek() *message { 330 return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address} 331 } 332 333 // Truncate reduces the length of the message payload to n bytes. 334 // 335 // Preconditions: n <= m.Length(). 336 func (m *message) Truncate(n int64) { 337 m.Data = m.Data[:n] 338 } 339 340 // A Receiver can be used to receive Messages. 341 type Receiver interface { 342 // Recv receives a single message. This method does not block. 343 // 344 // See Endpoint.RecvMsg for documentation on shared arguments. 345 // 346 // notify indicates if RecvNotify should be called. 347 Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source Address, notify bool, err *syserr.Error) 348 349 // RecvNotify notifies the Receiver of a successful Recv. This must not be 350 // called while holding any endpoint locks. 351 RecvNotify() 352 353 // CloseRecv prevents the receiving of additional Messages. 354 // 355 // After CloseRecv is called, CloseNotify must also be called. 356 CloseRecv() 357 358 // CloseNotify notifies the Receiver of recv being closed. This must not be 359 // called while holding any endpoint locks. 360 CloseNotify() 361 362 // Readable returns if messages should be attempted to be received. This 363 // includes when read has been shutdown. 364 Readable() bool 365 366 // RecvQueuedSize returns the total amount of data currently receivable. 367 // RecvQueuedSize should return -1 if the operation isn't supported. 368 RecvQueuedSize() int64 369 370 // RecvMaxQueueSize returns maximum value for RecvQueuedSize. 371 // RecvMaxQueueSize should return -1 if the operation isn't supported. 372 RecvMaxQueueSize() int64 373 374 // Release releases any resources owned by the Receiver. It should be 375 // called before dropping all references to a Receiver. 376 Release(ctx context.Context) 377 } 378 379 // Address is a unix socket address. 
380 // 381 // +stateify savable 382 type Address struct { 383 Addr string 384 } 385 386 // queueReceiver implements Receiver for datagram sockets. 387 // 388 // +stateify savable 389 type queueReceiver struct { 390 readQueue *queue 391 } 392 393 // Recv implements Receiver.Recv. 394 func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, Address, bool, *syserr.Error) { 395 var m *message 396 var notify bool 397 var err *syserr.Error 398 if peek { 399 m, err = q.readQueue.Peek() 400 } else { 401 m, notify, err = q.readQueue.Dequeue() 402 } 403 if err != nil { 404 return 0, 0, ControlMessages{}, false, Address{}, false, err 405 } 406 src := []byte(m.Data) 407 var copied int64 408 for i := 0; i < len(data) && len(src) > 0; i++ { 409 n := copy(data[i], src) 410 copied += int64(n) 411 src = src[n:] 412 } 413 return copied, int64(len(m.Data)), m.Control, false, m.Address, notify, nil 414 } 415 416 // RecvNotify implements Receiver.RecvNotify. 417 func (q *queueReceiver) RecvNotify() { 418 q.readQueue.WriterQueue.Notify(waiter.WritableEvents) 419 } 420 421 // CloseNotify implements Receiver.CloseNotify. 422 func (q *queueReceiver) CloseNotify() { 423 q.readQueue.ReaderQueue.Notify(waiter.ReadableEvents) 424 q.readQueue.WriterQueue.Notify(waiter.WritableEvents) 425 } 426 427 // CloseRecv implements Receiver.CloseRecv. 428 func (q *queueReceiver) CloseRecv() { 429 q.readQueue.Close() 430 } 431 432 // Readable implements Receiver.Readable. 433 func (q *queueReceiver) Readable() bool { 434 return q.readQueue.IsReadable() 435 } 436 437 // RecvQueuedSize implements Receiver.RecvQueuedSize. 438 func (q *queueReceiver) RecvQueuedSize() int64 { 439 return q.readQueue.QueuedSize() 440 } 441 442 // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. 443 func (q *queueReceiver) RecvMaxQueueSize() int64 { 444 return q.readQueue.MaxQueueSize() 445 } 446 447 // Release implements Receiver.Release. 
448 func (q *queueReceiver) Release(ctx context.Context) { 449 q.readQueue.DecRef(ctx) 450 } 451 452 // streamQueueReceiver implements Receiver for stream sockets. 453 // 454 // +stateify savable 455 type streamQueueReceiver struct { 456 queueReceiver 457 458 mu streamQueueReceiverMutex `state:"nosave"` 459 buffer []byte 460 control ControlMessages 461 addr Address 462 } 463 464 func vecCopy(data [][]byte, buf []byte) (int64, [][]byte, []byte) { 465 var copied int64 466 for len(data) > 0 && len(buf) > 0 { 467 n := copy(data[0], buf) 468 copied += int64(n) 469 buf = buf[n:] 470 data[0] = data[0][n:] 471 if len(data[0]) == 0 { 472 data = data[1:] 473 } 474 } 475 return copied, data, buf 476 } 477 478 // Readable implements Receiver.Readable. 479 func (q *streamQueueReceiver) Readable() bool { 480 q.mu.Lock() 481 bl := len(q.buffer) 482 r := q.readQueue.IsReadable() 483 q.mu.Unlock() 484 // We're readable if we have data in our buffer or if the queue receiver is 485 // readable. 486 return bl > 0 || r 487 } 488 489 // RecvQueuedSize implements Receiver.RecvQueuedSize. 490 func (q *streamQueueReceiver) RecvQueuedSize() int64 { 491 q.mu.Lock() 492 bl := len(q.buffer) 493 qs := q.readQueue.QueuedSize() 494 q.mu.Unlock() 495 return int64(bl) + qs 496 } 497 498 // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. 499 func (q *streamQueueReceiver) RecvMaxQueueSize() int64 { 500 // The RecvMaxQueueSize() is the readQueue's MaxQueueSize() plus the largest 501 // message we can buffer which is also the largest message we can receive. 502 return 2 * q.readQueue.MaxQueueSize() 503 } 504 505 // Recv implements Receiver.Recv. 506 func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, Address, bool, *syserr.Error) { 507 // RightsControlMessages must be released without q.mu held. We do this in a 508 // defer to simplify control flow logic. 
509 var rightsToRelease []RightsControlMessage 510 defer func() { 511 for _, rcm := range rightsToRelease { 512 rcm.Release(ctx) 513 } 514 }() 515 516 q.mu.Lock() 517 defer q.mu.Unlock() 518 519 var notify bool 520 521 // If we have no data in the endpoint, we need to get some. 522 if len(q.buffer) == 0 { 523 // Load the next message into a buffer, even if we are peeking. Peeking 524 // won't consume the message, so it will be still available to be read 525 // the next time Recv() is called. 526 m, n, err := q.readQueue.Dequeue() 527 if err != nil { 528 return 0, 0, ControlMessages{}, false, Address{}, false, err 529 } 530 notify = n 531 q.buffer = []byte(m.Data) 532 q.control = m.Control 533 q.addr = m.Address 534 } 535 536 var copied int64 537 if peek { 538 // Don't consume control message if we are peeking. 539 c := q.control.Clone() 540 541 // Don't consume data since we are peeking. 542 copied, _, _ = vecCopy(data, q.buffer) 543 544 return copied, copied, c, false, q.addr, notify, nil 545 } 546 547 // Consume data and control message since we are not peeking. 548 copied, data, q.buffer = vecCopy(data, q.buffer) 549 550 // Save the original state of q.control. 551 c := q.control 552 553 // Remove rights from q.control and leave behind just the creds. 554 q.control.Rights = nil 555 if !wantCreds { 556 c.Credentials = nil 557 } 558 559 var cmTruncated bool 560 if c.Rights != nil && numRights == 0 { 561 rightsToRelease = append(rightsToRelease, c.Rights) 562 c.Rights = nil 563 cmTruncated = true 564 } 565 566 haveRights := c.Rights != nil 567 568 // If we have more capacity for data and haven't received any usable 569 // rights. 570 // 571 // Linux never coalesces rights control messages. 572 for !haveRights && len(data) > 0 { 573 // Get a message from the readQueue. 574 m, n, err := q.readQueue.Dequeue() 575 if err != nil { 576 // We already got some data, so ignore this error. 
This will 577 // manifest as a short read to the user, which is what Linux 578 // does. 579 break 580 } 581 notify = notify || n 582 q.buffer = []byte(m.Data) 583 q.control = m.Control 584 q.addr = m.Address 585 586 if wantCreds { 587 if (q.control.Credentials == nil) != (c.Credentials == nil) { 588 // One message has credentials, the other does not. 589 break 590 } 591 592 if q.control.Credentials != nil && c.Credentials != nil && !q.control.Credentials.Equals(c.Credentials) { 593 // Both messages have credentials, but they don't match. 594 break 595 } 596 } 597 598 if numRights != 0 && c.Rights != nil && q.control.Rights != nil { 599 // Both messages have rights. 600 break 601 } 602 603 var cpd int64 604 cpd, data, q.buffer = vecCopy(data, q.buffer) 605 copied += cpd 606 607 if cpd == 0 { 608 // data was actually full. 609 break 610 } 611 612 if q.control.Rights != nil { 613 // Consume rights. 614 if numRights == 0 { 615 cmTruncated = true 616 rightsToRelease = append(rightsToRelease, q.control.Rights) 617 } else { 618 c.Rights = q.control.Rights 619 haveRights = true 620 } 621 q.control.Rights = nil 622 } 623 } 624 return copied, copied, c, cmTruncated, q.addr, notify, nil 625 } 626 627 // Release implements Receiver.Release. 628 func (q *streamQueueReceiver) Release(ctx context.Context) { 629 q.queueReceiver.Release(ctx) 630 q.control.Release(ctx) 631 } 632 633 // A ConnectedEndpoint is an Endpoint that can be used to send Messages. 634 type ConnectedEndpoint interface { 635 // Passcred implements Endpoint.Passcred. 636 Passcred() bool 637 638 // GetLocalAddress implements Endpoint.GetLocalAddress. 639 GetLocalAddress() (Address, tcpip.Error) 640 641 // Send sends a single message. This method does not block. 642 // 643 // notify indicates if SendNotify should be called. 644 // 645 // syserr.ErrWouldBlock can be returned along with a partial write if 646 // the caller should block to send the rest of the data. 
647 Send(ctx context.Context, data [][]byte, c ControlMessages, from Address) (n int64, notify bool, err *syserr.Error) 648 649 // SendNotify notifies the ConnectedEndpoint of a successful Send. This 650 // must not be called while holding any endpoint locks. 651 SendNotify() 652 653 // CloseSend prevents the sending of additional Messages. 654 // 655 // After CloseSend is call, CloseNotify must also be called. 656 CloseSend() 657 658 // CloseNotify notifies the ConnectedEndpoint of send being closed. This 659 // must not be called while holding any endpoint locks. 660 CloseNotify() 661 662 // Writable returns if messages should be attempted to be sent. This 663 // includes when write has been shutdown. 664 Writable() bool 665 666 // EventUpdate lets the ConnectedEndpoint know that event registrations 667 // have changed. 668 EventUpdate() error 669 670 // SendQueuedSize returns the total amount of data currently queued for 671 // sending. SendQueuedSize should return -1 if the operation isn't 672 // supported. 673 SendQueuedSize() int64 674 675 // SendMaxQueueSize returns maximum value for SendQueuedSize. 676 // SendMaxQueueSize should return -1 if the operation isn't supported. 677 SendMaxQueueSize() int64 678 679 // Release releases any resources owned by the ConnectedEndpoint. It should 680 // be called before dropping all references to a ConnectedEndpoint. 681 Release(ctx context.Context) 682 683 // CloseUnread sets the fact that this end is closed with unread data to 684 // the peer socket. 685 CloseUnread() 686 687 // SetSendBufferSize is called when the endpoint's send buffer size is 688 // changed. 689 SetSendBufferSize(v int64) (newSz int64) 690 } 691 692 // +stateify savable 693 type connectedEndpoint struct { 694 // endpoint represents the subset of the Endpoint functionality needed by 695 // the connectedEndpoint. 
It is implemented by both connectionedEndpoint 696 // and connectionlessEndpoint and allows the use of types which don't 697 // fully implement Endpoint. 698 endpoint interface { 699 // Passcred implements Endpoint.Passcred. 700 Passcred() bool 701 702 // GetLocalAddress implements Endpoint.GetLocalAddress. 703 GetLocalAddress() (Address, tcpip.Error) 704 705 // Type implements Endpoint.Type. 706 Type() linux.SockType 707 } 708 709 writeQueue *queue 710 } 711 712 // Passcred implements ConnectedEndpoint.Passcred. 713 func (e *connectedEndpoint) Passcred() bool { 714 return e.endpoint.Passcred() 715 } 716 717 // GetLocalAddress implements ConnectedEndpoint.GetLocalAddress. 718 func (e *connectedEndpoint) GetLocalAddress() (Address, tcpip.Error) { 719 return e.endpoint.GetLocalAddress() 720 } 721 722 // Send implements ConnectedEndpoint.Send. 723 func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from Address) (int64, bool, *syserr.Error) { 724 discardEmpty := false 725 truncate := false 726 if e.endpoint.Type() == linux.SOCK_STREAM { 727 // Discard empty stream packets. Since stream sockets don't 728 // preserve message boundaries, sending zero bytes is a no-op. 729 // In Linux, the receiver actually uses a zero-length receive 730 // as an indication that the stream was closed. 731 discardEmpty = true 732 733 // Since stream sockets don't preserve message boundaries, we 734 // can write only as much of the message as fits in the queue. 735 truncate = true 736 } 737 738 return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate) 739 } 740 741 // SendNotify implements ConnectedEndpoint.SendNotify. 742 func (e *connectedEndpoint) SendNotify() { 743 e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents) 744 } 745 746 // CloseNotify implements ConnectedEndpoint.CloseNotify. 
747 func (e *connectedEndpoint) CloseNotify() { 748 e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents) 749 e.writeQueue.WriterQueue.Notify(waiter.WritableEvents) 750 } 751 752 // CloseSend implements ConnectedEndpoint.CloseSend. 753 func (e *connectedEndpoint) CloseSend() { 754 e.writeQueue.Close() 755 } 756 757 // Writable implements ConnectedEndpoint.Writable. 758 func (e *connectedEndpoint) Writable() bool { 759 return e.writeQueue.IsWritable() 760 } 761 762 // EventUpdate implements ConnectedEndpoint.EventUpdate. 763 func (*connectedEndpoint) EventUpdate() error { 764 return nil 765 } 766 767 // SendQueuedSize implements ConnectedEndpoint.SendQueuedSize. 768 func (e *connectedEndpoint) SendQueuedSize() int64 { 769 return e.writeQueue.QueuedSize() 770 } 771 772 // SendMaxQueueSize implements ConnectedEndpoint.SendMaxQueueSize. 773 func (e *connectedEndpoint) SendMaxQueueSize() int64 { 774 return e.writeQueue.MaxQueueSize() 775 } 776 777 // Release implements ConnectedEndpoint.Release. 778 func (e *connectedEndpoint) Release(ctx context.Context) { 779 e.writeQueue.DecRef(ctx) 780 } 781 782 // CloseUnread implements ConnectedEndpoint.CloseUnread. 783 func (e *connectedEndpoint) CloseUnread() { 784 e.writeQueue.CloseUnread() 785 } 786 787 // SetSendBufferSize implements ConnectedEndpoint.SetSendBufferSize. 788 // SetSendBufferSize sets the send buffer size for the write queue to the 789 // specified value. 790 func (e *connectedEndpoint) SetSendBufferSize(v int64) (newSz int64) { 791 e.writeQueue.SetMaxQueueSize(v) 792 return v 793 } 794 795 // baseEndpoint is an embeddable unix endpoint base used in both the connected 796 // and connectionless unix domain socket Endpoint implementations. 797 // 798 // Not to be used on its own. 799 // 800 // +stateify savable 801 type baseEndpoint struct { 802 *waiter.Queue 803 tcpip.DefaultSocketOptionsHandler 804 805 // Mutex protects the below fields. 
806 // 807 // See the lock ordering comment in package kernel/epoll regarding when 808 // this lock can safely be held. 809 endpointMutex `state:"nosave"` 810 811 // receiver allows Messages to be received. 812 receiver Receiver 813 814 // connected allows messages to be sent and state information about the 815 // connected endpoint to be read. 816 connected ConnectedEndpoint 817 818 // path is not empty if the endpoint has been bound, 819 // or may be used if the endpoint is connected. 820 path string 821 822 // ops is used to get socket level options. 823 ops tcpip.SocketOptions 824 } 825 826 // EventRegister implements waiter.Waitable.EventRegister. 827 func (e *baseEndpoint) EventRegister(we *waiter.Entry) error { 828 e.Queue.EventRegister(we) 829 e.Lock() 830 c := e.connected 831 e.Unlock() 832 if c != nil { 833 if err := c.EventUpdate(); err != nil { 834 return err 835 } 836 } 837 return nil 838 } 839 840 // EventUnregister implements waiter.Waitable.EventUnregister. 841 func (e *baseEndpoint) EventUnregister(we *waiter.Entry) { 842 e.Queue.EventUnregister(we) 843 e.Lock() 844 c := e.connected 845 e.Unlock() 846 if c != nil { 847 c.EventUpdate() 848 } 849 } 850 851 // Passcred implements Credentialer.Passcred. 852 func (e *baseEndpoint) Passcred() bool { 853 return e.SocketOptions().GetPassCred() 854 } 855 856 // ConnectedPasscred implements Credentialer.ConnectedPasscred. 857 func (e *baseEndpoint) ConnectedPasscred() bool { 858 e.Lock() 859 defer e.Unlock() 860 return e.connected != nil && e.connected.Passcred() 861 } 862 863 // Connected implements ConnectingEndpoint.Connected. 864 // 865 // Preconditions: e.mu must be held. 866 func (e *baseEndpoint) Connected() bool { 867 return e.receiver != nil && e.connected != nil 868 } 869 870 // RecvMsg reads data and a control message from the endpoint. 
871 func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *Address) (int64, int64, ControlMessages, bool, func(), *syserr.Error) { 872 e.Lock() 873 receiver := e.receiver 874 e.Unlock() 875 876 if receiver == nil { 877 return 0, 0, ControlMessages{}, false, nil, syserr.ErrNotConnected 878 } 879 880 recvLen, msgLen, cms, cmt, a, notify, err := receiver.Recv(ctx, data, creds, numRights, peek) 881 if err != nil { 882 return 0, 0, ControlMessages{}, false, nil, err 883 } 884 885 var notifyFn func() 886 if notify { 887 notifyFn = receiver.RecvNotify 888 } 889 890 if addr != nil { 891 *addr = a 892 } 893 return recvLen, msgLen, cms, cmt, notifyFn, nil 894 } 895 896 // SendMsg writes data and a control message to the endpoint's peer. 897 // This method does not block if the data cannot be written. 898 func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, func(), *syserr.Error) { 899 e.Lock() 900 if !e.Connected() { 901 e.Unlock() 902 return 0, nil, syserr.ErrNotConnected 903 } 904 if to != nil { 905 e.Unlock() 906 return 0, nil, syserr.ErrAlreadyConnected 907 } 908 909 connected := e.connected 910 n, notify, err := connected.Send(ctx, data, c, Address{Addr: e.path}) 911 e.Unlock() 912 913 var notifyFn func() 914 if notify { 915 notifyFn = connected.SendNotify 916 } 917 918 return n, notifyFn, err 919 } 920 921 // SetSockOpt sets a socket option. 
922 func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { 923 return nil 924 } 925 926 func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { 927 log.Warningf("Unsupported socket option: %d", opt) 928 return nil 929 } 930 931 func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { 932 switch opt { 933 case tcpip.ReceiveQueueSizeOption: 934 v := 0 935 e.Lock() 936 if !e.Connected() { 937 e.Unlock() 938 return -1, &tcpip.ErrNotConnected{} 939 } 940 v = int(e.receiver.RecvQueuedSize()) 941 e.Unlock() 942 if v < 0 { 943 return -1, &tcpip.ErrQueueSizeNotSupported{} 944 } 945 return v, nil 946 947 case tcpip.SendQueueSizeOption: 948 e.Lock() 949 if !e.Connected() { 950 e.Unlock() 951 return -1, &tcpip.ErrNotConnected{} 952 } 953 v := e.connected.SendQueuedSize() 954 e.Unlock() 955 if v < 0 { 956 return -1, &tcpip.ErrQueueSizeNotSupported{} 957 } 958 return int(v), nil 959 960 default: 961 log.Warningf("Unsupported socket option: %d", opt) 962 return -1, &tcpip.ErrUnknownProtocolOption{} 963 } 964 } 965 966 // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 967 func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { 968 log.Warningf("Unsupported socket option: %T", opt) 969 return &tcpip.ErrUnknownProtocolOption{} 970 } 971 972 // LastError implements Endpoint.LastError. 973 func (*baseEndpoint) LastError() tcpip.Error { 974 return nil 975 } 976 977 // SocketOptions implements Endpoint.SocketOptions. 978 func (e *baseEndpoint) SocketOptions() *tcpip.SocketOptions { 979 return &e.ops 980 } 981 982 // Shutdown closes the read and/or write end of the endpoint connection to its 983 // peer. 
984 func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error { 985 e.Lock() 986 if !e.Connected() { 987 e.Unlock() 988 return syserr.ErrNotConnected 989 } 990 991 var ( 992 r = e.receiver 993 c = e.connected 994 shutdownRead = flags&tcpip.ShutdownRead != 0 995 shutdownWrite = flags&tcpip.ShutdownWrite != 0 996 ) 997 if shutdownRead { 998 r.CloseRecv() 999 } 1000 if shutdownWrite { 1001 c.CloseSend() 1002 } 1003 e.Unlock() 1004 1005 // Don't hold e.Mutex while calling CloseNotify. 1006 if shutdownRead { 1007 r.CloseNotify() 1008 } 1009 if shutdownWrite { 1010 c.CloseNotify() 1011 } 1012 1013 return nil 1014 } 1015 1016 // GetLocalAddress returns the bound path. 1017 func (e *baseEndpoint) GetLocalAddress() (Address, tcpip.Error) { 1018 e.Lock() 1019 defer e.Unlock() 1020 return Address{Addr: e.path}, nil 1021 } 1022 1023 // GetRemoteAddress returns the local address of the connected endpoint (if 1024 // available). 1025 func (e *baseEndpoint) GetRemoteAddress() (Address, tcpip.Error) { 1026 e.Lock() 1027 c := e.connected 1028 e.Unlock() 1029 if c != nil { 1030 return c.GetLocalAddress() 1031 } 1032 return Address{}, &tcpip.ErrNotConnected{} 1033 } 1034 1035 // Release implements BoundEndpoint.Release. 1036 func (*baseEndpoint) Release(context.Context) { 1037 // Binding a baseEndpoint doesn't take a reference. 1038 } 1039 1040 // stackHandler is just a stub implementation of tcpip.StackHandler to provide 1041 // when initializing socketoptions. 1042 type stackHandler struct { 1043 } 1044 1045 // Option implements tcpip.StackHandler. 1046 func (h *stackHandler) Option(option any) tcpip.Error { 1047 panic("unimplemented") 1048 } 1049 1050 // TransportProtocolOption implements tcpip.StackHandler. 1051 func (h *stackHandler) TransportProtocolOption(proto tcpip.TransportProtocolNumber, option tcpip.GettableTransportProtocolOption) tcpip.Error { 1052 panic("unimplemented") 1053 } 1054 1055 // getSendBufferLimits implements tcpip.GetSendBufferLimits. 
1056 // 1057 // AF_UNIX sockets buffer sizes are not tied to the networking stack/namespace 1058 // in linux but are bound by net.core.(wmem|rmem)_(max|default). 1059 // 1060 // In gVisor net.core sysctls today are not exposed or if exposed are currently 1061 // tied to the networking stack in use. This makes it complicated for AF_UNIX 1062 // when we are in a new namespace w/ no networking stack. As a result for now we 1063 // define default/max values here in the unix socket implementation itself. 1064 func getSendBufferLimits(tcpip.StackHandler) tcpip.SendBufferSizeOption { 1065 return tcpip.SendBufferSizeOption{ 1066 Min: minimumBufferSize, 1067 Default: defaultBufferSize, 1068 Max: maxBufferSize, 1069 } 1070 } 1071 1072 // getReceiveBufferLimits implements tcpip.GetReceiveBufferLimits. 1073 // 1074 // We define min, max and default values for unix socket implementation. Unix 1075 // sockets do not use receive buffer. 1076 func getReceiveBufferLimits(tcpip.StackHandler) tcpip.ReceiveBufferSizeOption { 1077 return tcpip.ReceiveBufferSizeOption{ 1078 Min: minimumBufferSize, 1079 Default: defaultBufferSize, 1080 Max: maxBufferSize, 1081 } 1082 }