// Origin: github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/socket/unix/transport/unix.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package transport contains the implementation of Unix endpoints.
package transport

import (
	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/log"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/syserr"
	"github.com/SagerNet/gvisor/pkg/tcpip"
	"github.com/SagerNet/gvisor/pkg/tcpip/buffer"
	"github.com/SagerNet/gvisor/pkg/waiter"
)

const (
	// The minimum size of the send/receive buffers.
	minimumBufferSize = 4 << 10 // 4 KiB (match default in linux)

	// The default size of the send/receive buffers.
	defaultBufferSize = 208 << 10 // 208 KiB (default in linux for net.core.wmem_default)

	// The maximum permitted size for the send/receive buffers.
	maxBufferSize = 4 << 20 // 4 MiB (default in linux for net.core.wmem_max)
)

// A RightsControlMessage is a control message containing FDs.
//
// +stateify savable
type RightsControlMessage interface {
	// Clone returns a copy of the RightsControlMessage.
	Clone() RightsControlMessage

	// Release releases any resources owned by the RightsControlMessage.
	Release(ctx context.Context)
}

// A CredentialsControlMessage is a control message containing Unix credentials.
type CredentialsControlMessage interface {
	// Equals returns true iff the two messages are equal.
	Equals(CredentialsControlMessage) bool
}

// A ControlMessages represents a collection of socket control messages.
//
// +stateify savable
type ControlMessages struct {
	// Rights is a control message containing FDs.
	Rights RightsControlMessage

	// Credentials is a control message containing Unix credentials.
	Credentials CredentialsControlMessage
}

// Empty returns true iff the ControlMessages does not contain either
// credentials or rights.
func (c *ControlMessages) Empty() bool {
	return c.Rights == nil && c.Credentials == nil
}

// Clone clones both the credentials and the rights.
//
// Rights is duplicated through its own Clone method when present. Credentials
// is copied by plain assignment, so the returned value refers to the same
// CredentialsControlMessage as c.
func (c *ControlMessages) Clone() ControlMessages {
	cm := ControlMessages{}
	if c.Rights != nil {
		cm.Rights = c.Rights.Clone()
	}
	cm.Credentials = c.Credentials
	return cm
}

// Release releases both the credentials and the rights.
//
// Only Rights has a Release method to call; afterwards c is reset to the zero
// ControlMessages, which drops the Credentials reference as well.
func (c *ControlMessages) Release(ctx context.Context) {
	if c.Rights != nil {
		c.Rights.Release(ctx)
	}
	*c = ControlMessages{}
}

// Endpoint is the interface implemented by Unix transport protocol
// implementations that expose functionality like sendmsg, recvmsg, connect,
// etc. to Unix socket implementations.
type Endpoint interface {
	Credentialer
	waiter.Waitable

	// Close puts the endpoint in a closed state and frees all resources
	// associated with it.
	Close(ctx context.Context)

	// RecvMsg reads data and a control message from the endpoint. This method
	// does not block if there is no data pending.
	//
	// creds indicates if credential control messages are requested by the
	// caller. This is useful for determining if control messages can be
	// coalesced. creds is a hint and can be safely ignored by the
	// implementation if no coalescing is possible. It is fine to return
	// credential control messages when none were requested or to not return
	// credential control messages when they were requested.
	//
	// numRights is the number of SCM_RIGHTS FDs requested by the caller. This
	// is useful if one must allocate a buffer to receive a SCM_RIGHTS message
	// or determine if control messages can be coalesced. numRights is a hint
	// and can be safely ignored by the implementation if the number of
	// available SCM_RIGHTS FDs is known and no coalescing is possible. It is
	// fine for the returned number of SCM_RIGHTS FDs to be either higher or
	// lower than the requested number.
	//
	// If peek is true, no data should be consumed from the Endpoint. Any and
	// all data returned from a peek should be available in the next call to
	// RecvMsg.
	//
	// recvLen is the number of bytes copied into data.
	//
	// msgLen is the length of the read message consumed for datagram Endpoints.
	// msgLen is always the same as recvLen for stream Endpoints.
	//
	// CMTruncated indicates that the numRights hint was used to receive fewer
	// than the total available SCM_RIGHTS FDs. Additional truncation may be
	// required by the caller.
	RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *tcpip.FullAddress) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, err *syserr.Error)

	// SendMsg writes data and a control message to the endpoint's peer.
	// This method does not block if the data cannot be written.
	//
	// SendMsg does not take ownership of any of its arguments on error.
	SendMsg(context.Context, [][]byte, ControlMessages, BoundEndpoint) (int64, *syserr.Error)

	// Connect connects this endpoint directly to another.
	//
	// This should be called on the client endpoint, and the (bound)
	// endpoint passed in as a parameter.
	//
	// The error codes are the same as Connect.
	Connect(ctx context.Context, server BoundEndpoint) *syserr.Error

	// Shutdown closes the read and/or write end of the endpoint connection
	// to its peer.
	Shutdown(flags tcpip.ShutdownFlags) *syserr.Error

	// Listen puts the endpoint in "listen" mode, which allows it to accept
	// new connections.
	Listen(backlog int) *syserr.Error

	// Accept returns a new endpoint if a peer has established a connection
	// to an endpoint previously set to listen mode. This method does not
	// block if no new connections are available.
	//
	// The returned Queue is the wait queue for the newly created endpoint.
	// NOTE(review): the signature below returns only an Endpoint and an
	// error, so the sentence above may be stale - confirm.
	//
	// peerAddr if not nil will be populated with the address of the connected
	// peer on a successful accept.
	Accept(peerAddr *tcpip.FullAddress) (Endpoint, *syserr.Error)

	// Bind binds the endpoint to a specific local address and port.
	// Specifying a NIC is optional.
	//
	// An optional commit function will be executed atomically with respect
	// to binding the endpoint. If this returns an error, the bind will not
	// occur and the error will be propagated back to the caller.
	Bind(address tcpip.FullAddress, commit func() *syserr.Error) *syserr.Error

	// Type returns the socket type, typically either SockStream, SockDgram
	// or SockSeqpacket.
	Type() linux.SockType

	// GetLocalAddress returns the address to which the endpoint is bound.
	GetLocalAddress() (tcpip.FullAddress, tcpip.Error)

	// GetRemoteAddress returns the address to which the endpoint is
	// connected.
	GetRemoteAddress() (tcpip.FullAddress, tcpip.Error)

	// SetSockOpt sets a socket option.
	SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error

	// SetSockOptInt sets a socket option for simple cases when a value has
	// the int type.
	SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error

	// GetSockOpt gets a socket option.
	GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error

	// GetSockOptInt gets a socket option for simple cases when a return
	// value has the int type.
	GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error)

	// State returns the current state of the socket, as represented by Linux in
	// procfs.
	State() uint32

	// LastError clears and returns the last error reported by the endpoint.
	LastError() tcpip.Error

	// SocketOptions returns the structure which contains all the socket
	// level options.
	SocketOptions() *tcpip.SocketOptions
}

// A Credentialer is a socket or endpoint that supports the SO_PASSCRED socket
// option.
type Credentialer interface {
	// Passcred returns whether or not the SO_PASSCRED socket option is
	// enabled on this end.
	Passcred() bool

	// ConnectedPasscred returns whether or not the SO_PASSCRED socket option
	// is enabled on the connected end.
	ConnectedPasscred() bool
}

// A BoundEndpoint is a unix endpoint that can be connected to.
type BoundEndpoint interface {
	// BidirectionalConnect establishes a bi-directional connection between two
	// unix endpoints in an all-or-nothing manner. If an error occurs during
	// connecting, the state of neither endpoint should be modified.
	//
	// In order for an endpoint to establish such a bidirectional connection
	// with a BoundEndpoint, the endpoint calls the BidirectionalConnect method
	// on the BoundEndpoint and sends a representation of itself (the
	// ConnectingEndpoint) and a callback (returnConnect) to receive the
	// connection information (Receiver and ConnectedEndpoint) upon a
	// successful connect. The callback should only be called on a successful
	// connect.
	//
	// For a connection attempt to be successful, the ConnectingEndpoint must
	// be unconnected and not listening and the BoundEndpoint whose
	// BidirectionalConnect method is being called must be listening.
	//
	// This method will return syserr.ErrConnectionRefused on endpoints with a
	// type that isn't SockStream or SockSeqpacket.
	BidirectionalConnect(ctx context.Context, ep ConnectingEndpoint, returnConnect func(Receiver, ConnectedEndpoint)) *syserr.Error

	// UnidirectionalConnect establishes a write-only connection to a unix
	// endpoint.
	//
	// An endpoint which calls UnidirectionalConnect and supports it itself must
	// not hold its own lock when calling UnidirectionalConnect.
	//
	// This method will return syserr.ErrConnectionRefused on a non-SockDgram
	// endpoint.
	UnidirectionalConnect(ctx context.Context) (ConnectedEndpoint, *syserr.Error)

	// Passcred returns whether or not the SO_PASSCRED socket option is
	// enabled on this end.
	Passcred() bool

	// Release releases any resources held by the BoundEndpoint. It must be
	// called before dropping all references to a BoundEndpoint returned by a
	// function.
	Release(ctx context.Context)
}

// message represents a message passed over a Unix domain socket.
//
// +stateify savable
type message struct {
	messageEntry

	// Data is the Message payload.
	Data buffer.View

	// Control is auxiliary control message data that goes along with the
	// data.
	Control ControlMessages

	// Address is the bound address of the endpoint that sent the message.
	//
	// If the endpoint that sent the message is not bound, the Address is
	// the empty string.
	Address tcpip.FullAddress
}

// Length returns number of bytes stored in the message.
func (m *message) Length() int64 {
	return int64(len(m.Data))
}

// Release releases any resources held by the message.
func (m *message) Release(ctx context.Context) {
	m.Control.Release(ctx)
}

// Peek returns a copy of the message.
//
// The copy carries cloned control messages; Data and Address are assigned
// directly from m, and the embedded messageEntry is left at its zero value.
func (m *message) Peek() *message {
	return &message{Data: m.Data, Control: m.Control.Clone(), Address: m.Address}
}

// Truncate reduces the length of the message payload to n bytes.
//
// Preconditions: n <= m.Length().
func (m *message) Truncate(n int64) {
	m.Data.CapLength(int(n))
}

// A Receiver can be used to receive Messages.
type Receiver interface {
	// Recv receives a single message. This method does not block.
	//
	// See Endpoint.RecvMsg for documentation on shared arguments.
	//
	// notify indicates if RecvNotify should be called.
	Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (recvLen, msgLen int64, cm ControlMessages, CMTruncated bool, source tcpip.FullAddress, notify bool, err *syserr.Error)

	// RecvNotify notifies the Receiver of a successful Recv. This must not be
	// called while holding any endpoint locks.
	RecvNotify()

	// CloseRecv prevents the receiving of additional Messages.
	//
	// After CloseRecv is called, CloseNotify must also be called.
	CloseRecv()

	// CloseNotify notifies the Receiver of recv being closed. This must not be
	// called while holding any endpoint locks.
	CloseNotify()

	// Readable returns if messages should be attempted to be received. This
	// includes when read has been shutdown.
	Readable() bool

	// RecvQueuedSize returns the total amount of data currently receivable.
	// RecvQueuedSize should return -1 if the operation isn't supported.
	RecvQueuedSize() int64

	// RecvMaxQueueSize returns maximum value for RecvQueuedSize.
	// RecvMaxQueueSize should return -1 if the operation isn't supported.
	RecvMaxQueueSize() int64

	// Release releases any resources owned by the Receiver. It should be
	// called before dropping all references to a Receiver.
	Release(ctx context.Context)
}

// queueReceiver implements Receiver for datagram sockets.
//
// +stateify savable
type queueReceiver struct {
	readQueue *queue
}

// Recv implements Receiver.Recv.
//
// Exactly one message is taken from readQueue: via Peek when peek is true,
// via Dequeue otherwise. Its payload is copied into the slices of data in
// order until either the payload or data runs out. The first result is the
// number of bytes copied and the second is the full payload length, so the
// two differ when data is too small for the message. The creds and numRights
// hints are not consulted here, and the CMTruncated result is always false.
// notify is only ever set by the Dequeue path.
func (q *queueReceiver) Recv(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
	var m *message
	var notify bool
	var err *syserr.Error
	if peek {
		m, err = q.readQueue.Peek()
	} else {
		m, notify, err = q.readQueue.Dequeue()
	}
	if err != nil {
		return 0, 0, ControlMessages{}, false, tcpip.FullAddress{}, false, err
	}
	// src is the not-yet-copied tail of the payload.
	src := []byte(m.Data)
	var copied int64
	for i := 0; i < len(data) && len(src) > 0; i++ {
		n := copy(data[i], src)
		copied += int64(n)
		src = src[n:]
	}
	return copied, int64(len(m.Data)), m.Control, false, m.Address, notify, nil
}

// RecvNotify implements Receiver.RecvNotify.
func (q *queueReceiver) RecvNotify() {
	q.readQueue.WriterQueue.Notify(waiter.WritableEvents)
}

// CloseNotify implements Receiver.CloseNotify.
383 func (q *queueReceiver) CloseNotify() { 384 q.readQueue.ReaderQueue.Notify(waiter.ReadableEvents) 385 q.readQueue.WriterQueue.Notify(waiter.WritableEvents) 386 } 387 388 // CloseRecv implements Receiver.CloseRecv. 389 func (q *queueReceiver) CloseRecv() { 390 q.readQueue.Close() 391 } 392 393 // Readable implements Receiver.Readable. 394 func (q *queueReceiver) Readable() bool { 395 return q.readQueue.IsReadable() 396 } 397 398 // RecvQueuedSize implements Receiver.RecvQueuedSize. 399 func (q *queueReceiver) RecvQueuedSize() int64 { 400 return q.readQueue.QueuedSize() 401 } 402 403 // RecvMaxQueueSize implements Receiver.RecvMaxQueueSize. 404 func (q *queueReceiver) RecvMaxQueueSize() int64 { 405 return q.readQueue.MaxQueueSize() 406 } 407 408 // Release implements Receiver.Release. 409 func (q *queueReceiver) Release(ctx context.Context) { 410 q.readQueue.DecRef(ctx) 411 } 412 413 // streamQueueReceiver implements Receiver for stream sockets. 414 // 415 // +stateify savable 416 type streamQueueReceiver struct { 417 queueReceiver 418 419 mu sync.Mutex `state:"nosave"` 420 buffer []byte 421 control ControlMessages 422 addr tcpip.FullAddress 423 } 424 425 func vecCopy(data [][]byte, buf []byte) (int64, [][]byte, []byte) { 426 var copied int64 427 for len(data) > 0 && len(buf) > 0 { 428 n := copy(data[0], buf) 429 copied += int64(n) 430 buf = buf[n:] 431 data[0] = data[0][n:] 432 if len(data[0]) == 0 { 433 data = data[1:] 434 } 435 } 436 return copied, data, buf 437 } 438 439 // Readable implements Receiver.Readable. 440 func (q *streamQueueReceiver) Readable() bool { 441 q.mu.Lock() 442 bl := len(q.buffer) 443 r := q.readQueue.IsReadable() 444 q.mu.Unlock() 445 // We're readable if we have data in our buffer or if the queue receiver is 446 // readable. 447 return bl > 0 || r 448 } 449 450 // RecvQueuedSize implements Receiver.RecvQueuedSize. 
func (q *streamQueueReceiver) RecvQueuedSize() int64 {
	// Sample the leftover buffer and the queue under mu; the result is the
	// sum of both.
	q.mu.Lock()
	bl := len(q.buffer)
	qs := q.readQueue.QueuedSize()
	q.mu.Unlock()
	return int64(bl) + qs
}

// RecvMaxQueueSize implements Receiver.RecvMaxQueueSize.
func (q *streamQueueReceiver) RecvMaxQueueSize() int64 {
	// The RecvMaxQueueSize() is the readQueue's MaxQueueSize() plus the largest
	// message we can buffer which is also the largest message we can receive.
	return 2 * q.readQueue.MaxQueueSize()
}

// Recv implements Receiver.Recv.
//
// Data is served from q.buffer, which is refilled from readQueue when empty.
// When peeking, nothing is consumed from q.buffer and a clone of the pending
// control messages is returned. Otherwise the buffered bytes are consumed and,
// while data still has room and no rights have been picked up, further
// messages are dequeued and appended to the same read as long as their
// credentials are compatible (when wantCreds is set).
//
// Both length results are the number of bytes copied. The CMTruncated result
// is set when rights were present but numRights was 0, in which case those
// rights are released instead of returned.
func (q *streamQueueReceiver) Recv(ctx context.Context, data [][]byte, wantCreds bool, numRights int, peek bool) (int64, int64, ControlMessages, bool, tcpip.FullAddress, bool, *syserr.Error) {
	q.mu.Lock()
	defer q.mu.Unlock()

	var notify bool

	// If we have no data in the endpoint, we need to get some.
	if len(q.buffer) == 0 {
		// Load the next message into a buffer, even if we are peeking. Peeking
		// won't consume the message, so it will be still available to be read
		// the next time Recv() is called.
		m, n, err := q.readQueue.Dequeue()
		if err != nil {
			return 0, 0, ControlMessages{}, false, tcpip.FullAddress{}, false, err
		}
		notify = n
		q.buffer = []byte(m.Data)
		q.control = m.Control
		q.addr = m.Address
	}

	var copied int64
	if peek {
		// Don't consume control message if we are peeking.
		c := q.control.Clone()

		// Don't consume data since we are peeking.
		copied, _, _ = vecCopy(data, q.buffer)

		return copied, copied, c, false, q.addr, notify, nil
	}

	// Consume data and control message since we are not peeking.
	copied, data, q.buffer = vecCopy(data, q.buffer)

	// Save the original state of q.control.
	c := q.control

	// Remove rights from q.control and leave behind just the creds.
	q.control.Rights = nil
	if !wantCreds {
		c.Credentials = nil
	}

	// The caller asked for no rights: release what arrived and report the
	// truncation.
	var cmTruncated bool
	if c.Rights != nil && numRights == 0 {
		c.Rights.Release(ctx)
		c.Rights = nil
		cmTruncated = true
	}

	haveRights := c.Rights != nil

	// If we have more capacity for data and haven't received any usable
	// rights.
	//
	// Linux never coalesces rights control messages.
	for !haveRights && len(data) > 0 {
		// Get a message from the readQueue.
		m, n, err := q.readQueue.Dequeue()
		if err != nil {
			// We already got some data, so ignore this error. This will
			// manifest as a short read to the user, which is what Linux
			// does.
			break
		}
		notify = notify || n
		// The new message becomes the pending one. If we break out below
		// before copying, it stays in q.buffer/q.control for the next Recv.
		q.buffer = []byte(m.Data)
		q.control = m.Control
		q.addr = m.Address

		if wantCreds {
			if (q.control.Credentials == nil) != (c.Credentials == nil) {
				// One message has credentials, the other does not.
				break
			}

			if q.control.Credentials != nil && c.Credentials != nil && !q.control.Credentials.Equals(c.Credentials) {
				// Both messages have credentials, but they don't match.
				break
			}
		}

		if numRights != 0 && c.Rights != nil && q.control.Rights != nil {
			// Both messages have rights.
			break
		}

		var cpd int64
		cpd, data, q.buffer = vecCopy(data, q.buffer)
		copied += cpd

		if cpd == 0 {
			// data was actually full.
			break
		}

		if q.control.Rights != nil {
			// Consume rights.
			if numRights == 0 {
				cmTruncated = true
				q.control.Rights.Release(ctx)
			} else {
				c.Rights = q.control.Rights
				haveRights = true
			}
			q.control.Rights = nil
		}
	}
	return copied, copied, c, cmTruncated, q.addr, notify, nil
}

// Release implements Receiver.Release.
func (q *streamQueueReceiver) Release(ctx context.Context) {
	// Drop the queue reference, then release any control messages still
	// pending in the receiver.
	q.queueReceiver.Release(ctx)
	q.control.Release(ctx)
}

// A ConnectedEndpoint is an Endpoint that can be used to send Messages.
type ConnectedEndpoint interface {
	// Passcred implements Endpoint.Passcred.
	Passcred() bool

	// GetLocalAddress implements Endpoint.GetLocalAddress.
	GetLocalAddress() (tcpip.FullAddress, tcpip.Error)

	// Send sends a single message. This method does not block.
	//
	// notify indicates if SendNotify should be called.
	//
	// syserr.ErrWouldBlock can be returned along with a partial write if
	// the caller should block to send the rest of the data.
	Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (n int64, notify bool, err *syserr.Error)

	// SendNotify notifies the ConnectedEndpoint of a successful Send. This
	// must not be called while holding any endpoint locks.
	SendNotify()

	// CloseSend prevents the sending of additional Messages.
	//
	// After CloseSend is called, CloseNotify must also be called.
	CloseSend()

	// CloseNotify notifies the ConnectedEndpoint of send being closed. This
	// must not be called while holding any endpoint locks.
	CloseNotify()

	// Writable returns if messages should be attempted to be sent. This
	// includes when write has been shutdown.
	Writable() bool

	// EventUpdate lets the ConnectedEndpoint know that event registrations
	// have changed.
	EventUpdate()

	// SendQueuedSize returns the total amount of data currently queued for
	// sending. SendQueuedSize should return -1 if the operation isn't
	// supported.
	SendQueuedSize() int64

	// SendMaxQueueSize returns maximum value for SendQueuedSize.
	// SendMaxQueueSize should return -1 if the operation isn't supported.
	SendMaxQueueSize() int64

	// Release releases any resources owned by the ConnectedEndpoint. It should
	// be called before dropping all references to a ConnectedEndpoint.
	Release(ctx context.Context)

	// CloseUnread sets the fact that this end is closed with unread data to
	// the peer socket.
	CloseUnread()

	// SetSendBufferSize is called when the endpoint's send buffer size is
	// changed.
	SetSendBufferSize(v int64) (newSz int64)
}

// connectedEndpoint implements ConnectedEndpoint by enqueueing outgoing
// messages on writeQueue and delegating identity queries to endpoint.
//
// +stateify savable
type connectedEndpoint struct {
	// endpoint represents the subset of the Endpoint functionality needed by
	// the connectedEndpoint. It is implemented by both connectionedEndpoint
	// and connectionlessEndpoint and allows the use of types which don't
	// fully implement Endpoint.
	endpoint interface {
		// Passcred implements Endpoint.Passcred.
		Passcred() bool

		// GetLocalAddress implements Endpoint.GetLocalAddress.
		GetLocalAddress() (tcpip.FullAddress, tcpip.Error)

		// Type implements Endpoint.Type.
		Type() linux.SockType
	}

	// writeQueue is the queue that Send enqueues messages on.
	writeQueue *queue
}

// Passcred implements ConnectedEndpoint.Passcred.
func (e *connectedEndpoint) Passcred() bool {
	return e.endpoint.Passcred()
}

// GetLocalAddress implements ConnectedEndpoint.GetLocalAddress.
func (e *connectedEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) {
	return e.endpoint.GetLocalAddress()
}

// Send implements ConnectedEndpoint.Send.
//
// The message is enqueued on writeQueue. For SOCK_STREAM endpoints the
// enqueue is asked to discard empty messages and to truncate messages that do
// not fit; for every other socket type both flags are false.
func (e *connectedEndpoint) Send(ctx context.Context, data [][]byte, c ControlMessages, from tcpip.FullAddress) (int64, bool, *syserr.Error) {
	discardEmpty := false
	truncate := false
	if e.endpoint.Type() == linux.SOCK_STREAM {
		// Discard empty stream packets. Since stream sockets don't
		// preserve message boundaries, sending zero bytes is a no-op.
		// In Linux, the receiver actually uses a zero-length receive
		// as an indication that the stream was closed.
		discardEmpty = true

		// Since stream sockets don't preserve message boundaries, we
		// can write only as much of the message as fits in the queue.
		truncate = true
	}

	return e.writeQueue.Enqueue(ctx, data, c, from, discardEmpty, truncate)
}

// SendNotify implements ConnectedEndpoint.SendNotify.
func (e *connectedEndpoint) SendNotify() {
	e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents)
}

// CloseNotify implements ConnectedEndpoint.CloseNotify.
func (e *connectedEndpoint) CloseNotify() {
	e.writeQueue.ReaderQueue.Notify(waiter.ReadableEvents)
	e.writeQueue.WriterQueue.Notify(waiter.WritableEvents)
}

// CloseSend implements ConnectedEndpoint.CloseSend.
func (e *connectedEndpoint) CloseSend() {
	e.writeQueue.Close()
}

// Writable implements ConnectedEndpoint.Writable.
func (e *connectedEndpoint) Writable() bool {
	return e.writeQueue.IsWritable()
}

// EventUpdate implements ConnectedEndpoint.EventUpdate.
//
// It is a no-op for this implementation.
func (*connectedEndpoint) EventUpdate() {}

// SendQueuedSize implements ConnectedEndpoint.SendQueuedSize.
func (e *connectedEndpoint) SendQueuedSize() int64 {
	return e.writeQueue.QueuedSize()
}

// SendMaxQueueSize implements ConnectedEndpoint.SendMaxQueueSize.
func (e *connectedEndpoint) SendMaxQueueSize() int64 {
	return e.writeQueue.MaxQueueSize()
}

// Release implements ConnectedEndpoint.Release.
func (e *connectedEndpoint) Release(ctx context.Context) {
	e.writeQueue.DecRef(ctx)
}

// CloseUnread implements ConnectedEndpoint.CloseUnread.
func (e *connectedEndpoint) CloseUnread() {
	e.writeQueue.CloseUnread()
}

// SetSendBufferSize implements ConnectedEndpoint.SetSendBufferSize.
738 // SetSendBufferSize sets the send buffer size for the write queue to the 739 // specified value. 740 func (e *connectedEndpoint) SetSendBufferSize(v int64) (newSz int64) { 741 e.writeQueue.SetMaxQueueSize(v) 742 return v 743 } 744 745 // baseEndpoint is an embeddable unix endpoint base used in both the connected 746 // and connectionless unix domain socket Endpoint implementations. 747 // 748 // Not to be used on its own. 749 // 750 // +stateify savable 751 type baseEndpoint struct { 752 *waiter.Queue 753 tcpip.DefaultSocketOptionsHandler 754 755 // Mutex protects the below fields. 756 // 757 // See the lock ordering comment in package kernel/epoll regarding when 758 // this lock can safely be held. 759 sync.Mutex `state:"nosave"` 760 761 // receiver allows Messages to be received. 762 receiver Receiver 763 764 // connected allows messages to be sent and state information about the 765 // connected endpoint to be read. 766 connected ConnectedEndpoint 767 768 // path is not empty if the endpoint has been bound, 769 // or may be used if the endpoint is connected. 770 path string 771 772 // ops is used to get socket level options. 773 ops tcpip.SocketOptions 774 } 775 776 // EventRegister implements waiter.Waitable.EventRegister. 777 func (e *baseEndpoint) EventRegister(we *waiter.Entry, mask waiter.EventMask) { 778 e.Queue.EventRegister(we, mask) 779 e.Lock() 780 c := e.connected 781 e.Unlock() 782 if c != nil { 783 c.EventUpdate() 784 } 785 } 786 787 // EventUnregister implements waiter.Waitable.EventUnregister. 788 func (e *baseEndpoint) EventUnregister(we *waiter.Entry) { 789 e.Queue.EventUnregister(we) 790 e.Lock() 791 c := e.connected 792 e.Unlock() 793 if c != nil { 794 c.EventUpdate() 795 } 796 } 797 798 // Passcred implements Credentialer.Passcred. 799 func (e *baseEndpoint) Passcred() bool { 800 return e.SocketOptions().GetPassCred() 801 } 802 803 // ConnectedPasscred implements Credentialer.ConnectedPasscred. 
804 func (e *baseEndpoint) ConnectedPasscred() bool { 805 e.Lock() 806 defer e.Unlock() 807 return e.connected != nil && e.connected.Passcred() 808 } 809 810 // Connected implements ConnectingEndpoint.Connected. 811 func (e *baseEndpoint) Connected() bool { 812 return e.receiver != nil && e.connected != nil 813 } 814 815 // RecvMsg reads data and a control message from the endpoint. 816 func (e *baseEndpoint) RecvMsg(ctx context.Context, data [][]byte, creds bool, numRights int, peek bool, addr *tcpip.FullAddress) (int64, int64, ControlMessages, bool, *syserr.Error) { 817 e.Lock() 818 819 receiver := e.receiver 820 if receiver == nil { 821 e.Unlock() 822 return 0, 0, ControlMessages{}, false, syserr.ErrNotConnected 823 } 824 825 recvLen, msgLen, cms, cmt, a, notify, err := receiver.Recv(ctx, data, creds, numRights, peek) 826 e.Unlock() 827 if err != nil { 828 return 0, 0, ControlMessages{}, false, err 829 } 830 831 if notify { 832 receiver.RecvNotify() 833 } 834 835 if addr != nil { 836 *addr = a 837 } 838 return recvLen, msgLen, cms, cmt, nil 839 } 840 841 // SendMsg writes data and a control message to the endpoint's peer. 842 // This method does not block if the data cannot be written. 843 func (e *baseEndpoint) SendMsg(ctx context.Context, data [][]byte, c ControlMessages, to BoundEndpoint) (int64, *syserr.Error) { 844 e.Lock() 845 if !e.Connected() { 846 e.Unlock() 847 return 0, syserr.ErrNotConnected 848 } 849 if to != nil { 850 e.Unlock() 851 return 0, syserr.ErrAlreadyConnected 852 } 853 854 connected := e.connected 855 n, notify, err := connected.Send(ctx, data, c, tcpip.FullAddress{Addr: tcpip.Address(e.path)}) 856 e.Unlock() 857 858 if notify { 859 connected.SendNotify() 860 } 861 862 return n, err 863 } 864 865 // SetSockOpt sets a socket option. 
866 func (e *baseEndpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error { 867 return nil 868 } 869 870 func (e *baseEndpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error { 871 log.Warningf("Unsupported socket option: %d", opt) 872 return nil 873 } 874 875 func (e *baseEndpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { 876 switch opt { 877 case tcpip.ReceiveQueueSizeOption: 878 v := 0 879 e.Lock() 880 if !e.Connected() { 881 e.Unlock() 882 return -1, &tcpip.ErrNotConnected{} 883 } 884 v = int(e.receiver.RecvQueuedSize()) 885 e.Unlock() 886 if v < 0 { 887 return -1, &tcpip.ErrQueueSizeNotSupported{} 888 } 889 return v, nil 890 891 case tcpip.SendQueueSizeOption: 892 e.Lock() 893 if !e.Connected() { 894 e.Unlock() 895 return -1, &tcpip.ErrNotConnected{} 896 } 897 v := e.connected.SendQueuedSize() 898 e.Unlock() 899 if v < 0 { 900 return -1, &tcpip.ErrQueueSizeNotSupported{} 901 } 902 return int(v), nil 903 904 default: 905 log.Warningf("Unsupported socket option: %d", opt) 906 return -1, &tcpip.ErrUnknownProtocolOption{} 907 } 908 } 909 910 // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 911 func (e *baseEndpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { 912 log.Warningf("Unsupported socket option: %T", opt) 913 return &tcpip.ErrUnknownProtocolOption{} 914 } 915 916 // LastError implements Endpoint.LastError. 917 func (*baseEndpoint) LastError() tcpip.Error { 918 return nil 919 } 920 921 // SocketOptions implements Endpoint.SocketOptions. 922 func (e *baseEndpoint) SocketOptions() *tcpip.SocketOptions { 923 return &e.ops 924 } 925 926 // Shutdown closes the read and/or write end of the endpoint connection to its 927 // peer. 
928 func (e *baseEndpoint) Shutdown(flags tcpip.ShutdownFlags) *syserr.Error { 929 e.Lock() 930 if !e.Connected() { 931 e.Unlock() 932 return syserr.ErrNotConnected 933 } 934 935 var ( 936 r = e.receiver 937 c = e.connected 938 shutdownRead = flags&tcpip.ShutdownRead != 0 939 shutdownWrite = flags&tcpip.ShutdownWrite != 0 940 ) 941 if shutdownRead { 942 r.CloseRecv() 943 } 944 if shutdownWrite { 945 c.CloseSend() 946 } 947 e.Unlock() 948 949 // Don't hold e.Mutex while calling CloseNotify. 950 if shutdownRead { 951 r.CloseNotify() 952 } 953 if shutdownWrite { 954 c.CloseNotify() 955 } 956 957 return nil 958 } 959 960 // GetLocalAddress returns the bound path. 961 func (e *baseEndpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { 962 e.Lock() 963 defer e.Unlock() 964 return tcpip.FullAddress{Addr: tcpip.Address(e.path)}, nil 965 } 966 967 // GetRemoteAddress returns the local address of the connected endpoint (if 968 // available). 969 func (e *baseEndpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { 970 e.Lock() 971 c := e.connected 972 e.Unlock() 973 if c != nil { 974 return c.GetLocalAddress() 975 } 976 return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} 977 } 978 979 // Release implements BoundEndpoint.Release. 980 func (*baseEndpoint) Release(context.Context) { 981 // Binding a baseEndpoint doesn't take a reference. 982 } 983 984 // stackHandler is just a stub implementation of tcpip.StackHandler to provide 985 // when initializing socketoptions. 986 type stackHandler struct { 987 } 988 989 // Option implements tcpip.StackHandler. 990 func (h *stackHandler) Option(option interface{}) tcpip.Error { 991 panic("unimplemented") 992 } 993 994 // TransportProtocolOption implements tcpip.StackHandler. 995 func (h *stackHandler) TransportProtocolOption(proto tcpip.TransportProtocolNumber, option tcpip.GettableTransportProtocolOption) tcpip.Error { 996 panic("unimplemented") 997 } 998 999 // getSendBufferLimits implements tcpip.GetSendBufferLimits. 
1000 // 1001 // AF_UNIX sockets buffer sizes are not tied to the networking stack/namespace 1002 // in linux but are bound by net.core.(wmem|rmem)_(max|default). 1003 // 1004 // In gVisor net.core sysctls today are not exposed or if exposed are currently 1005 // tied to the networking stack in use. This makes it complicated for AF_UNIX 1006 // when we are in a new namespace w/ no networking stack. As a result for now we 1007 // define default/max values here in the unix socket implementation itself. 1008 func getSendBufferLimits(tcpip.StackHandler) tcpip.SendBufferSizeOption { 1009 return tcpip.SendBufferSizeOption{ 1010 Min: minimumBufferSize, 1011 Default: defaultBufferSize, 1012 Max: maxBufferSize, 1013 } 1014 } 1015 1016 // getReceiveBufferLimits implements tcpip.GetReceiveBufferLimits. 1017 // 1018 // We define min, max and default values for unix socket implementation. Unix 1019 // sockets do not use receive buffer. 1020 func getReceiveBufferLimits(tcpip.StackHandler) tcpip.ReceiveBufferSizeOption { 1021 return tcpip.ReceiveBufferSizeOption{ 1022 Min: minimumBufferSize, 1023 Default: defaultBufferSize, 1024 Max: maxBufferSize, 1025 } 1026 }