inet.af/netstack@v0.0.0-20220214151720-7585b01ddccf/tcpip/transport/tcp/endpoint.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tcp 16 17 import ( 18 "encoding/binary" 19 "fmt" 20 "io" 21 "math" 22 "runtime" 23 "strings" 24 "sync/atomic" 25 "time" 26 27 "inet.af/netstack/sleep" 28 "inet.af/netstack/sync" 29 "inet.af/netstack/tcpip" 30 "inet.af/netstack/tcpip/hash/jenkins" 31 "inet.af/netstack/tcpip/header" 32 "inet.af/netstack/tcpip/ports" 33 "inet.af/netstack/tcpip/seqnum" 34 "inet.af/netstack/tcpip/stack" 35 "inet.af/netstack/waiter" 36 ) 37 38 // EndpointState represents the state of a TCP endpoint. 39 type EndpointState tcpip.EndpointState 40 41 // Endpoint states. Note that are represented in a netstack-specific manner and 42 // may not be meaningful externally. Specifically, they need to be translated to 43 // Linux's representation for these states if presented to userspace. 44 const ( 45 _ EndpointState = iota 46 // TCP protocol states in sync with the definitions in 47 // https://github.com/torvalds/linux/blob/7acac4b3196/include/net/tcp_states.h#L13 48 StateEstablished 49 StateSynSent 50 StateSynRecv 51 StateFinWait1 52 StateFinWait2 53 StateTimeWait 54 StateClose 55 StateCloseWait 56 StateLastAck 57 StateListen 58 StateClosing 59 60 // Endpoint states internal to netstack. 
61 StateInitial 62 StateBound 63 StateConnecting // Connect() called, but the initial SYN hasn't been sent. 64 StateError 65 ) 66 67 const ( 68 // rcvAdvWndScale is used to split the available socket buffer into 69 // application buffer and the window to be advertised to the peer. This is 70 // currently hard coded to split the available space equally. 71 rcvAdvWndScale = 1 72 73 // SegOverheadFactor is used to multiply the value provided by the 74 // user on a SetSockOpt for setting the socket send/receive buffer sizes. 75 SegOverheadFactor = 2 76 ) 77 78 // connected returns true when s is one of the states representing an 79 // endpoint connected to a peer. 80 func (s EndpointState) connected() bool { 81 switch s { 82 case StateEstablished, StateFinWait1, StateFinWait2, StateTimeWait, StateCloseWait, StateLastAck, StateClosing: 83 return true 84 default: 85 return false 86 } 87 } 88 89 // connecting returns true when s is one of the states representing a 90 // connection in progress, but not yet fully established. 91 func (s EndpointState) connecting() bool { 92 switch s { 93 case StateConnecting, StateSynSent, StateSynRecv: 94 return true 95 default: 96 return false 97 } 98 } 99 100 // internal returns true when the state is netstack internal. 101 func (s EndpointState) internal() bool { 102 switch s { 103 case StateInitial, StateBound, StateConnecting, StateError: 104 return true 105 default: 106 return false 107 } 108 } 109 110 // handshake returns true when s is one of the states representing an endpoint 111 // in the middle of a TCP handshake. 112 func (s EndpointState) handshake() bool { 113 switch s { 114 case StateSynSent, StateSynRecv: 115 return true 116 default: 117 return false 118 } 119 } 120 121 // closed returns true when s is one of the states an endpoint transitions to 122 // when closed or when it encounters an error. This is distinct from a newly 123 // initialized endpoint that was never connected. 
124 func (s EndpointState) closed() bool { 125 switch s { 126 case StateClose, StateError: 127 return true 128 default: 129 return false 130 } 131 } 132 133 // String implements fmt.Stringer.String. 134 func (s EndpointState) String() string { 135 switch s { 136 case StateInitial: 137 return "INITIAL" 138 case StateBound: 139 return "BOUND" 140 case StateConnecting: 141 return "CONNECTING" 142 case StateError: 143 return "ERROR" 144 case StateEstablished: 145 return "ESTABLISHED" 146 case StateSynSent: 147 return "SYN-SENT" 148 case StateSynRecv: 149 return "SYN-RCVD" 150 case StateFinWait1: 151 return "FIN-WAIT1" 152 case StateFinWait2: 153 return "FIN-WAIT2" 154 case StateTimeWait: 155 return "TIME-WAIT" 156 case StateClose: 157 return "CLOSED" 158 case StateCloseWait: 159 return "CLOSE-WAIT" 160 case StateLastAck: 161 return "LAST-ACK" 162 case StateListen: 163 return "LISTEN" 164 case StateClosing: 165 return "CLOSING" 166 default: 167 panic("unreachable") 168 } 169 } 170 171 // Reasons for notifying the protocol goroutine. 172 const ( 173 notifyNonZeroReceiveWindow = 1 << iota 174 notifyClose 175 notifyMTUChanged 176 notifyDrain 177 notifyReset 178 notifyResetByPeer 179 // notifyAbort is a request for an expedited teardown. 180 notifyAbort 181 notifyKeepaliveChanged 182 notifyMSSChanged 183 // notifyTickleWorker is used to tickle the protocol main loop during a 184 // restore after we update the endpoint state to the correct one. This 185 // ensures the loop terminates if the final state of the endpoint is 186 // say TIME_WAIT. 187 notifyTickleWorker 188 notifyError 189 // notifyShutdown means that a connecting socket was shutdown. 190 notifyShutdown 191 ) 192 193 // SACKInfo holds TCP SACK related information for a given endpoint. 194 // 195 // +stateify savable 196 type SACKInfo struct { 197 // Blocks is the maximum number of SACK blocks we track 198 // per endpoint. 
199 Blocks [MaxSACKBlocks]header.SACKBlock 200 201 // NumBlocks is the number of valid SACK blocks stored in the 202 // blocks array above. 203 NumBlocks int 204 } 205 206 // ReceiveErrors collect segment receive errors within transport layer. 207 // 208 // +stateify savable 209 type ReceiveErrors struct { 210 tcpip.ReceiveErrors 211 212 // SegmentQueueDropped is the number of segments dropped due to 213 // a full segment queue. 214 SegmentQueueDropped tcpip.StatCounter 215 216 // ChecksumErrors is the number of segments dropped due to bad checksums. 217 ChecksumErrors tcpip.StatCounter 218 219 // ListenOverflowSynDrop is the number of times the listen queue overflowed 220 // and a SYN was dropped. 221 ListenOverflowSynDrop tcpip.StatCounter 222 223 // ListenOverflowAckDrop is the number of times the final ACK 224 // in the handshake was dropped due to overflow. 225 ListenOverflowAckDrop tcpip.StatCounter 226 227 // ZeroRcvWindowState is the number of times we advertised 228 // a zero receive window when rcvQueue is full. 229 ZeroRcvWindowState tcpip.StatCounter 230 231 // WantZeroWindow is the number of times we wanted to advertise a 232 // zero receive window but couldn't because it would have caused 233 // the receive window's right edge to shrink. 234 WantZeroRcvWindow tcpip.StatCounter 235 } 236 237 // SendErrors collect segment send errors within the transport layer. 238 // 239 // +stateify savable 240 type SendErrors struct { 241 tcpip.SendErrors 242 243 // SegmentSendToNetworkFailed is the number of TCP segments failed to be sent 244 // to the network endpoint. 245 SegmentSendToNetworkFailed tcpip.StatCounter 246 247 // SynSendToNetworkFailed is the number of TCP SYNs failed to be sent 248 // to the network endpoint. 249 SynSendToNetworkFailed tcpip.StatCounter 250 251 // Retransmits is the number of TCP segments retransmitted. 252 Retransmits tcpip.StatCounter 253 254 // FastRetransmit is the number of segments retransmitted in fast 255 // recovery. 
256 FastRetransmit tcpip.StatCounter 257 258 // Timeouts is the number of times the RTO expired. 259 Timeouts tcpip.StatCounter 260 } 261 262 // Stats holds statistics about the endpoint. 263 // 264 // +stateify savable 265 type Stats struct { 266 // SegmentsReceived is the number of TCP segments received that 267 // the transport layer successfully parsed. 268 SegmentsReceived tcpip.StatCounter 269 270 // SegmentsSent is the number of TCP segments sent. 271 SegmentsSent tcpip.StatCounter 272 273 // FailedConnectionAttempts is the number of times we saw Connect and 274 // Accept errors. 275 FailedConnectionAttempts tcpip.StatCounter 276 277 // ReceiveErrors collects segment receive errors within the 278 // transport layer. 279 ReceiveErrors ReceiveErrors 280 281 // ReadErrors collects segment read errors from an endpoint read call. 282 ReadErrors tcpip.ReadErrors 283 284 // SendErrors collects segment send errors within the transport layer. 285 SendErrors SendErrors 286 287 // WriteErrors collects segment write errors from an endpoint write call. 288 WriteErrors tcpip.WriteErrors 289 } 290 291 // IsEndpointStats is an empty method to implement the tcpip.EndpointStats 292 // marker interface. 293 func (*Stats) IsEndpointStats() {} 294 295 // sndQueueInfo implements a send queue. 296 // 297 // +stateify savable 298 type sndQueueInfo struct { 299 sndQueueMu sync.Mutex `state:"nosave"` 300 stack.TCPSndBufState 301 302 // sndWaker is used to signal the protocol goroutine when there may be 303 // segments that need to be sent. 304 sndWaker sleep.Waker `state:"manual"` 305 } 306 307 // rcvQueueInfo contains the endpoint's rcvQueue and associated metadata. 308 // 309 // +stateify savable 310 type rcvQueueInfo struct { 311 rcvQueueMu sync.Mutex `state:"nosave"` 312 stack.TCPRcvBufState 313 314 // rcvQueue is the queue for ready-for-delivery segments. This struct's 315 // mutex must be held in order append segments to list. 
316 rcvQueue segmentList `state:"wait"` 317 } 318 319 // endpoint represents a TCP endpoint. This struct serves as the interface 320 // between users of the endpoint and the protocol implementation; it is legal to 321 // have concurrent goroutines make calls into the endpoint, they are properly 322 // synchronized. The protocol implementation, however, runs in a single 323 // goroutine. 324 // 325 // Each endpoint has a few mutexes: 326 // 327 // e.mu -> Primary mutex for an endpoint must be held for all operations except 328 // in e.Readiness where acquiring it will result in a deadlock in epoll 329 // implementation. 330 // 331 // The following three mutexes can be acquired independent of e.mu but if 332 // acquired with e.mu then e.mu must be acquired first. 333 // 334 // e.acceptMu -> Protects e.acceptQueue. 335 // e.rcvQueueMu -> Protects e.rcvQueue and associated fields. 336 // e.sndQueueMu -> Protects the e.sndQueue and associated fields. 337 // e.lastErrorMu -> Protects the lastError field. 338 // 339 // LOCKING/UNLOCKING of the endpoint. The locking of an endpoint is different 340 // based on the context in which the lock is acquired. In the syscall context 341 // e.LockUser/e.UnlockUser should be used and when doing background processing 342 // e.mu.Lock/e.mu.Unlock should be used. The distinction is described below 343 // in brief. 344 // 345 // The reason for this locking behaviour is to avoid wakeups to handle packets. 346 // In cases where the endpoint is already locked the background processor can 347 // queue the packet up and go its merry way and the lock owner will eventually 348 // process the backlog when releasing the lock. Similarly when acquiring the 349 // lock from say a syscall goroutine we can implement a bit of spinning if we 350 // know that the lock is not held by another syscall goroutine. Background 351 // processors should never hold the lock for long and we can avoid an expensive 352 // sleep/wakeup by spinning for a shortwhile. 
353 // 354 // For more details please see the detailed documentation on 355 // e.LockUser/e.UnlockUser methods. 356 // 357 // +stateify savable 358 type endpoint struct { 359 stack.TCPEndpointStateInner 360 stack.TransportEndpointInfo 361 tcpip.DefaultSocketOptionsHandler 362 363 // endpointEntry is used to queue endpoints for processing to the 364 // a given tcp processor goroutine. 365 // 366 // Precondition: epQueue.mu must be held to read/write this field.. 367 endpointEntry `state:"nosave"` 368 369 // pendingProcessing is true if this endpoint is queued for processing 370 // to a TCP processor. 371 // 372 // Precondition: epQueue.mu must be held to read/write this field.. 373 pendingProcessing bool `state:"nosave"` 374 375 // The following fields are initialized at creation time and do not 376 // change throughout the lifetime of the endpoint. 377 stack *stack.Stack `state:"manual"` 378 protocol *protocol `state:"manual"` 379 waiterQueue *waiter.Queue `state:"wait"` 380 uniqueID uint64 381 382 // hardError is meaningful only when state is stateError. It stores the 383 // error to be returned when read/write syscalls are called and the 384 // endpoint is in this state. hardError is protected by endpoint mu. 385 hardError tcpip.Error 386 387 // lastError represents the last error that the endpoint reported; 388 // access to it is protected by the following mutex. 389 lastErrorMu sync.Mutex `state:"nosave"` 390 lastError tcpip.Error 391 392 // rcvReadMu synchronizes calls to Read. 393 // 394 // mu and rcvQueueMu are temporarily released during data copying. rcvReadMu 395 // must be held during each read to ensure atomicity, so that multiple reads 396 // do not interleave. 397 // 398 // rcvReadMu should be held before holding mu. 399 rcvReadMu sync.Mutex `state:"nosave"` 400 401 // rcvQueueInfo holds the implementation of the endpoint's receive buffer. 
402 // The data within rcvQueueInfo should only be accessed while rcvReadMu, mu, 403 // and rcvQueueMu are held, in that stated order. While processing the segment 404 // range, you can determine a range and then temporarily release mu and 405 // rcvQueueMu, which allows new segments to be appended to the queue while 406 // processing. 407 rcvQueueInfo rcvQueueInfo 408 409 // rcvMemUsed tracks the total amount of memory in use by received segments 410 // held in rcvQueue, pendingRcvdSegments and the segment queue. This is used to 411 // compute the window and the actual available buffer space. This is distinct 412 // from rcvBufUsed above which is the actual number of payload bytes held in 413 // the buffer not including any segment overheads. 414 // 415 // rcvMemUsed must be accessed atomically. 416 rcvMemUsed int32 417 418 // mu protects all endpoint fields unless documented otherwise. mu must 419 // be acquired before interacting with the endpoint fields. 420 // 421 // During handshake, mu is locked by the protocol listen goroutine and 422 // released by the handshake completion goroutine. 423 mu sync.CrossGoroutineMutex `state:"nosave"` 424 ownedByUser uint32 425 426 // state must be read/set using the EndpointState()/setEndpointState() 427 // methods. 428 state uint32 `state:".(EndpointState)"` 429 430 // origEndpointState is only used during a restore phase to save the 431 // endpoint state at restore time as the socket is moved to it's correct 432 // state. 433 origEndpointState uint32 `state:"nosave"` 434 435 isPortReserved bool `state:"manual"` 436 isRegistered bool `state:"manual"` 437 boundNICID tcpip.NICID 438 route *stack.Route `state:"manual"` 439 ttl uint8 440 isConnectNotified bool 441 442 // h stores a reference to the current handshake state if the endpoint is in 443 // the SYN-SENT or SYN-RECV states, in which case endpoint == endpoint.h.ep. 444 // nil otherwise. 
445 h *handshake `state:"nosave"` 446 447 // portFlags stores the current values of port related flags. 448 portFlags ports.Flags 449 450 // Values used to reserve a port or register a transport endpoint 451 // (which ever happens first). 452 boundBindToDevice tcpip.NICID 453 boundPortFlags ports.Flags 454 boundDest tcpip.FullAddress 455 456 // effectiveNetProtos contains the network protocols actually in use. In 457 // most cases it will only contain "netProto", but in cases like IPv6 458 // endpoints with v6only set to false, this could include multiple 459 // protocols (e.g., IPv6 and IPv4) or a single different protocol (e.g., 460 // IPv4 when IPv6 endpoint is bound or connected to an IPv4 mapped 461 // address). 462 effectiveNetProtos []tcpip.NetworkProtocolNumber 463 464 // workerRunning specifies if a worker goroutine is running. 465 workerRunning bool 466 467 // workerCleanup specifies if the worker goroutine must perform cleanup 468 // before exiting. This can only be set to true when workerRunning is 469 // also true, and they're both protected by the mutex. 470 workerCleanup bool 471 472 // recentTSTime is the unix time when we last updated 473 // TCPEndpointStateInner.RecentTS. 474 recentTSTime tcpip.MonotonicTime 475 476 // shutdownFlags represent the current shutdown state of the endpoint. 477 shutdownFlags tcpip.ShutdownFlags 478 479 // tcpRecovery is the loss recovery algorithm used by TCP. 480 tcpRecovery tcpip.TCPRecovery 481 482 // sack holds TCP SACK related information for this endpoint. 483 sack SACKInfo 484 485 // delay enables Nagle's algorithm. 486 // 487 // delay is a boolean (0 is false) and must be accessed atomically. 488 delay uint32 489 490 // scoreboard holds TCP SACK Scoreboard information for this endpoint. 491 scoreboard *SACKScoreboard 492 493 // segmentQueue is used to hand received segments to the protocol 494 // goroutine. Segments are queued as long as the queue is not full, 495 // and dropped when it is. 
496 segmentQueue segmentQueue `state:"wait"` 497 498 // userMSS if non-zero is the MSS value explicitly set by the user 499 // for this endpoint using the TCP_MAXSEG setsockopt. 500 userMSS uint16 501 502 // maxSynRetries is the maximum number of SYN retransmits that TCP should 503 // send before aborting the attempt to connect. It cannot exceed 255. 504 // 505 // NOTE: This is currently a no-op and does not change the SYN 506 // retransmissions. 507 maxSynRetries uint8 508 509 // windowClamp is used to bound the size of the advertised window to 510 // this value. 511 windowClamp uint32 512 513 // sndQueueInfo contains the implementation of the endpoint's send queue. 514 sndQueueInfo sndQueueInfo 515 516 // cc stores the name of the Congestion Control algorithm to use for 517 // this endpoint. 518 cc tcpip.CongestionControlOption 519 520 // newSegmentWaker is used to indicate to the protocol goroutine that 521 // it needs to wake up and handle new segments queued to it. 522 newSegmentWaker sleep.Waker `state:"manual"` 523 524 // notificationWaker is used to indicate to the protocol goroutine that 525 // it needs to wake up and check for notifications. 526 notificationWaker sleep.Waker `state:"manual"` 527 528 // notifyFlags is a bitmask of flags used to indicate to the protocol 529 // goroutine what it was notified; this is only accessed atomically. 530 notifyFlags uint32 `state:"nosave"` 531 532 // keepalive manages TCP keepalive state. When the connection is idle 533 // (no data sent or received) for keepaliveIdle, we start sending 534 // keepalives every keepalive.interval. If we send keepalive.count 535 // without hearing a response, the connection is closed. 536 keepalive keepalive 537 538 // userTimeout if non-zero specifies a user specified timeout for 539 // a connection w/ pending data to send. A connection that has pending 540 // unacked data will be forcibily aborted if the timeout is reached 541 // without any data being acked. 
542 userTimeout time.Duration 543 544 // deferAccept if non-zero specifies a user specified time during 545 // which the final ACK of a handshake will be dropped provided the 546 // ACK is a bare ACK and carries no data. If the timeout is crossed then 547 // the bare ACK is accepted and the connection is delivered to the 548 // listener. 549 deferAccept time.Duration 550 551 // pendingAccepted tracks connections queued to be accepted. It is used to 552 // ensure such queued connections are terminated before the accepted queue is 553 // marked closed (by setting its capacity to zero). 554 pendingAccepted sync.WaitGroup `state:"nosave"` 555 556 // acceptMu protects accepted. 557 acceptMu sync.Mutex `state:"nosave"` 558 559 // acceptCond is a condition variable that can be used to block on when 560 // accepted is full and an endpoint is ready to be delivered. 561 // 562 // We use this condition variable to block/unblock goroutines which 563 // tried to deliver an endpoint but couldn't because accept backlog was 564 // full ( See: endpoint.deliverAccepted ). 565 acceptCond *sync.Cond `state:"nosave"` 566 567 // accepted is used by a listening endpoint protocol goroutine to 568 // send newly accepted connections to the endpoint so that they can be 569 // read by Accept() calls. 570 // +checklocks:acceptMu 571 acceptQueue acceptQueue 572 573 // The following are only used from the protocol goroutine, and 574 // therefore don't need locks to protect them. 575 rcv *receiver `state:"wait"` 576 snd *sender `state:"wait"` 577 578 // The goroutine drain completion notification channel. 579 drainDone chan struct{} `state:"nosave"` 580 581 // The goroutine undrain notification channel. This is currently used as 582 // a way to block the worker goroutines. Today nothing closes/writes 583 // this channel and this causes any goroutines waiting on this to just 584 // block. This is used during save/restore to prevent worker goroutines 585 // from mutating state as it's being saved. 
586 undrain chan struct{} `state:"nosave"` 587 588 // probe if not nil is invoked on every received segment. It is passed 589 // a copy of the current state of the endpoint. 590 probe stack.TCPProbeFunc `state:"nosave"` 591 592 // The following are only used to assist the restore run to re-connect. 593 connectingAddress tcpip.Address 594 595 // amss is the advertised MSS to the peer by this endpoint. 596 amss uint16 597 598 // sendTOS represents IPv4 TOS or IPv6 TrafficClass, 599 // applied while sending packets. Defaults to 0 as on Linux. 600 sendTOS uint8 601 602 gso stack.GSO 603 604 stats Stats 605 606 // tcpLingerTimeout is the maximum amount of a time a socket 607 // a socket stays in TIME_WAIT state before being marked 608 // closed. 609 tcpLingerTimeout time.Duration 610 611 // closed indicates that the user has called closed on the 612 // endpoint and at this point the endpoint is only around 613 // to complete the TCP shutdown. 614 closed bool 615 616 // txHash is the transport layer hash to be set on outbound packets 617 // emitted by this endpoint. 618 txHash uint32 619 620 // owner is used to get uid and gid of the packet. 621 owner tcpip.PacketOwner 622 623 // ops is used to get socket level options. 624 ops tcpip.SocketOptions 625 626 // lastOutOfWindowAckTime is the time at which the an ACK was sent in response 627 // to an out of window segment being received by this endpoint. 628 lastOutOfWindowAckTime tcpip.MonotonicTime 629 } 630 631 // UniqueID implements stack.TransportEndpoint.UniqueID. 632 func (e *endpoint) UniqueID() uint64 { 633 return e.uniqueID 634 } 635 636 // calculateAdvertisedMSS calculates the MSS to advertise. 637 // 638 // If userMSS is non-zero and is not greater than the maximum possible MSS for 639 // r, it will be used; otherwise, the maximum possible MSS will be used. 640 func calculateAdvertisedMSS(userMSS uint16, r *stack.Route) uint16 { 641 // The maximum possible MSS is dependent on the route. 
642 // TODO(b/143359391): Respect TCP Min and Max size. 643 maxMSS := uint16(r.MTU() - header.TCPMinimumSize) 644 645 if userMSS != 0 && userMSS < maxMSS { 646 return userMSS 647 } 648 649 return maxMSS 650 } 651 652 // LockUser tries to lock e.mu and if it fails it will check if the lock is held 653 // by another syscall goroutine. If yes, then it will goto sleep waiting for the 654 // lock to be released, if not then it will spin till it acquires the lock or 655 // another syscall goroutine acquires it in which case it will goto sleep as 656 // described above. 657 // 658 // The assumption behind spinning here being that background packet processing 659 // should not be holding the lock for long and spinning reduces latency as we 660 // avoid an expensive sleep/wakeup of of the syscall goroutine). 661 // +checklocksacquire:e.mu 662 func (e *endpoint) LockUser() { 663 for { 664 // Try first if the sock is locked then check if it's owned 665 // by another user goroutine if not then we spin, otherwise 666 // we just go to sleep on the Lock() and wait. 667 if !e.mu.TryLock() { 668 // If socket is owned by the user then just go to sleep 669 // as the lock could be held for a reasonably long time. 670 if atomic.LoadUint32(&e.ownedByUser) == 1 { 671 e.mu.Lock() 672 atomic.StoreUint32(&e.ownedByUser, 1) 673 return 674 } 675 // Spin but yield the processor since the lower half 676 // should yield the lock soon. 677 runtime.Gosched() 678 continue 679 } 680 atomic.StoreUint32(&e.ownedByUser, 1) 681 return // +checklocksforce 682 } 683 } 684 685 // UnlockUser will check if there are any segments already queued for processing 686 // and process any such segments before unlocking e.mu. This is required because 687 // we when packets arrive and endpoint lock is already held then such packets 688 // are queued up to be processed. 
If the lock is held by the endpoint goroutine 689 // then it will process these packets but if the lock is instead held by the 690 // syscall goroutine then we can have the syscall goroutine process the backlog 691 // before unlocking. 692 // 693 // This avoids an unnecessary wakeup of the endpoint protocol goroutine for the 694 // endpoint. It's also required eventually when we get rid of the endpoint 695 // protocol goroutine altogether. 696 // 697 // Precondition: e.LockUser() must have been called before calling e.UnlockUser() 698 // +checklocksrelease:e.mu 699 func (e *endpoint) UnlockUser() { 700 // Lock segment queue before checking so that we avoid a race where 701 // segments can be queued between the time we check if queue is empty 702 // and actually unlock the endpoint mutex. 703 for { 704 e.segmentQueue.mu.Lock() 705 if e.segmentQueue.emptyLocked() { 706 if atomic.SwapUint32(&e.ownedByUser, 0) != 1 { 707 panic("e.UnlockUser() called without calling e.LockUser()") 708 } 709 e.mu.Unlock() 710 e.segmentQueue.mu.Unlock() 711 return 712 } 713 e.segmentQueue.mu.Unlock() 714 715 switch e.EndpointState() { 716 case StateEstablished: 717 if err := e.handleSegmentsLocked(true /* fastPath */); err != nil { 718 e.notifyProtocolGoroutine(notifyTickleWorker) 719 } 720 default: 721 // Since we are waking the endpoint goroutine here just unlock 722 // and let it process the queued segments. 723 e.newSegmentWaker.Assert() 724 if atomic.SwapUint32(&e.ownedByUser, 0) != 1 { 725 panic("e.UnlockUser() called without calling e.LockUser()") 726 } 727 e.mu.Unlock() 728 return 729 } 730 } 731 } 732 733 // StopWork halts packet processing. Only to be used in tests. 734 // +checklocksacquire:e.mu 735 func (e *endpoint) StopWork() { 736 e.mu.Lock() 737 } 738 739 // ResumeWork resumes packet processing. Only to be used in tests. 
740 // +checklocksrelease:e.mu 741 func (e *endpoint) ResumeWork() { 742 e.mu.Unlock() 743 } 744 745 // setEndpointState updates the state of the endpoint to state atomically. This 746 // method is unexported as the only place we should update the state is in this 747 // package but we allow the state to be read freely without holding e.mu. 748 // 749 // Precondition: e.mu must be held to call this method. 750 func (e *endpoint) setEndpointState(state EndpointState) { 751 oldstate := EndpointState(atomic.SwapUint32(&e.state, uint32(state))) 752 switch state { 753 case StateEstablished: 754 e.stack.Stats().TCP.CurrentEstablished.Increment() 755 e.stack.Stats().TCP.CurrentConnected.Increment() 756 case StateError: 757 fallthrough 758 case StateClose: 759 if oldstate == StateCloseWait || oldstate == StateEstablished { 760 e.stack.Stats().TCP.EstablishedResets.Increment() 761 } 762 fallthrough 763 default: 764 if oldstate == StateEstablished { 765 e.stack.Stats().TCP.CurrentEstablished.Decrement() 766 } 767 } 768 } 769 770 // EndpointState returns the current state of the endpoint. 771 func (e *endpoint) EndpointState() EndpointState { 772 return EndpointState(atomic.LoadUint32(&e.state)) 773 } 774 775 // setRecentTimestamp sets the recentTS field to the provided value. 776 func (e *endpoint) setRecentTimestamp(recentTS uint32) { 777 e.RecentTS = recentTS 778 e.recentTSTime = e.stack.Clock().NowMonotonic() 779 } 780 781 // recentTimestamp returns the value of the recentTS field. 782 func (e *endpoint) recentTimestamp() uint32 { 783 return e.RecentTS 784 } 785 786 // keepalive is a synchronization wrapper used to appease stateify. See the 787 // comment in endpoint, where it is used. 
//
// +stateify savable
type keepalive struct {
	sync.Mutex `state:"nosave"`
	// idle, interval and count are the keepalive tunables. newEndpoint seeds
	// them with DefaultKeepaliveIdle, DefaultKeepaliveInterval and
	// DefaultKeepaliveCount; SetSockOpt/SetSockOptInt overwrite them while
	// holding the embedded mutex.
	idle     time.Duration
	interval time.Duration
	count    int
	// NOTE(review): presumably the number of keepalive probes sent without a
	// reply; nothing in this part of the file reads or writes it - confirm.
	unacked int
	// timer is initialised in newEndpoint with the stack clock and waker.
	timer timer       `state:"nosave"`
	waker sleep.Waker `state:"nosave"`
}

// newEndpoint allocates a TCP endpoint in StateInitial for network protocol
// netProto.
//
// The endpoint starts with the package defaults (buffer sizes, keepalive
// settings, window clamp, SYN retries). Each stack-wide TCP option that
// s.TransportProtocolOption can successfully return then overrides the
// corresponding default: send/receive buffer size, congestion control
// algorithm, receive buffer moderation, delay option, linger timeout and SYN
// retry count. A lookup error is not reported; the default simply stays in
// place.
//
// Finally the segment queue back-pointer, the accept condition variable and the
// keepalive timer are initialised.
func newEndpoint(s *stack.Stack, protocol *protocol, netProto tcpip.NetworkProtocolNumber, waiterQueue *waiter.Queue) *endpoint {
	e := &endpoint{
		stack:    s,
		protocol: protocol,
		TransportEndpointInfo: stack.TransportEndpointInfo{
			NetProto:   netProto,
			TransProto: header.TCPProtocolNumber,
		},
		sndQueueInfo: sndQueueInfo{
			TCPSndBufState: stack.TCPSndBufState{
				SndMTU: math.MaxInt32,
			},
		},
		waiterQueue: waiterQueue,
		state:       uint32(StateInitial),
		keepalive: keepalive{
			idle:     DefaultKeepaliveIdle,
			interval: DefaultKeepaliveInterval,
			count:    DefaultKeepaliveCount,
		},
		uniqueID:      s.UniqueID(),
		txHash:        s.Rand().Uint32(),
		windowClamp:   DefaultReceiveBufferSize,
		maxSynRetries: DefaultSynRetries,
	}
	e.ops.InitHandler(e, e.stack, GetTCPSendBufferLimits, GetTCPReceiveBufferLimits)
	e.ops.SetMulticastLoop(true)
	e.ops.SetQuickAck(true)
	e.ops.SetSendBufferSize(DefaultSendBufferSize, false /* notify */)
	e.ops.SetReceiveBufferSize(DefaultReceiveBufferSize, false /* notify */)

	// From here on, stack-level options (when available) replace the
	// defaults installed above.
	var ss tcpip.TCPSendBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &ss); err == nil {
		e.ops.SetSendBufferSize(int64(ss.Default), false /* notify */)
	}

	var rs tcpip.TCPReceiveBufferSizeRangeOption
	if err := s.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
		e.ops.SetReceiveBufferSize(int64(rs.Default), false /* notify */)
	}

	var cs tcpip.CongestionControlOption
	if err := s.TransportProtocolOption(ProtocolNumber, &cs); err == nil {
		e.cc = cs
	}

	var mrb tcpip.TCPModerateReceiveBufferOption
	if err := s.TransportProtocolOption(ProtocolNumber, &mrb); err == nil {
		// The stack option enables moderation; the endpoint stores the
		// inverse ("disabled") flag.
		e.rcvQueueInfo.RcvAutoParams.Disabled = !bool(mrb)
	}

	var de tcpip.TCPDelayEnabled
	if err := s.TransportProtocolOption(ProtocolNumber, &de); err == nil && de {
		e.ops.SetDelayOption(true)
	}

	var tcpLT tcpip.TCPLingerTimeoutOption
	if err := s.TransportProtocolOption(ProtocolNumber, &tcpLT); err == nil {
		e.tcpLingerTimeout = time.Duration(tcpLT)
	}

	var synRetries tcpip.TCPSynRetriesOption
	if err := s.TransportProtocolOption(ProtocolNumber, &synRetries); err == nil {
		e.maxSynRetries = uint8(synRetries)
	}

	if p := s.GetTCPProbe(); p != nil {
		e.probe = p
	}

	e.segmentQueue.ep = e

	e.acceptCond = sync.NewCond(&e.acceptMu)
	e.keepalive.timer.init(e.stack.Clock(), &e.keepalive.waker)

	return e
}

// Readiness returns the current readiness of the endpoint. For example, if
// waiter.EventIn is set, the endpoint is immediately readable.
//
// Only the events requested in mask are evaluated for listening and connected
// endpoints. The exceptions are StateInitial/StateBound, which always report
// EventHUp, and StateClose/StateError/StateTimeWait, which report the whole
// mask.
func (e *endpoint) Readiness(mask waiter.EventMask) waiter.EventMask {
	result := waiter.EventMask(0)

	switch e.EndpointState() {
	case StateInitial, StateBound:
		// This prevents blocking of new sockets which are not
		// connected when SO_LINGER is set.
		result |= waiter.EventHUp

	case StateConnecting, StateSynSent, StateSynRecv:
		// Ready for nothing.

	case StateClose, StateError, StateTimeWait:
		// Ready for anything.
		result = mask

	case StateListen:
		// Check if there's anything in the accepted queue.
		if (mask & waiter.ReadableEvents) != 0 {
			e.acceptMu.Lock()
			if e.acceptQueue.endpoints.Len() != 0 {
				result |= waiter.ReadableEvents
			}
			e.acceptMu.Unlock()
		}
	}
	// The state is sampled again here; StateTimeWait counts as connected, so
	// it passes through both the switch above and this branch.
	if e.EndpointState().connected() {
		// Determine if the endpoint is writable if requested.
		if (mask & waiter.WritableEvents) != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			sndBufSize := e.getSendBufferSize()
			// A send-closed endpoint is reported writable as well as one
			// with free send buffer space.
			if e.sndQueueInfo.SndClosed || e.sndQueueInfo.SndBufUsed < sndBufSize {
				result |= waiter.WritableEvents
			}
			e.sndQueueInfo.sndQueueMu.Unlock()
		}

		// Determine if the endpoint is readable if requested.
		if (mask & waiter.ReadableEvents) != 0 {
			e.rcvQueueInfo.rcvQueueMu.Lock()
			if e.rcvQueueInfo.RcvBufUsed > 0 || e.rcvQueueInfo.RcvClosed {
				result |= waiter.ReadableEvents
			}
			e.rcvQueueInfo.rcvQueueMu.Unlock()
		}
	}

	return result
}

// fetchNotifications atomically reads e.notifyFlags and resets it to zero,
// returning the flags that were set.
func (e *endpoint) fetchNotifications() uint32 {
	return atomic.SwapUint32(&e.notifyFlags, 0)
}

// notifyProtocolGoroutine sets the bits of n in e.notifyFlags using a
// compare-and-swap loop. It returns without modifying anything when all bits
// of n are already set. The notification waker is asserted only when the flags
// go from zero to non-zero.
func (e *endpoint) notifyProtocolGoroutine(n uint32) {
	for {
		v := atomic.LoadUint32(&e.notifyFlags)
		if v&n == n {
			// The flags are already set.
			return
		}

		if atomic.CompareAndSwapUint32(&e.notifyFlags, v, v|n) {
			if v == 0 {
				// We are causing a transition from no flags to
				// at least one flag set, so we must cause the
				// protocol goroutine to wake up.
				e.notificationWaker.Assert()
			}
			return
		}
		// CAS lost a race with another writer; reload and retry.
	}
}

// Abort implements stack.TransportEndpoint.Abort.
//
// Endpoints in StateConnecting, StateSynRecv or any connected state are sent
// notifyAbort; every other endpoint is closed directly via Close.
func (e *endpoint) Abort() {
	// The abort notification is not processed synchronously, so no
	// synchronization is needed.
	//
	// If the endpoint becomes connected after this check, we still close
	// the endpoint. This worst case results in a slower abort.
	//
	// If the endpoint disconnected after the check, nothing needs to be
	// done, so sending a notification which will potentially be ignored is
	// fine.
	//
	// If the endpoint connecting finishes after the check, the endpoint
	// is either in a connected state (where we would notifyAbort anyway),
	// SYN-RECV (where we would also notifyAbort anyway), or in an error
	// state where nothing is required and the notification can be safely
	// ignored.
	//
	// Endpoints where a Close during connecting or SYN-RECV state would be
	// problematic are set to state connecting before being registered (and
	// thus possible to be Aborted). They are never available in initial
	// state.
	//
	// Endpoints transitioning from initial to connecting state may be
	// safely either closed or sent notifyAbort.
	if s := e.EndpointState(); s == StateConnecting || s == StateSynRecv || s.connected() {
		e.notifyProtocolGoroutine(notifyAbort)
		return
	}
	e.Close()
}

// Close puts the endpoint in a closed state and frees all resources associated
// with it. It must be called only once and with no other concurrent calls to
// the endpoint.
//
// When linger is enabled with a zero timeout and the endpoint is in
// ESTABLISHED, CLOSE-WAIT, FIN-WAIT-1, FIN-WAIT-2 or SYN-RECV, the connection
// is reset instead of being shut down. A call on an endpoint that is already
// marked closed returns immediately.
func (e *endpoint) Close() {
	e.LockUser()
	defer e.UnlockUser()
	if e.closed {
		return
	}

	linger := e.SocketOptions().GetLinger()
	if linger.Enabled && linger.Timeout == 0 {
		s := e.EndpointState()
		isResetState := s == StateEstablished || s == StateCloseWait || s == StateFinWait1 || s == StateFinWait2 || s == StateSynRecv
		if isResetState {
			// Close the endpoint without doing full shutdown and
			// send a RST.
			e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
			e.closeNoShutdownLocked()

			// Wake up worker to close the endpoint. s is the state
			// sampled before the reset above.
			switch s {
			case StateSynRecv:
				e.notifyProtocolGoroutine(notifyClose)
			default:
				e.notifyProtocolGoroutine(notifyTickleWorker)
			}
			return
		}
	}

	// Issue a shutdown so that the peer knows we won't send any more data
	// if we're connected, or stop accepting if we're listening.
	e.shutdownLocked(tcpip.ShutdownWrite | tcpip.ShutdownRead)
	e.closeNoShutdownLocked()
}

// closeNoShutdownLocked closes the endpoint without doing a full shutdown.
//
// It marks the endpoint closed. Unless the endpoint is already in StateClose
// or StateError, it then either hands cleanup to the running worker
// (notifyClose) or transitions to StateClose inline, and notifies waiters.
//
// NOTE(review): both callers in this file hold e.mu (taken via LockUser);
// presumably that is the precondition implied by the Locked suffix - confirm.
func (e *endpoint) closeNoShutdownLocked() {
	// For listening sockets, we always release ports inline so that they
	// are immediately available for reuse after Close() is called. If also
	// registered, we unregister as well otherwise the next user would fail
	// in Listen() when trying to register.
	if e.EndpointState() == StateListen && e.isPortReserved {
		if e.isRegistered {
			e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
			e.isRegistered = false
		}

		portRes := ports.Reservation{
			Networks:     e.effectiveNetProtos,
			Transport:    ProtocolNumber,
			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
			Port:         e.TransportEndpointInfo.ID.LocalPort,
			Flags:        e.boundPortFlags,
			BindToDevice: e.boundBindToDevice,
			Dest:         e.boundDest,
		}
		e.stack.ReleasePort(portRes)
		e.isPortReserved = false
		e.boundBindToDevice = 0
		e.boundPortFlags = ports.Flags{}
		e.boundDest = tcpip.FullAddress{}
	}

	// Mark endpoint as closed.
	e.closed = true

	// Nothing further to do (and nobody to notify) if the endpoint has
	// already reached a terminal state.
	switch e.EndpointState() {
	case StateClose, StateError:
		return
	}

	eventMask := waiter.ReadableEvents | waiter.WritableEvents
	// Either perform the local cleanup or kick the worker to make sure it
	// knows it needs to cleanup.
	if e.workerRunning {
		e.workerCleanup = true
		tcpip.AddDanglingEndpoint(e)
		// Worker will remove the dangling endpoint when the endpoint
		// goroutine terminates.
		e.notifyProtocolGoroutine(notifyClose)
	} else {
		e.transitionToStateCloseLocked()
		// Notify that the endpoint is closed.
		eventMask |= waiter.EventHUp
	}

	// The TCP closing state-machine would eventually notify EventHUp, but we
	// notify EventIn|EventOut immediately to unblock any blocked waiters.
	e.waiterQueue.Notify(eventMask)
}

// closePendingAcceptableConnectionsLocked closes all connections that have
// completed handshake but not yet been delivered to the application, as well
// as the endpoints still tracked in acceptQueue.pendingEndpoints.
//
// Pending endpoints are sent notifyClose and queued endpoints are sent
// notifyReset. Both collections are then emptied, waiters on acceptCond are
// woken, and the call blocks on e.pendingAccepted.Wait().
func (e *endpoint) closePendingAcceptableConnectionsLocked() {
	e.acceptMu.Lock()
	// Close any endpoints in SYN-RCVD state.
	for n := range e.acceptQueue.pendingEndpoints {
		n.notifyProtocolGoroutine(notifyClose)
	}
	e.acceptQueue.pendingEndpoints = nil
	// Reset all connections that are waiting to be accepted.
	for n := e.acceptQueue.endpoints.Front(); n != nil; n = n.Next() {
		n.Value.(*endpoint).notifyProtocolGoroutine(notifyReset)
	}
	e.acceptQueue.endpoints.Init()
	e.acceptMu.Unlock()

	e.acceptCond.Broadcast()

	// Wait for reset of all endpoints that are still waiting to be delivered to
	// the now-closed accept queue.
	e.pendingAccepted.Wait()
}

// cleanupLocked frees all resources associated with the endpoint. It is called
// after Close() is called and the worker goroutine (if any) is done with its
// work.
//
// It closes pending/acceptable connections, stops the keepalive timer,
// unregisters the endpoint, releases its port reservation and route, and
// removes it from the dangling endpoint set.
func (e *endpoint) cleanupLocked() {
	// Close all endpoints that might have been accepted by TCP but not by
	// the client.
	e.closePendingAcceptableConnectionsLocked()
	e.keepalive.timer.cleanup()

	e.workerCleanup = false

	if e.isRegistered {
		e.stack.StartTransportEndpointCleanup(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		e.isRegistered = false
	}

	if e.isPortReserved {
		portRes := ports.Reservation{
			Networks:     e.effectiveNetProtos,
			Transport:    ProtocolNumber,
			Addr:         e.TransportEndpointInfo.ID.LocalAddress,
			Port:         e.TransportEndpointInfo.ID.LocalPort,
			Flags:        e.boundPortFlags,
			BindToDevice: e.boundBindToDevice,
			Dest:         e.boundDest,
		}
		e.stack.ReleasePort(portRes)
		e.isPortReserved = false
	}
	// The bound* fields are cleared whether or not a port was reserved.
	e.boundBindToDevice = 0
	e.boundPortFlags = ports.Flags{}
	e.boundDest = tcpip.FullAddress{}

	if e.route != nil {
		e.route.Release()
		e.route = nil
	}

	e.stack.CompleteTransportEndpointCleanup(e)
	tcpip.DeleteDanglingEndpoint(e)
}

// wndFromSpace returns the window that we can advertise based on the available
// receive buffer space: space shifted right by rcvAdvWndScale.
func wndFromSpace(space int) int {
	return space >> rcvAdvWndScale
}

// initialReceiveWindow returns the initial receive window to advertise in the
// SYN/SYN-ACK.
//
// The window is the smallest of the window derived from the available receive
// buffer, math.MaxUint16, and 2*InitialCwnd advertised-MSS-sized segments. It
// is then rounded down to a multiple of the handshake window scale and is
// never less than 1.
func (e *endpoint) initialReceiveWindow() int {
	rcvWnd := wndFromSpace(e.receiveBufferAvailable())
	// No window scaling applies to the SYN itself, so the value must fit in
	// the 16-bit window field.
	if rcvWnd > math.MaxUint16 {
		rcvWnd = math.MaxUint16
	}

	// Use the user supplied MSS, if available.
	routeWnd := InitialCwnd * int(calculateAdvertisedMSS(e.userMSS, e.route)) * 2
	if rcvWnd > routeWnd {
		rcvWnd = routeWnd
	}
	rcvWndScale := e.rcvWndScaleForHandshake()

	// Round-down the rcvWnd to a multiple of wndScale. This ensures that the
	// window offered in SYN won't be reduced due to the loss of precision if
	// window scaling is enabled after the handshake.
	rcvWnd = (rcvWnd >> uint8(rcvWndScale)) << uint8(rcvWndScale)

	// Ensure we can always accept at least 1 byte if the scale specified
	// was too high for the provided rcvWnd.
	if rcvWnd == 0 {
		rcvWnd = 1
	}

	return rcvWnd
}

// ModerateRecvBuf adjusts the receive buffer and the advertised window
// based on the number of bytes copied to userspace.
//
// copied is accumulated into RcvAutoParams.CopiedBytes until at least one RTT
// has elapsed since RcvAutoParams.MeasureTime. When the amount copied during
// that interval exceeds the previous interval's amount, a larger receive
// buffer is computed and applied. The buffer is never shrunk. The call is a
// no-op when auto-tuning is disabled.
func (e *endpoint) ModerateRecvBuf(copied int) {
	e.LockUser()
	defer e.UnlockUser()

	e.rcvQueueInfo.rcvQueueMu.Lock()
	if e.rcvQueueInfo.RcvAutoParams.Disabled {
		e.rcvQueueInfo.rcvQueueMu.Unlock()
		return
	}
	now := e.stack.Clock().NowMonotonic()
	// No RTT estimate yet, or still inside the current measurement
	// interval: just keep counting.
	if rtt := e.rcvQueueInfo.RcvAutoParams.RTT; rtt == 0 || now.Sub(e.rcvQueueInfo.RcvAutoParams.MeasureTime) < rtt {
		e.rcvQueueInfo.RcvAutoParams.CopiedBytes += copied
		e.rcvQueueInfo.rcvQueueMu.Unlock()
		return
	}
	prevRTTCopied := e.rcvQueueInfo.RcvAutoParams.CopiedBytes + copied
	prevCopied := e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes
	rcvWnd := 0
	if prevRTTCopied > prevCopied {
		// The minimal receive window based on what was copied by the app
		// in the immediate preceding RTT and some extra buffer for 16
		// segments to account for variations.
		// We multiply by 2 to account for packet losses.
		rcvWnd = prevRTTCopied*2 + 16*int(e.amss)

		// Scale for slow start based on bytes copied in this RTT vs previous.
		//
		// NOTE(review): prevCopied is the divisor; this assumes
		// PrevCopiedBytes has been set to a non-zero value before RTT
		// becomes non-zero - confirm against the code that initialises
		// RcvAutoParams.
		grow := (rcvWnd * (prevRTTCopied - prevCopied)) / prevCopied

		// Multiply growth factor by 2 again to account for sender being
		// in slow-start where the sender grows its congestion window
		// by 100% per RTT.
		rcvWnd += grow * 2

		// Make sure auto tuned buffer size can always receive up to 2x
		// the initial window of 10 segments.
		if minRcvWnd := int(e.amss) * InitialCwnd * 2; rcvWnd < minRcvWnd {
			rcvWnd = minRcvWnd
		}

		// Cap the auto tuned buffer size by the maximum permissible
		// receive buffer size.
		if max := e.maxReceiveBufferSize(); rcvWnd > max {
			rcvWnd = max
		}

		// We do not adjust downwards as that can cause the receiver to
		// reject valid data that might already be in flight as the
		// acceptable window will shrink.
		rcvBufSize := int(e.ops.GetReceiveBufferSize())
		if rcvWnd > rcvBufSize {
			availBefore := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize))
			e.ops.SetReceiveBufferSize(int64(rcvWnd), false /* notify */)
			availAfter := wndFromSpace(e.receiveBufferAvailableLocked(rcvWnd))
			// Ask the protocol goroutine to send a window update if the
			// growth moved the window above the ACK threshold.
			if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, rcvBufSize); crossed && above {
				e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
			}
		}

		// We only update PrevCopiedBytes when we grow the buffer because in cases
		// where PrevCopiedBytes > prevRTTCopied the existing buffer is already big
		// enough to handle the current rate and we don't need to do any
		// adjustments.
		e.rcvQueueInfo.RcvAutoParams.PrevCopiedBytes = prevRTTCopied
	}
	// Start a new measurement interval.
	e.rcvQueueInfo.RcvAutoParams.MeasureTime = now
	e.rcvQueueInfo.RcvAutoParams.CopiedBytes = 0
	e.rcvQueueInfo.rcvQueueMu.Unlock()
}

// SetOwner implements tcpip.Endpoint.SetOwner.
func (e *endpoint) SetOwner(owner tcpip.PacketOwner) {
	e.owner = owner
}

// hardErrorLocked returns e.hardError and clears it, so a given hard error is
// returned at most once.
//
// Preconditions: e.mu must be held to call this function.
func (e *endpoint) hardErrorLocked() tcpip.Error {
	err := e.hardError
	e.hardError = nil
	return err
}

// lastErrorLocked returns e.lastError and clears it, taking e.lastErrorMu
// while doing so.
//
// Preconditions: e.mu must be held to call this function.
func (e *endpoint) lastErrorLocked() tcpip.Error {
	e.lastErrorMu.Lock()
	defer e.lastErrorMu.Unlock()
	err := e.lastError
	e.lastError = nil
	return err
}

// LastError implements tcpip.Endpoint.LastError.
//
// A pending hard error takes precedence over (and is returned instead of) the
// last soft error. Whichever error is returned is cleared.
func (e *endpoint) LastError() tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()
	if err := e.hardErrorLocked(); err != nil {
		return err
	}
	return e.lastErrorLocked()
}

// LastErrorLocked reads and clears lastError with e.mu held.
// Only to be used in tests.
func (e *endpoint) LastErrorLocked() tcpip.Error {
	return e.lastErrorLocked()
}

// UpdateLastError implements tcpip.SocketOptionsHandler.UpdateLastError.
//
// It overwrites e.lastError with err while holding both e.mu and
// e.lastErrorMu.
func (e *endpoint) UpdateLastError(err tcpip.Error) {
	e.LockUser()
	e.lastErrorMu.Lock()
	e.lastError = err
	e.lastErrorMu.Unlock()
	e.UnlockUser()
}

// Read implements tcpip.Endpoint.Read.
//
// Queued segment data is written to dst. With opts.Peek set the data is left
// in the receive queue; otherwise every segment read is committed via
// commitRead. If any bytes were transferred they are reported with a nil
// error, even when a later write to dst failed. If nothing was transferred
// and the write failed, ErrBadBuffer is returned. Errors from startRead are
// returned unchanged.
func (e *endpoint) Read(dst io.Writer, opts tcpip.ReadOptions) (tcpip.ReadResult, tcpip.Error) {
	e.rcvReadMu.Lock()
	defer e.rcvReadMu.Unlock()

	// N.B. Here we get a range of segments to be processed. It is safe to not
	// hold rcvQueueMu when processing, since we hold rcvReadMu to ensure only we
	// can remove segments from the list through commitRead().
	first, last, serr := e.startRead()
	if serr != nil {
		if _, ok := serr.(*tcpip.ErrClosedForReceive); ok {
			e.stats.ReadErrors.ReadClosed.Increment()
		}
		return tcpip.ReadResult{}, serr
	}

	var err error
	done := 0
	s := first
	for s != nil {
		var n int
		n, err = s.data.ReadTo(dst, opts.Peek)
		// Book keeping first then error handling.

		done += n

		if opts.Peek {
			// For peek, we use the (first, last) range of segment returned from
			// startRead. We don't consume the receive buffer, so commitRead should
			// not be called.
			//
			// N.B. It is important to use `last` to determine the last segment, since
			// appending can happen while we process, and will lead to data race.
			if s == last {
				break
			}
			s = s.Next()
		} else {
			// N.B. commitRead() conveniently returns the next segment to read, after
			// removing the data/segment that is read.
			s = e.commitRead(n)
		}

		if err != nil {
			break
		}
	}

	// If something is read, we must report it. Report error when nothing is read.
	if done == 0 && err != nil {
		return tcpip.ReadResult{}, &tcpip.ErrBadBuffer{}
	}
	return tcpip.ReadResult{
		Count: done,
		Total: done,
	}, nil
}

// startRead checks that endpoint is in a readable state, and return the
// inclusive range of segments that can be read.
//
// Errors returned:
//   - ErrWouldBlock: endpoint is in SYN-SENT, or is connected with nothing
//     buffered and the receive side still open.
//   - the pending hard error, or ErrClosedForReceive if there is none:
//     endpoint is in StateError with nothing buffered.
//   - ErrNotConnected: endpoint is neither connected, closed nor in error and
//     has nothing buffered.
//   - ErrClosedForReceive: nothing buffered and the receive side is closed or
//     the endpoint is no longer connected.
//
// Precondition: e.rcvReadMu must be held.
func (e *endpoint) startRead() (first, last *segment, err tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	// When in SYN-SENT state, let the caller block on the receive.
	// An application can initiate a non-blocking connect and then block
	// on a receive. It can expect to read any data after the handshake
	// is complete. RFC793, section 3.9, p58.
	if e.EndpointState() == StateSynSent {
		return nil, nil, &tcpip.ErrWouldBlock{}
	}

	// The endpoint can be read if it's connected, or if it's already closed
	// but has some pending unread data. Also note that a RST being received
	// would cause the state to become StateError so we should allow the
	// reads to proceed before returning a ECONNRESET.
	e.rcvQueueInfo.rcvQueueMu.Lock()
	defer e.rcvQueueInfo.rcvQueueMu.Unlock()

	bufUsed := e.rcvQueueInfo.RcvBufUsed
	if s := e.EndpointState(); !s.connected() && s != StateClose && bufUsed == 0 {
		if s == StateError {
			// Fetching the hard error also clears it.
			if err := e.hardErrorLocked(); err != nil {
				return nil, nil, err
			}
			return nil, nil, &tcpip.ErrClosedForReceive{}
		}
		e.stats.ReadErrors.NotConnected.Increment()
		return nil, nil, &tcpip.ErrNotConnected{}
	}

	if e.rcvQueueInfo.RcvBufUsed == 0 {
		if e.rcvQueueInfo.RcvClosed || !e.EndpointState().connected() {
			return nil, nil, &tcpip.ErrClosedForReceive{}
		}
		return nil, nil, &tcpip.ErrWouldBlock{}
	}

	return e.rcvQueueInfo.rcvQueue.Front(), e.rcvQueueInfo.rcvQueue.Back(), nil
}

// commitRead commits a read of done bytes and returns the next non-empty
// segment to read. Data read from the segment must have also been removed from
// the segment in order for this method to work correctly.
//
// It is performance critical to call commitRead frequently when servicing a big
// Read request, so TCP can make progress timely. Right now, it is designed to
// do this per segment read, hence this method conveniently returns the next
// segment to read while holding the lock.
//
// Precondition: e.rcvReadMu must be held.
func (e *endpoint) commitRead(done int) *segment {
	e.LockUser()
	defer e.UnlockUser()
	e.rcvQueueInfo.rcvQueueMu.Lock()
	defer e.rcvQueueInfo.rcvQueueMu.Unlock()

	// Drop every fully drained segment from the front of the queue and
	// total up the memory they occupied.
	memDelta := 0
	s := e.rcvQueueInfo.rcvQueue.Front()
	for s != nil && s.data.Size() == 0 {
		e.rcvQueueInfo.rcvQueue.Remove(s)
		// Memory is only considered released when the whole segment has been
		// read.
		memDelta += s.segMemSize()
		s.decRef()
		s = e.rcvQueueInfo.rcvQueue.Front()
	}
	e.rcvQueueInfo.RcvBufUsed -= done

	if memDelta > 0 {
		// If the window was small before this read and if the read freed up
		// enough buffer space, to either fit an aMSS or half a receive buffer
		// (whichever smaller), then notify the protocol goroutine to send a
		// window update.
		if crossed, above := e.windowCrossedACKThresholdLocked(memDelta, int(e.ops.GetReceiveBufferSize())); crossed && above {
			e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow)
		}
	}

	return e.rcvQueueInfo.rcvQueue.Front()
}

// isEndpointWritableLocked checks if a given endpoint is writable
// and also returns the number of bytes that can be written at this
// moment. If the endpoint is not writable then it returns an error
// indicating the reason why it's not writable.
//
// Errors returned:
//   - the pending hard error (which is cleared), or ErrClosedForSend if there
//     is none: endpoint is in StateError.
//   - ErrClosedForSend: endpoint is neither connecting nor connected, or the
//     send side is closed.
//   - ErrWouldBlock: endpoint is still connecting, or the send buffer is full.
//
// Caller must hold e.mu and e.sndQueueMu
func (e *endpoint) isEndpointWritableLocked() (int, tcpip.Error) {
	// The endpoint cannot be written to if it's not connected.
	switch s := e.EndpointState(); {
	case s == StateError:
		if err := e.hardErrorLocked(); err != nil {
			return 0, err
		}
		return 0, &tcpip.ErrClosedForSend{}
	case !s.connecting() && !s.connected():
		return 0, &tcpip.ErrClosedForSend{}
	case s.connecting():
		// As per RFC793, page 56, a send request arriving when in connecting
		// state, can be queued to be completed after the state becomes
		// connected. Return an error code for the caller of endpoint Write to
		// try again, until the connection handshake is complete.
		return 0, &tcpip.ErrWouldBlock{}
	}

	// Check if the connection has already been closed for sends.
	if e.sndQueueInfo.SndClosed {
		return 0, &tcpip.ErrClosedForSend{}
	}

	sndBufSize := e.getSendBufferSize()
	avail := sndBufSize - e.sndQueueInfo.SndBufUsed
	if avail <= 0 {
		return 0, &tcpip.ErrWouldBlock{}
	}
	return avail, nil
}

// readFromPayloader reads a slice from the Payloader.
//
// At most avail bytes (or p.Len() bytes, if smaller) are read. A nil slice and
// nil error are returned when there is nothing to read. A read error other
// than io.EOF is reported as ErrBadBuffer.
//
// Unless opts.Atomic is set, e.sndQueueInfo.sndQueueMu and e.mu are released
// for the duration of the copy and re-acquired before returning, so endpoint
// state may have changed by the time this function returns.
// +checklocks:e.mu
// +checklocks:e.sndQueueInfo.sndQueueMu
func (e *endpoint) readFromPayloader(p tcpip.Payloader, opts tcpip.WriteOptions, avail int) ([]byte, tcpip.Error) {
	// We can release locks while copying data.
	//
	// This is not possible if atomic is set, because we can't allow the
	// available buffer space to be consumed by some other caller while we
	// are copying data in.
	if !opts.Atomic {
		// Deferred calls run in reverse order: e.mu is re-acquired first,
		// then sndQueueMu.
		e.sndQueueInfo.sndQueueMu.Unlock()
		defer e.sndQueueInfo.sndQueueMu.Lock()

		e.UnlockUser()
		defer e.LockUser()
	}

	// Fetch data.
	if l := p.Len(); l < avail {
		avail = l
	}
	if avail == 0 {
		return nil, nil
	}
	v := make([]byte, avail)
	n, err := p.Read(v)
	if err != nil && err != io.EOF {
		return nil, &tcpip.ErrBadBuffer{}
	}
	return v[:n], nil
}

// queueSegment reads data from the payloader and returns a segment to be sent.
//
// It returns the queued segment and the number of payload bytes queued. A nil
// segment with a zero count and nil error means the payloader yielded no data.
// +checklocks:e.mu
func (e *endpoint) queueSegment(p tcpip.Payloader, opts tcpip.WriteOptions) (*segment, int, tcpip.Error) {
	e.sndQueueInfo.sndQueueMu.Lock()
	defer e.sndQueueInfo.sndQueueMu.Unlock()

	avail, err := e.isEndpointWritableLocked()
	if err != nil {
		e.stats.WriteErrors.WriteClosed.Increment()
		return nil, 0, err
	}

	v, err := e.readFromPayloader(p, opts, avail)
	if err != nil {
		return nil, 0, err
	}

	// Do not queue zero length segments.
	if len(v) == 0 {
		return nil, 0, nil
	}

	if !opts.Atomic {
		// Since we released locks in between it's possible that the
		// endpoint transitioned to a CLOSED/ERROR states so make
		// sure endpoint is still writable before trying to write.
		avail, err := e.isEndpointWritableLocked()
		if err != nil {
			e.stats.WriteErrors.WriteClosed.Increment()
			return nil, 0, err
		}

		// Discard any excess data copied in due to avail being reduced due
		// to a simultaneous write call to the socket.
		if avail < len(v) {
			v = v[:avail]
		}
	}

	// Add data to the send queue.
	s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), v)
	e.sndQueueInfo.SndBufUsed += len(v)
	e.snd.writeList.PushBack(s)

	return s, len(v), nil
}

// Write writes data to the endpoint's peer.
//
// It returns the number of bytes queued for sending. Zero bytes with a nil
// error means the payloader had no data to queue.
func (e *endpoint) Write(p tcpip.Payloader, opts tcpip.WriteOptions) (int64, tcpip.Error) {
	// Linux completely ignores any address passed to sendto(2) for TCP sockets
	// (without the MSG_FASTOPEN flag). Corking is unimplemented, so opts.More
	// and opts.EndOfRecord are also ignored.

	e.LockUser()
	defer e.UnlockUser()

	// Return if either we didn't queue anything or if an error occurred while
	// attempting to queue data.
	nextSeg, n, err := e.queueSegment(p, opts)
	if n == 0 || err != nil {
		return 0, err
	}

	e.sendData(nextSeg)
	return int64(n), nil
}

// selectWindowLocked returns the new window without checking for shrinking or scaling
// applied.
// Precondition: e.mu and e.rcvQueueMu must be held.
1578 func (e *endpoint) selectWindowLocked(rcvBufSize int) (wnd seqnum.Size) { 1579 wndFromAvailable := wndFromSpace(e.receiveBufferAvailableLocked(rcvBufSize)) 1580 maxWindow := wndFromSpace(rcvBufSize) 1581 wndFromUsedBytes := maxWindow - e.rcvQueueInfo.RcvBufUsed 1582 1583 // We take the lesser of the wndFromAvailable and wndFromUsedBytes because in 1584 // cases where we receive a lot of small segments the segment overhead is a 1585 // lot higher and we can run out socket buffer space before we can fill the 1586 // previous window we advertised. In cases where we receive MSS sized or close 1587 // MSS sized segments we will probably run out of window space before we 1588 // exhaust receive buffer. 1589 newWnd := wndFromAvailable 1590 if newWnd > wndFromUsedBytes { 1591 newWnd = wndFromUsedBytes 1592 } 1593 if newWnd < 0 { 1594 newWnd = 0 1595 } 1596 return seqnum.Size(newWnd) 1597 } 1598 1599 // selectWindow invokes selectWindowLocked after acquiring e.rcvQueueMu. 1600 func (e *endpoint) selectWindow() (wnd seqnum.Size) { 1601 e.rcvQueueInfo.rcvQueueMu.Lock() 1602 wnd = e.selectWindowLocked(int(e.ops.GetReceiveBufferSize())) 1603 e.rcvQueueInfo.rcvQueueMu.Unlock() 1604 return wnd 1605 } 1606 1607 // windowCrossedACKThresholdLocked checks if the receive window to be announced 1608 // would be under aMSS or under the window derived from half receive buffer, 1609 // whichever smaller. This is useful as a receive side silly window syndrome 1610 // prevention mechanism. If window grows to reasonable value, we should send ACK 1611 // to the sender to inform the rx space is now large. We also want ensure a 1612 // series of small read()'s won't trigger a flood of spurious tiny ACK's. 1613 // 1614 // For large receive buffers, the threshold is aMSS - once reader reads more 1615 // than aMSS we'll send ACK. For tiny receive buffers, the threshold is half of 1616 // receive buffer size. This is chosen arbitrarily. 
1617 // crossed will be true if the window size crossed the ACK threshold. 1618 // above will be true if the new window is >= ACK threshold and false 1619 // otherwise. 1620 // 1621 // Precondition: e.mu and e.rcvQueueMu must be held. 1622 func (e *endpoint) windowCrossedACKThresholdLocked(deltaBefore int, rcvBufSize int) (crossed bool, above bool) { 1623 newAvail := int(e.selectWindowLocked(rcvBufSize)) 1624 oldAvail := newAvail - deltaBefore 1625 if oldAvail < 0 { 1626 oldAvail = 0 1627 } 1628 threshold := int(e.amss) 1629 // rcvBufFraction is the inverse of the fraction of receive buffer size that 1630 // is used to decide if the available buffer space is now above it. 1631 const rcvBufFraction = 2 1632 if wndThreshold := wndFromSpace(rcvBufSize / rcvBufFraction); threshold > wndThreshold { 1633 threshold = wndThreshold 1634 } 1635 switch { 1636 case oldAvail < threshold && newAvail >= threshold: 1637 return true, true 1638 case oldAvail >= threshold && newAvail < threshold: 1639 return true, false 1640 } 1641 return false, false 1642 } 1643 1644 // OnReuseAddressSet implements tcpip.SocketOptionsHandler.OnReuseAddressSet. 1645 func (e *endpoint) OnReuseAddressSet(v bool) { 1646 e.LockUser() 1647 e.portFlags.TupleOnly = v 1648 e.UnlockUser() 1649 } 1650 1651 // OnReusePortSet implements tcpip.SocketOptionsHandler.OnReusePortSet. 1652 func (e *endpoint) OnReusePortSet(v bool) { 1653 e.LockUser() 1654 e.portFlags.LoadBalanced = v 1655 e.UnlockUser() 1656 } 1657 1658 // OnKeepAliveSet implements tcpip.SocketOptionsHandler.OnKeepAliveSet. 1659 func (e *endpoint) OnKeepAliveSet(bool) { 1660 e.notifyProtocolGoroutine(notifyKeepaliveChanged) 1661 } 1662 1663 // OnDelayOptionSet implements tcpip.SocketOptionsHandler.OnDelayOptionSet. 1664 func (e *endpoint) OnDelayOptionSet(v bool) { 1665 if !v { 1666 // Handle delayed data. 1667 e.sndQueueInfo.sndWaker.Assert() 1668 } 1669 } 1670 1671 // OnCorkOptionSet implements tcpip.SocketOptionsHandler.OnCorkOptionSet. 
1672 func (e *endpoint) OnCorkOptionSet(v bool) { 1673 if !v { 1674 // Handle the corked data. 1675 e.sndQueueInfo.sndWaker.Assert() 1676 } 1677 } 1678 1679 func (e *endpoint) getSendBufferSize() int { 1680 return int(e.ops.GetSendBufferSize()) 1681 } 1682 1683 // OnSetReceiveBufferSize implements tcpip.SocketOptionsHandler.OnSetReceiveBufferSize. 1684 func (e *endpoint) OnSetReceiveBufferSize(rcvBufSz, oldSz int64) (newSz int64) { 1685 e.LockUser() 1686 e.rcvQueueInfo.rcvQueueMu.Lock() 1687 1688 // Make sure the receive buffer size allows us to send a 1689 // non-zero window size. 1690 scale := uint8(0) 1691 if e.rcv != nil { 1692 scale = e.rcv.RcvWndScale 1693 } 1694 if rcvBufSz>>scale == 0 { 1695 rcvBufSz = 1 << scale 1696 } 1697 1698 availBefore := wndFromSpace(e.receiveBufferAvailableLocked(int(oldSz))) 1699 availAfter := wndFromSpace(e.receiveBufferAvailableLocked(int(rcvBufSz))) 1700 e.rcvQueueInfo.RcvAutoParams.Disabled = true 1701 1702 // Immediately send an ACK to uncork the sender silly window 1703 // syndrome prevetion, when our available space grows above aMSS 1704 // or half receive buffer, whichever smaller. 1705 if crossed, above := e.windowCrossedACKThresholdLocked(availAfter-availBefore, int(rcvBufSz)); crossed && above { 1706 e.notifyProtocolGoroutine(notifyNonZeroReceiveWindow) 1707 } 1708 1709 e.rcvQueueInfo.rcvQueueMu.Unlock() 1710 e.UnlockUser() 1711 return rcvBufSz 1712 } 1713 1714 // OnSetSendBufferSize implements tcpip.SocketOptionsHandler.OnSetSendBufferSize. 1715 func (e *endpoint) OnSetSendBufferSize(sz int64) int64 { 1716 atomic.StoreUint32(&e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled, 1) 1717 return sz 1718 } 1719 1720 // WakeupWriters implements tcpip.SocketOptionsHandler.WakeupWriters. 
func (e *endpoint) WakeupWriters() {
	e.LockUser()
	defer e.UnlockUser()

	sendBufferSize := e.getSendBufferSize()
	e.sndQueueInfo.sndQueueMu.Lock()
	// Writers are woken only when the free send buffer space is at least half
	// of the space currently in use.
	notify := (sendBufferSize - e.sndQueueInfo.SndBufUsed) >= e.sndQueueInfo.SndBufUsed>>1
	e.sndQueueInfo.sndQueueMu.Unlock()

	if notify {
		e.waiterQueue.Notify(waiter.WritableEvents)
	}
}

// SetSockOptInt sets a socket option.
//
// Options not listed in the switch are accepted silently and nil is returned.
// ErrInvalidOptionValue is returned for an out-of-range MaxSegOption or
// TCPSynCountOption, and for a zero TCPWindowClampOption on an endpoint that
// is not in StateClose or StateInitial. ErrNotSupported is returned for any
// MTUDiscoverOption value other than PMTUDiscoveryDont.
func (e *endpoint) SetSockOptInt(opt tcpip.SockOptInt, v int) tcpip.Error {
	// Lower 2 bits represents ECN bits. RFC 3168, section 23.1
	const inetECNMask = 3

	switch opt {
	case tcpip.KeepaliveCountOption:
		e.keepalive.Lock()
		e.keepalive.count = v
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)

	case tcpip.IPv4TOSOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.IPv6TrafficClassOption:
		e.LockUser()
		// TODO(gvisor.dev/issue/995): ECN is not currently supported,
		// ignore the bits for now.
		e.sendTOS = uint8(v) & ^uint8(inetECNMask)
		e.UnlockUser()

	case tcpip.MaxSegOption:
		userMSS := v
		if userMSS < header.TCPMinimumMSS || userMSS > header.TCPMaximumMSS {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.userMSS = uint16(userMSS)
		e.UnlockUser()
		e.notifyProtocolGoroutine(notifyMSSChanged)

	case tcpip.MTUDiscoverOption:
		// Return not supported if attempting to set this option to
		// anything other than path MTU discovery disabled.
		if v != tcpip.PMTUDiscoveryDont {
			return &tcpip.ErrNotSupported{}
		}

	case tcpip.TTLOption:
		// NOTE(review): v is narrowed with uint8(v) without a range check,
		// so values outside 0..255 wrap - confirm callers validate.
		e.LockUser()
		e.ttl = uint8(v)
		e.UnlockUser()

	case tcpip.TCPSynCountOption:
		if v < 1 || v > 255 {
			return &tcpip.ErrInvalidOptionValue{}
		}
		e.LockUser()
		e.maxSynRetries = uint8(v)
		e.UnlockUser()

	case tcpip.TCPWindowClampOption:
		// A zero clamp is only accepted while the endpoint is closed or
		// has never been used.
		if v == 0 {
			e.LockUser()
			switch e.EndpointState() {
			case StateClose, StateInitial:
				e.windowClamp = 0
				e.UnlockUser()
				return nil
			default:
				e.UnlockUser()
				return &tcpip.ErrInvalidOptionValue{}
			}
		}
		// Non-zero clamps are raised to at least half of the stack's
		// minimum receive buffer size, when that limit can be queried.
		var rs tcpip.TCPReceiveBufferSizeRangeOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err == nil {
			if v < rs.Min/2 {
				v = rs.Min / 2
			}
		}
		e.LockUser()
		e.windowClamp = uint32(v)
		e.UnlockUser()
	}
	return nil
}

// HasNIC reports whether id is zero or names a NIC known to the endpoint's
// stack.
func (e *endpoint) HasNIC(id int32) bool {
	return id == 0 || e.stack.HasNIC(tcpip.NICID(id))
}

// SetSockOpt sets a socket option.
//
// Unrecognised option types are accepted silently and nil is returned. For
// TCPLingerTimeoutOption and TCPDeferAcceptOption the value pointed to by opt
// may be rewritten in place with the normalised value that was applied.
func (e *endpoint) SetSockOpt(opt tcpip.SettableSocketOption) tcpip.Error {
	switch v := opt.(type) {
	case *tcpip.KeepaliveIdleOption:
		e.keepalive.Lock()
		e.keepalive.idle = time.Duration(*v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)

	case *tcpip.KeepaliveIntervalOption:
		e.keepalive.Lock()
		e.keepalive.interval = time.Duration(*v)
		e.keepalive.Unlock()
		e.notifyProtocolGoroutine(notifyKeepaliveChanged)

	case *tcpip.TCPUserTimeoutOption:
		e.LockUser()
		e.userTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.CongestionControlOption:
		// Query the available cc algorithms in the stack and
		// validate that the specified algorithm is actually
		// supported in the stack.
		var avail tcpip.TCPAvailableCongestionControlOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &avail); err != nil {
			return err
		}
		// The stack reports the available algorithms as one
		// space-separated string.
		availCC := strings.Split(string(avail), " ")
		for _, cc := range availCC {
			if *v == tcpip.CongestionControlOption(cc) {
				e.LockUser()
				state := e.EndpointState()
				e.cc = *v
				// The sender's algorithm is replaced right away only on
				// an established connection.
				switch state {
				case StateEstablished:
					if e.EndpointState() == state {
						e.snd.cc = e.snd.initCongestionControl(e.cc)
					}
				}
				e.UnlockUser()
				return nil
			}
		}

		// Linux returns ENOENT when an invalid congestion
		// control algorithm is specified.
		return &tcpip.ErrNoSuchFile{}

	case *tcpip.TCPLingerTimeoutOption:
		e.LockUser()

		switch {
		case *v < 0:
			// Same as effectively disabling TCPLinger timeout.
			*v = -1
		case *v == 0:
			// Same as the stack default.
			var stackLingerTimeout tcpip.TCPLingerTimeoutOption
			if err := e.stack.TransportProtocolOption(ProtocolNumber, &stackLingerTimeout); err != nil {
				panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %+v) = %v", ProtocolNumber, &stackLingerTimeout, err))
			}
			*v = stackLingerTimeout
		case *v > tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout):
			// Cap it to Stack's default TCP_LINGER2 timeout.
			*v = tcpip.TCPLingerTimeoutOption(MaxTCPLingerTimeout)
		default:
		}

		e.tcpLingerTimeout = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.TCPDeferAcceptOption:
		e.LockUser()
		// Values above MaxRTO are capped to MaxRTO.
		if time.Duration(*v) > MaxRTO {
			*v = tcpip.TCPDeferAcceptOption(MaxRTO)
		}
		e.deferAccept = time.Duration(*v)
		e.UnlockUser()

	case *tcpip.SocketDetachFilterOption:
		return nil

	default:
		return nil
	}
	return nil
}

// readyReceiveSize returns the number of bytes ready to be received.
1911 func (e *endpoint) readyReceiveSize() (int, tcpip.Error) { 1912 e.LockUser() 1913 defer e.UnlockUser() 1914 1915 // The endpoint cannot be in listen state. 1916 if e.EndpointState() == StateListen { 1917 return 0, &tcpip.ErrInvalidEndpointState{} 1918 } 1919 1920 e.rcvQueueInfo.rcvQueueMu.Lock() 1921 defer e.rcvQueueInfo.rcvQueueMu.Unlock() 1922 1923 return e.rcvQueueInfo.RcvBufUsed, nil 1924 } 1925 1926 // GetSockOptInt implements tcpip.Endpoint.GetSockOptInt. 1927 func (e *endpoint) GetSockOptInt(opt tcpip.SockOptInt) (int, tcpip.Error) { 1928 switch opt { 1929 case tcpip.KeepaliveCountOption: 1930 e.keepalive.Lock() 1931 v := e.keepalive.count 1932 e.keepalive.Unlock() 1933 return v, nil 1934 1935 case tcpip.IPv4TOSOption: 1936 e.LockUser() 1937 v := int(e.sendTOS) 1938 e.UnlockUser() 1939 return v, nil 1940 1941 case tcpip.IPv6TrafficClassOption: 1942 e.LockUser() 1943 v := int(e.sendTOS) 1944 e.UnlockUser() 1945 return v, nil 1946 1947 case tcpip.MaxSegOption: 1948 // This is just stubbed out. Linux never returns the user_mss 1949 // value as it either returns the defaultMSS or returns the 1950 // actual current MSS. Netstack just returns the defaultMSS 1951 // always for now. 1952 v := header.TCPDefaultMSS 1953 return v, nil 1954 1955 case tcpip.MTUDiscoverOption: 1956 // Always return the path MTU discovery disabled setting since 1957 // it's the only one supported. 
1958 return tcpip.PMTUDiscoveryDont, nil 1959 1960 case tcpip.ReceiveQueueSizeOption: 1961 return e.readyReceiveSize() 1962 1963 case tcpip.TTLOption: 1964 e.LockUser() 1965 v := int(e.ttl) 1966 e.UnlockUser() 1967 return v, nil 1968 1969 case tcpip.TCPSynCountOption: 1970 e.LockUser() 1971 v := int(e.maxSynRetries) 1972 e.UnlockUser() 1973 return v, nil 1974 1975 case tcpip.TCPWindowClampOption: 1976 e.LockUser() 1977 v := int(e.windowClamp) 1978 e.UnlockUser() 1979 return v, nil 1980 1981 case tcpip.MulticastTTLOption: 1982 return 1, nil 1983 1984 default: 1985 return -1, &tcpip.ErrUnknownProtocolOption{} 1986 } 1987 } 1988 1989 func (e *endpoint) getTCPInfo() tcpip.TCPInfoOption { 1990 info := tcpip.TCPInfoOption{} 1991 e.LockUser() 1992 if state := e.EndpointState(); state.internal() { 1993 info.State = tcpip.EndpointState(StateClose) 1994 } else { 1995 info.State = tcpip.EndpointState(state) 1996 } 1997 snd := e.snd 1998 if snd != nil { 1999 // We do not calculate RTT before sending the data packets. If 2000 // the connection did not send and receive data, then RTT will 2001 // be zero. 2002 snd.rtt.Lock() 2003 info.RTT = snd.rtt.TCPRTTState.SRTT 2004 info.RTTVar = snd.rtt.TCPRTTState.RTTVar 2005 snd.rtt.Unlock() 2006 2007 info.RTO = snd.RTO 2008 info.CcState = snd.state 2009 info.SndSsthresh = uint32(snd.Ssthresh) 2010 info.SndCwnd = uint32(snd.SndCwnd) 2011 info.ReorderSeen = snd.rc.Reord 2012 } 2013 e.UnlockUser() 2014 return info 2015 } 2016 2017 // GetSockOpt implements tcpip.Endpoint.GetSockOpt. 
2018 func (e *endpoint) GetSockOpt(opt tcpip.GettableSocketOption) tcpip.Error { 2019 switch o := opt.(type) { 2020 case *tcpip.TCPInfoOption: 2021 *o = e.getTCPInfo() 2022 2023 case *tcpip.KeepaliveIdleOption: 2024 e.keepalive.Lock() 2025 *o = tcpip.KeepaliveIdleOption(e.keepalive.idle) 2026 e.keepalive.Unlock() 2027 2028 case *tcpip.KeepaliveIntervalOption: 2029 e.keepalive.Lock() 2030 *o = tcpip.KeepaliveIntervalOption(e.keepalive.interval) 2031 e.keepalive.Unlock() 2032 2033 case *tcpip.TCPUserTimeoutOption: 2034 e.LockUser() 2035 *o = tcpip.TCPUserTimeoutOption(e.userTimeout) 2036 e.UnlockUser() 2037 2038 case *tcpip.CongestionControlOption: 2039 e.LockUser() 2040 *o = e.cc 2041 e.UnlockUser() 2042 2043 case *tcpip.TCPLingerTimeoutOption: 2044 e.LockUser() 2045 *o = tcpip.TCPLingerTimeoutOption(e.tcpLingerTimeout) 2046 e.UnlockUser() 2047 2048 case *tcpip.TCPDeferAcceptOption: 2049 e.LockUser() 2050 *o = tcpip.TCPDeferAcceptOption(e.deferAccept) 2051 e.UnlockUser() 2052 2053 case *tcpip.OriginalDestinationOption: 2054 e.LockUser() 2055 ipt := e.stack.IPTables() 2056 addr, port, err := ipt.OriginalDst(e.TransportEndpointInfo.ID, e.NetProto, ProtocolNumber) 2057 e.UnlockUser() 2058 if err != nil { 2059 return err 2060 } 2061 *o = tcpip.OriginalDestinationOption{ 2062 Addr: addr, 2063 Port: port, 2064 } 2065 2066 default: 2067 return &tcpip.ErrUnknownProtocolOption{} 2068 } 2069 return nil 2070 } 2071 2072 // checkV4MappedLocked determines the effective network protocol and converts 2073 // addr to its canonical form. 2074 func (e *endpoint) checkV4MappedLocked(addr tcpip.FullAddress) (tcpip.FullAddress, tcpip.NetworkProtocolNumber, tcpip.Error) { 2075 unwrapped, netProto, err := e.TransportEndpointInfo.AddrNetProtoLocked(addr, e.ops.GetV6Only()) 2076 if err != nil { 2077 return tcpip.FullAddress{}, 0, err 2078 } 2079 return unwrapped, netProto, nil 2080 } 2081 2082 // Disconnect implements tcpip.Endpoint.Disconnect. 
func (*endpoint) Disconnect() tcpip.Error {
	return &tcpip.ErrNotSupported{}
}

// Connect connects the endpoint to its peer.
//
// It delegates to connect with both handshake and run enabled. If connect
// returns an error whose IgnoreStats method reports false, all waiters are
// notified and the failed-connection-attempt counters are incremented.
func (e *endpoint) Connect(addr tcpip.FullAddress) tcpip.Error {
	err := e.connect(addr, true, true)
	if err != nil {
		if !err.IgnoreStats() {
			// Connect failed. Let's wake up any waiters.
			e.waiterQueue.Notify(waiter.EventHUp | waiter.EventErr | waiter.ReadableEvents | waiter.WritableEvents)
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
	}
	return err
}

// connect connects the endpoint to its peer. In the normal non-S/R case, the
// new connection is expected to run the main goroutine and perform handshake.
// In restore of previously connected endpoints, both ends will be passively
// created (so no new handshaking is done); for stack-accepted connections not
// yet accepted by the app, they are restored without running the main goroutine
// here.
//
// When handshake is false the endpoint is moved straight to StateEstablished.
// When run is true the protocol main loop goroutine is started. On the path
// that reaches the end of the function, *tcpip.ErrConnectStarted is returned.
func (e *endpoint) connect(addr tcpip.FullAddress, handshake bool, run bool) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	// Remember the address exactly as the caller supplied it, before
	// checkV4MappedLocked rewrites addr.
	connectingAddr := addr.Addr

	addr, netProto, err := e.checkV4MappedLocked(addr)
	if err != nil {
		return err
	}

	if e.EndpointState().connected() {
		// The endpoint is already connected. If caller hasn't been
		// notified yet, return success.
		if !e.isConnectNotified {
			e.isConnectNotified = true
			return nil
		}
		// Otherwise return that it's already connected.
		return &tcpip.ErrAlreadyConnected{}
	}

	nicID := addr.NIC
	switch e.EndpointState() {
	case StateBound:
		// If we're already bound to a NIC but the caller is requesting
		// that we use a different one now, we cannot proceed.
		if e.boundNICID == 0 {
			break
		}

		if nicID != 0 && nicID != e.boundNICID {
			return &tcpip.ErrNoRoute{}
		}

		nicID = e.boundNICID

	case StateInitial:
		// Nothing to do. We'll eventually fill-in the gaps in the ID (if any)
		// when we find a route.

	case StateConnecting, StateSynSent, StateSynRecv:
		// A connection request has already been issued but hasn't completed
		// yet.
		return &tcpip.ErrAlreadyConnecting{}

	case StateError:
		if err := e.hardErrorLocked(); err != nil {
			return err
		}
		return &tcpip.ErrConnectionAborted{}

	default:
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Find a route to the desired destination.
	r, err := e.stack.FindRoute(nicID, e.TransportEndpointInfo.ID.LocalAddress, addr.Addr, netProto, false /* multicastLoop */)
	if err != nil {
		return err
	}
	// This release is paired with FindRoute; on success r.Acquire() is called
	// further down before the route is stored in e.route.
	defer r.Release()

	netProtos := []tcpip.NetworkProtocolNumber{netProto}
	e.TransportEndpointInfo.ID.LocalAddress = r.LocalAddress()
	e.TransportEndpointInfo.ID.RemoteAddress = r.RemoteAddress()
	e.TransportEndpointInfo.ID.RemotePort = addr.Port

	if e.TransportEndpointInfo.ID.LocalPort != 0 {
		// The endpoint is bound to a port, attempt to register it.
		err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice)
		if err != nil {
			return err
		}
	} else {
		// The endpoint doesn't have a local port yet, so try to get
		// one. Make sure that it isn't one that will result in the same
		// address/port for both local and remote (otherwise this
		// endpoint would be trying to connect to itself).
		sameAddr := e.TransportEndpointInfo.ID.LocalAddress == e.TransportEndpointInfo.ID.RemoteAddress

		// Calculate a port offset based on the destination IP/port and
		// src IP to ensure that for a given tuple (srcIP, destIP,
		// destPort) the offset used as a starting point is the same to
		// ensure that we can cycle through the port space effectively.
		portBuf := make([]byte, 2)
		binary.LittleEndian.PutUint16(portBuf, e.ID.RemotePort)

		h := jenkins.Sum32(e.protocol.portOffsetSecret)
		for _, s := range [][]byte{
			[]byte(e.ID.LocalAddress),
			[]byte(e.ID.RemoteAddress),
			portBuf,
		} {
			// Per io.Writer.Write:
			//
			// Write must return a non-nil error if it returns n < len(p).
			if _, err := h.Write(s); err != nil {
				panic(err)
			}
		}
		portOffset := h.Sum32()

		var twReuse tcpip.TCPTimeWaitReuseOption
		if err := e.stack.TransportProtocolOption(ProtocolNumber, &twReuse); err != nil {
			panic(fmt.Sprintf("e.stack.TransportProtocolOption(%d, %#v) = %s", ProtocolNumber, &twReuse, err))
		}

		// reuse records whether a port held by an endpoint in TIME-WAIT may be
		// taken over: always for the global setting, and only when both ends
		// are loopback addresses for the loopback-only setting.
		reuse := twReuse == tcpip.TCPTimeWaitReuseGlobal
		if twReuse == tcpip.TCPTimeWaitReuseLoopbackOnly {
			switch netProto {
			case header.IPv4ProtocolNumber:
				reuse = header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.LocalAddress) && header.IsV4LoopbackAddress(e.TransportEndpointInfo.ID.RemoteAddress)
			case header.IPv6ProtocolNumber:
				reuse = e.TransportEndpointInfo.ID.LocalAddress == header.IPv6Loopback && e.TransportEndpointInfo.ID.RemoteAddress == header.IPv6Loopback
			}
		}

		bindToDevice := tcpip.NICID(e.ops.GetBindToDevice())
		// The callback is invoked with candidate ports; it returns true once a
		// port has been both reserved and registered for this endpoint.
		if _, err := e.stack.PickEphemeralPortStable(portOffset, func(p uint16) (bool, tcpip.Error) {
			if sameAddr && p == e.TransportEndpointInfo.ID.RemotePort {
				return false, nil
			}
			portRes := ports.Reservation{
				Networks:     netProtos,
				Transport:    ProtocolNumber,
				Addr:         e.TransportEndpointInfo.ID.LocalAddress,
				Port:         p,
				Flags:        e.portFlags,
				BindToDevice: bindToDevice,
				Dest:         addr,
			}
			if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
				if _, ok := err.(*tcpip.ErrPortInUse); !ok || !reuse {
					return false, nil
				}
				transEPID := e.TransportEndpointInfo.ID
				transEPID.LocalPort = p
				// Check if an endpoint is registered with demuxer in TIME-WAIT and if
				// we can reuse it. If we can't find a transport endpoint then we just
				// skip using this port as it's possible that either an endpoint has
				// bound the port but not registered with demuxer yet (no listen/connect
				// done yet) or the reservation was freed between the check above and
				// the FindTransportEndpoint below. But rather than retry the same port
				// we just skip it and move on.
				transEP := e.stack.FindTransportEndpoint(netProto, ProtocolNumber, transEPID, r.NICID())
				if transEP == nil {
					// ReservePort failed but there is no registered endpoint with
					// demuxer. Which indicates there is at least some endpoint that has
					// bound the port.
					return false, nil
				}

				tcpEP := transEP.(*endpoint)
				tcpEP.LockUser()
				// If the endpoint is not in TIME-WAIT or if it is in TIME-WAIT but
				// less than 1 second has elapsed since its recentTS was updated then
				// we cannot reuse the port.
				if tcpEP.EndpointState() != StateTimeWait || e.stack.Clock().NowMonotonic().Sub(tcpEP.recentTSTime) < 1*time.Second {
					tcpEP.UnlockUser()
					return false, nil
				}
				// Since the endpoint is in TIME-WAIT it should be safe to acquire its
				// Lock while holding the lock for this endpoint as endpoints in
				// TIME-WAIT do not acquire locks on other endpoints.
				tcpEP.workerCleanup = false
				tcpEP.cleanupLocked()
				tcpEP.notifyProtocolGoroutine(notifyAbort)
				tcpEP.UnlockUser()
				// Now try to reserve the port again; if that fails, skip this port.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				if _, err := e.stack.ReservePort(e.stack.Rand(), portRes, nil /* testPort */); err != nil {
					return false, nil
				}
			}

			id := e.TransportEndpointInfo.ID
			id.LocalPort = p
			if err := e.stack.RegisterTransportEndpoint(netProtos, ProtocolNumber, id, e, e.portFlags, bindToDevice); err != nil {
				// Registration failed, so give back the port reserved above.
				portRes := ports.Reservation{
					Networks:     netProtos,
					Transport:    ProtocolNumber,
					Addr:         e.TransportEndpointInfo.ID.LocalAddress,
					Port:         p,
					Flags:        e.portFlags,
					BindToDevice: bindToDevice,
					Dest:         addr,
				}
				e.stack.ReleasePort(portRes)
				// A port-in-use failure just means this candidate is skipped;
				// any other error is returned from the callback.
				if _, ok := err.(*tcpip.ErrPortInUse); ok {
					return false, nil
				}
				return false, err
			}

			// Port picking successful. Save the details of
			// the selected port.
			e.TransportEndpointInfo.ID = id
			e.isPortReserved = true
			e.boundBindToDevice = bindToDevice
			e.boundPortFlags = e.portFlags
			e.boundDest = addr
			return true, nil
		}); err != nil {
			e.stack.Stats().TCP.FailedPortReservations.Increment()
			return err
		}
	}

	e.isRegistered = true
	e.setEndpointState(StateConnecting)
	// Take an additional reference on the route for e.route; the deferred
	// r.Release() above still runs when this function returns.
	r.Acquire()
	e.route = r
	e.boundNICID = nicID
	e.effectiveNetProtos = netProtos
	e.connectingAddress = connectingAddr

	e.initGSO()

	// Connect in the restore phase does not perform handshake. Restore its
	// connection setting here.
	if !handshake {
		e.segmentQueue.mu.Lock()
		for _, l := range []segmentList{e.segmentQueue.list, e.snd.writeList} {
			for s := l.Front(); s != nil; s = s.Next() {
				s.id = e.TransportEndpointInfo.ID
				e.sndQueueInfo.sndWaker.Assert()
			}
		}
		e.segmentQueue.mu.Unlock()
		e.snd.updateMaxPayloadSize(int(e.route.MTU()), 0)
		e.setEndpointState(StateEstablished)
		// Set the new auto tuned send buffer size after entering
		// established state.
		e.ops.SetSendBufferSize(e.computeTCPSendBufferSize(), false /* notify */)
	}

	if run {
		if handshake {
			h := e.newHandshake()
			e.setEndpointState(StateSynSent)
			h.start()
		}
		e.stack.Stats().TCP.ActiveConnectionOpenings.Increment()
		e.workerRunning = true
		go e.protocolMainLoop(handshake, nil) // S/R-SAFE: will be drained before save.
	}

	return &tcpip.ErrConnectStarted{}
}

// ConnectEndpoint is not supported.
func (*endpoint) ConnectEndpoint(tcpip.Endpoint) tcpip.Error {
	return &tcpip.ErrInvalidEndpointState{}
}

// Shutdown closes the read and/or write end of the endpoint connection to its
// peer.
func (e *endpoint) Shutdown(flags tcpip.ShutdownFlags) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState().connecting() {
		// When calling shutdown(2) on a connecting socket, the endpoint must
		// enter the error state. But this logic cannot belong to the shutdownLocked
		// method because that method is called during a close(2) (and closing a
		// connecting socket is not an error).
		e.resetConnectionLocked(&tcpip.ErrConnectionReset{})
		e.notifyProtocolGoroutine(notifyShutdown)
		e.waiterQueue.Notify(waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		return nil
	}

	return e.shutdownLocked(flags)
}

// shutdownLocked adds flags to e.shutdownFlags and then shuts down the
// read and/or write side according to the accumulated flags. Connected and
// listening endpoints are handled; for any other state it returns
// *tcpip.ErrNotConnected. Shutdown calls it with the endpoint lock held.
func (e *endpoint) shutdownLocked(flags tcpip.ShutdownFlags) tcpip.Error {
	e.shutdownFlags |= flags
	switch {
	case e.EndpointState().connected():
		// Close for read.
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Mark read side as closed.
			e.rcvQueueInfo.rcvQueueMu.Lock()
			e.rcvQueueInfo.RcvClosed = true
			rcvBufUsed := e.rcvQueueInfo.RcvBufUsed
			e.rcvQueueInfo.rcvQueueMu.Unlock()

			// If we're fully closed and we have unread data we need to abort
			// the connection with a RST.
			if e.shutdownFlags&tcpip.ShutdownWrite != 0 && rcvBufUsed > 0 {
				e.resetConnectionLocked(&tcpip.ErrConnectionAborted{})
				// Wake up worker to terminate loop.
				e.notifyProtocolGoroutine(notifyTickleWorker)
				return nil
			}
			// Wake up any readers that may be waiting for the stream to become
			// readable.
			e.waiterQueue.Notify(waiter.ReadableEvents)
		}

		// Close for write.
		if e.shutdownFlags&tcpip.ShutdownWrite != 0 {
			e.sndQueueInfo.sndQueueMu.Lock()
			if e.sndQueueInfo.SndClosed {
				// Already closed.
				e.sndQueueInfo.sndQueueMu.Unlock()
				if e.EndpointState() == StateTimeWait {
					return &tcpip.ErrNotConnected{}
				}
				return nil
			}

			// Queue fin segment.
			s := newOutgoingSegment(e.TransportEndpointInfo.ID, e.stack.Clock(), nil)
			e.snd.writeList.PushBack(s)
			// Mark endpoint as closed.
			e.sndQueueInfo.SndClosed = true
			e.sndQueueInfo.sndQueueMu.Unlock()

			// Drain the send queue.
			e.sendData(s)

			// Mark send side as closed.
			e.snd.Closed = true

			// Wake up any writers that may be waiting for the stream to become
			// writable.
			e.waiterQueue.Notify(waiter.WritableEvents)
		}

		return nil
	case e.EndpointState() == StateListen:
		if e.shutdownFlags&tcpip.ShutdownRead != 0 {
			// Reset all connections from the accept queue and keep the
			// worker running so that it can continue handling incoming
			// segments by replying with RST.
			//
			// By not removing this endpoint from the demuxer mapping, we
			// ensure that any other bind to the same port fails, as on Linux.
			e.rcvQueueInfo.rcvQueueMu.Lock()
			e.rcvQueueInfo.RcvClosed = true
			e.rcvQueueInfo.rcvQueueMu.Unlock()
			e.closePendingAcceptableConnectionsLocked()
			// Notify waiters that the endpoint is shutdown.
			e.waiterQueue.Notify(waiter.ReadableEvents | waiter.WritableEvents | waiter.EventHUp | waiter.EventErr)
		}
		return nil
	default:
		return &tcpip.ErrNotConnected{}
	}
}

// Listen puts the endpoint in "listen" mode, which allows it to accept
// new connections.
//
// It delegates to listen; if listen returns an error whose IgnoreStats method
// reports false, the failed-connection-attempt counters are incremented.
func (e *endpoint) Listen(backlog int) tcpip.Error {
	err := e.listen(backlog)
	if err != nil {
		if !err.IgnoreStats() {
			e.stack.Stats().TCP.FailedConnectionAttempts.Increment()
			e.stats.FailedConnectionAttempts.Increment()
		}
	}
	return err
}

// listen is the implementation behind Listen and holds the endpoint lock for
// its whole duration. On an endpoint that is already listening (and not
// closed) it only adjusts the backlog and clears the shutdown state.
// Otherwise it binds an unbound endpoint, registers it with the stack, moves
// it to StateListen and starts the listen loop goroutine.
func (e *endpoint) listen(backlog int) tcpip.Error {
	e.LockUser()
	defer e.UnlockUser()

	if e.EndpointState() == StateListen && !e.closed {
		e.acceptMu.Lock()
		defer e.acceptMu.Unlock()

		// Adjust the size of the backlog iff we can fit
		// existing pending connections into the new one.
		if e.acceptQueue.endpoints.Len() > backlog {
			return &tcpip.ErrInvalidEndpointState{}
		}
		e.acceptQueue.capacity = backlog

		if e.acceptQueue.pendingEndpoints == nil {
			e.acceptQueue.pendingEndpoints = make(map[*endpoint]struct{})
		}

		// Undo any earlier shutdown of the listening endpoint.
		e.shutdownFlags = 0
		e.rcvQueueInfo.rcvQueueMu.Lock()
		e.rcvQueueInfo.RcvClosed = false
		e.rcvQueueInfo.rcvQueueMu.Unlock()

		// Notify any blocked goroutines that they can attempt to
		// deliver endpoints again.
		e.acceptCond.Broadcast()

		return nil
	}

	if e.EndpointState() == StateInitial {
		// The listen is called on an unbound socket, the socket is
		// automatically bound to a random free port with the local
		// address set to INADDR_ANY.
		if err := e.bindLocked(tcpip.FullAddress{}); err != nil {
			return err
		}
	}

	// Endpoint must be bound before it can transition to listen mode.
	if e.EndpointState() != StateBound {
		e.stats.ReadErrors.InvalidEndpointState.Increment()
		return &tcpip.ErrInvalidEndpointState{}
	}

	// Register the endpoint.
	if err := e.stack.RegisterTransportEndpoint(e.effectiveNetProtos, ProtocolNumber, e.TransportEndpointInfo.ID, e, e.boundPortFlags, e.boundBindToDevice); err != nil {
		return err
	}

	e.isRegistered = true
	e.setEndpointState(StateListen)

	// The queue may be non-zero when we're restoring the endpoint, and it
	// may be pre-populated with some previously accepted (but not Accepted)
	// endpoints.
	e.acceptMu.Lock()
	if e.acceptQueue.pendingEndpoints == nil {
		e.acceptQueue.pendingEndpoints = make(map[*endpoint]struct{})
	}
	if e.acceptQueue.capacity == 0 {
		e.acceptQueue.capacity = backlog
	}
	e.acceptMu.Unlock()

	e.workerRunning = true
	go e.protocolListenLoop( // S/R-SAFE: drained on save.
		seqnum.Size(e.receiveBufferAvailable()))
	return nil
}

// startAcceptedLoop sets up required state and starts a goroutine with the
// main loop for accepted connections. It unlocks e.mu and then blocks until
// the wakerInitDone channel handed to the main loop yields.
// +checklocksrelease:e.mu
func (e *endpoint) startAcceptedLoop() {
	e.workerRunning = true
	e.mu.Unlock()
	wakerInitDone := make(chan struct{})
	go e.protocolMainLoop(false, wakerInitDone) // S/R-SAFE: drained on save.
	<-wakerInitDone
}

// Accept returns a new endpoint if a peer has established a connection
// to an endpoint previously set to listen mode.
//
// peerAddr, if non-nil, will contain the peer address of the returned
// endpoint. If no endpoint is queued, *tcpip.ErrWouldBlock is returned.
func (e *endpoint) Accept(peerAddr *tcpip.FullAddress) (tcpip.Endpoint, *waiter.Queue, tcpip.Error) {
	e.LockUser()
	defer e.UnlockUser()

	e.rcvQueueInfo.rcvQueueMu.Lock()
	rcvClosed := e.rcvQueueInfo.RcvClosed
	e.rcvQueueInfo.rcvQueueMu.Unlock()
	// Endpoint must be in listen state before it can accept connections.
	if rcvClosed || e.EndpointState() != StateListen {
		return nil, nil, &tcpip.ErrInvalidEndpointState{}
	}

	// Get the new accepted endpoint.
	var n *endpoint
	e.acceptMu.Lock()
	if element := e.acceptQueue.endpoints.Front(); element != nil {
		n = e.acceptQueue.endpoints.Remove(element).(*endpoint)
	}
	e.acceptMu.Unlock()
	if n == nil {
		return nil, nil, &tcpip.ErrWouldBlock{}
	}
	// An endpoint was removed from the accept queue; signal acceptCond.
	e.acceptCond.Signal()
	if peerAddr != nil {
		*peerAddr = n.getRemoteAddress()
	}
	return n, n.waiterQueue, nil
}

// Bind binds the endpoint to a specific local port and optionally address.
2598 func (e *endpoint) Bind(addr tcpip.FullAddress) (err tcpip.Error) { 2599 e.LockUser() 2600 defer e.UnlockUser() 2601 2602 return e.bindLocked(addr) 2603 } 2604 2605 func (e *endpoint) bindLocked(addr tcpip.FullAddress) (err tcpip.Error) { 2606 // Don't allow binding once endpoint is not in the initial state 2607 // anymore. This is because once the endpoint goes into a connected or 2608 // listen state, it is already bound. 2609 if e.EndpointState() != StateInitial { 2610 return &tcpip.ErrAlreadyBound{} 2611 } 2612 2613 e.BindAddr = addr.Addr 2614 addr, netProto, err := e.checkV4MappedLocked(addr) 2615 if err != nil { 2616 return err 2617 } 2618 2619 netProtos := []tcpip.NetworkProtocolNumber{netProto} 2620 2621 // Expand netProtos to include v4 and v6 under dual-stack if the caller is 2622 // binding to a wildcard (empty) address, and this is an IPv6 endpoint with 2623 // v6only set to false. 2624 if netProto == header.IPv6ProtocolNumber { 2625 stackHasV4 := e.stack.CheckNetworkProtocol(header.IPv4ProtocolNumber) 2626 alsoBindToV4 := !e.ops.GetV6Only() && addr.Addr == "" && stackHasV4 2627 if alsoBindToV4 { 2628 netProtos = append(netProtos, header.IPv4ProtocolNumber) 2629 } 2630 } 2631 2632 var nic tcpip.NICID 2633 // If an address is specified, we must ensure that it's one of our 2634 // local addresses. 
2635 if len(addr.Addr) != 0 { 2636 nic = e.stack.CheckLocalAddress(addr.NIC, netProto, addr.Addr) 2637 if nic == 0 { 2638 return &tcpip.ErrBadLocalAddress{} 2639 } 2640 e.TransportEndpointInfo.ID.LocalAddress = addr.Addr 2641 } 2642 2643 bindToDevice := tcpip.NICID(e.ops.GetBindToDevice()) 2644 portRes := ports.Reservation{ 2645 Networks: netProtos, 2646 Transport: ProtocolNumber, 2647 Addr: addr.Addr, 2648 Port: addr.Port, 2649 Flags: e.portFlags, 2650 BindToDevice: bindToDevice, 2651 Dest: tcpip.FullAddress{}, 2652 } 2653 port, err := e.stack.ReservePort(e.stack.Rand(), portRes, func(p uint16) (bool, tcpip.Error) { 2654 id := e.TransportEndpointInfo.ID 2655 id.LocalPort = p 2656 // CheckRegisterTransportEndpoint should only return an error if there is a 2657 // listening endpoint bound with the same id and portFlags and bindToDevice 2658 // options. 2659 // 2660 // NOTE: Only listening and connected endpoint register with 2661 // demuxer. Further connected endpoints always have a remote 2662 // address/port. Hence this will only return an error if there is a matching 2663 // listening endpoint. 2664 if err := e.stack.CheckRegisterTransportEndpoint(netProtos, ProtocolNumber, id, e.portFlags, bindToDevice); err != nil { 2665 return false, nil 2666 } 2667 return true, nil 2668 }) 2669 if err != nil { 2670 e.stack.Stats().TCP.FailedPortReservations.Increment() 2671 return err 2672 } 2673 2674 e.boundBindToDevice = bindToDevice 2675 e.boundPortFlags = e.portFlags 2676 // TODO(gvisor.dev/issue/3691): Add test to verify boundNICID is correct. 2677 e.boundNICID = nic 2678 e.isPortReserved = true 2679 e.effectiveNetProtos = netProtos 2680 e.TransportEndpointInfo.ID.LocalPort = port 2681 2682 // Mark endpoint as bound. 2683 e.setEndpointState(StateBound) 2684 2685 return nil 2686 } 2687 2688 // GetLocalAddress returns the address to which the endpoint is bound. 
2689 func (e *endpoint) GetLocalAddress() (tcpip.FullAddress, tcpip.Error) { 2690 e.LockUser() 2691 defer e.UnlockUser() 2692 2693 return tcpip.FullAddress{ 2694 Addr: e.TransportEndpointInfo.ID.LocalAddress, 2695 Port: e.TransportEndpointInfo.ID.LocalPort, 2696 NIC: e.boundNICID, 2697 }, nil 2698 } 2699 2700 // GetRemoteAddress returns the address to which the endpoint is connected. 2701 func (e *endpoint) GetRemoteAddress() (tcpip.FullAddress, tcpip.Error) { 2702 e.LockUser() 2703 defer e.UnlockUser() 2704 2705 if !e.EndpointState().connected() { 2706 return tcpip.FullAddress{}, &tcpip.ErrNotConnected{} 2707 } 2708 2709 return e.getRemoteAddress(), nil 2710 } 2711 2712 func (e *endpoint) getRemoteAddress() tcpip.FullAddress { 2713 return tcpip.FullAddress{ 2714 Addr: e.TransportEndpointInfo.ID.RemoteAddress, 2715 Port: e.TransportEndpointInfo.ID.RemotePort, 2716 NIC: e.boundNICID, 2717 } 2718 } 2719 2720 func (*endpoint) HandlePacket(stack.TransportEndpointID, *stack.PacketBuffer) { 2721 // TCP HandlePacket is not required anymore as inbound packets first 2722 // land at the Dispatcher which then can either deliver using the 2723 // worker go routine or directly do the invoke the tcp processing inline 2724 // based on the state of the endpoint. 2725 } 2726 2727 func (e *endpoint) enqueueSegment(s *segment) bool { 2728 // Send packet to worker goroutine. 2729 if !e.segmentQueue.enqueue(s) { 2730 // The queue is full, so we drop the segment. 2731 e.stack.Stats().DroppedPackets.Increment() 2732 e.stats.ReceiveErrors.SegmentQueueDropped.Increment() 2733 return false 2734 } 2735 return true 2736 } 2737 2738 func (e *endpoint) onICMPError(err tcpip.Error, transErr stack.TransportError, pkt *stack.PacketBuffer) { 2739 // Update last error first. 2740 e.lastErrorMu.Lock() 2741 e.lastError = err 2742 e.lastErrorMu.Unlock() 2743 2744 // Update the error queue if IP_RECVERR is enabled. 
2745 if e.SocketOptions().GetRecvError() { 2746 e.SocketOptions().QueueErr(&tcpip.SockError{ 2747 Err: err, 2748 Cause: transErr, 2749 // Linux passes the payload with the TCP header. We don't know if the TCP 2750 // header even exists, it may not for fragmented packets. 2751 Payload: pkt.Data().AsRange().ToOwnedView(), 2752 Dst: tcpip.FullAddress{ 2753 NIC: pkt.NICID, 2754 Addr: e.TransportEndpointInfo.ID.RemoteAddress, 2755 Port: e.TransportEndpointInfo.ID.RemotePort, 2756 }, 2757 Offender: tcpip.FullAddress{ 2758 NIC: pkt.NICID, 2759 Addr: e.TransportEndpointInfo.ID.LocalAddress, 2760 Port: e.TransportEndpointInfo.ID.LocalPort, 2761 }, 2762 NetProto: pkt.NetworkProtocolNumber, 2763 }) 2764 } 2765 2766 // Notify of the error. 2767 e.notifyProtocolGoroutine(notifyError) 2768 } 2769 2770 // HandleError implements stack.TransportEndpoint. 2771 func (e *endpoint) HandleError(transErr stack.TransportError, pkt *stack.PacketBuffer) { 2772 handlePacketTooBig := func(mtu uint32) { 2773 e.sndQueueInfo.sndQueueMu.Lock() 2774 e.sndQueueInfo.PacketTooBigCount++ 2775 if v := int(mtu); v < e.sndQueueInfo.SndMTU { 2776 e.sndQueueInfo.SndMTU = v 2777 } 2778 e.sndQueueInfo.sndQueueMu.Unlock() 2779 e.notifyProtocolGoroutine(notifyMTUChanged) 2780 } 2781 2782 // TODO(gvisor.dev/issues/5270): Handle all transport errors. 2783 switch transErr.Kind() { 2784 case stack.PacketTooBigTransportError: 2785 handlePacketTooBig(transErr.Info()) 2786 case stack.DestinationHostUnreachableTransportError: 2787 e.onICMPError(&tcpip.ErrNoRoute{}, transErr, pkt) 2788 case stack.DestinationNetworkUnreachableTransportError: 2789 e.onICMPError(&tcpip.ErrNetworkUnreachable{}, transErr, pkt) 2790 } 2791 } 2792 2793 // updateSndBufferUsage is called by the protocol goroutine when room opens up 2794 // in the send buffer. The number of newly available bytes is v. 
2795 func (e *endpoint) updateSndBufferUsage(v int) { 2796 sendBufferSize := e.getSendBufferSize() 2797 e.sndQueueInfo.sndQueueMu.Lock() 2798 notify := e.sndQueueInfo.SndBufUsed >= sendBufferSize>>1 2799 e.sndQueueInfo.SndBufUsed -= v 2800 2801 // Get the new send buffer size with auto tuning, but do not set it 2802 // unless we decide to notify the writers. 2803 newSndBufSz := e.computeTCPSendBufferSize() 2804 2805 // We only notify when there is half the sendBufferSize available after 2806 // a full buffer event occurs. This ensures that we don't wake up 2807 // writers to queue just 1-2 segments and go back to sleep. 2808 notify = notify && e.sndQueueInfo.SndBufUsed < int(newSndBufSz)>>1 2809 e.sndQueueInfo.sndQueueMu.Unlock() 2810 2811 if notify { 2812 // Set the new send buffer size calculated from auto tuning. 2813 e.ops.SetSendBufferSize(newSndBufSz, false /* notify */) 2814 e.waiterQueue.Notify(waiter.WritableEvents) 2815 } 2816 } 2817 2818 // readyToRead is called by the protocol goroutine when a new segment is ready 2819 // to be read, or when the connection is closed for receiving (in which case 2820 // s will be nil). 2821 func (e *endpoint) readyToRead(s *segment) { 2822 e.rcvQueueInfo.rcvQueueMu.Lock() 2823 if s != nil { 2824 e.rcvQueueInfo.RcvBufUsed += s.payloadSize() 2825 s.incRef() 2826 e.rcvQueueInfo.rcvQueue.PushBack(s) 2827 } else { 2828 e.rcvQueueInfo.RcvClosed = true 2829 } 2830 e.rcvQueueInfo.rcvQueueMu.Unlock() 2831 e.waiterQueue.Notify(waiter.ReadableEvents) 2832 } 2833 2834 // receiveBufferAvailableLocked calculates how many bytes are still available 2835 // in the receive buffer. 2836 // rcvQueueMu must be held when this function is called. 2837 func (e *endpoint) receiveBufferAvailableLocked(rcvBufSize int) int { 2838 // We may use more bytes than the buffer size when the receive buffer 2839 // shrinks. 
2840 memUsed := e.receiveMemUsed() 2841 if memUsed >= rcvBufSize { 2842 return 0 2843 } 2844 2845 return rcvBufSize - memUsed 2846 } 2847 2848 // receiveBufferAvailable calculates how many bytes are still available in the 2849 // receive buffer based on the actual memory used by all segments held in 2850 // receive buffer/pending and segment queue. 2851 func (e *endpoint) receiveBufferAvailable() int { 2852 e.rcvQueueInfo.rcvQueueMu.Lock() 2853 available := e.receiveBufferAvailableLocked(int(e.ops.GetReceiveBufferSize())) 2854 e.rcvQueueInfo.rcvQueueMu.Unlock() 2855 return available 2856 } 2857 2858 // receiveBufferUsed returns the amount of in-use receive buffer. 2859 func (e *endpoint) receiveBufferUsed() int { 2860 e.rcvQueueInfo.rcvQueueMu.Lock() 2861 used := e.rcvQueueInfo.RcvBufUsed 2862 e.rcvQueueInfo.rcvQueueMu.Unlock() 2863 return used 2864 } 2865 2866 // receiveMemUsed returns the total memory in use by segments held by this 2867 // endpoint. 2868 func (e *endpoint) receiveMemUsed() int { 2869 return int(atomic.LoadInt32(&e.rcvMemUsed)) 2870 } 2871 2872 // updateReceiveMemUsed adds the provided delta to e.rcvMemUsed. 2873 func (e *endpoint) updateReceiveMemUsed(delta int) { 2874 atomic.AddInt32(&e.rcvMemUsed, int32(delta)) 2875 } 2876 2877 // maxReceiveBufferSize returns the stack wide maximum receive buffer size for 2878 // an endpoint. 2879 func (e *endpoint) maxReceiveBufferSize() int { 2880 var rs tcpip.TCPReceiveBufferSizeRangeOption 2881 if err := e.stack.TransportProtocolOption(ProtocolNumber, &rs); err != nil { 2882 // As a fallback return the hardcoded max buffer size. 2883 return MaxBufferSize 2884 } 2885 return rs.Max 2886 } 2887 2888 // rcvWndScaleForHandshake computes the receive window scale to offer to the 2889 // peer when window scaling is enabled (true by default). 
If auto-tuning is 2890 // disabled then the window scaling factor is based on the size of the 2891 // receiveBuffer otherwise we use the max permissible receive buffer size to 2892 // compute the scale. 2893 func (e *endpoint) rcvWndScaleForHandshake() int { 2894 bufSizeForScale := e.ops.GetReceiveBufferSize() 2895 2896 e.rcvQueueInfo.rcvQueueMu.Lock() 2897 autoTuningDisabled := e.rcvQueueInfo.RcvAutoParams.Disabled 2898 e.rcvQueueInfo.rcvQueueMu.Unlock() 2899 if autoTuningDisabled { 2900 return FindWndScale(seqnum.Size(bufSizeForScale)) 2901 } 2902 2903 return FindWndScale(seqnum.Size(e.maxReceiveBufferSize())) 2904 } 2905 2906 // updateRecentTimestamp updates the recent timestamp using the algorithm 2907 // described in https://tools.ietf.org/html/rfc7323#section-4.3 2908 func (e *endpoint) updateRecentTimestamp(tsVal uint32, maxSentAck seqnum.Value, segSeq seqnum.Value) { 2909 if e.SendTSOk && seqnum.Value(e.recentTimestamp()).LessThan(seqnum.Value(tsVal)) && segSeq.LessThanEq(maxSentAck) { 2910 e.setRecentTimestamp(tsVal) 2911 } 2912 } 2913 2914 // maybeEnableTimestamp marks the timestamp option enabled for this endpoint if 2915 // the SYN options indicate that timestamp option was negotiated. It also 2916 // initializes the recentTS with the value provided in synOpts.TSval. 
2917 func (e *endpoint) maybeEnableTimestamp(synOpts header.TCPSynOptions) { 2918 if synOpts.TS { 2919 e.SendTSOk = true 2920 e.setRecentTimestamp(synOpts.TSVal) 2921 } 2922 } 2923 2924 func (e *endpoint) tsVal(now tcpip.MonotonicTime) uint32 { 2925 return e.TSOffset.TSVal(now) 2926 } 2927 2928 func (e *endpoint) tsValNow() uint32 { 2929 return e.tsVal(e.stack.Clock().NowMonotonic()) 2930 } 2931 2932 func (e *endpoint) elapsed(now tcpip.MonotonicTime, tsEcr uint32) time.Duration { 2933 return e.TSOffset.Elapsed(now, tsEcr) 2934 } 2935 2936 // maybeEnableSACKPermitted marks the SACKPermitted option enabled for this endpoint 2937 // if the SYN options indicate that the SACK option was negotiated and the TCP 2938 // stack is configured to enable TCP SACK option. 2939 func (e *endpoint) maybeEnableSACKPermitted(synOpts header.TCPSynOptions) { 2940 var v tcpip.TCPSACKEnabled 2941 if err := e.stack.TransportProtocolOption(ProtocolNumber, &v); err != nil { 2942 // Stack doesn't support SACK. So just return. 2943 return 2944 } 2945 if bool(v) && synOpts.SACKPermitted { 2946 e.SACKPermitted = true 2947 e.stack.TransportProtocolOption(ProtocolNumber, &e.tcpRecovery) 2948 } 2949 } 2950 2951 // maxOptionSize return the maximum size of TCP options. 2952 func (e *endpoint) maxOptionSize() (size int) { 2953 var maxSackBlocks [header.TCPMaxSACKBlocks]header.SACKBlock 2954 options := e.makeOptions(maxSackBlocks[:]) 2955 size = len(options) 2956 putOptions(options) 2957 2958 return size 2959 } 2960 2961 // completeStateLocked makes a full copy of the endpoint and returns it. This is 2962 // used before invoking the probe. 2963 // 2964 // Precondition: e.mu must be held. 
2965 func (e *endpoint) completeStateLocked() stack.TCPEndpointState { 2966 s := stack.TCPEndpointState{ 2967 TCPEndpointStateInner: e.TCPEndpointStateInner, 2968 ID: stack.TCPEndpointID(e.TransportEndpointInfo.ID), 2969 SegTime: e.stack.Clock().NowMonotonic(), 2970 Receiver: e.rcv.TCPReceiverState, 2971 Sender: e.snd.TCPSenderState, 2972 } 2973 2974 sndBufSize := e.getSendBufferSize() 2975 // Copy the send buffer atomically. 2976 e.sndQueueInfo.sndQueueMu.Lock() 2977 s.SndBufState = e.sndQueueInfo.TCPSndBufState 2978 s.SndBufState.SndBufSize = sndBufSize 2979 e.sndQueueInfo.sndQueueMu.Unlock() 2980 2981 // Copy the receive buffer atomically. 2982 e.rcvQueueInfo.rcvQueueMu.Lock() 2983 s.RcvBufState = e.rcvQueueInfo.TCPRcvBufState 2984 e.rcvQueueInfo.rcvQueueMu.Unlock() 2985 2986 // Copy the endpoint TCP Option state. 2987 s.SACK.Blocks = make([]header.SACKBlock, e.sack.NumBlocks) 2988 copy(s.SACK.Blocks, e.sack.Blocks[:e.sack.NumBlocks]) 2989 s.SACK.ReceivedBlocks, s.SACK.MaxSACKED = e.scoreboard.Copy() 2990 2991 e.snd.rtt.Lock() 2992 s.Sender.RTTState = e.snd.rtt.TCPRTTState 2993 e.snd.rtt.Unlock() 2994 2995 if cubic, ok := e.snd.cc.(*cubicState); ok { 2996 s.Sender.Cubic = cubic.TCPCubicState 2997 s.Sender.Cubic.TimeSinceLastCongestion = e.stack.Clock().NowMonotonic().Sub(s.Sender.Cubic.T) 2998 } 2999 3000 s.Sender.RACKState = e.snd.rc.TCPRACKState 3001 s.Sender.RetransmitTS = e.snd.retransmitTS 3002 s.Sender.SpuriousRecovery = e.snd.spuriousRecovery 3003 return s 3004 } 3005 3006 func (e *endpoint) initHardwareGSO() { 3007 switch e.route.NetProto() { 3008 case header.IPv4ProtocolNumber: 3009 e.gso.Type = stack.GSOTCPv4 3010 e.gso.L3HdrLen = header.IPv4MinimumSize 3011 case header.IPv6ProtocolNumber: 3012 e.gso.Type = stack.GSOTCPv6 3013 e.gso.L3HdrLen = header.IPv6MinimumSize 3014 default: 3015 panic(fmt.Sprintf("Unknown netProto: %v", e.NetProto)) 3016 } 3017 e.gso.NeedsCsum = true 3018 e.gso.CsumOffset = header.TCPChecksumOffset 3019 e.gso.MaxSize = 
e.route.GSOMaxSize() 3020 } 3021 3022 func (e *endpoint) initGSO() { 3023 if e.route.HasHardwareGSOCapability() { 3024 e.initHardwareGSO() 3025 } else if e.route.HasSoftwareGSOCapability() { 3026 e.gso = stack.GSO{ 3027 MaxSize: e.route.GSOMaxSize(), 3028 Type: stack.GSOSW, 3029 NeedsCsum: false, 3030 } 3031 } 3032 } 3033 3034 // State implements tcpip.Endpoint.State. It exports the endpoint's protocol 3035 // state for diagnostics. 3036 func (e *endpoint) State() uint32 { 3037 return uint32(e.EndpointState()) 3038 } 3039 3040 // Info returns a copy of the endpoint info. 3041 func (e *endpoint) Info() tcpip.EndpointInfo { 3042 e.LockUser() 3043 // Make a copy of the endpoint info. 3044 ret := e.TransportEndpointInfo 3045 e.UnlockUser() 3046 return &ret 3047 } 3048 3049 // Stats returns a pointer to the endpoint stats. 3050 func (e *endpoint) Stats() tcpip.EndpointStats { 3051 return &e.stats 3052 } 3053 3054 // Wait implements stack.TransportEndpoint.Wait. 3055 func (e *endpoint) Wait() { 3056 waitEntry, notifyCh := waiter.NewChannelEntry(waiter.EventHUp) 3057 e.waiterQueue.EventRegister(&waitEntry) 3058 defer e.waiterQueue.EventUnregister(&waitEntry) 3059 for { 3060 e.LockUser() 3061 running := e.workerRunning 3062 e.UnlockUser() 3063 if !running { 3064 break 3065 } 3066 <-notifyCh 3067 } 3068 } 3069 3070 // SocketOptions implements tcpip.Endpoint.SocketOptions. 3071 func (e *endpoint) SocketOptions() *tcpip.SocketOptions { 3072 return &e.ops 3073 } 3074 3075 // GetTCPSendBufferLimits is used to get send buffer size limits for TCP. 
3076 func GetTCPSendBufferLimits(s tcpip.StackHandler) tcpip.SendBufferSizeOption { 3077 var ss tcpip.TCPSendBufferSizeRangeOption 3078 if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil { 3079 panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err)) 3080 } 3081 3082 return tcpip.SendBufferSizeOption{ 3083 Min: ss.Min, 3084 Default: ss.Default, 3085 Max: ss.Max, 3086 } 3087 } 3088 3089 // allowOutOfWindowAck returns true if an out-of-window ACK can be sent now. 3090 func (e *endpoint) allowOutOfWindowAck() bool { 3091 now := e.stack.Clock().NowMonotonic() 3092 3093 if e.lastOutOfWindowAckTime != (tcpip.MonotonicTime{}) { 3094 var limit stack.TCPInvalidRateLimitOption 3095 if err := e.stack.Option(&limit); err != nil { 3096 panic(fmt.Sprintf("e.stack.Option(%+v) failed with error: %s", limit, err)) 3097 } 3098 if now.Sub(e.lastOutOfWindowAckTime) < time.Duration(limit) { 3099 return false 3100 } 3101 } 3102 3103 e.lastOutOfWindowAckTime = now 3104 return true 3105 } 3106 3107 // GetTCPReceiveBufferLimits is used to get send buffer size limits for TCP. 3108 func GetTCPReceiveBufferLimits(s tcpip.StackHandler) tcpip.ReceiveBufferSizeOption { 3109 var ss tcpip.TCPReceiveBufferSizeRangeOption 3110 if err := s.TransportProtocolOption(header.TCPProtocolNumber, &ss); err != nil { 3111 panic(fmt.Sprintf("s.TransportProtocolOption(%d, %#v) = %s", header.TCPProtocolNumber, ss, err)) 3112 } 3113 3114 return tcpip.ReceiveBufferSizeOption{ 3115 Min: ss.Min, 3116 Default: ss.Default, 3117 Max: ss.Max, 3118 } 3119 } 3120 3121 // computeTCPSendBufferSize implements auto tuning of send buffer size and 3122 // returns the new send buffer size. 3123 func (e *endpoint) computeTCPSendBufferSize() int64 { 3124 curSndBufSz := int64(e.getSendBufferSize()) 3125 3126 // Auto tuning is disabled when the user explicitly sets the send 3127 // buffer size with SO_SNDBUF option. 
3128 if disabled := atomic.LoadUint32(&e.sndQueueInfo.TCPSndBufState.AutoTuneSndBufDisabled); disabled == 1 { 3129 return curSndBufSz 3130 } 3131 3132 const packetOverheadFactor = 2 3133 curMSS := e.snd.MaxPayloadSize 3134 numSeg := InitialCwnd 3135 if numSeg < e.snd.SndCwnd { 3136 numSeg = e.snd.SndCwnd 3137 } 3138 3139 // SndCwnd indicates the number of segments that can be sent. This means 3140 // that the sender can send upto #SndCwnd segments and the send buffer 3141 // size should be set to SndCwnd*MSS to accommodate sending of all the 3142 // segments. 3143 newSndBufSz := int64(numSeg * curMSS * packetOverheadFactor) 3144 if newSndBufSz < curSndBufSz { 3145 return curSndBufSz 3146 } 3147 if ss := GetTCPSendBufferLimits(e.stack); int64(ss.Max) < newSndBufSz { 3148 newSndBufSz = int64(ss.Max) 3149 } 3150 3151 return newSndBufSz 3152 }